ensembl-hive  2.7.0
reason_changes.pl
Go to the documentation of this file.
1 #!/usr/bin/env perl
2 # See the NOTICE file distributed with this work for additional information
3 # regarding copyright ownership.
4 #
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
8 #
9 # http://www.apache.org/licenses/LICENSE-2.0
10 #
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
16 
17 ###################################################
18 #
19 # A script to investigate the results of canonical assignment pre-run log
20 # output. We know of 3 normal scenarios for assignment
21 #
22 # - Stable ID ordering
23 # - Transcript length (longest)
24 # - Translation length (longest)
25 #
26 # A not so normal scenario is the choice of an NMD over an Ensembl protein
27 # coder. To emit these in all their glory, add NMD to the command line. e.g.
28 #
29 # perl reason_changes.pl logfile NMD
30 #
31 # If it was not one of these we will complain bitterly and you should
32 # investigate. If it is a known reason after investigation, that is an
33 # allowed scenario, then code it into this script.
34 #
35 ##################################################
36 
37 use strict;
38 use warnings;
39 use JSON;
40 
41 my $file = $ARGV[0];
42 my $blurt = $ARGV[1] ;
43 
44 if(! $file) {
45  die "Was not given a file. Command is $0 PATH_TO_FILE";
46 }
47 if(! -f $file) {
48  die "Could not find file";
49 }
50 
51 my %reason_count = (
52  transcript_length => 0,
53  id_ordering => 0,
54  translation_length => 0,
55  havana_merge_nmd_over_e_coding => 0,
56  other => 0
57 );
58 
59 sub compare {
60  my ($old, $new) = @_;
61  my (@old_contents) = @{$old}[1..4];
62  my ($old_id, $old_length) = ($old->[-1], $old->[-2]);
63  my (@new_contents) = @{$new}[1..4];
64  my ($new_id, $new_length) = ($new->[-1], $new->[-2]);
65 
66  my $old_key = join(q{'=!='}, @old_contents) if defined($old_contents[0]);
67  my $new_key = join(q{'=!='}, @new_contents);
68 
69  my $reason_key;
70  my $possible_reason;
71  if($old_key && $old_key eq $new_key) {
72  if($new_length > $old_length) {
73  $reason_key = 'transcript_length';
74  }
75  elsif($new_length < $old_length) {
76  $reason_key = 'other';
77  $possible_reason = 'the old transcript length was longer than the new transcript';
78  }
79  else {
80  if( ($new_id cmp $old_id) > 0 ) {
81  $reason_key = 'other';
82  $possible_reason = 'the old ID was lexographically lower than the new ID';
83  }
84  else {
85  $reason_key = 'id_ordering';
86  }
87  }
88  }
89  else {
90  #If all other keys are equal then we used translation length
91  if( defined($old->[1]) && join(q{=!=},@{$old}[1..3]) eq join(q{=!=},@{$new}[1..3]) ) {
92  $reason_key = 'translation_length';
93  }
94  #If we have translatable genes and we prefered Havana merged NMD over E/H PC
95  elsif(
96  $old->[1] && $new->[1] && #We had translatable genes
97  $old->[2] == 3 && #Old was an Ensembl/Havana transcript
98  $new->[2] == 2 && #New was E! Havana merged
99  $old->[3] == 1 && #Old was a Protein Coding
100  $new->[3] == 2 && #New was a "NMD"
101  $blurt !~ /nmd/i #User suppressing this test
102  ) {
103  $reason_key = 'havana_merge_nmd_over_e_coding';
104  } elsif (! $old->[1] ) {
105  # The old canonical transcript was not found.
106  $reason_key = 'new';
107  } else {
108  $reason_key = 'other';
109  }
110  }
111 
112  if($reason_key eq 'other') {
113  if($possible_reason) {
114  printf("ERROR: %s -> %s change was wrong because %s. Please investigate\n", $old_id, $new_id, $possible_reason);
115  }
116  else {
117  printf("ERROR: %s -> %s change not due to a known reason. Please investigate\n", $old_id, $new_id);
118  }
119  printf("\tURL: www.ensemblgenomes.org/id/%s\n", $old_id);
120  printf("\tURL: www.ensemblgenomes.org/id/%s\n", $new_id);
121  printf("\tENC: %s\n", encode_json($old));
122  printf("\tENC: %s\n", encode_json($new));
123  print("\tSee Bio::EnsEMBL::Utils::TranscriptSelector::sort_into_canonical_order() POD to decode\n");
124  }
125 
126  $reason_count{$reason_key}++;
127 
128  return;
129 }
130 
131 my $old;
132 my $new;
133 
134 open my $fh, '<', $file or die "Could not open $file for reading: $!";
135 while(my $line = <$fh>) {
136  chomp $line;
137  if($line eq '//' && $old && $new) {
138  compare($old, $new);
139  ($old, $new) = (undef, undef);
140  #Perform comparison
141  }
142  elsif($line =~ /^Old=(\[.+\])$/) {
143  $old = eval $1;
144  }
145  elsif($line =~ /^New=(\[.+\])$/) {
146  $new = eval $1;
147  }
148 }
149 
150 compare($old, $new) if $old && $new;
151 
152 foreach my $reason (keys %reason_count) {
153  printf('%s = %d'."\n", $reason, $reason_count{$reason});
154 }
transcript
public transcript()
compare
public compare()