2 # See the NOTICE file distributed with this work for additional information
3 # regarding copyright ownership.
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
9 # http://www.apache.org/licenses/LICENSE-2.0
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
17 ###################################################
19 # A script to investigate the results of canonical assignment pre-run log
20 # output. We know of 3 normal scenarios for assignment
22 # - Stable ID ordering
23 # - Transcript length (longest)
24 # - Translation length (longest)
26 # A not so normal scenario is the choice of an NMD over an Ensembl protein
27 # coder. To emit these in all their glory, add NMD to the command line. e.g.
29 # perl reason_changes.pl logfile NMD
31 # If it was not one of these we will complain bitterly and you should
32 # investigate. If it is a known reason after investigation, that is an
33 # allowed scenario, then code it into this script.
35 ##################################################
42 my $blurt = $ARGV[1] ;
45 die
"Was not given a file. Command is $0 PATH_TO_FILE";
48 die
"Could not find file";
52 transcript_length => 0,
54 translation_length => 0,
55 havana_merge_nmd_over_e_coding => 0,
61 my (@old_contents) = @{$old}[1..4];
62 my ($old_id, $old_length) = ($old->[-1], $old->[-2]);
63 my (@new_contents) = @{$new}[1..4];
64 my ($new_id, $new_length) = ($new->[-1], $new->[-2]);
66 my $old_key = join(q{
'=!='}, @old_contents)
if defined($old_contents[0]);
67 my $new_key = join(q{
'=!='}, @new_contents);
71 if($old_key && $old_key eq $new_key) {
72 if($new_length > $old_length) {
73 $reason_key =
'transcript_length';
75 elsif($new_length < $old_length) {
76 $reason_key =
'other';
77 $possible_reason =
'the old transcript length was longer than the new transcript';
80 if( ($new_id cmp $old_id) > 0 ) {
81 $reason_key =
'other';
82 $possible_reason =
'the old ID was lexographically lower than the new ID';
85 $reason_key =
'id_ordering';
90 #If all other keys are equal then we used translation length
91 if( defined($old->[1]) && join(q{=!=},@{$old}[1..3]) eq join(q{=!=},@{$new}[1..3]) ) {
92 $reason_key =
'translation_length';
94 #If we have translatable genes and we prefered Havana merged NMD over E/H PC
96 $old->[1] && $new->[1] && #We had translatable genes
97 $old->[2] == 3 && #Old was an Ensembl/Havana
transcript
98 $new->[2] == 2 && #New was E! Havana merged
99 $old->[3] == 1 && #Old was a Protein Coding
100 $new->[3] == 2 && #New was a
"NMD"
101 $blurt !~ /nmd/i #User suppressing
this test
103 $reason_key =
'havana_merge_nmd_over_e_coding';
104 } elsif (! $old->[1] ) {
105 # The old canonical transcript was not found.
108 $reason_key =
'other';
112 if($reason_key eq
'other') {
113 if($possible_reason) {
114 printf(
"ERROR: %s -> %s change was wrong because %s. Please investigate\n", $old_id, $new_id, $possible_reason);
117 printf(
"ERROR: %s -> %s change not due to a known reason. Please investigate\n", $old_id, $new_id);
119 printf(
"\tURL: www.ensemblgenomes.org/id/%s\n", $old_id);
120 printf(
"\tURL: www.ensemblgenomes.org/id/%s\n", $new_id);
121 printf(
"\tENC: %s\n", encode_json($old));
122 printf(
"\tENC: %s\n", encode_json($new));
123 print(
"\tSee Bio::EnsEMBL::Utils::TranscriptSelector::sort_into_canonical_order() POD to decode\n");
126 $reason_count{$reason_key}++;
134 open my $fh,
'<', $file or die
"Could not open $file for reading: $!";
135 while(my $line = <$fh>) {
137 if($line eq
'//' && $old && $new) {
139 ($old, $new) = (undef, undef);
142 elsif($line =~ /^Old=(\[.+\])$/) {
145 elsif($line =~ /^New=(\[.+\])$/) {
150 compare($old, $new)
if $old && $new;
152 foreach my $reason (keys %reason_count) {
153 printf(
'%s = %d'.
"\n", $reason, $reason_count{$reason});