3 See the NOTICE file distributed with
this work
for additional information
4 regarding copyright ownership.
6 Licensed under the Apache License, Version 2.0 (the
"License");
7 you may not use
this file except in compliance with the License.
8 You may obtain a copy of the License at
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an
"AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License
for the specific language governing permissions and
16 limitations under the License.
23 Please email comments or questions to the
public Ensembl
24 developers list at <http:
26 Questions may also be sent to the Ensembl help desk at
42 The decision process
for choosing a canonical
transcript of a given
Gene is
43 an involved process. This
package converts transcript attributes into
44 numeric values, sorts the values and returns the favourite transcript.
46 The canonical order of precedence is as follows:
47 longest translation of transcript present in CCDS that is reference sequence
48 longest translation of protein-coding transcript
49 longest translation of transcript marked nonsense-mediated-decay
50 longest translation of any other transcript (premature stop codon translations have an effective length of 0)
51 longest non-coding transcript
52 first stable ID in alphabetical order
54 The last condition is to give consistent behaviour when everything is else is equal.
55 It selects the "older" stable ID, thus preventing new IDs supplanting old ones that
59 package Bio::EnsEMBL::Utils::TranscriptSelector;
67 Arg [1] : Optional - CCDS database adaptor - needed
for species with CCDS only
68 Arg [2] : Optional - Boolean verbose flag. Turn on to fill your logs
69 Description: Constructor
81 if (not defined ($self->{
'ccds_dba'}) ) { warning (
"Running without CCDS DB");}
85 =head2 select_canonical_transcript_for_Gene
88 Example : $canonical_transcript = $selector->select_canonical_transcript_for_Gene
89 Description: Sorts the Transcripts of
this Gene into order of suitability,
96 sub select_canonical_transcript_for_Gene {
100 if (!$gene) {
throw (
'Cannot select canonical transcript without a gene.');}
102 my $transcript_array = $gene->get_all_Transcripts;
104 if ($transcript_array) {
105 @transcripts = @$transcript_array;
107 warning(
'No transcripts attached to gene '.$gene->stable_id);
110 my @encoded; # array of encoded transcripts
112 foreach my $transcript (@transcripts) {
113 my $encoded_transcript = $self->encode_transcript($transcript);
114 push(@encoded, $encoded_transcript );
115 if ($self->{
'verbose'}) {
116 printf
"%s encoded to: [%s,%s,%s,%s,%s,%s,%s]\n",$transcript->stable_id, @$encoded_transcript;
120 my $sorted_ids = $self->sort_into_canonical_order(\@encoded);
121 if ($self->{
'verbose'}) {
122 print
"Sorted order: \n";
123 foreach my $dbID (@$sorted_ids) {
127 my $canonical_dbID = $sorted_ids->[0];
129 foreach my $transcript (@transcripts) {
130 if ($transcript->dbID == $canonical_dbID) {
131 if ($self->{
'verbose'}) {print
'Chosen transcript: '.$transcript->stable_id.
"\n";}
135 throw (
'Run out of transcripts without finding selected canonical dbID.')
139 # Constants for doing numerical sorting of transcripts based upon which ones we prefer. Lowest is best.
140 my %source_priority = (
'ccds' => 1,
143 my %biotype_priority = (
'protein_coding' => 1,
144 'nonsense_mediated_decay' => 2,
145 'non_stop_decay' => 2,
146 'polymorphic_pseudogene' => 2,
147 'protein_coding_LoF' => 2,
151 =head2 encode_transcript
154 Description: Converts a
transcript into a list of encoded values
for sorting
155 Priorities are defined immediately above
156 Unimportant biotypes and sources are classed as
'other'
157 Returntype : Listref of encoded attributes
160 sub encode_transcript {
162 my $transcript = shift;
165 if ( $self->{
'ccds_dba'} && $transcript->slice->is_reference()
166 && $self->check_Ens_trans_against_CCDS($transcript) ) {
168 } elsif ($transcript->analysis->logic_name eq
'ensembl_havana_transcript') {
174 my $translation = $transcript->translate;
175 my $translation_length = 0;
177 $translation_length = $translation->length;
178 # Translations containing premature stops are undesirable.
179 if ($translation->seq =~ /\*/) {$translation_length = 0;}
181 # Zero-length/non-existent translations are ok. We sort by coding and non-coding first
183 if ($translation_length != 0) {$translates = 1;}
185 my $transcript_length = $transcript->length;
188 if ( $transcript->biotype() ne
'protein_coding'
189 && $transcript->biotype() ne
'nonsense_mediated_decay'
190 && $transcript->biotype() ne
'non_stop_decay'
191 && $transcript->biotype() ne
'polymorphic_pseudogene'
192 && $transcript->biotype() ne
'protein_coding_LoF' ) {
194 }
else { $biotype = $transcript->biotype(); }
196 return [$transcript->dbID,
198 $source_priority{$type},
199 $biotype_priority{$biotype},
202 $transcript->stable_id];
206 =head2 sort_into_canonical_order
208 Arg 1 : 2D array reference of numerically encoded values
210 ( [
transcript dbID, translates, source , biotype, translation length,
transcript length, stable ID],
213 Description: see Schwartzian transform
for method in the following madness:
214 sort the 6-column array by the last 5 columns, then
map the first elements
215 into a list of dbIDs, now in canonical order.
216 Returntype : Listref of ensembl dbIDs
217 Caller : select_canonical_transcript_for_Gene
220 sub sort_into_canonical_order {
222 my $encoded_list_ref = shift;
224 my @sorted_ids =
map { $_->[0] }
227 $b->[1] <=> $a->[1] || # translates
228 $a->[2] <=> $b->[2] || # source
229 $a->[3] <=> $b->[3] || # biotype
230 $b->[4] <=> $a->[4] || # translation length (largest is best, $a and $b reversed)
232 $a->[6] cmp $b->[6] # stable id. All other things being equal, 'lowest' transcript ID wins
233 } @{$encoded_list_ref};
238 =head2 check_Ens_trans_against_CCDS
241 Description: Attempts to find a matching transcript in CCDS by comparing Exon
242 composition. Returns true if one is found, or silently ends.
244 Caller : encode_transcript
247 sub check_Ens_trans_against_CCDS {
248 my ( $self ,$transcript ) = @_;
250 my @translateable_exons = @{ $transcript->get_all_translateable_Exons };
252 my $seq_region_name = $transcript->slice->seq_region_name;
253 my $seq_region_start = $transcript->seq_region_start;
254 my $seq_region_end = $transcript->seq_region_end;
255 my $ccds_dba = $self->{'ccds_dba'};
257 my $ext_slice = $ccds_dba->get_SliceAdaptor->fetch_by_region(
263 EXT_GENE: foreach my $ext_gene ( @{ $ext_slice->get_all_Genes } ) {
264 EXT_TRANS: foreach my $ext_trans ( @{ $ext_gene->get_all_Transcripts } ) {
265 my @ext_exons = @{ $ext_trans->get_all_Exons };
267 if ( scalar(@translateable_exons) == scalar(@ext_exons) ) {
268 for ( my $i = 0 ; $i < scalar(@translateable_exons) ; $i++ ) {
269 if ( $translateable_exons[$i]->coding_region_start($transcript)
270 != $ext_exons[$i]->seq_region_start
271 || $translateable_exons[$i]->strand
272 != $ext_exons[$i]->strand
273 || $translateable_exons[$i]->coding_region_end($transcript)
274 != $ext_exons[$i]->seq_region_end
280 . $transcript->display_id
282 . $ext_gene->display_id
283 . " in CCDS DB.\n
" if $self->{'verbose'};
284 if ($ext_gene->stable_id !~ /^CCDS/) {
285 throw (sprintf("Database does not appear to contain CCDS IDs. Possible configuration problem with CCDS source. Found ID %s
", $ext_gene->stable_id()));
289 } # end foreach EXT_TRANS
290 } # end foreach EXT_GENE
291 } ## end sub check_Ens_trans_against_CCDS