ensembl-hive  2.8.1
TranscriptSelector.pm
Go to the documentation of this file.
1 =head1 LICENSE
2 
3 See the NOTICE file distributed with this work for additional information
4 regarding copyright ownership.
5 
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9 
10  http://www.apache.org/licenses/LICENSE-2.0
11 
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 
18 =cut
19 
20 
21 =head1 CONTACT
22 
23  Please email comments or questions to the public Ensembl
24  developers list at <http://lists.ensembl.org/mailman/listinfo/dev>.
25 
26  Questions may also be sent to the Ensembl help desk at
27  <http://www.ensembl.org/Help/Contact>.
28 
29 =cut
30 
31 =head1 NAME
32 
33 Bio::EnsEMBL::Utils::TranscriptSelector - Finds canonical transcripts
34 
35 =head1 SYNOPSIS
36 
37  my $selector = Bio::EnsEMBL::Utils::TranscriptSelector->new($ccds_dba);
38  my $canonical_transcript = $selector->select_canonical_transcript_for_Gene($gene);
39 
40 =head1 DESCRIPTION
41 
42  The decision process for choosing a canonical transcript of a given Gene is
43  an involved process. This package converts transcript attributes into
44  numeric values, sorts the values and returns the favourite transcript.
45 
46  The canonical order of precedence is as follows:
47  longest translation of transcript present in CCDS that is reference sequence
48  longest translation of protein-coding transcript
49  longest translation of transcript marked nonsense-mediated-decay
50  longest translation of any other transcript (premature stop codon translations have an effective length of 0)
51  longest non-coding transcript
52  first stable ID in alphabetical order
53 
54  The last condition is to give consistent behaviour when everything is else is equal.
55  It selects the "older" stable ID, thus preventing new IDs supplanting old ones that
56  remain correct.
57 =cut
58 
59 package Bio::EnsEMBL::Utils::TranscriptSelector;
60 
61 use strict;
62 use warnings;
64 
65 =head2 new
66 
67  Arg [1] : Optional - CCDS database adaptor - needed for species with CCDS only
68  Arg [2] : Optional - Boolean verbose flag. Turn on to fill your logs
69  Description: Constructor
71 
72 =cut
73 
74 sub new {
75  my $class = shift;
76  my $self = {
77  'ccds_dba' => shift,
78  'verbose' => shift,
79  };
80  bless $self, $class;
81  if (not defined ($self->{'ccds_dba'}) ) { warning ("Running without CCDS DB");}
82  return $self;
83 }
84 
85 =head2 select_canonical_transcript_for_Gene
86 
87  Arg 1 : Bio::EnsEMBL::Gene
88  Example : $canonical_transcript = $selector->select_canonical_transcript_for_Gene
89  Description: Sorts the Transcripts of this Gene into order of suitability,
90  and returns the favourite Transcript.
91  Returntype : Bio::EnsEMBL::Transcript
92  Exceptions :
93 
94 =cut
95 
96 sub select_canonical_transcript_for_Gene {
97  my $self = shift;
98  my $gene = shift;
99 
100  if (!$gene) {throw ('Cannot select canonical transcript without a gene.');}
101 
102  my $transcript_array = $gene->get_all_Transcripts;
103  my @transcripts;
104  if ($transcript_array) {
105  @transcripts = @$transcript_array;
106  } else {
107  warning('No transcripts attached to gene '.$gene->stable_id);
108  return;
109  }
110  my @encoded; # array of encoded transcripts
111 
112  foreach my $transcript (@transcripts) {
113  my $encoded_transcript = $self->encode_transcript($transcript);
114  push(@encoded, $encoded_transcript );
115  if ($self->{'verbose'}) {
116  printf "%s encoded to: [%s,%s,%s,%s,%s,%s,%s]\n",$transcript->stable_id, @$encoded_transcript;
117  }
118  }
119 
120  my $sorted_ids = $self->sort_into_canonical_order(\@encoded);
121  if ($self->{'verbose'}) {
122  print "Sorted order: \n";
123  foreach my $dbID (@$sorted_ids) {
124  print $dbID."\n";
125  }
126  }
127  my $canonical_dbID = $sorted_ids->[0];
128 
129  foreach my $transcript (@transcripts) {
130  if ($transcript->dbID == $canonical_dbID) {
131  if ($self->{'verbose'}) {print 'Chosen transcript: '.$transcript->stable_id."\n";}
132  return $transcript;
133  }
134  }
135  throw ('Run out of transcripts without finding selected canonical dbID.')
136 }
137 
138 
139 # Constants for doing numerical sorting of transcripts based upon which ones we prefer. Lowest is best.
140 my %source_priority = ('ccds' => 1,
141  'merged' => 2,
142  'other' => 3);
143 my %biotype_priority = ('protein_coding' => 1,
144  'nonsense_mediated_decay' => 2,
145  'non_stop_decay' => 2,
146  'polymorphic_pseudogene' => 2,
147  'protein_coding_LoF' => 2,
148  'other' => 3,
149 );
150 
151 =head2 encode_transcript
152 
153  Arg 1 : Transcript
154  Description: Converts a transcript into a list of encoded values for sorting
155  Priorities are defined immediately above
156  Unimportant biotypes and sources are classed as 'other'
157  Returntype : Listref of encoded attributes
158 =cut
159 
160 sub encode_transcript {
161  my $self = shift;
162  my $transcript = shift;
163 
164  my $type;
165  if ( $self->{'ccds_dba'} && $transcript->slice->is_reference()
166  && $self->check_Ens_trans_against_CCDS($transcript) ) {
167  $type = 'ccds';
168  } elsif ($transcript->analysis->logic_name eq 'ensembl_havana_transcript') {
169  $type = 'merged';
170  } else {
171  $type = 'other';
172  }
173 
174  my $translation = $transcript->translate;
175  my $translation_length = 0;
176  if ($translation) {
177  $translation_length = $translation->length;
178  # Translations containing premature stops are undesirable.
179  if ($translation->seq =~ /\*/) {$translation_length = 0;}
180  }
181  # Zero-length/non-existent translations are ok. We sort by coding and non-coding first
182  my $translates = 0;
183  if ($translation_length != 0) {$translates = 1;}
184 
185  my $transcript_length = $transcript->length;
186 
187  my $biotype;
188  if ( $transcript->biotype() ne 'protein_coding'
189  && $transcript->biotype() ne 'nonsense_mediated_decay'
190  && $transcript->biotype() ne 'non_stop_decay'
191  && $transcript->biotype() ne 'polymorphic_pseudogene'
192  && $transcript->biotype() ne 'protein_coding_LoF' ) {
193  $biotype = 'other';
194  } else { $biotype = $transcript->biotype(); }
195 
196  return [$transcript->dbID,
197  $translates,
198  $source_priority{$type},
199  $biotype_priority{$biotype},
200  $translation_length,
201  $transcript_length,
202  $transcript->stable_id];
203 }
204 
205 
206 =head2 sort_into_canonical_order
207 
208  Arg 1 : 2D array reference of numerically encoded values
209  0 1 2 3 4 5 6
210  ( [transcript dbID, translates, source , biotype, translation length, transcript length, stable ID],
211  ...
212  )
213  Description: see Schwartzian transform for method in the following madness:
214  sort the 6-column array by the last 5 columns, then map the first elements
215  into a list of dbIDs, now in canonical order.
216  Returntype : Listref of ensembl dbIDs
217  Caller : select_canonical_transcript_for_Gene
218 =cut
219 
220 sub sort_into_canonical_order {
221  my $self = shift;
222  my $encoded_list_ref = shift;
223 
224  my @sorted_ids = map { $_->[0] }
225  sort {
226  # [0] contains ID
227  $b->[1] <=> $a->[1] || # translates
228  $a->[2] <=> $b->[2] || # source
229  $a->[3] <=> $b->[3] || # biotype
230  $b->[4] <=> $a->[4] || # translation length (largest is best, $a and $b reversed)
231  $b->[5] <=> $a->[5] || # transcript length "
232  $a->[6] cmp $b->[6] # stable id. All other things being equal, 'lowest' transcript ID wins
233  } @{$encoded_list_ref};
234 
235  return \@sorted_ids;
236 }
237 
238 =head2 check_Ens_trans_against_CCDS
239 
240  Arg 1 : Transcript
241  Description: Attempts to find a matching transcript in CCDS by comparing Exon
242  composition. Returns true if one is found, or silently ends.
243  Returntype : Boolean
244  Caller : encode_transcript
245 =cut
246 
247 sub check_Ens_trans_against_CCDS {
248  my ( $self ,$transcript ) = @_;
249 
250  my @translateable_exons = @{ $transcript->get_all_translateable_Exons };
251 
252  my $seq_region_name = $transcript->slice->seq_region_name;
253  my $seq_region_start = $transcript->seq_region_start;
254  my $seq_region_end = $transcript->seq_region_end;
255  my $ccds_dba = $self->{'ccds_dba'};
256 
257  my $ext_slice = $ccds_dba->get_SliceAdaptor->fetch_by_region(
258  'toplevel',
259  $seq_region_name,
260  $seq_region_start,
261  $seq_region_end );
262 
263  EXT_GENE: foreach my $ext_gene ( @{ $ext_slice->get_all_Genes } ) {
264  EXT_TRANS: foreach my $ext_trans ( @{ $ext_gene->get_all_Transcripts } ) {
265  my @ext_exons = @{ $ext_trans->get_all_Exons };
266 
267  if ( scalar(@translateable_exons) == scalar(@ext_exons) ) {
268  for ( my $i = 0 ; $i < scalar(@translateable_exons) ; $i++ ) {
269  if ( $translateable_exons[$i]->coding_region_start($transcript)
270  != $ext_exons[$i]->seq_region_start
271  || $translateable_exons[$i]->strand
272  != $ext_exons[$i]->strand
273  || $translateable_exons[$i]->coding_region_end($transcript)
274  != $ext_exons[$i]->seq_region_end
275  ) {
276  next EXT_TRANS;
277  }
278  }
279  print "Ensembl transcript "
280  . $transcript->display_id
281  . " found match "
282  . $ext_gene->display_id
283  . " in CCDS DB.\n" if $self->{'verbose'};
284  if ($ext_gene->stable_id !~ /^CCDS/) {
285  throw (sprintf("Database does not appear to contain CCDS IDs. Possible configuration problem with CCDS source. Found ID %s", $ext_gene->stable_id()));
286  }
287  return 1;
288  }
289  } # end foreach EXT_TRANS
290  } # end foreach EXT_GENE
291 } ## end sub check_Ens_trans_against_CCDS
292 
293 1;
transcript
public transcript()
map
public map()
Bio::EnsEMBL::Utils::TranscriptSelector::select_canonical_transcript_for_Gene
public Bio::EnsEMBL::Transcript select_canonical_transcript_for_Gene()
Bio::EnsEMBL::Utils::TranscriptSelector::new
public Bio::EnsEMBL::Utils::TranscriptSelector new()
Bio::EnsEMBL::Gene
Definition: Gene.pm:37
Bio::EnsEMBL::Transcript
Definition: Transcript.pm:44
Bio::EnsEMBL::Utils::TranscriptSelector
Definition: TranscriptSelector.pm:30
Bio::EnsEMBL::Utils::Exception
Definition: Exception.pm:68