ensembl-hive  2.7.0
Translation.pm
Go to the documentation of this file.
1 =head1 LICENSE
2 
3 See the NOTICE file distributed with this work for additional information
4 regarding copyright ownership.
5 
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9 
10  http://www.apache.org/licenses/LICENSE-2.0
11 
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 
18 =cut
19 
20 =head1 CONTACT
21  Please email comments or questions to the public Ensembl
22  developers list at <http://lists.ensembl.org/mailman/listinfo/dev>.
23  Questions may also be sent to the Ensembl help desk at
24  <http://www.ensembl.org/Help/Contact>.
25 
26 =cut
27 =head1 NAME
28 =head1 SYNOPSIS
29 =head1 DESCRIPTION
30 =head1 METHODS
31 =cut
32 
33 package Bio::EnsEMBL::Utils::VegaCuration::Translation;
34 use strict;
35 use warnings;
36 use vars qw(@ISA);
37 use Data::Dumper;
40 
41 =head2 check_CDS_start_end_remarks
42 
43  Args : B::E::Transcript
44  Example : my $results = $support->check_CDS_end_remarks($transcript)
45  Description: identifies incorrect 'CDS end...' transcript remarks in a
46  otter-derived Vega database
47  Returntype : hashref
48 
49 =cut
50 
51 sub check_CDS_start_end_remarks {
52  my $self = shift;
53  my $trans = shift;
54  # info for checking
55  my @remarks = @{$trans->get_all_Attributes('remark')};
56  my $coding_end = $trans->cdna_coding_end;
57  my $coding_start = $trans->cdna_coding_start;
58  my $trans_end = $trans->length;
59  my $trans_seq = $trans->seq->seq;
60  my $stop_codon = substr($trans_seq, $coding_end-3, 3);
61  my $start_codon = substr($trans_seq, $coding_start-1, 3);
62  #hashref to return results
63  my $results;
64 
65  #extra CDS end not found remarks
66  if (grep {$_->value eq 'CDS end not found'} @remarks) {
67  if ( ($coding_end != $trans_end)
68  && ( grep {$_ eq $stop_codon} qw(TGA TAA TAG) ) ) {
69  $results->{'END_EXTRA'} = 1;
70  }
71  }
72  #missing CDS end not found remark
73  if ( $coding_end == $trans_end ) {
74  if (! grep {$_->value eq 'CDS end not found'} @remarks) {
75  if (grep {$_ eq $stop_codon} qw(TGA TAA TAG)) {
76  $results->{'END_MISSING_2'} = 1;
77  }
78  else {
79  $results->{'END_MISSING_1'} = $stop_codon;
80  }
81  }
82  }
83  #extra CDS start not found remark
84  if (grep {$_->value eq 'CDS start not found'} @remarks) {
85  if ( ($coding_start != 1)
86  && ($start_codon eq 'ATG') ) {
87  $results->{'START_EXTRA'} = 1;
88  }
89  }
90  #missing CDS start not found remark
91  if ( $coding_start == 1) {
92  if ( ! grep {$_->value eq 'CDS start not found'} @remarks) {
93  if ($start_codon eq 'ATG') {
94  $results->{'START_MISSING_2'} = 1;
95  } else {
96  $results->{'START_MISSING_1'} = $start_codon;
97  }
98  }
99  }
100  return $results;
101 }
102 
103 =head2 check_CDS_start_end_remarks_loutre
104 
105  Args : B::E::Transcript
106  Example : my $results = $support->check_CDS_end_remarks($transcript)
107  Description: identifies incorrect 'CDS end...' transcript attribs in a loutre
108  of a loutre-derived Vega database.
109  Returntype : hashref
110 
111 =cut
112 
113 sub check_CDS_start_end_remarks_loutre {
114  my $self = shift;
115  my $trans = shift;
116 
117  # info for checking
118  my @stops = qw(TGA TAA TAG);
119  my %attributes;
120  foreach my $attribute (@{$trans->get_all_Attributes()}) {
121  push @{$attributes{$attribute->code}}, $attribute;
122  }
123 # warn $trans->stable_id;
124 # warn Data::Dumper::Dumper(\%attributes);
125  my $coding_end = $trans->cdna_coding_end;
126  my $coding_start = $trans->cdna_coding_start;
127  my $trans_end = $trans->length;
128  my $trans_seq = $trans->seq->seq;
129  my $stop_codon_offset = 3 + $trans->translation->end_Exon->end_phase;
130  my $initial_exon_phase = $trans->translation->start_Exon->phase;
131  my $stop_codon = substr($trans_seq, $coding_end-3, 3);
132  my $start_codon = substr($trans_seq, $coding_start-1, 3);
133 
134  my $start_codon_incorrect = 1;
135  if ($start_codon eq 'ATG' ) {
136  $start_codon_incorrect = 0;
137  }
138  elsif ($start_codon eq 'CTG') {
139  foreach my $attrib (@{$attributes{'remark'}}) {
140  if ($attrib->value =~ /non[- ]ATG start/) {
141  $start_codon_incorrect = 0;
142  }
143  }
144  }
145 
146 # warn "$start_codon -- $initial_exon_phase -- $coding_start -- $start_codon_incorrect";
147 
148  #hashref to return results
149  my $results;
150 
151  #extra CDS end not found remarks
152  if ($attributes{'cds_end_NF'}) {
153  if ( ($attributes{'cds_end_NF'}->[0]->value == 1)
154  && ($coding_end != $trans_end)
155  && ( grep {$_ eq $stop_codon} @stops) ) {
156 # warn $trans->stable_id.": $coding_end--$trans_end--$stop_codon";
157 # warn $trans->translation->end_Exon->end_phase;
158  $results->{'END_EXTRA'} = $stop_codon;
159  }
160  }
161  #missing CDS end not found remark
162  if ( $coding_end == $trans_end ) {
163  if ($attributes{'cds_end_NF'}) {
164  if ($attributes{'cds_end_NF'}->[0]->value == 0 ) {
165  if (! grep {$_ eq $stop_codon} @stops) {
166 # warn $trans->translation->end_Exon->end_phase;
167  $results->{'END_MISSING'}{'WRONG'} = $stop_codon;
168  }
169  }
170  }
171  elsif (! grep {$_ eq $stop_codon} @stops) {
172  $results->{'END_MISSING'}{'ABSENT'} = $stop_codon;
173  }
174  }
175  #extra CDS start not found remark
176  if ( $attributes{'cds_start_NF'}) {
177  if ( ($attributes{'cds_start_NF'}->[0]->value == 1 )
178  && (! $start_codon_incorrect)) {
179  unless ( ($coding_start == 1) && ( $initial_exon_phase > 0)) {
180  $results->{'START_EXTRA'} = $start_codon;
181  }
182  }
183  }
184  #missing CDS start not found remark
185  if ( $coding_start == 1) {
186  if ( $attributes{'cds_start_NF'} ) {
187  if ( $attributes{'cds_start_NF'}->[0]->value == 0 ) {
188  if ($start_codon_incorrect) {
189  $results->{'START_MISSING'}{'ABSENT'} = $start_codon;
190  }
191  elsif ($initial_exon_phase > 0) {
192  $results->{'START_MISSING'}{'INITIAL_PHASE'} = $initial_exon_phase;
193  }
194  }
195  }
196  elsif ($start_codon ne 'ATG') {
197  $results->{'START_MISSING'}{'ABSENT'} = $start_codon;
198  }
199 
200  }
201  return $results;
202 }
203 
204 =head2 get_havana_seleno_comments
205 
206  Args : none
207  Example : my $results = $support->get_havana_seleno_comments
208  Description: parses the HEREDOC containing Havana comments in this module
209  Returntype : hashref
210 
211 =cut
212 
213 sub get_havana_seleno_comments {
214  my $seen_translations;
215  while (<DATA>) {
216  next if /^\s+$/ or /#+/;
217  my ($obj,$comment) = split /=/;
218  $obj =~ s/^\s+|\s+$//g;
219  $comment =~ s/^\s+|\s+$//g;
220  # We add the origin as now "seen" can come from a number of places, and have
221  # a number of consequences in different cases, not just discounted Secs from this method. -- ds23
222  $seen_translations->{$obj} = [ $comment,"notsec-havana" ];
223  }
224  return $seen_translations;
225 }
226 
227 sub check_for_stops {
228  my $support = shift;
229  my ($gene,$seen_transcripts,$log_object) = @_;
230  my $transcripts;
231  my $has_log_object=defined($log_object);
232  if($has_log_object){
233  my @help = $log_object->species_params->get_trans($gene->stable_id);
234  $transcripts=\@help;
235  }else{
236  $log_object=$support;
237  $transcripts=$gene->get_all_Transcripts;
238  }
239 
240  my $gname = $gene->get_all_Attributes('name')->[0]->value;
241  my $gsi = $gene->stable_id;
242  my $scodon = 'TGA';
243  my $mod_date = $support->date_format( $gene->modified_date,'%d/%m/%y' );
244  my $hidden_remak_ttributes;
245  TRANS:
246  foreach my $trans (@$transcripts) {
247  my $tsi = $trans->stable_id;
248  my $tID = $trans->dbID;
249  my $tname = $trans->get_all_Attributes('name')->[0]->value;
250  if($has_log_object){
251  $hidden_remak_ttributes=$log_object->species_params->get_attributes->{$tsi}->{'hidden_remark'};
252  }else{
253  $hidden_remak_ttributes=$trans->get_all_Attributes('hidden_remark');
254  }
255  foreach my $rem (@$hidden_remak_ttributes) {
256  if ($rem->value =~ /not_for_Vega/) {
257  #$support->log_verbose("Skipping transcript $tname ($tsi) since 'not_for_Vega'\n",1);
258  $log_object->_save_log('log_verbose', '', $gsi, '', $tsi, '', "Skipping transcript $tname ($tsi) since 'not_for_Vega'");
259  next TRANS;
260  }
261  }
262 
263  # go no further if there is a ribosomal framshift attribute
264  foreach my $attrib (@{$trans->get_all_Attributes('_rib_frameshift')}) {
265  if ($attrib->value) {
266  $log_object->_save_log('log', '', $gsi, '', $tsi, '', "Skipping $tsi ($tname) since it has a ribosomal frameshift attribute");
267  next TRANS;
268  }
269  }
270  foreach my $attrib (@{$trans->get_all_Attributes('hidden_remark')}) {
271  if ($attrib->value eq 'ribosomal_frameshift') {
272  $log_object->_save_log('log', '', $gsi, '', $tsi, '', "Skipping $tsi ($tname) since it has a ribosomal frameshift hidden_remark");
273  next TRANS;
274  }
275  }
276 
277  #$support->log_verbose("Studying transcript $tsi ($tname, $tID)\n",1);
278  $log_object->_save_log('log_verbose', '', $gsi, '', $tsi, '', "Studying transcript $tsi ($tname, $tID)");
279  my $peptide;
280 
281  # go no further if the transcript doesn't translate or if there are no stops
282  next TRANS unless ($peptide = $trans->translate);
283  my $pseq = $peptide->seq;
284  my $orig_seq = $pseq;
285  # (translate method trims stops from sequence end)
286 
287  next TRANS unless ($pseq =~ /\*/);
288  next TRANS if ($trans->biotype eq 'polymorphic_pseudogene' or
289  $trans->biotype =~ /processed_pseudogene$/);
290 
291  # warn sprintf("Stop codon is '%s'\n",substr($trans->translateable_seq,-3));
292  #$support->log_verbose("Stops found in $tsi ($tname)\n",1);
293  $log_object->_save_log('log_verbose', '', $gsi, '', $tsi, '', "Stops found in $tsi ($tname)");
294 
295  # find out where and how many stops there are
296  my @found_stops;
297  my $mrna = $trans->translateable_seq;
298  my $offset = 0;
299  my $tstop;
300  while ($pseq =~ /^([^\*]*)\*(.*)/) {
301  my $pseq1_f = $1;
302  $pseq = $2;
303  my $seq_flag = 0;
304  $offset += length($pseq1_f) * 3;
305  my $stop = substr($mrna, $offset, 3);
306  my $aaoffset = int($offset / 3)+1;
307  push(@found_stops, [ $stop, $aaoffset ]);
308  $tstop .= "$aaoffset ";
309  $offset += 3;
310  }
311 
312  # are all stops TGA...?
313  my $num_stops = scalar(@found_stops);
314  my $num_tga = 0;
315  my $positions;
316  foreach my $stop (@found_stops) {
317  $positions .= $stop->[0]."(".$stop->[1].") ";
318  if ($stop->[0] eq $scodon) {
319  $num_tga++;
320  }
321  }
322  my $source = $gene->source;
323  #...no - an internal stop codon error in the database...
324  if ($num_tga < $num_stops) {
325  if ($source eq 'havana') {
326  #$support->log_warning("INTERNAL STOPS HAVANA: Transcript $tsi ($tname) from gene $gname has non \'$scodon\' stop codons [$mod_date]:\nSequence = $orig_seq\nStops at $positions)\n\n");
327  $log_object->_save_log('log_warning', '', $gsi, 'TRANSCRIPT', $tsi, 'VQCT_internal_stop', "INTERNAL STOPS HAVANA: Transcript $tsi ($tname) from gene $gname has non \'$scodon\' stop codons [$mod_date]: Sequence = $orig_seq Stops at $positions)");
328  }
329  else {
330  #$support->log_warning("INTERNAL STOPS EXTERNAL: Transcript $tsi ($tname) from gene $gname has non \'$scodon\' stop codons[$mod_date]:\nSequence = $orig_seq\nStops at $positions)\n\n");
331  $log_object->_save_log('log_warning', '', $gsi, 'TRANSCRIPT', $tsi, 'VQCT_internal_stop', "INTERNAL STOPS EXTERNAL: Transcript $tsi ($tname) from gene $gname has non \'$scodon\' stop codons[$mod_date]: Sequence = $orig_seq Stops at $positions)");
332  }
333  }
334  #...yes - check remarks
335  else {
336  my $flag_remark = 0; # 1 if word seleno has been used
337  my $flag_remark2 = 0; # 1 if existing remark has correct numbering
338  my $alabel = 'Annotation_remark- selenocysteine ';
339  my $alabel2 = 'selenocysteine ';
340  my $annot_stops;
341  my $remarks;
342  my $att;
343  #get both hidden_remarks and remarks
344  foreach my $remark_type ('remark','hidden_remark') {
345  if($has_log_object){
346  $att=$log_object->species_params->get_attributes->{$trans->stable_id}->{$remark_type};
347  }else{
348  $att=$trans->get_all_Attributes($remark_type)
349  }
350  foreach my $attrib ( @$att) {
351  push @{$remarks->{$remark_type}}, $attrib->value;
352  }
353  }
354  #parse remarks to check syntax for location of edits
355  while (my ($attrib,$remarks)= each %$remarks) {
356  foreach my $text (@{$remarks}) {
357  if ( ($attrib eq 'remark') && ($text=~/^$alabel(.*)/) ){
358  #$support->log_warning("seleno remark for $tsi stored as Annotation_remark not hidden remark) [$mod_date]\n");
359  $log_object->_save_log('log_warning', '', $gsi, '', $tsi, 'VQCT_wrong_selC_coord', "seleno remark for $tsi stored as Annotation_remark not hidden remark) [$mod_date]");
360  $annot_stops=$1;
361  }
362  elsif ($text =~ /^$alabel2(.*)/) {
363  my $maybe = $1;
364  if($maybe =~ /^\s*\d+(\s+\d+)*\s*$/) {
365  $annot_stops=$maybe;
366  } else {
367  $log_object->_save_log('log', '', $gene->stable_id, '', $tsi, '', "Maybe annotated stop in incorrect format, maybe just a remark that happens to begin '$alabel2'".
368  " -- might need to investigate: '$alabel2$maybe' [$mod_date]");
369  }
370  }
371  }
372  }
373 
374  #check the location of the annotated edits matches actual stops in the sequence
375  my @annotated_stops;
376  if ($annot_stops){
377  my $i = 0;
378  foreach my $offset (split(/\s+/, $annot_stops)) {
379  #OK if it matches a known stop
380  if (
381  defined($found_stops[$i]) && defined($found_stops[$i]->[1]) && ($found_stops[$i]->[1] == $offset)) {
382  push @annotated_stops, $offset;
383  }
384  # catch old annotations where number was in DNA not peptide coordinates
385  elsif (defined($found_stops[$i]) && defined($found_stops[$i]->[1]) && (($found_stops[$i]->[1] * 3) == $offset)) {
386  $log_object->_save_log('log_warning', '', $gene->stable_id, 'DNA', $tsi, 'VQCT_wrong_selC_coord', "DNA: Annotated stop for transcript tsi ($tname) is in DNA not peptide coordinates) [$mod_date]");
387  }
388  # catch old annotations where number off by one
389  elsif (defined($found_stops[$i]) && defined($found_stops[$i]->[1]) && (($found_stops[$i]->[1]) == $offset+1)) {
390  $log_object->_save_log('log_warning', '', $gene->stable_id, 'PEPTIDE', $tsi, 'VQCT_wrong_selC_coord', "PEPTIDE: Annotated stop for transcript $tsi ($tname) is out by one) [$mod_date]");
391  }
392  elsif (defined($offset) && ($offset=~/^\d+$/)){
393  if ($offset == length($orig_seq)+1) {
394  if($seen_transcripts->{$tsi} && $seen_transcripts->{$tsi}->[1] eq 'known-tga-stop') {
395  $log_object->_save_log('log', '', $gene->stable_id, 'TRANSCRIPT', $tsi, '', "Annotated stop for transcript $tsi ($tname) known to be a stop codon. Ok. [$mod_date]");
396  } elsif($seen_transcripts->{$tsi} && $seen_transcripts->{$tsi}->[1] eq 'known-terminal-sec') {
397  $log_object->_save_log('log', '', $gene->stable_id, 'TRANSCRIPT', $tsi, '', "Annotated stop for transcript $tsi ($tname) known to be a terminal Sec. Ok. [$mod_date]");
398  } else {
399  $log_object->_save_log('log_warning', '', $gene->stable_id, 'TRANSCRIPT', $tsi, '', "Annotated stop for transcript $tsi ($tname) \"$offset\" matches actual stop codon yet has no entry in script config to disambiguate it. Please investigate and add appropriate entry to config arrays in add_selcys.pl. [$mod_date]");
400  }
401  }
402  else {
403  $log_object->_save_log('log_warning', '', $gene->stable_id, 'TRANSCRIPT', $tsi, 'VQCT_wrong_selC_coord', "Annotated stop for transcript $tsi ($tname) \"$offset\" does not match a TGA codon) [$mod_date]");
404  push @annotated_stops, $offset;
405  }
406  }
407  $i++;
408  }
409  }
410 
411  #check location of found stops matches annotated ones - any new ones are reported
412  foreach my $stop ( @found_stops ) {
413  my $pos = $stop->[1];
414  my $seq = $stop->[0];
415  unless ( grep { $pos == $_} @annotated_stops) {
416  if ($seen_transcripts->{$tsi} && $seen_transcripts->{$tsi}->[1] eq 'notsec-havana') {
417  #$support->log_verbose("Transcript $tsi ($tname) has potential selenocysteines but has been discounted by annotators:\n\t".$seen_transcripts->{$tsi}.") [$mod_date]\n");
418  $log_object->_save_log('log_verbose', '', $gene->stable_id, '', $tsi, 'VQCT_pot_selC', "Transcript $tsi ($tname) has potential selenocysteines but has been discounted by annotators: ".$seen_transcripts->{$tsi}->[0].") [$mod_date]");
419  }
420  else {
421  #$support->log("POTENTIAL SELENO ($seq) in $tsi ($tname, gene $gname) found at $pos [$mod_date]\n");
422  $log_object->_save_log('log', '', $gene->stable_id, '', $tsi, 'VQCT_pot_selC', "POTENTIAL SELENO ($seq) in $tsi ($tname, gene $gname) found at $pos [$mod_date]");
423  }
424  }
425  }
426  }
427  }
428 }
429 sub _save_log{
430  my $self=shift;
431  my $log_type = shift;
432  my $chrom_name=shift || '';
433  my $gsi=shift || '';
434  my $type=shift || '';
435  my $tsi=shift || '';
436  my $tag=shift || '';
437  my $txt=shift || '';
438  $self->$log_type($txt."\n");
439 }
440 
441 #details of annotators comments
442 __DATA__
443 OTTHUMT00000144659 = FIXED- changed to transcript
444 OTTHUMT00000276377 = FIXED- changed to transcript
445 OTTHUMT00000257741 = FIXED- changed to nmd
446 OTTHUMT00000155694 = NOT_FIXED- should be nmd but external annotation but cannot be fixed
447 OTTHUMT00000155695 = NOT_FIXED- should be nmd but external annotation but cannot be fixed
448 OTTHUMT00000282573 = FIXED- changed to unprocessed pseudogene
449 OTTHUMT00000285227 = FIXED- changed start site
450 OTTHUMT00000151008 = FIXED- incorrect trimming of CDS, removed extra stop codon
451 OTTHUMT00000157999 = FIXED- changed incorrect stop
452 OTTHUMT00000150523 = FIXED- incorrect trimming of CDS
453 OTTHUMT00000150525 = FIXED- incorrect trimming of CDS
454 OTTHUMT00000150522 = FIXED- incorrect trimming of CDS
455 OTTHUMT00000150521 = FIXED- incorrect trimming of CDS
456 OTTHUMT00000246819 = FIXED- corrected frame
457 OTTHUMT00000314078 = FIXED- corrected frame
458 OTTHUMT00000080133 = FIXED- corrected frame
459 OTTHUMT00000286423 = FIXED- changed to transcript
460 OTTMUST00000055509 = FIXED- error
461 OTTMUST00000038729 = FIXED- corrected frame
462 OTTMUST00000021760 = FIXED- corrected frame
463 OTTMUST00000023057 = FIXED- corrected frame
464 OTTMUST00000015207 = FIXED- corrected frame
465 OTTMUST00000056646 = FIXED- error
466 OTTMUST00000059686 = FIXED- corrected frame
467 OTTMUST00000013426 = FIXED- corrected frame
468 OTTMUST00000044412 = FIXED- error
transcript
public transcript()
Bio::EnsEMBL::Utils::VegaCuration::Transcript
Definition: Transcript.pm:14