ensembl-hive  2.8.1
Translation.pm
Go to the documentation of this file.
1 =head1 LICENSE
2 
3 See the NOTICE file distributed with this work for additional information
4 regarding copyright ownership.
5 
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9 
10  http://www.apache.org/licenses/LICENSE-2.0
11 
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 
18 =cut
19 
20 =head1 CONTACT
21  Please email comments or questions to the public Ensembl
22  developers list at <http://lists.ensembl.org/mailman/listinfo/dev>.
23  Questions may also be sent to the Ensembl help desk at
24  <http://www.ensembl.org/Help/Contact>.
25 
26 =cut
27 =head1 NAME
28 =head1 SYNOPSIS
29 =head1 DESCRIPTION
30 =head1 METHODS
31 =cut
32 
33 package Bio::EnsEMBL::Utils::VegaCuration::Translation;
34 
35 use strict;
36 use warnings;
37 use vars qw(@ISA);
38 use Data::Dumper;
41 
42 =head2 check_CDS_start_end_remarks
43 
44  Args : B::E::Transcript
45  Example : my $results = $support->check_CDS_end_remarks($transcript)
46  Description: identifies incorrect 'CDS end...' transcript remarks in a
47  otter-derived Vega database
48  Returntype : hashref
49 
50 =cut
51 
52 sub check_CDS_start_end_remarks {
53  my $self = shift;
54  my $trans = shift;
55  # info for checking
56  my @remarks = @{$trans->get_all_Attributes('remark')};
57  my $coding_end = $trans->cdna_coding_end;
58  my $coding_start = $trans->cdna_coding_start;
59  my $trans_end = $trans->length;
60  my $trans_seq = $trans->seq->seq;
61  my $stop_codon = substr($trans_seq, $coding_end-3, 3);
62  my $start_codon = substr($trans_seq, $coding_start-1, 3);
63  #hashref to return results
64  my $results;
65 
66  #extra CDS end not found remarks
67  if (grep {$_->value eq 'CDS end not found'} @remarks) {
68  if ( ($coding_end != $trans_end)
69  && ( grep {$_ eq $stop_codon} qw(TGA TAA TAG) ) ) {
70  $results->{'END_EXTRA'} = 1;
71  }
72  }
73  #missing CDS end not found remark
74  if ( $coding_end == $trans_end ) {
75  if (! grep {$_->value eq 'CDS end not found'} @remarks) {
76  if (grep {$_ eq $stop_codon} qw(TGA TAA TAG)) {
77  $results->{'END_MISSING_2'} = 1;
78  }
79  else {
80  $results->{'END_MISSING_1'} = $stop_codon;
81  }
82  }
83  }
84  #extra CDS start not found remark
85  if (grep {$_->value eq 'CDS start not found'} @remarks) {
86  if ( ($coding_start != 1)
87  && ($start_codon eq 'ATG') ) {
88  $results->{'START_EXTRA'} = 1;
89  }
90  }
91  #missing CDS start not found remark
92  if ( $coding_start == 1) {
93  if ( ! grep {$_->value eq 'CDS start not found'} @remarks) {
94  if ($start_codon eq 'ATG') {
95  $results->{'START_MISSING_2'} = 1;
96  } else {
97  $results->{'START_MISSING_1'} = $start_codon;
98  }
99  }
100  }
101  return $results;
102 }
103 
104 =head2 check_CDS_start_end_remarks_loutre
105 
106  Args : B::E::Transcript
107  Example : my $results = $support->check_CDS_end_remarks($transcript)
108  Description: identifies incorrect 'CDS end...' transcript attribs in a loutre
109  of a loutre-derived Vega database.
110  Returntype : hashref
111 
112 =cut
113 
114 sub check_CDS_start_end_remarks_loutre {
115  my $self = shift;
116  my $trans = shift;
117 
118  # info for checking
119  my @stops = qw(TGA TAA TAG);
120  my %attributes;
121  foreach my $attribute (@{$trans->get_all_Attributes()}) {
122  push @{$attributes{$attribute->code}}, $attribute;
123  }
124 # warn $trans->stable_id;
125 # warn Data::Dumper::Dumper(\%attributes);
126  my $coding_end = $trans->cdna_coding_end;
127  my $coding_start = $trans->cdna_coding_start;
128  my $trans_end = $trans->length;
129  my $trans_seq = $trans->seq->seq;
130  my $stop_codon_offset = 3 + $trans->translation->end_Exon->end_phase;
131  my $initial_exon_phase = $trans->translation->start_Exon->phase;
132  my $stop_codon = substr($trans_seq, $coding_end-3, 3);
133  my $start_codon = substr($trans_seq, $coding_start-1, 3);
134 
135  my $start_codon_incorrect = 1;
136  if ($start_codon eq 'ATG' ) {
137  $start_codon_incorrect = 0;
138  }
139  elsif ($start_codon eq 'CTG') {
140  foreach my $attrib (@{$attributes{'remark'}}) {
141  if ($attrib->value =~ /non[- ]ATG start/) {
142  $start_codon_incorrect = 0;
143  }
144  }
145  }
146 
147 # warn "$start_codon -- $initial_exon_phase -- $coding_start -- $start_codon_incorrect";
148 
149  #hashref to return results
150  my $results;
151 
152  #extra CDS end not found remarks
153  if ($attributes{'cds_end_NF'}) {
154  if ( ($attributes{'cds_end_NF'}->[0]->value == 1)
155  && ($coding_end != $trans_end)
156  && ( grep {$_ eq $stop_codon} @stops) ) {
157 # warn $trans->stable_id.": $coding_end--$trans_end--$stop_codon";
158 # warn $trans->translation->end_Exon->end_phase;
159  $results->{'END_EXTRA'} = $stop_codon;
160  }
161  }
162  #missing CDS end not found remark
163  if ( $coding_end == $trans_end ) {
164  if ($attributes{'cds_end_NF'}) {
165  if ($attributes{'cds_end_NF'}->[0]->value == 0 ) {
166  if (! grep {$_ eq $stop_codon} @stops) {
167 # warn $trans->translation->end_Exon->end_phase;
168  $results->{'END_MISSING'}{'WRONG'} = $stop_codon;
169  }
170  }
171  }
172  elsif (! grep {$_ eq $stop_codon} @stops) {
173  $results->{'END_MISSING'}{'ABSENT'} = $stop_codon;
174  }
175  }
176  #extra CDS start not found remark
177  if ( $attributes{'cds_start_NF'}) {
178  if ( ($attributes{'cds_start_NF'}->[0]->value == 1 )
179  && (! $start_codon_incorrect)) {
180  unless ( ($coding_start == 1) && ( $initial_exon_phase > 0)) {
181  $results->{'START_EXTRA'} = $start_codon;
182  }
183  }
184  }
185  #missing CDS start not found remark
186  if ( $coding_start == 1) {
187  if ( $attributes{'cds_start_NF'} ) {
188  if ( $attributes{'cds_start_NF'}->[0]->value == 0 ) {
189  if ($start_codon_incorrect) {
190  $results->{'START_MISSING'}{'ABSENT'} = $start_codon;
191  }
192  elsif ($initial_exon_phase > 0) {
193  $results->{'START_MISSING'}{'INITIAL_PHASE'} = $initial_exon_phase;
194  }
195  }
196  }
197  elsif ($start_codon ne 'ATG') {
198  $results->{'START_MISSING'}{'ABSENT'} = $start_codon;
199  }
200 
201  }
202  return $results;
203 }
204 
205 =head2 get_havana_seleno_comments
206 
207  Args : none
208  Example : my $results = $support->get_havana_seleno_comments
209  Description: parses the HEREDOC containing Havana comments in this module
210  Returntype : hashref
211 
212 =cut
213 
214 sub get_havana_seleno_comments {
215  my $seen_translations;
216  while (<DATA>) {
217  next if /^\s+$/ or /#+/;
218  my ($obj,$comment) = split /=/;
219  $obj =~ s/^\s+|\s+$//g;
220  $comment =~ s/^\s+|\s+$//g;
221  # We add the origin as now "seen" can come from a number of places, and have
222  # a number of consequences in different cases, not just discounted Secs from this method. -- ds23
223  $seen_translations->{$obj} = [ $comment,"notsec-havana" ];
224  }
225  return $seen_translations;
226 }
227 
228 sub check_for_stops {
229  my $support = shift;
230  my ($gene,$seen_transcripts,$log_object) = @_;
231  my $transcripts;
232  my $has_log_object=defined($log_object);
233  if($has_log_object){
234  my @help = $log_object->species_params->get_trans($gene->stable_id);
235  $transcripts=\@help;
236  }else{
237  $log_object=$support;
238  $transcripts=$gene->get_all_Transcripts;
239  }
240 
241  my $gname = $gene->get_all_Attributes('name')->[0]->value;
242  my $gsi = $gene->stable_id;
243  my $scodon = 'TGA';
244  my $mod_date = $support->date_format( $gene->modified_date,'%d/%m/%y' );
245  my $hidden_remak_ttributes;
246  TRANS:
247  foreach my $trans (@$transcripts) {
248  my $tsi = $trans->stable_id;
249  my $tID = $trans->dbID;
250  my $tname = $trans->get_all_Attributes('name')->[0]->value;
251  if($has_log_object){
252  $hidden_remak_ttributes=$log_object->species_params->get_attributes->{$tsi}->{'hidden_remark'};
253  }else{
254  $hidden_remak_ttributes=$trans->get_all_Attributes('hidden_remark');
255  }
256  foreach my $rem (@$hidden_remak_ttributes) {
257  if ($rem->value =~ /not_for_Vega/) {
258  #$support->log_verbose("Skipping transcript $tname ($tsi) since 'not_for_Vega'\n",1);
259  $log_object->_save_log('log_verbose', '', $gsi, '', $tsi, '', "Skipping transcript $tname ($tsi) since 'not_for_Vega'");
260  next TRANS;
261  }
262  }
263 
264  # go no further if there is a ribosomal framshift attribute
265  foreach my $attrib (@{$trans->get_all_Attributes('_rib_frameshift')}) {
266  if ($attrib->value) {
267  $log_object->_save_log('log', '', $gsi, '', $tsi, '', "Skipping $tsi ($tname) since it has a ribosomal frameshift attribute");
268  next TRANS;
269  }
270  }
271  foreach my $attrib (@{$trans->get_all_Attributes('hidden_remark')}) {
272  if ($attrib->value eq 'ribosomal_frameshift') {
273  $log_object->_save_log('log', '', $gsi, '', $tsi, '', "Skipping $tsi ($tname) since it has a ribosomal frameshift hidden_remark");
274  next TRANS;
275  }
276  }
277 
278  #$support->log_verbose("Studying transcript $tsi ($tname, $tID)\n",1);
279  $log_object->_save_log('log_verbose', '', $gsi, '', $tsi, '', "Studying transcript $tsi ($tname, $tID)");
280  my $peptide;
281 
282  # go no further if the transcript doesn't translate or if there are no stops
283  next TRANS unless ($peptide = $trans->translate);
284  my $pseq = $peptide->seq;
285  my $orig_seq = $pseq;
286  # (translate method trims stops from sequence end)
287 
288  next TRANS unless ($pseq =~ /\*/);
289  next TRANS if ($trans->biotype eq 'polymorphic_pseudogene' or
290  $trans->biotype =~ /processed_pseudogene$/);
291 
292  # warn sprintf("Stop codon is '%s'\n",substr($trans->translateable_seq,-3));
293  #$support->log_verbose("Stops found in $tsi ($tname)\n",1);
294  $log_object->_save_log('log_verbose', '', $gsi, '', $tsi, '', "Stops found in $tsi ($tname)");
295 
296  # find out where and how many stops there are
297  my @found_stops;
298  my $mrna = $trans->translateable_seq;
299  my $offset = 0;
300  my $tstop;
301  while ($pseq =~ /^([^\*]*)\*(.*)/) {
302  my $pseq1_f = $1;
303  $pseq = $2;
304  my $seq_flag = 0;
305  $offset += length($pseq1_f) * 3;
306  my $stop = substr($mrna, $offset, 3);
307  my $aaoffset = int($offset / 3)+1;
308  push(@found_stops, [ $stop, $aaoffset ]);
309  $tstop .= "$aaoffset ";
310  $offset += 3;
311  }
312 
313  # are all stops TGA...?
314  my $num_stops = scalar(@found_stops);
315  my $num_tga = 0;
316  my $positions;
317  foreach my $stop (@found_stops) {
318  $positions .= $stop->[0]."(".$stop->[1].") ";
319  if ($stop->[0] eq $scodon) {
320  $num_tga++;
321  }
322  }
323  my $source = $gene->source;
324  #...no - an internal stop codon error in the database...
325  if ($num_tga < $num_stops) {
326  if ($source eq 'havana') {
327  #$support->log_warning("INTERNAL STOPS HAVANA: Transcript $tsi ($tname) from gene $gname has non \'$scodon\' stop codons [$mod_date]:\nSequence = $orig_seq\nStops at $positions)\n\n");
328  $log_object->_save_log('log_warning', '', $gsi, 'TRANSCRIPT', $tsi, 'VQCT_internal_stop', "INTERNAL STOPS HAVANA: Transcript $tsi ($tname) from gene $gname has non \'$scodon\' stop codons [$mod_date]: Sequence = $orig_seq Stops at $positions)");
329  }
330  else {
331  #$support->log_warning("INTERNAL STOPS EXTERNAL: Transcript $tsi ($tname) from gene $gname has non \'$scodon\' stop codons[$mod_date]:\nSequence = $orig_seq\nStops at $positions)\n\n");
332  $log_object->_save_log('log_warning', '', $gsi, 'TRANSCRIPT', $tsi, 'VQCT_internal_stop', "INTERNAL STOPS EXTERNAL: Transcript $tsi ($tname) from gene $gname has non \'$scodon\' stop codons[$mod_date]: Sequence = $orig_seq Stops at $positions)");
333  }
334  }
335  #...yes - check remarks
336  else {
337  my $flag_remark = 0; # 1 if word seleno has been used
338  my $flag_remark2 = 0; # 1 if existing remark has correct numbering
339  my $alabel = 'Annotation_remark- selenocysteine ';
340  my $alabel2 = 'selenocysteine ';
341  my $annot_stops;
342  my $remarks;
343  my $att;
344  #get both hidden_remarks and remarks
345  foreach my $remark_type ('remark','hidden_remark') {
346  if($has_log_object){
347  $att=$log_object->species_params->get_attributes->{$trans->stable_id}->{$remark_type};
348  }else{
349  $att=$trans->get_all_Attributes($remark_type)
350  }
351  foreach my $attrib ( @$att) {
352  push @{$remarks->{$remark_type}}, $attrib->value;
353  }
354  }
355  #parse remarks to check syntax for location of edits
356  while (my ($attrib,$remarks)= each %$remarks) {
357  foreach my $text (@{$remarks}) {
358  if ( ($attrib eq 'remark') && ($text=~/^$alabel(.*)/) ){
359  #$support->log_warning("seleno remark for $tsi stored as Annotation_remark not hidden remark) [$mod_date]\n");
360  $log_object->_save_log('log_warning', '', $gsi, '', $tsi, 'VQCT_wrong_selC_coord', "seleno remark for $tsi stored as Annotation_remark not hidden remark) [$mod_date]");
361  $annot_stops=$1;
362  }
363  elsif ($text =~ /^$alabel2(.*)/) {
364  my $maybe = $1;
365  if($maybe =~ /^\s*\d+(\s+\d+)*\s*$/) {
366  $annot_stops=$maybe;
367  } else {
368  $log_object->_save_log('log', '', $gene->stable_id, '', $tsi, '', "Maybe annotated stop in incorrect format, maybe just a remark that happens to begin '$alabel2'".
369  " -- might need to investigate: '$alabel2$maybe' [$mod_date]");
370  }
371  }
372  }
373  }
374 
375  #check the location of the annotated edits matches actual stops in the sequence
376  my @annotated_stops;
377  if ($annot_stops){
378  my $i = 0;
379  foreach my $offset (split(/\s+/, $annot_stops)) {
380  #OK if it matches a known stop
381  if (
382  defined($found_stops[$i]) && defined($found_stops[$i]->[1]) && ($found_stops[$i]->[1] == $offset)) {
383  push @annotated_stops, $offset;
384  }
385  # catch old annotations where number was in DNA not peptide coordinates
386  elsif (defined($found_stops[$i]) && defined($found_stops[$i]->[1]) && (($found_stops[$i]->[1] * 3) == $offset)) {
387  $log_object->_save_log('log_warning', '', $gene->stable_id, 'DNA', $tsi, 'VQCT_wrong_selC_coord', "DNA: Annotated stop for transcript tsi ($tname) is in DNA not peptide coordinates) [$mod_date]");
388  }
389  # catch old annotations where number off by one
390  elsif (defined($found_stops[$i]) && defined($found_stops[$i]->[1]) && (($found_stops[$i]->[1]) == $offset+1)) {
391  $log_object->_save_log('log_warning', '', $gene->stable_id, 'PEPTIDE', $tsi, 'VQCT_wrong_selC_coord', "PEPTIDE: Annotated stop for transcript $tsi ($tname) is out by one) [$mod_date]");
392  }
393  elsif (defined($offset) && ($offset=~/^\d+$/)){
394  if ($offset == length($orig_seq)+1) {
395  if($seen_transcripts->{$tsi} && $seen_transcripts->{$tsi}->[1] eq 'known-tga-stop') {
396  $log_object->_save_log('log', '', $gene->stable_id, 'TRANSCRIPT', $tsi, '', "Annotated stop for transcript $tsi ($tname) known to be a stop codon. Ok. [$mod_date]");
397  } elsif($seen_transcripts->{$tsi} && $seen_transcripts->{$tsi}->[1] eq 'known-terminal-sec') {
398  $log_object->_save_log('log', '', $gene->stable_id, 'TRANSCRIPT', $tsi, '', "Annotated stop for transcript $tsi ($tname) known to be a terminal Sec. Ok. [$mod_date]");
399  } else {
400  $log_object->_save_log('log_warning', '', $gene->stable_id, 'TRANSCRIPT', $tsi, '', "Annotated stop for transcript $tsi ($tname) \"$offset\" matches actual stop codon yet has no entry in script config to disambiguate it. Please investigate and add appropriate entry to config arrays in add_selcys.pl. [$mod_date]");
401  }
402  }
403  else {
404  $log_object->_save_log('log_warning', '', $gene->stable_id, 'TRANSCRIPT', $tsi, 'VQCT_wrong_selC_coord', "Annotated stop for transcript $tsi ($tname) \"$offset\" does not match a TGA codon) [$mod_date]");
405  push @annotated_stops, $offset;
406  }
407  }
408  $i++;
409  }
410  }
411 
412  #check location of found stops matches annotated ones - any new ones are reported
413  foreach my $stop ( @found_stops ) {
414  my $pos = $stop->[1];
415  my $seq = $stop->[0];
416  unless ( grep { $pos == $_} @annotated_stops) {
417  if ($seen_transcripts->{$tsi} && $seen_transcripts->{$tsi}->[1] eq 'notsec-havana') {
418  #$support->log_verbose("Transcript $tsi ($tname) has potential selenocysteines but has been discounted by annotators:\n\t".$seen_transcripts->{$tsi}.") [$mod_date]\n");
419  $log_object->_save_log('log_verbose', '', $gene->stable_id, '', $tsi, 'VQCT_pot_selC', "Transcript $tsi ($tname) has potential selenocysteines but has been discounted by annotators: ".$seen_transcripts->{$tsi}->[0].") [$mod_date]");
420  }
421  else {
422  #$support->log("POTENTIAL SELENO ($seq) in $tsi ($tname, gene $gname) found at $pos [$mod_date]\n");
423  $log_object->_save_log('log', '', $gene->stable_id, '', $tsi, 'VQCT_pot_selC', "POTENTIAL SELENO ($seq) in $tsi ($tname, gene $gname) found at $pos [$mod_date]");
424  }
425  }
426  }
427  }
428  }
429 }
430 sub _save_log{
431  my $self=shift;
432  my $log_type = shift;
433  my $chrom_name=shift || '';
434  my $gsi=shift || '';
435  my $type=shift || '';
436  my $tsi=shift || '';
437  my $tag=shift || '';
438  my $txt=shift || '';
439  $self->$log_type($txt."\n");
440 }
441 
442 #details of annotators comments
443 __DATA__
444 OTTHUMT00000144659 = FIXED- changed to transcript
445 OTTHUMT00000276377 = FIXED- changed to transcript
446 OTTHUMT00000257741 = FIXED- changed to nmd
447 OTTHUMT00000155694 = NOT_FIXED- should be nmd but external annotation but cannot be fixed
448 OTTHUMT00000155695 = NOT_FIXED- should be nmd but external annotation but cannot be fixed
449 OTTHUMT00000282573 = FIXED- changed to unprocessed pseudogene
450 OTTHUMT00000285227 = FIXED- changed start site
451 OTTHUMT00000151008 = FIXED- incorrect trimming of CDS, removed extra stop codon
452 OTTHUMT00000157999 = FIXED- changed incorrect stop
453 OTTHUMT00000150523 = FIXED- incorrect trimming of CDS
454 OTTHUMT00000150525 = FIXED- incorrect trimming of CDS
455 OTTHUMT00000150522 = FIXED- incorrect trimming of CDS
456 OTTHUMT00000150521 = FIXED- incorrect trimming of CDS
457 OTTHUMT00000246819 = FIXED- corrected frame
458 OTTHUMT00000314078 = FIXED- corrected frame
459 OTTHUMT00000080133 = FIXED- corrected frame
460 OTTHUMT00000286423 = FIXED- changed to transcript
461 OTTMUST00000055509 = FIXED- error
462 OTTMUST00000038729 = FIXED- corrected frame
463 OTTMUST00000021760 = FIXED- corrected frame
464 OTTMUST00000023057 = FIXED- corrected frame
465 OTTMUST00000015207 = FIXED- corrected frame
466 OTTMUST00000056646 = FIXED- error
467 OTTMUST00000059686 = FIXED- corrected frame
468 OTTMUST00000013426 = FIXED- corrected frame
469 OTTMUST00000044412 = FIXED- error
transcript
public transcript()
Bio::EnsEMBL::Utils::VegaCuration::Transcript
Definition: Transcript.pm:14