3 See the NOTICE file distributed with
this work
for additional information
4 regarding copyright ownership.
6 Licensed under the Apache License, Version 2.0 (the
"License");
7 you may not use
this file except in compliance with the License.
8 You may obtain a copy of the License at
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an
"AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License
for the specific language governing permissions and
16 limitations under the License.
21 Please email comments or questions to the
public Ensembl
22 developers list at <http:
23 Questions may also be sent to the Ensembl help desk at
33 package Bio::EnsEMBL::Utils::VegaCuration::Translation;
42 =head2 check_CDS_start_end_remarks
44 Args : B::E::Transcript
45 Example : my $results = $support->check_CDS_end_remarks($transcript)
46 Description: identifies incorrect
'CDS end...' transcript remarks in a
47 otter-derived Vega database
52 sub check_CDS_start_end_remarks {
56 my @remarks = @{$trans->get_all_Attributes(
'remark')};
57 my $coding_end = $trans->cdna_coding_end;
58 my $coding_start = $trans->cdna_coding_start;
59 my $trans_end = $trans->length;
60 my $trans_seq = $trans->seq->seq;
61 my $stop_codon = substr($trans_seq, $coding_end-3, 3);
62 my $start_codon = substr($trans_seq, $coding_start-1, 3);
63 #hashref to return results
66 #extra CDS end not found remarks
67 if (grep {$_->value eq
'CDS end not found'} @remarks) {
68 if ( ($coding_end != $trans_end)
69 && ( grep {$_ eq $stop_codon} qw(TGA TAA TAG) ) ) {
70 $results->{
'END_EXTRA'} = 1;
73 #missing CDS end not found remark
74 if ( $coding_end == $trans_end ) {
75 if (! grep {$_->value eq
'CDS end not found'} @remarks) {
76 if (grep {$_ eq $stop_codon} qw(TGA TAA TAG)) {
77 $results->{
'END_MISSING_2'} = 1;
80 $results->{
'END_MISSING_1'} = $stop_codon;
84 #extra CDS start not found remark
85 if (grep {$_->value eq
'CDS start not found'} @remarks) {
86 if ( ($coding_start != 1)
87 && ($start_codon eq
'ATG') ) {
88 $results->{
'START_EXTRA'} = 1;
91 #missing CDS start not found remark
92 if ( $coding_start == 1) {
93 if ( ! grep {$_->value eq
'CDS start not found'} @remarks) {
94 if ($start_codon eq
'ATG') {
95 $results->{
'START_MISSING_2'} = 1;
97 $results->{
'START_MISSING_1'} = $start_codon;
104 =head2 check_CDS_start_end_remarks_loutre
106 Args : B::E::Transcript
107 Example : my $results = $support->check_CDS_end_remarks($transcript)
108 Description: identifies incorrect
'CDS end...' transcript attribs in a loutre
109 of a loutre-derived Vega database.
114 sub check_CDS_start_end_remarks_loutre {
119 my @stops = qw(TGA TAA TAG);
121 foreach my $attribute (@{$trans->get_all_Attributes()}) {
122 push @{$attributes{$attribute->code}}, $attribute;
124 # warn $trans->stable_id;
125 # warn Data::Dumper::Dumper(\%attributes);
126 my $coding_end = $trans->cdna_coding_end;
127 my $coding_start = $trans->cdna_coding_start;
128 my $trans_end = $trans->length;
129 my $trans_seq = $trans->seq->seq;
130 my $stop_codon_offset = 3 + $trans->translation->end_Exon->end_phase;
131 my $initial_exon_phase = $trans->translation->start_Exon->phase;
132 my $stop_codon = substr($trans_seq, $coding_end-3, 3);
133 my $start_codon = substr($trans_seq, $coding_start-1, 3);
135 my $start_codon_incorrect = 1;
136 if ($start_codon eq
'ATG' ) {
137 $start_codon_incorrect = 0;
139 elsif ($start_codon eq
'CTG') {
140 foreach my $attrib (@{$attributes{
'remark'}}) {
141 if ($attrib->value =~ /non[- ]ATG start/) {
142 $start_codon_incorrect = 0;
147 # warn "$start_codon -- $initial_exon_phase -- $coding_start -- $start_codon_incorrect";
149 #hashref to return results
152 #extra CDS end not found remarks
153 if ($attributes{
'cds_end_NF'}) {
154 if ( ($attributes{
'cds_end_NF'}->[0]->value == 1)
155 && ($coding_end != $trans_end)
156 && ( grep {$_ eq $stop_codon} @stops) ) {
157 # warn $trans->stable_id.": $coding_end--$trans_end--$stop_codon";
158 # warn $trans->translation->end_Exon->end_phase;
159 $results->{
'END_EXTRA'} = $stop_codon;
162 #missing CDS end not found remark
163 if ( $coding_end == $trans_end ) {
164 if ($attributes{
'cds_end_NF'}) {
165 if ($attributes{
'cds_end_NF'}->[0]->value == 0 ) {
166 if (! grep {$_ eq $stop_codon} @stops) {
167 # warn $trans->translation->end_Exon->end_phase;
168 $results->{
'END_MISSING'}{
'WRONG'} = $stop_codon;
172 elsif (! grep {$_ eq $stop_codon} @stops) {
173 $results->{
'END_MISSING'}{
'ABSENT'} = $stop_codon;
176 #extra CDS start not found remark
177 if ( $attributes{
'cds_start_NF'}) {
178 if ( ($attributes{
'cds_start_NF'}->[0]->value == 1 )
179 && (! $start_codon_incorrect)) {
180 unless ( ($coding_start == 1) && ( $initial_exon_phase > 0)) {
181 $results->{
'START_EXTRA'} = $start_codon;
185 #missing CDS start not found remark
186 if ( $coding_start == 1) {
187 if ( $attributes{
'cds_start_NF'} ) {
188 if ( $attributes{
'cds_start_NF'}->[0]->value == 0 ) {
189 if ($start_codon_incorrect) {
190 $results->{
'START_MISSING'}{
'ABSENT'} = $start_codon;
192 elsif ($initial_exon_phase > 0) {
193 $results->{
'START_MISSING'}{
'INITIAL_PHASE'} = $initial_exon_phase;
197 elsif ($start_codon ne
'ATG') {
198 $results->{
'START_MISSING'}{
'ABSENT'} = $start_codon;
205 =head2 get_havana_seleno_comments
208 Example : my $results = $support->get_havana_seleno_comments
209 Description: parses the HEREDOC containing Havana comments in
this module
214 sub get_havana_seleno_comments {
215 my $seen_translations;
217 next
if /^\s+$/ or /#+/;
218 my ($obj,$comment) = split /=/;
220 $comment =~ s/^\s+|\s+$
221 # We add the origin as now "seen" can come from a number of places, and have
222 # a number of consequences in different cases, not just discounted Secs from this method. -- ds23
223 $seen_translations->{$obj} = [ $comment,
"notsec-havana" ];
225 return $seen_translations;
228 sub check_for_stops {
230 my ($gene,$seen_transcripts,$log_object) = @_;
232 my $has_log_object=defined($log_object);
234 my @help = $log_object->species_params->get_trans($gene->stable_id);
237 $log_object=$support;
238 $transcripts=$gene->get_all_Transcripts;
241 my $gname = $gene->get_all_Attributes(
'name')->[0]->value;
242 my $gsi = $gene->stable_id;
244 my $mod_date = $support->date_format( $gene->modified_date,
'%d/%m/%y' );
245 my $hidden_remak_ttributes;
247 foreach my $trans (@$transcripts) {
248 my $tsi = $trans->stable_id;
249 my $tID = $trans->dbID;
250 my $tname = $trans->get_all_Attributes(
'name')->[0]->value;
252 $hidden_remak_ttributes=$log_object->species_params->get_attributes->{$tsi}->{
'hidden_remark'};
254 $hidden_remak_ttributes=$trans->get_all_Attributes(
'hidden_remark');
256 foreach my $rem (@$hidden_remak_ttributes) {
257 if ($rem->value =~ /not_for_Vega/) {
258 #$support->log_verbose("Skipping transcript $tname ($tsi) since 'not_for_Vega'\n",1);
259 $log_object->_save_log(
'log_verbose',
'', $gsi,
'', $tsi,
'',
"Skipping transcript $tname ($tsi) since 'not_for_Vega'");
264 # go no further if there is a ribosomal framshift attribute
265 foreach my $attrib (@{$trans->get_all_Attributes(
'_rib_frameshift')}) {
266 if ($attrib->value) {
267 $log_object->_save_log(
'log',
'', $gsi,
'', $tsi,
'',
"Skipping $tsi ($tname) since it has a ribosomal frameshift attribute");
271 foreach my $attrib (@{$trans->get_all_Attributes(
'hidden_remark')}) {
272 if ($attrib->value eq
'ribosomal_frameshift') {
273 $log_object->_save_log(
'log',
'', $gsi,
'', $tsi,
'',
"Skipping $tsi ($tname) since it has a ribosomal frameshift hidden_remark");
278 #$support->log_verbose("Studying transcript $tsi ($tname, $tID)\n",1);
279 $log_object->_save_log(
'log_verbose',
'', $gsi,
'', $tsi,
'',
"Studying transcript $tsi ($tname, $tID)");
282 # go no further if the transcript doesn't translate or if there are no stops
283 next TRANS unless ($peptide = $trans->translate);
284 my $pseq = $peptide->seq;
285 my $orig_seq = $pseq;
286 # (translate method trims stops from sequence end)
288 next TRANS unless ($pseq =~ /\*/);
289 next TRANS
if ($trans->biotype eq
'polymorphic_pseudogene' or
290 $trans->biotype =~ /processed_pseudogene$/);
292 # warn sprintf("Stop codon is '%s'\n",substr($trans->translateable_seq,-3));
293 #$support->log_verbose("Stops found in $tsi ($tname)\n",1);
294 $log_object->_save_log(
'log_verbose',
'', $gsi,
'', $tsi,
'',
"Stops found in $tsi ($tname)");
296 # find out where and how many stops there are
298 my $mrna = $trans->translateable_seq;
301 while ($pseq =~ /^([^\*]*)\*(.*)/) {
305 $offset += length($pseq1_f) * 3;
306 my $stop = substr($mrna, $offset, 3);
307 my $aaoffset = int($offset / 3)+1;
308 push(@found_stops, [ $stop, $aaoffset ]);
309 $tstop .=
"$aaoffset ";
313 # are all stops TGA...?
314 my $num_stops = scalar(@found_stops);
317 foreach my $stop (@found_stops) {
318 $positions .= $stop->[0].
"(".$stop->[1].
") ";
319 if ($stop->[0] eq $scodon) {
323 my $source = $gene->source;
324 #...no - an internal stop codon error in the database...
325 if ($num_tga < $num_stops) {
326 if ($source eq
'havana') {
327 #$support->log_warning("INTERNAL STOPS HAVANA: Transcript $tsi ($tname) from gene $gname has non \'$scodon\' stop codons [$mod_date]:\nSequence = $orig_seq\nStops at $positions)\n\n");
328 $log_object->_save_log(
'log_warning',
'', $gsi,
'TRANSCRIPT', $tsi,
'VQCT_internal_stop',
"INTERNAL STOPS HAVANA: Transcript $tsi ($tname) from gene $gname has non \'$scodon\' stop codons [$mod_date]: Sequence = $orig_seq Stops at $positions)");
331 #$support->log_warning("INTERNAL STOPS EXTERNAL: Transcript $tsi ($tname) from gene $gname has non \'$scodon\' stop codons[$mod_date]:\nSequence = $orig_seq\nStops at $positions)\n\n");
332 $log_object->_save_log(
'log_warning',
'', $gsi,
'TRANSCRIPT', $tsi,
'VQCT_internal_stop',
"INTERNAL STOPS EXTERNAL: Transcript $tsi ($tname) from gene $gname has non \'$scodon\' stop codons[$mod_date]: Sequence = $orig_seq Stops at $positions)");
335 #...yes - check remarks
337 my $flag_remark = 0; # 1
if word seleno has been used
338 my $flag_remark2 = 0; # 1
if existing remark has correct numbering
339 my $alabel =
'Annotation_remark- selenocysteine ';
340 my $alabel2 =
'selenocysteine ';
344 #get both hidden_remarks and remarks
345 foreach my $remark_type (
'remark',
'hidden_remark') {
347 $att=$log_object->species_params->get_attributes->{$trans->stable_id}->{$remark_type};
349 $att=$trans->get_all_Attributes($remark_type)
351 foreach my $attrib ( @$att) {
352 push @{$remarks->{$remark_type}}, $attrib->value;
355 #parse remarks to check syntax for location of edits
356 while (my ($attrib,$remarks)= each %$remarks) {
357 foreach my $text (@{$remarks}) {
358 if ( ($attrib eq
'remark') && ($text=~/^$alabel(.*)/) ){
359 #$support->log_warning("seleno remark for $tsi stored as Annotation_remark not hidden remark) [$mod_date]\n");
360 $log_object->_save_log(
'log_warning',
'', $gsi,
'', $tsi,
'VQCT_wrong_selC_coord',
"seleno remark for $tsi stored as Annotation_remark not hidden remark) [$mod_date]");
363 elsif ($text =~ /^$alabel2(.*)/) {
365 if($maybe =~ /^\s*\d+(\s+\d+)*\s*$/) {
368 $log_object->_save_log(
'log',
'', $gene->stable_id,
'', $tsi,
'',
"Maybe annotated stop in incorrect format, maybe just a remark that happens to begin '$alabel2'".
369 " -- might need to investigate: '$alabel2$maybe' [$mod_date]");
375 #check the location of the annotated edits matches actual stops in the sequence
379 foreach my $offset (split(/\s+/, $annot_stops)) {
380 #OK if it matches a known stop
382 defined($found_stops[$i]) && defined($found_stops[$i]->[1]) && ($found_stops[$i]->[1] == $offset)) {
383 push @annotated_stops, $offset;
385 # catch old annotations where number was in DNA not peptide coordinates
386 elsif (defined($found_stops[$i]) && defined($found_stops[$i]->[1]) && (($found_stops[$i]->[1] * 3) == $offset)) {
387 $log_object->_save_log(
'log_warning',
'', $gene->stable_id,
'DNA', $tsi,
'VQCT_wrong_selC_coord',
"DNA: Annotated stop for transcript tsi ($tname) is in DNA not peptide coordinates) [$mod_date]");
389 # catch old annotations where number off by one
390 elsif (defined($found_stops[$i]) && defined($found_stops[$i]->[1]) && (($found_stops[$i]->[1]) == $offset+1)) {
391 $log_object->_save_log(
'log_warning',
'', $gene->stable_id,
'PEPTIDE', $tsi,
'VQCT_wrong_selC_coord',
"PEPTIDE: Annotated stop for transcript $tsi ($tname) is out by one) [$mod_date]");
393 elsif (defined($offset) && ($offset=~/^\d+$/)){
394 if ($offset == length($orig_seq)+1) {
395 if($seen_transcripts->{$tsi} && $seen_transcripts->{$tsi}->[1] eq
'known-tga-stop') {
396 $log_object->_save_log(
'log',
'', $gene->stable_id,
'TRANSCRIPT', $tsi,
'',
"Annotated stop for transcript $tsi ($tname) known to be a stop codon. Ok. [$mod_date]");
397 } elsif($seen_transcripts->{$tsi} && $seen_transcripts->{$tsi}->[1] eq
'known-terminal-sec') {
398 $log_object->_save_log(
'log',
'', $gene->stable_id,
'TRANSCRIPT', $tsi,
'',
"Annotated stop for transcript $tsi ($tname) known to be a terminal Sec. Ok. [$mod_date]");
400 $log_object->_save_log(
'log_warning',
'', $gene->stable_id,
'TRANSCRIPT', $tsi,
'',
"Annotated stop for transcript $tsi ($tname) \"$offset\" matches actual stop codon yet has no entry in script config to disambiguate it. Please investigate and add appropriate entry to config arrays in add_selcys.pl. [$mod_date]");
404 $log_object->_save_log(
'log_warning',
'', $gene->stable_id,
'TRANSCRIPT', $tsi,
'VQCT_wrong_selC_coord',
"Annotated stop for transcript $tsi ($tname) \"$offset\" does not match a TGA codon) [$mod_date]");
405 push @annotated_stops, $offset;
412 #check location of found stops matches annotated ones - any new ones are reported
413 foreach my $stop ( @found_stops ) {
414 my $pos = $stop->[1];
415 my $seq = $stop->[0];
416 unless ( grep { $pos == $_} @annotated_stops) {
417 if ($seen_transcripts->{$tsi} && $seen_transcripts->{$tsi}->[1] eq
'notsec-havana') {
418 #$support->log_verbose("Transcript $tsi ($tname) has potential selenocysteines but has been discounted by annotators:\n\t".$seen_transcripts->{$tsi}.") [$mod_date]\n");
419 $log_object->_save_log(
'log_verbose',
'', $gene->stable_id,
'', $tsi,
'VQCT_pot_selC',
"Transcript $tsi ($tname) has potential selenocysteines but has been discounted by annotators: ".$seen_transcripts->{$tsi}->[0].
") [$mod_date]");
422 #$support->log("POTENTIAL SELENO ($seq) in $tsi ($tname, gene $gname) found at $pos [$mod_date]\n");
423 $log_object->_save_log(
'log',
'', $gene->stable_id,
'', $tsi,
'VQCT_pot_selC',
"POTENTIAL SELENO ($seq) in $tsi ($tname, gene $gname) found at $pos [$mod_date]");
432 my $log_type = shift;
433 my $chrom_name=shift ||
'';
435 my $type=shift ||
'';
439 $self->$log_type($txt.
"\n");
442 #details of annotators comments
444 OTTHUMT00000144659 = FIXED- changed to
transcript
445 OTTHUMT00000276377 = FIXED- changed to
transcript
446 OTTHUMT00000257741 = FIXED- changed to nmd
447 OTTHUMT00000155694 = NOT_FIXED- should be nmd but external annotation but cannot be fixed
448 OTTHUMT00000155695 = NOT_FIXED- should be nmd but external annotation but cannot be fixed
449 OTTHUMT00000282573 = FIXED- changed to unprocessed pseudogene
450 OTTHUMT00000285227 = FIXED- changed start site
451 OTTHUMT00000151008 = FIXED- incorrect trimming of CDS, removed extra stop codon
452 OTTHUMT00000157999 = FIXED- changed incorrect stop
453 OTTHUMT00000150523 = FIXED- incorrect trimming of CDS
454 OTTHUMT00000150525 = FIXED- incorrect trimming of CDS
455 OTTHUMT00000150522 = FIXED- incorrect trimming of CDS
456 OTTHUMT00000150521 = FIXED- incorrect trimming of CDS
457 OTTHUMT00000246819 = FIXED- corrected frame
458 OTTHUMT00000314078 = FIXED- corrected frame
459 OTTHUMT00000080133 = FIXED- corrected frame
460 OTTHUMT00000286423 = FIXED- changed to
transcript
461 OTTMUST00000055509 = FIXED- error
462 OTTMUST00000038729 = FIXED- corrected frame
463 OTTMUST00000021760 = FIXED- corrected frame
464 OTTMUST00000023057 = FIXED- corrected frame
465 OTTMUST00000015207 = FIXED- corrected frame
466 OTTMUST00000056646 = FIXED- error
467 OTTMUST00000059686 = FIXED- corrected frame
468 OTTMUST00000013426 = FIXED- corrected frame
469 OTTMUST00000044412 = FIXED- error