3 See the NOTICE file distributed with
this work
for additional information
4 regarding copyright ownership.
6 Licensed under the Apache License, Version 2.0 (the
"License");
7 you may not use
this file except in compliance with the License.
8 You may obtain a copy of the License at
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an
"AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License
for the specific language governing permissions and
16 limitations under the License.
21 Please email comments or questions to the
public Ensembl
22 developers list at <http:
23 Questions may also be sent to the Ensembl help desk at
33 package Bio::EnsEMBL::Utils::VegaCuration::Translation;
41 =head2 check_CDS_start_end_remarks
43 Args : B::E::Transcript
44 Example : my $results = $support->check_CDS_end_remarks($transcript)
45 Description: identifies incorrect
'CDS end...' transcript remarks in a
46 otter-derived Vega database
51 sub check_CDS_start_end_remarks {
55 my @remarks = @{$trans->get_all_Attributes(
'remark')};
56 my $coding_end = $trans->cdna_coding_end;
57 my $coding_start = $trans->cdna_coding_start;
58 my $trans_end = $trans->length;
59 my $trans_seq = $trans->seq->seq;
60 my $stop_codon = substr($trans_seq, $coding_end-3, 3);
61 my $start_codon = substr($trans_seq, $coding_start-1, 3);
62 #hashref to return results
65 #extra CDS end not found remarks
66 if (grep {$_->value eq
'CDS end not found'} @remarks) {
67 if ( ($coding_end != $trans_end)
68 && ( grep {$_ eq $stop_codon} qw(TGA TAA TAG) ) ) {
69 $results->{
'END_EXTRA'} = 1;
72 #missing CDS end not found remark
73 if ( $coding_end == $trans_end ) {
74 if (! grep {$_->value eq
'CDS end not found'} @remarks) {
75 if (grep {$_ eq $stop_codon} qw(TGA TAA TAG)) {
76 $results->{
'END_MISSING_2'} = 1;
79 $results->{
'END_MISSING_1'} = $stop_codon;
83 #extra CDS start not found remark
84 if (grep {$_->value eq
'CDS start not found'} @remarks) {
85 if ( ($coding_start != 1)
86 && ($start_codon eq
'ATG') ) {
87 $results->{
'START_EXTRA'} = 1;
90 #missing CDS start not found remark
91 if ( $coding_start == 1) {
92 if ( ! grep {$_->value eq
'CDS start not found'} @remarks) {
93 if ($start_codon eq
'ATG') {
94 $results->{
'START_MISSING_2'} = 1;
96 $results->{
'START_MISSING_1'} = $start_codon;
103 =head2 check_CDS_start_end_remarks_loutre
105 Args : B::E::Transcript
106 Example : my $results = $support->check_CDS_end_remarks($transcript)
107 Description: identifies incorrect
'CDS end...' transcript attribs in a loutre
108 of a loutre-derived Vega database.
113 sub check_CDS_start_end_remarks_loutre {
118 my @stops = qw(TGA TAA TAG);
120 foreach my $attribute (@{$trans->get_all_Attributes()}) {
121 push @{$attributes{$attribute->code}}, $attribute;
123 # warn $trans->stable_id;
124 # warn Data::Dumper::Dumper(\%attributes);
125 my $coding_end = $trans->cdna_coding_end;
126 my $coding_start = $trans->cdna_coding_start;
127 my $trans_end = $trans->length;
128 my $trans_seq = $trans->seq->seq;
129 my $stop_codon_offset = 3 + $trans->translation->end_Exon->end_phase;
130 my $initial_exon_phase = $trans->translation->start_Exon->phase;
131 my $stop_codon = substr($trans_seq, $coding_end-3, 3);
132 my $start_codon = substr($trans_seq, $coding_start-1, 3);
134 my $start_codon_incorrect = 1;
135 if ($start_codon eq
'ATG' ) {
136 $start_codon_incorrect = 0;
138 elsif ($start_codon eq
'CTG') {
139 foreach my $attrib (@{$attributes{
'remark'}}) {
140 if ($attrib->value =~ /non[- ]ATG start/) {
141 $start_codon_incorrect = 0;
146 # warn "$start_codon -- $initial_exon_phase -- $coding_start -- $start_codon_incorrect";
148 #hashref to return results
151 #extra CDS end not found remarks
152 if ($attributes{
'cds_end_NF'}) {
153 if ( ($attributes{
'cds_end_NF'}->[0]->value == 1)
154 && ($coding_end != $trans_end)
155 && ( grep {$_ eq $stop_codon} @stops) ) {
156 # warn $trans->stable_id.": $coding_end--$trans_end--$stop_codon";
157 # warn $trans->translation->end_Exon->end_phase;
158 $results->{
'END_EXTRA'} = $stop_codon;
161 #missing CDS end not found remark
162 if ( $coding_end == $trans_end ) {
163 if ($attributes{
'cds_end_NF'}) {
164 if ($attributes{
'cds_end_NF'}->[0]->value == 0 ) {
165 if (! grep {$_ eq $stop_codon} @stops) {
166 # warn $trans->translation->end_Exon->end_phase;
167 $results->{
'END_MISSING'}{
'WRONG'} = $stop_codon;
171 elsif (! grep {$_ eq $stop_codon} @stops) {
172 $results->{
'END_MISSING'}{
'ABSENT'} = $stop_codon;
175 #extra CDS start not found remark
176 if ( $attributes{
'cds_start_NF'}) {
177 if ( ($attributes{
'cds_start_NF'}->[0]->value == 1 )
178 && (! $start_codon_incorrect)) {
179 unless ( ($coding_start == 1) && ( $initial_exon_phase > 0)) {
180 $results->{
'START_EXTRA'} = $start_codon;
184 #missing CDS start not found remark
185 if ( $coding_start == 1) {
186 if ( $attributes{
'cds_start_NF'} ) {
187 if ( $attributes{
'cds_start_NF'}->[0]->value == 0 ) {
188 if ($start_codon_incorrect) {
189 $results->{
'START_MISSING'}{
'ABSENT'} = $start_codon;
191 elsif ($initial_exon_phase > 0) {
192 $results->{
'START_MISSING'}{
'INITIAL_PHASE'} = $initial_exon_phase;
196 elsif ($start_codon ne
'ATG') {
197 $results->{
'START_MISSING'}{
'ABSENT'} = $start_codon;
204 =head2 get_havana_seleno_comments
207 Example : my $results = $support->get_havana_seleno_comments
208 Description: parses the HEREDOC containing Havana comments in
this module
213 sub get_havana_seleno_comments {
214 my $seen_translations;
216 next
if /^\s+$/ or /#+/;
217 my ($obj,$comment) = split /=/;
219 $comment =~ s/^\s+|\s+$
220 # We add the origin as now "seen" can come from a number of places, and have
221 # a number of consequences in different cases, not just discounted Secs from this method. -- ds23
222 $seen_translations->{$obj} = [ $comment,
"notsec-havana" ];
224 return $seen_translations;
227 sub check_for_stops {
229 my ($gene,$seen_transcripts,$log_object) = @_;
231 my $has_log_object=defined($log_object);
233 my @help = $log_object->species_params->get_trans($gene->stable_id);
236 $log_object=$support;
237 $transcripts=$gene->get_all_Transcripts;
240 my $gname = $gene->get_all_Attributes(
'name')->[0]->value;
241 my $gsi = $gene->stable_id;
243 my $mod_date = $support->date_format( $gene->modified_date,
'%d/%m/%y' );
244 my $hidden_remak_ttributes;
246 foreach my $trans (@$transcripts) {
247 my $tsi = $trans->stable_id;
248 my $tID = $trans->dbID;
249 my $tname = $trans->get_all_Attributes(
'name')->[0]->value;
251 $hidden_remak_ttributes=$log_object->species_params->get_attributes->{$tsi}->{
'hidden_remark'};
253 $hidden_remak_ttributes=$trans->get_all_Attributes(
'hidden_remark');
255 foreach my $rem (@$hidden_remak_ttributes) {
256 if ($rem->value =~ /not_for_Vega/) {
257 #$support->log_verbose("Skipping transcript $tname ($tsi) since 'not_for_Vega'\n",1);
258 $log_object->_save_log(
'log_verbose',
'', $gsi,
'', $tsi,
'',
"Skipping transcript $tname ($tsi) since 'not_for_Vega'");
263 # go no further if there is a ribosomal framshift attribute
264 foreach my $attrib (@{$trans->get_all_Attributes(
'_rib_frameshift')}) {
265 if ($attrib->value) {
266 $log_object->_save_log(
'log',
'', $gsi,
'', $tsi,
'',
"Skipping $tsi ($tname) since it has a ribosomal frameshift attribute");
270 foreach my $attrib (@{$trans->get_all_Attributes(
'hidden_remark')}) {
271 if ($attrib->value eq
'ribosomal_frameshift') {
272 $log_object->_save_log(
'log',
'', $gsi,
'', $tsi,
'',
"Skipping $tsi ($tname) since it has a ribosomal frameshift hidden_remark");
277 #$support->log_verbose("Studying transcript $tsi ($tname, $tID)\n",1);
278 $log_object->_save_log(
'log_verbose',
'', $gsi,
'', $tsi,
'',
"Studying transcript $tsi ($tname, $tID)");
281 # go no further if the transcript doesn't translate or if there are no stops
282 next TRANS unless ($peptide = $trans->translate);
283 my $pseq = $peptide->seq;
284 my $orig_seq = $pseq;
285 # (translate method trims stops from sequence end)
287 next TRANS unless ($pseq =~ /\*/);
288 next TRANS
if ($trans->biotype eq
'polymorphic_pseudogene' or
289 $trans->biotype =~ /processed_pseudogene$/);
291 # warn sprintf("Stop codon is '%s'\n",substr($trans->translateable_seq,-3));
292 #$support->log_verbose("Stops found in $tsi ($tname)\n",1);
293 $log_object->_save_log(
'log_verbose',
'', $gsi,
'', $tsi,
'',
"Stops found in $tsi ($tname)");
295 # find out where and how many stops there are
297 my $mrna = $trans->translateable_seq;
300 while ($pseq =~ /^([^\*]*)\*(.*)/) {
304 $offset += length($pseq1_f) * 3;
305 my $stop = substr($mrna, $offset, 3);
306 my $aaoffset = int($offset / 3)+1;
307 push(@found_stops, [ $stop, $aaoffset ]);
308 $tstop .=
"$aaoffset ";
312 # are all stops TGA...?
313 my $num_stops = scalar(@found_stops);
316 foreach my $stop (@found_stops) {
317 $positions .= $stop->[0].
"(".$stop->[1].
") ";
318 if ($stop->[0] eq $scodon) {
322 my $source = $gene->source;
323 #...no - an internal stop codon error in the database...
324 if ($num_tga < $num_stops) {
325 if ($source eq
'havana') {
326 #$support->log_warning("INTERNAL STOPS HAVANA: Transcript $tsi ($tname) from gene $gname has non \'$scodon\' stop codons [$mod_date]:\nSequence = $orig_seq\nStops at $positions)\n\n");
327 $log_object->_save_log(
'log_warning',
'', $gsi,
'TRANSCRIPT', $tsi,
'VQCT_internal_stop',
"INTERNAL STOPS HAVANA: Transcript $tsi ($tname) from gene $gname has non \'$scodon\' stop codons [$mod_date]: Sequence = $orig_seq Stops at $positions)");
330 #$support->log_warning("INTERNAL STOPS EXTERNAL: Transcript $tsi ($tname) from gene $gname has non \'$scodon\' stop codons[$mod_date]:\nSequence = $orig_seq\nStops at $positions)\n\n");
331 $log_object->_save_log(
'log_warning',
'', $gsi,
'TRANSCRIPT', $tsi,
'VQCT_internal_stop',
"INTERNAL STOPS EXTERNAL: Transcript $tsi ($tname) from gene $gname has non \'$scodon\' stop codons[$mod_date]: Sequence = $orig_seq Stops at $positions)");
334 #...yes - check remarks
336 my $flag_remark = 0; # 1
if word seleno has been used
337 my $flag_remark2 = 0; # 1
if existing remark has correct numbering
338 my $alabel =
'Annotation_remark- selenocysteine ';
339 my $alabel2 =
'selenocysteine ';
343 #get both hidden_remarks and remarks
344 foreach my $remark_type (
'remark',
'hidden_remark') {
346 $att=$log_object->species_params->get_attributes->{$trans->stable_id}->{$remark_type};
348 $att=$trans->get_all_Attributes($remark_type)
350 foreach my $attrib ( @$att) {
351 push @{$remarks->{$remark_type}}, $attrib->value;
354 #parse remarks to check syntax for location of edits
355 while (my ($attrib,$remarks)= each %$remarks) {
356 foreach my $text (@{$remarks}) {
357 if ( ($attrib eq
'remark') && ($text=~/^$alabel(.*)/) ){
358 #$support->log_warning("seleno remark for $tsi stored as Annotation_remark not hidden remark) [$mod_date]\n");
359 $log_object->_save_log(
'log_warning',
'', $gsi,
'', $tsi,
'VQCT_wrong_selC_coord',
"seleno remark for $tsi stored as Annotation_remark not hidden remark) [$mod_date]");
362 elsif ($text =~ /^$alabel2(.*)/) {
364 if($maybe =~ /^\s*\d+(\s+\d+)*\s*$/) {
367 $log_object->_save_log(
'log',
'', $gene->stable_id,
'', $tsi,
'',
"Maybe annotated stop in incorrect format, maybe just a remark that happens to begin '$alabel2'".
368 " -- might need to investigate: '$alabel2$maybe' [$mod_date]");
374 #check the location of the annotated edits matches actual stops in the sequence
378 foreach my $offset (split(/\s+/, $annot_stops)) {
379 #OK if it matches a known stop
381 defined($found_stops[$i]) && defined($found_stops[$i]->[1]) && ($found_stops[$i]->[1] == $offset)) {
382 push @annotated_stops, $offset;
384 # catch old annotations where number was in DNA not peptide coordinates
385 elsif (defined($found_stops[$i]) && defined($found_stops[$i]->[1]) && (($found_stops[$i]->[1] * 3) == $offset)) {
386 $log_object->_save_log(
'log_warning',
'', $gene->stable_id,
'DNA', $tsi,
'VQCT_wrong_selC_coord',
"DNA: Annotated stop for transcript tsi ($tname) is in DNA not peptide coordinates) [$mod_date]");
388 # catch old annotations where number off by one
389 elsif (defined($found_stops[$i]) && defined($found_stops[$i]->[1]) && (($found_stops[$i]->[1]) == $offset+1)) {
390 $log_object->_save_log(
'log_warning',
'', $gene->stable_id,
'PEPTIDE', $tsi,
'VQCT_wrong_selC_coord',
"PEPTIDE: Annotated stop for transcript $tsi ($tname) is out by one) [$mod_date]");
392 elsif (defined($offset) && ($offset=~/^\d+$/)){
393 if ($offset == length($orig_seq)+1) {
394 if($seen_transcripts->{$tsi} && $seen_transcripts->{$tsi}->[1] eq
'known-tga-stop') {
395 $log_object->_save_log(
'log',
'', $gene->stable_id,
'TRANSCRIPT', $tsi,
'',
"Annotated stop for transcript $tsi ($tname) known to be a stop codon. Ok. [$mod_date]");
396 } elsif($seen_transcripts->{$tsi} && $seen_transcripts->{$tsi}->[1] eq
'known-terminal-sec') {
397 $log_object->_save_log(
'log',
'', $gene->stable_id,
'TRANSCRIPT', $tsi,
'',
"Annotated stop for transcript $tsi ($tname) known to be a terminal Sec. Ok. [$mod_date]");
399 $log_object->_save_log(
'log_warning',
'', $gene->stable_id,
'TRANSCRIPT', $tsi,
'',
"Annotated stop for transcript $tsi ($tname) \"$offset\" matches actual stop codon yet has no entry in script config to disambiguate it. Please investigate and add appropriate entry to config arrays in add_selcys.pl. [$mod_date]");
403 $log_object->_save_log(
'log_warning',
'', $gene->stable_id,
'TRANSCRIPT', $tsi,
'VQCT_wrong_selC_coord',
"Annotated stop for transcript $tsi ($tname) \"$offset\" does not match a TGA codon) [$mod_date]");
404 push @annotated_stops, $offset;
411 #check location of found stops matches annotated ones - any new ones are reported
412 foreach my $stop ( @found_stops ) {
413 my $pos = $stop->[1];
414 my $seq = $stop->[0];
415 unless ( grep { $pos == $_} @annotated_stops) {
416 if ($seen_transcripts->{$tsi} && $seen_transcripts->{$tsi}->[1] eq
'notsec-havana') {
417 #$support->log_verbose("Transcript $tsi ($tname) has potential selenocysteines but has been discounted by annotators:\n\t".$seen_transcripts->{$tsi}.") [$mod_date]\n");
418 $log_object->_save_log(
'log_verbose',
'', $gene->stable_id,
'', $tsi,
'VQCT_pot_selC',
"Transcript $tsi ($tname) has potential selenocysteines but has been discounted by annotators: ".$seen_transcripts->{$tsi}->[0].
") [$mod_date]");
421 #$support->log("POTENTIAL SELENO ($seq) in $tsi ($tname, gene $gname) found at $pos [$mod_date]\n");
422 $log_object->_save_log(
'log',
'', $gene->stable_id,
'', $tsi,
'VQCT_pot_selC',
"POTENTIAL SELENO ($seq) in $tsi ($tname, gene $gname) found at $pos [$mod_date]");
431 my $log_type = shift;
432 my $chrom_name=shift ||
'';
434 my $type=shift ||
'';
438 $self->$log_type($txt.
"\n");
441 #details of annotators comments
443 OTTHUMT00000144659 = FIXED- changed to
transcript
444 OTTHUMT00000276377 = FIXED- changed to
transcript
445 OTTHUMT00000257741 = FIXED- changed to nmd
446 OTTHUMT00000155694 = NOT_FIXED- should be nmd but external annotation but cannot be fixed
447 OTTHUMT00000155695 = NOT_FIXED- should be nmd but external annotation but cannot be fixed
448 OTTHUMT00000282573 = FIXED- changed to unprocessed pseudogene
449 OTTHUMT00000285227 = FIXED- changed start site
450 OTTHUMT00000151008 = FIXED- incorrect trimming of CDS, removed extra stop codon
451 OTTHUMT00000157999 = FIXED- changed incorrect stop
452 OTTHUMT00000150523 = FIXED- incorrect trimming of CDS
453 OTTHUMT00000150525 = FIXED- incorrect trimming of CDS
454 OTTHUMT00000150522 = FIXED- incorrect trimming of CDS
455 OTTHUMT00000150521 = FIXED- incorrect trimming of CDS
456 OTTHUMT00000246819 = FIXED- corrected frame
457 OTTHUMT00000314078 = FIXED- corrected frame
458 OTTHUMT00000080133 = FIXED- corrected frame
459 OTTHUMT00000286423 = FIXED- changed to
transcript
460 OTTMUST00000055509 = FIXED- error
461 OTTMUST00000038729 = FIXED- corrected frame
462 OTTMUST00000021760 = FIXED- corrected frame
463 OTTMUST00000023057 = FIXED- corrected frame
464 OTTMUST00000015207 = FIXED- corrected frame
465 OTTMUST00000056646 = FIXED- error
466 OTTMUST00000059686 = FIXED- corrected frame
467 OTTMUST00000013426 = FIXED- corrected frame
468 OTTMUST00000044412 = FIXED- error