3 See the NOTICE file distributed with
this work
for additional information
4 regarding copyright ownership.
6 Licensed under the Apache License, Version 2.0 (the
"License");
7 you may not use
this file except in compliance with the License.
8 You may obtain a copy of the License at
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an
"AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License
for the specific language governing permissions and
16 limitations under the License.
23 Please email comments or questions to the
public Ensembl
24 developers list at <http:
26 Questions may also be sent to the Ensembl help desk at
41 package Bio::EnsEMBL::Utils::VegaCuration::Transcript;
45 no warnings
'uninitialized';
54 =head2 find_non_overlaps
56 Args : arrayref of B::E::Transcripts
57 Example : find_non_overlaps($all_transcripts)
58 Description: identifies any non-overlapping transcripts
59 Returntype : array refs of stable IDs
64 sub find_non_overlaps {
66 my ($all_transcripts) = @_;
67 my $non_overlaps = [];
68 foreach my $transcript1 (@{$all_transcripts}) {
69 foreach my $transcript2 (@{$all_transcripts}) {
70 if ($transcript1->end < $transcript2->start) {
71 push @{$non_overlaps}, $transcript1->stable_id;
72 push @{$non_overlaps}, $transcript2->stable_id;
79 =head2 check_remarks_and_update_names
81 Arg[1] : B::E::Gene (with potentially duplicated
transcript names)
82 Arg[2] : counter 1 (no. of patched genes)
83 Arg[3] : counter 2 (no. of patched transcripts)
84 Example : $support->update_names($gene,\$c1,\$c2)
85 Description: - checks remarks and patches transcripts with identical names according to
87 Returntype :
true |
false (depending on whether patched or not), counter1, counter2
91 sub check_remarks_and_update_names {
93 my ($gene,$gene_c,$trans_c) = @_;
94 my $action = ($self->param(
'dry_run')) ?
'Would add' :
'Added';
95 my $aa = $gene->
adaptor->
db->get_AttributeAdaptor;
98 #get list of IDs that have previously been sent to annotators
99 my $seen_genes = $self->get_havana_fragmented_loci_comments;
101 my $gsi = $gene->stable_id;
102 my $gid = $gene->dbID;
106 $g_name = $gene->display_xref->display_id;
109 $g_name = $gene->get_all_Attributes(
'name')->[0]->value;
112 #get existing gene remarks
113 my $remarks = [
map {$_->value} @{$gene->get_all_Attributes(
'remark')} ];
115 #shout if there is no remark to identify this as being fragmented
116 if ( grep {$_ eq
'fragmented locus' } @$remarks) {
120 $self->log_warning(
"Gene $gsi should have a fragmented locus remark\n");
123 ##patch transcript names according to length and CDS
126 #separate coding and non_coding transcripts
127 my $coding_trans = [];
128 my $noncoding_trans = [];
129 foreach my $trans ( @{$gene->get_all_Transcripts()} ) {
130 if ($trans->translate) {
131 push @$coding_trans, $trans;
134 push @$noncoding_trans, $trans;
138 #sort transcripts coding > non-coding, then on length
140 $self->log(
"\nPatching names according to CDS and length:\n",1);
141 foreach my $array_ref ($coding_trans,$noncoding_trans) {
142 foreach my $trans ( sort { $b->length <=> $a->length } @$array_ref ) {
144 my $tsi = $trans->stable_id;
147 $t_name = $trans->display_xref->display_id;
150 $t_name = $trans->get_all_Attributes(
'name')->[0]->value;
153 my $ext = sprintf(
"%03d", $c);
154 my $new_name = $g_name.
'-'.$ext;
155 $self->log(sprintf(
"%-20s%-3s%-20s",
"$t_name ",
"-->",
"$new_name").
"\n",1);
156 if (! $self->param(
'dry_run')) {
158 # update transcript display xref
159 $dbh->do(qq(UPDATE xref x, external_db edb
160 SET x.display_label =
"$new_name"
161 WHERE x.external_db_id = edb.external_db_id
162 AND x.dbprimary_acc =
"$tsi"
163 AND edb.db_name =
"Vega_transcript"));
167 return ($study_more,$gene_c,$trans_c);
170 =head2 check_names_and_overlap
172 Arg[1] : arayref of arrayrefs of duplicated names
173 Arg[2] : B::E::Gene (with potentially duplicated
transcript names)
174 Arg[3] : FH (to log
new duplicates)
175 Example : $support->check_names_and_overlap($transcripts,$gene,$fh)
176 Description: checks pairs of transcripts identified as having duplicate Vega names:
177 - to see
if they have identical names in loutre (shouldn
't have)
178 - distinguish between overlapping and non overlapping transcripts
183 sub check_names_and_overlap {
185 my ($transcript_info,$gene,$n_flist_fh) = @_;
186 my $ta = $gene->adaptor->db->get_TranscriptAdaptor;
187 my $gsi = $gene->stable_id;
188 my $g_name = $gene->get_all_Attributes('name
')->[0]->value;
189 foreach my $set (values %{$transcript_info} ) {
190 next if (scalar @{$set} == 1);
191 my $transcripts = [];
194 foreach my $id1 (@{$set}) {
195 my ($name1,$tsi1) = split /\|/, $id1;
196 $ids_to_names{$tsi1} = $name1;
197 $all_t_names .= "$tsi1 [$name1] ";
198 my $t = $ta->fetch_by_stable_id($tsi1);
199 push @{$transcripts}, $t;
204 $non_overlaps = $self->find_non_overlaps($transcripts);
207 $self->log_warning("Problem looking for overlapping transcripts for gene $gsi (is_current = 0 ?). Skipping this bit\n");
210 #if the transcripts don't overlap
211 elsif (@{$non_overlaps}) {
213 foreach my $id (@{$non_overlaps}) {
214 my $string =
" $id [ $ids_to_names{$id} ] ";
215 $tsi_string .= $string;
218 $self->log_warning(
"NEW: Non-overlapping: $gsi ($g_name) has non-overlapping transcripts ($tsi_string) with duplicated Vega names, and it has no \'fragmented locus\' gene remark. Neither has it been OKeyed by Havana before. Transcript names are being patched but this needs checking by Havana.\n");
219 #log gsi (to be sent to Havana)
220 print $n_flist_fh
"$gsi\n";
222 #...otherwise if the transcripts do overlap
224 $self->log_warning(
"NEW: Overlapping: $gsi ($g_name) has overlapping transcripts ($all_t_names) with duplicated Vega names and it has no \'fragmented locus\' gene_remark. Neither has it been OKeyed by Havana before. Transcript names are being patched but this could be checked by Havana if they were feeling keen.\n");
225 print $n_flist_fh
"$gsi\n";
230 =head2 get_havana_fragmented_loci_comments
233 Example : my $results = $support->get_havana_fragmented_loci_comments
234 Description: parses the HEREDOC containing Havana comments in
this module
239 sub get_havana_fragmented_loci_comments {
242 next
if /^\s+$/ or /#+/;
243 my ($obj,$comment) = split /=/;
245 $comment =~ s/^\s+|\s+$
246 $seen_genes->{$obj} = $comment;
253 #details of genes with duplicated transcript names that have already been reported to Havana
254 #identified as either fragmented or as being OK to patch
257 OTTMUSG00000005478 = fragmented
258 OTTMUSG00000001936 = fragmented
259 OTTMUSG00000017081 = fragmented
260 OTTMUSG00000011441 = fragmented
261 OTTMUSG00000013335 = fragmented
262 OTTMUSG00000011654 = fragmented
263 OTTMUSG00000001835 = fragmented
264 OTTHUMG00000035221 = fragmented
265 OTTHUMG00000037378 = fragmented
266 OTTHUMG00000060732 = fragmented
267 OTTHUMG00000132441 = fragmented
268 OTTHUMG00000031383 = fragmented
269 OTTHUMG00000012716 = fragmented
270 OTTHUMG00000031102 = fragmented
271 OTTHUMG00000148816 = fragmented
272 OTTHUMG00000149059 = fragmented
273 OTTHUMG00000149221 = fragmented
274 OTTHUMG00000149326 = fragmented
275 OTTHUMG00000149644 = fragmented
276 OTTHUMG00000149574 = fragmented
277 OTTHUMG00000058101 = fragmented
279 OTTHUMG00000150119 = OK
280 OTTHUMG00000149850 = OK
281 OTTHUMG00000058101 = OK
282 OTTHUMG00000058907 = OK
284 OTTMUSG00000011654 = fragmented
285 OTTMUSG00000019369 = fragmented
286 OTTMUSG00000017081 = fragmented
287 OTTMUSG00000001835 = fragmented
288 OTTMUSG00000011499 = fragmented
289 OTTMUSG00000013335 = fragmented
290 OTTMUSG00000008023 = fragmented
291 OTTMUSG00000019369 = fragmented