ensembl-hive  2.8.1
Transcript.pm
Go to the documentation of this file.
1 =head1 LICENSE
2 
3 See the NOTICE file distributed with this work for additional information
4 regarding copyright ownership.
5 
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9 
10  http://www.apache.org/licenses/LICENSE-2.0
11 
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 
18 =cut
19 
20 
21 =head1 CONTACT
22 
23  Please email comments or questions to the public Ensembl
24  developers list at <http://lists.ensembl.org/mailman/listinfo/dev>.
25 
26  Questions may also be sent to the Ensembl help desk at
27  <http://www.ensembl.org/Help/Contact>.
28 
29 =cut
30 
31 =head1 NAME
32 
33 =head1 SYNOPSIS
34 
35 =head1 DESCRIPTION
36 
37 =head1 METHODS
38 
39 =cut
40 
41 package Bio::EnsEMBL::Utils::VegaCuration::Transcript;
42 
43 use strict;
44 use warnings;
45 no warnings 'uninitialized';
46 use vars qw(@ISA);
47 
49 use Data::Dumper;
50 
52 
53 
54 =head2 find_non_overlaps
55 
56  Args : arrayref of B::E::Transcripts
57  Example : find_non_overlaps($all_transcripts)
58  Description: identifies any non-overlapping transcripts
59  Returntype : array refs of stable IDs
60  Exceptions : none
61 
62 =cut
63 
64 sub find_non_overlaps {
65  my $self = shift;
66  my ($all_transcripts) = @_;
67  my $non_overlaps = [];
68  foreach my $transcript1 (@{$all_transcripts}) {
69  foreach my $transcript2 (@{$all_transcripts}) {
70  if ($transcript1->end < $transcript2->start) {
71  push @{$non_overlaps}, $transcript1->stable_id;
72  push @{$non_overlaps}, $transcript2->stable_id;
73  }
74  }
75  }
76  return $non_overlaps;
77 }
78 
79 =head2 check_remarks_and_update_names
80 
81  Arg[1] : B::E::Gene (with potentially duplicated transcript names)
82  Arg[2] : counter 1 (no. of patched genes)
83  Arg[3] : counter 2 (no. of patched transcripts)
84  Example : $support->update_names($gene,\$c1,\$c2)
85  Description: - checks remarks and patches transcripts with identical names according to
86  CDS and length
87  Returntype : true | false (depending on whether patched or not), counter1, counter2
88 
89 =cut
90 
91 sub check_remarks_and_update_names {
92  my $self = shift;
93  my ($gene,$gene_c,$trans_c) = @_;
94  my $action = ($self->param('dry_run')) ? 'Would add' : 'Added';
95  my $aa = $gene->adaptor->db->get_AttributeAdaptor;
96  my $dbh = $gene->adaptor->db->dbc->db_handle;
97 
98  #get list of IDs that have previously been sent to annotators
99  my $seen_genes = $self->get_havana_fragmented_loci_comments;
100 
101  my $gsi = $gene->stable_id;
102  my $gid = $gene->dbID;
103  my $g_name;
104  my $study_more = 1;
105  eval {
106  $g_name = $gene->display_xref->display_id;
107  };
108  if ($@) {
109  $g_name = $gene->get_all_Attributes('name')->[0]->value;
110  }
111 
112  #get existing gene remarks
113  my $remarks = [ map {$_->value} @{$gene->get_all_Attributes('remark')} ];
114 
115  #shout if there is no remark to identify this as being fragmented
116  if ( grep {$_ eq 'fragmented locus' } @$remarks) {
117  $study_more = 0;
118  }
119  else {
120  $self->log_warning("Gene $gsi should have a fragmented locus remark\n");
121  }
122 
123  ##patch transcript names according to length and CDS
124  $gene_c++;
125 
126  #separate coding and non_coding transcripts
127  my $coding_trans = [];
128  my $noncoding_trans = [];
129  foreach my $trans ( @{$gene->get_all_Transcripts()} ) {
130  if ($trans->translate) {
131  push @$coding_trans, $trans;
132  }
133  else {
134  push @$noncoding_trans, $trans;
135  }
136  }
137 
138  #sort transcripts coding > non-coding, then on length
139  my $c = 0;
140  $self->log("\nPatching names according to CDS and length:\n",1);
141  foreach my $array_ref ($coding_trans,$noncoding_trans) {
142  foreach my $trans ( sort { $b->length <=> $a->length } @$array_ref ) {
143  $trans_c++;
144  my $tsi = $trans->stable_id;
145  my $t_name;
146  eval {
147  $t_name = $trans->display_xref->display_id;
148  };
149  if ($@) {
150  $t_name = $trans->get_all_Attributes('name')->[0]->value;
151  }
152  $c++;
153  my $ext = sprintf("%03d", $c);
154  my $new_name = $g_name.'-'.$ext;
155  $self->log(sprintf("%-20s%-3s%-20s", "$t_name ", "-->", "$new_name")."\n",1);
156  if (! $self->param('dry_run')) {
157 
158  # update transcript display xref
159  $dbh->do(qq(UPDATE xref x, external_db edb
160  SET x.display_label = "$new_name"
161  WHERE x.external_db_id = edb.external_db_id
162  AND x.dbprimary_acc = "$tsi"
163  AND edb.db_name = "Vega_transcript"));
164  }
165  }
166  }
167  return ($study_more,$gene_c,$trans_c);
168 }
169 
170 =head2 check_names_and_overlap
171 
172  Arg[1] : arayref of arrayrefs of duplicated names
173  Arg[2] : B::E::Gene (with potentially duplicated transcript names)
174  Arg[3] : FH (to log new duplicates)
175  Example : $support->check_names_and_overlap($transcripts,$gene,$fh)
176  Description: checks pairs of transcripts identified as having duplicate Vega names:
177  - to see if they have identical names in loutre (shouldn't have)
178  - distinguish between overlapping and non overlapping transcripts
179  Returntype : none
180 
181 =cut
182 
183 sub check_names_and_overlap {
184  my $self = shift;
185  my ($transcript_info,$gene,$n_flist_fh) = @_;
186  my $ta = $gene->adaptor->db->get_TranscriptAdaptor;
187  my $gsi = $gene->stable_id;
188  my $g_name = $gene->get_all_Attributes('name')->[0]->value;
189  foreach my $set (values %{$transcript_info} ) {
190  next if (scalar @{$set} == 1);
191  my $transcripts = [];
192  my $all_t_names;
193  my %ids_to_names;
194  foreach my $id1 (@{$set}) {
195  my ($name1,$tsi1) = split /\|/, $id1;
196  $ids_to_names{$tsi1} = $name1;
197  $all_t_names .= "$tsi1 [$name1] ";
198  my $t = $ta->fetch_by_stable_id($tsi1);
199  push @{$transcripts}, $t;
200  }
201 
202  my $non_overlaps;
203  eval {
204  $non_overlaps = $self->find_non_overlaps($transcripts);
205  };
206  if ($@) {
207  $self->log_warning("Problem looking for overlapping transcripts for gene $gsi (is_current = 0 ?). Skipping this bit\n");
208  }
209 
210  #if the transcripts don't overlap
211  elsif (@{$non_overlaps}) {
212  my $tsi_string;
213  foreach my $id (@{$non_overlaps}) {
214  my $string = " $id [ $ids_to_names{$id} ] ";
215  $tsi_string .= $string;
216  }
217 
218  $self->log_warning("NEW: Non-overlapping: $gsi ($g_name) has non-overlapping transcripts ($tsi_string) with duplicated Vega names, and it has no \'fragmented locus\' gene remark. Neither has it been OKeyed by Havana before. Transcript names are being patched but this needs checking by Havana.\n");
219  #log gsi (to be sent to Havana)
220  print $n_flist_fh "$gsi\n";
221  }
222  #...otherwise if the transcripts do overlap
223  else {
224  $self->log_warning("NEW: Overlapping: $gsi ($g_name) has overlapping transcripts ($all_t_names) with duplicated Vega names and it has no \'fragmented locus\' gene_remark. Neither has it been OKeyed by Havana before. Transcript names are being patched but this could be checked by Havana if they were feeling keen.\n");
225  print $n_flist_fh "$gsi\n";
226  }
227  }
228 }
229 
230 =head2 get_havana_fragmented_loci_comments
231 
232  Args : none
233  Example : my $results = $support->get_havana_fragmented_loci_comments
234  Description: parses the HEREDOC containing Havana comments in this module
235  Returntype : hashref
236 
237 =cut
238 
239 sub get_havana_fragmented_loci_comments {
240  my $seen_genes;
241  while (<DATA>) {
242  next if /^\s+$/ or /#+/;
243  my ($obj,$comment) = split /=/;
244  $obj =~ s/^\s+|\s+$//g;
245  $comment =~ s/^\s+|\s+$//g;
246  $seen_genes->{$obj} = $comment;
247  }
248  return $seen_genes;
249 }
250 
251 
252 
253 #details of genes with duplicated transcript names that have already been reported to Havana
254 #identified as either fragmented or as being OK to patch
255 __DATA__
256 
257 OTTMUSG00000005478 = fragmented
258 OTTMUSG00000001936 = fragmented
259 OTTMUSG00000017081 = fragmented
260 OTTMUSG00000011441 = fragmented
261 OTTMUSG00000013335 = fragmented
262 OTTMUSG00000011654 = fragmented
263 OTTMUSG00000001835 = fragmented
264 OTTHUMG00000035221 = fragmented
265 OTTHUMG00000037378 = fragmented
266 OTTHUMG00000060732 = fragmented
267 OTTHUMG00000132441 = fragmented
268 OTTHUMG00000031383 = fragmented
269 OTTHUMG00000012716 = fragmented
270 OTTHUMG00000031102 = fragmented
271 OTTHUMG00000148816 = fragmented
272 OTTHUMG00000149059 = fragmented
273 OTTHUMG00000149221 = fragmented
274 OTTHUMG00000149326 = fragmented
275 OTTHUMG00000149644 = fragmented
276 OTTHUMG00000149574 = fragmented
277 OTTHUMG00000058101 = fragmented
278 
279 OTTHUMG00000150119 = OK
280 OTTHUMG00000149850 = OK
281 OTTHUMG00000058101 = OK
282 OTTHUMG00000058907 = OK
283 
284 OTTMUSG00000011654 = fragmented
285 OTTMUSG00000019369 = fragmented
286 OTTMUSG00000017081 = fragmented
287 OTTMUSG00000001835 = fragmented
288 OTTMUSG00000011499 = fragmented
289 OTTMUSG00000013335 = fragmented
290 OTTMUSG00000008023 = fragmented
291 OTTMUSG00000019369 = fragmented
292 
293 
294 OTTMUSG00000022266
295 OTTMUSG00000006697
296 
297 
298 
299 
300 
301 OTTMUSG00000012302 =
302 OTTMUSG00000013368 =
303 OTTMUSG00000015766 =
304 OTTMUSG00000016025 =
305 OTTMUSG00000001066 =
306 OTTMUSG00000016331 =
307 OTTMUSG00000006935 =
308 OTTMUSG00000007263 =
309 OTTMUSG00000000304 =
310 OTTMUSG00000009150 =
311 OTTMUSG00000008023 =
312 OTTMUSG00000017077 =
313 OTTMUSG00000003440 =
314 OTTMUSG00000016310 =
315 OTTMUSG00000026199 =
316 OTTMUSG00000028423 =
317 OTTMUSG00000007427 =
transcript
public transcript()
map
public map()
Bio::EnsEMBL::Utils::VegaCuration::Gene
Definition: Gene.pm:13
Bio::EnsEMBL::DBSQL::DBAdaptor::dbc
public Bio::EnsEMBL::DBSQL::DBConnection dbc()
Bio::EnsEMBL::DBSQL::DBConnection::db_handle
public DBI db_handle()
Bio::EnsEMBL::CDS
Definition: CDS.pm:28
Bio::EnsEMBL::DBSQL::BaseAdaptor::db
public Bio::EnsEMBL::DBSQL::DBAdaptor db()
Bio::EnsEMBL::Storable::adaptor
public Bio::EnsEMBL::DBSQL::BaseAdaptor adaptor()