ensembl-hive  2.8.1
shortintrons2frameshifts.pl
Go to the documentation of this file.
1 #!/usr/bin/env perl
2 # See the NOTICE file distributed with this work for additional information
3 # regarding copyright ownership.
4 #
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
8 #
9 # http://www.apache.org/licenses/LICENSE-2.0
10 #
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
16 
17 use strict;
18 use warnings;
19 
21 
22 use Getopt::Long;
23 use POSIX qw(ceil);
24 
25 my ($host, $port, $user, $pass, $dbname, $testid);
26 
27 GetOptions('host=s' => \$host,
28  'user=s' => \$user,
29  'port=i' => \$port,
30  'pass=s' => \$pass,
31  'dbname=s' => \$dbname,
32  'testid=i' => \$testid);
33 
34 $port ||= 3306;
35 
36 usage("-user, -host, -dbname args are required")
37  if(!$user || !$dbname || !$host);
38 
39 my $db = Bio::EnsEMBL::DBSQL::DBAdaptor->new
40  (-host => $host,
41  -user => $user,
42  -pass => $pass,
43  -dbname => $dbname,
44  -port => $port);
45 
46 
47 # algorithm:
48 # - find genes with transcripts that have short (1-2bp) introns)
49 # - remember the exon positions of the exons of each transcript
50 # - loop through each transcript:
51 # * merge exons seperated by short introns
52 # * store frameshifts as attributes for transcripts
53 # - compare set of exons now used by updated transcripts
54 # * exons which have same positions as before as left alone
55 # * old exons which are no longer used are deleted
56 # * new exons are given new stable ids and stored
57 # - update the exon_transcript table with new transcript exon composition
58 
59 
60 print STDERR "Finding short introns\n";
61 
62 
63 my $del_et_sth = $db->dbc->prepare(qq{DELETE FROM exon_transcript
64  WHERE transcript_id = ?});
65 
66 my $ins_et_sth = $db->dbc->prepare(qq{INSERT INTO exon_transcript
67  SET exon_id = ?,
68  transcript_id = ?,
69  rank = ?});
70 
71 my $upd_tl_sth = $db->dbc->prepare(qq{UPDATE translation
72  SET start_exon_id = ?,
73  end_exon_id = ?,
74  seq_start = ?,
75  seq_end = ?
76  WHERE translation_id = ?});
77 
78 
79 my $sth = $db->dbc->prepare(qq{SELECT max(stable_id) FROM exon});
80 $sth->execute();
81 my $ex_stable_id = $sth->fetchall_arrayref->[0]->[0];
82 $ex_stable_id++;
83 $sth->finish();
84 
85 if(!$testid) {
86 
87  $sth = $db->dbc()->prepare
88  (qq{SELECT t.gene_id,
89  MIN(IF(e1.seq_region_strand = 1,
90  e2.seq_region_start - e1.seq_region_end - 1,
91  e1.seq_region_start - e2.seq_region_end - 1)) AS intron_len
92  FROM exon e1, exon e2, exon_transcript et1, exon_transcript et2,
93  transcript t
94  WHERE et1.exon_id = e1.exon_id
95  AND et2.exon_id = e2.exon_id
96  AND et1.transcript_id = et2.transcript_id
97  AND et1.rank = et2.rank - 1
98  AND et1.transcript_id = t.transcript_id
99  GROUP BY t.gene_id
100  HAVING intron_len < 3});
101 
102  $sth->execute();
103 
104 }
105 
106 my @gene_ids =
107  ($testid) ? ($testid) : map{$_->[0]} @{$sth->fetchall_arrayref()};
108 
109 my $total_rows = ($testid) ? 1 : $sth->rows();
110 my $cur_row = 0;
111 my $last_percent = undef;
112 
113 my $ga = $db->get_GeneAdaptor();
114 my $ea = $db->get_ExonAdaptor();
115 my $aa = $db->get_AttributeAdaptor();
116 
117 print STDERR "Merging exons and storing frameshifts\n";
118 foreach my $gene_id (@gene_ids) {
119 
120  my $percent = ceil(($cur_row++ / $total_rows) * 100);
121  if(($percent % 5) == 0 &&
122  (!defined($last_percent) || $percent != $last_percent)) {
123  $last_percent = $percent;
124  print STDERR "$percent% complete\n";
125  }
126 
127  my $g = $ga->fetch_by_dbID($gene_id);
128 
129  my %old_exons = map {$_->hashkey() => $_} @{$g->get_all_Exons()};
130  my %new_exons = ();
131 
132  my %tl_cdna_starts;
133  my %tl_cdna_ends;
134 
135  foreach my $tr (@{$g->get_all_Transcripts}) {
136  # keep track of translation cdna coordinates before messing with transcript
137  my ($cdna_coding_start, $cdna_coding_end);
138  if($tr->translation()) {
139  $cdna_coding_start = $tr->cdna_coding_start();
140  $cdna_coding_end = $tr->cdna_coding_end();
141  }
142 
143  my @frameshifts;
144 
145  # collect list of frameshifts
146  foreach my $intron (@{$tr->get_all_Introns()}) {
147  push(@frameshifts, $intron) if($intron->length() < 3);
148  }
149 
150  my %seen_evidence;
151  my @exons;
152  my $fs = shift(@frameshifts);
153  my $cdna_start = 1;
154 
155  # merge together exons which are seperated by frameshift introns
156  foreach my $ex (@{$tr->get_all_Exons()}) {
157 
158  # was the previous exon seperated from this one by a frameshift intron?
159  if($fs && $fs->next_Exon()->stable_id() eq $ex->stable_id()) {
160 
161  # merge this exon with the previous one
162  if($ex->strand() == 1) {
163  $exons[$#exons]->end($ex->end());
164  } else {
165  $exons[$#exons]->start($ex->start());
166  }
167 
168  # merge supporting evidence
169  foreach my $sf (@{$ex->get_all_supporting_features()}) {
170  if(!$seen_evidence{ref($sf) . ':' . $sf->dbID()}) {
171  $ex->add_supporting_feature($sf);
172  $seen_evidence{ref($sf . ':' . $sf->dbID())} = 1;
173  }
174  }
175 
176  # store frameshift as transcript attrib
177  my $seqed = Bio::EnsEMBL::SeqEdit->new
178  (-CODE => '_rna_edit',
179  -NAME => 'RNA Edit',
180  -DESC => 'Post transcriptional RNA edit',
181  -START => $cdna_start,
182  -END => $cdna_start + $fs->length() - 1,
183  -ALT_SEQ => '');
184  $aa->store_on_Transcript($tr->dbID, [$seqed->get_Attribute]);
185 
186  # adjust cdna coordinates for frameshifted basepairs
187  if($tr->translation) {
188  if($cdna_coding_start >= $cdna_start) {
189  $cdna_coding_start += $fs->length();
190  }
191  if($cdna_coding_end >= $cdna_start) {
192  $cdna_coding_end += $fs->length();
193  }
194  }
195  $cdna_start += $fs->length();
196 
197  # look at the next frameshift
198  $fs = shift(@frameshifts);
199  } else {
200  push @exons, $ex;
201 
202  %seen_evidence = ();
203  foreach my $sf (@{$ex->get_all_supporting_features()}) {
204  $seen_evidence{ref($_) . ':' . $_->dbID()} = 1;
205  }
206  }
207 
208  $cdna_start += $ex->length();
209  }
210 
211  # rebuild transcript from new set of exons
212  $tr->flush_Exons();
213 
214  foreach my $ex (@exons) {
215  $new_exons{$ex->hashkey} = $ex;
216  $tr->add_Exon($ex);
217  }
218 
219  $tl_cdna_starts{$tr->dbID()} = $cdna_coding_start;
220  $tl_cdna_ends{$tr->dbID()} = $cdna_coding_end;
221  }
222 
223  # determine which exons can be deleted
224  foreach my $old_key (keys %old_exons) {
225  if(!$new_exons{$old_key}) {
226  $ea->remove($old_exons{$old_key});
227  delete $old_exons{$old_key};
228  }
229  }
230 
231  # determine which exons are brand new and need storing
232  foreach my $new_key (keys %new_exons) {
233  if(!$old_exons{$new_key}) {
234  my $new_ex = $new_exons{$new_key};
235  # "unstore" the merged exon by unsetting adaptor and dbID
236  $new_ex->dbID(undef);
237  $new_ex->adaptor(undef);
238 
239  # assign a new stable id
240  $new_ex->stable_id($ex_stable_id++);
241  $ea->store($new_exons{$new_key});
242  }
243  }
244 
245  # update the transcript composition by updating the exon transcript table
246  foreach my $tr (@{$g->get_all_Transcripts()}) {
247  $del_et_sth->execute($tr->dbID());
248 
249  my $rank = 1;
250  foreach my $ex (@{$tr->get_all_Exons()}) {
251  my $ex_id;
252 
253  if($old_exons{$ex->hashkey()}) {
254  $ex_id = $old_exons{$ex->hashkey()}->dbID();
255  } else {
256  $ex_id = $new_exons{$ex->hashkey()}->dbID();
257  }
258 
259  $ins_et_sth->execute($ex_id, $tr->dbID(), $rank);
260 
261  $rank++;
262  }
263  }
264 
265  # update the translations to use the new exons
266  foreach my $tr (@{$g->get_all_Transcripts()}) {
267  my $tl = $tr->translation();
268 
269  next if(!$tl);
270 
271  my $tl_start = $tl_cdna_starts{$tr->dbID()};
272  my $tl_end = $tl_cdna_ends{$tr->dbID()};
273 
274  foreach my $ex (@{$tr->get_all_Exons()}) {
275  if($tl_start > 0 && $tl_start <= $ex->length()) {
276  $tl->start_Exon($ex);
277  $tl->start($tl_start);
278  }
279 
280  if($tl_end > 0 && $tl_end <= $ex->length()) {
281  $tl->end_Exon($ex);
282  $tl->end($tl_end);
283  }
284 
285  $tl_start -= $ex->length();
286  $tl_end -= $ex->length();
287  }
288 
289  my ($start_ex_id, $end_ex_id);
290 
291  # use consolidated exon ids. Exons may have been duplicated across
292  # multiple transcripts but only one has been stored and had id set.
293  if($old_exons{$tl->start_Exon->hashkey()}) {
294  $start_ex_id = $old_exons{$tl->start_Exon->hashkey()}->dbID();
295  } else {
296  $start_ex_id = $new_exons{$tl->start_Exon->hashkey()}->dbID();
297  }
298 
299  if($old_exons{$tl->end_Exon->hashkey()}) {
300  $end_ex_id = $old_exons{$tl->end_Exon->hashkey()}->dbID();
301  } else {
302  $end_ex_id = $new_exons{$tl->end_Exon->hashkey()}->dbID();
303  }
304 
305 
306  $upd_tl_sth->execute($start_ex_id, $end_ex_id,
307  $tl->start(), $tl->end(), $tl->dbID());
308  }
309 }
310 
311 $del_et_sth->finish();
312 $ins_et_sth->finish();
313 $sth->finish();
314 $upd_tl_sth->finish();
315 
316 print STDERR "\nAll Done.\n";
317 
318 
319 sub usage {
320  print STDERR qq {
321 This program merges exons which are seperated by introns of length 1 or 2
322 and replaces them with a frameshift mrna edit.
323 
324 Usage:
325 perl shortintrons2frameshift.pl -user <user> -host <host> [-pass <password>] \
326  -dbname <dbname> [-port <port>]
327 };
328 
329  exit;
330 }
transcript
public transcript()
EnsEMBL
Definition: Filter.pm:1
Bio::EnsEMBL::DBSQL::DBAdaptor
Definition: DBAdaptor.pm:40
map
public map()
Bio::EnsEMBL::SeqEdit::new
public Bio::EnsEMBL::SeqEdit new()
usage
public usage()
Bio::EnsEMBL::SeqEdit
Definition: SeqEdit.pm:55
exon
public exon()
Bio
Definition: AltAlleleGroup.pm:4