2 # See the NOTICE file distributed with this work for additional information
3 # regarding copyright ownership.
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
9 # http://www.apache.org/licenses/LICENSE-2.0
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
25 my ($host, $port, $user, $pass, $dbname, $testid);
27 GetOptions(
'host=s' => \$host,
31 'dbname=s' => \$dbname,
32 'testid=i' => \$testid);
36 usage(
"-user, -host, -dbname args are required")
37 if(!$user || !$dbname || !$host);
48 # - find genes with transcripts that have short (1-2bp) introns)
49 # - remember the exon positions of the exons of each transcript
50 # - loop through each transcript:
51 # * merge exons seperated by short introns
52 # * store frameshifts as attributes for transcripts
53 # - compare set of exons now used by updated transcripts
54 # * exons which have same positions as before as left alone
55 # * old exons which are no longer used are deleted
56 # * new exons are given new stable ids and stored
57 # - update the exon_transcript table with new transcript exon composition
60 print STDERR
"Finding short introns\n";
63 my $del_et_sth = $db->dbc->prepare(qq{DELETE FROM exon_transcript
64 WHERE transcript_id = ?});
66 my $ins_et_sth = $db->dbc->prepare(qq{INSERT INTO exon_transcript
71 my $upd_tl_sth = $db->dbc->prepare(qq{UPDATE translation
72 SET start_exon_id = ?,
76 WHERE translation_id = ?});
79 my $sth = $db->dbc->prepare(qq{SELECT max(stable_id) FROM
exon});
81 my $ex_stable_id = $sth->fetchall_arrayref->[0]->[0];
87 $sth = $db->dbc()->prepare
89 MIN(IF(e1.seq_region_strand = 1,
90 e2.seq_region_start - e1.seq_region_end - 1,
91 e1.seq_region_start - e2.seq_region_end - 1)) AS intron_len
92 FROM
exon e1,
exon e2, exon_transcript et1, exon_transcript et2,
94 WHERE et1.exon_id = e1.exon_id
95 AND et2.exon_id = e2.exon_id
96 AND et1.transcript_id = et2.transcript_id
97 AND et1.rank = et2.rank - 1
98 AND et1.transcript_id = t.transcript_id
100 HAVING intron_len < 3});
107 ($testid) ? ($testid) :
map{$_->[0]} @{$sth->fetchall_arrayref()};
109 my $total_rows = ($testid) ? 1 : $sth->rows();
111 my $last_percent = undef;
113 my $ga = $db->get_GeneAdaptor();
114 my $ea = $db->get_ExonAdaptor();
115 my $aa = $db->get_AttributeAdaptor();
117 print STDERR
"Merging exons and storing frameshifts\n";
118 foreach my $gene_id (@gene_ids) {
120 my $percent = ceil(($cur_row++ / $total_rows) * 100);
121 if(($percent % 5) == 0 &&
122 (!defined($last_percent) || $percent != $last_percent)) {
123 $last_percent = $percent;
124 print STDERR
"$percent% complete\n";
127 my $g = $ga->fetch_by_dbID($gene_id);
129 my %old_exons =
map {$_->hashkey() => $_} @{$g->get_all_Exons()};
135 foreach my $tr (@{$g->get_all_Transcripts}) {
136 # keep track of translation cdna coordinates before messing with transcript
137 my ($cdna_coding_start, $cdna_coding_end);
138 if($tr->translation()) {
139 $cdna_coding_start = $tr->cdna_coding_start();
140 $cdna_coding_end = $tr->cdna_coding_end();
145 # collect list of frameshifts
146 foreach my $intron (@{$tr->get_all_Introns()}) {
147 push(@frameshifts, $intron)
if($intron->length() < 3);
152 my $fs = shift(@frameshifts);
155 # merge together exons which are seperated by frameshift introns
156 foreach my $ex (@{$tr->get_all_Exons()}) {
158 # was the previous exon seperated from this one by a frameshift intron?
159 if($fs && $fs->next_Exon()->stable_id() eq $ex->stable_id()) {
161 # merge this exon with the previous one
162 if($ex->strand() == 1) {
163 $exons[$#exons]->end($ex->end());
165 $exons[$#exons]->start($ex->start());
168 # merge supporting evidence
169 foreach my $sf (@{$ex->get_all_supporting_features()}) {
170 if(!$seen_evidence{ref($sf) .
':' . $sf->dbID()}) {
171 $ex->add_supporting_feature($sf);
172 $seen_evidence{ref($sf .
':' . $sf->dbID())} = 1;
176 # store frameshift as transcript attrib
178 (-CODE =>
'_rna_edit',
180 -DESC =>
'Post transcriptional RNA edit',
181 -START => $cdna_start,
182 -END => $cdna_start + $fs->length() - 1,
184 $aa->store_on_Transcript($tr->dbID, [$seqed->get_Attribute]);
186 # adjust cdna coordinates for frameshifted basepairs
187 if($tr->translation) {
188 if($cdna_coding_start >= $cdna_start) {
189 $cdna_coding_start += $fs->length();
191 if($cdna_coding_end >= $cdna_start) {
192 $cdna_coding_end += $fs->length();
195 $cdna_start += $fs->length();
197 # look at the next frameshift
198 $fs = shift(@frameshifts);
203 foreach my $sf (@{$ex->get_all_supporting_features()}) {
204 $seen_evidence{ref($_) .
':' . $_->dbID()} = 1;
208 $cdna_start += $ex->length();
211 # rebuild transcript from new set of exons
214 foreach my $ex (@exons) {
215 $new_exons{$ex->hashkey} = $ex;
219 $tl_cdna_starts{$tr->dbID()} = $cdna_coding_start;
220 $tl_cdna_ends{$tr->dbID()} = $cdna_coding_end;
223 # determine which exons can be deleted
224 foreach my $old_key (keys %old_exons) {
225 if(!$new_exons{$old_key}) {
226 $ea->remove($old_exons{$old_key});
227 delete $old_exons{$old_key};
231 # determine which exons are brand new and need storing
232 foreach my $new_key (keys %new_exons) {
233 if(!$old_exons{$new_key}) {
234 my $new_ex = $new_exons{$new_key};
235 # "unstore" the merged exon by unsetting adaptor and dbID
236 $new_ex->dbID(undef);
237 $new_ex->adaptor(undef);
239 # assign a new stable id
240 $new_ex->stable_id($ex_stable_id++);
241 $ea->store($new_exons{$new_key});
245 # update the transcript composition by updating the exon transcript table
246 foreach my $tr (@{$g->get_all_Transcripts()}) {
247 $del_et_sth->execute($tr->dbID());
250 foreach my $ex (@{$tr->get_all_Exons()}) {
253 if($old_exons{$ex->hashkey()}) {
254 $ex_id = $old_exons{$ex->hashkey()}->dbID();
256 $ex_id = $new_exons{$ex->hashkey()}->dbID();
259 $ins_et_sth->execute($ex_id, $tr->dbID(), $rank);
265 # update the translations to use the new exons
266 foreach my $tr (@{$g->get_all_Transcripts()}) {
267 my $tl = $tr->translation();
271 my $tl_start = $tl_cdna_starts{$tr->dbID()};
272 my $tl_end = $tl_cdna_ends{$tr->dbID()};
274 foreach my $ex (@{$tr->get_all_Exons()}) {
275 if($tl_start > 0 && $tl_start <= $ex->length()) {
276 $tl->start_Exon($ex);
277 $tl->start($tl_start);
280 if($tl_end > 0 && $tl_end <= $ex->length()) {
285 $tl_start -= $ex->length();
286 $tl_end -= $ex->length();
289 my ($start_ex_id, $end_ex_id);
291 # use consolidated exon ids. Exons may have been duplicated across
292 # multiple transcripts but only one has been stored and had id set.
293 if($old_exons{$tl->start_Exon->hashkey()}) {
294 $start_ex_id = $old_exons{$tl->start_Exon->hashkey()}->dbID();
296 $start_ex_id = $new_exons{$tl->start_Exon->hashkey()}->dbID();
299 if($old_exons{$tl->end_Exon->hashkey()}) {
300 $end_ex_id = $old_exons{$tl->end_Exon->hashkey()}->dbID();
302 $end_ex_id = $new_exons{$tl->end_Exon->hashkey()}->dbID();
306 $upd_tl_sth->execute($start_ex_id, $end_ex_id,
307 $tl->start(), $tl->end(), $tl->dbID());
311 $del_et_sth->finish();
312 $ins_et_sth->finish();
314 $upd_tl_sth->finish();
316 print STDERR
"\nAll Done.\n";
321 This program merges exons which are seperated by introns of length 1 or 2
322 and replaces them with a frameshift mrna edit.
325 perl shortintrons2frameshift.pl -user <user> -host <host> [-pass <password>] \
326 -dbname <dbname> [-port <port>]