ensembl-hive  2.7.0
push_align_features.pl
Go to the documentation of this file.
1 #!/usr/bin/env perl
2 # See the NOTICE file distributed with this work for additional information
3 # regarding copyright ownership.
4 #
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
8 #
9 # http://www.apache.org/licenses/LICENSE-2.0
10 #
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
16 
17 use strict;
18 use warnings;
19 
21 
22 use Getopt::Long;
23 
24 
25 #my $LIMIT = ' LIMIT 1000 ';
26 my $LIMIT = '';
27 
28 my ($db, $dnadbname);
29 
30 {
31  my ($dbname, $host, $port, $user, $pass);
32 
33  GetOptions('dbname=s' => \$dbname,
34  'dnadbname=s' => \$dnadbname,
35  'user=s' => \$user,
36  'host=s' => \$host,
37  'port=i' => \$port,
38  'pass=s' => \$pass);
39 
40  $port ||= 3306;
41 
42  usage() if(!$host || !$user || !$dbname);
43 
44  $db = Bio::EnsEMBL::DBSQL::DBAdaptor->new
45  (-host => $host,
46  -user => $user,
47  -pass => $pass,
48  -port => $port,
49  -dbname => $dbname,
50  -dnadbname => $dnadbname);
51 
52  if($dnadbname) {
54  (-host => $host,
55  -user => $user,
56  -pass => $pass,
57  -port => $port,
58  -dbname => $dnadbname);
59  $db->dnadb($dnadb);
60  } else {
61  $dnadbname = $dbname;
62  }
63 }
64 
65 my @daf_css = get_feat_coord_systems($db, 'dna_align_feature');
66 my @paf_css = get_feat_coord_systems($db, 'protein_align_feature');
67 
68 ### convert dna align features
69 
70 my $sth = $db->dbc()->prepare("SHOW CREATE TABLE dna_align_feature");
71 $sth->execute();
72 my $create_table = $sth->fetchrow_arrayref()->[1];
73 $sth->finish();
74 
75 
76 $create_table =~ s/CREATE TABLE dna_align_feature/CREATE TABLE tmp_dna_align_feature/;
77 # difference b/w mysql 4 and mysql 3
78 $create_table =~ s/CREATE TABLE `dna_align_feature`/CREATE TABLE tmp_dna_align_feature/;
79 
80 $sth = $db->dbc()->prepare($create_table);
81 $sth->execute();
82 
83 $sth = $db->dbc()->prepare
84  (qq{INSERT INTO tmp_dna_align_feature
85  SELECT daf.dna_align_feature_id,
86  a.asm_seq_region_id,
87  if(a.ori = 1,
88  (a.asm_start + daf.seq_region_start - a.cmp_start),
89  (a.asm_start + a.cmp_end - daf.seq_region_end))
90  as seq_region_start,
91  if(a.ori = 1,
92  (a.asm_start + daf.seq_region_end - a.cmp_start),
93  (a.asm_end + a.cmp_end - daf.seq_region_start))
94  as seq_region_end,
95  a.ori * daf.seq_region_strand as seq_region_strand,
96  daf.hit_start, daf.hit_end, daf.hit_strand, daf.hit_name,
97  daf.analysis_id, daf.score, daf.evalue, daf.perc_ident,
98  daf.cigar_line
99  FROM $dnadbname.assembly a,
100  $dnadbname.seq_region asm_sr,
101  $dnadbname.seq_region cmp_sr,
102  dna_align_feature daf,
103  $dnadbname.seq_region_attrib sra,
104  $dnadbname.attrib_type at
105  WHERE asm_sr.coord_system_id = ? AND cmp_sr.coord_system_id = ?
106  AND daf.seq_region_id = a.cmp_seq_region_id
107  AND a.asm_seq_region_id = asm_sr.seq_region_id
108  AND a.cmp_seq_region_id = cmp_sr.seq_region_id
109  AND daf.seq_region_start >= a.cmp_start
110  AND daf.seq_region_end <= a.cmp_end
111  AND asm_sr.seq_region_id = sra.seq_region_id
112  AND sra.attrib_type_id = at.attrib_type_id
113  AND at.code = 'toplevel'
114  $LIMIT});
115 
116 foreach my $cs_pair (@daf_css) {
117  my ($top_cs, $feat_cs) = @$cs_pair;
118  print STDERR "Converting dna align features between from coord system ",
119  $feat_cs->name(), " to coord system ", $top_cs->name(), "\n";
120  $sth->execute($top_cs->dbID(), $feat_cs->dbID());
121 }
122 $sth->finish();
123 
124 
125 ### convert protein align features
126 
127 $sth = $db->dbc()->prepare("SHOW CREATE TABLE protein_align_feature");
128 $sth->execute();
129 $create_table = $sth->fetchrow_arrayref()->[1];
130 $sth->finish();
131 
132 $create_table =~ s/CREATE TABLE protein_align_feature/CREATE TABLE tmp_protein_align_feature/;
133 $create_table =~ s/CREATE TABLE `protein_align_feature`/CREATE TABLE tmp_protein_align_feature/;
134 
135 
136 $sth = $db->dbc()->prepare($create_table);
137 $sth->execute();
138 
139 $sth = $db->dbc()->prepare
140  (qq{INSERT INTO tmp_protein_align_feature
141  SELECT paf.protein_align_feature_id,
142  a.asm_seq_region_id,
143  if(a.ori = 1,
144  (a.asm_start + paf.seq_region_start - a.cmp_start),
145  (a.asm_start + a.cmp_end - paf.seq_region_end))
146  as seq_region_start,
147  if(a.ori = 1,
148  (a.asm_start + paf.seq_region_end - a.cmp_start),
149  (a.asm_end + a.cmp_end - paf.seq_region_start))
150  as seq_region_end,
151  a.ori * paf.seq_region_strand as seq_region_strand,
152  paf.hit_start, paf.hit_end, paf.hit_name, paf.analysis_id,
153  paf.score, paf.evalue, paf.perc_ident, paf.cigar_line
154  FROM $dnadbname.assembly a,
155  $dnadbname.seq_region asm_sr,
156  $dnadbname.seq_region cmp_sr,
157  protein_align_feature paf,
158  $dnadbname.seq_region_attrib sra,
159  $dnadbname.attrib_type at
160  WHERE asm_sr.coord_system_id = ? AND cmp_sr.coord_system_id = ?
161  AND paf.seq_region_id = a.cmp_seq_region_id
162  AND a.asm_seq_region_id = asm_sr.seq_region_id
163  AND a.cmp_seq_region_id = cmp_sr.seq_region_id
164  AND paf.seq_region_start >= a.cmp_start
165  AND paf.seq_region_end <= a.cmp_end
166  AND asm_sr.seq_region_id = sra.seq_region_id
167  AND sra.attrib_type_id = at.attrib_type_id
168  AND at.code = 'toplevel'
169  $LIMIT});
170 
171 foreach my $cs_pair (@paf_css) {
172  my ($top_cs, $feat_cs) = @$cs_pair;
173  print STDERR "Converting protein align features between from coord system ",
174  $feat_cs->name(), " to coord system ", $top_cs->name(), "\n";
175  $sth->execute($top_cs->dbID(), $feat_cs->dbID());
176 }
177 $sth->finish();
178 
179 
180 print STDERR "Replacing existing align feature tables with new tables\n";
181 
182 $db->dbc->do("drop table protein_align_feature");
183 $db->dbc->do("drop table dna_align_feature");
184 $db->dbc->do("alter table tmp_protein_align_feature rename protein_align_feature");
185 $db->dbc->do("alter table tmp_dna_align_feature rename dna_align_feature");
186 
187 
188 print STDERR "Updating meta_coord table\n";
189 
190 $db->dbc->do("delete from meta_coord where table_name = 'dna_align_feature'");
191 $db->dbc->do("delete from meta_coord where table_name = 'protein_align_feature'");
192 
193 
194 $sth = $db->dbc->prepare("INSERT INTO meta_coord set table_name = ?, " .
195  "coord_system_id = ?");
196 
197 
198 my %seen = ();
199 foreach my $cs_pair (@daf_css) {
200  my ($top_cs) = @$cs_pair;
201  if(!$seen{$top_cs->dbID()}) {
202  $sth->execute('dna_align_feature', $top_cs->dbID());
203  $seen{$top_cs->dbID()} = 1;
204  }
205 }
206 
207 %seen = ();
208 foreach my $cs_pair (@paf_css) {
209  my ($top_cs) = @$cs_pair;
210  if(!$seen{$top_cs->dbID()}) {
211  $sth->execute('protein_align_feature', $top_cs->dbID());
212  $seen{$top_cs->dbID()} = 1;
213  }
214 }
215 
216 $sth->finish();
217 
218 
219 
221  my $db = shift;
222  my $type = shift;
223 
224  my $csa = $db->get_CoordSystemAdaptor();
225  my $mcc = $db->get_MetaCoordContainer();
226 
227  # determine what coordinate systems have top-level seq_regions
228 
229  my $sth = $db->dbc->prepare
230  (qq{SELECT distinct(sr.coord_system_id)
231  FROM seq_region sr, seq_region_attrib sra, attrib_type at
232  WHERE sr.seq_region_id = sra.seq_region_id
233  AND sra.attrib_type_id = at.attrib_type_id
234  AND at.code = 'toplevel'});
235 
236  $sth->execute();
237 
238  my @top_css = map {$csa->fetch_by_dbID($_->[0])}
239  @{$sth->fetchall_arrayref()};
240 
241 
242  $sth->finish();
243 
244  # determine what coord systems features are stored in
245 
246  my @feat_css = @{$mcc->fetch_all_CoordSystems_by_feature_type($type)};
247 
248  my @css;
249 
250  foreach my $feat_cs (@feat_css) {
251  foreach my $top_cs (@top_css) {
252  if(!$feat_cs->equals($top_cs)) {
253  if($feat_cs->rank() > $top_cs->rank()) {
254  my @mp = @{$csa->get_mapping_path($top_cs, $feat_cs)};
255  if(@mp == 2) {
256  if($mp[0]->equals($top_cs)) {
257  push @css, \@mp;
258  } else {
259  die("Unexpected: ". $top_cs->name() .
260  " coord system should not be component of " .
261  $feat_cs->name(), " coord system.");
262  }
263  }
264  } else {
265  die("There is no 1 step mapping path between coord systems " .
266  $feat_cs->name()." and ".$top_cs->name().". This script " .
267  "requires one step mapping paths.");
268  }
269  }
270  }
271  }
272 
273 
274  if(!@css) {
275  die("Nothing to do for ${type}s (or missing meta/assembly info)\n");
276  }
277 
278  return @css;
279 }
280 
281 
282 
283 sub usage {
284  print STDERR
285 qq{
286 This program will convert the coordinates of alignment features from
287 their current cooridinate systems to coordinates on toplevel sequence
288 regions. This can speed up the retrieval of these features dramatically.
289 
290 Features which are partially or entirely non-golden portions of
291 sequence regions will be discarded.
292 
293 This program is only capable of mapping if there is a direct relationship
294 between the coordinate systems defined in the assembly and meta tables.
295 
296 This program can use a different database to obtain the assembly information
297 but the database must be on the same MySQL instance. This is useful for
298 satellite databases such as the est database.
299 
300 usage:
301  perl push_align_features -dbname <dbname> -user <user> -host <host> \
302  [-port <port>] [-pass <pass>] [-dnadbname <dbname>]
303 };
304 
305  exit;
306 }
EnsEMBL
Definition: Filter.pm:1
Bio::EnsEMBL::DBSQL::DBAdaptor
Definition: DBAdaptor.pm:40
map
public map()
usage
public usage()
Bio::EnsEMBL::DBSQL::DBAdaptor::new
public Bio::EnsEMBL::DBSQL::DBAdaptor new()
get_feat_coord_systems
public get_feat_coord_systems()
Bio::EnsEMBL::DBSQL::DBAdaptor::dnadb
public Dna dnadb()
Bio
Definition: AltAlleleGroup.pm:4