2 # See the NOTICE file distributed with this work for additional information
3 # regarding copyright ownership.
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
9 # http://www.apache.org/licenses/LICENSE-2.0
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
20 load_alternative_assembly.pl - create a db
for transferring annotation to the
25 load_alternative_assembly.pl [arguments]
29 --dbname, --db_name=NAME database name NAME
30 --host, --dbhost, --db_host=HOST database host HOST
31 --port, --dbport, --db_port=PORT database port PORT
32 --user, --dbuser, --db_user=USER database username USER
33 --pass, --dbpass, --db_pass=PASS database password PASS
34 --assembly=ASSEMBLY assembly version ASSEMBLY
36 --altdbname=NAME alternative database NAME
37 --altassembly=ASSEMBLY alternative assembly version ASSEMBLY
38 --coord_systems, --cs=CS coord_systems to load (
default: all
43 --conffile, --conf=FILE read parameters from FILE
44 (
default: conf/Conversion.ini)
46 --logfile, --log=FILE log to FILE (
default: *STDOUT)
47 --logpath=PATH write logfile to PATH (
default: .)
48 --logappend, --log_append append to logfile (
default: truncate)
50 -v, --verbose=0|1 verbose logging (
default:
false)
51 -i, --interactive=0|1
run script interactively (
default:
true)
52 -n, --dry_run, --dry=0|1 don't write results to database
53 -h, --help, -? print help (
this message)
57 This script is part of a series of scripts to create a mapping between two
58 assemblies. It assembles the toplevel coordinate systems of two different
59 assemblies of a genome by creating a whole genome alignment between the two.
61 The process assumes that the two assemblies are reasonably similar, i.e. there
62 are no major rearrangements or clones moved from one toplevel seq_region to
65 See
"Related files" below
for an overview of the whole process.
67 This particular script loads the alternative toplevel seq_regions into the
68 Ensembl database
for further processing.
72 The whole process of creating a whole genome alignment between two assemblies
73 is done by a series of scripts. Please see
75 ensembl/misc-scripts/assembly/README
77 for a high-level description of
this process, and POD in the individual scripts
83 Patrick Meidl <meidl@ebi.ac.uk>, Ensembl core API team
87 Please post comments/questions to the Ensembl development list
# Turn off "uninitialized value" warnings for the enclosing lexical scope, so
# undefined parameters can be interpolated/compared below without noise.
94 no warnings
'uninitialized';
# Option handling is delegated to the $support helper object: the common
# options are parsed first, then the options specific to this script
# (--coord_systems / --cs takes a string value).
109 $support->parse_common_options(@_);
110 $support->parse_extra_options(
114 'coord_systems|cs=s',
116 $support->allowed_params(
117 $support->get_common_params,
# When --help was given or $support reports an error, that error (if any) is
# emitted with warn.
124 if ($support->param(
'help') or $support->error) {
125 warn $support->error
if $support->error;
# NOTE(review): presumably converts the comma-separated --coord_systems value
# into a list -- confirm against the support module.
129 $support->comma_to_list(
'coord_systems');
131 # ask user to confirm parameters to proceed
132 $support->confirm_params;
134 # get log filehandle and print heading and parameters to logfile
137 $support->check_required_params(
# Dry run: this script only writes to the database, so it just logs that there
# is nothing to do and closes the log.
143 if ($support->param(
'dry_run')) {
144 $support->log(
"Nothing to do for a dry run. Exiting.\n\n");
145 $support->finish_log;
150 # connect to database and get adaptors
# $dba and $dbh are used as hash references keyed by 'ref' (reference
# database) and 'alt' (database holding the alternative assembly); $sql and
# $sth are scratch variables reused by every query further down.
152 my ($dba, $dbh, $sql, $sth);
# The map below copies the host, port, user and pass parameters into
# althost, altport, altuser and altpass. It is run purely for its side
# effect; the list it returns is discarded.
154 # first set connection parameters for alternative db
155 # both databases have to be on the same host, so we don't need to configure
157 map { $support->param(
"alt$_", $support->param($_)) } qw(host port user pass);
# Reference database: keep the adaptor and the handle returned by
# dbc->db_handle under the 'ref' key.
160 $dba->{
'ref'} = $support->get_database(
'ensembl');
161 $dbh->{
'ref'} = $dba->{
'ref'}->dbc->db_handle;
163 # database containing the alternative assembly
# Same pattern, stored under the 'alt' key; the extra 'alt' argument is
# presumably the parameter prefix set up above -- confirm in get_database.
164 $dba->{
'alt'} = $support->get_database(
'core',
'alt');
165 $dbh->{
'alt'} = $dba->{
'alt'}->dbc->db_handle;
168 # create backups of the tables that will be modified
# Each of seq_region, seq_region_attrib, assembly, meta and coord_system is
# copied in the reference database in two steps: CREATE TABLE <t>_bak LIKE <t>
# clones the definition, then INSERT ... SELECT * copies the rows.
# NOTE(review): the return values of do() are not checked here; whether a
# failure aborts the script depends on how the handle is configured (e.g.
# RaiseError) -- confirm. CREATE TABLE is issued without IF NOT EXISTS, so
# *_bak tables left over from an earlier run will make these statements fail.
170 $support->log_stamped(
"Creating table backups...\n");
171 $support->log_stamped(
"seq_region...\n", 1);
172 $dbh->{
'ref'}->do(
'CREATE TABLE seq_region_bak LIKE seq_region');
173 $dbh->{
'ref'}->do(
'INSERT INTO seq_region_bak SELECT * FROM seq_region');
174 $support->log_stamped(
"seq_region_attrib...\n", 1);
175 $dbh->{
'ref'}->do(
'CREATE TABLE seq_region_attrib_bak LIKE seq_region_attrib');
176 $dbh->{
'ref'}->do(
'INSERT INTO seq_region_attrib_bak SELECT * FROM seq_region_attrib');
177 $support->log_stamped(
"assembly...\n", 1);
178 $dbh->{
'ref'}->do(
'CREATE TABLE assembly_bak LIKE assembly');
179 $dbh->{
'ref'}->do(
'INSERT INTO assembly_bak SELECT * FROM assembly');
180 $support->log_stamped(
"meta...\n", 1);
181 $dbh->{
'ref'}->do(
'CREATE TABLE meta_bak LIKE meta');
182 $dbh->{
'ref'}->do(
'INSERT INTO meta_bak SELECT * FROM meta');
183 $support->log_stamped(
"coord_system...\n", 1);
184 $dbh->{
'ref'}->do(
'CREATE TABLE coord_system_bak LIKE coord_system');
185 $dbh->{
'ref'}->do(
'INSERT INTO coord_system_bak SELECT * FROM coord_system');
186 $support->log_stamped(
"Done.\n\n");
189 # load seq_regions from alternative assembly db
191 $support->log_stamped(
"Load seq_regions from alternative db...\n");
# An explicit --coord_systems list takes precedence; the toplevel lookup below
# only runs when that list is empty.
193 # determine which coord_systems we want to include
194 # by default, all coord_systems with toplevel seq_regions will be used
195 my @coord_systems = $support->param(
'coord_systems');
197 unless (@coord_systems) {
198 # get toplevel coord_systems from both dbs
202 SELECT distinct(cs.name)
203 FROM seq_region sr, coord_system cs, seq_region_attrib sra, attrib_type at
204 WHERE sr.coord_system_id = cs.coord_system_id
205 AND sr.seq_region_id = sra.seq_region_id
206 AND sra.attrib_type_id = at.attrib_type_id
207 AND at.code =
'toplevel'
211 $sth = $dbh->{
'ref'}->prepare($sql);
# The same query text is run against the reference handle (above) and the
# alternative handle (below).
213 while (my ($cs) = $sth->fetchrow_array) {
219 $sth = $dbh->{
'alt'}->prepare($sql);
221 while (my ($cs) = $sth->fetchrow_array) {
226 # now determine the common ones
# %common_cs is keyed by coord_system name; only names whose value exceeds 1
# are kept. NOTE(review): presumably the fetch loops above count each name
# once per database, so >1 means "present in both" -- confirm.
227 foreach my $cs (sort keys %common_cs) {
228 push @coord_systems, $cs
if ($common_cs{$cs} > 1);
# Report an error through $support when no coord_system is left to work with.
232 unless (@coord_systems) {
233 $support->error(
"No common toplevel coord_systems found.\n");
# Build a single-quoted, comma-separated list ('a', 'b', ...) suitable for an
# SQL IN (...) clause. NOTE(review): the names are interpolated without any
# escaping, so they must not contain quote characters.
236 my $cs_string = join(
"', '", @coord_systems);
237 $cs_string =
"'$cs_string'";
238 $support->log(
"Will use these coord_systems: $cs_string\n", 1);
# Offsets applied to the alternative database's IDs so that they cannot
# collide with IDs already used in the reference database.
240 # determine max(seq_region_id) and max(coord_system_id) in Ensembl
241 $sql = qq(SELECT MAX(seq_region_id) FROM seq_region);
242 $sth = $dbh->{
'ref'}->prepare($sql);
244 my ($max_sri) = $sth->fetchrow_array;
# seq_region_id offset: one more than the largest existing seq_region_id.
245 my $sri_adjust = 1+$max_sri;
247 $sql = qq(SELECT MAX(coord_system_id) FROM coord_system);
248 $sth = $dbh->{
'ref'}->prepare($sql);
250 my ($max_csi) = $sth->fetchrow_array;
# coord_system_id offset: 10 raised to the number of digits of the largest
# existing coord_system_id (e.g. a maximum of 12 gives an offset of 100).
251 my $csi_adjust = 10**(length($max_csi));
# The INSERT below is issued on the alternative handle and writes into
# $ref_db.seq_region, i.e. the target table is addressed by database name.
# Rows are limited to toplevel seq_regions of the selected coord_systems whose
# coord_system version equals --altassembly.
253 my $ref_db = $support->param(
'dbname');
254 my $alt_assembly = $support->param(
'altassembly');
256 # fetch and insert alternative seq_regions with adjusted seq_region_id and
259 INSERT IGNORE INTO $ref_db.seq_region
261 sr.seq_region_id+$sri_adjust,
263 sr.coord_system_id+$csi_adjust,
265 FROM seq_region sr, coord_system cs, seq_region_attrib sra, attrib_type at
266 WHERE sr.coord_system_id = cs.coord_system_id
267 AND sr.seq_region_id = sra.seq_region_id
268 AND sra.attrib_type_id = at.attrib_type_id
269 AND at.code =
'toplevel'
270 AND cs.name IN ($cs_string)
271 AND cs.version =
'$alt_assembly'
273 my $c = $dbh->{
'alt'}->do($sql);
# $c is whatever do() returned for the INSERT and is reported as the number of
# seq_regions loaded.
274 $support->log_stamped(
"Done loading $c seq_regions.\n\n");
# Copies attributes with code 'codon_table' for the same set of seq_regions
# (toplevel, selected coord_systems, version = --altassembly) into
# $ref_db.seq_region_attrib, applying the same seq_region_id offset as for the
# seq_region rows. The statement is run on the alternative handle.
276 ## Add in any codon table attributes as HCs complain otherwise
278 INSERT IGNORE INTO $ref_db.seq_region_attrib
280 sr.seq_region_id+$sri_adjust,
281 codon_at.attrib_type_id,
283 FROM seq_region sr, coord_system cs, seq_region_attrib sra, attrib_type at, seq_region_attrib codon_sra, attrib_type codon_at
284 WHERE sr.coord_system_id = cs.coord_system_id
285 AND sr.seq_region_id = sra.seq_region_id
286 AND sra.attrib_type_id = at.attrib_type_id
287 AND sr.seq_region_id = codon_sra.seq_region_id
288 AND codon_sra.attrib_type_id = codon_at.attrib_type_id
289 AND at.code =
'toplevel'
290 AND cs.name IN ($cs_string)
291 AND cs.version =
'$alt_assembly'
292 AND codon_at.code =
'codon_table';
295 $c = $dbh->{
'alt'}->do($sql);
# $c is overwritten (not accumulated) with the result of this INSERT before
# being logged.
296 $support->log_stamped(
"Done loading $c seq_region codon_table attributes.\n\n");
299 # add appropriate entries to coord_system
# One INSERT is issued per selected coord_system on the alternative handle,
# writing into $ref_db.coord_system with the coord_system_id shifted by
# $csi_adjust and the rank set to one above the current maximum rank in the
# reference database.
301 $support->log_stamped(
"Adding coord_system entries...\n");
# $c still holds the row count of the codon_table INSERT at this point. The
# loop accumulates with +=, so the counter has to start from zero here or the
# "Done adding ..." log line would over-report by that earlier count.
$c = 0;
303 foreach my $cs (@coord_systems) {
305 INSERT IGNORE INTO $ref_db.coord_system
306 SELECT coord_system_id+$csi_adjust, 1,name, version,
307 (SELECT MAX(rank)+1 FROM $ref_db.coord_system),
''
310 AND version =
'$alt_assembly'
312 $c += $dbh->{
'alt'}->do($sql);
314 $support->log_stamped(
"Done adding $c coord_system entries.\n\n");
317 # add assembly.mapping to meta table
# For every selected coord_system an 'assembly.mapping' meta entry of the form
# "<cs>:<assembly>#<cs>:<altassembly>" is inserted through the reference
# handle.
319 $support->log_stamped(
"Adding assembly.mapping entry to meta table...\n");
# $c still carries the row counts of the earlier INSERTs. The loop accumulates
# with +=, so reset the counter first; otherwise the "Done inserting ..." log
# line would report more meta entries than were actually written.
$c = 0;
321 foreach my $cs (@coord_systems) {
322 my $mappingstring =
"$cs:".$support->param(
'assembly').
323 "#$cs:".$support->param(
'altassembly');
325 INSERT IGNORE INTO meta (meta_key, meta_value)
326 VALUES (
'assembly.mapping',
'$mappingstring')
328 $c += $dbh->{
'ref'}->do($sql);
330 $support->log_stamped(
"Done inserting $c meta entries.\n\n");
333 $support->finish_log;