ensembl-hive  2.7.0
fuller_test_genome_9_chr20.pl
Go to the documentation of this file.
1 #!/usr/bin/env perl
2 # See the NOTICE file distributed with this work for additional information
3 # regarding copyright ownership.
4 #
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
8 #
9 # http://www.apache.org/licenses/LICENSE-2.0
10 #
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
16 
17 use strict;
18 use warnings;
19 
20 use Getopt::Std;
21 use DBI;
22 
23 use vars qw($opt_d $opt_s $opt_u $opt_h $opt_p $opt_P);
24 
25 #get command line arguments
26 getopts('s:d:h:u:p:P:');
27 
28 my $usage = "Usage: " .
29  "test_genome.pl -s srcDB -d destDB -h host -u user [-p pass] [-P port]\n";
30 
31 my ($user, $pass, $host, $destDB, $srcDB, $port) =
32  ($opt_u, $opt_p, $opt_h, $opt_d, $opt_s, $opt_P);
33 
34 unless($port) {
35  $port = 3306;
36 }
37 
38 # If needed command line args are missing print the usage string and quit
39 $user and $host and $destDB and $srcDB or die $usage;
40 
41 
42 
43 my $dsn = "DBI:mysql:host=$host;port=$port";
44 
45 #print "User: $user; Pass: $pass; DSN: $dsn\n";
46 
47 #
48 # Connect to the mySQL host
49 #
50 my $dbh = DBI->connect( $dsn, $user, $pass, {RaiseError => 1})
51  or die "Could not connect to database host : " . DBI->errstr;
52 
53 print "\nWARNING: If the $destDB database already exists the existing copy \n"
54  . "will be destroyed. Proceed (Y/N)? ";
55 
56 my $key = lc(getc());
57 
58 unless( $key =~ /y/ ) {
59  $dbh->disconnect();
60  print "Test Genome Creation Aborted\n";
61  exit;
62 }
63 
64 print "Proceeding with test genome database $destDB creation\n";
65 
66 #
67 # Create the new database, dropping any existing database
68 #
69 $dbh->do("DROP DATABASE $destDB");
70 
71 $dbh->do( "CREATE DATABASE " . $destDB )
72  or die "Could not create database $destDB: " . $dbh->errstr;
73 
74 #
75 # Dump the source database table structure (w/o data) and use it to create
76 # the new database schema
77 #
78 
79 # May have to eliminate the -p pass part... not sure
80 
81 my $rc = 0xffff & system(
82  "mysqldump -p$pass -u $user -h $host -P $port --no-data $srcDB | " .
83  "mysql -p$pass -u $user -h $host -P port $destDB");
84 
85 if($rc != 0) {
86  $rc >>= 8;
87  die "mysqldump and insert failed with return code: $rc";
88 }
89 
90 #
91 # Create a temp table to store ids of clones we are interested in
92 #
93 $dbh->do("
94 CREATE TEMPORARY TABLE $destDB.tmp1(
95  clone_id INT(10) NOT NULL)
96 ") or die "Could create tmp1 table " . $dbh->errstr;
97 
98 
99 # Find clones present in a central, exciting 1Mb region of Chromosome 20
100 $dbh->do("
101 INSERT INTO $destDB.tmp1
102 SELECT distinct( c.clone_id )
103 FROM $srcDB.contig c, $srcDB.assembly a, $srcDB.chromosome chr
104 WHERE a.contig_id = c.contig_id
105 AND a.chromosome_id = chr.chromosome_id
106 AND chr.name = '20'
107 AND a.chr_end >= 30252000
108 AND a.chr_start < 31252001
109 ") or die "Could not do tmp1 chr2 clones insert statement:" . $dbh->errstr;
110 
111 
112 #Select relevant clones from the source database for the new database
113 $dbh->do("
114 INSERT INTO $destDB.clone
115 SELECT c.*
116 FROM $srcDB.clone c, $destDB.tmp1 t
117 WHERE c.clone_id = t.clone_id
118 ") or die "Could not do clone insertion statement:" . $dbh->errstr;
119 
120 #
121 # Retrieve all contigs on the clones present in the first 1Mbases whether
122 # or not they are on the golden path
123 #
124 $dbh->do("
125 INSERT INTO $destDB.contig
126 SELECT c.contig_id, c.name, c.clone_id, c.length, c.embl_offset, c.dna_id
127 FROM $srcDB.contig c, $destDB.tmp1 t
128 WHERE c.clone_id = t.clone_id
129 ") or die "Could not do contig insertion statement:" . $dbh->errstr;
130 
131 #
132 # Create the relevant dna table for the contigs in the test genome
133 #
134 
135 $dbh->do("
136 INSERT INTO $destDB.dna
137 SELECT d.*
138 FROM $srcDB.dna d, $destDB.contig c
139 WHERE d.dna_id = c.dna_id
140 ") or die "Could not do dna insertion statement:" . $dbh->errstr;
141 
142 #
143 # Copy the entire analysis table (This could be improved I think [mcvicker])
144 #
145 
146 $dbh->do("
147 INSERT INTO $destDB.analysis
148 SELECT *
149 FROM $srcDB.analysis
150 ") or die "Could not do analysis insertion statement:" . $dbh->errstr;
151 
152 #
153 # Copy the static golden path table
154 #
155 $dbh->do("
156 INSERT INTO $destDB.assembly
157 SELECT a.*
158 FROM $srcDB.assembly a, $srcDB.chromosome chr
159 WHERE a.chromosome_id = chr.chromosome_id
160 AND chr.name = '20'
161 ") or die "Could not do assembly insertion statement for chr2: $dbh->errstr";
162 
163 #
164 # Copy overlapping features, repeats, genes, exons, etc
165 #
166 
167 # first find gene start end thing
168 
169 $dbh->do("
170 CREATE TABLE $destDB.gene_global_start_end
171 SELECT STRAIGHT_JOIN tr.gene_id,
172 MIN(IF( a.contig_ori=1,
173  (e.contig_start+a.chr_start-a.contig_start),
174  (a.chr_start+a.contig_end-e.contig_end)))
175  as start,
176 MAX(IF( a.contig_ori=1,
177  (e.contig_end+a.chr_start-a.contig_start),
178  (a.chr_start+a.contig_end-e.contig_start)))
179  as end,
180 IF (a.contig_ori=1, e.contig_strand, (-e.contig_strand)) as strand,
181 chr.name as chromosome
182 
183 FROM $srcDB.transcript tr, $srcDB.exon_transcript et,
184  $srcDB.exon e, $srcDB.assembly a, $srcDB.chromosome chr
185 WHERE tr.transcript_id = et.transcript_id
186 AND et.exon_id = e.exon_id
187 AND e.contig_id = a.contig_id
188 AND a.chromosome_id = chr.chromosome_id
189 GROUP BY tr.gene_id
190 ") or die "Could not do gene_global_start_end table creation statement: $dbh->errstr";
191 
192 #
193 # Then make gene list for the area. Genes have to be completely in
194 #
195 
196 $dbh->do("
197 CREATE TABLE $destDB.gene_list
198 SELECT gene_id
199 FROM $destDB.gene_global_start_end g
200 WHERE g.chromosome = '20'
201 AND g.end >= 30252000
202 AND g.start < 31252001
203 ") or die "Could not do gene_list table insertion from chr2: " . $dbh->errstr;
204 
205 $dbh->do("
206 ALTER TABLE $destDB.gene_list ADD INDEX gene_id_idx( gene_id )
207 ") or die "Could not prepare genelist table alteration: " . $dbh->errstr;
208 
209 # Now copy gene
210 
211 $dbh->do("
212 INSERT INTO $destDB.gene
213 SELECT g.*
214 FROM $srcDB.gene g, $destDB.gene_list gl
215 WHERE g.gene_id = gl.gene_id
216 ") or die "Could not do gene insertion statement: " . $dbh->errstr;
217 
218 # now transcript
219 
220 $dbh->do("
221 INSERT INTO $destDB.transcript
222 SELECT tr.*
223 FROM $srcDB.transcript tr, $destDB.gene_list gl
224 WHERE tr.gene_id = gl.gene_id
225 ") or die "Could not do transcript insertion: " . $dbh->errstr;
226 
227 
228 # translations
229 
230 $dbh->do("
231 INSERT INTO $destDB.translation
232 SELECT tl.*
233 FROM $srcDB.translation tl, $destDB.transcript tr
234 WHERE tr.translation_id = tl.translation_id
235 ") or die "Could not do translation insertion " . $dbh->errstr;
236 
237 
238 # exon_transcript
239 
240 $dbh->do("
241 INSERT INTO $destDB.exon_transcript
242 SELECT et.*
243 FROM $srcDB.exon_transcript et, $destDB.transcript tr
244 WHERE tr.transcript_id = et.transcript_id
245 ") or die "Could not do exon_transcript insertion: " . $dbh->errstr;
246 
247 # exons are tricky, first make a unique list
248 
249 $dbh->do ("
250 CREATE TABLE $destDB.exon_unique
251 SELECT distinct exon_id
252 FROM $destDB.exon_transcript
253 ") or die "Could not do exon_unique table create: " . $dbh->errstr;
254 
255 # then uses this list to copy
256 
257 $dbh->do("
258 INSERT INTO $destDB.exon
259 SELECT e.*
260 FROM $srcDB.exon e, $destDB.exon_unique eu
261 WHERE e.exon_id = eu.exon_id
262 ") or die "Could not do exon insertion: " . $dbh->errstr;
263 
264 
265 $dbh->do("drop table $destDB.exon_unique")
266  or die "Could not drop exon_unique temp table: $dbh->errstr\n";
267 $dbh->do("drop table $destDB.gene_list")
268  or die "Could not drop gene_list temp table: $dbh->errstr\n";
269 $dbh->do("drop table $destDB.gene_global_start_end")
270  or die "Could not drop gene_global_start_end temp table: $dbh->errstr\n";
271 
272 # meta table
273 
274 $dbh->do("
275 INSERT INTO $destDB.meta
276 SELECT *
277 FROM $srcDB.meta
278 ") or die "Could not do meta table insertion: $dbh->errstr\n";
279 
280 $dbh->do("
281 INSERT INTO $destDB.chromosome
282 SELECT *
283 FROM $srcDB.chromosome
284 ") or die "Could not do meta table insertion: $dbh->errstr\n";
285 
286 # object_xref, identity_xref, xref
287 # gene_description
288 # external_db
289 
290 $dbh->do("
291 INSERT INTO $destDB.gene_description
292 SELECT gd.*
293 FROM $srcDB.gene_description gd, $destDB.gene g
294 WHERE gd.gene_id = g.gene_id
295 ") or die "Could not do gene description insertion: $dbh->errstr\n";
296 
297 $dbh->do("
298 INSERT INTO $destDB.object_xref
299 SELECT ox.*
300 FROM $srcDB.object_xref ox, $destDB.translation tr
301 WHERE ox.ensembl_id = tr.translation_id
302 AND ox.ensembl_object_type = 'Translation'
303 ") or die "Could not do object_xref insertion: $dbh->errstr\n";
304 
305 $dbh->do("
306 INSERT INTO $destDB.xref
307 SELECT x.*
308 FROM $srcDB.xref x, $destDB.object_xref ox
309 WHERE x.xref_id = ox.xref_id
310 ") or die "Could not do xref insertion: $dbh->errstr\n";
311 
312 $dbh->do("
313 INSERT INTO $destDB.identity_xref
314 SELECT ix.*
315 FROM $srcDB.identity_xref ix, $destDB.object_xref ox
316 WHERE ix.object_xref_id = ox.object_xref_id
317 ") or die "Could not do identity insertion: $dbh->errstr\n";
318 
319 $dbh->do("
320 INSERT INTO $destDB.external_db
321 SELECT ed.*
322 FROM $srcDB.external_db ed
323 ") or die "Could not do external_db insertion: $dbh->errstr\n";
324 
325 # the interpro we need are those which are in xref
326 
327 
328 # copy across associated simple features
329 $dbh->do("
330 INSERT INTO $destDB.simple_feature
331 SELECT esf.*
332 FROM $destDB.contig dc, $srcDB.simple_feature esf
333 WHERE esf.contig_id = dc.contig_id
334 ") or die "Could not do simple_feature insertion: $dbh->errstr\n";
335 
336 
337 # copy across associated dna_align_features
338 $dbh->do("
339 INSERT INTO $destDB.dna_align_feature
340 SELECT edna.*
341 FROM $destDB.contig dc, $srcDB.dna_align_feature edna
342 WHERE edna.contig_id = dc.contig_id
343 ") or die "Could not do dna_align_feature insertion: $dbh->errstr\n";
344 
345 
346 # copy across associated protein_align_features
347 $dbh->do("
348 INSERT INTO $destDB.protein_align_feature
349 SELECT eprot.*
350 FROM $destDB.contig dc, $srcDB.protein_align_feature eprot
351 WHERE eprot.contig_id = dc.contig_id
352 ") or die "Could not do protein_align_feature insertion: $dbh->errstr\n";
353 
354 
355 # copy across associated protein_align_features
356 $dbh->do("
357 INSERT INTO $destDB.prediction_transcript
358 SELECT epred.*
359 FROM $destDB.contig dc, $srcDB.prediction_transcript epred
360 WHERE epred.contig_id = dc.contig_id
361 ") or die "Could not do prediction_transcript insertion: $dbh->errstr\n";
362 
363 
364 # copy across associated supporting_features
365 $dbh->do("
366 INSERT INTO $destDB.supporting_feature
367 SELECT esup.*
368 FROM $destDB.exon dex, $srcDB.supporting_feature esup
369 WHERE esup.exon_id = dex.exon_id
370 ") or die "Could not do supporting_feature insertion: $dbh->errstr\n";
371 
372 
373 # copy across associated protein_features
374 $dbh->do("
375 INSERT INTO $destDB.protein_feature
376 SELECT eprot.*
377 FROM $destDB.translation dt, $srcDB.protein_feature eprot
378 WHERE eprot.translation_id = dt.translation_id
379 ") or die "Could not do protein_feature insertion: $dbh->errstr\n";
380 
381 
382 # copy across associated repeat_features
383 $dbh->do("
384 INSERT INTO $destDB.repeat_feature
385 SELECT erep.*
386 FROM $destDB.contig dc, $srcDB.repeat_feature erep
387 WHERE erep.contig_id = dc.contig_id
388 ") or die "Could not do protein_align_feature insertion: $dbh->errstr\n";
389 
390 
391 # copy across associated repeat_consensi
392 $dbh->do("
393 INSERT INTO $destDB.repeat_consensus
394 SELECT econ.*
395 FROM $destDB.repeat_feature dr, $srcDB.repeat_consensus econ
396 WHERE econ.repeat_consensus_id = dr.repeat_consensus_id
397 ") or die "Could not do repeat_consensus insertion: $dbh->errstr\n";
398 
399 
400 # copy across associated karyotype info
401 $dbh->do("
402 INSERT INTO $destDB.karyotype
403 SELECT ek.*
404 FROM $srcDB.karyotype ek
405 WHERE ek.chromosome_id = '20'
406 AND ek.chr_end >= 30252000
407 AND ek.chr_start < 31252001
408 ") or die "Could not do karyotype insertion: $dbh->errstr\n";
409 
410 
411 #
412 # Create a temp table to store superctg_name we are interested in
413 #
414 $dbh->do("
415 CREATE TEMPORARY TABLE $destDB.tmp2(
416  superctg_name VARCHAR(20) NOT NULL)
417 ") or die "Could create tmp2 table " . $dbh->errstr;
418 
419 
420 # grab the unique superctg_names
421 $dbh->do("
422 INSERT INTO $destDB.tmp2
423 SELECT distinct(superctg_name)
424 FROM $destDB.assembly
425 ") or die "Could not select the unique superctg_names: $dbh->errstr\n";
426 
427 
428 # copy across mapfrag
429 $dbh->do("
430 INSERT INTO $destDB.mapfrag
431 SELECT emf.*
432 FROM $srcDB.mapfrag emf, $destDB.tmp2 sctg
433 WHERE emf.name = sctg.superctg_name
434 ") or die "Could not do mapfrag insertion: $dbh->errstr\n";
435 
436 
437 # copy across mapset
438 $dbh->do("
439 INSERT INTO $destDB.mapset
440 SELECT ems.*
441 FROM $srcDB.mapset ems
442 ") or die "Could not do mapset insertion: $dbh->errstr\n";
443 
444 
445 # copy across map_density
446 $dbh->do("
447 INSERT INTO $destDB.map_density
448 SELECT emd.*
449 FROM $srcDB.map_density emd
450 WHERE emd.chromosome_id = '20'
451 AND emd.chr_start < 31252001
452 AND emd.chr_end >= 30252000
453 ") or die "Could not do map_density insertion: $dbh->errstr\n";
454 
455 
456 # copy across mapannotation
457 $dbh->do("
458 INSERT INTO $destDB.mapannotation
459 SELECT ema.*
460 FROM $srcDB.mapannotation ema, $destDB.tmp2 sctg
461 WHERE ema.value = sctg.superctg_name
462 ") or die "Could not do mapannotation insertion: $dbh->errstr\n";
463 
464 
465 # copy across mapannotationtype
466 $dbh->do("
467 INSERT INTO $destDB.mapannotationtype
468 SELECT emt.*
469 FROM $srcDB.mapannotationtype emt
470 ") or die "Could not do mapannotationtype insertion: $dbh->errstr\n";
471 
472 
473 # copy across mapfrag_mapset
474 $dbh->do("
475 INSERT INTO $destDB.mapfrag_mapset
476 SELECT emfms.*
477 FROM $srcDB.mapfrag_mapset emfms, $destDB.mapfrag dmf
478 WHERE emfms.mapfrag_id = dmf.mapfrag_id
479 ") or die "Could not do mapfrag_mapset insertion: $dbh->errstr\n";
480 
481 
482 # copy across dnafrag
483 $dbh->do("
484 INSERT INTO $destDB.dnafrag
485 SELECT edf.*
486 FROM $srcDB.dnafrag edf
487 ") or die "Could not do dnafrag insertion: $dbh->errstr\n";
488 
489 
490 # finally, drop the temporary tables
491 $dbh->do("
492 DROP TABLE $destDB.tmp1
493 ") or die "Could not drop temporary table tmp1: $dbh->errstr\n";
494 
495 $dbh->do("
496 DROP TABLE $destDB.tmp2
497 ") or die "Could not drop temporary table tmp2: $dbh->errstr\n";
498 
499 
500 #disconnect from the DB
501 $dbh->disconnect();
502 
503 print "Test genome database $destDB created\n";
504 
505 1;