2 # See the NOTICE file distributed with this work for additional information
3 # regarding copyright ownership.
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
9 # http://www.apache.org/licenses/LICENSE-2.0
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
17 #The script populates a stable_id lookup database with all stable ids found in databases on a specified server for
18 #a specified db release.
19 #The stable ids are copied for objects listed in hash %group_objects
23 use DBI qw( :sql_types );
27 my ( $rdbname, $version, $host, $port, $user, $pass );
28 my ( $lhost, $lport, $luser, $lpass, $test );
42 # OperonTranscript => 1, # these are in transcript table anyway
44 TranscriptArchive => 1,
45 TranslationArchive => 1,
48 # otherfeatures can be skipped for now as plant reuse the same ids in core and otherfeatures
55 # not sure these are supposed to work - genetrees and families can't have species ids ..
56 # in fact in eg_23_76 there are no compara ids in the stable_id_lookup
64 "lhost|lh=s" => \$lhost,
66 "luser|lu=s" => \$luser,
67 "lpass|lp=s" => \$lpass,
72 "dbname|db=s" => \$rdbname,
73 "create!" => \$create,
74 "create_index!" => \$create_index,
75 "version|v=s" => \$version,
81 usage() if ( !defined $lhost || !defined $luser || !defined $lpass );
82 usage() if ( !defined $host || !defined $port || !defined $user || !defined $version );
84 $rdbname ||= "ensembl_stable_ids_$version";
85 #$rdbname ||= "ensemblgenomes_stable_ids_$version";
87 my ( $readDB, $writeDB ) =
init_db();
89 print
"readDB credentials:\n";
90 print Dumper($readDB);
92 print
"writeDB credentials:\n";
93 print Dumper($writeDB);
97 # This is where the new stable_id database will be created
98 my $writeDB_credentials = {
99 user => defined($luser) ? $luser : $user,
100 pass => defined($lpass) ? $lpass : $pass,
101 host => defined($lhost) ? $lhost : $host,
102 port => defined($lport) ? $lport : $port,
107 # This is where all the object dbs reside and queried against for stable ids
108 my $readDB_credentials = {
116 return ( $readDB_credentials, $writeDB_credentials );
123 print(
"lastSpeciesID + $lastSpeciesID\n");
126 my $new_species = {};
135 my $dbname = $writeDB->{
'dbname'};
136 print
"Creating index for $dbname\n";
138 my $host = $writeDB->{
'host'};
139 my $port = $writeDB->{
'port'};
140 my $user = $writeDB->{
'user'};
141 my $pass = $writeDB->{
'pass'};
145 my $cmd =
"mysql -h $host";
149 $cmd .=
" -u $user --password=$pass $dbname < ./sql/indices.sql";
150 system($cmd) == 0 or die(
"error encountered when creating index for database $dbname\n");
154 die(
"An SQL error occured while creating database $dbname:\n$@");
156 my $took = time - $startAt;
158 my $m = ( $took / 60 ) % 60;
159 my $h = $took / 3600;
161 warn sprintf(
"Indexed in %02d:%02d:%02d\n", $h, $m, $s );
168 my $dbname = $writeDB->{
'dbname'};
170 print
"Creating database $dbname\n";
172 my $host = $writeDB->{
'host'};
173 my $port = $writeDB->{
'port'};
174 my $user = $writeDB->{
'user'};
175 my $pass = $writeDB->{
'pass'};
179 $dbh->do(
"drop database if exists $dbname");
180 $dbh->do(
"create database $dbname");
182 my $cmd =
"mysql -h $host";
186 $cmd .=
" -u $user --password=$pass $dbname < ./sql/tables.sql";
187 system($cmd) == 0 or die(
"error encountered when creating schema for database $dbname\n");
189 $dbh->do(
"use $dbname");
192 "INSERT INTO meta(species_id,meta_key,meta_value) VALUES (NULL,'schema_version','$version')"
198 die(
"An SQL error occured while creating database $dbname:\n$@");
205 my ($connectDB) = @_;
207 my $host = $connectDB->{
'host'};
208 my $user = $connectDB->{
'user'};
209 my $pass = $connectDB->{
'pass'};
210 my $port = $connectDB->{
'port'};
212 my $counter_core = 1;
213 my $counter_other = 1;
216 if ( defined $pass ) {
217 $out = `mysql -h $host -P $port -u $user -p$pass -e
'show databases like "%$version%"'`;
220 $out = `mysql -h $host -P $port -u $user -e
'show databases like "%$version%"'`;
222 my @dbs = split /\n/, $out;
226 foreach my $db (@dbs) {
228 if ( $db =~ /([a-z]+_[a-z0-9]+(?:_[a-z0-9]+)?_([a-z]+)(?:_\d+)?)_(\d+)_\w+/ )
230 my ( $species, $dbtype, $dbversion ) = ( $1, $2, $3 );
231 print
"SPECIES: $species\tDBTYPE $dbtype\tDBVERSION: $dbversion\n";
232 if ( $dbversion =~ /^$version/ ) {
233 if ( exists $dba_species->{$species} ) {
234 if ( exists $dba_species->{$species}->{ID} ) {
235 warn
"* $species : LOADED\n";
240 my $speciesOffset = $lastSpeciesID;
241 if ( exists $new_species->{$species} ) {
242 $speciesOffset = $new_species->{$species} - 1; #
this is the offset.
245 if ( $species =~ /_collection/ ) {
249 #print "\t\tGoing to add species $db offset $speciesOffset\n";
254 elsif ( $db =~ /([\w\_]+)_(
compare)_([\d\_\w]+)/ ) {
255 my ( $division, $dbtype, $dbversion ) = ( $1, $2, $3 );
256 if ( $dbversion =~ /^$version/ ) {
257 #add_compara_db($db);
263 my $took = time - $startAt;
265 my $m = ( $took / 60 ) % 60;
266 my $h = $took / 3600;
268 #warn sprintf( "Loaded in %02d:%02d:%02d\n", $h, $m, $s );
274 # not sure what is supposed to happen to compara IDs - they do not have species_id
275 warn
"- Skipping $dbname\n";
280 my ( $dbname, $speciesOffset ) = @_;
282 # 1 comes from species_id = 1 in meta table
283 #print "From add_species_db\n";
284 #warn "- Adding species $dbname (Species ID: ", 1 + $speciesOffset, ")\n";
286 if ( $dbname =~ /([\w\_]+)_(core|otherfeatures)_([\d\_\w]+)/ ) {
287 my ( $species, $dbtype, $dbversion ) = ( $1, $2, $3 );
290 my $dba_read =
db_connect( $dbname, $readDB );
291 my $dba_write =
db_connect( $writeDB->{
"dbname"}, $writeDB );
293 load_ids( $dba_read, $dba_write, $dbtype, $speciesOffset );
295 if ( not exists $new_species->{$species} ) {
296 load_species( $dba_read, $dba_write, $speciesOffset, $dbname );
297 $new_species->{$species} = $lastSpeciesID;
300 $dba_read->disconnect();
301 $dba_write->disconnect();
303 #warn "+ Loaded in ", time - $t1, "s\n";
308 my ( $dbname, $speciesOffset ) = @_;
310 warn
"- Adding collection $dbname (from Species ID ", 1 + $speciesOffset,
")\n";
312 if ( $dbname =~ /([\w\_]+)_(core|otherfeatures)_([\d\_\w]+)/ ) {
313 my ( $species, $dbtype, $dbversion ) = ( $1, $2, $3 );
317 my $dba_read =
db_connect( $dbname, $readDB );
318 my $dba_write =
db_connect( $writeDB->{
"dbname"}, $writeDB );
320 load_ids( $dba_read, $dba_write, $dbtype, $speciesOffset );
322 if ( not exists $new_species->{$species} ) {
323 load_species( $dba_read, $dba_write, $speciesOffset, $dbname );
324 $new_species->{$species} = $lastSpeciesID;
327 $dba_read->disconnect();
328 $dba_write->disconnect();
330 #warn "+ Loaded in ", time - $t1, "s\n";
335 my $dbh_read = shift;
336 my $dbh_write = shift;
337 my $speciesOffset = shift;
340 my $sqlName = qq{SELECT species_id + $speciesOffset, meta_value FROM meta WHERE meta_key =
"species.production_name"};
341 my $sqlTaxon = qq{SELECT species_id + $speciesOffset, meta_value FROM meta WHERE meta_key =
"species.taxonomy_id"};
345 my $sthN = $dbh_read->prepare($sqlName);
347 while ( my ( $sid, $name ) = $sthN->fetchrow_array() ) {
348 $shash->{$sid}->{Name} = $name;
349 if ( $lastSpeciesID < $sid ) {
350 $lastSpeciesID = $sid;
355 my $sthT = $dbh_read->prepare($sqlTaxon);
357 while ( my ( $sid, $taxid ) = $sthT->fetchrow_array() ) {
358 $shash->{$sid}->{TaxID} = $taxid;
363 my $insertSQL = qq{ INSERT INTO $writeDB->{
'dbname'}.species (species_id, name, taxonomy_id) VALUES };
366 foreach my $sid ( sort keys %{ $shash || {} } ) {
367 push(@tuples, sprintf( q{(%s, %s, %s)}, $sid, $dbh_write->quote( $shash->{$sid}->{Name} ), $dbh_write->quote( $shash->{$sid}->{TaxID} ) )
371 # Add the collection as well so if restart the script it does not load this collection again
372 if ( $dbname =~ /([\w\_]+_collection)_(core|otherfeatures)_([\d\_\w]+)/ ) {
373 my ( $species, $t, $v ) = ( $1, $2, $3 );
375 sprintf( q{(%s, %s, 0)},
377 $dbh_write->quote($species) );
382 $dbh_write->do( $insertSQL . join(
',', @tuples ) );
384 warn $insertSQL . join(
',', @tuples ),
"\n";
385 die
"ERROR: ", $DBI::errstr;
392 my ( $dbh_read, $dbh_write, $dbtype, $speciesOffset ) = @_;
393 #print "DbType : $dbtype speciesOffset : $speciesOffset\n";
395 my @stable_id_objects = keys %{ $group_objects{$dbtype} || {} };
397 foreach my $object_name (@stable_id_objects) {
398 my $object = lc($object_name);
402 if ( $object_name =~ /([A-Za-z]+)Archive/ ) {
405 #Note: Archive is not needed by ensemblgenomes as the stable_id_event is not populated
406 #It is needed by ensembl. For the time being, we have to assume that the coord_system holds only one species with id of 1
407 #In future ensembl also might need to support multiple databases
410 "SELECT DISTINCT old_stable_id, $species_id + $speciesOffset, '$dbtype', '$object' \
412 WHERE old_stable_id IS NOT NULL
414 AND old_stable_id NOT IN (SELECT stable_id FROM $object)";
415 $select_sql = $test ? $select_sql .
" limit 10" : $select_sql;
419 my $rows_inserted =
build_insert_sql( $select_sql, $dbh_read, $dbh_write, $is_archive );
421 elsif ( $object_name =~ / RNAProduct | Translation /x ) {
422 my $sth = $dbh_read->prepare(
"SELECT COUNT(*) FROM $object");
424 my ($count) = $sth->fetchrow_array;
426 if ( $dbtype eq
'core' ) {
428 "SELECT DISTINCT o.stable_id, cs.species_id + $speciesOffset, '$dbtype', '$object_name' \
430 LEFT JOIN transcript t USING (transcript_id) \
431 LEFT JOIN seq_region sr USING(seq_region_id) \
432 LEFT JOIN coord_system cs USING(coord_system_id) \
433 WHERE o.stable_id IS NOT NULL";
435 $select_sql = $test ? $select_sql .
" limit 10" : $select_sql;
439 elsif ( $dbtype eq
'otherfeatures' ) {
442 "SELECT DISTINCT tl.stable_id, cs.species_id + $speciesOffset, '$dbtype', '$object_name' \
443 FROM translation tl \
444 LEFT JOIN transcript t USING (transcript_id) \
445 LEFT JOIN analysis a USING (analysis_id) \
446 LEFT JOIN seq_region sr USING(seq_region_id) \
447 LEFT JOIN coord_system cs USING(coord_system_id) \
448 WHERE logic_name like 'RefSeq_%' OR logic_name like 'CCDS_%'";
450 $select_sql = $test ? $select_sql .
" limit 10" : $select_sql;
457 my $sth = $dbh_read->prepare(
"SELECT COUNT(*) FROM $object");
459 my ($count) = $sth->fetchrow_array;
461 if ( $dbtype eq
'core' ) {
463 "SELECT DISTINCT o.stable_id, cs.species_id + $speciesOffset, '$dbtype', '$object_name' \
465 LEFT JOIN seq_region sr USING(seq_region_id) \
466 LEFT JOIN coord_system cs USING(coord_system_id) \
467 WHERE o.stable_id is not NULL";
469 $select_sql = $test ? $select_sql .
" limit 10" : $select_sql;
474 elsif ( $dbtype eq
'otherfeatures' ) {
477 "SELECT DISTINCT o.stable_id, cs.species_id + $speciesOffset, '$dbtype', '$object_name' \
479 LEFT JOIN analysis a USING (analysis_id) \
480 LEFT JOIN seq_region sr USING(seq_region_id) \
481 LEFT JOIN coord_system cs USING(coord_system_id) \
482 WHERE logic_name like 'RefSeq_%' OR logic_name like 'CCDS_%'";
484 $select_sql = $test ? $select_sql .
" limit 10" : $select_sql;
495 my ( $select_sql, $dbh_read, $dbh_write, $is_archive ) = @_;
497 my $insert_sql = qq{INSERT INTO $writeDB->{
'dbname'}.stable_id_lookup(stable_id, species_id, db_type, object_type) VALUES };
500 $insert_sql = qq{INSERT INTO $writeDB->{
'dbname'}.archive_id_lookup(archive_id, species_id, db_type, object_type) VALUES };
503 my $import_sth = $dbh_read->prepare($select_sql);
504 $import_sth->execute;
505 my $rows = $import_sth->rows;
509 my $max_rows_in_insert = 99999; #Batch size of 100,000
510 my $rows_in_insert = 0;
512 my @insert_container = ();
514 while ( my $ref = $import_sth->fetchrow_arrayref ) {
515 if ( $rows_in_insert < $max_rows_in_insert ) {
516 $insert_values .=
',' if $start++;
517 $insert_values .=
'(' . ( join(
",",
map { $dbh_read->quote($_) } @{$ref} ) ) .
')';
521 $insert_values .=
',' if $start++;
522 $insert_values .=
'(' . ( join(
",",
map { $dbh_read->quote($_) } @{$ref} ) ) .
')';
523 my $insert_sql_with_values = $insert_sql . $insert_values;
524 push( @insert_container, $insert_sql_with_values );
526 $insert_values = undef;
532 if ($insert_values) {
533 my $insert_sql_with_values = $insert_sql . $insert_values;
534 push( @insert_container, $insert_sql_with_values );
537 my $rows_inserted = 0;
538 foreach my $insert_stmt (@insert_container) {
539 my $affected = $dbh_write->do($insert_stmt);
540 $rows_inserted += $affected;
543 #print "Number of rows fetched $rows \n Number of rows inserted into $writeDB->{'dbname'}.stable_id_lookup ", $rows_inserted, "\n";
544 return $rows_inserted;
551 my $sql = qq{SELECT species_id, meta_value FROM meta WHERE meta_key =
"species.production_name"};
552 my $sth = $dbh->prepare($sql);
554 while ( my ( $sid, $name ) = $sth->fetchrow_array() ) {
555 $dba_species->{$name}->{ID} = $sid + $offset;
561 my ( $dbname, $connectDB ) = @_;
563 $host = $connectDB->{
'host'};
564 $user = $connectDB->{
'user'};
565 $pass = $connectDB->{
'pass'};
566 $port = $connectDB->{
'port'};
568 my $dsn =
"DBI:mysql:host=$host;";
570 $dsn .=
"port=$port;";
572 $dsn .=
"database=$dbname" if $dbname;
574 my $dbh = DBI->connect( $dsn, $user, $pass,
575 {
'PrintError' => 1,
'RaiseError' => 1 } );
578 die
"ERROR: $DBI::errstr";
590 my $dbh =
db_connect( $readDB->{
'dbname'}, $readDB );
592 my $sth = $dbh->prepare(
"SELECT species_id, name FROM species");
594 while ( my ( $sid, $name ) = $sth->fetchrow_array() ) {
595 $shash->{$name}->{ID} = $sid;
596 if ( $sid > $ssid ) {
602 return ( $shash, $ssid );
606 my $indent =
' ' x length($0);
607 print <<EOF; exit(0);
609 The script populates a stable_id lookup database with all stable ids found in databases
610 on a specified server
for a specified db release.
611 Stable ids are copied
for objects listed in hash %group_objects
613 The script is used
for both ensembl and ensembl genomes
615 Options -lhost -lport -luser -lpass -version are mandatory and specify the credentials
for the server on which a stable
id lookup database exists or is to be created (
if using option -create).
616 If an argument
for option -ldbname is not provided, the
default name
for the database wil be used:
'ensemblgenomes_stable_id_lookup_xx', where xx is the database release (option -version).
618 Options -host -user -port specify the credentials of the server(s) where stable ids are to be copied from.
620 To
run the script cd into the directory where the script lives eg:
621 cd ensembl/misc-scripts/stable_id_lookup/
624 This command will create database ensembl_stable_ids_88 on server mysql-xxx-dev.ebi.ac.uk
for release 88 databases found on mysql-ens-xxx.ebi.ac.uk:
628 (with -test flag create a small subset database)
629 perl populate_stable_id_lookup.pl -lhost mysql-xxx-dev.ebi.ac.uk -luser xxxrw -lpass xxxx -lport 4484 -dbname ensembl_stable_ids_88 -create -host mysql-ens-xxx.ebi.ac.uk -user ensro -port 4519 -version 88 -test
631 (without -test flag creates a full version)
632 perl populate_stable_id_lookup.pl -lhost mysql-xxx-dev.ebi.ac.uk -luser xxxrw -lpass xxxx -lport 4484 -dbname ensembl_stable_ids_88 -create -host mysql-ens-xxx.ebi.ac.uk -user ensro -port 4519 -version 88
637 $0 -lhost host_name -lport port_number -luser user_name -lpass password -version db_version
638 $indent [-create] [-dbname database_name]
641 -h|host Database host where stable_ids are to be copied from (multiple hosts can be specified)
643 -u|user Database user where stable_ids are to be copied from (each host needs a user specified,
644 if multiple -h|host options are given and fewer -u|user options are specified,
645 the first user name will be used
for the hosts where no user name was given)
647 -port Database port where stable_ids are to be copied from (
if more than one host is specified
648 multiple ports can be provided)
650 -lh|lhost Database host where stable_id lookup database exists or is to be created
652 -lu|luser Database user where stable_id lookup database exists or is to be created
654 -lp|lpass Database password where stable_id lookup database exists or is to be created
656 -lport Database port where stable_id lookup database exists or is to be created
658 -v|version EG version to match, e.g 88_77 OR Ensembl version to match 88
660 -db|dbname Database name
for the stable
id lookup database, e.g ensembl_stable_ids_88
662 -create Create the stable
id lookup database
using sql source ./sql/tables.sql