3 # See the NOTICE file distributed with this work for additional information
4 # regarding copyright ownership.
6 # Licensed under the Apache License, Version 2.0 (the "License");
7 # you may not use this file except in compliance with the License.
8 # You may obtain a copy of the License at
10 # http://www.apache.org/licenses/LICENSE-2.0
12 # Unless required by applicable law or agreed to in writing, software
13 # distributed under the License is distributed on an "AS IS" BASIS,
14 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 # See the License for the specific language governing permissions and
16 # limitations under the License.
21 update_mapping_set.pl - script to update 2 tables: mapping_set and
22 seq_region_id mapping. These tables are
23 supposed to store different mapping of
24 seq_region_id between releases
28 update_mapping_set.pl [arguments]
32 --user=user username
for the database
34 --pass=pass password
for the database
36 --release=num_release release we want to
run the updating
40 --host=host server where the core databases are stored
41 (
default: ens-staging)
43 --oldhost = oldhost server where the old release databases are stored
45 --dbname=dbname
if you want a single database to update
46 the mapping_set information (
default: all
49 --port=port port (
default: 3306)
51 --oldport=port old database server port (
default: 3306)
53 --olduser=user old database server username
55 --oldpass=pass password
for old database server
57 --previous_dbname old database name
58 allows to skip the expected name matching between
61 --help print help (
this message)
65 This script will update the mapping_set table with the current
66 schema_build information and indicate
if the seq_region table has
67 changed from previous release.
69 If it has, the table seq_region_mapping will contain the
70 relation between old (external_seq_region_id) and the current
71 (internal_seq_region_id).
75 Update mapping_set information
for all databases in ens-staging in
76 release NN (the usual use
case in release process):
78 $ ./update_mapping_set.pl --user ensadmin \
79 --pass password --release NN --old_host ensembldb-ensembl.org
81 Update mapping_set information only for pig database in ens-genomics1:
83 $ ./update_mapping_set.pl --host ens-genomics1 \
84 --user ensadmin --pass password --dbname my_pig_db --release 52
89 Daniel Rios <dani@ebi.ac.uk>, Ensembl core API team
101 use DBI qw(:sql_types);
105 ## Command line options
107 my $host =
'ens-staging';
108 my $host2 =
'ens-staging2';
109 my $oldhost =
'ens-livemirror';
115 my $olduser =
"ensro";
117 my $compare_dbname = undef;
124 GetOptions(
'host=s' => \$host,
125 'host2=s' => \$host2,
126 'dbname=s' => \$dbname,
130 'release=i' => \$release,
131 'oldhost=s' => \$oldhost,
132 'oldport=s' => \$oldport,
133 'olduser=s' => \$olduser,
134 'oldpass=s' => \$oldpass,
135 'previous_dbname=s' => \$compare_dbname,
137 'dry_run' => \$dry_run,
138 'dbtype=s' => \$dbtype,
141 pod2usage(1) if($help);
142 throw("--user argument required") if (!defined($user));
143 throw("--release argument required") if(!defined($release));
145 my $database = 'information_schema';
146 my $old_dbh = DBI->connect("DBI:mysql:database=$database;host=$oldhost;port=$oldport",$olduser,$oldpass);
148 foreach my $h ($host,$host2) {
150 my $dbh = DBI->connect(
"DBI:mysql:database=$database;host=$h;port=$port",$user,$pass);
152 #since there is no database defined, will run it agains all core databases
154 if (!defined ($dbname)){
155 $pattern =
"_".$dbtype.
"_".$release.
"_";
161 # Fetch all databases matching the pattern
162 my $sth = $dbh->prepare(
"SHOW DATABASES like \'%$pattern%\'");
164 my $dbs = $sth->fetchall_arrayref();
165 foreach my $db_name (@{$dbs}){
166 my $current_dbname = $db_name->[0];
167 print STDERR
"Going to update mapping for $current_dbname....\n";
171 if ($latest_schema_build eq $schema_build) {
172 print STDERR
"$current_dbname already has a mapping for $schema_build, skipping\n";
176 if (!$compare_dbname) {
179 $previous_dbname = $compare_dbname;
182 if (!defined($previous_dbname)) {
183 print STDERR
"First instance known for $current_dbname, no mapping needed\n";
187 # If it is a new assembly, no mapping needed
188 my $current_assembly =
get_assembly($dbh,$current_dbname) ;
189 my $old_assembly =
get_assembly($old_dbh,$previous_dbname);
190 if ($old_assembly ne $current_assembly) {
191 print STDERR
"New assembly $current_assembly for $current_dbname, no mapping needed\n" ;
195 update_mapping($dbh, $current_dbname, $previous_dbname, $dry_run);
198 my $sth_mapping_set = $dbh->prepare(
"INSERT INTO $current_dbname.mapping_set VALUES(?,?,?)");
199 my $sth_update_build = $dbh->prepare(
"UPDATE $current_dbname.mapping_set SET internal_schema_build = ?");
201 my $new_mapping = $sth_mapping_set->execute($mapping_set_id,$schema_build,$previous_schema_build) unless $dry_run;
202 $sth_update_build->execute($schema_build) unless $dry_run;
209 my ($dbh, $current_dbname, $previous_dbname, $dry_run) = @_;
213 my $sth_seq_mapping = $dbh->prepare(
"INSERT INTO $current_dbname.seq_region_mapping VALUES(?,?,?)");
214 my $sth_update_old = $dbh->prepare(
"UPDATE $current_dbname.seq_region_mapping SET internal_seq_region_id = ? WHERE internal_seq_region_id = ?");
215 my $sth_remove_deprecated = $dbh->prepare(
"DELETE FROM $current_dbname.seq_region_mapping WHERE internal_seq_region_id = ?");
217 my $current_assembly =
get_assembly($dbh,$current_dbname) ;
219 my $count_removed = 0;
220 my $count_updated = 0;
223 # If there has been no change in seq_region, no mapping needed
226 if ($cur_seq_region_checksum == $previous_seq_region_checksum) {
227 print STDERR
"No change in seq_region for $current_dbname, no mapping needed\n";
231 # There has been a seq_region change between releases, add the relation old_seq_region_id->new_seq_region_id
235 # Build a hash of currently used seq region ids to ensure we do not map to overlapping IDs
236 # i.e. a database has reused seq region IDs between releases
239 # Update the seq_region_mapping table with the old->new seq_region_id relation
240 foreach my $seq_region_name (keys %{$old_seq_region}){
241 my $current_name_hash = $current_seq_region->{$seq_region_name};
242 my $old_name_hash = $old_seq_region->{$seq_region_name};
244 # If the seq_region has disappeared, remove previous entries for that id
245 if (!defined $current_name_hash) {
247 $count_removed += $sth_remove_deprecated->execute($id) unless $dry_run;
250 # while ( my ($length, $hash_length) = each %{$old_name_hash})
251 foreach my $length (keys %{$old_name_hash}){
252 my $current_length_hash = $current_name_hash->{$length};
253 my $old_length_hash = $old_name_hash->{$length};
255 # The seq_region might have a different length
256 if (!defined $current_length_hash) {
259 foreach my $cs (keys %{$old_length_hash}) {
260 my $current_cs_hash = $current_length_hash->{$cs};
261 my $old_cs_hash = $old_length_hash->{$cs};
263 # The coord system might have changed
264 if (!defined $current_cs_hash) {
267 foreach my $id (keys %{$old_cs_hash}) {
268 my $current_id = $current_cs_hash->{$id};
269 my $old_id = $old_cs_hash->{$id};
271 # If no change, no need to write out
272 if (!defined $current_id || $old_id == $current_id) {
276 if(exists $current_seq_region_ids->{$old_id}) {
277 printf STDERR
"Skipping the mapping for old id %d to current id %d as the old ID is in use in the DB. This means IDs have been reused. Do not reused seq_region_id primary keys\n", $old_id, $current_id;
282 # If there is a change, update any existing entries for this seq_region to the new id
283 # Then, add a new entry to map said id to the old release
284 $count_updated += $sth_update_old->execute($current_id,$old_id) unless $dry_run;
285 $count_added += $sth_seq_mapping->execute($old_id,$current_id, $mapping_set_id) unless $dry_run;
294 print STDERR
"For $current_dbname, removed $count_removed, added $count_added, updated $count_updated seq_region_mapping entries\n\n" ;
298 die
"Error detected when loading the mapping sets. Check STDERR for more information";
303 # For a given database, will return the seq_region_name->seq_region_id relation
305 my ($dbh, $dbname) = @_;
306 my (%seq_region_hash, $seq_region_id, $seq_region_name, $coord_system_id, $length, $cs_name, $cs_rank);
307 my $sth = $dbh->prepare(
"SELECT seq_region_id, s.name, length, cs.name, cs.rank FROM $dbname.seq_region s, $dbname.coord_system cs WHERE cs.coord_system_id = s.coord_system_id");
309 $sth->bind_col(1,\$seq_region_id);
310 $sth->bind_col(2,\$seq_region_name);
311 $sth->bind_col(3,\$length);
312 $sth->bind_col(4,\$cs_name);
313 $sth->bind_col(5,\$cs_rank);
315 $seq_region_hash{$seq_region_name}{$length}{$cs_name}{$cs_rank} = $seq_region_id;
317 return \%seq_region_hash;
321 my ($dbh, $dbname) = @_;
322 my $sql = qq{select seq_region_id from ${dbname}.seq_region};
323 my %hash =
map { $_, 1 } @{$dbh->selectcol_arrayref($sql)};
327 # For a given database, returns the size of the seq_region_table
329 my ($dbh, $dbname) = @_;
330 my $sth_status = $dbh->prepare(
"checksum table $dbname.seq_region") ;
331 $sth_status->execute();
332 my $table_status = $sth_status->fetchrow_array();
333 return $table_status; #
return the size of the table
337 my ($dbh, $dbname, $seq_region_name) = @_;
338 my $sth_region = $dbh->prepare(
"SELECT seq_region_id FROM $dbname.seq_region WHERE name = ?");
339 $sth_region->execute($seq_region_name);
340 my $seq_region_id = $sth_region->fetchrow_array();
341 return $seq_region_id;
344 # Will return the max mapping_set_id being used in the mapping_set table
346 my ($dbh, $dbname) = @_;
347 my $sth_mapping = $dbh->prepare(
"select max(mapping_set_id) from $dbname.mapping_set");
348 $sth_mapping->execute();
349 my ($max_mapping_set_id) = $sth_mapping->fetchrow_array();
350 if (!defined $max_mapping_set_id) {
return 0; }
351 return $max_mapping_set_id;
355 my ($dbh, $dbname) = @_;
356 my $sth_mapping = $dbh->prepare(
"select internal_schema_build from $dbname.mapping_set order by mapping_set_id desc limit 1");
357 $sth_mapping->execute();
358 my ($internal_schema_build) = $sth_mapping->fetchrow_array();
359 if (!defined $internal_schema_build) {
return 0; }
360 return $internal_schema_build;
363 # This method will return the name of the previous database to release for same species (assuming is present)
365 my ($dbh, $dbname, $release) = @_;
367 $dbname =~ /(^([a-z]+_){2,3}[a-z]+_)/;
368 if (!$1) {
throw(
"Database name $dbname is not in the right format"); }
369 my $previous_release_name = $1 . (--$release);
370 my $previous_sth = $dbh->prepare(
"show databases like \'%$previous_release_name%\'");
371 $previous_sth->execute();
372 ($previous_dbname) = $previous_sth->fetchrow_array() ;
373 return $previous_dbname;
376 # For a standard ensembl database name, returns the release number and assembly
379 my @dbname = split/_/, $dbname;
380 return join
"_", $dbname[($#dbname -1)], $dbname[($#dbname)];
383 # Returns the assembly name for a given database
385 my ($dbh, $dbname) = @_;
386 my $sth = $dbh->prepare(
"select distinct version from $dbname.coord_system where attrib = 'default_version'");
388 return $sth->fetchrow();