ensembl-hive  2.8.1
update_mapping_set.pl
Go to the documentation of this file.
1 #!/usr/bin/env perl
2 #
3 # See the NOTICE file distributed with this work for additional information
4 # regarding copyright ownership.
5 #
6 # Licensed under the Apache License, Version 2.0 (the "License");
7 # you may not use this file except in compliance with the License.
8 # You may obtain a copy of the License at
9 #
10 # http://www.apache.org/licenses/LICENSE-2.0
11 #
12 # Unless required by applicable law or agreed to in writing, software
13 # distributed under the License is distributed on an "AS IS" BASIS,
14 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 # See the License for the specific language governing permissions and
16 # limitations under the License.
17 
18 
19 =head1 NAME
20 
21 update_mapping_set.pl - script to update 2 tables: mapping_set and
22  seq_region_id mapping. These tables are
23  supposed to store different mapping of
24  seq_region_id between releases
25 
26 =head1 SYNOPSIS
27 
28 update_mapping_set.pl [arguments]
29 
30 Required arguments:
31 
32  --user=user username for the database
33 
34  --pass=pass password for the database
35 
36  --release=num_release release we want to run the updating
37 
38 Optional arguments:
39 
40  --host=host server where the core databases are stored
41  (default: ens-staging)
42 
43  --oldhost = oldhost server where the old release databases are stored
44 
45  --dbname=dbname if you want a single database to update
46  the mapping_set information (default: all
47  databases)
48 
49  --port=port port (default: 3306)
50 
51  --oldport=port old database server port (default: 3306)
52 
53  --olduser=user old database server username
54 
55  --oldpass=pass password for old database server
56 
57  --previous_dbname old database name
58  allows to skip the expected name matching between
59  old and new database
60 
61  --help print help (this message)
62 
63 =head1 DESCRIPTION
64 
65 This script will update the mapping_set table with the current
66 schema_build information and indicate if the seq_region table has
67 changed from previous release.
68 
69 If it has, the table seq_region_mapping will contain the
70 relation between old (external_seq_region_id) and the current
71 (internal_seq_region_id).
72 
73 =head1 EXAMPLES
74 
75 Update mapping_set information for all databases in ens-staging in
76 release NN (the usual use case in release process):
77 
78  $ ./update_mapping_set.pl --user ensadmin \
79  --pass password --release NN --old_host ensembldb-ensembl.org
80 
81 Update mapping_set information only for pig database in ens-genomics1:
82 
83  $ ./update_mapping_set.pl --host ens-genomics1 \
84  --user ensadmin --pass password --dbname my_pig_db --release 52
85 
86 
87 =head1 AUTHOR
88 
89 Daniel Rios <dani@ebi.ac.uk>, Ensembl core API team
90 
91 =head1 CONTACT
92 
93 =cut
94 
95 use strict;
96 use warnings;
97 
98 use Getopt::Long;
99 use Pod::Usage;
100 
101 use DBI qw(:sql_types);
102 
103 use Bio::EnsEMBL::Utils::Exception qw(throw);
104 
105 ## Command line options
106 
107 my $host = 'ens-staging';
108 my $host2 = 'ens-staging2';
109 my $oldhost = 'ens-livemirror';
110 my $dbname = undef;
111 my $user = undef;
112 my $pass = undef;
113 my $port = 3306;
114 my $oldport = 3306;
115 my $olduser = "ensro";
116 my $oldpass = undef;
117 my $compare_dbname = undef;
118 my $help = undef;
119 my $release = undef;
120 my $dry_run = undef;
121 my $dbtype = "core";
122 my $previous_dbname;
123 
124 GetOptions('host=s' => \$host,
125  'host2=s' => \$host2,
126  'dbname=s' => \$dbname,
127  'user=s' => \$user,
128  'pass=s' => \$pass,
129  'port=s' => \$port,
130  'release=i' => \$release,
131  'oldhost=s' => \$oldhost,
132  'oldport=s' => \$oldport,
133  'olduser=s' => \$olduser,
134  'oldpass=s' => \$oldpass,
135  'previous_dbname=s' => \$compare_dbname,
136  'help' => \$help,
137  'dry_run' => \$dry_run,
138  'dbtype=s' => \$dbtype,
139  );
140 
141 pod2usage(1) if($help);
142 throw("--user argument required") if (!defined($user));
143 throw("--release argument required") if(!defined($release));
144 
145 my $database = 'information_schema';
146 my $old_dbh = DBI->connect("DBI:mysql:database=$database;host=$oldhost;port=$oldport",$olduser,$oldpass);
147 
148 foreach my $h ($host,$host2) {
149 
150  my $dbh = DBI->connect("DBI:mysql:database=$database;host=$h;port=$port",$user,$pass);
151 
152  #since there is no database defined, will run it agains all core databases
153  my $pattern;
154  if (!defined ($dbname)){
155  $pattern = "_".$dbtype."_".$release."_";
156  }
157  else{
158  $pattern = $dbname;
159  }
160 
161 # Fetch all databases matching the pattern
162  my $sth = $dbh->prepare("SHOW DATABASES like \'%$pattern%\'");
163  $sth->execute();
164  my $dbs = $sth->fetchall_arrayref();
165  foreach my $db_name (@{$dbs}){
166  my $current_dbname = $db_name->[0];
167  print STDERR "Going to update mapping for $current_dbname....\n";
168 
169  my $schema_build = get_schema_and_build($current_dbname);
170  my $latest_schema_build = get_latest_schema_build($dbh, $current_dbname);
171  if ($latest_schema_build eq $schema_build) {
172  print STDERR "$current_dbname already has a mapping for $schema_build, skipping\n";
173  next;
174  }
175 
176  if (!$compare_dbname) {
177  $previous_dbname = &get_previous_dbname($old_dbh,$current_dbname,$release);
178  } else {
179  $previous_dbname = $compare_dbname;
180  }
181 
182  if (!defined($previous_dbname)) {
183  print STDERR "First instance known for $current_dbname, no mapping needed\n";
184  next;
185  }
186 
187 # If it is a new assembly, no mapping needed
188  my $current_assembly = get_assembly($dbh,$current_dbname) ;
189  my $old_assembly = get_assembly($old_dbh,$previous_dbname);
190  if ($old_assembly ne $current_assembly) {
191  print STDERR "New assembly $current_assembly for $current_dbname, no mapping needed\n" ;
192  next;
193  }
194 
195  update_mapping($dbh, $current_dbname, $previous_dbname, $dry_run);
196 
197  my $mapping_set_id = get_max_mapping_set_id($dbh,$current_dbname) + 1;
198  my $sth_mapping_set = $dbh->prepare("INSERT INTO $current_dbname.mapping_set VALUES(?,?,?)");
199  my $sth_update_build = $dbh->prepare("UPDATE $current_dbname.mapping_set SET internal_schema_build = ?");
200  my $previous_schema_build = get_schema_and_build($previous_dbname);
201  my $new_mapping = $sth_mapping_set->execute($mapping_set_id,$schema_build,$previous_schema_build) unless $dry_run;
202  $sth_update_build->execute($schema_build) unless $dry_run;
203  }
204 }
205 
206 
207 
208 sub update_mapping {
209  my ($dbh, $current_dbname, $previous_dbname, $dry_run) = @_;
210  my $error;
211  my $mapping_set_id = get_max_mapping_set_id($dbh,$current_dbname) + 1;
212 
213  my $sth_seq_mapping = $dbh->prepare("INSERT INTO $current_dbname.seq_region_mapping VALUES(?,?,?)");
214  my $sth_update_old = $dbh->prepare("UPDATE $current_dbname.seq_region_mapping SET internal_seq_region_id = ? WHERE internal_seq_region_id = ?");
215  my $sth_remove_deprecated = $dbh->prepare("DELETE FROM $current_dbname.seq_region_mapping WHERE internal_seq_region_id = ?");
216 
217  my $current_assembly = get_assembly($dbh,$current_dbname) ;
218 
219  my $count_removed = 0;
220  my $count_updated = 0;
221  my $count_added = 0;
222 
223 # If there has been no change in seq_region, no mapping needed
224  my $cur_seq_region_checksum = &get_seq_region_checksum($dbh,$current_dbname);
225  my $previous_seq_region_checksum = &get_seq_region_checksum($old_dbh,$previous_dbname);
226  if ($cur_seq_region_checksum == $previous_seq_region_checksum) {
227  print STDERR "No change in seq_region for $current_dbname, no mapping needed\n";
228  return;
229  }
230 
231 # There has been a seq_region change between releases, add the relation old_seq_region_id->new_seq_region_id
232  my $current_seq_region = &read_seq_region($dbh,$current_dbname);
233  my $old_seq_region = &read_seq_region($old_dbh,$previous_dbname);
234 
235 # Build a hash of currently used seq region ids to ensure we do not map to overlapping IDs
236 # i.e. a database has reused seq region IDs between releases
237  my $current_seq_region_ids = get_seq_region_ids($dbh, $current_dbname);
238 
239 # Update the seq_region_mapping table with the old->new seq_region_id relation
240  foreach my $seq_region_name (keys %{$old_seq_region}){
241  my $current_name_hash = $current_seq_region->{$seq_region_name};
242  my $old_name_hash = $old_seq_region->{$seq_region_name};
243 
244 # If the seq_region has disappeared, remove previous entries for that id
245  if (!defined $current_name_hash) {
246  my $id = get_seq_region_id($old_dbh,$previous_dbname, $seq_region_name);
247  $count_removed += $sth_remove_deprecated->execute($id) unless $dry_run;
248  next;
249  }
250 # while ( my ($length, $hash_length) = each %{$old_name_hash})
251  foreach my $length (keys %{$old_name_hash}){
252  my $current_length_hash = $current_name_hash->{$length};
253  my $old_length_hash = $old_name_hash->{$length};
254 
255 # The seq_region might have a different length
256  if (!defined $current_length_hash) {
257  next;
258  }
259  foreach my $cs (keys %{$old_length_hash}) {
260  my $current_cs_hash = $current_length_hash->{$cs};
261  my $old_cs_hash = $old_length_hash->{$cs};
262 
263 # The coord system might have changed
264  if (!defined $current_cs_hash) {
265  next;
266  }
267  foreach my $id (keys %{$old_cs_hash}) {
268  my $current_id = $current_cs_hash->{$id};
269  my $old_id = $old_cs_hash->{$id};
270 
271 # If no change, no need to write out
272  if (!defined $current_id || $old_id == $current_id) {
273  next;
274  }
275 
276  if(exists $current_seq_region_ids->{$old_id}) {
277  printf STDERR "Skipping the mapping for old id %d to current id %d as the old ID is in use in the DB. This means IDs have been reused. Do not reused seq_region_id primary keys\n", $old_id, $current_id;
278  $error = 1;
279  next;
280  }
281 
282 # If there is a change, update any existing entries for this seq_region to the new id
283 # Then, add a new entry to map said id to the old release
284  $count_updated += $sth_update_old->execute($current_id,$old_id) unless $dry_run;
285  $count_added += $sth_seq_mapping->execute($old_id,$current_id, $mapping_set_id) unless $dry_run;
286  if($dry_run) {
287  $count_updated++;
288  $count_added++;
289  }
290  }
291  }
292  }
293  }
294  print STDERR "For $current_dbname, removed $count_removed, added $count_added, updated $count_updated seq_region_mapping entries\n\n" ;
295 
296 
297  if($error) {
298  die "Error detected when loading the mapping sets. Check STDERR for more information";
299  }
300 }
301 
302 
303 # For a given database, will return the seq_region_name->seq_region_id relation
304 sub read_seq_region {
305  my ($dbh, $dbname) = @_;
306  my (%seq_region_hash, $seq_region_id, $seq_region_name, $coord_system_id, $length, $cs_name, $cs_rank);
307  my $sth = $dbh->prepare("SELECT seq_region_id, s.name, length, cs.name, cs.rank FROM $dbname.seq_region s, $dbname.coord_system cs WHERE cs.coord_system_id = s.coord_system_id");
308  $sth->execute();
309  $sth->bind_col(1,\$seq_region_id);
310  $sth->bind_col(2,\$seq_region_name);
311  $sth->bind_col(3,\$length);
312  $sth->bind_col(4,\$cs_name);
313  $sth->bind_col(5,\$cs_rank);
314  while ($sth->fetch){
315  $seq_region_hash{$seq_region_name}{$length}{$cs_name}{$cs_rank} = $seq_region_id;
316  }
317  return \%seq_region_hash;
318 }
319 
320 sub get_seq_region_ids {
321  my ($dbh, $dbname) = @_;
322  my $sql = qq{select seq_region_id from ${dbname}.seq_region};
323  my %hash = map { $_, 1 } @{$dbh->selectcol_arrayref($sql)};
324  return \%hash;
325 }
326 
327 # For a given database, returns the size of the seq_region_table
329  my ($dbh, $dbname) = @_;
330  my $sth_status = $dbh->prepare("checksum table $dbname.seq_region") ;
331  $sth_status->execute();
332  my $table_status = $sth_status->fetchrow_array();
333  return $table_status; #return the size of the table
334 }
335 
336 sub get_seq_region_id {
337  my ($dbh, $dbname, $seq_region_name) = @_;
338  my $sth_region = $dbh->prepare("SELECT seq_region_id FROM $dbname.seq_region WHERE name = ?");
339  $sth_region->execute($seq_region_name);
340  my $seq_region_id = $sth_region->fetchrow_array();
341  return $seq_region_id;
342 }
343 
344 # Will return the max mapping_set_id being used in the mapping_set table
346  my ($dbh, $dbname) = @_;
347  my $sth_mapping = $dbh->prepare("select max(mapping_set_id) from $dbname.mapping_set");
348  $sth_mapping->execute();
349  my ($max_mapping_set_id) = $sth_mapping->fetchrow_array();
350  if (!defined $max_mapping_set_id) { return 0; }
351  return $max_mapping_set_id;
352 }
353 
355  my ($dbh, $dbname) = @_;
356  my $sth_mapping = $dbh->prepare("select internal_schema_build from $dbname.mapping_set order by mapping_set_id desc limit 1");
357  $sth_mapping->execute();
358  my ($internal_schema_build) = $sth_mapping->fetchrow_array();
359  if (!defined $internal_schema_build) { return 0; }
360  return $internal_schema_build;
361 }
362 
363 # This method will return the name of the previous database to release for same species (assuming is present)
365  my ($dbh, $dbname, $release) = @_;
366  my $previous_dbname;
367  $dbname =~ /(^([a-z]+_){2,3}[a-z]+_)/;
368  if (!$1) { throw("Database name $dbname is not in the right format"); }
369  my $previous_release_name = $1 . (--$release);
370  my $previous_sth = $dbh->prepare("show databases like \'%$previous_release_name%\'");
371  $previous_sth->execute();
372  ($previous_dbname) = $previous_sth->fetchrow_array() ;
373  return $previous_dbname;
374 }
375 
376 # For a standard ensembl database name, returns the release number and assembly
378  my ($dbname) = @_;
379  my @dbname = split/_/, $dbname;
380  return join "_", $dbname[($#dbname -1)], $dbname[($#dbname)];
381 }
382 
383 # Returns the assembly name for a given database
384 sub get_assembly {
385  my ($dbh, $dbname) = @_;
386  my $sth = $dbh->prepare("select distinct version from $dbname.coord_system where attrib = 'default_version'");
387  $sth->execute();
388  return $sth->fetchrow();
389 }
get_previous_dbname
public get_previous_dbname()
map
public map()
update_mapping
public update_mapping()
get_latest_schema_build
public get_latest_schema_build()
get_schema_and_build
public get_schema_and_build()
get_seq_region_checksum
public get_seq_region_checksum()
get_assembly
public get_assembly()
get_seq_region_id
public get_seq_region_id()
run
public run()
get_max_mapping_set_id
public get_max_mapping_set_id()
Bio::EnsEMBL::Utils::Exception
Definition: Exception.pm:68
get_seq_region_ids
public get_seq_region_ids()
read_seq_region
public read_seq_region()