ensembl-hive  2.5
DatabaseDumper.pm
Go to the documentation of this file.
1 =pod
2 
3 =head1 NAME
4 
6 
7 =head1 SYNOPSIS
8 
9  standaloneJob.pl RunnableDB/DatabaseDumper.pm -exclude_ehive 1 -exclude_list 1 \
10  -table_list "['peptide_align_%']" -src_db_conn mysql://ensro@127.0.0.1:4313/mm14_compara_homology_67 -output_file ~/dump1.sql
11 
12 =head1 DESCRIPTION
13 
14 This is a Runnable to dump the tables of a database (by default,
15 all of them).
16 
17 The following parameters are accepted:
18 
19  - src_db_conn : the connection parameters to the database to be
20  dumped (by default, the current eHive database if available)
21 
22  - exclude_ehive [boolean=0] : do we exclude the eHive-specific tables
23  from the dump
24 
25  - table_list [string or array of strings]: the list of tables
26  to include in the dump. The '%' wildcard is accepted.
27 
28  - exclude_list [boolean=0] : do we consider 'table_list' as a list
29  of tables to be excluded from the dump (instead of included)
30 
31  - output_file [string] : the file to write the dump to. If the filename
32  ends with ".gz", the file is compressed with "gzip" (default parameters)
33 
34  - output_db [string] : URL of a database to write the dump to. In this
35  mode, the Runnable acts like MySQLTransfer
36 
37  - skip_dump [boolean=0] : set this to 1 to skip the dump
38 
39 
40 The decision process regarding which tables should be dumped is quite complex.
41 The following sections explain the various scenarios.
42 
43 1. eHive database
44 
45 1.a. Hybrid database
46 
47 If "table_list" is undefined or maps to an empty list, the list
48 of tables to be dumped is decided accordingly to "exclude_list" (EL)
49 and "exclude_ehive" (EH). "exclude_list" controls the whole list of
50 non-eHive tables.
51 
52 EL EH List of tables to dump
53 0 0 => all the tables
54 0 1 => all the tables, except the eHive ones
55 1 0 => all the tables, except the non-eHive ones = only the eHive tables
56 1 1 => both eHive and non-eHive tables are excluded = nothing is dumped
57 
58 If "table_list" is defined to non-empty list T, the table of decision is:
59 
60 EL EH List of tables to dump
61 0 0 => all the tables in T + the eHive tables
62 0 1 => all the tables in T
63 1 0 => all the tables, except the ones in T
64 1 1 => all the tables, except the ones in T and the eHive ones
65 
66 1.b. eHive-only database
67 
68 The decision table can be simplified if the database only contains eHive tables.
69 In particular, the "exclude_list" and "table_list" parameters have no effect.
70 
71 EH List of tables to dump
72 0 => All the eHive tables, i.e. the whole database
73 1 => No eHive tables, i.e. nothing
74 
75 2. non-eHive database
76 
77 The "exclude_ehive" parameter is ignored.
78 
79 empty "table_list":
80 EL List of tables to dump
81 0 => all the tables
82 1 => all the tables are excluded = nothing is dumped
83 
84 non-empty "table_list" T:
85 EL List of tables to dump
86 0 => all the tables in T
87 1 => all the tables, except the ones in T
88 
89 
90 =head1 LICENSE
91 
92  Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
93  Copyright [2016-2022] EMBL-European Bioinformatics Institute
94 
95  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
96  You may obtain a copy of the License at
97 
98  http://www.apache.org/licenses/LICENSE-2.0
99 
100  Unless required by applicable law or agreed to in writing, software distributed under the License
101  is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
102  See the License for the specific language governing permissions and limitations under the License.
103 
104 =head1 CONTACT
105 
106  Please subscribe to the Hive mailing list: http://listserver.ebi.ac.uk/mailman/listinfo/ehive-users to discuss Hive-related questions or to be notified of our updates
107 
108 =cut
109 
110 
111 package Bio::EnsEMBL::Hive::RunnableDB::DatabaseDumper;
112 
113 use strict;
114 use warnings;
115 
116 use Bio::EnsEMBL::Hive::Utils ('go_figure_dbc');
117 
118 use base ('Bio::EnsEMBL::Hive::RunnableDB::SystemCmd');
119 
120 sub param_defaults {
121  my $self = shift;
122  return {
123  %{$self->SUPER::param_defaults(@_)},
124 
125  # Which tables to dump. How the options are combined is explained above
126  'table_list' => undef, # array-ref
127  'exclude_ehive' => 0, # boolean
128  'exclude_list' => 0, # boolean
129 
130  # Input / output
131  'src_db_conn' => undef, # URL, hash-ref, or Registry name
132  'output_file' => undef, # String
133  'output_db' => undef, # URL, hash-ref, or Registry name
134 
135  # Other options
136  'skip_dump' => 0, # boolean
137  'dump_options' => undef, # Extra options to pass to the dump program
138 
139  # SystemCmd's options to make sure the whole command succeeded
140  'use_bash_pipefail' => 1,
141  'use_bash_errexit' => 1,
142  }
143 }
144 
145 sub fetch_input {
146  my $self = shift @_;
147 
148  # The final list of tables
149  my @tables = ();
150  my @ignores = ();
151 
152  # Connection parameters
153  my $src_db_conn = $self->param('src_db_conn');
154  my $src_dbc = $src_db_conn ? go_figure_dbc($src_db_conn) : $self->data_dbc;
155  $self->param('src_dbc', $src_dbc);
156 
157  $self->input_job->transient_error(0);
158  die 'Only the "mysql" driver is supported.' if $src_dbc->driver ne 'mysql';
159 
160  my @ehive_tables = ();
161  {
162  ## Only query the list of eHive tables if there is a "hive_meta" table
163  my $meta_sth = $src_dbc->table_info(undef, undef, 'hive_meta');
164  if ($meta_sth->fetchrow_arrayref) {
165  my $src_dba = Bio::EnsEMBL::Hive::DBSQL::DBAdaptor->new( -dbconn => $src_dbc, -disconnect_when_inactive => 1, -no_sql_schema_version_check => 1 );
166  @ehive_tables = (@{$src_dba->hive_pipeline->list_all_hive_tables}, @{$src_dba->hive_pipeline->list_all_hive_views});
167  unless (@ehive_tables) {
168  my @ref_ehive_tables = qw(hive_meta pipeline_wide_parameters worker dataflow_rule analysis_base analysis_ctrl_rule job accu log_message job_file analysis_data resource_description analysis_stats analysis_stats_monitor role msg progress resource_class worker_resource_usage);
169  # The hard-coded list is comprehensive, so some tables may not be
170  # in this database (which may be on a different version)
171  push @ehive_tables, @{$self->_get_table_list($src_dbc, $_)} for @ref_ehive_tables;
172  }
173  }
174  $meta_sth->finish();
175  }
176  $self->param('nb_ehive_tables', scalar(@ehive_tables));
177 
178  # Get the table list in either "tables" or "ignores"
179  my $table_list = $self->_get_table_list($src_dbc, $self->param('table_list') || '');
180  $self->say_with_header(sprintf("table_list: %d %s", scalar(@$table_list), join('/', @$table_list)));
181  my $nothing_to_dump = 0;
182 
183  if ($self->param('exclude_list')) {
184  push @ignores, @$table_list;
185  $nothing_to_dump = 1 if !$self->param('table_list');
186  } else {
187  push @tables, @$table_list;
188  $nothing_to_dump = 1 if $self->param('table_list') and !@$table_list;
189  }
190 
191  # eHive tables are ignored if exclude_ehive is set
192  if ($self->param('exclude_ehive')) {
193  push @ignores, @ehive_tables;
194  } elsif (@ehive_tables) {
195  if (@tables || $nothing_to_dump) {
196  push @tables, @ehive_tables;
197  $nothing_to_dump = 0;
198  }
199  }
200 
201  # Output file / output database
202  $self->param('output_file') || $self->param('output_db') || die 'One of the parameters "output_file" and "output_db" is mandatory';
203  unless ($self->param('output_file')) {
204  $self->param('real_output_db', go_figure_dbc( $self->param('output_db') ) );
205  die 'Only the "mysql" driver is supported.' if $self->param('real_output_db')->driver ne 'mysql';
206  }
207 
208  $self->input_job->transient_error(1);
209 
210  $self->say_with_header(sprintf("tables: %d %s", scalar(@tables), join('/', @tables)));
211  $self->say_with_header(sprintf("ignores: %d %s", scalar(@ignores), join('/', @ignores)));
212 
213  my @options = qw(--skip-lock-tables);
214  # Without any table names, mysqldump thinks that it should dump
215  # everything. We need to add special arguments to handle this
216  if ($nothing_to_dump) {
217  $self->say_with_header("everything is excluded, nothing to dump !");
218  push @options, qw(--no-create-info --no-data);
219  @ignores = (); # to clean-up the command-line
220  }
221 
222  # mysqldump command
223  my $output = "";
224  if ($self->param('output_file')) {
225  if (lc $self->param('output_file') =~ /\.gz$/) {
226  $output = sprintf(' | gzip > %s', $self->param('output_file'));
227  } else {
228  $output = sprintf('> %s', $self->param('output_file'));
229  }
230  } else {
231  $output = join(' ', '|', @{ $self->param('real_output_db')->to_cmd(undef, undef, undef, undef, 1) } );
232  };
233 
234  # Extra parameter to add to the command-line
235  my $dump_options = $self->param('dump_options') // [];
236 
237  # Must be joined because of the redirection / the pipe
238  my $cmd = join(' ',
239  @{ $src_dbc->to_cmd('mysqldump', undef, undef, undef, 1) },
240  @options,
241  @tables,
242  ref($dump_options) ? @$dump_options : ($dump_options,),
243  (map {sprintf('--ignore-table=%s.%s', $src_dbc->dbname, $_)} @ignores),
244  $output
245  );
246 
247  # Check whether the current database has been restored from a snapshot.
248  # If it is the case, we shouldn't re-dump and overwrite the file.
249  # We also check here the value of the "skip_dump" parameter
250  my $completion_signature = sprintf('dump_%d_restored', defined $self->input_job->dbID ? $self->input_job->dbID : 0);
251 
252  if ($self->param('skip_dump') or $self->param($completion_signature)) {
253  # A command that always succeeds
254  $self->param('cmd', 'true');
255  if ($self->param('skip_dump')) {
256  $self->warning('Skipping the dump because "skip_dump" is defined');
257  } else {
258  $self->warning("Skipping the dump because this database has been restored from the target dump. We don't want to overwrite it");
259  }
260  } elsif ($self->param('nb_ehive_tables')) {
261  # OK, we can dump and this is an eHive database.
262  # We add the signature to the dump, so that the
263  # job won't rerun on a restored database
264  # We're very lucky that gzipped streams can be concatenated and the
265  # output is still valid !
266  my $extra_sql = qq{echo "INSERT INTO pipeline_wide_parameters VALUES ('$completion_signature', 1);\n" $output};
267  $extra_sql =~ s/>/>>/;
268  $self->param('cmd', "$cmd; $extra_sql");
269  } else {
270  # Direct dump on a non-eHive database
271  $self->param('cmd', $cmd);
272  }
273 }
274 
275 
276 # Splits a string into a list of strings
277 # Ask the database for the list of tables that match the wildcard "%"
278 # and also select the tables that actually exist
279 sub _get_table_list {
280  my ($self, $dbc, $table_list) = @_;
281 
282  my @newtables = ();
283  foreach my $initable (ref($table_list) eq 'ARRAY' ? @$table_list : split(' ', $table_list)) {
284  if ($initable =~ /%/) {
285  $initable =~ s/_/\\_/g;
286  }
287  my $sth = $dbc->table_info(undef, undef, $initable, undef);
288  push @newtables, map( {$_->[2]} @{$sth->fetchall_arrayref});
289  }
290  return \@newtables;
291 }
292 
293 
294 1;