ensembl-hive  2.7.0
dump_cache.pl
Go to the documentation of this file.
1 #!/usr/bin/env perl
2 # See the NOTICE file distributed with this work for additional information
3 # regarding copyright ownership.
4 #
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
8 #
9 # http://www.apache.org/licenses/LICENSE-2.0
10 #
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
16 
17 # Don't change the above line.
18 # Change the PATH in the myRun.ksh script if you want to use another perl.
19 
20 =head1 NAME
21 
22 
23 =head1 SYNOPSIS
24 
25 dump_cache.pl [arguments]
26 
27 Required arguments:
28 
29  --dbname, db_name=NAME database name NAME
30  --host, --dbhost, --db_host=HOST database host HOST
31  --port, --dbport, --db_port=PORT database port PORT
32  --user, --dbuser, --db_user=USER database username USER
33  --pass, --dbpass, --db_pass=PASS database password PASS
34 
35 Optional arguments:
36 
37  --conffile, --conf=FILE read parameters from FILE
38  (default: conf/Conversion.ini)
39 
40  --logfile, --log=FILE log to FILE (default: *STDOUT)
41  --logpath=PATH write logfile to PATH (default: .)
42  --logappend, --log_append append to logfile (default: truncate)
43  --loglevel=LEVEL define log level (default: INFO)
44 
45  -i, --interactive=0|1 run script interactively (default: true)
46  -n, --dry_run, --dry=0|1 don't write results to database
47  -h, --help, -? print help (this message)
48 
49 =head1 DESCRIPTION
50 
51 Use --sourceschema and --targetschema to specify a schema version (default:
52 latest). This will be used to determine the subroutine to build the cache. By
53 default, &build_cache_auto() is run which uses Bio::EnsEMBL::IdMapping::Cache
54 to read from the database and write the cache. An alternative subroutine can
55 use a different module for that, which will usually inherit from the former and
56 overwrite Cache->build_cache(). This is useful for backwards compatibility with
57 older schema versions. Once the cache is built, no API access is needed,
58 therefore the ID mapping application is independent of the underlying database
59 schema.
60 
61 
62 
63 =head1 AUTHOR
64 
65 Patrick Meidl <meidl@ebi.ac.uk>, Ensembl core API team
66 
67 =head1 CONTACT
68 
69 Please post comments/questions to the Ensembl development list
70 <http://lists.ensembl.org/mailman/listinfo/dev>
71 
72 =cut
73 
74 use strict;
75 use warnings;
76 no warnings 'uninitialized';
77 
78 use FindBin qw($Bin);
79 use Bio::EnsEMBL::Utils::ConfParser;
80 use Bio::EnsEMBL::Utils::Logger;
81 use Bio::EnsEMBL::Utils::ScriptUtils qw(inject path_append);
82 use Bio::EnsEMBL::Utils::Exception qw(throw warning);
83 
# Parse configuration and commandline arguments.
my $conf = Bio::EnsEMBL::Utils::ConfParser->new(
  -SERVERROOT   => "$Bin/../../..",
  -DEFAULT_CONF => "$Bin/default.conf"
);

# Option specs understood by this script. The 1/0 value is presumably the
# "required" flag interpreted by ConfParser->parse_options -- confirm against
# Bio::EnsEMBL::Utils::ConfParser.
$conf->parse_options(
  'sourcehost|source_host=s' => 1,
  'sourceport|source_port=n' => 1,
  'sourceuser|source_user=s' => 1,
  'sourcepass|source_pass=s' => 0,
  'sourcedbname|source_dbname=s' => 1,
  'targethost|target_host=s' => 1,
  'targetport|target_port=n' => 1,
  'targetuser|target_user=s' => 1,
  'targetpass|target_pass=s' => 0,
  'targetdbname|target_dbname=s' => 1,
  'basedir|basedir=s' => 1,
  'chromosomes|chr=s@' => 0,
  'region=s' => 0,
  'biotypes=s@' => 0,
  'biotypes_include=s@' => 0,
  'biotypes_exclude=s@' => 0,
  'lsf_opt_dump_cache|lsfoptdumpcache=s' => 0,
  'cache_method=s' => 0,
  'build_cache_auto_threshold=n' => 0,
  'build_cache_concurrent_jobs=n' => 0,
);

# Set default logpath (<basedir>/log) when none was configured.
unless ($conf->param('logpath')) {
  $conf->param('logpath', path_append($conf->param('basedir'), 'log'));
}

# Get log filehandle and print heading and parameters to logfile.
my $logger = Bio::EnsEMBL::Utils::Logger->new(
  -LOGFILE      => $conf->param('logfile'),
  -LOGAUTO      => $conf->param('logauto'),
  -LOGAUTOBASE  => 'dump_cache',
  -LOGAUTOID    => $conf->param('logautoid'),
  -LOGPATH      => $conf->param('logpath'),
  -LOGAPPEND    => $conf->param('logappend'),
  -LOGLEVEL     => $conf->param('loglevel'),
  -IS_COMPONENT => $conf->param('is_component'),
);

# Initialise log.
$logger->init_log($conf->list_param_values);

# Determine cache method to use.
# This can be used to support different caching strategies or access to old
# database schemas.
my $cache_method = $conf->param('cache_method') || 'build_cache_auto';

# Resolve the configured name to one of this script's build_cache_* subs.
# The name comes from user configuration, so it is validated and looked up
# explicitly instead of being called through a symbolic reference (which would
# need "no strict 'refs'" and would run any sub name the user supplied).
my $cache_sub;
if ($cache_method =~ /\Abuild_cache_\w+\z/) {
  $cache_sub = __PACKAGE__->can($cache_method);
}
throw("Unknown cache_method '$cache_method'.") unless ($cache_sub);

# The return value of the cache method becomes the exit code of the script.
my $retval = $cache_sub->();

# Finish logfile.
$logger->finish_log;

exit($retval);
144 
145 
146 ### END main ###
147 
148 
# Choose a cache-building strategy from the number of toplevel seq_regions.
#
# Asks the cache object for the slice names of the source and the target
# database and compares the larger of the two counts with the
# 'build_cache_auto_threshold' parameter (100 when unset or 0). Above the
# threshold build_cache_all() is run, otherwise build_cache_by_seq_region().
#
# Uses the file-level $conf and $logger objects; takes no arguments.
# Returns the return value of the selected subroutine.
sub build_cache_auto {
  # load the cache implementation
  my $cache_impl = 'Bio::EnsEMBL::IdMapping::Cache';
  inject($cache_impl);

  my $cache = $cache_impl->new(
    -LOGGER => $logger,
    -CONF   => $conf,
  );

  $logger->debug("\nChecking number of toplevel seq_regions...\n");
  my $max = 0;

  foreach my $dbtype (qw(source target)) {
    my $num = scalar(@{ $cache->slice_names($dbtype) });
    $max = $num if ($num > $max);
    $logger->debug("$dbtype: $num.\n", 1);
  }

  my $threshold = $conf->param('build_cache_auto_threshold') || 100;
  my $retval;

  # Explicit calls with an empty argument list: the old "&name;" form
  # silently passed this sub's own @_ through to the callee.
  if ($max > $threshold) {
    $logger->debug("\nWill use build_cache_all.\n");
    $retval = build_cache_all();
  } else {
    $logger->debug("\nWill use build_cache_by_seq_region.\n");
    $retval = build_cache_by_seq_region();
  }

  return $retval;
}
181 
182 
# Build the cache one seq_region at a time via an LSF job array.
#
# For each of the source and target databases:
#   1. writes the names of all slices without an existing cache file to
#      <logpath>/dump_by_seq_region/<dbtype>.dump_cache.slices.txt;
#   2. submits one array job per listed slice by piping the
#      dump_by_seq_region.pl command line to bsub;
#   3. blocks on a dependent "bsub -K" job until the array has ended;
#   4. checks that every array index left a
#      dump_by_seq_region.<dbtype>.<index>.success file in the log directory
#      (presumably written by dump_by_seq_region.pl -- confirm there).
#
# Uses the file-level $conf and $logger objects; takes no arguments.
# Returns 0 on success and 1 as soon as a database has a missing .success file.
#
# NOTE(review): the job command is "./dump_by_seq_region.pl", i.e. relative to
# the working directory -- this presumably requires running from the script's
# own directory; confirm.
sub build_cache_by_seq_region {

  # create empty directory for logs
  # (list-form system() keeps the path away from the shell; $? carries the
  # command's exit status, $! is only meaningful if it could not be started)
  my $logpath = path_append($conf->param('logpath'), 'dump_by_seq_region');
  system('rm', '-rf', $logpath) == 0 or
    $logger->error("Unable to delete lsf log dir $logpath (exit status $?): $!\n");
  system('mkdir', '-p', $logpath) == 0 or
    $logger->error("Can't create lsf log dir $logpath (exit status $?): $!\n");

  # load the cache implementation
  my $cache_impl = 'Bio::EnsEMBL::IdMapping::Cache';
  inject($cache_impl);

  my $cache = $cache_impl->new(
    -LOGGER => $logger,
    -CONF   => $conf,
  );

  # submit jobs to lsf
  foreach my $dbtype (qw(source target)) {

    $logger->info("\n".ucfirst($dbtype)." db...\n", 0, 'stamped');

    # determine which slices need to be done
    my $filename = "$dbtype.dump_cache.slices.txt";
    open(my $fh, '>', "$logpath/$filename") or
      throw("Unable to open $logpath/$filename for writing: $!");

    my $num_jobs = 0;

    foreach my $slice_name (@{ $cache->slice_names($dbtype) }) {
      my $type = "$dbtype.$slice_name";
      unless ($cache->cache_file_exists($type)) {
        print $fh "$slice_name\n";
        $num_jobs++;
      }
    }

    # buffered write errors only surface at close
    close($fh) or
      throw("Unable to close $logpath/$filename after writing: $!");

    unless ($num_jobs) {
      $logger->info("All cache files for $dbtype exist.\n");
      next;
    }

    # build lsf command
    my $lsf_name = 'dump_by_seq_region_'.time;
    my $concurrent = $conf->param('build_cache_concurrent_jobs') || 200;

    my $options = $conf->create_commandline_options(
      logauto      => 1,
      logautobase  => "dump_by_seq_region",
      interactive  => 0,
      is_component => 1,
      dbtype       => $dbtype,
      cache_impl   => $cache_impl,
    );

    my $cmd = qq{./dump_by_seq_region.pl $options --index \$LSB_JOBINDEX};

    # Job array with indices 1..$num_jobs, at most $concurrent running at
    # once. This stays a single shell string because lsf_opt_dump_cache may
    # hold several whitespace-separated bsub options.
    my $bsub =
        qq{bsub -J '$lsf_name\[1-$num_jobs\]\%$concurrent' }
      . qq{-o $logpath/dump_by_seq_region.$dbtype.\%I.out }
      . qq{-e $logpath/dump_by_seq_region.$dbtype.\%I.err }
      . $conf->param('lsf_opt_dump_cache');

    # run lsf job array
    $logger->info("\nSubmitting $num_jobs jobs to lsf.\n");
    $logger->debug("$cmd\n\n");
    $logger->debug("|$bsub\n\n");

    # bsub reads the command to run from its standard input
    if (open(my $bsub_fh, '|-', $bsub)) {
      print {$bsub_fh} $cmd;

      # The exit status of a piped command is only available after close(),
      # so the handle must be closed before the submission can be checked.
      close($bsub_fh) or
        $logger->error("Error submitting jobs (exit status $?): $!\n");
    } else {
      $logger->error("Could not open pipe to bsub: $!\n");
    }

    # submit dependent job to monitor finishing of jobs
    $logger->info("Waiting for jobs to finish...\n", 0, 'stamped');

    my @dependent_job = (
      'bsub', '-K',
      '-w', "ended($lsf_name)",
      '-q', 'production',
      '-M', '100',
      '-R', 'select[mem>100]',
      '-R', 'rusage[mem=100]',
      '-o', "$logpath/dump_cache.$dbtype.depend.out",
      '/bin/true',
    );

    system(@dependent_job) == 0 or
      $logger->error("Error submitting dependent job (exit status $?): $!\n");

    $logger->info("All jobs finished.\n", 0, 'stamped');

    # check for lsf errors
    # (short pause before looking for the marker files -- presumably to allow
    # for filesystem latency; confirm)
    sleep(5);
    my $err = grep {
      !-e "$logpath/dump_by_seq_region.$dbtype.$_.success"
    } (1..$num_jobs);

    if ($err) {
      $logger->error("At least one of your jobs failed.\nPlease check the logfiles at $logpath for errors.\n");
      return 1;
    }

  }

  return 0;
}
294 
295 
# Build the cache for the whole genome of each database in a single call.
#
# Instantiates Bio::EnsEMBL::IdMapping::Cache and calls its build_cache_all()
# method directly, first for the source and then for the target database,
# logging the two values it returns (reported as gene count and file size).
#
# Uses the file-level $conf and $logger objects; takes no arguments.
# Always returns 0.
sub build_cache_all {

  # load the cache implementation and create the cache object
  my $impl_class = 'Bio::EnsEMBL::IdMapping::Cache';
  inject($impl_class);
  my $cache = $impl_class->new(-LOGGER => $logger, -CONF => $conf);

  for my $db (qw(source target)) {

    $logger->info("\n".ucfirst($db)." db...\n", 0, 'stamped');
    $logger->info("Building cache for whole genome...\n");

    # list context: the method hands back (count, size)
    my ($gene_count, $file_size) = $cache->build_cache_all($db);

    $logger->info(
      "Done with $db (genes: $gene_count, filesize: $file_size).\n",
      0,
      'stamped'
    );
  }

  return 0;
}
323 
324 
build_cache_auto
public build_cache_auto()
Bio::EnsEMBL::IdMapping::Cache
Definition: Cache.pm:18
run
public run()