2 # See the NOTICE file distributed with this work for additional information
3 # regarding copyright ownership.
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
9 # http://www.apache.org/licenses/LICENSE-2.0
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
17 # Don't change the above line.
18 # Change the PATH in the myRun.ksh script if you want to use another perl.
29 --dbname, --db_name=NAME database name NAME
30 --host, --dbhost, --db_host=HOST database host HOST
31 --port, --dbport, --db_port=PORT database port PORT
32 --user, --dbuser, --db_user=USER database username USER
33 --pass, --dbpass, --db_pass=PASS database password PASS
37 --conffile, --conf=FILE read parameters from FILE
38 (default: conf/Conversion.ini)
40 --logfile, --log=FILE log to FILE (default: *STDOUT)
41 --logpath=PATH write logfile to PATH (default: .)
42 --logappend, --log_append append to logfile (default: truncate)
43 --loglevel=LEVEL define log level (default: INFO)
45 -i, --interactive=0|1 run script interactively (default: true)
46 -n, --dry_run, --dry=0|1 don't write results to database
47 -h, --help, -? print help (this message)
51 Use --sourceschema and --targetschema to specify a schema version (default:
52 latest). This will be used to determine the subroutine to build the cache. By
53 default, &build_cache_latest() is run which uses Bio::EnsEMBL::IdMapping::Cache
54 to read from the database and write the cache. An alternative subroutine can
55 use a different module for that, which will usually inherit from the former and
56 overwrite Cache->build_cache(). This is useful for backwards compatibility with
57 older schema versions. Once the cache is built, no API access is needed,
58 therefore the ID mapping application is independent of the underlying database
65 Patrick Meidl <meidl@ebi.ac.uk>, Ensembl core API team
69 Please post comments/questions to the Ensembl development list
70 <http://lists.ensembl.org/mailman/listinfo/dev>
76 no warnings 'uninitialized
';
79 use Bio::EnsEMBL::Utils::ConfParser;
80 use Bio::EnsEMBL::Utils::Logger;
81 use Bio::EnsEMBL::Utils::ScriptUtils qw(inject path_append);
82 use Bio::EnsEMBL::Utils::Exception qw(throw warning);
84 # parse configuration and commandline arguments
85 my $conf = new Bio::EnsEMBL::Utils::ConfParser(
86 -SERVERROOT => "$Bin/../../..",
87 -DEFAULT_CONF => "$Bin/default.conf"
91 'sourcehost|source_host=s
' => 1,
92 'sourceport|source_port=n
' => 1,
93 'sourceuser|source_user=s
' => 1,
94 'sourcepass|source_pass=s
' => 0,
95 'sourcedbname|source_dbname=s
' => 1,
96 'targethost|target_host=s
' => 1,
97 'targetport|target_port=n
' => 1,
98 'targetuser|target_user=s
' => 1,
99 'targetpass|target_pass=s
' => 0,
100 'targetdbname|target_dbname=s
' => 1,
101 'basedir|basedir=s
' => 1,
102 'chromosomes|chr=s@
' => 0,
105 'biotypes_include=s@
' => 0,
106 'biotypes_exclude=s@
' => 0,
107 'lsf_opt_dump_cache|lsfoptdumpcache=s
' => 0,
108 'cache_method=s
' => 0,
109 'build_cache_auto_threshold=n
' => 0,
110 'build_cache_concurrent_jobs=n
' => 0,
113 # set default logpath
114 unless ($conf->param('logpath
')) {
115 $conf->param('logpath
', path_append($conf->param('basedir
'), 'log
'));
118 # get log filehandle and print heading and parameters to logfile
119 my $logger = new Bio::EnsEMBL::Utils::Logger(
120 -LOGFILE => $conf->param('logfile
'),
121 -LOGAUTO => $conf->param('logauto
'),
122 -LOGAUTOBASE => 'dump_cache
',
123 -LOGAUTOID => $conf->param('logautoid
'),
124 -LOGPATH => $conf->param('logpath
'),
125 -LOGAPPEND => $conf->param('logappend
'),
126 -LOGLEVEL => $conf->param('loglevel
'),
127 -IS_COMPONENT => $conf->param('is_component
'),
131 $logger->init_log($conf->list_param_values);
133 # determine cache method to use.
134 # this can be used to support different caching strategies or access to old
138 my $retval = &$cache_method;
149 sub build_cache_auto {
150 # load the cache implementation
154 my $cache = $cache_impl->new(
159 $logger->debug("\nChecking number of toplevel seq_regions...\n");
162 foreach my $dbtype (qw(source target)) {
163 my $num = scalar(@{ $cache->slice_names($dbtype) });
164 $max = $num if ($num > $max);
165 $logger->debug("$dbtype: $num.\n", 1);
168 my $threshold = $conf->param('build_cache_auto_threshold
') || 100;
171 if ($max > $threshold) {
172 $logger->debug("\nWill use build_cache_all.\n");
173 $retval = &build_cache_all;
175 $logger->debug("\nWill use build_cache_by_seq_region.\n");
176 $retval = &build_cache_by_seq_region;
183 sub build_cache_by_seq_region {
187 # create empty directory for logs
188 my $logpath = path_append($conf->param('logpath
'), 'dump_by_seq_region
');
189 system("rm -rf $logpath") == 0 or
190 $logger->error("Unable to delete lsf log dir $logpath: $!\n");
191 system("mkdir -p $logpath") == 0 or
192 $logger->error("Can't create lsf log dir $logpath: $!\n
");
194 # load the cache implementation
195 my $cache_impl = 'Bio::EnsEMBL::IdMapping::Cache';
198 my $cache = $cache_impl->new(
204 foreach my $dbtype (qw(source target)) {
206 $logger->info("\n
".ucfirst($dbtype)." db...\n
", 0, 'stamped');
208 # determine which slices need to be done
209 my $filename = "$dbtype.dump_cache.slices.txt
";
210 open(my $fh, '>', "$logpath/$filename
") or
211 throw("Unable to open $logpath/$filename
for writing: $!
");
215 foreach my $slice_name (@{ $cache->slice_names($dbtype) }) {
216 my $type = "$dbtype.$slice_name
";
217 unless ($cache->cache_file_exists($type)) {
218 print $fh "$slice_name\n
";
226 $logger->info("All cache files
for $dbtype exist.\n
");
231 my $lsf_name = 'dump_by_seq_region_'.time;
232 my $concurrent = $conf->param('build_cache_concurrent_jobs') || 200;
234 my $options = $conf->create_commandline_options(
236 logautobase => "dump_by_seq_region
",
240 cache_impl => $cache_impl,
243 my $cmd = qq{./dump_by_seq_region.pl $options --index \$LSB_JOBINDEX};
246 qq{|bsub -J '$lsf_name\[1-$num_jobs\]\%$concurrent' }
247 . qq{-o $logpath/dump_by_seq_region.$dbtype.\%I.out }
248 . qq{-e $logpath/dump_by_seq_region.$dbtype.\%I.err }
249 . $conf->param('lsf_opt_dump_cache');
252 $logger->info("\nSubmitting $num_jobs jobs to lsf.\n
");
253 $logger->debug("$cmd\n\n
");
254 $logger->debug("$pipe\n\n
");
258 $logger->error("Could not open open pipe to bsub: $!\n
");
261 $logger->error("Error submitting jobs: $!\n
")
265 # submit dependent job to monitor finishing of jobs
266 $logger->info("Waiting
for jobs to finish...\n
", 0, 'stamped');
269 qq{bsub -K -w "ended($lsf_name)
" -q production } .
270 qq{-M 100 -R 'select[mem>100]' -R 'rusage[mem=100]' } .
271 qq{-o $logpath/dump_cache.$dbtype.depend.out /bin/true};
273 system($dependent_job) == 0 or
274 $logger->error("Error submitting dependent job: $!\n
");
276 $logger->info("All jobs finished.\n
", 0, 'stamped');
278 # check for lsf errors
281 foreach my $i (1..$num_jobs) {
282 $err++ unless (-e "$logpath/dump_by_seq_region.$dbtype.$i.success
");
286 $logger->error("At least one of your jobs failed.\nPlease check the logfiles at $logpath
for errors.\n
");
296 sub build_cache_all {
298 # load the cache implementation
299 my $cache_impl = 'Bio::EnsEMBL::IdMapping::Cache';
302 my $cache = $cache_impl->new(
308 foreach my $dbtype (qw(source target)) {
310 $logger->info("\n
".ucfirst($dbtype)." db...\n
", 0, 'stamped');
311 $logger->info("Building cache
for whole genome...\n
");
315 ($i, $size) = $cache->build_cache_all($dbtype);
317 $logger->info("Done with $dbtype (genes: $i, filesize: $size).\n
", 0,