ensembl-hive  2.7.0
runWorker.pl
Go to the documentation of this file.
1 #!/usr/bin/env perl
2 
3 use strict;
4 use warnings;
5 
6  # Finding out own path in order to reference own components (including own modules):
7 use Cwd ();
8 use File::Basename ();
BEGIN {
    # Unless the environment already provides a (non-empty) eHive root,
    # derive it as the grandparent directory of this script, with symlinks resolved.
    unless ( $ENV{'EHIVE_ROOT_DIR'} ) {
        my $script_path = Cwd::realpath($0);
        my $script_dir  = File::Basename::dirname($script_path);
        $ENV{'EHIVE_ROOT_DIR'} = File::Basename::dirname($script_dir);
    }

    # Put the bundled modules at the front of @INC so they are found first.
    unshift @INC, $ENV{'EHIVE_ROOT_DIR'} . '/modules';
}
13 
use Getopt::Long qw(:config no_auto_abbrev);
use Pod::Usage;

use Bio::EnsEMBL::Hive::HivePipeline;
use Bio::EnsEMBL::Hive::Scripts::RunWorker;
use Bio::EnsEMBL::Hive::Utils::URL;
use Bio::EnsEMBL::Hive::Version qw(report_versions);

23 
# NOTE(review): this call was dropped from the listing (gap at line 22) and is
# restored from the file's cross-reference list; presumably it masks the password
# part of a -url argument before the options are parsed -- confirm against Utils::URL.
Bio::EnsEMBL::Hive::Utils::URL::hide_url_password();

main();
25 
26 
# Script entry point: parses the command line, connects to the eHive pipeline
# database and hands control to Bio::EnsEMBL::Hive::Scripts::RunWorker::runWorker().
#
# Takes no arguments; all options are read from @ARGV via GetOptions.
# Dies if option parsing fails, if unparsed arguments remain, if -worker_cur_dir
# cannot be entered, if neither -url nor -reg_alias is given, or if the pipeline
# has no database adaptor.  Exits with status 0 after printing the help (-help)
# or the version report (-versions).
sub main {
    my ($url, $reg_conf, $reg_type, $reg_alias, $nosqlvc);    # Connection parameters
    my ($preregistered, $resource_class_id, $resource_class_name, $analyses_pattern, $analysis_id, $logic_name, $job_id, $force, $beekeeper_id);    # Task specification parameters
    my ($job_limit, $life_span, $no_cleanup, $no_write, $worker_cur_dir, $hive_log_dir, $worker_log_dir, $worker_base_temp_dir, $retry_throwing_jobs, $can_respecialize,    # Worker control parameters
        $worker_delay_startup_seconds, $worker_crash_on_startup_prob, $config_files);
    my ($help, $report_versions, $debug);

    # Default values
    $config_files = [];

    $|=1;    # make STDOUT unbuffered (STDERR is unbuffered anyway)

    GetOptions(

        # Connection parameters:
        'url=s'                        => \$url,
        'reg_conf|regfile|reg_file=s'  => \$reg_conf,
        'reg_type=s'                   => \$reg_type,
        'reg_alias|regname|reg_name=s' => \$reg_alias,
        'nosqlvc'                      => \$nosqlvc,    # using "nosqlvc" instead of "sqlvc!" for consistency with scripts where it is a propagated option

        # json config files
        'config_file=s@' => $config_files,

        # Task specification parameters:
        'preregistered!'     => \$preregistered,
        'rc_id=i'            => \$resource_class_id,
        'rc_name=s'          => \$resource_class_name,
        'analyses_pattern=s' => \$analyses_pattern,
        'analysis_id=i'      => \$analysis_id,
        'logic_name=s'       => \$logic_name,
        'job_id=i'           => \$job_id,
        'force!'             => \$force,
        'beekeeper_id=i'     => \$beekeeper_id,

        # Worker control parameters:
        'job_limit=i'                       => \$job_limit,
        'life_span|lifespan=i'              => \$life_span,
        'no_cleanup'                        => \$no_cleanup,
        'no_write'                          => \$no_write,
        'worker_cur_dir|cwd=s'              => \$worker_cur_dir,
        'hive_log_dir|hive_output_dir=s'    => \$hive_log_dir,      # keep compatibility with the old name
        'worker_log_dir|worker_output_dir=s' => \$worker_log_dir,   # will take precedence over hive_log_dir if set
        'worker_base_temp_dir=s'            => \$worker_base_temp_dir,
        'retry_throwing_jobs!'              => \$retry_throwing_jobs,
        'can_respecialize|can_respecialise!' => \$can_respecialize,
        'worker_delay_startup_seconds=i'    => \$worker_delay_startup_seconds,
        'worker_crash_on_startup_prob=f'    => \$worker_crash_on_startup_prob,

        # Other commands
        'h|help'             => \$help,
        'v|version|versions' => \$report_versions,
        'debug=i'            => \$debug,
    ) or die "Error in command line arguments\n";

    # Anything GetOptions left behind is not a recognised option or option value.
    if (@ARGV) {
        die "ERROR: There are invalid arguments on the command-line: ". join(" ", @ARGV). "\n";
    }

    if ($help) {
        pod2usage({-exitvalue => 0, -verbose => 2});
    }

    if ($report_versions) {
        report_versions();
        exit(0);
    }

    # Allows using relative paths for Sqlite URLs, registry files etc.
    # A failed chdir is fatal: carrying on would resolve those relative paths
    # against the wrong directory.
    if ($worker_cur_dir) {
        chdir $worker_cur_dir
            or die "ERROR: Could not change directory to '$worker_cur_dir': $!\n";
    }

    unless ($url or $reg_alias) {
        die "\nERROR: Connection parameters (url or reg_conf+reg_alias) need to be specified\n";
    }

    my $pipeline = Bio::EnsEMBL::Hive::HivePipeline->new(
        -url                            => $url,
        -reg_conf                       => $reg_conf,
        -reg_type                       => $reg_type,
        -reg_alias                      => $reg_alias,
        -no_sql_schema_version_check    => $nosqlvc,
    );

    # Check for the adaptor before dereferencing it below.
    unless ($pipeline->hive_dba) {
        die "ERROR : no database connection, the pipeline could not be accessed\n\n";
    }
    $pipeline->hive_dba()->dbc->requires_write_access();

    # Both deprecated selectors are folded into -analyses_pattern; -logic_name wins if both are given.
    if ( $logic_name ) {
        warn "-logic_name is now deprecated, please use -analyses_pattern that extends the functionality of -logic_name and -analysis_id .\n";
        $analyses_pattern = $logic_name;
    } elsif ( $analysis_id ) {
        warn "-analysis_id is now deprecated, please use -analyses_pattern that extends the functionality of -analysis_id and -logic_name .\n";
        $analyses_pattern = $analysis_id;
    }

    my %specialisation_options = (
        preregistered       => $preregistered,
        resource_class_id   => $resource_class_id,
        resource_class_name => $resource_class_name,
        can_respecialize    => $can_respecialize,
        analyses_pattern    => $analyses_pattern,
        job_id              => $job_id,
        force               => $force,
        beekeeper_id        => $beekeeper_id,
    );
    my %life_options = (
        job_limit                    => $job_limit,
        life_span                    => $life_span,
        retry_throwing_jobs          => $retry_throwing_jobs,
        worker_delay_startup_seconds => $worker_delay_startup_seconds,
        worker_crash_on_startup_prob => $worker_crash_on_startup_prob,
    );
    my %execution_options = (
        config_files         => $config_files,
        no_cleanup           => $no_cleanup,
        no_write             => $no_write,
        worker_base_temp_dir => $worker_base_temp_dir,
        worker_log_dir       => $worker_log_dir,
        hive_log_dir         => $hive_log_dir,
        debug                => $debug,
    );

    Bio::EnsEMBL::Hive::Scripts::RunWorker::runWorker($pipeline, \%specialisation_options, \%life_options, \%execution_options);
}
155 
156 
157 __DATA__
158 
159 =pod
160 
161 =head1 NAME
162 
163 runWorker.pl [options]
164 
165 =head1 DESCRIPTION
166 
167 runWorker.pl is an eHive component script that does the work of a single Worker.
168 It specialises in one of the analyses and starts executing Jobs of that Analysis one-by-one or batch-by-batch.
169 
170 Most of the functionality of eHive is accessible via the beekeeper.pl script,
171 but feel free to run runWorker.pl yourself if you think you need direct access to the running Jobs.
172 
173 =head1 USAGE EXAMPLES
174 
175  # Run one local Worker process in ehive_dbname and let the system pick up the Analysis
176  runWorker.pl -url mysql://username:secret@hostname:port/ehive_dbname
177 
178  # Run one local Worker process in ehive_dbname and let the system pick up the Analysis from the given resource_class
179  runWorker.pl -url mysql://username:secret@hostname:port/ehive_dbname -rc_name low_mem
180 
181  # Run one local Worker process in ehive_dbname and constrain its initial specialisation within a subset of analyses
182  runWorker.pl -url mysql://username:secret@hostname:port/ehive_dbname -analyses_pattern '1..15,analysis_X,21'
183 
184  # Run one local Worker process in ehive_dbname and allow it to respecialize within a subset of Analyses
185  runWorker.pl -url mysql://username:secret@hostname:port/ehive_dbname -can_respecialize -analyses_pattern 'blast%-4..6'
186 
187  # Run a specific Job in a local Worker process:
188  runWorker.pl -url mysql://username:secret@hostname:port/ehive_dbname -job_id 123456
189 
190 =head1 OPTIONS
191 
192 =head2 Connection parameters:
193 
194 =over
195 
196 =item --reg_conf <path>
197 
198 path to a Registry configuration file
199 
200 =item --reg_alias <string>
201 
202 species/alias name for the eHive DBAdaptor
203 
204 =item --reg_type <string>
205 
206 type of the registry entry ("hive", "core", "compara", etc - defaults to "hive")
207 
208 =item --url <url string>
209 
210 URL defining where database is located
211 
212 =item --nosqlvc
213 
214 "No SQL Version Check" - set if you want to force working with a database created by a potentially schema-incompatible API
215 
216 =back
217 
218 =head2 Configs overriding
219 
220 =over
221 
222 =item --config_file <string>
223 
224 JSON file (with absolute path) that overrides the default configuration (the option may be given multiple times)
225 
226 =back
227 
228 =head2 Task specification parameters:
229 
230 =over
231 
232 =item --rc_id <id>
233 
234 resource class id
235 
236 =item --rc_name <string>
237 
238 resource class name
239 
240 =item --analyses_pattern <string>
241 
242 restrict the specialisation of the Worker to the specified subset of Analyses
243 
244 =item --analysis_id <id>
245 
246 run a Worker and have it specialise to an Analysis with this analysis_id
247 
248 =item --job_id <id>
249 
250 run a specific Job defined by its database id
251 
252 =item --force
253 
254 set if you want to force running a Worker over a BLOCKED Analysis or to run a specific DONE/SEMAPHORED job_id
255 
256 =back
257 
258 =head2 Worker control parameters:
259 
260 =over
261 
262 =item --job_limit <num>
263 
264 number of Jobs to run before the Worker can die naturally
265 
266 =item --life_span <num>
267 
268 number of minutes this Worker is allowed to run
269 
270 =item --no_cleanup
271 
272 don't perform temp directory cleanup when the Worker exits
273 
274 =item --no_write
275 
276 don't write_output or auto_dataflow input_job
277 
278 =item --worker_base_temp_dir <path>
279 
280 The base directory that this worker will use for temporary operations. This overrides the default set
281 in the JSON config file and in the code (/tmp)
282 
283 =item --hive_log_dir <path>
284 
285 directory where stdout/stderr of the whole eHive of workers is redirected
286 
287 =item --worker_log_dir <path>
288 
289 directory where stdout/stderr of this particular Worker is redirected
290 
291 =item --retry_throwing_jobs
292 
293 By default, Jobs are allowed to fail a few times (up to the Analysis' max_retry_count parameter) until the system "gives up" and considers them as FAILED.
294 Retry Jobs if the Job dies knowingly (e.g. due to encountering a die statement in the Runnable).
295 
296 =item --can_respecialize
297 
298 allow this Worker to re-specialise into another Analysis (within resource_class) after it has exhausted all Jobs of the current one
299 
300 =item --worker_delay_startup_seconds <number>
301 
302 number of seconds each Worker has to wait before first talking to the database (0 by default, useful for debugging)
303 
304 =item --worker_crash_on_startup_prob <float>
305 
306 probability of each Worker failing at startup (0 by default, useful for debugging)
307 
308 =back
309 
310 =head2 Other options:
311 
312 =over
313 
314 =item --help
315 
316 print this help
317 
318 =item --versions
319 
320 report both eHive code version and eHive database schema version
321 
322 =item --debug <level>
323 
324 turn on debug messages at <level>
325 
326 =back
327 
328 =head1 LICENSE
329 
330  See the NOTICE file distributed with this work for additional information
331  regarding copyright ownership.
332 
333  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
334  You may obtain a copy of the License at
335 
336  http://www.apache.org/licenses/LICENSE-2.0
337 
338  Unless required by applicable law or agreed to in writing, software distributed under the License
339  is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
340  See the License for the specific language governing permissions and limitations under the License.
341 
342 =head1 CONTACT
343 
344 Please subscribe to the eHive mailing list: http://listserver.ebi.ac.uk/mailman/listinfo/ehive-users to discuss eHive-related questions or to be notified of our updates
345 
346 =cut
347 
Bio::EnsEMBL::Hive::Utils::URL::hide_url_password
public Void hide_url_password()
Bio::EnsEMBL::Hive::Utils::URL
Definition: URL.pm:11
Bio::EnsEMBL::Hive::HivePipeline::new
public new()
Bio::EnsEMBL::Hive::Version
Definition: Version.pm:19
debug
public debug()
Bio::EnsEMBL::Hive::HivePipeline
Definition: HivePipeline.pm:13
BEGIN
public BEGIN()
Bio::EnsEMBL::Hive::Scripts::RunWorker
Definition: RunWorker.pm:6
main
public main()
run
public run()
Bio::EnsEMBL::Hive::Scripts::RunWorker::runWorker
public runWorker()