9 # Example 1: specifying only the mandatory option:
12 # Example 2: specifying the mandatory options as well as overriding some defaults:
17 Generic configuration module
for all Hive pipelines with loader functionality.
18 All other Hive PipeConfig modules should inherit from
this module and will probably need to redefine some or all of the following
interface methods:
20 * default_options: returns a hash of (possibly multilevel) defaults for the options on which depend the rest of the configuration
22 * pipeline_create_commands: returns a list of strings that will be executed as system commands needed to create and set up the pipeline database
24 * pipeline_wide_parameters: returns a hash of pipeline-wide parameter names and their values
26 * resource_classes: returns a hash of resource class definitions
28 * pipeline_analyses: returns a list of hash structures that define analysis objects bundled with definitions of corresponding jobs, rules and resources
30 * beekeeper_extra_cmdline_options returns a string with command line options that you want to be passed to the beekeeper.pl
32 When defining anything except the keys of default_options() a call to $self->o('myoption') can be used.
33 This call means "substitute this call for the value of 'myoption' at the time of configuring the pipeline".
34 All option names mentioned in $self->o() calls within the five interface methods above can be given non-default values from the command line.
Please make sure you have studied the pipeline configuration examples in
Bio::EnsEMBL::Hive::PipeConfig before creating your own PipeConfig modules.
40 Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
41 Copyright [2016-2024] EMBL-European Bioinformatics Institute
43 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
44 You may obtain a copy of the License at
48 Unless required by applicable law or agreed to in writing, software distributed under the License
49 is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
50 See the License for the specific language governing permissions and limitations under the License.
Please subscribe to the Hive mailing list:  http://listserver.ebi.ac.uk/mailman/listinfo/ehive-users  (moderated and with public archive)
use strict;
use warnings;

use Exporter 'import';
our @EXPORT = qw(WHEN ELSE INPUT_PLUS);

use Scalar::Util qw(looks_like_number);

# Project imports required by the code below (whoami/stringify/join_command_args,
# URL parsing and the SQL schema adaptor are all referenced in this file):
use Bio::EnsEMBL::Hive::Utils ('stringify', 'join_command_args', 'whoami');
use Bio::EnsEMBL::Hive::Utils::URL;
use Bio::EnsEMBL::Hive::DBSQL::SqlSchemaAdaptor;

use base ('Bio::EnsEMBL::Hive::DependentOptions');
87 # ---------------------------[the following methods will be overridden by specific pipelines]-------------------------
90 =head2 default_options
92 Description : Interface method that should return a hash of option_name->default_option_value pairs.
93 Please see existing PipeConfig modules for examples.
sub default_options {
    my ($self) = @_;

    return {
        'hive_root_dir'         => $ENV{'EHIVE_ROOT_DIR'},              # this value is set up automatically if this code is run by init_pipeline.pl

            # Connection parameters, mostly taken from the environment:
        'hive_driver'           => 'mysql',
        'host'                  => $ENV{'EHIVE_HOST'} || 'localhost',   # BEWARE that 'localhost' for mysql driver usually means a UNIX socket, not a TCPIP socket!
                                                                        # If you need to connect to TCPIP socket, set  -host => '127.0.0.1' instead.
        'port'                  => $ENV{'EHIVE_PORT'},                  # or remain undef, which means default for the driver
        'user'                  => $ENV{'EHIVE_USER'},
        'password'              => $ENV{'EHIVE_PASS'},
        'dbowner'               => $ENV{'EHIVE_USER'} || whoami() || $self->o('dbowner'),   # although it is very unlikely that the current user has no name

            # Global behaviour switches:
        'hive_use_triggers'                 => 0,       # there have been a few cases of big pipelines misbehaving with triggers on, let's keep the default off
        'hive_use_param_stack'              => 0,       # do not reconstruct the calling stack of parameters by default (yet)
        'hive_auto_rebalance_semaphores'    => 0,       # do not attempt to rebalance semaphores periodically by default
        'hive_default_max_retry_count'      => 3,       # default value for the max_retry_count parameter of each analysis
        'hive_force_init'                   => 0,       # setting it to 1 will drop the database prior to creation (use with care!)
        'hive_no_init'                      => 0,       # setting it to 1 will skip pipeline_create_commands (useful for topping up)
        'hive_debug_init'                   => 0,       # setting it to 1 will make init_pipeline.pl tell everything it's doing

        'pipeline_name'         => $self->default_pipeline_name(),

            # The composite connection hash; individual keys resolve via $self->o() saturation:
        'pipeline_db'   => {
            -driver => $self->o('hive_driver'),
            -host   => $self->o('host'),
            -port   => $self->o('port'),
            -user   => $self->o('user'),
            -pass   => $self->o('password'),
            -dbname => $self->o('dbowner').'_'.$self->o('pipeline_name'),
        },
    };
}
133 =head2 pipeline_create_commands
135 Description : Interface method that should
return a list of command lines to be
run in order to create and set up the pipeline database.
136 Please see existing PipeConfig modules
for examples.
sub pipeline_create_commands {
    my $self = shift @_;

    my $pipeline_url    = $self->pipeline_url();
    my $second_pass     = $pipeline_url!~ /^#:subst/;   # during the first pass the URL is still an unsubstituted placeholder

    # NOTE(review): the $parsed_url assignment was missing from the mangled text; restored from upstream — confirm.
    my $parsed_url      = $second_pass && Bio::EnsEMBL::Hive::Utils::URL::parse( $pipeline_url );
    my $driver          = $second_pass ? $parsed_url->{'driver'} : '';
    my $hive_force_init = $self->o('hive_force_init');

    # Will insert two keys: "hive_all_base_tables" and "hive_all_views"
    my $hive_tables_sql = 'INSERT INTO hive_meta SELECT CONCAT("hive_all_", REPLACE(LOWER(TABLE_TYPE), " ", "_"), "s"), GROUP_CONCAT(TABLE_NAME) FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_SCHEMA = "%s" GROUP BY TABLE_TYPE';

    return [
            $hive_force_init ? $self->db_cmd('DROP DATABASE IF EXISTS') : (),
            $self->db_cmd('CREATE DATABASE'),

                # we got table definitions for all drivers:
            $self->db_cmd().' <'.$self->o('hive_root_dir').'/sql/tables.'.$driver,

                # auto-sync'ing triggers are off by default:
            $self->o('hive_use_triggers') ? ( $self->db_cmd().' <'.$self->o('hive_root_dir').'/sql/triggers.'.$driver ) : (),

                # FOREIGN KEY constraints cannot be defined in sqlite separately from table definitions, so they are off there:
            ($driver ne 'sqlite') ? ( $self->db_cmd().' <'.$self->o('hive_root_dir').'/sql/foreign_keys.sql' ) : (),

                # we got procedure definitions for all drivers:
            $self->db_cmd().' <'.$self->o('hive_root_dir').'/sql/procedures.'.$driver,

                # list of all tables and views (MySQL only)
            ($driver eq 'mysql' ? ($self->db_cmd(sprintf($hive_tables_sql, $parsed_url->{'dbname'}))) : ()),

                # when the database was created
            $self->db_cmd(q{INSERT INTO hive_meta (meta_key, meta_value) VALUES ('creation_timestamp', CURRENT_TIMESTAMP)}),
    ];
}
178 =head2 pipeline_wide_parameters
180 Description : Interface method that should
return a hash of pipeline_wide_parameter_name->pipeline_wide_parameter_value pairs.
181 The value doesn
't have to be a scalar, can be any Perl structure now (will be stringified and de-stringified automagically).
182 Please see existing PipeConfig modules for examples.
sub pipeline_wide_parameters {
    my ($self) = @_;

    # By default there are no pipeline-wide parameters; subclasses override this
    # and return name=>value pairs (values may be any Perl structure).
    return {
        # 'variable1' => 'value1',
        # 'variable2' => 'value2',
    };
}
195 =head2 resource_classes
197 Description : Interface method that should return a hash of resource_description_id->resource_description_hash.
198 Please see existing PipeConfig modules for examples.
sub resource_classes {
    my ($self) = @_;

    return {
        ## No longer supported resource declaration syntax:
        #    1 => { -desc => 'default', 'LSF' => '' },
        #    2 => { -desc => 'urgent',  'LSF' => '-q production' },
        ## Currently supported resource declaration syntax:
        'default' => { 'LSF' => '' },
        'urgent'  => { 'LSF' => '-q production' },
    };
}
215 =head2 pipeline_analyses
217 Description : Interface method that should return a list of hashes that define analysis bundled with corresponding jobs, dataflow and analysis_ctrl rules and resource_id.
218 Please see existing PipeConfig modules for examples.
222 sub pipeline_analyses {
229 =head2 beekeeper_extra_cmdline_options
231 Description : Interface method that should return a string with extra parameters that you want to be passed to beekeeper.pl
235 sub beekeeper_extra_cmdline_options {
242 # ---------------------------------[now comes the interfacing stuff - feel free to call but not to modify]--------------------
sub hive_meta_table {
    my $self = shift @_;

    # Key/value pairs to be stored in the hive_meta table at init time.
    return {
        'hive_sql_schema_version'           => Bio::EnsEMBL::Hive::DBSQL::SqlSchemaAdaptor->get_code_sql_schema_version(),
        'hive_pipeline_name'                => $self->o('pipeline_name'),
        'hive_use_param_stack'              => $self->o('hive_use_param_stack'),
        'hive_auto_rebalance_semaphores'    => $self->o('hive_auto_rebalance_semaphores'),
        'hive_default_max_retry_count'      => $self->o('hive_default_max_retry_count'),
    };
}
sub pre_options {
    my $self = shift @_;

    # Options that must be parsed from the command line before the main
    # option-saturation pass (they seed the root of the option tree).
    return {
        'pipeline_url'  => '',
        'pipeline_name' => '',
    };
}
270 Description : A convenience method used to stringify a connection-parameters hash into a 'pipeline_url
' that beekeeper.pl will undestand
sub dbconn_2_url {
    my ($self, $db_conn, $with_db) = @_;

    # A convenience method that stringifies a connection-parameters hash into a
    # 'pipeline_url' that beekeeper.pl will understand.
    $with_db = 1 unless(defined($with_db));

    my $driver  = $self->o($db_conn, '-driver');
    my $port    = $self->o($db_conn, '-port');

    # sqlite URLs carry no credentials/host; everything else is driver://user:pass@host[:port]/
    return (    ($driver eq 'sqlite')
            ? $driver . ':///'
            : $driver . '://' . $self->o($db_conn, '-user') . ':' . $self->o($db_conn, '-pass')
                      . '@' . $self->o($db_conn, '-host') . ($port ? ':'.$port : '') . '/'
    ) . ($with_db ? $self->o($db_conn, '-dbname') : '');
}
sub pipeline_url {
    my $self = shift @_;

    # Falls back to stringifying 'pipeline_db'; the dbconn_2_url() call is also
    # used to force vivification of the whole 'pipeline_db' structure (used in run()).
    return $self->root()->{'pipeline_url'} || $self->dbconn_2_url('pipeline_db', 1);
}
298 Description : Returns a db_cmd.pl-based command line that should execute by any supported driver (mysql/pgsql/sqlite)
sub db_cmd {
    my ($self, $sql_command, $db_url) = @_;

    # Returns a db_cmd.pl-based command line that should execute by any
    # supported driver (mysql/pgsql/sqlite).
    # NOTE(review): the $db_url default was on a dropped line; restored from upstream — confirm.
    $db_url //= $self->pipeline_url();
    my $db_cmd_path = $self->o('hive_root_dir').'/scripts/db_cmd.pl';

    # shell-escape single quotes inside the SQL so it survives the -sql '...' wrapper:
    $sql_command =~ s/'/'\\''/g if $sql_command;
    return "$db_cmd_path -url '$db_url'".($sql_command ? " -sql '$sql_command'" : '');
}
sub print_debug {
    my $self = shift @_;

    # Echo the given strings only when 'hive_debug_init' is switched on.
    print @_ if $self->o('hive_debug_init');
}
sub process_pipeline_name {
    my ($self, $ppn) = @_;

    # Normalize a pipeline name so it is safe to embed into a database name.
    $ppn=~s/([[:lower:]])([[:upper:]])/${1}_${2}/g;     # CamelCase into Camel_Case
    $ppn=~s/[\s\/]/_/g;                                 # remove all spaces and other annoying characters

    return $ppn;
}
sub default_pipeline_name {
    my $self = shift @_;

    my $dpn = ref($self);       # get the original class name
    $dpn=~s/^.*:://;            # trim the leading classpath prefix
    $dpn=~s/_conf$//;           # trim the optional '_conf' suffix

    return $dpn;
}
340 =head2 process_options
342 Description : The method that does all the parameter parsing magic.
343 It is two-pass through the
interface methods: first pass collects the options, second is intelligent substitution.
345 Caller : init_pipeline.pl or any other script that will drive this module.
347 Note : You can override parsing the command line bit by providing a hash as the argument to this method.
348 This hash should contain definitions of all the parameters you would otherwise be providing from the command line.
349 Useful if you are creating batches of hive pipelines using a script.
sub process_options {
    my ($self, $include_pcc_use_case) = @_;

    # The method that does all the parameter parsing magic: a two-pass walk over
    # the interface methods (first pass collects options, second substitutes).

    # pre-patch definitely_used_options:
    $self->{'_extra_options'} = $self->load_cmdline_options( $self->pre_options() );
    $self->root()->{'pipeline_url'} = $self->{'_extra_options'}{'pipeline_url'};

    my @use_cases = ( 'pipeline_wide_parameters', 'resource_classes', 'pipeline_analyses', 'beekeeper_extra_cmdline_options', 'hive_meta_table', 'print_debug' );
    if($include_pcc_use_case) {
        unshift @use_cases, 'overridable_pipeline_create_commands';
        push @use_cases, 'useful_commands_legend';
    }
    $self->use_cases( \@use_cases );

    $self->SUPER::process_options();

    # normalize the pipeline name (and any db name derived from it),
    # since it may be used to construct $self->pipeline_url():
    $self->root()->{'pipeline_name'} = $self->process_pipeline_name( $self->root()->{'pipeline_name'} );
    $self->root()->{'pipeline_db'}{'-dbname'} &&= $self->process_pipeline_name( $self->root()->{'pipeline_db'}{'-dbname'} );
}
sub overridable_pipeline_create_commands {
    my $self = shift @_;

    my $pipeline_create_commands = $self->pipeline_create_commands();

    # 'hive_no_init' suppresses database creation (useful for topping up an existing pipeline):
    return $self->o('hive_no_init') ? [] : $pipeline_create_commands;
}
sub is_analysis_topup {
    my $self = shift @_;

    # True when the pipeline database is not being (re)initialized, i.e. we are
    # only topping up analyses in an existing database.
    return $self->o('hive_no_init');
}
sub run_pipeline_create_commands {
    my $self = shift @_;

    foreach my $cmd (@{$self->overridable_pipeline_create_commands}) {
        # We allow commands to be given as an arrayref, but we join the
        # array elements anyway
        (my $dummy,$cmd) = join_command_args($cmd);
        $self->print_debug( "$cmd\n" );
        if(my $retval = system($cmd)) {
            die "Return value = $retval, possibly an error running $cmd\n";
        }
    }
    $self->print_debug( "\n" );
}
406 =head2 add_objects_from_config
408 Description : The method that uses the Hive/
EnsEMBL API to actually create all the analyses, jobs, dataflow and control rules and resource descriptions.
410 Caller : init_pipeline.pl or any other script that will drive
this module.
414 sub add_objects_from_config {
# NOTE(review): this sub's text is line-mangled by the extraction (stray original
# line numbers embedded in the code, single-quoted strings split across newlines,
# and many lines dropped entirely — e.g. the `my $self = shift` header, the
# input_id job-creation loop, the dataflow/control-rule parsing and all closing
# braces). Restore the body from the upstream HiveGeneric_conf before use; the
# comments below only annotate the sections that remain visible.
416 my $pipeline = shift @_;
# --- Section 1: populate the hive_meta table from hive_meta_table() ---
418 $self->print_debug(
"Adding hive_meta table entries ...\n" );
419 my $new_meta_entries = $self->hive_meta_table();
420 while( my ($meta_key, $meta_value) = each %$new_meta_entries ) {
421 $pipeline->add_new_or_update(
'MetaParameters', $self->o(
'hive_debug_init'),
422 'meta_key' => $meta_key,
423 'meta_value' => $meta_value,
426 $self->print_debug(
"Done.\n\n" );
# --- Section 2: store pipeline-wide parameters (values stringified) ---
428 $self->print_debug(
"Adding pipeline-wide parameters ...\n" );
429 my $new_pwp_entries = $self->pipeline_wide_parameters();
430 while( my ($param_name, $param_value) = each %$new_pwp_entries ) {
431 $pipeline->add_new_or_update(
'PipelineWideParameters', $self->o(
'hive_debug_init'),
432 'param_name' => $param_name,
433 'param_value' => stringify($param_value),
436 $self->print_debug(
"Done.\n\n" );
# --- Section 3: create ResourceClass/ResourceDescription objects;
#     a 'default' class is force-created if the subclass forgot it ---
438 $self->print_debug(
"Adding Resources ...\n" );
439 my $resource_classes_hash = $self->resource_classes;
440 unless( exists $resource_classes_hash->{
'default'} ) {
441 warn
"\tNB:'default' resource class is not in the database (did you forget to inherit from SUPER::resource_classes ?) - creating it for you\n";
442 $resource_classes_hash->{
'default'} = {};
444 my @resource_classes_order = sort { ($b eq
'default') or -($a eq
'default') or ($a cmp $b) } keys %$resource_classes_hash; # put
'default' to the front
445 my %cached_resource_classes =
map {$_->name => $_} $pipeline->collection_of(
'ResourceClass')->list();
446 foreach my $rc_name (@resource_classes_order) {
447 if($rc_name=~/^\d+$/) {
448 die
"-rc_id syntax is no longer supported, please use the new resource notation (-rc_name)";
451 my ($resource_class) = $pipeline->add_new_or_update(
'ResourceClass', # NB: add_new_or_update returns a list
454 $cached_resource_classes{$rc_name} = $resource_class;
456 while( my($meadow_type, $resource_param_list) = each %{ $resource_classes_hash->{$rc_name} } ) {
457 $resource_param_list = [ $resource_param_list ] unless(ref($resource_param_list)); # expecting either a scalar or a 2-element array
459 my ($resource_description) = $pipeline->add_new_or_update(
'ResourceDescription', $self->o(
'hive_debug_init'), # NB: add_new_or_update returns a list
460 'resource_class' => $resource_class,
461 'meadow_type' => $meadow_type,
462 'submission_cmd_args' => $resource_param_list->[0],
463 'worker_cmd_args' => $resource_param_list->[1],
468 $self->print_debug(
"Done.\n\n" );
# --- Section 4: create Analysis + AnalysisStats objects (and their jobs) ---
473 my %seen_logic_name = ();
474 my %analyses_by_logic_name =
map {$_->logic_name => $_} $pipeline->collection_of(
'Analysis')->list();
476 $self->print_debug(
"Adding Analyses ...\n" );
477 foreach my $aha (@{$self->pipeline_analyses}) {
478 my %aha_copy = %$aha;
479 my ($logic_name, $module, $parameters_hash, $comment, $tags, $input_ids, $blocked, $batch_size, $hive_capacity, $failed_job_tolerance,
480 $max_retry_count, $can_be_empty, $rc_id, $rc_name, $priority, $meadow_type, $analysis_capacity, $language, $wait_for, $flow_into)
481 =
delete @aha_copy{qw(-logic_name -module -parameters -comment -tags -input_ids -blocked -batch_size -hive_capacity -failed_job_tolerance
482 -max_retry_count -can_be_empty -rc_id -rc_name -priority -meadow_type -analysis_capacity -language -wait_for -flow_into)}; # slicing a hash reference
# any keys left in %aha_copy after the delete-slice are unrecognized attributes:
484 my @unparsed_attribs = keys %aha_copy;
485 if(@unparsed_attribs) {
486 die
"Could not parse the following analysis attributes: ".join(
', ',@unparsed_attribs);
# validate the logic_name (mandatory, no reserved characters, non-numeric, unique):
489 if( not $logic_name ) {
490 die
"'-logic_name' must be defined in every analysis";
491 } elsif( $logic_name =~ /[+\-\%\.,]/ ) {
492 die
"Characters + - % . , are no longer allowed to be a part of an Analysis name. Please rename Analysis '$logic_name' and try again.\n";
493 } elsif( looks_like_number($logic_name) ) {
494 die
"Numeric Analysis names are not allowed because they may clash with dbIDs. Please rename Analysis '$logic_name' and try again.\n";
497 if($seen_logic_name{$logic_name}++) {
498 die
"an entry with -logic_name '$logic_name' appears at least twice in the same configuration file, probably a typo";
502 die
"(-rc_id => $rc_id) syntax is deprecated, please use (-rc_name => 'your_resource_class_name')";
505 my $analysis = $analyses_by_logic_name{$logic_name}; # the analysis with
this logic_name may have already been stored in the db
509 warn
"Skipping creation of already existing analysis '$logic_name'.\n";
# resolve the resource class for this analysis (falling back to 'default'):
514 $rc_name ||=
'default';
515 my $resource_class = $cached_resource_classes{$rc_name}
516 or die
"Could not find local resource with name '$rc_name', please check that resource_classes() method of your PipeConfig either contains or inherits it from the parent class";
518 if ($meadow_type and not exists $amh->{$meadow_type}) {
519 warn
"The meadow '$meadow_type' is currently not registered (analysis '$logic_name')\n";
522 $parameters_hash ||= {}; # in
case nothing was given
523 die
"'-parameters' has to be a hash" unless(ref($parameters_hash) eq
'HASH');
525 ($analysis) = $pipeline->add_new_or_update(
'Analysis', $self->o(
'hive_debug_init'), # NB: add_new_or_update returns a list
526 'logic_name' => $logic_name,
528 'language' => $language,
529 'parameters' => $parameters_hash,
530 'comment' => $comment,
531 'tags' => ( (ref($tags) eq
'ARRAY') ? join(
',', @$tags) : $tags ),
532 'resource_class' => $resource_class,
533 'failed_job_tolerance' => $failed_job_tolerance,
534 'max_retry_count' => $max_retry_count,
535 'can_be_empty' => $can_be_empty,
536 'priority' => $priority,
537 'meadow_type' => $meadow_type,
538 'analysis_capacity' => $analysis_capacity,
539 'hive_capacity' => $hive_capacity,
540 'batch_size' => $batch_size,
542 $analysis->get_compiled_module_name(); # check
if it compiles and is named correctly
544 ($stats) = $pipeline->add_new_or_update(
'AnalysisStats', $self->o(
'hive_debug_init'), # NB: add_new_or_update returns a list
545 'analysis' => $analysis,
546 'status' => $blocked ?
'BLOCKED' :
'EMPTY', # be careful, as
this "soft" way of blocking may be accidentally unblocked by deep sync
547 'total_job_count' => 0,
548 'semaphored_job_count' => 0,
549 'ready_job_count' => 0,
550 'done_job_count' => 0,
551 'failed_job_count' => 0,
552 'num_running_workers' => 0,
557 # Keep a link to the analysis object to speed up the creation of control and dataflow rules
558 $analyses_by_logic_name{$logic_name} = $analysis;
560 # now create the corresponding jobs (if there are any):
563 'prev_job' => undef, # these jobs are created by the initialization script, not by another job
564 'analysis' => $analysis,
565 'input_id' => $_, # input_ids are now centrally stringified in the AnalysisJob itself
568 unless( $pipeline->hive_use_triggers() ) {
569 $stats->recalculate_from_job_counts( {
'READY' => scalar(@$input_ids) } );
573 $self->print_debug(
"Done.\n\n" );
# --- Section 5: second pass over pipeline_analyses() to wire up
#     control (-wait_for) and dataflow (-flow_into) rules ---
575 $self->print_debug(
"Adding Control and Dataflow Rules ...\n" );
576 foreach my $aha (@{$self->pipeline_analyses}) {
578 my ($logic_name, $wait_for, $flow_into)
579 = @{$aha}{qw(-logic_name -wait_for -flow_into)}; # slicing a hash reference
581 my $analysis = $analyses_by_logic_name{$logic_name};
592 $self->print_debug(
"Done.\n\n" );
# --- Section 6: apply blocking control rules and refresh analysis statuses ---
594 # Block the analyses that should be blocked
595 $self->print_debug(
"Blocking the analyses that should be ...\n" );
596 foreach my $stats ($pipeline->collection_of(
'AnalysisStats')->list()) {
597 $stats->check_blocking_control_rules(
'no_die');
598 $stats->determine_status();
600 $self->print_debug(
"Done.\n\n" );
sub useful_commands_legend {
    my $self = shift @_;

    my $pipeline_url = $self->pipeline_url();
    unless ($pipeline_url =~ /^[\'\"]/) {   # protect the URL from shell expansion unless it is already quoted
        $pipeline_url = '"' . $pipeline_url . '"';
    }
    my $pipeline_name = $self->o('pipeline_name');
    my $extra_cmdline = $self->beekeeper_extra_cmdline_options();

    my @output_lines = (
        '#' . '-' x 22 . '[Useful commands]' . '-' x 22,
        '',
        " # It is convenient to store the pipeline url in a variable:",
        "\texport EHIVE_URL=$pipeline_url\t\t\t# bash version",
        "(or)",
        "\tsetenv EHIVE_URL $pipeline_url\t\t\t# [t]csh version",
        '',
        " # Add a new job to the pipeline (usually done once before running, but pipeline can be \"topped-up\" at any time) :",
        "\tseed_pipeline.pl -url $pipeline_url -logic_name <analysis_name> -input_id <param_hash>",
        '',
        " # At any moment during or after execution you can request a pipeline diagram in an image file (desired format is set via extension) :",
        "\tgenerate_graph.pl -url $pipeline_url -out $pipeline_name.png",
        '',
        " # Synchronize the Hive (to display fresh statistics about all analyses):",
        "\tbeekeeper.pl -url $pipeline_url -sync",
        '',
        " # Depending on the Meadow the pipeline is running on, you may be able to collect actual resource usage statistics :",
        "\tload_resource_usage.pl -url $pipeline_url",
        '',
        " # After having run load_resource_usage.pl, you can request a resource usage timeline in an image file (desired format is set via extension) :",
        "\tgenerate_timeline.pl -url $pipeline_url -out timeline_$pipeline_name.png",
        '',
        " # Peek into your pipeline database with a database client (useful to have open while the pipeline is running) :",
        "\tdb_cmd.pl -url $pipeline_url",
        '',
        " # Run the pipeline (can be interrupted and restarted) :",
        "\tbeekeeper.pl -url $pipeline_url $extra_cmdline -loop\t\t# run in looped automatic mode (a scheduling step performed every minute)",
        "(or)",
        "\tbeekeeper.pl -url $pipeline_url $extra_cmdline -run \t\t# run one scheduling step of the pipeline and exit (useful for debugging/learning)",
        "(or)",
        "\trunWorker.pl -url $pipeline_url $extra_cmdline \t\t# run exactly one Worker locally (useful for debugging/learning)",
        '',
    );

    return join("\n", @output_lines);
}