13 This RunnableDB module acts as a wrapper
for shell-level command lines. If you behave you may also use parameter substitution.
15 The command line must be stored in the parameters() as the value corresponding to the 'cmd' key.
16 It allows you to pass in other parameters and use the parameter substitution mechanism in its full glory.
18 This Runnable also allows the creation of dataflow using JSON stored in an external file.
19 Each line of this file contains an optional branch number, followed by a complete JSON serialisation of the parameters (output_id)
20 appearing on the same single line. For example, a line to direct dataflow on branch 2 might look like:
22 2 {
"parameter_name" :
"parameter_value"}
24 If no branch number is provided, then dataflow of those parameters will occur on the branch number
25 passed to SystemCmd in the
'dataflow_branch' parameter,
if given. Otherwise, it will
default to
28 A sample file is provided at ${EHIVE_ROOT_DIR}/modules/
Bio/
EnsEMBL/Hive/Examples/SystemCmd/PipeConfig/sample_files/Inject_JSON_Dataflow_example.json
30 =head1 CONFIGURATION EXAMPLES
32 # The following example shows how to configure SystemCmd in a PipeConfig module
33 # to create a MySQL snapshot of the Hive database before executing a critical operation.
35 # It is a useful incantation when debugging pipelines, similar to setting a breakpoint/savepoint.
36 # You will be able to reset your pipeline to the saved point by un-dumping this file.
38 { -logic_name =>
'db_snapshot_before_critical_A',
39 -module =>
'Bio::EnsEMBL::Hive::RunnableDB::SystemCmd',
41 'filename' => $ENV{
'HOME'}.
'/db_snapshot_before_critical_A',
42 'cmd' => $self->db_cmd().
' --executable mysqldump > #filename#',
46 # The following example shows how to configure SystemCmd in a PipeConfig module
47 # to generate dataflow events based on parameters stored as JSON in a file named "some_parameters.json"
49 { -logic_name =>
'inject_parameters_from_file',
50 -module =>
'Bio::EnsEMBL::Hive::RunnableDB::SystemCmd',
52 'dataflow_file' =>
'some_parameters.json',
53 'cmd' =>
'sleep 0', # a command must be provided in the cmd parameter
59 Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
60 Copyright [2016-2024] EMBL-European Bioinformatics Institute
62 Licensed under the Apache License, Version 2.0 (the
"License"); you may not use
this file except in compliance with the License.
63 You may obtain a copy of the License at
67 Unless required by applicable law or agreed to in writing, software distributed under the License
68 is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
69 See the License
for the specific language governing permissions and limitations under the License.
73 Please subscribe to the
Hive mailing list: http:
78 package Bio::EnsEMBL::Hive::RunnableDB::SystemCmd;
83 use base (
'Bio::EnsEMBL::Hive::Process');
88 return_codes_2_branches => {}, # Hash that maps some of the command
return codes to branch numbers
89 'use_bash_pipefail' => 0, # Boolean. When
true, the command will be
run with
"bash -o pipefail -c $cmd". Useful to capture errors in a command that contains pipes
90 'use_bash_errexit' => 0, # When the command is composed of multiple commands (concatenated with a semi-colon), use
"bash -o errexit" so that a failure will interrupt the whole script
91 'dataflow_file' => undef, # The path to a file that contains 1 line per dataflow event, in the form of a JSON
object
92 'dataflow_branch' => undef, # The
default branch
for JSON dataflows
93 'timeout' => undef, # Maximum runtime of the command
100 Description : Implements
run() interface method of
Bio::
EnsEMBL::Hive::Process that is used to perform the
main bulk of the job (minus input and output).
101 Here it actually runs the command line.
103 param('cmd'): The recommended way of passing in the command line. It can be either a
string, or an array-ref of strings. The latter is safer if some of the
104 arguments contain white-spaces.
106 param('*'): Any other parameters can be freely used for parameter substitution.
113 my %transferred_options =
map {$_ => $self->param($_)} qw(use_bash_pipefail use_bash_errexit timeout);
114 my ($return_value, $stderr, $flat_cmd, $stdout, $runtime_msec) = $self->run_system_command($self->param_required(
'cmd'), \%transferred_options);
116 # To be used in write_output()
117 $self->param(
'return_value', $return_value);
118 $self->param(
'stderr', $stderr);
119 $self->param(
'flat_cmd', $flat_cmd);
120 $self->param(
'stdout', $stdout);
121 $self->param(
'runtime_msec', $runtime_msec);
127 Description : Implements write_output() interface method of
Bio::
EnsEMBL::Hive::Process that is used to deal with job's output after the execution.
128 Here we take actions based on the command's exit status.
135 my $return_value = $self->param(
'return_value');
138 unless ($return_value) {
139 # FIXME branch number
140 $self->dataflow_output_ids_from_json($self->param(
'dataflow_file'), $self->param(
'dataflow_branch'))
if $self->param(
'dataflow_file');
145 my $stderr = $self->param(
'stderr');
146 my $flat_cmd = $self->param(
'flat_cmd');
148 if ($return_value == -1) {
149 # system() could not start, or wait() failed
150 die sprintf(
"Could not start '%s': %s\n", $flat_cmd, $stderr);
152 } elsif ($return_value == -2) {
153 $self->complete_early_if_branch_connected(
"The command was aborted because it exceeded the allowed runtime. Flowing to the -2 branch.\n", -2);
154 die
"The command was aborted because it exceeded the allowed runtime, but there are no dataflow-rules on branch -2.\n";
156 # Lower 8 bits indicate the process has been killed and did not complete.
157 } elsif ($return_value & 255) {
158 # It can happen because of a MEMLIMIT / RUNLIMIT, which we
159 # know are not atomic. The best is to wait a bit that LSF kills
162 # If we reach this point, it was killed for another reason.
163 die sprintf(
"'%s' was killed with code=%d\nstderr is: %s\n", $flat_cmd, $return_value, $stderr);
166 # "Normal" process exit with a non-zero code (in the upper 8 bits)
169 # We create a dataflow event depending on the exit code of the process.
170 if (ref($self->param(
'return_codes_2_branches')) and exists $self->param(
'return_codes_2_branches')->{$return_value}) {
171 my $branch_number = $self->param(
'return_codes_2_branches')->{$return_value};
172 $self->complete_early(sprintf(
"The command exited with code %d, which is mapped to a dataflow on branch #%d.\n", $return_value, $branch_number), $branch_number);
175 if ($stderr =~ /Exception in thread
".*" java.lang.OutOfMemoryError: Java heap space at/) {
176 $self->complete_early_if_branch_connected(
"Java heap space is out of memory. A job has been dataflown to the -1 branch.\n", -1);
180 die sprintf(
"'%s' resulted in an error code=%d\nstderr is: %s\n", $flat_cmd, $return_value, $stderr);
185 ######################
186 ## Internal methods ##
187 ######################
189 =head2 complete_early_if_branch_connected
191 Arg[1] : (string) message
192 Arg[2] : (integer) branch number
193 Description : Wrapper around complete_early that first checks that the
194 branch is connected to something.
195 Returntype :
void if the branch is not connected. Otherwise doesn
't return
199 sub complete_early_if_branch_connected {
200 my ($self, $message, $branch_code) = @_;
202 # just return if no corresponding gc_dataflow rule has been defined
203 return unless $self->input_job->analysis->dataflow_rules_by_branch->{$branch_code};
205 $self->complete_early($message, $branch_code);