ensembl-hive  2.6
SystemCmd.pm
Go to the documentation of this file.
1 =pod
2 
3 =head1 NAME
4 
6 
7 =head1 SYNOPSIS
8 
9  standaloneJob.pl Bio::EnsEMBL::Hive::RunnableDB::SystemCmd --cmd 'ls -1 ${ENSEMBL_CVS_ROOT_DIR}/ensembl-hive/modules/Bio/EnsEMBL/Hive/RunnableDB/*.pm >building_blocks.list'
10 
11 =head1 DESCRIPTION
12 
13  This RunnableDB module acts as a wrapper for shell-level command lines. If you behave you may also use parameter substitution.
14 
15  The command line must be stored in the parameters() as the value corresponding to the 'cmd' key.
16  It also lets you pass in other parameters and use the parameter substitution mechanism in its full glory.
17 
18  This Runnable also allows the creation of dataflow using JSON stored in an external file.
19  Each line of this file contains an optional branch number, followed by a complete JSON serialisation of the parameters (output_id)
20  appearing on the same single line. For example, a line to direct dataflow on branch 2 might look like:
21 
22  2 {"parameter_name" : "parameter_value"}
23 
24  If no branch number is provided, then dataflow of those parameters will occur on the branch number
25  passed to SystemCmd in the 'dataflow_branch' parameter, if given. Otherwise, it will default to
26  branch 1 (autoflow).
27 
28  A sample file is provided at ${EHIVE_ROOT_DIR}/modules/Bio/EnsEMBL/Hive/Examples/SystemCmd/PipeConfig/sample_files/Inject_JSON_Dataflow_example.json
29 
30 =head1 CONFIGURATION EXAMPLES
31 
32  # The following example shows how to configure SystemCmd in a PipeConfig module
33  # to create a MySQL snapshot of the Hive database before executing a critical operation.
34  #
35  # It is a useful incantation when debugging pipelines, similar to setting a breakpoint/savepoint.
36  # You will be able to reset your pipeline to the saved point by un-dumping this file.
37 
38  { -logic_name => 'db_snapshot_before_critical_A',
39  -module => 'Bio::EnsEMBL::Hive::RunnableDB::SystemCmd',
40  -parameters => {
41  'filename' => $ENV{'HOME'}.'/db_snapshot_before_critical_A',
42  'cmd' => $self->db_cmd().' --executable mysqldump > #filename#',
43  },
44  },
45 
46  # The following example shows how to configure SystemCmd in a PipeConfig module
47  # to generate dataflow events based on parameters stored as JSON in a file named "some_parameters.json"
48 
49  { -logic_name => 'inject_parameters_from_file',
50  -module => 'Bio::EnsEMBL::Hive::RunnableDB::SystemCmd',
51  -parameters => {
52  'dataflow_file' => 'some_parameters.json',
53  'cmd' => 'sleep 0', # a command must be provided in the cmd parameter
54  },
55  },
56 
57 =head1 LICENSE
58 
59  Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
60  Copyright [2016-2024] EMBL-European Bioinformatics Institute
61 
62  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
63  You may obtain a copy of the License at
64 
65  http://www.apache.org/licenses/LICENSE-2.0
66 
67  Unless required by applicable law or agreed to in writing, software distributed under the License
68  is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
69  See the License for the specific language governing permissions and limitations under the License.
70 
71 =head1 CONTACT
72 
73  Please subscribe to the Hive mailing list: http://listserver.ebi.ac.uk/mailman/listinfo/ehive-users to discuss Hive-related questions or to be notified of our updates
74 
75 =cut
76 
77 
78 package Bio::EnsEMBL::Hive::RunnableDB::SystemCmd;
79 
80 use strict;
81 use warnings;
82 
83 use base ('Bio::EnsEMBL::Hive::Process');
84 
85 
# Default values for the parameters this Runnable understands.
# Returns a freshly built hash-ref on every call, so callers may modify
# the result without affecting later calls.
sub param_defaults {
    my %defaults = (
        # Hash that maps some of the command return codes to branch numbers
        'return_codes_2_branches' => {},

        # Boolean. When true, the command will be run with "bash -o pipefail -c $cmd".
        # Useful to capture errors in a command that contains pipes
        'use_bash_pipefail'       => 0,

        # When the command is composed of multiple commands (concatenated with a
        # semi-colon), use "bash -o errexit" so that a failure will interrupt the
        # whole script
        'use_bash_errexit'        => 0,

        # The path to a file that contains 1 line per dataflow event, in the form
        # of a JSON object
        'dataflow_file'           => undef,

        # The default branch for JSON dataflows
        'dataflow_branch'         => undef,

        # Maximum runtime of the command
        'timeout'                 => undef,
    );

    return \%defaults;
}
96 
97 
98 =head2 run
99 
100  Description : Implements run() interface method of Bio::EnsEMBL::Hive::Process that is used to perform the main bulk of the job (minus input and output).
101  Here it actually runs the command line.
102 
103  param('cmd'): The recommended way of passing in the command line. It can be either a string, or an array-ref of strings. The latter is safer if some of the
104  arguments contain whitespace.
105 
106  param('*'): Any other parameters can be freely used for parameter substitution.
107 
108 =cut
109 
# Runs the command line held in the required 'cmd' parameter and stores
# the outcome as parameters for write_output() to act upon.
#
# Reads  : param_required('cmd'), plus 'use_bash_pipefail', 'use_bash_errexit'
#          and 'timeout', which are forwarded to run_system_command().
# Writes : 'return_value', 'stderr', 'flat_cmd', 'stdout', 'runtime_msec'.
sub run {
    my $self = shift;

    # scalar() pins every value to exactly one list element: should the accessor
    # ever yield an empty list for an unset parameter, the key/value pairs that
    # follow would otherwise be shifted and the options hash silently corrupted.
    my %transferred_options = map { ($_ => scalar($self->param($_))) } qw(use_bash_pipefail use_bash_errexit timeout);

    my ($return_value, $stderr, $flat_cmd, $stdout, $runtime_msec)
        = $self->run_system_command($self->param_required('cmd'), \%transferred_options);

    # To be used in write_output()
    $self->param('return_value', $return_value);
    $self->param('stderr',       $stderr);
    $self->param('flat_cmd',     $flat_cmd);
    $self->param('stdout',       $stdout);
    $self->param('runtime_msec', $runtime_msec);
}
123 
124 
125 =head2 write_output
126 
127  Description : Implements write_output() interface method of Bio::EnsEMBL::Hive::Process that is used to deal with job's output after the execution.
128  Here we take actions based on the command's exit status.
129 
130 =cut
131 
# Acts on the outcome stored by run(): the 'return_value', 'stderr' and
# 'flat_cmd' parameters.
#
#  - false return value : success; optionally creates dataflow from the JSON
#                         file named by 'dataflow_file', then returns.
#  - -1                 : the command could not be started -> dies.
#  - -2                 : the command exceeded its allowed runtime -> flows to
#                         branch -2 if it is connected, dies otherwise.
#  - low 8 bits set     : the command was killed -> waits, then dies.
#  - anything else      : regular non-zero exit code (upper 8 bits); flows to
#                         the branch mapped in 'return_codes_2_branches' if
#                         there is one, otherwise dies.
sub write_output {
    my ($self) = @_;

    my $exit_status = $self->param('return_value');

    ## Success
    if (not $exit_status) {
        my $json_file = $self->param('dataflow_file');
        if ($json_file) {
            # FIXME branch number
            $self->dataflow_output_ids_from_json($json_file, $self->param('dataflow_branch'));
        }
        return;
    }

    ## Error processing
    my $captured_stderr = $self->param('stderr');
    my $command_line    = $self->param('flat_cmd');

    # Every guard below ends in die(), so the order of the checks is the only
    # thing that matters: the -1 / -2 sentinels must be tested before the
    # bit-mask, because they also have their low bits set.

    # system() could not start, or wait() failed
    if ($exit_status == -1) {
        die sprintf( "Could not start '%s': %s\n", $command_line, $captured_stderr);
    }

    # The runtime limit was exceeded
    if ($exit_status == -2) {
        $self->complete_early_if_branch_connected("The command was aborted because it exceeded the allowed runtime. Flowing to the -2 branch.\n", -2);
        die "The command was aborted because it exceeded the allowed runtime, but there are no dataflow-rules on branch -2.\n";
    }

    # Lower 8 bits indicate the process has been killed and did not complete.
    if ($exit_status & 255) {
        # It can happen because of a MEMLIMIT / RUNLIMIT, which we
        # know are not atomic. The best is to wait a bit that LSF kills
        # the worker too
        sleep 30;
        # If we reach this point, it was killed for another reason.
        die sprintf( "'%s' was killed with code=%d\nstderr is: %s\n", $command_line, $exit_status, $captured_stderr);
    }

    # "Normal" process exit with a non-zero code (in the upper 8 bits)
    my $exit_code = $exit_status >> 8;

    # We create a dataflow event depending on the exit code of the process.
    my $branch_for_code = $self->param('return_codes_2_branches');
    if (ref($branch_for_code) and exists $branch_for_code->{$exit_code}) {
        my $target_branch = $branch_for_code->{$exit_code};
        $self->complete_early(sprintf("The command exited with code %d, which is mapped to a dataflow on branch #%d.\n", $exit_code, $target_branch), $target_branch);
    }

    # A Java heap exhaustion is reported on branch -1 when that branch is connected
    if ($captured_stderr =~ /Exception in thread ".*" java.lang.OutOfMemoryError: Java heap space at/) {
        $self->complete_early_if_branch_connected("Java heap space is out of memory. A job has been dataflown to the -1 branch.\n", -1);
        die $captured_stderr;
    }

    die sprintf( "'%s' resulted in an error code=%d\nstderr is: %s\n", $command_line, $exit_code, $captured_stderr);
}
183 
184 
185 ######################
186 ## Internal methods ##
187 ######################
188 
189 =head2 complete_early_if_branch_connected
190 
191  Arg[1] : (string) message
192  Arg[2] : (integer) branch number
193  Description : Wrapper around complete_early that first checks that the
194  branch is connected to something.
195  Returntype : void if the branch is not connected. Otherwise doesn't return
196 
197 =cut
198 
# Calls complete_early($message, $branch_code), but only when the analysis
# of the current input job has a dataflow rule on that branch.
# Returns nothing when the branch has no rule.
sub complete_early_if_branch_connected {
    my $self        = shift;
    my $message     = shift;
    my $branch_code = shift;

    my $rules_by_branch = $self->input_job->analysis->dataflow_rules_by_branch;

    # only act if a corresponding gc_dataflow rule has been defined
    return $self->complete_early($message, $branch_code) if $rules_by_branch->{$branch_code};

    return;
}
207 
208 1;
EnsEMBL
Definition: Filter.pm:1
map
public map()
Bio::EnsEMBL::Hive::RunnableDB::SystemCmd
Definition: SystemCmd.pm:31
main
public main()
run
public run()
Bio::EnsEMBL::Hive
Definition: Hive.pm:38
Bio
Definition: AltAlleleGroup.pm:4