ensembl-hive  2.8.1
standaloneJob.pl
Go to the documentation of this file.
1 #!/usr/bin/env perl
2 
3 use strict;
4 use warnings;
5 
6  # Finding out own path in order to reference own components (including own modules):
7 use Cwd ();
8 use File::Basename ();
BEGIN {
    # Unless the caller has already set EHIVE_ROOT_DIR, derive it from the
    # resolved location of this script: the root is the parent of the
    # directory that contains the script.
    unless ($ENV{'EHIVE_ROOT_DIR'}) {
        my $script_path = Cwd::realpath($0);
        my $script_dir  = File::Basename::dirname($script_path);
        $ENV{'EHIVE_ROOT_DIR'} = File::Basename::dirname($script_dir);
    }

    # Give eHive's own modules priority over anything else found in @INC.
    unshift @INC, "$ENV{'EHIVE_ROOT_DIR'}/modules";
}
13 
14 use Getopt::Long qw(:config pass_through no_auto_abbrev);
15 use Pod::Usage;
16 
18 use Bio::EnsEMBL::Hive::Utils ('load_file_or_module', 'parse_cmdline_options', 'stringify', 'destringify');
21 
23 
24 main();
25 
26 
# Entry point of the script.
#
# Parses the script-specific options (everything else is left in @ARGV thanks
# to Getopt::Long's pass_through), works out which Runnable to run and with
# which input_id, then delegates to StandaloneJob::standaloneJob().
#
# The Job parameters come from one of three sources:
#   1. -url and -job_id together : taken from an existing Job; parameters
#                                  given on the command line override them;
#   2. -input_id                 : a ready-made stringified hash;
#   3. neither of the above      : the parameters enumerated on the command line.
#
# Dies when -input_id is combined with -job_id or -url, when there is no
# database connection or no such Job, when arguments are left over that could
# not be parsed, or when no module name is available.
# Exits with status 1 if standaloneJob() returns a false value.
sub main {
    my ($reg_conf,
        $url,
        $job_id,
        $input_id,
        $flow_into,
        $no_write,
        $no_cleanup,
        $debug,
        $language,
        $help);

    GetOptions(
            # connection parameters
        'reg_conf|regfile|reg_file=s'   => \$reg_conf,

            # Seed options
        'input_id=s'                    => \$input_id,
        'url=s'                         => \$url,
        'job_id=i'                      => \$job_id,

            # flow control
        'flow_into|flow=s'              => \$flow_into,

            # debugging
        'no_write'                      => \$no_write,
        'no_cleanup'                    => \$no_cleanup,
        'debug=i'                       => \$debug,

            # other commands/options
        'language=s'                    => \$language,
        'h|help!'                       => \$help,
    );

    if ($help) {
        pod2usage({-exitvalue => 0, -verbose => 2});
    }

    my $module_or_file;

    if ($reg_conf) {
        require Bio::EnsEMBL::Registry;
        Bio::EnsEMBL::Registry->load_all($reg_conf);
    }

    if ($input_id && ($job_id || $url)) {
        die "Error: -input_id cannot be given at the same time as -job_id or -url\n";

    } elsif ($job_id && $url) {
        # Loaded on demand (a no-op if it is already loaded), in the same way
        # as the Registry above, so that this branch does not rely on another
        # part of the file having loaded the class.
        require Bio::EnsEMBL::Hive::HivePipeline;

        my $pipeline = Bio::EnsEMBL::Hive::HivePipeline->new( -url => $url, -no_sql_schema_version_check => 1 );
        unless ($pipeline->hive_dba) {
            die "ERROR : no database connection\n\n";
        }
        my $job = $pipeline->hive_dba->get_AnalysisJobAdaptor->fetch_by_dbID($job_id)
            || die "ERROR: No Job with job_id=$job_id\n";
        $job->load_parameters();

        my ($param_hash, $param_list) = parse_cmdline_options();
        if (@$param_list) {
            die "ERROR: There are invalid arguments on the command-line: ". join(" ", @$param_list). "\n";
        }

        # The command-line hash comes last in the merge, so its values win
        # over the Job's own parameters whenever a key appears in both.
        $input_id       = stringify( {%{$job->{'_unsubstituted_param_hash'}}, %$param_hash} );
        $module_or_file = $job->analysis->module;

        my $status = $job->status;
        warn "\nTaken parameters from job_id $job_id (status $status) @ $url\n";
        warn "Will now disconnect from it. Be aware that the original Job will NOT be updated with the outcome of this standalone. Use runWorker.pl if you want to register your run.\n";
        $pipeline->hive_dba->dbc->disconnect_if_idle;

    } elsif (!$input_id) {
        # The first remaining argument is the Runnable; what follows it
        # is turned into the parameter hash.
        $module_or_file = shift @ARGV;
        my ($param_hash, $param_list) = parse_cmdline_options();
        if (@$param_list) {
            die "ERROR: There are invalid arguments on the command-line: ". join(" ", @$param_list). "\n";
        }
        $input_id = stringify($param_hash);

    } else {
        # -input_id already carries all the parameters, so nothing but
        # the Runnable itself may remain on the command line.
        $module_or_file = shift @ARGV;
        if (@ARGV) {
            die "ERROR: There are invalid arguments on the command-line: ". join(" ", @ARGV). "\n";
        }
    }

    if (!$module_or_file) {
        die "ERROR: need to provide a module name to run\n";
    }

    warn "\nRunning '$module_or_file' with input_id='$input_id' :\n";

    my %flags = (
        no_write   => $no_write,
        no_cleanup => $no_cleanup,
        debug      => $debug,
    );

    # Loaded on demand for the same reason as HivePipeline above.
    require Bio::EnsEMBL::Hive::Scripts::StandaloneJob;
    my $job_successful = Bio::EnsEMBL::Hive::Scripts::StandaloneJob::standaloneJob($module_or_file, $input_id, \%flags, $flow_into, $language);
    exit(1) unless $job_successful;
}
122 
123 
124 __DATA__
125 
126 =pod
127 
128 =head1 NAME
129 
130 standaloneJob.pl
131 
132 =head1 DESCRIPTION
133 
134 standaloneJob.pl is an eHive component script that
135 
136 =over
137 
138 =item 1.
139 
140 takes in a Runnable module,
141 
142 =item 2.
143 
144 creates a standalone Job outside an eHive database by initialising parameters from command line arguments
145 
146 =item 3.
147 
148 and runs that Job outside of any eHive database.
149 
150 I<WARNING> the Runnable code may still access databases provided
151 as arguments and even harm them!
152 
153 =item 4.
154 
155 can optionally dataflow into tables fully defined by URLs
156 
157 =back
158 
159 Naturally, only certain Runnable modules can be run using this script, and some database-related functionality will be lost.
160 
161 There are several ways of initialising the Job parameters:
162 
163 =over
164 
165 =item 1.
166 
167 C<Module::Name -input_id $stringified_hash>. The simplest one: just provide all the parameters as one stringified hash
168 
169 =item 2.
170 
171 C<Module::Name -param1 value1 -param2 value2 (...)>. Enumerate all the arguments on the command-line. ARRAY- and HASH-
172 arguments can be passed+parsed too!
173 
174 =item 3.
175 
176 C<-url $ehive_url -job_id XXX>. The reference to an existing Job from which the parameters will be pulled. It is
177 a convenient way of gathering all the parameters (the Job's input_id, the Job's accu, the Analysis parameters
178 and the pipeline-wide parameters). Further parameters can be added with C<-param1 value1 -param2 value2 (...)>
179 and they take priority over the existing Job's parameters. The Runnable is also found in the database.
180 
181 I<NOTE> the standaloneJob will *not* interact any further with this eHive database. There won't be any updates
182 to the C<job>, C<worker>, C<log_message> etc tables.
183 
184 =back
185 
186 =head1 USAGE EXAMPLES
187 
188  # Run a Job with default parameters, specify module by its package name:
189  standaloneJob.pl Bio::EnsEMBL::Hive::RunnableDB::FailureTest
190 
191  # Run the same Job with default parameters, but specify module by its relative filename:
192  standaloneJob.pl RunnableDB/FailureTest.pm
193 
194  # Run a Job and re-define some of the default parameters:
195  standaloneJob.pl Bio::EnsEMBL::Hive::RunnableDB::FailureTest -time_RUN=2 -time_WRITE_OUTPUT=3 -state=WRITE_OUTPUT -value=2
196  standaloneJob.pl Bio::EnsEMBL::Hive::RunnableDB::SystemCmd -cmd 'ls -l'
197  standaloneJob.pl Bio::EnsEMBL::Hive::RunnableDB::SystemCmd -input_id "{ 'cmd' => 'ls -l' }"
198 
199  # Run a Job and re-define its "db_conn" parameter to allow it to perform some database-related operations:
200  standaloneJob.pl RunnableDB/SqlCmd.pm -db_conn mysql://ensadmin:xxxxxxx@127.0.0.1:2912/lg4_compara_families_63 -sql 'INSERT INTO meta (meta_key,meta_value) VALUES ("hello", "world2")'
201 
202  # Run a Job initialised from the parameters of an existing Job topped-up with extra ones.
203  # In this particular example the Runnable needs a "compara_db" parameter which defaults to the eHive database.
204  # Since there is no eHive database here we need to define -compara_db on the command-line
205  standaloneJob.pl -url mysql://ensro@compara1.internal.sanger.ac.uk:3306/mm14_pecan_24way_86b -job_id 16781 -compara_db mysql://ensro@compara1.internal.sanger.ac.uk:3306/mm14_pecan_24way_86b
206 
207  # Run a Job with given parameters, but skip the write_output() step:
208  standaloneJob.pl Bio::EnsEMBL::Hive::RunnableDB::FailureTest -no_write -time_RUN=2 -time_WRITE_OUTPUT=3 -state=WRITE_OUTPUT -value=2
209 
210  # Run a Job and re-direct its dataflow into tables:
211  standaloneJob.pl Bio::EnsEMBL::Hive::RunnableDB::JobFactory -inputfile foo.txt -delimiter '\t' -column_names "[ 'name', 'age' ]" \
212  -flow_into "{ 2 => ['mysql://ensadmin:xxxxxxx@127.0.0.1:2914/lg4_triggers/foo', 'mysql://ensadmin:xxxxxxx@127.0.0.1:2914/lg4_triggers/bar'] }"
213 
214  # Run a Compara Job that needs a connection to Compara database:
215  standaloneJob.pl Bio::EnsEMBL::Compara::RunnableDB::ObjectFactory -compara_db 'mysql://ensadmin:xxxxxxx@127.0.0.1:2911/sf5_ensembl_compara_master' \
216  -adaptor_name MethodLinkSpeciesSetAdaptor -adaptor_method fetch_all_by_method_link_type -method_param_list "[ 'ENSEMBL_ORTHOLOGUES' ]" \
217  -column_names2getters "{ 'name' => 'name', 'mlss_id' => 'dbID' }" -flow_into "{ 2 => 'mysql://ensadmin:xxxxxxx@127.0.0.1:2914/lg4_triggers/baz' }"
218 
219  # Create a new Job in a database using automatic dataflow from a database-less Dummy Job:
220  standaloneJob.pl Bio::EnsEMBL::Hive::RunnableDB::Dummy -a_multiplier 1234567 -b_multiplier 9876543 \
221  -flow_into "{ 1 => 'mysql://ensadmin:xxxxxxx@127.0.0.1/lg4_long_mult/analysis?logic_name=start' }"
222 
223  # Produce a Semaphore group of Jobs from a database-less DigitFactory Job:
224  standaloneJob.pl Bio::EnsEMBL::Hive::Examples::LongMult::RunnableDB::DigitFactory -input_id "{ 'a_multiplier' => '2222222222', 'b_multiplier' => '3434343434'}" \
225  -flow_into "{ '2->A' => 'mysql://ensadmin:${ENSADMIN_PSW}@127.0.0.1/lg4_long_mult/analysis?logic_name=part_multiply', 'A->1' => 'mysql://ensadmin:${ENSADMIN_PSW}@127.0.0.1/lg4_long_mult/analysis?logic_name=add_together' }"
226 
227 
228 =head1 SCRIPT-SPECIFIC OPTIONS
229 
230 =over
231 
232 =item --help
233 
234 print this help
235 
236 =item --debug <level>
237 
238 turn on debug messages at <level>
239 
240 =item --no_write
241 
242 skip the execution of write_output() step this time
243 
244 =item --no_cleanup
245 
246 do not cleanup temporary files
247 
248 =item --reg_conf <path>
249 
250 load registry entries from the given file (these entries may be needed by the Runnable itself)
251 
252 =item --input_id <hash>
253 
254 specify the whole input_id parameter in one stringified hash
255 
256 =item --flow_into <hash>
257 
258 defines the dataflow re-direction rules in a format similar to PipeConfig's - see the last example
259 
260 =item --language <name>
261 
262 language in which the Runnable is written
263 
264 =back
265 
266 All other options will be passed to the Runnable (leading dashes removed) and will constitute the parameters for the Job.
267 
268 =head1 LICENSE
269 
270  See the NOTICE file distributed with this work for additional information
271  regarding copyright ownership.
272 
273  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
274  You may obtain a copy of the License at
275 
276  http://www.apache.org/licenses/LICENSE-2.0
277 
278  Unless required by applicable law or agreed to in writing, software distributed under the License
279  is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
280  See the License for the specific language governing permissions and limitations under the License.
281 
282 =head1 CONTACT
283 
284 Please subscribe to the eHive mailing list: http://listserver.ebi.ac.uk/mailman/listinfo/ehive-users to discuss eHive-related questions or to be notified of our updates
285 
286 =cut
287 
Bio::EnsEMBL::Hive::Utils
Definition: Collection.pm:4
Bio::EnsEMBL::Hive::Utils::URL::hide_url_password
public Void hide_url_password()
Bio::EnsEMBL::Hive::RunnableDB::Dummy
Definition: Dummy.pm:28
Bio::EnsEMBL::Hive::Utils::URL
Definition: URL.pm:11
Bio::EnsEMBL::Hive::Examples::LongMult::RunnableDB::DigitFactory
Definition: DigitFactory.pm:25
Bio::EnsEMBL::Hive::RunnableDB::SystemCmd
Definition: SystemCmd.pm:31
cleanup
public cleanup()
Bio::EnsEMBL::Hive::HivePipeline::new
public new()
Bio::EnsEMBL::Hive::Scripts::StandaloneJob
Definition: StandaloneJob.pm:13
Bio::EnsEMBL::Hive::Scripts::StandaloneJob::standaloneJob
public standaloneJob()
Bio::EnsEMBL::Registry
Definition: Registry.pm:113
debug
public debug()
Bio::EnsEMBL::Hive::HivePipeline
Definition: HivePipeline.pm:13
BEGIN
public BEGIN()
run
public run()
main
public main()
Bio::EnsEMBL::Hive::RunnableDB::JobFactory
Definition: JobFactory.pm:35
Bio::EnsEMBL::Registry::load_all
public Int load_all()