ensembl-hive  2.5
Utils.pm
Go to the documentation of this file.
1 
2 =pod
3 
4 =head1 NAME
5 
7 
8 =head1 SYNOPSIS
9 
10  # Example of an import:
11  use Bio::EnsEMBL::Hive::Utils 'stringify';
12  my $input_id_string = stringify($input_id_hash);
13 
14  # Example of inheritance:
15  use base ('Bio::EnsEMBL::Hive::Utils', ...);
16  my $input_id_string = $self->stringify($input_id_hash);
17 
18  # Example of a direct call:
20  my $input_id_string = Bio::EnsEMBL::Hive::Utils::stringify($input_id_hash);
21 
22 =head1 DESCRIPTION
23 
24  This module provides general utility functions that can be used in different contexts through three different calling mechanisms:
25 
26  * import: another module/script can selectively import methods from this module into its namespace
27 
28  * inheritance: another module can inherit from this one and so implicitly acquire the methods into its namespace
29 
30  * direct call to a module's method: another module/script can directly call a method from this module prefixed with this module's name
31 
32 =head1 LICENSE
33 
34  Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
35  Copyright [2016-2022] EMBL-European Bioinformatics Institute
36 
37  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
38  You may obtain a copy of the License at
39 
40  http://www.apache.org/licenses/LICENSE-2.0
41 
42  Unless required by applicable law or agreed to in writing, software distributed under the License
43  is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
44  See the License for the specific language governing permissions and limitations under the License.
45 
46 =head1 CONTACT
47 
48  Please subscribe to the Hive mailing list: http://listserver.ebi.ac.uk/mailman/listinfo/ehive-users to discuss Hive-related questions or to be notified of our updates
49 
50 =cut
51 
52 
53 package Bio::EnsEMBL::Hive::Utils;
54 
55 use strict;
56 use warnings;
57 use Carp ('confess');
58 use Data::Dumper;
59 use Scalar::Util qw(looks_like_number);
60 #use Bio::EnsEMBL::Hive::DBSQL::DBConnection; # causes warnings that all exported functions have been redefined
61 
62 use Exporter 'import';
63 our @EXPORT_OK = qw(stringify destringify dir_revhash parse_cmdline_options find_submodules load_file_or_module split_for_bash go_figure_dbc throw join_command_args whoami timeout);
64 
65 no warnings ('once'); # otherwise the next line complains about $Carp::Internal being used just once
66 $Carp::Internal{ (__PACKAGE__) }++;
67 
68 
69 =head2 stringify
70 
71  Description: This function takes in a Perl data structure and stringifies it using specific configuration
72  that allows us to store/recreate this data structure according to our specific storage/communication requirements.
73  NOTE: Some recursive structures are not stringified in a way that allows destringification with destringify
74 
75  Callers : Bio::EnsEMBL::Hive::DBSQL::AnalysisJobAdaptor # stringification of input_id() hash
76  Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf # stringification of parameters() hash
77 
78 =cut
79 
80 sub stringify {
81  my $structure = pop @_;
82 
83  local $Data::Dumper::Indent = 0; # we want everything on one line
84  local $Data::Dumper::Terse = 1; # and we want it without dummy variable names
85  local $Data::Dumper::Sortkeys = 1; # make stringification more deterministic
86  local $Data::Dumper::Quotekeys = 1; # conserve some space
87  local $Data::Dumper::Useqq = 1; # escape the \n and \t correctly
88  local $Data::Dumper::Pair = ' => '; # make sure we always produce Perl-parsable structures, no matter what is set externally
89  local $Data::Dumper::Maxdepth = 0; # make sure nobody can mess up stringification by setting a lower Maxdepth
90  local $Data::Dumper::Deepcopy = 1; # avoid self-references in case the same structure is reused within params
91 
92  return Dumper($structure);
93 }
94 
95 =head2 destringify
96 
97  Description: This function takes in a string that may or may not contain a stingified Perl structure.
98  If it seems to contain a hash/array/quoted_string, the contents is evaluated, otherwise it is returned "as is".
99  This function is mainly used to read values from 'meta' table that may represent Perl structures, but generally don't have to.
100 
101  Callers : Bio::EnsEMBL::Hive::DBSQL::PipelineWideParametersAdaptor # destringification of general 'meta' params
102  beekeeper.pl script # destringification of the 'pipeline_name' meta param
103 
104 =cut
105 
106 sub destringify {
107  my $value = pop @_;
108 
109  if(defined $value) {
110  if($value=~/^'.*'$/s
111  or $value=~/^".*"$/s
112  or $value=~/^{.*}$/s
113  or $value=~/^\[.*\]$/s
114  or looks_like_number($value) # Needed for pipeline_wide_parameters as each value is destringified independently and the JSON writer would otherwise force writing numbers as strings
115  or $value eq 'undef') {
116 
117  $value = eval($value);
118  }
119  }
120 
121  return $value;
122 }
123 
124 =head2 dir_revhash
125 
126  Description: This function takes in a string (which is usually a numeric id) and turns its reverse into a multilevel directory hash.
127  Please note that no directory is created at this step - it is purely a string conversion function.
128 
129  Callers : Bio::EnsEMBL::Hive::Worker # hashing of the worker output directories
130  Bio::EnsEMBL::Hive::RunnableDB::JobFactory # hashing of an arbitrary id
131 
132 =cut
133 
134 sub dir_revhash {
135  my $id = pop @_;
136 
137  my @dirs = reverse(split(//, $id));
138  pop @dirs; # do not use the first digit for hashing
139 
140  return join('/', @dirs);
141 }
142 
143 
144 =head2 parse_cmdline_options
145 
146  Description: This function reads all options from command line into a key-value hash
147  (keys must be prefixed with a single or double dash, the following term becomes the value).
148  The rest of the terms go into the list.
149  Command line options are not removed from @ARGV, so the same or another parser can be run again if needed.
150 
151  Callers : scripts
152 
153 =cut
154 
155 sub parse_cmdline_options {
156  my %pairs = ();
157  my @list = ();
158 
159  my $temp_key;
160 
161  foreach my $arg (@ARGV) {
162  if($temp_key) { # only the value, get the key from buffer
163  $pairs{$temp_key} = destringify($arg);
164  $temp_key = '';
165  } elsif($arg=~/^--?(\w+)=(.+)$/) { # both the key and the value
166  $pairs{$1} = destringify($2);
167  } elsif($arg=~/^--?(\w+)$/) { # only the key, buffer it and expect the value on the next round
168  $temp_key = $1;
169  } else {
170  push @list, $arg;
171  }
172  }
173  return (\%pairs, \@list);
174 }
175 
176 
177 =head2 find_submodules
178 
179  Description: This function takes one argument ("prefix" of a module name),
180  transforms it into a directory name from the filesystem's point of view
181  and finds all module names in these "directories".
182  Each module_name found is reported only once,
183  even if there are multiple matching files in different directories.
184 
185  Callers : scripts
186 
187 =cut
188 
189 sub find_submodules {
190  my $prefix = shift @_;
191 
192  $prefix=~s{::}{/}g;
193 
194  my %seen_module_name = ();
195 
196  foreach my $inc (@INC) {
197  foreach my $full_module_path (glob("$inc/$prefix/*.pm")) {
198  my $module_name = substr($full_module_path, length($inc)+1, -3); # remove leading "$inc/" and trailing '.pm'
199  $module_name=~s{/}{::}g; # transform back to module_name space
200 
201  $seen_module_name{$module_name}++;
202  }
203  }
204  return [ keys %seen_module_name ];
205 }
206 
207 
208 =head2 load_file_or_module
209 
210  Description: This function takes one argument, tries to determine whether it is a module name ('::'-separated)
211  or a path to the module ('/'-separated), finds the module_name and dynamically loads it.
212 
213  Callers : scripts
214 
215 =cut
216 
217 sub load_file_or_module {
218  my $file_or_module = pop @_;
219 
220  my $module_name;
221 
222  if( $file_or_module=~/^(\w|::)+$/ ) {
223 
224  $module_name = $file_or_module;
225 
226  } elsif(-r $file_or_module) {
227 
228  if(my $package_line = `grep ^package $file_or_module`) {
229  if($package_line=~/^\s*package\s+((?:\w|::)+)\s*;/) {
230 
231  $module_name = $1;
232 
233  } else {
234  die "Package line format in '$file_or_module' unrecognized:\n$package_line\n";
235  }
236  } else {
237  die "Could not find the package definition line in '$file_or_module'\n";
238  }
239 
240  } else {
241  die "The parameter '$file_or_module' neither seems to be a valid module nor a valid readable file\n";
242  }
243 
244  eval "require $module_name;";
245  die $@ if ($@);
246 
247  return $module_name;
248 }
249 
250 
251 =head2 split_for_bash
252 
253  Description: This function takes one argument (String) and splits it assuming it represents bash command line parameters.
254  It mainly splits on whitespace, except for cases when spaces are trapped between quotes or apostrophes.
255  In the latter case the outer quotes are removed.
256  Returntype : list of Strings
257 
258 =cut
259 
260 sub split_for_bash {
261  my $cmd = pop @_;
262 
263  my @cmd = ();
264 
265  if( defined($cmd) ) {
266  @cmd = ($cmd =~ /((?:".*?"|'.*?'|\S)+)/g); # split on space except for quoted strings
267 
268  foreach my $syll (@cmd) { # remove the outer quotes or apostrophes
269  if($syll=~/^(\S*?)"(.*?)"(\S*?)$/) {
270  $syll = $1 . $2 . $3;
271  } elsif($syll=~/^(\S*?)'(.*?)'(\S*?)$/) {
272  $syll = $1 . $2 . $3;
273  }
274  }
275  }
276 
277  return @cmd;
278 }
279 
280 
281 =head2 go_figure_dbc
282 
283  Description: This function tries its best to build a DBConnection from $foo
284  It may need $reg_type if $foo is a Registry key and there are more than 1 DBAdaptors for it
285 
286 =cut
287 
288 sub go_figure_dbc {
289  my ($foo, $reg_type) = @_; # NB: the second parameter is used by a Compara Runnable
290 
292 
293 # if(UNIVERSAL::isa($foo, 'Bio::EnsEMBL::DBSQL::DBConnection')) { # already a DBConnection, return it:
294  if ( ref($foo) =~ /DBConnection$/ ) { # already a DBConnection, hive-ify it and return
295  return bless $foo, 'Bio::EnsEMBL::Hive::DBSQL::DBConnection';
296 
297 # } elsif(UNIVERSAL::can($foo, 'dbc') and UNIVERSAL::isa($foo->dbc, 'Bio::EnsEMBL::DBSQL::DBConnection')) {
298  } elsif(UNIVERSAL::can($foo, 'dbc') and ref($foo->dbc) =~ /DBConnection$/) {
299 
300  return bless $foo->dbc, 'Bio::EnsEMBL::Hive::DBSQL::DBConnection';
301 
302 # } elsif(UNIVERSAL::can($foo, 'db') and UNIVERSAL::can($foo->db, 'dbc') and UNIVERSAL::isa($foo->db->dbc, 'Bio::EnsEMBL::DBSQL::DBConnection')) { # another data adaptor or Runnable:
303  } elsif(UNIVERSAL::can($foo, 'db') and UNIVERSAL::can($foo->db, 'dbc') and ref($foo->db->dbc) =~ /DBConnection$/) { # another data adaptor or Runnable:
304 
305  return bless $foo->db->dbc, 'Bio::EnsEMBL::Hive::DBSQL::DBConnection';
306 
307  } elsif(ref($foo) eq 'HASH') {
308 
309  return Bio::EnsEMBL::Hive::DBSQL::DBConnection->new( %$foo );
310 
311  } elsif($foo =~ m{^(\w*)://(?:(\w+)(?:\:([^/\@]*))?\@)?(?:([\w\-\.]+)(?:\:(\d+))?)?/(\w*)} ) { # We can probably use a simpler regexp
312 
313  return Bio::EnsEMBL::Hive::DBSQL::DBConnection->new( -url => $foo );
314 
315  } else {
316  unless(ref($foo)) { # maybe it is simply a registry key?
317  my $dba;
318 
319  eval {
320  require Bio::EnsEMBL::Registry;
321 
322  if($foo=~/^(\w+):(\w+)$/) {
323  ($reg_type, $foo) = ($1, $2);
324  }
325 
326  if($reg_type) {
327  $dba = Bio::EnsEMBL::Registry->get_DBAdaptor($foo, $reg_type);
328  } else {
329  my $dbas = Bio::EnsEMBL::Registry->get_all_DBAdaptors(-species => $foo);
330 
331  if( scalar(@$dbas) == 1 ) {
332  $dba = $dbas->[0];
333  } elsif( @$dbas ) {
334  die "The registry contains multiple entries for '$foo', please prepend the reg_alias with the desired type";
335  }
336  }
337  };
338 
339  if(UNIVERSAL::can($dba, 'dbc')) {
340  return bless $dba->dbc, 'Bio::EnsEMBL::Hive::DBSQL::DBConnection';
341  }
342  }
343  die "Sorry, could not figure out how to make a DBConnection object out of '$foo'";
344  }
345 }
346 
347 
348 sub throw {
349  my $msg = pop @_;
350 
351  # TODO: newer versions of Carp are much more tunable, but I am stuck with v1.08 .
352  # Alternatively, we could implement our own stack reporter instead of Carp::confess.
353  confess $msg;
354 }
355 
356 
357 =head2 join_command_args
358 
359  Argument[0]: String or Arrayref of Strings
360  Description: Prepares the command to be executed by system(). It is needed if the
361  command is in fact composed of multiple commands.
362  Returns: Tuple (boolean,string). The boolean indicates whether it was needed to
363  join the arguments. The string is the new command-line string.
364  PS: Shamelessly adapted from http://www.perlmonks.org/?node_id=908096
365 
366 =cut
367 
368 my %shell_characters = map {$_ => 1} qw(< > >> 2> 2>&1 | && || ;);
369 
370 sub join_command_args {
371  my $args = shift;
372  return (0,$args) unless ref($args);
373 
374  # system() can only spawn 1 process. For multiple commands piped
375  # together or if redirections are used, it needs a shell to parse
376  # a string representing the whole command
377  my $join_needed = (grep {$shell_characters{$_}} @$args) ? 1 : 0;
378 
379  my @new_args = ();
380  foreach my $a (@$args) {
381  if ($shell_characters{$a} or $a =~ /^[a-zA-Z0-9_\/\-]+\z/) {
382  push @new_args, $a;
383  } else {
384  # Escapes the single-quotes and protects the arguments
385  $a =~ s/'/'\\''/g;
386  push @new_args, "'$a'";
387  }
388  }
389 
390  return ($join_needed,join(' ', @new_args));
391 }
392 
393 
394 =head2 whoami
395 
396  Description: Returns the name of the user who's currently running Perl.
397  $ENV{'USER'} is the most common source but it can be missing
398  so we also default to a builtin method.
399 
400 =cut
401 
402 sub whoami {
403  return ($ENV{'USER'} || (getpwuid($<))[0]);
404 }
405 
406 
407 =head2 timeout
408 
409  Argument[0]: (coderef) Callback subroutine
410  Argument[1]: (integer) Time to wait (in seconds)
411  Description: Calls the callback whilst ensuring it does not take more than the allowed time to run.
412  Returns: The return value (scalar context) of the callback or -2 if the
413  command had to be aborted.
414  FIXME: may need a better mechanism that allows callbacks to return -2 too
415 
416 =cut
417 
418 sub timeout {
419  my ($callback, $timeout) = @_;
420  if (not $timeout) {
421  return $callback->();
422  }
423 
424  my $ret;
425  ## Adapted from the TimeLimit pacakge: http://www.perlmonks.org/?node_id=74429
426  my $die_text = "_____RunCommandTimeLimit_____\n";
427  my $old_alarm = alarm(0); # turn alarm off and read old value
428  {
429  local $SIG{ALRM} = 'IGNORE'; # ignore alarms in this scope
430 
431  eval
432  {
433  local $SIG{__DIE__}; # turn die handler off in eval block
434  local $SIG{ALRM} = sub { die $die_text };
435  alarm($timeout); # set alarm
436  $ret = $callback->();
437  };
438 
439  # Note the alarm is still active here - however we assume that
440  # if we got here without an alarm the user's code succeeded -
441  # hence the IGNOREing of alarms in this scope
442 
443  alarm 0; # kill off alarm
444  }
445 
446  alarm $old_alarm; # restore alarm
447 
448  if ($@) {
449  # the eval returned an error
450  die $@ if $@ ne $die_text;
451  return -2;
452  }
453  return $ret;
454 }
455 
456 
457 1;
458