ensembl-hive  2.6
Utils.pm
Go to the documentation of this file.
1 
2 =pod
3 
4 =head1 NAME
5 
7 
8 =head1 SYNOPSIS
9 
10  # Example of an import:
11  use Bio::EnsEMBL::Hive::Utils 'stringify';
12  my $input_id_string = stringify($input_id_hash);
13 
14  # Example of inheritance:
15  use base ('Bio::EnsEMBL::Hive::Utils', ...);
16  my $input_id_string = $self->stringify($input_id_hash);
17 
18  # Example of a direct call:
20  my $input_id_string = Bio::EnsEMBL::Hive::Utils::stringify($input_id_hash);
21 
22 =head1 DESCRIPTION
23 
24  This module provides general utility functions that can be used in different contexts through three different calling mechanisms:
25 
26  * import: another module/script can selectively import methods from this module into its namespace
27 
28  * inheritance: another module can inherit from this one and so implicitly acquire the methods into its namespace
29 
30  * direct call to a module's method: another module/script can directly call a method from this module prefixed with this module's name
31 
32 =head1 LICENSE
33 
34  Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
35  Copyright [2016-2024] EMBL-European Bioinformatics Institute
36 
37  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
38  You may obtain a copy of the License at
39 
40  http://www.apache.org/licenses/LICENSE-2.0
41 
42  Unless required by applicable law or agreed to in writing, software distributed under the License
43  is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
44  See the License for the specific language governing permissions and limitations under the License.
45 
46 =head1 CONTACT
47 
48  Please subscribe to the Hive mailing list: http://listserver.ebi.ac.uk/mailman/listinfo/ehive-users to discuss Hive-related questions or to be notified of our updates
49 
50 =cut
51 
52 
53 package Bio::EnsEMBL::Hive::Utils;
54 
55 use strict;
56 use warnings;
57 use Carp ('confess');
58 use Data::Dumper;
59 use List::Util 'max';
60 use Scalar::Util qw(looks_like_number);
61 #use Bio::EnsEMBL::Hive::DBSQL::DBConnection; # causes warnings that all exported functions have been redefined
62 
63 use Exporter 'import';
64 our @EXPORT_OK = qw(stringify destringify dir_revhash parse_cmdline_options find_submodules load_file_or_module split_for_bash go_figure_dbc throw join_command_args whoami timeout print_aligned_fields);
65 
66 no warnings ('once'); # otherwise the next line complains about $Carp::Internal being used just once
67 $Carp::Internal{ (__PACKAGE__) }++;
68 
69 
70 =head2 stringify
71 
72  Description: This function takes in a Perl data structure and stringifies it using specific configuration
73  that allows us to store/recreate this data structure according to our specific storage/communication requirements.
74  NOTE: Some recursive structures are not stringified in a way that allows destringification with destringify
75 
76  Callers : Bio::EnsEMBL::Hive::DBSQL::AnalysisJobAdaptor # stringification of input_id() hash
77  Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf # stringification of parameters() hash
78 
79 =cut
80 
81 sub stringify {
82  my $structure = pop @_;
83 
84  local $Data::Dumper::Indent = 0; # we want everything on one line
85  local $Data::Dumper::Terse = 1; # and we want it without dummy variable names
86  local $Data::Dumper::Sortkeys = 1; # make stringification more deterministic
87  local $Data::Dumper::Quotekeys = 1; # conserve some space
88  local $Data::Dumper::Useqq = 1; # escape the \n and \t correctly
89  local $Data::Dumper::Pair = ' => '; # make sure we always produce Perl-parsable structures, no matter what is set externally
90  local $Data::Dumper::Maxdepth = 0; # make sure nobody can mess up stringification by setting a lower Maxdepth
91  local $Data::Dumper::Deepcopy = 1; # avoid self-references in case the same structure is reused within params
92  local $Data::Dumper::Sparseseen= 1; # optimized "seen" hash of scalars
93 
94  return Dumper($structure);
95 }
96 
97 =head2 destringify
98 
99  Description: This function takes in a string that may or may not contain a stingified Perl structure.
100  If it seems to contain a hash/array/quoted_string, the contents is evaluated, otherwise it is returned "as is".
101  This function is mainly used to read values from 'meta' table that may represent Perl structures, but generally don't have to.
102 
103  Callers : Bio::EnsEMBL::Hive::DBSQL::PipelineWideParametersAdaptor # destringification of general 'meta' params
104  beekeeper.pl script # destringification of the 'pipeline_name' meta param
105 
106 =cut
107 
108 sub destringify {
109  my $value = pop @_;
110 
111  if(defined $value) {
112  if($value=~/^'.*'$/s
113  or $value=~/^".*"$/s
114  or $value=~/^{.*}$/s
115  or $value=~/^\[.*\]$/s
116  or looks_like_number($value) # Needed for pipeline_wide_parameters as each value is destringified independently and the JSON writer would otherwise force writing numbers as strings
117  or $value eq 'undef') {
118 
119  $value = eval($value);
120  }
121  }
122 
123  return $value;
124 }
125 
126 =head2 dir_revhash
127 
128  Description: This function takes in a string (which is usually a numeric id) and turns its reverse into a multilevel directory hash.
129  Please note that no directory is created at this step - it is purely a string conversion function.
130 
131  Callers : Bio::EnsEMBL::Hive::Worker # hashing of the worker output directories
132  Bio::EnsEMBL::Hive::RunnableDB::JobFactory # hashing of an arbitrary id
133 
134 =cut
135 
136 sub dir_revhash {
137  my $id = pop @_;
138 
139  my @dirs = reverse(split(//, $id));
140  pop @dirs; # do not use the first digit for hashing
141 
142  return join('/', @dirs);
143 }
144 
145 
146 =head2 parse_cmdline_options
147 
148  Description: This function reads all options from command line into a key-value hash
149  (keys must be prefixed with a single or double dash, the following term becomes the value).
150  The rest of the terms go into the list.
151  Command line options are not removed from @ARGV, so the same or another parser can be run again if needed.
152 
153  Callers : scripts
154 
155 =cut
156 
157 sub parse_cmdline_options {
158  my %pairs = ();
159  my @list = ();
160 
161  my $temp_key;
162 
163  foreach my $arg (@ARGV) {
164  if($temp_key) { # only the value, get the key from buffer
165  $pairs{$temp_key} = destringify($arg);
166  $temp_key = '';
167  } elsif($arg=~/^--?(\w+)=(.+)$/) { # both the key and the value
168  $pairs{$1} = destringify($2);
169  } elsif($arg=~/^--?(\w+)$/) { # only the key, buffer it and expect the value on the next round
170  $temp_key = $1;
171  } else {
172  push @list, $arg;
173  }
174  }
175  return (\%pairs, \@list);
176 }
177 
178 
179 =head2 find_submodules
180 
181  Description: This function takes one argument ("prefix" of a module name),
182  transforms it into a directory name from the filesystem's point of view
183  and finds all module names in these "directories".
184  Each module_name found is reported only once,
185  even if there are multiple matching files in different directories.
186 
187  Callers : scripts
188 
189 =cut
190 
191 sub find_submodules {
192  my $prefix = shift @_;
193 
194  $prefix=~s{::}{/}g;
195 
196  my %seen_module_name = ();
197 
198  foreach my $inc (@INC) {
199  foreach my $full_module_path (glob("$inc/$prefix/*.pm")) {
200  my $module_name = substr($full_module_path, length($inc)+1, -3); # remove leading "$inc/" and trailing '.pm'
201  $module_name=~s{/}{::}g; # transform back to module_name space
202 
203  $seen_module_name{$module_name}++;
204  }
205  }
206  return [ keys %seen_module_name ];
207 }
208 
209 
210 =head2 load_file_or_module
211 
212  Description: This function takes one argument, tries to determine whether it is a module name ('::'-separated)
213  or a path to the module ('/'-separated), finds the module_name and dynamically loads it.
214 
215  Callers : scripts
216 
217 =cut
218 
219 sub load_file_or_module {
220  my $file_or_module = pop @_;
221 
222  my $module_name;
223 
224  if( $file_or_module=~/^(\w|::)+$/ ) {
225 
226  $module_name = $file_or_module;
227 
228  } elsif(-r $file_or_module) {
229 
230  if(my $package_line = `grep ^package $file_or_module`) {
231  if($package_line=~/^\s*package\s+((?:\w|::)+)\s*;/) {
232 
233  $module_name = $1;
234 
235  } else {
236  die "Package line format in '$file_or_module' unrecognized:\n$package_line\n";
237  }
238  } else {
239  die "Could not find the package definition line in '$file_or_module'\n";
240  }
241 
242  } else {
243  die "The parameter '$file_or_module' neither seems to be a valid module nor a valid readable file\n";
244  }
245 
246  eval "require $module_name;";
247  die $@ if ($@);
248 
249  return $module_name;
250 }
251 
252 
253 =head2 split_for_bash
254 
255  Description: This function takes one argument (String) and splits it assuming it represents bash command line parameters.
256  It mainly splits on whitespace, except for cases when spaces are trapped between quotes or apostrophes.
257  In the latter case the outer quotes are removed.
258  Returntype : list of Strings
259 
260 =cut
261 
262 sub split_for_bash {
263  my $cmd = pop @_;
264 
265  my @cmd = ();
266 
267  if( defined($cmd) ) {
268  @cmd = ($cmd =~ /((?:".*?"|'.*?'|\S)+)/g); # split on space except for quoted strings
269 
270  foreach my $syll (@cmd) { # remove the outer quotes or apostrophes
271  if($syll=~/^(\S*?)"(.*?)"(\S*?)$/) {
272  $syll = $1 . $2 . $3;
273  } elsif($syll=~/^(\S*?)'(.*?)'(\S*?)$/) {
274  $syll = $1 . $2 . $3;
275  }
276  }
277  }
278 
279  return @cmd;
280 }
281 
282 
283 =head2 go_figure_dbc
284 
285  Description: This function tries its best to build a DBConnection from $foo
286  It may need $reg_type if $foo is a Registry key and there are more than 1 DBAdaptors for it
287 
288 =cut
289 
290 sub go_figure_dbc {
291  my ($foo, $reg_type) = @_; # NB: the second parameter is used by a Compara Runnable
292 
294 
295 # if(UNIVERSAL::isa($foo, 'Bio::EnsEMBL::DBSQL::DBConnection')) { # already a DBConnection, return it:
296  if ( ref($foo) =~ /DBConnection$/ ) { # already a DBConnection, hive-ify it and return
297  return bless $foo, 'Bio::EnsEMBL::Hive::DBSQL::DBConnection';
298 
299 # } elsif(UNIVERSAL::can($foo, 'dbc') and UNIVERSAL::isa($foo->dbc, 'Bio::EnsEMBL::DBSQL::DBConnection')) {
300  } elsif(UNIVERSAL::can($foo, 'dbc') and ref($foo->dbc) =~ /DBConnection$/) {
301 
302  return bless $foo->dbc, 'Bio::EnsEMBL::Hive::DBSQL::DBConnection';
303 
304 # } elsif(UNIVERSAL::can($foo, 'db') and UNIVERSAL::can($foo->db, 'dbc') and UNIVERSAL::isa($foo->db->dbc, 'Bio::EnsEMBL::DBSQL::DBConnection')) { # another data adaptor or Runnable:
305  } elsif(UNIVERSAL::can($foo, 'db') and UNIVERSAL::can($foo->db, 'dbc') and ref($foo->db->dbc) =~ /DBConnection$/) { # another data adaptor or Runnable:
306 
307  return bless $foo->db->dbc, 'Bio::EnsEMBL::Hive::DBSQL::DBConnection';
308 
309  } elsif(ref($foo) eq 'HASH') {
310 
312 
313  } elsif($foo =~ m{^(\w*)://(?:(\w+)(?:\:([^/\@]*))?\@)?(?:([\w\-\.]+)(?:\:(\d+))?)?/(\w*)} ) { # We can probably use a simpler regexp
314 
315  return Bio::EnsEMBL::Hive::DBSQL::DBConnection->new( -url => $foo );
316 
317  } else {
318  unless(ref($foo)) { # maybe it is simply a registry key?
319  my $dba;
320 
321  eval {
322  require Bio::EnsEMBL::Registry;
323 
324  if($foo=~/^(\w+):(\w+)$/) {
325  ($reg_type, $foo) = ($1, $2);
326  }
327 
328  if($reg_type) {
329  $dba = Bio::EnsEMBL::Registry->get_DBAdaptor($foo, $reg_type);
330  } else {
331  my $dbas = Bio::EnsEMBL::Registry->get_all_DBAdaptors(-species => $foo);
332 
333  if( scalar(@$dbas) == 1 ) {
334  $dba = $dbas->[0];
335  } elsif( @$dbas ) {
336  die "The registry contains multiple entries for '$foo', please prepend the reg_alias with the desired type";
337  }
338  }
339  };
340 
341  if(UNIVERSAL::can($dba, 'dbc')) {
342  return bless $dba->dbc, 'Bio::EnsEMBL::Hive::DBSQL::DBConnection';
343  }
344  }
345  die "Sorry, could not figure out how to make a DBConnection object out of '$foo'";
346  }
347 }
348 
349 
350 sub throw {
351  my $msg = pop @_;
352 
353  # TODO: newer versions of Carp are much more tunable, but I am stuck with v1.08 .
354  # Alternatively, we could implement our own stack reporter instead of Carp::confess.
355  confess $msg;
356 }
357 
358 
359 =head2 join_command_args
360 
361  Argument[0]: String or Arrayref of Strings
362  Description: Prepares the command to be executed by system(). It is needed if the
363  command is in fact composed of multiple commands.
364  Returns: Tuple (boolean,string). The boolean indicates whether it was needed to
365  join the arguments. The string is the new command-line string.
366  PS: Shamelessly adapted from http://www.perlmonks.org/?node_id=908096
367 
368 =cut
369 
370 my %shell_characters = map {$_ => 1} qw(< > >> 2> 2>&1 | && || ;);
371 
372 sub join_command_args {
373  my $args = shift;
374  return (0,$args) unless ref($args);
375 
376  # system() can only spawn 1 process. For multiple commands piped
377  # together or if redirections are used, it needs a shell to parse
378  # a string representing the whole command
379  my $join_needed = (grep {$shell_characters{$_}} @$args) ? 1 : 0;
380 
381  my @new_args = ();
382  foreach my $a (@$args) {
383  if ($shell_characters{$a} or $a =~ /^[a-zA-Z0-9_\/\-]+\z/) {
384  push @new_args, $a;
385  } else {
386  # Escapes the single-quotes and protects the arguments
387  $a =~ s/'/'\\''/g;
388  push @new_args, "'$a'";
389  }
390  }
391 
392  return ($join_needed,join(' ', @new_args));
393 }
394 
395 
396 =head2 whoami
397 
398  Description: Returns the name of the user who's currently running Perl.
399  $ENV{'USER'} is the most common source but it can be missing
400  so we also default to a builtin method.
401 
402 =cut
403 
404 sub whoami {
405  return ($ENV{'USER'} || (getpwuid($<))[0]);
406 }
407 
408 
409 =head2 timeout
410 
411  Argument[0]: (coderef) Callback subroutine
412  Argument[1]: (integer) Time to wait (in seconds)
413  Description: Calls the callback whilst ensuring it does not take more than the allowed time to run.
414  Returns: The return value (scalar context) of the callback or -2 if the
415  command had to be aborted.
416  FIXME: may need a better mechanism that allows callbacks to return -2 too
417 
418 =cut
419 
420 sub timeout {
421  my ($callback, $timeout) = @_;
422  if (not $timeout) {
423  return $callback->();
424  }
425 
426  my $ret;
427  ## Adapted from the TimeLimit pacakge: http://www.perlmonks.org/?node_id=74429
428  my $die_text = "_____RunCommandTimeLimit_____\n";
429  my $old_alarm = alarm(0); # turn alarm off and read old value
430  {
431  local $SIG{ALRM} = 'IGNORE'; # ignore alarms in this scope
432 
433  eval
434  {
435  local $SIG{__DIE__}; # turn die handler off in eval block
436  local $SIG{ALRM} = sub { die $die_text };
437  alarm($timeout); # set alarm
438  $ret = $callback->();
439  };
440 
441  # Note the alarm is still active here - however we assume that
442  # if we got here without an alarm the user's code succeeded -
443  # hence the IGNOREing of alarms in this scope
444 
445  alarm 0; # kill off alarm
446  }
447 
448  alarm $old_alarm; # restore alarm
449 
450  if ($@) {
451  # the eval returned an error
452  die $@ if $@ ne $die_text;
453  return -2;
454  }
455  return $ret;
456 }
457 
458 
459 =head2 print_aligned_fields
460 
461  Argument[0]: Arrayref of key-value Hashrefs
462  Argument[1]: Template string
463  Description: For each hashref the template string will be interpolated (replacing
464  each key with its value) and printed, but making sure the same fields
465  are (right) aligned across all lines.
466  The interpolator searches for C<%(key)> patterns and replaces them
467  with the value found in the hashref. The key name can be prefixed with
468  a dash to require a left alignment instead.
469 
470 =cut
471 
472 sub print_aligned_fields {
473  my $all_fields = shift;
474  my $template = shift;
475 
476  return unless @$all_fields;
477 
478  my @field_names = keys %{$all_fields->[0]};
479  my @all_widths;
480  my %col_width;
481 
482  # Get the width of each element
483  foreach my $line_fields (@$all_fields) {
484  # Remove the ANSI colour codes before getting the length
485  my %row_width = map {my $s = $line_fields->{$_}; $s =~ s/\x1b\[[0-9;]*m//g; $_ => length($s)} @field_names;
486  push @all_widths, \%row_width;
487  }
488 
489  # Get the width of each field (across all lines)
490  foreach my $field_name (@field_names) {
491  $col_width{$field_name} = max(map {$_->{$field_name}} @all_widths);
492  }
493 
494  # Interpolate and print each line
495  foreach my $line_fields (@$all_fields) {
496  my $row_width = shift @all_widths;
497  my $line = $template;
498  $line =~ s/%\((-?)([a-zA-Z_]\w*)\)/
499  $1 ?
500  $line_fields->{$2} . (' ' x ($col_width{$2}-$row_width->{$2}))
501  :
502  (' ' x ($col_width{$2}-$row_width->{$2})) . $line_fields->{$2};
503  /ge;
504  print $line, "\n";
505  }
506 }
507 
508 
509 1;
510 
Bio::EnsEMBL::Hive::Utils
Definition: Collection.pm:4
Bio::EnsEMBL::Hive::DBSQL::DBConnection::new
public new()
map
public map()
Bio::EnsEMBL::Hive::DBSQL::DBConnection
Definition: DBConnection.pm:20
Bio::EnsEMBL::Hive::Version
Definition: Version.pm:19
Bio::EnsEMBL::Hive::Utils::stringify
public stringify()
Bio::EnsEMBL::Registry
Definition: Registry.pm:113
Bio::EnsEMBL::Registry::get_DBAdaptor
public DBAdaptor get_DBAdaptor()
about
public about()
Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf
Definition: HiveGeneric_conf.pm:54
Bio::EnsEMBL::Hive
Definition: Hive.pm:38
Bio::EnsEMBL::Registry::get_all_DBAdaptors
public List get_all_DBAdaptors()
Bio::EnsEMBL::Hive::DBSQL::AnalysisJobAdaptor
Definition: AnalysisJobAdaptor.pm:22