ensembl-hive  2.5
URL.pm
Go to the documentation of this file.
1 =pod
2 
3 =head1 NAME
4 
6 
7 =head1 DESCRIPTION
8 
9  A Hive-specific URL parser.
10 
11 =head1 LICENSE
12 
13  Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
14  Copyright [2016-2022] EMBL-European Bioinformatics Institute
15 
16  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
17  You may obtain a copy of the License at
18 
19  http://www.apache.org/licenses/LICENSE-2.0
20 
21  Unless required by applicable law or agreed to in writing, software distributed under the License
22  is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
23  See the License for the specific language governing permissions and limitations under the License.
24 
25 =head1 CONTACT
26 
27  Please subscribe to the Hive mailing list: http://listserver.ebi.ac.uk/mailman/listinfo/ehive-users to discuss Hive-related questions or to be notified of our updates
28 
29 =cut
30 
31 
32 package Bio::EnsEMBL::Hive::Utils::URL;
33 
34 use strict;
35 use warnings;
36 use Bio::EnsEMBL::Hive::Utils ('stringify');
37 
38 
39 sub parse {
40  my $url = shift @_ or return;
41 
42  my ($old_parse, $new_parse,
43  $dbconn_part, $url_parts_hash, $table_name, $tparam_name, $tparam_value, $conn_param_string, $query_part);
44 
45  # In case the whole URL is quoted (should we do this with double-quotes too ?)
46  if( $url=~/^'(.*)'$/ ) {
47  $url = $1;
48  }
49 
50  if( $url=~/^\w+$/ ) {
51 
52  $new_parse = {
53  'unambig_url' => ':///',
54  'query_params' => { 'object_type' => 'Analysis', 'logic_name' => $url, },
55  };
56 
57  } else {
58 
59  # Perform environment variable substitution separately with and without curly braces.
60  # Make sure expressions stay as they were if we were unable to substitute them.
61  #
62  $url =~ s/\$(?|\{(\w+)\}|(\w+))/defined($ENV{$1})?"$ENV{$1}":"\$$1"/eg;
63 
64  if( ($dbconn_part, @$url_parts_hash{'driver', 'user', 'pass', 'host', 'port', 'dbname'}, $table_name, $tparam_name, $tparam_value, $conn_param_string) =
65  $url =~ m{^((\w*)://(?:(\w+)(?:\:([^/\@]*))?\@)?(?:([\w\-\.]+)(?:\:(\d*))?)?/([\w\-\.]*))(?:/(\w+)(?:\?(\w+)=([\w\[\]\{\}]*))?)?((?:;(\w+)=(\w+))*)$} ) {
66 
67  my ($dummy, %conn_params) = split(/[;=]/, $conn_param_string // '' );
68  my $query_params;
69  my $exception_from_OLD_format;
70 
71  if($table_name) {
72  if($table_name eq 'analysis') {
73  $query_params->{'object_type'} = 'Analysis';
74  $query_params->{$tparam_name} = $tparam_value; # $tparam_name is 'logic_name' or 'dbID', $tparam_value is the analysis_name or dbID
75  } elsif($table_name eq 'accu') {
76  $query_params->{'object_type'} = 'Accumulator';
77  $query_params->{'accu_name'} = $tparam_name;
78  $query_params->{'accu_address'} = $tparam_value;
79  } elsif($table_name eq 'job') {
80  die "Jobs cannot yet be located by URLs, sorry";
81  } else {
82  $query_params->{'object_type'} = 'NakedTable';
83  $query_params->{'table_name'} = $table_name;
84  if($tparam_name) {
85  if( $tparam_name eq 'insertion_method' ) { # extra hint on the OLD format from the insertion_method
86  $query_params->{'insertion_method'} = $tparam_value;
87  } elsif( $tparam_name eq 'table_name' ) { # hinting this is NEW format with a bipartite dbpath
88  $exception_from_OLD_format = 1;
89  }
90  }
91  }
92  }
93 
94  if($exception_from_OLD_format) {
95  warn "\nOLD URL parser thinks you are using the NEW URL syntax for a remote $query_params->{'object_type'}, so skipping it (it may be wrong!)\n";
96  } else {
97  my $unambig_url = hash_to_unambig_url( $url_parts_hash );
98 
99  $old_parse = {
100  'dbconn_part' => $dbconn_part,
101  %$url_parts_hash,
102  'conn_params' => \%conn_params,
103  'query_params' => $query_params,
104  'unambig_url' => $unambig_url,
105  };
106  }
107  } # /if OLD format
108 
109  if( ($dbconn_part, @$url_parts_hash{'driver', 'user', 'pass', 'host', 'port', 'dbname'}, $query_part, $conn_param_string) =
110  $url =~ m{^((\w+)://(?:(\w+)(?:\:([^/\@]*))?\@)?(?:([\w\-\.]+)(?:\:(\d*))?)?(?:/([/~\w\-\.]*))?)?(?:\?(\w+=[\w\[\]\{\}]*(?:&\w+=[\w\[\]\{\}]*)*))?(;\w+=\w+(?:;\w+=\w+)*)?$} ) {
111 
112  my ($dummy, %conn_params) = split(/[;=]/, $conn_param_string // '' );
113  my $query_params = $query_part ? { split(/[&=]/, $query_part ) } : undef;
114  my $exception_from_NEW_format;
115 
116  my ($driver, $dbname) = @$url_parts_hash{'driver', 'dbname'};
117 
118  if(!$query_params and ($driver eq 'mysql' or $driver eq 'pgsql') and $dbname and $dbname=~m{/}) { # a special case of multipart dbpath hints at the OLD format (or none at all)
119 
120  $query_params = { 'object_type' => 'NakedTable' };
121  $exception_from_NEW_format = 1;
122 
123  } elsif($query_params and not (my $object_type = $query_params->{'object_type'})) { # do a bit of guesswork:
124 
125  if($query_params->{'logic_name'}) {
126  $object_type = 'Analysis';
127  if($dbname and $dbname=~m{^([/~\w\-\.]*)/analysis$}) {
128  $exception_from_NEW_format = 1;
129  }
130  } elsif($query_params->{'job_id'}) {
131  $object_type = 'AnalysisJob';
132  } elsif($query_params->{'semaphore_id'}) {
133  $object_type = 'Semaphore';
134  } elsif($query_params->{'accu_name'}) { # we don't require $query_params->{'accu_address'} to support scalar accu
135  $object_type = 'Accumulator';
136  } elsif($query_params->{'table_name'}) { # NB: the order is important here, in case table_name is reset for non-NakedTables
137  $object_type = 'NakedTable';
138  } elsif($query_params->{'insertion_method'}) {
139  $object_type = 'NakedTable';
140  if($dbname and $dbname=~m{^([/~\w\-\.]*)/(\w+)$}) {
141  $exception_from_NEW_format = 1;
142  }
143  }
144 
145  $query_params->{'object_type'} = $object_type;
146  }
147 
148  if($exception_from_NEW_format) {
149  warn "\nNEW URL parser thinks you are using the OLD URL syntax for a remote $query_params->{'object_type'}, so skipping it (it may be wrong!)\n";
150  } else {
151  my $unambig_url = hash_to_unambig_url( $url_parts_hash );
152 
153  $new_parse = {
154  'dbconn_part' => $dbconn_part,
155  %$url_parts_hash,
156  'conn_params' => \%conn_params,
157  'query_params' => $query_params,
158  'unambig_url' => $unambig_url,
159  };
160  }
161  } # /if NEW format
162 
163  }
164 
165  if($new_parse and $old_parse) {
166  if(stringify($old_parse) eq stringify($new_parse)) {
167  return $new_parse;
168  } else {
169  warn "\nThe URL '$url' can be parsed ambiguously:\n\t".stringify($old_parse)."\nvs\n\t".stringify($new_parse)."\n). Using the OLD parser at the moment.\nPlease change your URL to match the new format if you see weird behaviour\n\n";
170  return $old_parse;
171  }
172  } elsif($new_parse) {
173  return $new_parse;
174  } elsif($old_parse) {
175  warn "\nThe URL '$url' only works with the old parser, please start using the new syntax as the old parser will soon be deprecated\n\n";
176  return $old_parse;
177  } else {
178  warn "\nThe URL '$url' could not be parsed, please check it\n";
179  return;
180  }
181 }
182 
183 
184 =head2 hash_to_unambig_url
185 
186  Arg [1] : a hash describing (at least) db connection parameters
187  Example : my $unambig_url = hash_to_unambig_url( $url_parts_hash );
188  Description : Generates a degenerate URL that omits unnecessary parts (password, default port numbers)
189  : but tries to uniquely represent a connection.
190  Returntype : a string
191 
192 =cut
193 
194 sub hash_to_unambig_url {
195  my $url_parts_hash = shift @_; # expected to contain the parts from which to build a URL
196 
197  my $driver = $url_parts_hash->{'driver'} // '';
198  my $unambig_port = $url_parts_hash->{'port'} // { 'mysql' => 3306, 'pgsql' => 5432, 'sqlite' => '' }->{$driver} // '';
199  my $unambig_host = ( $url_parts_hash->{'host'} // '' ) eq 'localhost' ? '127.0.0.1' : ( $url_parts_hash->{'host'} // '' );
200  my $unambig_url = $driver .'://'. ($url_parts_hash->{'user'} ? $url_parts_hash->{'user'}.'@' : '')
201  . $unambig_host . ( $unambig_port ? ':'.$unambig_port : '') .'/'. ( $url_parts_hash->{'dbname'} // '' );
202 
203  return $unambig_url;
204 }
205 
206 
207 =head2 hash_to_url
208 
209  Arg [1] : a hash describing a db connection, or accumulator, as generated by parse_url
210  Example : my $parse = parse_url($url1); my $url2 = hash_to_url($parse);
211  Description : Generates a "new-style" URL from a hash containing the parse of a URL
212  : (old or new style). In cases where a trailing slash is optional, it leaves
213  : off the trailing slash
214  Returntype : a URL as a string
215 
216 =cut
217 
218 sub hash_to_url {
219  my $parse = shift;
220 
221  my $location_part = '';
222  if ($parse->{'driver'}) {
223  $location_part = join('',
224  $parse->{'driver'} // '',
225  '://',
226  $parse->{'user'} ? $parse->{'user'}.($parse->{'pass'} ? ':' . $parse->{'pass'} : '').'@' : '',
227  $parse->{'host'} ? $parse->{'host'}.($parse->{'port'} ? ':' . $parse->{'port'} : '') : '',
228  '/',
229  $parse->{'dbname'} // '',
230  );
231  }
232 
233  # Query part:
234  my $qp_hash = \%{ $parse->{'query_params'} };
235  my $object_type = delete $qp_hash->{'object_type'} // ''; # in most cases we don't need object_type in the URL
236  my $query_params_part =
237  (($object_type eq 'Analysis') && !$location_part)
238  ? $qp_hash->{'logic_name'}
239  : (($object_type eq 'AnalysisJob') && !$location_part)
240  ? $qp_hash->{'job_id'}
241  : keys %$qp_hash
242  ? '?' . join('&', map { $_.'='.$qp_hash->{$_} } keys %$qp_hash)
243  : '';
244 
245  # DBC extra arguments' part:
246  my $cp_hash = $parse->{'conn_params'} || {};
247  my $conn_params_part = keys %$cp_hash
248  ? ';' . join(';', map { $_.'='.$cp_hash->{$_} } keys %$cp_hash)
249  : '';
250 
251  my $url = $location_part . $query_params_part . $conn_params_part;
252 
253  return $url;
254 }
255 
256 
257 =head2 hide_url_password
258 
259  Description : Check the command-line for -url or -pipeline_url in order to
260  replace the password with an environment variable and then
261  exec on the new arguments (in which case the function doesn't
262  return)
263  Returntype : void or no return
264 
265 =cut
266 
267 sub hide_url_password {
268 
269  # avoid calling exec whilst in the Perl debugger not to confuse the latter
270  # Detect the presence of the debugger with https://www.nntp.perl.org/group/perl.debugger/2004/11/msg55.html
271  return if defined &DB::DB;
272 
273  # Safeguard to avoid attempting the substitution twice
274  # NOTE: the environment is propagated to the children, meaning that the
275  # variable, once set by beekeeper.pl, will extend to all its workers,
276  # which is fine as long as beekeeper protects the passwords.
277  return if $ENV{EHIVE_SANITIZED_ARGS};
278  $ENV{EHIVE_SANITIZED_ARGS} = 1;
279 
280  # Work on a copy of @ARGV
281  my @args = (@ARGV);
282 
283  my @new_args;
284  # Scan the list of arguments
285  while (@args) {
286  my $a = shift @args;
287  # Search for -url and -pipeline_url (with one or two hyphens)
288  if (($a =~ /^--?(pipeline_)?url$/) and @args) {
289  my $url = shift @args;
290  # Does the next value look like a proper URL ?
291  if ($url =~ /^(.*:\/\/\w*:)([^\/\@]*)(\@.*)$/) {
292  # Recognized URL
293  my $driver_and_user = $1;
294  my $possible_password = $2;
295  my $url_remainder = $3;
296  # Does the password look like an environment variable ?
297  if ($possible_password =~ /\$(?|\{(\w+)\}|(\w+))/) {
298  # Does the variable exist ?
299  if (defined($ENV{$1})) {
300  # Already a substituted password -> bail out !
301  return;
302  }
303  }
304  # Perform the substitution
305  my $pass_variable = '_EHIVE_HIDDEN_PASS';
306  $ENV{$pass_variable} = $possible_password;
307  # Single quotes are needed so that LSF doesn't expand the variable
308  $url = q{'} . $driver_and_user .'${'.$pass_variable . '}' . $url_remainder . q{'};
309  }
310  # Found the URL, let's push the remaining arguments and exec
311  push @new_args, $a, $url, @args;
312  exec($^X, $0, @new_args);
313 
314  } elsif ($a eq '--') {
315  # We've reached the end of the parsable options without finding
316  # a password -> nothing to do
317  return;
318 
319  } else {
320  push @new_args, $a;
321  }
322  }
323  # If we arrive here it means we couldn't find anything to substitute,
324  # so there is nothing to do
325 }
326 
327 
328 1;