ensembl-hive  2.8.1
FetchFiles.pm
Go to the documentation of this file.
1 =head1 LICENSE
2 
3 See the NOTICE file distributed with this work for additional information
4 regarding copyright ownership.
5 
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9 
10  http://www.apache.org/licenses/LICENSE-2.0
11 
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 
18 =cut
19 
20 package XrefParser::FetchFiles;
21 
22 use strict;
23 use warnings;
24 
25 # Given one or several FTP or HTTP URIs, download them. If an URI is
26 # for a file or MySQL connection, then these will be ignored. For
27 # FTP, standard shell file name globbing is allowed (but not regular
28 # expressions). HTTP does not allow file name globbing. The routine
29 # returns a list of successfully downloaded local files or an empty list
30 # if there was an error.
31 
32 
33 use Carp;
34 use DBI;
35 use Digest::MD5 qw(md5_hex);
36 use Getopt::Long;
37 use POSIX qw(strftime);
38 
39 use File::Basename;
40 use File::Spec::Functions;
41 use IO::File;
42 use Net::FTP;
43 use HTTP::Tiny;
44 use URI;
45 use URI::file;
46 use Text::Glob qw( match_glob );
47 use LWP::UserAgent;
48 
49 
50 my $base_dir = File::Spec->curdir();
51 
52 sub new {
53  my ($proto) = @_;
54 
55  my $class = ref $proto || $proto;
56  return bless {}, $class;
57 }
58 
59 sub fetch_files {
60  my ($self, $arg_ref) = @_;
61 
62 
63  my $dest_dir = $arg_ref->{dest_dir};
64  my $user_uris = $arg_ref->{user_uris};
65  my $deletedownloaded = $arg_ref->{del_down};
66  my $checkdownload = $arg_ref->{chk_down};
67  my $verbose = $arg_ref->{verbose} ;
68 
69  my @processed_files;
70 
71  foreach my $user_uri (@$user_uris) {
72  # Change old-style 'LOCAL:' URIs into 'file:'.
73  $user_uri =~ s/^LOCAL:/file:/ix;
74  my $uri = URI->new($user_uri);
75 
76  if ( $uri->scheme() eq 'script' ) {
77  push( @processed_files, $user_uri );
78  } elsif ( $uri->scheme() eq 'file' ) {
79 
80  # Deal with local files.
81 
82  $user_uri =~ s/file://x;
83  if ( -s $user_uri ) {
84  push( @processed_files, $user_uri );
85  } else {
86  printf( "==> Can not find file '%s' (or it is empty)\n",
87  $user_uri );
88  return ();
89  }
90  } elsif ( $uri->scheme() eq 'ftp' ) {
91  # Deal with FTP files.
92 
93  my $file_path = catfile( $dest_dir, basename( $uri->path() ) );
94 
95  if ( $deletedownloaded && -e $file_path ) {
96  if ($verbose) {
97  printf( "Deleting '%s'\n", $file_path );
98  }
99  unlink($file_path);
100  }
101 
102  if ( $checkdownload && -s $file_path ) {
103  # The file is already there, no need to connect to a FTP
104  # server. This also means no file name globbing was
105  # used (for globbing FTP URIs, we always need to connect
106  # to a FTP site to see what files are there).
107 
108  if ($verbose) {
109  printf( "File '%s' already exists\n", $file_path );
110  }
111  push( @processed_files, $file_path );
112  next;
113  }
114 
115  if ( -e $file_path ) { unlink($file_path) }
116 
117  if ($verbose) {
118  printf( "Connecting to FTP host '%s' for file '%s' \n",
119  $uri->host(), $file_path );
120  }
121 
122  my $ftp = $self->get_ftp($uri, 0);
123  if(!defined($ftp) or ! $ftp->can('ls') or !$ftp->ls()){
124  $ftp = $self->get_ftp($uri, 1);
125  }
126  foreach my $remote_file ( ( @{ $ftp->ls() } ) ) {
127  if ( !match_glob( basename( $uri->path() ), $remote_file ) ) {
128  next;
129  }
130 
131  $file_path = catfile( $dest_dir, basename($remote_file) );
132 
133  if ( $deletedownloaded && -e $file_path ) {
134  if ($verbose) {
135  printf( "Deleting '%s'\n", $file_path );
136  }
137  unlink($file_path);
138  }
139 
140  if ( $checkdownload && -s $file_path ) {
141  if ($verbose) {
142  printf( "File '%s' already exists\n", $file_path );
143  }
144  } else {
145 
146  if ( -e $file_path ) { unlink($file_path) }
147 
148  if ( !-d dirname($file_path) ) {
149  if ($verbose) {
150  printf( "Creating directory '%s'\n",
151  dirname($file_path) );
152  }
153  if ( !mkdir( dirname($file_path) ) ) {
154  printf( "==> Can not create directory '%s': %s",
155  dirname($file_path), $! );
156  return ();
157  }
158  }
159 
160  if ($verbose) {
161  printf( "Fetching '%s' (size = %s)\n",
162  $remote_file,
163  $ftp->size($remote_file) || '(unknown)' );
164  printf( "Local file is '%s'\n", $file_path );
165  }
166 
167  if ( !$ftp->get( $remote_file, $file_path ) ) {
168  printf( "==> Could not get '%s': %s\n",
169  basename( $uri->path() ), $ftp->message() );
170  return ();
171  }
172  } ## end else [ if ( $checkdownload &&...)]
173 
174  if ( $file_path =~ /\.(gz|Z)$/x ) {
175  # Read from zcat pipe
176  #
177  my $cmd = "gzip -t $file_path";
178  if ( system($cmd) != 0 ) {
179  printf( "system command '%s' failed: %s - "
180  . "Checking of gzip file failed - "
181  . "FILE CORRUPTED ?\n\n",
182  $cmd, $? );
183 
184  if ( -e $file_path ) {
185  if ($verbose) {
186  printf( "Deleting '%s'\n", $file_path );
187  }
188  unlink($file_path);
189  }
190  return ();
191  } else {
192  if ($verbose) {
193  printf( "'%s' passed (gzip -t) corruption test.\n",
194  $file_path );
195  }
196  }
197  }
198  push( @processed_files, $file_path );
199 
200  } ## end foreach my $remote_file ( (...))
201  if (!@processed_files) { printf ("No files found matching $uri") ; }
202 
203 
204  } elsif ( $uri->scheme() eq 'http' || $uri->scheme eq 'https') {
205  # Deal with HTTP files.
206 
207  my $filename = basename ($uri->path() );
208  if ($uri->path eq '') { $filename = "index.html"; }
209 
210  my $file_path = catfile( $dest_dir, $filename );
211 
212  if ( $deletedownloaded && -e $file_path ) {
213  if ($verbose) {
214  printf( "Deleting '%s'\n", $file_path );
215  }
216  unlink($file_path);
217  }
218 
219  if ( $checkdownload && -s $file_path ) {
220  # The file is already there, no need to connect to a
221  # HTTP server.
222 
223  if ($verbose) {
224  printf( "File '%s' already exists\n", $file_path );
225  }
226  push( @processed_files, $file_path );
227  next;
228  }
229 
230  if ( -e $file_path ) { unlink($file_path) }
231 
232  if ( !-d dirname($file_path) ) {
233  if ($verbose) {
234  printf( "Creating directory '%s'\n", dirname($file_path) );
235  }
236  if ( !mkdir( dirname($file_path) ) ) {
237  printf( "==> Can not create directory '%s': %s",
238  dirname($file_path), $! );
239  return ();
240  }
241  }
242 
243  if ($verbose) {
244  printf( "Connecting to HTTP host '%s'\n", $uri->host() );
245  printf( "Fetching '%s'\n", $uri->path() );
246  }
247 
248  if ( $checkdownload && -s $file_path ) {
249  if ($verbose) {
250  printf( "File '%s' already exists\n", $file_path );
251  }
252  } else {
253 
254  if ($verbose) {
255  printf( "Local file is '%s'\n", $file_path );
256  }
257 
258  if ( -e $file_path ) { unlink($file_path) }
259 
260  open OUT, ">$file_path" or die "Couldn't open file $file_path $!";
261  my $http = HTTP::Tiny->new();
262 
263  my $response = $http->get($uri->as_string());
264 
265  if ( !$response->{success} ) {
266  printf( "==> Could not get '%s': %s\n",
267  basename( $uri->path() ), $response->{content} );
268  return ();
269  }
270  print OUT $response->{content};
271  close OUT;
272  }
273 
274  push( @processed_files, $file_path );
275 
276  } elsif ( $uri->scheme() eq 'mysql' ) {
277  # Just leave MySQL data untouched for now.
278  push( @processed_files, $user_uri );
279  } else {
280  printf( "==> Unknown URI scheme '%s' in URI '%s'\n",
281  $uri->scheme(), $uri->as_string() );
282  return ();
283  }
284  } ## end foreach my $user_uri (@user_uris)
285 
286  return @processed_files;
287 } ## end sub fetch_files
288 
289 
290 sub get_ftp{
291  my ($self, $uri, $passive) = @_;
292  my $ftp;
293 
294  if($passive){
295  $ftp = Net::FTP->new( $uri->host(), 'Debug' => 0, Passive => 1);
296  }
297  else{
298  $ftp = Net::FTP->new( $uri->host(), 'Debug' => 0);
299  }
300 
301  if ( !defined($ftp) ) {
302  printf( "==> Can not open FTP connection: %s\n", $@ );
303  return ();
304  }
305 
306  if ( !$ftp->login( 'anonymous', '-anonymous@' ) ) {
307  printf( "==> Can not log in on FTP host: %s\n",
308  $ftp->message() );
309  return ();
310  }
311 
312  if ( !$ftp->cwd( dirname( $uri->path() ) ) ) {
313  printf( "== Can not change directory to '%s': %s\n",
314  dirname( $uri->path() ), $ftp->message() );
315  return ();
316  }
317 
318  $ftp->binary();
319  return $ftp;
320 }
321 
322 1;