ensembl-hive  2.8.1
DataFileAdaptor.pm
Go to the documentation of this file.
1 =head1 LICENSE
2 
3 See the NOTICE file distributed with this work for additional information
4 regarding copyright ownership.
5 
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9 
10  http://www.apache.org/licenses/LICENSE-2.0
11 
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 
18 =cut
19 
20 package Bio::EnsEMBL::DBSQL::DataFileAdaptor;
21 
22 =pod
23 
24 
25 =head1 CONTACT
26 
27  Please email comments or questions to the public Ensembl
28  developers list at <http://lists.ensembl.org/mailman/listinfo/dev>.
29 
30  Questions may also be sent to the Ensembl help desk at
31  <http://www.ensembl.org/Help/Contact>.
32 
33 =head1 NAME
34 
35 Bio::EnsEMBL::DBSQL::DataFileAdaptor
36 
37 =head1 SYNOPSIS
38 
39  my $dfa = $dba->get_DataFileAdaptor();
40  my $file = $dfa->fetch_by_dbID(1);
41  my $files = $dfa->fetch_all();
42 
43  my $logic_name_files = $dfa->fetch_all_by_logic_name('bam_alignments');
44 
45 =head1 DESCRIPTION
46 
47 Provides a database wrapper to store the locations of files and to pull these
48 records back out. DataFile objects can only provide basic information but they
49 can return an intended external database adaptor which can be used to
50 parse the information. This system assumes nothing about the file just that
51 your parser can access it.
52 
53 Files are supported over any protocol your parser supports and locations can be
54 made absolute, built on the fly or versioned.
55 
56 =head1 METHODS
57 
58 =cut
59 
60 use strict;
61 use warnings;
62 
64 
67 use Bio::EnsEMBL::Utils::Exception qw/throw/;
68 use Bio::EnsEMBL::Utils::Scalar qw/:assert/;
69 
70 my $GLOBAL_BASE_PATH;
71 
72 =head2 global_base_path
73 
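74  Arg[1] : String; (optional) base path
75  Example : Bio::EnsEMBL::DBSQL::DataFileAdaptor->global_base_path('/base/path');
76  Description : Stores a global value to be used when building data file paths.
76  When called without a base path the stored value is left
76  untouched and simply returned.
77  Returntype : String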
78  Exceptions : None
79 
80 =cut
81 
# Combined getter/setter for the file-scoped $GLOBAL_BASE_PATH.
#
# Arg[1]     : (optional) base path. Only a true value is stored; a missing or
#              false argument leaves the stored value as it is.
# Returntype : the value held in $GLOBAL_BASE_PATH after the call.
sub global_base_path {
  my ($class, $base_path) = @_;

  if ($base_path) {
    $GLOBAL_BASE_PATH = $base_path;
  }

  return $GLOBAL_BASE_PATH;
}
88 
89 =head2 get_base_path
90 
91  Arg[1] : String; (optional) base path
92  Example : $dfa->get_base_path();
93  Description : If given the path it will return that path; if not it consults
94  $self->global_base_path() for a value. As a last resort
95  it will look at the meta table for an entry keyed by
96  B<data_file.base_path>
97  Returntype : String
98  Exceptions : Thrown if nothing is found after consulting all three locations
99 
100 =cut
101 
# Resolves the base path to use, trying three sources in order and returning
# the first one that is defined:
#   1. the path handed in by the caller,
#   2. the value returned by global_base_path(),
#   3. the meta value stored under the key 'data_file.base_path'.
# Throws when none of the three yields a defined value.
sub get_base_path {
  my ($self, $path) = @_;

  # An explicitly supplied path always wins (even an empty or '0' one).
  if (defined $path) {
    return $path;
  }

  my $from_global = $self->global_base_path();
  if (defined $from_global) {
    return $from_global;
  }

  my $meta_container = $self->db()->get_MetaContainer();
  my $from_meta = $meta_container->single_value_by_key('data_file.base_path', 1);
  if (defined $from_meta) {
    return $from_meta;
  }

  throw("No base path discovered. Either provide a path, set a global using global_base_path() or specify 'data_file.base_path' in meta");
}
111 
112 =head2 DataFile_to_extensions
113 
114  Arg[1] : Bio::EnsEMBL::DataFile
115  Example : my $exts = $dfa->DataFile_to_extensions($bam_df);
116  Description : Returns all expected extensions for the given DataFile type. The
117  first returned is the default extension
118  Returntype : ArrayRef
119  Exceptions : Raised if the given file type is not understood
120 
121 =cut
122 
# Maps the file type reported by the given DataFile onto the list of file
# extensions expected for that type and returns it as an array reference.
# Throws when the type has no entry in the table.
sub DataFile_to_extensions {
  my ($self, $df) = @_;
  my $type = $df->file_type();

  my %extensions_for = (
    BAM    => [ 'bam', 'bam.bai' ],
    BAMCOV => [ 'bam', 'bam.bai', 'bam.bw' ],  # BAM coverage files
    BIGBED => [ 'bb' ],
    BIGWIG => [ 'bw' ],
    VCF    => [ 'vcf.gz', 'vcf.gz.tbi' ],
  );

  my $extensions = $extensions_for{$type};
  unless ($extensions) {
    throw(sprintf(q{No extensions found for the type '%s'}, $type));
  }

  return $extensions;
}
136 
137 
138 =head2 DataFile_to_adaptor
139 
140  Arg[1] : Bio::EnsEMBL::DataFile
141  Arg[2] : (optional) base path
142  Arg[3] : (optional) file type
143  Example : my $bam = $dfa->DataFile_to_adaptor($bam_df);
144  Description : Returns an adaptor instance which will access the given DataFile.
145  Can explicitly request for an adaptor of a given file type (third
146  argument), useful with composite types, e.g. BAM coverage files
147  can be returned as BAM or BIGWIG
148  Returntype : Scalar actual return depends upon the given file type and the
149  requested type
150  Exceptions : Raised if the given file type is not understood or if the requested
151  file type is incompatible with the actual data file type.
152 
153 =cut
154 
# Builds the IO adaptor able to read the given DataFile.
#
# Arg[1]     : the DataFile; its file_type() decides which adaptor is built
# Arg[2]     : (optional) base path, handed on to the DataFile's path methods
# Arg[3]     : (optional) requested file type. When omitted it defaults to the
#              DataFile's own type, so a plain call on a BAM/BIGBED/BIGWIG/VCF
#              file yields the matching adaptor and a BAMCOV file yields the
#              BAM adaptor.
# Returntype : a new BAM, BigBed, BigWig or VCF adaptor instance
# Exceptions : thrown if a non-BAMCOV file is requested as a different type,
#              or if no handler exists for the type / requested type pair
sub DataFile_to_adaptor {
  my ($self, $df, $base, $requested_type) = @_;
  my $type = $df->file_type();

  # The third argument is optional. Without this default every non-BAMCOV
  # file failed the compatibility check below (its type never equals undef)
  # and BAMCOV fell through to the final throw.
  $requested_type = $type if ! defined $requested_type;

  # Only the composite BAMCOV type may be requested as something else.
  throw(sprintf("Request for a '%s' adaptor, but file is of type '%s'", $requested_type, $type))
    if $type ne 'BAMCOV' and $type ne $requested_type;

  return Bio::EnsEMBL::IO::Adaptor::BAMAdaptor->new($df->path($base))
    if $type eq 'BAM';

  return Bio::EnsEMBL::IO::Adaptor::BigBedAdaptor->new($df->path($base))
    if $type eq 'BIGBED';

  return Bio::EnsEMBL::IO::Adaptor::BigWigAdaptor->new($df->path($base))
    if $type eq 'BIGWIG';

  return Bio::EnsEMBL::IO::Adaptor::VCFAdaptor->new($df->path($base))
    if $type eq 'VCF';

  # BAMCOV composite case
  if ($type eq 'BAMCOV') {
    return Bio::EnsEMBL::IO::Adaptor::BAMAdaptor->new($df->path($base))
      if $requested_type eq 'BAM' or $requested_type eq 'BAMCOV';

    # The BigWig adaptor is given the third entry of get_all_paths().
    return Bio::EnsEMBL::IO::Adaptor::BigWigAdaptor->new($df->get_all_paths($base)->[2])
      if $requested_type eq 'BIGWIG';
  }

  throw(sprintf(q{No '%s' handler found for the type '%s'}, $requested_type, $type));
}
189 
190 =head2 fetch_all_by_logic_name
191 
192  Args [1] : String $logic_name for the linked analysis
193  Example : my $dfs = $dfa->fetch_all_by_logic_name('bam_alignments');
194  Description : Returns all DataFile entries linked to the given analysis
195  logic name
196  Returntype : ArrayRef contains Bio::EnsEMBL::DataFile instances
197  Exceptions : Thrown if logic name does not exist
198 
199 =cut
200 
# Looks up the analysis with the given logic name through the database's
# AnalysisAdaptor and returns whatever fetch_all_by_Analysis() gives for it.
# Throws if no analysis is found for the logic name.
sub fetch_all_by_logic_name {
  my ($self, $logic_name) = @_;

  my $analysis_adaptor = $self->db()->get_AnalysisAdaptor();
  my $analysis = $analysis_adaptor->fetch_by_logic_name($logic_name);

  unless ($analysis) {
    throw("No analysis found for logic_name '${logic_name}'");
  }

  return $self->fetch_all_by_Analysis($analysis);
}
207 
208 =head2 fetch_all_by_Analysis
209 
210  Args [1] : Bio::EnsEMBL::Analysis $analysis to look up by
211  Example : my $dfs = $dfa->fetch_all_by_Analysis($analysis);
212  Description : Returns all DataFile entries linked to the given analysis
213  Returntype : ArrayRef contains Bio::EnsEMBL::DataFile instances
214  Exceptions : None
215 
216 =cut
217 
# Fetches the data files whose analysis_id equals the dbID of the given
# analysis. The argument is checked with assert_ref() against
# Bio::EnsEMBL::Analysis before its dbID is bound as an integer parameter.
sub fetch_all_by_Analysis {
  my ($self, $analysis) = @_;

  assert_ref($analysis, 'Bio::EnsEMBL::Analysis', 'analysis');

  # The bound value fills the single placeholder of the constraint below.
  $self->bind_param_generic_fetch($analysis->dbID(), SQL_INTEGER);

  my $constraint = 'df.analysis_id =?';
  return $self->generic_fetch($constraint);
}
224 
225 =head2 fetch_all_by_CoordSystem
226 
227  Args [1] : Bio::EnsEMBL::CoordSystem $coord_system to look up by
228  Example : my $dfs = $dfa->fetch_all_by_CoordSystem($cs);
229  Description : Returns all DataFile entries linked to the given coordinate
230  system. Does B<not> support I<toplevel>
231  Returntype : ArrayRef contains Bio::EnsEMBL::DataFile instances
232  Exceptions : None
233 
234 =cut
235 
# Fetches the data files whose coord_system_id equals the dbID of the given
# coordinate system. The argument is checked with assert_ref() against
# Bio::EnsEMBL::CoordSystem before its dbID is bound as an integer parameter.
sub fetch_all_by_CoordSystem {
  my ($self, $cs) = @_;

  assert_ref($cs, 'Bio::EnsEMBL::CoordSystem', 'coord_system');

  # The bound value fills the single placeholder of the constraint below.
  $self->bind_param_generic_fetch($cs->dbID(), SQL_INTEGER);

  my $constraint = 'df.coord_system_id =?';
  return $self->generic_fetch($constraint);
}
242 
# Fetches a single data file by its name and file type.
# Returns the first matching record, or nothing (bare return) when the
# fetch yields no records.
sub fetch_by_name_and_type {
  my ($self, $name, $type) = @_;

  # Bind order matters: name first, then type, matching the placeholders.
  for my $value ($name, $type) {
    $self->bind_param_generic_fetch($value, SQL_VARCHAR);
  }

  my $matches = $self->generic_fetch('df.name =? and df.file_type =?');
  if (@{$matches}) {
    return $matches->[0];
  }
  return;
}
251 
# Runs the data_file query, optionally narrowed by an extra SQL constraint,
# and turns every returned row into a Bio::EnsEMBL::DataFile built with
# new_fast(). The query is always restricted to this database's species_id
# through the join on coord_system.
#
# Arg[1]     : (optional) SQL constraint appended to the WHERE clause with AND;
#              its placeholders are filled from bind_param_generic_fetch()
# Returntype : whatever the SQL helper's execute() returns for the callback
sub generic_fetch {
  my ($self, $constraint) = @_;
  $constraint ||= q{};

  my $sql = <<'SQL';
select df.data_file_id, df.coord_system_id, df.analysis_id, df.name, df.version_lock, df.absolute, df.url, df.file_type
from data_file df
join coord_system cs using (coord_system_id)
where cs.species_id =?
SQL
  if ($constraint) {
    $sql .= 'AND '.$constraint;
  }

  # Pick up any previously bound parameters and reset the stored slot
  # (presumably so they are not reused by a later fetch - confirm against
  # the base adaptor).
  my $params = $self->bind_param_generic_fetch();
  if (defined $params) {
    $self->{'_bind_param_generic_fetch'} = undef;
  }
  else {
    $params = [];
  }

  # The species id fills the first placeholder, ahead of the constraint's.
  unshift(@{$params}, $self->db()->species_id());

  my $csa = $self->db()->get_CoordSystemAdaptor();
  my $aa  = $self->db()->get_AnalysisAdaptor();

  # Converts one result row (column order as in the select above) into a
  # DataFile; the url key is only set when the column holds a true value.
  my $row_to_data_file = sub {
    my ($row) = @_;
    my ($data_file_id, $coord_system_id, $analysis_id, $name, $version_lock, $absolute, $url, $file_type) = @{$row};
    my $fields = {
      dbID         => $data_file_id,
      adaptor      => $self,
      coord_system => $csa->fetch_by_dbID($coord_system_id),
      analysis     => $aa->fetch_by_dbID($analysis_id),
      name         => $name,
      version_lock => $version_lock,
      absolute     => $absolute,
      file_type    => $file_type,
    };
    if ($url) {
      $fields->{url} = $url;
    }
    return Bio::EnsEMBL::DataFile->new_fast($fields);
  };

  return $self->dbc()->sql_helper()->execute(-SQL => $sql, -PARAMS => $params, -CALLBACK => $row_to_data_file);
}
293 
# Inserts the given DataFile into the data_file table and returns its dbID.
# A DataFile that is already stored in this database is left alone and its
# existing dbID is returned. After a successful insert the DataFile receives
# the new dbID and this adaptor.
# Throws if the DataFile has no analysis or no coordinate system.
sub store {
  my ($self, $df) = @_;

  assert_ref($df, 'Bio::EnsEMBL::DataFile', 'datafile');

  return $df->dbID() if $df->is_stored($self->db());

  unless (defined $df->analysis()) {
    throw('Analysis is not defined for this data file');
  }
  unless (defined $df->coord_system()) {
    throw('Coord system is not defined for this data file');
  }

  my $sql = <<'SQL';
INSERT INTO data_file (coord_system_id, analysis_id, name, version_lock, absolute, url, file_type)
VALUES (?,?,?,?,?,?,?)
SQL

  # One [value, SQL type] pair per placeholder, in column order.
  my $params = [
    [$df->coord_system()->dbID(), SQL_INTEGER],
    [$df->analysis()->dbID(),     SQL_INTEGER],
    [$df->name(),                 SQL_VARCHAR],
    [$df->version_lock(),         SQL_INTEGER],
    [$df->absolute(),             SQL_INTEGER],
    [$df->url(),                  SQL_VARCHAR],
    [$df->file_type(),            SQL_VARCHAR],
  ];

  # The callback records the id of the inserted row on the DataFile.
  my $record_new_id = sub {
    $df->dbID($self->last_insert_id());
    return;
  };

  my $helper = $self->dbc()->sql_helper();
  $helper->execute_update(-SQL => $sql, -PARAMS => $params, -CALLBACK => $record_new_id);
  $df->adaptor($self);

  return $df->dbID();
}
328 
# Writes the current state of the given DataFile back to its data_file row,
# matched on data_file_id. A DataFile that is not yet stored in this
# database is handed to store() instead.
#
# Arg[1]     : Bio::EnsEMBL::DataFile to update
# Returntype : none
# Exceptions : thrown if the DataFile has no analysis or no coordinate system
sub update {
  my ($self, $df) = @_;

  assert_ref($df, 'Bio::EnsEMBL::DataFile', 'datafile');

  if (! $df->is_stored($self->db())) {
    $self->store($df);
    return;
  }

  # Same checks and messages as store(): without them a missing analysis or
  # coordinate system dies below with an opaque "Can't call method" error.
  throw('Analysis is not defined for this data file') if ! defined $df->analysis();
  throw('Coord system is not defined for this data file') if ! defined $df->coord_system();

  my $sql = <<'SQL';
UPDATE data_file SET coord_system_id =?, analysis_id=?, name=?, version_lock=?, absolute=?, url=?, file_type=?
WHERE data_file_id =?
SQL

  # One [value, SQL type] pair per placeholder; the row id comes last.
  my $params = [
    [$df->coord_system()->dbID(), SQL_INTEGER],
    [$df->analysis()->dbID(),     SQL_INTEGER],
    [$df->name(),                 SQL_VARCHAR],
    [$df->version_lock(),         SQL_INTEGER],
    [$df->absolute(),             SQL_INTEGER],
    [$df->url(),                  SQL_VARCHAR],
    [$df->file_type(),            SQL_VARCHAR],
    [$df->dbID(),                 SQL_INTEGER],
  ];
  $self->dbc()->sql_helper()->execute_update(-SQL => $sql, -PARAMS => $params);
  return;
}
356 
# Removes the row of the given DataFile from the data_file table, matched on
# its dbID. Throws if the DataFile is not stored in this database.
# Returns nothing.
sub delete {
  my ($self, $df) = @_;

  assert_ref($df, 'Bio::EnsEMBL::DataFile', 'datafile');

  unless ($df->is_stored($self->db())) {
    throw("Cannot delete the data file if it has not already been stored in this database");
  }

  my $helper = $self->dbc()->sql_helper();
  $helper->execute_update(
    -SQL    => 'DELETE from data_file where data_file_id =?',
    -PARAMS => [ [$df->dbID(), SQL_INTEGER] ],
  );

  return;
}
373 
# Returns the table used by this adaptor as a one-element list holding a
# [table name, alias] pair: data_file aliased as df.
sub _tables {
  my ($self) = @_;
  my $data_file_table = [ 'data_file', 'df' ];
  return ($data_file_table);
}
380 
381 1;
Bio::EnsEMBL::DBSQL::BaseAdaptor::fetch_by_dbID
public Bio::EnsEMBL::Feature fetch_by_dbID()
Bio::EnsEMBL::CoordSystem
Definition: CoordSystem.pm:40
Bio::EnsEMBL::DataFile
Definition: DataFile.pm:9
Bio::EnsEMBL::Analysis
Definition: PairAlign.pm:3
about
public about()
Bio::EnsEMBL::DBSQL::BaseAdaptor
Definition: BaseAdaptor.pm:71
Bio::EnsEMBL::Utils::Scalar
Definition: Scalar.pm:66
Bio::EnsEMBL::DBSQL::DataFileAdaptor
Definition: DataFileAdaptor.pm:30
Bio::EnsEMBL::Storable::new_fast
public Instance new_fast()
Bio::EnsEMBL::DataFile::file_type
public String file_type()
Bio::EnsEMBL::Utils::Exception
Definition: Exception.pm:68
Bio::EnsEMBL::DBSQL::DataFileAdaptor::global_base_path
public String global_base_path()