ensembl-hive  2.7.0
Cache.pm
Go to the documentation of this file.
1 =head1 LICENSE
2 
3 See the NOTICE file distributed with this work for additional information
4 regarding copyright ownership.
5 
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9 
10  http://www.apache.org/licenses/LICENSE-2.0
11 
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 
18 =cut
19 
20 
21 =head1 CONTACT
22 
23  Please email comments or questions to the public Ensembl
24  developers list at <http://lists.ensembl.org/mailman/listinfo/dev>.
25 
26  Questions may also be sent to the Ensembl help desk at
27  <http://www.ensembl.org/Help/Contact>.
28 
29 =cut
30 
31 =head1 NAME
32 
33 Bio::EnsEMBL::IdMapping::Cache - a cache to hold data objects used by the
34 IdMapping application
35 
36 =head1 DESCRIPTION
37 
38 =head1 METHODS
39 
40 =cut
41 
42 
43 package Bio::EnsEMBL::IdMapping::Cache;
44 
45 use strict;
46 use warnings;
47 no warnings 'uninitialized';
48 
49 use Bio::EnsEMBL::Utils::Argument qw(rearrange);
50 use Bio::EnsEMBL::Utils::Exception qw(throw warning);
51 use Bio::EnsEMBL::Utils::ScriptUtils qw(parse_bytes path_append);
52 use Bio::EnsEMBL::Utils::Scalar qw(assert_ref);
58 use Storable qw(nstore retrieve);
59 use Digest::MD5 qw(md5_hex);
60 
61 # define available cache names here
62 my @cache_names = qw(
63  exons_by_id
64  transcripts_by_id
65  transcripts_by_exon_id
66  translations_by_id
67  genes_by_id
68  genes_by_transcript_id
69 );
70 
71 
72 =head2 new
73 
74  Arg [LOGGER]: Bio::EnsEMBL::Utils::Logger $logger - a logger object
75  Arg [CONF] : Bio::EnsEMBL::Utils::ConfParser $conf - a configuration object
76  Example : my $cache = Bio::EnsEMBL::IdMapping::Cache->new(
77  -LOGGER => $logger,
78  -CONF => $conf,
79  );
80  Description : constructor
81  Return type : Bio::EnsEMBL::IdMapping::Cache object
82  Exceptions : thrown on wrong or missing arguments
83  Caller : general
84  Status : At Risk
85  : under development
86 
87 =cut
88 
89 sub new {
90  my $caller = shift;
91  my $class = ref($caller) || $caller;
92 
93  my ($logger, $conf, $load_instance) =
94  rearrange(['LOGGER', 'CONF', 'LOAD_INSTANCE'], @_);
95 
96  unless ($logger->isa('Bio::EnsEMBL::Utils::Logger')) {
97  throw("You must provide a Bio::EnsEMBL::Utils::Logger for logging.");
98  }
99 
100  unless ($conf->isa('Bio::EnsEMBL::Utils::ConfParser')) {
101  throw("You must provide configuration as a Bio::EnsEMBL::Utils::ConfParser object.");
102  }
103 
104  my $self = {};
105  bless ($self, $class);
106 
107  # initialise
108  $self->logger($logger);
109  $self->conf($conf);
110 
111  if ($load_instance) {
112  $self->read_instance_from_file;
113  }
114 
115  return $self;
116 }
117 
118 
119 =head2 build_cache_by_slice
120 
121  Arg[1] : String $dbtype - db type (source|target)
122  Arg[2] : String $slice_name - the name of a slice (format as returned by
123  Bio::EnsEMBL::Slice->name)
124  Example : my ($num_genes, $filesize) = $cache->build_cache_by_slice(
125  'source', 'chromosome:NCBI36:X:1:1000000:-1');
126  Description : Builds a cache of genes, transcripts, translations and exons
127  needed by the IdMapping application and serialises the resulting
128  cache object to a file, one slice at a time.
129  Return type : list of the number of genes processed and the size of the
130  serialised cache file
131  Exceptions : thrown on invalid slice name
132  Caller : general
133  Status : At Risk
134  : under development
135 
136 =cut
137 
138 sub build_cache_by_slice {
139  my $self = shift;
140  my $dbtype = shift;
141  my $slice_name = shift;
142 
143  # set cache method (required for loading cache later)
144  $self->cache_method('BY_SEQ_REGION');
145 
146  my $dba = $self->get_DBAdaptor($dbtype);
147  my $sa = $dba->get_SliceAdaptor;
148 
149  my $slice = $sa->fetch_by_name($slice_name);
150  unless ($slice) {
151  throw("Could not retrieve slice $slice_name.");
152  }
153 
154  my $genes = $slice->get_all_Genes( undef, undef, 1 );
155 
156  # find common coord_system
157  my $common_cs_found = $self->find_common_coord_systems;
158 
159  # find out whether native coord_system is a common coord_system.
160  # if so, you don't need to project.
161  # also don't project if no common coord_system present
162  my $need_project = 1;
163 
164  my $csid = join( ':',
165  $slice->coord_system_name,
166  $slice->coord_system->version );
167 
168  if ( $self->is_common_cs($csid) or !$self->highest_common_cs ) {
169  $need_project = 0;
170  }
171 
172  # build cache
173  my $type = "$dbtype.$slice_name";
174  my $num_genes =
175  $self->build_cache_from_genes( $type, $genes, $need_project );
176  undef $genes;
177 
178  # write cache to file, then flush cache to reclaim memory
179  my $size = $self->write_all_to_file($type);
180 
181  return $num_genes, $size;
182 } ## end sub build_cache_by_slice
183 
184 
185 =head2 build_cache_all
186 
187  Arg[1] : String $dbtype - db type (source|target)
188  Example : my ($num_genes, $filesize) = $cache->build_cache_all('source');
189  Description : Builds a cache of genes, transcripts, translations and exons
190  needed by the IdMapping application and serialises the
191  resulting cache object to a file. All genes across the genome
192  are processed in one go. This method should be used when
193  build_cache_by_seq_region can't be used due to a large number
194  of toplevel seq_regions (e.g. 2x genomes).
195  Return type : list of the number of genes processed and the size of the
196  serialised cache file
197  Exceptions : thrown on invalid slice name
198  Caller : general
199  Status : At Risk
200  : under development
201 
202 =cut
203 
204 sub build_cache_all {
205  my $self = shift;
206  my $dbtype = shift;
207 
208  # set cache method (required for loading cache later)
209  $self->cache_method('ALL');
210 
211  my $dba = $self->get_DBAdaptor($dbtype);
212  my $ga = $dba->get_GeneAdaptor;
213 
214  my $genes = $ga->fetch_all;
215 
216  # find common coord_system
217  my $common_cs_found = $self->find_common_coord_systems;
218 
219  # Build cache. Setting $need_project to 'CHECK' will cause
220  # build_cache_from_genes() to check the coordinate system for each
221  # gene.
222  my $type = "$dbtype.ALL";
223  my $need_project = 'CHECK';
224  my $num_genes =
225  $self->build_cache_from_genes( $type, $genes, $need_project );
226 
227  undef $genes;
228 
229  # write cache to file, then flush cache to reclaim memory
230  my $size = $self->write_all_to_file($type);
231 
232  return $num_genes, $size;
233 }
234 
235 
236 =head2 build_cache_from_genes
237 
238  Arg[1] : String $type - cache type
239  Arg[2] : Listref of Bio::EnsEMBL::Genes $genes - genes to build cache
240  from
241  Arg[3] : Boolean $need_project - indicate if we need to project exons to
242  common coordinate system
243  Example : $cache->build_cache_from_genes(
244  'source.chromosome:NCBI36:X:1:100000:1', \@genes);
245  Description : Builds the cache by fetching transcripts, translations and exons
246  for a list of genes from the database, and creating lightweight
247  Bio::EnsEMBL::IdMapping::TinyFeature objects containing only the
248  data needed by the IdMapping application. These objects are
249  attached to a name cache in this cache object. Exons only need
250  to be projected to a commond coordinate system if their native
251  coordinate system isn't common to source and target assembly
252  itself.
253  Return type : int - number of genes after filtering
254  Exceptions : thrown on wrong or missing arguments
255  Caller : internal
256  Status : At Risk
257  : under development
258 
259 =cut
260 
261 sub build_cache_from_genes {
262  my $self = shift;
263  my $type = shift;
264  my $genes = shift;
265  my $need_project = shift;
266 
267  throw("You must provide a type.") unless $type;
268  throw("You must provide a listref of genes.")
269  unless ( ref($genes) eq 'ARRAY' );
270 
271  # biotype filter
272  if ( $self->conf()->param('biotypes') ||
273  $self->conf()->param('biotypes_include') ||
274  $self->conf()->param('biotypes_exclude') )
275  {
276  $genes = $self->filter_biotypes($genes);
277  }
278  my $num_genes = scalar(@$genes);
279 
280  # initialise cache for the given type.
281  $self->{'cache'}->{$type} = {};
282 
283  #my $i = 0;
284  #my $num_genes = scalar(@$genes);
285  #my $progress_id = $self->logger->init_progress($num_genes);
286 
287  # loop over genes sorted by gene location.
288  # the sort will hopefully improve assembly mapper cache performance and
289  # therefore speed up exon sequence retrieval
290  foreach my $gene ( sort { $a->start <=> $b->start } @$genes ) {
291  #$self->logger->log_progressbar($progress_id, ++$i, 2);
292  #$self->logger->log_progress($num_genes, ++$i, 20, 3, 1);
293 
294  if ( $need_project eq 'CHECK' ) {
295  # find out whether native coord_system is a common coord_system.
296  # if so, you don't need to project.
297  # also don't project if no common coord_system present
298  if ( $self->highest_common_cs ) {
299  my $csid = join( ':',
300  $gene->slice->coord_system_name,
301  $gene->slice->coord_system->version );
302  if ( $self->is_common_cs($csid) ) {
303  $need_project = 0;
304  }
305  }
306  else {
307  $need_project = 0;
308  }
309  }
310 
311  # create lightweigt gene
312  my $lgene =
314  $gene->dbID, $gene->stable_id,
315  $gene->version, $gene->created_date,
316  $gene->modified_date, $gene->start,
317  $gene->end, $gene->strand,
318  $gene->slice->seq_region_name, $gene->biotype,
319  $gene->analysis->logic_name,
320  ] );
321 
322  # build gene caches
323  $self->add( 'genes_by_id', $type, $gene->dbID, $lgene );
324 
325  # transcripts
326  foreach my $tr ( @{ $gene->get_all_Transcripts } ) {
327  my $ltr =
329  $tr->dbID, $tr->stable_id,
330  $tr->version, $tr->created_date,
331  $tr->modified_date, $tr->start,
332  $tr->end, $tr->strand,
333  $tr->length, md5_hex( $tr->spliced_seq ),
334  ] );
335 
336  $ltr->biotype( $tr->biotype() );
337  $ltr->seq_region_name( $tr->slice->seq_region_name() );
338  $lgene->add_Transcript($ltr);
339 
340  # build transcript caches
341  $self->add( 'transcripts_by_id', $type, $tr->dbID, $ltr );
342  $self->add( 'genes_by_transcript_id', $type, $tr->dbID, $lgene );
343 
344  # translation (if there is one)
345  if ( my $tl = $tr->translation ) {
346  my $ltl =
348  $tl->dbID, $tl->stable_id,
349  $tl->version, $tl->created_date,
350  $tl->modified_date, $tr->dbID,
351  $tr->translate->seq,
352  ] );
353 
354  $ltr->add_Translation($ltl);
355 
356  $self->add( 'translations_by_id', $type, $tl->dbID, $ltl );
357 
358  undef $tl;
359  }
360 
361  # exons
362  foreach my $exon ( @{ $tr->get_all_Exons } ) {
363  my $lexon =
365  $exon->dbID,
366  $exon->stable_id,
367  $exon->version,
368  $exon->created_date,
369  $exon->modified_date,
370  $exon->start,
371  $exon->end,
372  $exon->strand,
373  $exon->slice->seq_region_name,
374  $exon->slice->coord_system_name,
375  $exon->slice->coord_system->version,
376  $exon->slice->subseq( $exon->start, $exon->end,
377  $exon->strand ),
378  $exon->phase,
379  $need_project, ] );
380 
381  # get coordinates in common coordinate system if needed
382  if ($need_project) {
383  my @seg = @{
384  $exon->project( $self->highest_common_cs,
385  $self->highest_common_cs_version ) };
386 
387  if ( scalar(@seg) == 1 ) {
388  my $sl = $seg[0]->to_Slice;
389  $lexon->common_start( $sl->start );
390  $lexon->common_end( $sl->end );
391  $lexon->common_strand( $sl->strand );
392  $lexon->common_sr_name( $sl->seq_region_name );
393  }
394  }
395 
396  $ltr->add_Exon($lexon);
397 
398  $self->add( 'exons_by_id', $type, $exon->dbID, $lexon );
399  $self->add_list( 'transcripts_by_exon_id',
400  $type, $exon->dbID, $ltr );
401 
402  undef $exon;
403  } ## end foreach my $exon ( @{ $tr->get_all_Exons...})
404 
405  undef $tr;
406  } ## end foreach my $tr ( @{ $gene->get_all_Transcripts...})
407 
408  undef $gene;
409  } ## end foreach my $gene ( sort { $a...})
410 
411  return $num_genes;
412 } ## end sub build_cache_from_genes
413 
414 
415 =head2 filter_biotypes
416 
417  Arg[1] : Listref of Bio::EnsEMBL::Genes $genes - the genes to filter
418  Example : my @filtered = @{ $cache->filter_biotypes(\@genes) };
419 
420  Description : Filters a list of genes by biotype. Biotypes are
421  taken from the IdMapping configuration parameter
422  'biotypes_include' or 'biotypes_exclude'.
423 
424  If the configuration parameter 'biotypes_exclude' is
425  defined, then rather than returning the genes whose
426  biotype is listed in the configuration parameter
427  'biotypes_include' the method will return the genes
428  whose biotype is *not* listed in the 'biotypes_exclude'
429  configuration parameter.
430 
431  It is an error to define both these configuration
432  parameters.
433 
434  The old parameter 'biotypes' is equivalent to
435  'biotypes_include'.
436 
437  Return type : Listref of Bio::EnsEMBL::Genes (or empty list)
438  Exceptions : none
439  Caller : internal
440  Status : At Risk
441  : under development
442 
443 =cut
444 
445 sub filter_biotypes {
446  my ( $self, $genes ) = @_;
447 
448  my @filtered;
449  my @biotypes;
450  my $opt_reverse;
451 
452  if ( defined( $self->conf()->param('biotypes_include') ) ||
453  defined( $self->conf()->param('biotypes') ) )
454  {
455  if ( defined( $self->conf()->param('biotypes_exclude') ) ) {
456  $self->logger()
457  ->error( "You may not use both " .
458  "'biotypes_include' and 'biotypes_exclude' " .
459  "in the configuration" );
460  }
461 
462  if ( defined( $self->conf()->param('biotypes_include') ) ) {
463  @biotypes = $self->conf()->param('biotypes_include');
464  }
465  else {
466  @biotypes = $self->conf()->param('biotypes');
467  }
468  $opt_reverse = 0;
469  }
470  else {
471  @biotypes = $self->conf()->param('biotypes_exclude');
472  $opt_reverse = 1;
473  }
474 
475  foreach my $gene ( @{$genes} ) {
476  my $keep_gene;
477 
478  foreach my $biotype (@biotypes) {
479  if ( $gene->biotype() eq $biotype ) {
480  if ($opt_reverse) { $keep_gene = 0 }
481  else { $keep_gene = 1 }
482  last;
483  }
484  }
485 
486  if ( defined($keep_gene) ) {
487  if ($keep_gene) {
488  push( @filtered, $gene );
489  }
490  }
491  elsif ($opt_reverse) {
492  push( @filtered, $gene );
493  }
494  }
495 
496  return \@filtered;
497 } ## end sub filter_biotypes
498 
499 
500 =head2 add
501 
502  Arg[1] : String $name - a cache name (e.g. 'genes_by_id')
503  Arg[2] : String type - a cache type (e.g. "source.$slice_name")
504  Arg[3] : String $key - key of this entry (e.g. a gene dbID)
505  Arg[4] : Bio::EnsEMBL::IdMappping::TinyFeature $val - value to cache
506  Example : $cache->add('genes_by_id',
507  'source.chromosome:NCBI36:X:1:1000000:1', '1234', $tiny_gene);
508  Description : Adds a TinyFeature object to a named cache.
509  Return type : Bio::EnsEMBL::IdMapping::TinyFeature
510  Exceptions : thrown on wrong or missing arguments
511  Caller : internal
512  Status : At Risk
513  : under development
514 
515 =cut
516 
517 sub add {
518  my $self = shift;
519  my $name = shift;
520  my $type = shift;
521  my $key = shift;
522  my $val = shift;
523 
524  throw("You must provide a cache name (e.g. genes_by_id.") unless $name;
525  throw("You must provide a cache type.") unless $type;
526  throw("You must provide a cache key (e.g. a gene dbID).") unless $key;
527 
528  $self->{'cache'}->{$type}->{$name}->{$key} = $val;
529 
530  return $self->{'cache'}->{$type}->{$name}->{$key};
531 }
532 
533 =head2 add_list
534 
535  Arg[1] : String $name - a cache name (e.g. 'genes_by_id')
536  Arg[2] : String type - a cache type (e.g. "source.$slice_name")
537  Arg[3] : String $key - key of this entry (e.g. a gene dbID)
538  Arg[4] : List of Bio::EnsEMBL::IdMappping::TinyFeature @val - values
539  to cache
540  Example : $cache->add_list('transcripts_by_exon_id',
541  'source.chromosome:NCBI36:X:1:1000000:1', '1234',
542  $tiny_transcript1, $tiny_transcript2);
543  Description : Adds a list of TinyFeature objects to a named cache.
544  Return type : Listref of Bio::EnsEMBL::IdMapping::TinyFeature objects
545  Exceptions : thrown on wrong or missing arguments
546  Caller : internal
547  Status : At Risk
548  : under development
549 
550 =cut
551 
552 sub add_list {
553  my $self = shift;
554  my $name = shift;
555  my $type = shift;
556  my $key = shift;
557  my @vals = @_;
558 
559  throw("You must provide a cache name (e.g. genes_by_id.") unless $name;
560  throw("You must provide a cache type.") unless $type;
561  throw("You must provide a cache key (e.g. a gene dbID).") unless $key;
562 
563  push @{ $self->{'cache'}->{$type}->{$name}->{$key} }, @vals;
564 
565  return $self->{'cache'}->{$type}->{$name}->{$key};
566 }
567 
568 sub get_by_key {
569  my $self = shift;
570  my $name = shift;
571  my $type = shift;
572  my $key = shift;
573 
574  throw("You must provide a cache name (e.g. genes_by_id.") unless $name;
575  throw("You must provide a cache type.") unless $type;
576  throw("You must provide a cache key (e.g. a gene dbID).") unless $key;
577 
578  # transparently load cache from file unless already loaded
579  unless ($self->{'instance'}->{'loaded'}->{"$type"}) {
580  $self->read_and_merge($type);
581  }
582 
583  return $self->{'cache'}->{$type}->{$name}->{$key};
584 }
585 
586 sub get_by_name {
587  my $self = shift;
588  my $name = shift;
589  my $type = shift;
590 
591  throw("You must provide a cache name (e.g. genes_by_id.") unless $name;
592  throw("You must provide a cache type.") unless $type;
593 
594  # transparently load cache from file unless already loaded
595  unless ($self->{'instance'}->{'loaded'}->{$type}) {
596  $self->read_and_merge($type);
597  }
598 
599  return $self->{'cache'}->{$type}->{$name} || {};
600 }
601 
602 
603 sub get_count_by_name {
604  my $self = shift;
605  my $name = shift;
606  my $type = shift;
607 
608  throw("You must provide a cache name (e.g. genes_by_id.") unless $name;
609  throw("You must provide a cache type.") unless $type;
610 
611  # transparently load cache from file unless already loaded
612  unless ($self->{'instance'}->{'loaded'}->{$type}) {
613  $self->read_and_merge($type);
614  }
615 
616  return scalar(keys %{ $self->get_by_name($name, $type) });
617 }
618 
619 
620 sub find_common_coord_systems {
621  my $self = shift;
622 
623  # get adaptors for source db
624  my $s_dba = $self->get_DBAdaptor('source');
625  my $s_csa = $s_dba->get_CoordSystemAdaptor;
626  my $s_sa = $s_dba->get_SliceAdaptor;
627 
628  # get adaptors for target db
629  my $t_dba = $self->get_DBAdaptor('target');
630  my $t_csa = $t_dba->get_CoordSystemAdaptor;
631  my $t_sa = $t_dba->get_SliceAdaptor;
632 
633  # find common coord_systems
634  my @s_coord_systems = @{ $s_csa->fetch_all };
635  my @t_coord_systems = @{ $t_csa->fetch_all };
636  my $found_highest = 0;
637 
638 SOURCE:
639  foreach my $s_cs (@s_coord_systems) {
640  if ( !$s_cs->is_default() ) { next SOURCE }
641 
642  TARGET:
643  foreach my $t_cs (@t_coord_systems) {
644  if ( !$t_cs->is_default() ) { next TARGET }
645 
646  if ( $s_cs->name eq $t_cs->name ) {
647 
648  # test for identical coord_system version
649  if ( $s_cs->version and ( $s_cs->version ne $t_cs->version ) ) {
650  next TARGET;
651  }
652 
653  # test for at least 50% identical seq_regions
654  if ( $self->seq_regions_compatible( $s_cs, $s_sa, $t_sa ) ) {
655  $self->add_common_cs($s_cs);
656 
657  unless ($found_highest) {
658  $self->highest_common_cs( $s_cs->name );
659  $self->highest_common_cs_version( $s_cs->version );
660  }
661 
662  $found_highest = 1;
663 
664  next SOURCE;
665  }
666  }
667  } ## end foreach my $t_cs (@t_coord_systems)
668  } ## end foreach my $s_cs (@s_coord_systems)
669 
670  return $found_highest;
671 } ## end sub find_common_coord_systems
672 
673 
674 sub seq_regions_compatible {
675  my $self = shift;
676  my $cs = shift;
677  my $s_sa = shift;
678  my $t_sa = shift;
679 
680  unless ($cs and $cs->isa('Bio::EnsEMBL::CoordSystem')) {
681  throw('You must provide a CoordSystem');
682  }
683 
684  unless ($s_sa and $t_sa and $s_sa->isa('Bio::EnsEMBL::DBSQL::SliceAdaptor')
685  and $t_sa->isa('Bio::EnsEMBL::DBSQL::SliceAdaptor')) {
686  throw('You must provide a source and target SliceAdaptor');
687  }
688 
689  my %sr_match;
690  my $equal = 0;
691 
692  my $s_seq_regions = $s_sa->fetch_all($cs->name, $cs->version);
693  my $t_seq_regions = $t_sa->fetch_all($cs->name, $cs->version);
694 
695  # sanity check to prevent divison by zero
696  my $s_count = scalar(@$s_seq_regions);
697  my $t_count = scalar(@$t_seq_regions);
698  return(0) if ($s_count == 0 or $t_count == 0);
699 
700  foreach my $s_sr (@$s_seq_regions) {
701  $sr_match{$s_sr->seq_region_name} = $s_sr->length;
702  }
703 
704  foreach my $t_sr (@$t_seq_regions) {
705  if (exists($sr_match{$t_sr->seq_region_name})) {
706  $equal++;
707 
708  # return false if we have a region with same name but different length
709  return(0) unless ($sr_match{$t_sr->seq_region_name} == $t_sr->length);
710  }
711  }
712 
713  if ($equal/$s_count > 0.5 and $equal/$t_count > 0.5) {
714  return(1);
715  } else {
716  $self->logger->info("Only $equal seq_regions identical for ".$cs->name." ".$cs->version."\n");
717  return(0);
718  }
719 
720 }
721 
722 
723 sub check_db_connection {
724  my $self = shift;
725  my $dbtype = shift;
726 
727  my $err = 0;
728 
729  eval {
730  my $dba = $self->get_DBAdaptor($dbtype);
731  $dba->dbc->connect;
732  };
733 
734  if ($@) {
735  $self->logger->warning("Can't connect to $dbtype db: $@\n");
736  $err++;
737  } else {
738  $self->logger->debug("Connection to $dbtype db ok.\n");
739  $self->{'_db_conn_ok'}->{$dbtype} = 1;
740  }
741 
742  return $err;
743 }
744 
745 
746 sub check_db_read_permissions {
747  my $self = shift;
748  my $dbtype = shift;
749 
750  # skip this check if db connection failed (this prevents re-throwing
751  # exceptions).
752  return 1 unless ($self->{'_db_conn_ok'}->{$dbtype});
753 
754  my $err = 0;
755  my %privs = %{ $self->get_db_privs($dbtype) };
756 
757  unless ($privs{'SELECT'} or $privs{'ALL PRIVILEGES'}) {
758  $self->logger->warning("User doesn't have read permission on $dbtype db.\n");
759  $err++;
760  } else {
761  $self->logger->debug("Read permission on $dbtype db ok.\n");
762  }
763 
764  return $err;
765 }
766 
767 
768 sub check_db_write_permissions {
769  my $self = shift;
770  my $dbtype = shift;
771 
772  # skip this check if db connection failed (this prevents re-throwing
773  # exceptions).
774  return 1 unless ($self->{'_db_conn_ok'}->{$dbtype});
775 
776  my $err = 0;
777 
778  unless ($self->do_upload) {
779  $self->logger->debug("No uploads, so write permission on $dbtype db not required.\n");
780  return $err;
781  }
782 
783  my %privs = %{ $self->get_db_privs($dbtype) };
784 
785  unless ($privs{'INSERT'} or $privs{'ALL PRIVILEGES'}) {
786  $self->logger->warning("User doesn't have write permission on $dbtype db.\n");
787  $err++;
788  } else {
789  $self->logger->debug("Write permission on $dbtype db ok.\n");
790  }
791 
792  return $err;
793 }
794 
795 
796 sub do_upload {
797  my $self = shift;
798 
799  if ($self->conf->param('dry_run') or
800  ! ($self->conf->param('upload_events') or
801  $self->conf->param('upload_stable_ids') or
802  $self->conf->param('upload_archive'))) {
803  return 0;
804  } else {
805  return 1;
806  }
807 }
808 
809 
810 sub get_db_privs {
811  my ( $self, $dbtype ) = @_;
812 
813  my %privs = ();
814  my $rs;
815 
816  # get privileges from mysql db
817  eval {
818  my $dbc = $self->get_DBAdaptor($dbtype)->dbc();
819  my $sql = qq(SHOW GRANTS FOR ) . $dbc->username();
820  my $sth = $dbc->prepare($sql);
821  $sth->execute();
822  $rs = $sth->fetchall_arrayref();
823  #$sth->finish();
824  };
825 
826  if ($@) {
827  $self->logger->warning(
828  "Error obtaining privileges from $dbtype db: $@\n");
829  return {};
830  }
831 
832  # parse the output
833  foreach my $r ( map { $_->[0] } @{$rs} ) {
834  $r =~ s/GRANT (.*) ON .*/$1/i;
835  foreach my $p ( split( ',', $r ) ) {
836  # trim leading and trailing whitespace
837  $p =~ s/^\s+//;
838  $p =~ s/\s+$//;
839  $privs{ uc($p) } = 1;
840  }
841  }
842 
843  return \%privs;
844 } ## end sub get_db_privs
845 
846 
847 sub check_empty_tables {
848  my $self = shift;
849  my $dbtype = shift;
850 
851  # skip this check if db connection failed (this prevents re-throwing
852  # exceptions).
853  return 1 unless ($self->{'_db_conn_ok'}->{$dbtype});
854 
855  my $err = 0;
856  my $c = 0;
857 
858  if ($self->conf->param('no_check_empty_tables') or !$self->do_upload) {
859  $self->logger->debug("Won't check for empty stable ID and archive tables in $dbtype db.\n");
860  return $err;
861  }
862 
863  eval {
864  my @tables =
865  qw(
866  gene_stable_id
867  transcript_stable_id
868  translation_stable_id
869  exon_stable_id
870  stable_id_event
871  mapping_session
872  gene_archive
873  peptide_archive
874  );
875 
876  my $dba = $self->get_DBAdaptor($dbtype);
877  foreach my $table (@tables) {
878  if ( $table =~ /^([^_]+)_stable_id/ ) {
879  $table = $1;
880  if ( $c =
881  $self->fetch_value_from_db(
882  $dba,
883  "SELECT COUNT(*) FROM $table WHERE stable_id IS NOT NULL"
884  ) )
885  {
886  $self->logger->warning(
887  "$table table in $dbtype db has $c stable IDs.\n");
888  $err++;
889  }
890  }
891  else {
892  if ( $c =
893  $self->fetch_value_from_db(
894  $dba, "SELECT COUNT(*) FROM $table"
895  ) )
896  {
897  $self->logger->warning(
898  "$table table in $dbtype db has $c entries.\n");
899  $err++;
900  }
901  }
902  } ## end foreach my $table (@tables)
903  };
904 
905  if ($@) {
906  $self->logger->warning(
907 "Error retrieving stable ID and archive table row counts from $dbtype db: $@\n"
908  );
909  $err++;
910  }
911  elsif ( !$err ) {
912  $self->logger->debug(
913  "All stable ID and archive tables in $dbtype db are empty.\n");
914  }
915  return $err;
916 }
917 
918 
919 sub check_sequence {
920  my ( $self, $dbtype ) = @_;
921 
922  # skip this check if db connection failed (this prevents re-throwing
923  # exceptions).
924  return 1 unless ( $self->{'_db_conn_ok'}->{$dbtype} );
925 
926  my $err = 0;
927  my $c = 0;
928 
929  eval {
930  my $dba = $self->get_DBAdaptor($dbtype);
931  unless ( $c =
932  $self->fetch_value_from_db(
933  $dba->dnadb(), "SELECT COUNT(*) FROM dna"
934  ) )
935  {
936  $err++;
937  }
938  };
939 
940  if ($@) {
941  $self->logger->warning( "Error retrieving dna table row count "
942  . "from $dbtype database: $@\n" );
943  $err++;
944  } elsif ($err) {
945  $self->logger->warning("No sequence found in $dbtype database.\n");
946  } else {
947  $self->logger->debug(
948  ucfirst($dbtype) . " db has sequence ($c entries).\n" );
949  }
950 
951  return $err;
952 } ## end sub check_sequence
953 
954 
955 sub check_meta_entries {
956  my $self = shift;
957  my $dbtype = shift;
958 
959  # skip this check if db connection failed (this prevents re-throwing
960  # exceptions).
961  return 1 unless ($self->{'_db_conn_ok'}->{$dbtype});
962 
963  my $err = 0;
964  my $assembly_default;
965  my $schema_version;
966 
967  eval {
968  my $dba = $self->get_DBAdaptor($dbtype);
969  $assembly_default = $self->fetch_value_from_db($dba,
970  qq(SELECT meta_value FROM meta WHERE meta_key = 'assembly.default'));
971  $schema_version = $self->fetch_value_from_db($dba,
972  qq(SELECT meta_value FROM meta WHERE meta_key = 'schema_version'));
973  };
974 
975  if ($@) {
976  $self->logger->warning("Error retrieving dna table row count from $dbtype db: $@\n");
977  return ++$err;
978  }
979 
980  unless ($assembly_default) {
981  $self->logger->warning("No meta.assembly.default value found in $dbtype db.\n");
982  $err++;
983  } else {
984  $self->logger->debug("meta.assembly.default value found ($assembly_default).\n");
985  }
986 
987  unless ($schema_version) {
988  $self->logger->warning("No meta.schema_version value found in $dbtype db.\n");
989  $err++;
990  } else {
991  $self->logger->debug("meta.schema_version value found ($schema_version).\n");
992  }
993 
994  return $err;
995 }
996 
997 
998 sub fetch_value_from_db {
999  my ( $self, $dba, $sql ) = @_;
1000 
1001  assert_ref( $dba, 'Bio::EnsEMBL::DBSQL::DBAdaptor' );
1002 
1003  if ( !defined($sql) ) {
1004  throw("Need an SQL statement to execute.\n");
1005  }
1006 
1007  my $sth = $dba->dbc->prepare($sql);
1008  $sth->execute();
1009 
1010  my ($c) = $sth->fetchrow_array;
1011  return $c;
1012 }
1013 
1014 sub get_DBAdaptor {
1015  my ( $self, $prefix ) = @_;
1016 
1017  unless ( $self->{'_dba'}->{$prefix} ) {
1018  # connect to database
1019  my $dba =
1021  -host => $self->conf->param("${prefix}host"),
1022  -port => $self->conf->param("${prefix}port"),
1023  -user => $self->conf->param("${prefix}user"),
1024  -pass => $self->conf->param("${prefix}pass"),
1025  -dbname => $self->conf->param("${prefix}dbname"),
1026  -group => $prefix, );
1027 
1028  if ( !defined( $self->conf->param("${prefix}host_dna") ) ) {
1029  # explicitely set the dnadb to itself - by default the Registry
1030  # assumes a group 'core' for this now
1031  $dba->dnadb($dba);
1032  } else {
1033  my $dna_dba =
1035  -host => $self->conf->param("${prefix}host_dna"),
1036  -port => $self->conf->param("${prefix}port_dna"),
1037  -user => $self->conf->param("${prefix}user_dna"),
1038  -pass => $self->conf->param("${prefix}pass_dna"),
1039  -dbname => $self->conf->param("${prefix}dbname_dna"),
1040  -group => $prefix, );
1041  $dba->dnadb($dna_dba);
1042  }
1043 
1044  $self->{'_dba'}->{$prefix} = $dba;
1045  } ## end unless ( $self->{'_dba'}->...)
1046 
1047  return $self->{'_dba'}->{$prefix};
1048 } ## end sub get_DBAdaptor
1049 
1050 
1051 sub get_production_DBAdaptor() {
1052  my ($self) = @_;
1053  my $dba = new Bio::EnsEMBL::DBSQL::DBAdaptor(
1054  -host => $self->conf->param("productionhost"),
1055  -port => $self->conf->param("productionport"),
1056  -user => $self->conf->param("productionuser"),
1057  -pass => $self->conf->param("productionpass"),
1058  -dbname => $self->conf->param("productiondbname"));
1059  return $dba;
1060 }
1061 
1062 
1063 sub cache_file_exists {
1064  my $self = shift;
1065  my $type = shift;
1066 
1067  throw("You must provide a cache type.") unless $type;
1068 
1069  my $cache_file = $self->cache_file($type);
1070 
1071  if (-e $cache_file) {
1072  $self->logger->info("Cache file found for $type.\n", 2);
1073  $self->logger->debug("Will read from $cache_file.\n", 2);
1074  return 1;
1075  } else {
1076  $self->logger->info("No cache file found for $type.\n", 2);
1077  $self->logger->info("Will build cache from db.\n", 2);
1078  return 0;
1079  }
1080 }
1081 
1082 
1083 sub cache_file {
1084  my $self = shift;
1085  my $type = shift;
1086 
1087  throw("You must provide a cache type.") unless $type;
1088 
1089  return $self->dump_path."/$type.object_cache.ser";
1090 }
1091 
1092 
1093 sub instance_file {
1094  my $self = shift;
1095 
1096  return $self->dump_path."/cache_instance.ser";
1097 }
1098 
1099 
1100 sub dump_path {
1101  my $self = shift;
1102 
1103  $self->{'dump_path'} ||= path_append($self->conf->param('basedir'), 'cache');
1104 
1105  return $self->{'dump_path'};
1106 }
1107 
1108 
1109 sub write_all_to_file {
1110  my $self = shift;
1111  my $type = shift;
1112 
1113  throw("You must provide a cache type.") unless $type;
1114 
1115  my $size = 0;
1116  $size += $self->write_to_file($type);
1117  $size += $self->write_instance_to_file;
1118 
1119  return parse_bytes($size);
1120 }
1121 
1122 
1123 sub write_to_file {
1124  my $self = shift;
1125  my $type = shift;
1126 
1127  throw("You must provide a cache type.") unless $type;
1128 
1129  unless ($self->{'cache'}->{$type}) {
1130  $self->logger->warning("No features found in $type. Won't write cache file.\n");
1131  return;
1132  }
1133 
1134  my $cache_file = $self->cache_file($type);
1135 
1136  eval { nstore($self->{'cache'}->{$type}, $cache_file) };
1137  if ($@) {
1138  throw("Unable to store $cache_file: $@\n");
1139  }
1140 
1141  my $size = -s $cache_file;
1142  return $size;
1143 }
1144 
1145 
1146 sub write_instance_to_file {
1147  my $self = shift;
1148 
1149  my $instance_file = $self->instance_file;
1150 
1151  eval { nstore($self->{'instance'}, $instance_file) };
1152  if ($@) {
1153  throw("Unable to store $instance_file: $@\n");
1154  }
1155 
1156  my $size = -s $instance_file;
1157  return $size;
1158 }
1159 
1160 
1161 sub read_from_file {
1162  my $self = shift;
1163  my $type = shift;
1164 
1165  throw("You must provide a cache type.") unless $type;
1166 
1167  my $cache_file = $self->cache_file($type);
1168 
1169  if (-s $cache_file) {
1170 
1171  #$self->logger->info("Reading cache from file...\n", 0, 'stamped');
1172  #$self->logger->info("Cache file $cache_file.\n", 1);
1173  eval { $self->{'cache'}->{$type} = retrieve($cache_file); };
1174  if ($@) {
1175  throw("Unable to retrieve cache: $@");
1176  }
1177  #$self->logger->info("Done.\n", 0, 'stamped');
1178 
1179  } else {
1180  $self->logger->warning("Cache file $cache_file not found or empty.\n");
1181  }
1182 
1183 
1184  return $self->{'cache'}->{$type};
1185 }
1186 
1187 
1188 sub read_and_merge {
1189  my $self = shift;
1190  my $dbtype = shift;
1191 
1192  unless ($dbtype eq 'source' or $dbtype eq 'target') {
1193  throw("Db type must be 'source' or 'target'.");
1194  }
1195 
1196  # read cache from single or multiple files, depending on caching strategy
1197  my $cache_method = $self->cache_method;
1198  if ($cache_method eq 'ALL') {
1199  $self->read_from_file("$dbtype.ALL");
1200  } elsif ($cache_method eq 'BY_SEQ_REGION') {
1201  foreach my $slice_name (@{ $self->slice_names($dbtype) }) {
1202  $self->read_from_file("$dbtype.$slice_name");
1203  }
1204  } else {
1205  throw("Unknown cache method: $cache_method.");
1206  }
1207 
1208  $self->merge($dbtype);
1209 
1210  # flag as being loaded
1211  $self->{'instance'}->{'loaded'}->{$dbtype} = 1;
1212 }
1213 
1214 
1215 sub merge {
1216  my $self = shift;
1217  my $dbtype = shift;
1218 
1219  unless ($dbtype eq 'source' or $dbtype eq 'target') {
1220  throw("Db type must be 'source' or 'target'.");
1221  }
1222 
1223  foreach my $type (keys %{ $self->{'cache'} || {} }) {
1224  next unless ($type =~ /^$dbtype/);
1225 
1226  foreach my $name (keys %{ $self->{'cache'}->{$type} || {} }) {
1227 
1228  foreach my $key (keys %{ $self->{'cache'}->{$type}->{$name} || {} }) {
1229  if (defined $self->{'cache'}->{$dbtype}->{$name}->{$key}) {
1230  # warning("Duplicate key in cache: $name|$dbtype|$key. Skipping.\n");
1231  } else {
1232  $self->{'cache'}->{$dbtype}->{$name}->{$key} =
1233  $self->{'cache'}->{$type}->{$name}->{$key};
1234  }
1235 
1236  delete $self->{'cache'}->{$type}->{$name}->{$key};
1237  }
1238 
1239  delete $self->{'cache'}->{$type}->{$name};
1240  }
1241 
1242  delete $self->{'cache'}->{$type};
1243 
1244  }
1245 }
1246 
1247 
1248 sub read_instance_from_file {
1249  my $self = shift;
1250 
1251  my $instance_file = $self->instance_file;
1252 
1253  unless (-s $instance_file) {
1254  throw("No valid cache instance file found at $instance_file.");
1255  }
1256 
1257  eval { $self->{'instance'} = retrieve($instance_file); };
1258  if ($@) {
1259  throw("Unable to retrieve cache instance: $@");
1260  }
1261 
1262  return $self->{'instance'};
1263 }
1264 
1265 
1266 sub slice_names {
1267  my $self = shift;
1268  my $dbtype = shift;
1269 
1270  throw("You must provide a db type (source|target).") unless $dbtype;
1271 
1272  my $dba = $self->get_DBAdaptor($dbtype);
1273  my $sa = $dba->get_SliceAdaptor;
1274 
1275  my @slice_names = ();
1276 
1277  if ( $self->conf->param('chromosomes') ) {
1278  # Fetch the specified chromosomes.
1279  foreach my $chr ( $self->conf->param('chromosomes') ) {
1280  my $slice = $sa->fetch_by_region( 'chromosome', $chr );
1281  push @slice_names, $slice->name;
1282  }
1283 
1284  }
1285  elsif ( $self->conf->param('region') ) {
1286  # Fetch the slices on the specified regions. Don't use
1287  # SliceAdaptor->fetch_by_name() since this will fail if assembly
1288  # versions are different for source and target db.
1289  my ( $cs, $version, $name, $start, $end, $strand ) =
1290  split( /:/, $self->conf->param('region') );
1291 
1292  my $slice = $sa->fetch_by_region( $cs, $name, $start, $end );
1293 
1294  push @slice_names, $slice->name;
1295 
1296  }
1297  else {
1298  # Fetch all slices that have genes on them.
1299  my $ga = $dba->get_GeneAdaptor;
1300  my $sa = $dba->get_SliceAdaptor;
1301 
1302  foreach my $srid ( @{ $ga->list_seq_region_ids } ) {
1303  my $slice = $sa->fetch_by_seq_region_id($srid);
1304  my $slices = $sa->fetch_by_region_unique( $slice->coord_system_name(), $slice->seq_region_name() );
1305 
1306  push( @slice_names, map { $_->name() } @{$slices} );
1307  }
1308  }
1309 
1310  return \@slice_names;
1311 } ## end sub slice_names
1312 
1313 
1314 sub logger {
1315  my $self = shift;
1316  $self->{'logger'} = shift if (@_);
1317  return $self->{'logger'};
1318 }
1319 
1320 sub conf {
1321  my $self = shift;
1322  $self->{'conf'} = shift if (@_);
1323  return $self->{'conf'};
1324 }
1325 
1326 
1327 sub cache_method {
1328  my $self = shift;
1329  $self->{'instance'}->{'cache_method'} = shift if (@_);
1330  return $self->{'instance'}->{'cache_method'};
1331 }
1332 
1333 
1334 sub highest_common_cs {
1335  my $self = shift;
1336  $self->{'instance'}->{'hccs'} = shift if (@_);
1337  return $self->{'instance'}->{'hccs'};
1338 }
1339 
1340 
1341 sub highest_common_cs_version {
1342  my $self = shift;
1343  $self->{'instance'}->{'hccsv'} = shift if (@_);
1344  return $self->{'instance'}->{'hccsv'};
1345 }
1346 
1347 
1348 sub add_common_cs {
1349  my $self = shift;
1350  my $cs = shift;
1351 
1352  unless ($cs and $cs->isa('Bio::EnsEMBL::CoordSystem')) {
1353  throw('You must provide a CoordSystem');
1354  }
1355 
1356  my $csid = join(':', $cs->name, $cs->version);
1357 
1358  $self->{'instance'}->{'ccs'}->{$csid} = 1;
1359 }
1360 
1361 
1362 sub is_common_cs {
1363  my $self = shift;
1364  my $csid = shift;
1365 
1366  return $self->{'instance'}->{'ccs'}->{$csid};
1367 }
1368 
1369 
1370 1;
1371 
EnsEMBL
Definition: Filter.pm:1
Bio::EnsEMBL::DBSQL::DBAdaptor
Definition: DBAdaptor.pm:40
map
public map()
Bio::EnsEMBL::Utils::ScriptUtils
Definition: ScriptUtils.pm:11
Bio::EnsEMBL::IdMapping::TinyExon
Definition: TinyExon.pm:37
build_cache_all
public build_cache_all()
Bio::EnsEMBL::Storable
Definition: Storable.pm:23
Bio::EnsEMBL::IdMapping::TinyFeature::new_fast
public Bio::EnsEMBL::IdMapping::TinyFeature new_fast()
Bio::EnsEMBL::Slice
Definition: Slice.pm:50
Bio::EnsEMBL::IdMapping::Cache::new
public Bio::EnsEMBL::IdMapping::Cache new()
Bio::EnsEMBL::IdMapping::TinyTranslation
Definition: TinyTranslation.pm:27
Bio::EnsEMBL::Utils::ConfParser
Definition: ConfParser.pm:41
build_cache_by_seq_region
public build_cache_by_seq_region()
Bio::EnsEMBL::Utils::Logger
Definition: Logger.pm:36
Bio::EnsEMBL::Utils::Scalar
Definition: Scalar.pm:66
Bio::EnsEMBL::IdMapping::TinyGene
Definition: TinyGene.pm:28
Bio::EnsEMBL::IdMapping::Cache
Definition: Cache.pm:18
Bio::EnsEMBL::IdMapping::TinyTranscript
Definition: TinyTranscript.pm:29
Bio
Definition: AltAlleleGroup.pm:4
Bio::EnsEMBL::Utils::Argument
Definition: Argument.pm:34
Bio::EnsEMBL::Utils::Exception
Definition: Exception.pm:68