3 See the NOTICE file distributed with
this work
for additional information
4 regarding copyright ownership.
6 Licensed under the Apache License, Version 2.0 (the
"License");
7 you may not use
this file except in compliance with the License.
8 You may obtain a copy of the License at
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an
"AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License
for the specific language governing permissions and
16 limitations under the License.
23 Please email comments or questions to the
public Ensembl
24 developers list at <http:
26 Questions may also be sent to the Ensembl help desk at
43 package Bio::EnsEMBL::IdMapping::Cache;
47 no warnings
'uninitialized';
59 use Digest::MD5 qw(md5_hex);
61 # define available cache names here
65 transcripts_by_exon_id
68 genes_by_transcript_id
80 Description : constructor
82 Exceptions : thrown on wrong or missing arguments
91 my $class = ref($caller) || $caller;
93 my ($logger, $conf, $load_instance) =
94 rearrange([
'LOGGER',
'CONF',
'LOAD_INSTANCE'], @_);
96 unless ($logger->isa(
'Bio::EnsEMBL::Utils::Logger')) {
97 throw(
"You must provide a Bio::EnsEMBL::Utils::Logger for logging.");
100 unless ($conf->isa(
'Bio::EnsEMBL::Utils::ConfParser')) {
101 throw(
"You must provide configuration as a Bio::EnsEMBL::Utils::ConfParser object.");
105 bless ($self, $class);
108 $self->logger($logger);
111 if ($load_instance) {
112 $self->read_instance_from_file;
119 =head2 build_cache_by_slice
121 Arg[1] : String $dbtype - db type (source|target)
122 Arg[2] : String $slice_name - the name of a slice (format as returned by
124 Example : my ($num_genes, $filesize) = $cache->build_cache_by_slice(
125 'source',
'chromosome:NCBI36:X:1:1000000:-1');
126 Description : Builds a cache of genes, transcripts, translations and exons
127 needed by the IdMapping application and serialises the resulting
128 cache
object to a file, one slice at a time.
129 Return type : list of the number of genes processed and the size of the
130 serialised cache file
131 Exceptions : thrown on invalid slice name
138 sub build_cache_by_slice {
# Builds the cache for one named slice of the $dbtype database, writes it to
# file and returns the list ($num_genes, $size) - see the final return.
141 my $slice_name = shift;
143 # set cache method (required for loading cache later)
144 $self->cache_method(
'BY_SEQ_REGION');
146 my $dba = $self->get_DBAdaptor($dbtype);
147 my $sa = $dba->get_SliceAdaptor;
149 my $slice = $sa->fetch_by_name($slice_name);
151 throw(
"Could not retrieve slice $slice_name.");
154 my $genes = $slice->get_all_Genes( undef, undef, 1 );
156 # find common coord_system
# NOTE(review): $common_cs_found is not referenced again in the visible lines - confirm it is needed
157 my $common_cs_found = $self->find_common_coord_systems;
159 # find out whether native coord_system is a common coord_system.
160 # if so, you don't need to project.
161 # also don't project if no common coord_system present
162 my $need_project = 1;
# identifier of the coord system of this slice, in the name:version form
# that is_common_cs() is queried with
164 my $csid = join(
':',
165 $slice->coord_system_name,
166 $slice->coord_system->version );
168 if ( $self->is_common_cs($csid) or !$self->highest_common_cs ) {
# the cache type is <dbtype>.<slice name>; the same string is passed to
# write_all_to_file() below
173 my $type =
"$dbtype.$slice_name";
175 $self->build_cache_from_genes( $type, $genes, $need_project );
178 # write cache to file, then flush cache to reclaim memory
179 my $size = $self->write_all_to_file($type);
181 return $num_genes, $size;
182 } ## end sub build_cache_by_slice
187 Arg[1] : String $dbtype - db type (source|target)
188 Example : my ($num_genes, $filesize) = $cache->build_cache_all(
'source');
189 Description : Builds a cache of genes, transcripts, translations and exons
190 needed by the IdMapping application and serialises the
191 resulting cache
object to a file. All genes across the genome
192 are processed in one go. This method should be used when
194 of toplevel seq_regions (e.g. 2x genomes).
195 Return type : list of the number of genes processed and the size of the
196 serialised cache file
197 Exceptions : thrown on invalid slice name
204 sub build_cache_all {
# Builds one cache from every gene returned by fetch_all on the GeneAdaptor
# of the $dbtype database, stored under the type <dbtype>.ALL, then writes it
# to file and returns the list ($num_genes, $size).
208 # set cache method (required for loading cache later)
209 $self->cache_method('ALL
');
211 my $dba = $self->get_DBAdaptor($dbtype);
212 my $ga = $dba->get_GeneAdaptor;
214 my $genes = $ga->fetch_all;
216 # find common coord_system
217 my $common_cs_found = $self->find_common_coord_systems;
219 # Build cache. Setting $need_project to 'CHECK
' will cause
220 # build_cache_from_genes() to check the coordinate system for each
222 my $type = "$dbtype.ALL";
223 my $need_project = 'CHECK
';
225 $self->build_cache_from_genes( $type, $genes, $need_project );
229 # write cache to file, then flush cache to reclaim memory
230 my $size = $self->write_all_to_file($type);
232 return $num_genes, $size;
236 =head2 build_cache_from_genes
238 Arg[1] : String $type - cache type
239 Arg[2] : Listref of Bio::EnsEMBL::Genes $genes - genes to build cache
241 Arg[3] : Boolean $need_project - indicate if we need to project exons to
242 common coordinate system
243 Example : $cache->build_cache_from_genes(
244 'source.chromosome:NCBI36:X:1:100000:1
', \@genes);
245 Description : Builds the cache by fetching transcripts, translations and exons
246 for a list of genes from the database, and creating lightweight
247 Bio::EnsEMBL::IdMapping::TinyFeature objects containing only the
248 data needed by the IdMapping application. These objects are
249 attached to a name cache in this cache object. Exons only need
250 to be projected to a common coordinate system if their native
251 coordinate system isn't common to source and target assembly
253 Return type :
int - number of genes after filtering
254 Exceptions : thrown on wrong or missing arguments
261 sub build_cache_from_genes {
# Populates the named caches (genes_by_id, transcripts_by_id,
# genes_by_transcript_id, translations_by_id, exons_by_id,
# transcripts_by_exon_id) for $type from the supplied gene list.
265 my $need_project = shift;
267 throw(
"You must provide a type.") unless $type;
268 throw(
"You must provide a listref of genes.")
269 unless ( ref($genes) eq
'ARRAY' );
# biotype filtering only runs when at least one of the three biotype
# configuration parameters is set
272 if ( $self->conf()->param(
'biotypes') ||
273 $self->conf()->param(
'biotypes_include') ||
274 $self->conf()->param(
'biotypes_exclude') )
276 $genes = $self->filter_biotypes($genes);
# the gene count is taken after filtering
278 my $num_genes = scalar(@$genes);
280 # initialise cache for the given type.
281 $self->{
'cache'}->{$type} = {};
284 #my $num_genes = scalar(@$genes);
285 #my $progress_id = $self->logger->init_progress($num_genes);
287 # loop over genes sorted by gene location.
288 # the sort will hopefully improve assembly mapper cache performance and
289 # therefore speed up exon sequence retrieval
# NOTE(review): the comparison uses start only; the seq_region is not part
# of the sort key
290 foreach my $gene ( sort { $a->start <=> $b->start } @$genes ) {
291 #$self->logger->log_progressbar($progress_id, ++$i, 2);
292 #$self->logger->log_progress($num_genes, ++$i, 20, 3, 1);
294 if ( $need_project eq
'CHECK' ) {
295 # find out whether native coord_system is a common coord_system.
296 # if so, you don't need to project.
297 # also don't project if no common coord_system present
298 if ( $self->highest_common_cs ) {
299 my $csid = join(
':',
300 $gene->slice->coord_system_name,
301 $gene->slice->coord_system->version );
302 if ( $self->is_common_cs($csid) ) {
311 # create lightweight gene
314 $gene->dbID, $gene->stable_id,
315 $gene->version, $gene->created_date,
316 $gene->modified_date, $gene->start,
317 $gene->end, $gene->strand,
318 $gene->slice->seq_region_name, $gene->biotype,
319 $gene->analysis->logic_name,
323 $self->add(
'genes_by_id', $type, $gene->dbID, $lgene );
326 foreach my $tr ( @{ $gene->get_all_Transcripts } ) {
# the argument list below carries the MD5 hex digest of the spliced
# sequence, not the sequence itself
329 $tr->dbID, $tr->stable_id,
330 $tr->version, $tr->created_date,
331 $tr->modified_date, $tr->start,
332 $tr->end, $tr->strand,
333 $tr->length, md5_hex( $tr->spliced_seq ),
336 $ltr->biotype( $tr->biotype() );
337 $ltr->seq_region_name( $tr->slice->seq_region_name() );
338 $lgene->add_Transcript($ltr);
340 # build transcript caches
341 $self->add(
'transcripts_by_id', $type, $tr->dbID, $ltr );
342 $self->add(
'genes_by_transcript_id', $type, $tr->dbID, $lgene );
344 # translation (if there is one)
345 if ( my $tl = $tr->translation ) {
348 $tl->dbID, $tl->stable_id,
349 $tl->version, $tl->created_date,
350 $tl->modified_date, $tr->dbID,
354 $ltr->add_Translation($ltl);
356 $self->add(
'translations_by_id', $type, $tl->dbID, $ltl );
362 foreach my $exon ( @{ $tr->get_all_Exons } ) {
369 $exon->modified_date,
373 $exon->slice->seq_region_name,
374 $exon->slice->coord_system_name,
375 $exon->slice->coord_system->version,
376 $exon->slice->subseq( $exon->start, $exon->end,
381 # get coordinates in common coordinate system if needed
384 $exon->project( $self->highest_common_cs,
385 $self->highest_common_cs_version ) };
# the common_* coordinates are only set when the projection yields exactly
# one segment
387 if ( scalar(@seg) == 1 ) {
388 my $sl = $seg[0]->to_Slice;
389 $lexon->common_start( $sl->start );
390 $lexon->common_end( $sl->end );
391 $lexon->common_strand( $sl->strand );
392 $lexon->common_sr_name( $sl->seq_region_name );
396 $ltr->add_Exon($lexon);
# exons_by_id holds a single object per exon dbID, while
# transcripts_by_exon_id is filled through add_list()
398 $self->add(
'exons_by_id', $type, $exon->dbID, $lexon );
399 $self->add_list(
'transcripts_by_exon_id',
400 $type, $exon->dbID, $ltr );
403 } ## end
foreach my $exon ( @{ $tr->get_all_Exons...})
406 } ## end
foreach my $tr ( @{ $gene->get_all_Transcripts...})
409 } ## end
foreach my $gene ( sort { $a...})
412 } ## end sub build_cache_from_genes
415 =head2 filter_biotypes
417 Arg[1] : Listref of Bio::EnsEMBL::Genes $genes - the genes to filter
418 Example : my @filtered = @{ $cache->filter_biotypes(\@genes) };
420 Description : Filters a list of genes by biotype. Biotypes are
421 taken from the IdMapping configuration parameter
422 'biotypes_include' or
'biotypes_exclude'.
424 If the configuration parameter
'biotypes_exclude' is
425 defined, then rather than returning the genes whose
426 biotype is listed in the configuration parameter
427 'biotypes_include' the method will
return the genes
428 whose biotype is *not* listed in the
'biotypes_exclude'
429 configuration parameter.
431 It is an error to define both these configuration
434 The old parameter
'biotypes' is equivalent to
437 Return type : Listref of Bio::EnsEMBL::Genes (or empty list)
445 sub filter_biotypes {
# Filters the gene list by biotype according to the biotypes_include,
# biotypes or biotypes_exclude configuration parameters.
446 my ( $self, $genes ) = @_;
452 if ( defined( $self->conf()->param(
'biotypes_include') ) ||
453 defined( $self->conf()->param(
'biotypes') ) )
# an include-style parameter combined with biotypes_exclude is reported as
# an error
455 if ( defined( $self->conf()->param(
'biotypes_exclude') ) ) {
457 ->error(
"You may not use both " .
458 "'biotypes_include' and 'biotypes_exclude' " .
459 "in the configuration" );
# biotypes_include is checked first; the older biotypes parameter is the
# alternative assignment below
462 if ( defined( $self->conf()->param(
'biotypes_include') ) ) {
463 @biotypes = $self->conf()->param(
'biotypes_include');
466 @biotypes = $self->conf()->param(
'biotypes');
471 @biotypes = $self->conf()->param(
'biotypes_exclude');
475 foreach my $gene ( @{$genes} ) {
478 foreach my $biotype (@biotypes) {
479 if ( $gene->biotype() eq $biotype ) {
# a match means keep in include mode and drop in exclude mode ($opt_reverse)
480 if ($opt_reverse) { $keep_gene = 0 }
481 else { $keep_gene = 1 }
486 if ( defined($keep_gene) ) {
488 push( @filtered, $gene );
# a gene whose biotype matched nothing in the list is kept in exclude mode
491 elsif ($opt_reverse) {
492 push( @filtered, $gene );
497 } ## end sub filter_biotypes
502 Arg[1] : String $name - a cache name (e.g.
'genes_by_id')
503 Arg[2] : String $type - a cache type (e.g. "source.$slice_name")
504 Arg[3] : String $key - key of this entry (e.g. a gene dbID)
505 Arg[4] :
Bio::
EnsEMBL::IdMapping::TinyFeature $val - value to cache
506 Example : $cache->add('genes_by_id',
507 'source.chromosome:NCBI36:X:1:1000000:1', '1234', $tiny_gene);
508 Description : Adds a TinyFeature
object to a named cache.
509 Return type :
Bio::
EnsEMBL::IdMapping::TinyFeature
510 Exceptions : thrown on wrong or missing arguments
524 throw(
"You must provide a cache name (e.g. genes_by_id.") unless $name;
525 throw(
"You must provide a cache type.") unless $type;
526 throw(
"You must provide a cache key (e.g. a gene dbID).") unless $key;
528 $self->{
'cache'}->{$type}->{$name}->{$key} = $val;
530 return $self->{
'cache'}->{$type}->{$name}->{$key};
535 Arg[1] : String $name - a cache name (e.g.
'genes_by_id')
536 Arg[2] : String $type - a cache type (e.g. "source.$slice_name")
537 Arg[3] : String $key - key of this entry (e.g. a gene dbID)
538 Arg[4] : List of
Bio::
EnsEMBL::IdMapping::TinyFeature @val - values
540 Example : $cache->add_list('transcripts_by_exon_id',
541 'source.chromosome:NCBI36:X:1:1000000:1', '1234',
542 $tiny_transcript1, $tiny_transcript2);
543 Description : Adds a list of TinyFeature objects to a named cache.
544 Return type : Listref of
Bio::
EnsEMBL::IdMapping::TinyFeature objects
545 Exceptions : thrown on wrong or missing arguments
559 throw(
"You must provide a cache name (e.g. genes_by_id.") unless $name;
560 throw(
"You must provide a cache type.") unless $type;
561 throw(
"You must provide a cache key (e.g. a gene dbID).") unless $key;
563 push @{ $self->{
'cache'}->{$type}->{$name}->{$key} }, @vals;
565 return $self->{
'cache'}->{$type}->{$name}->{$key};
574 throw(
"You must provide a cache name (e.g. genes_by_id.") unless $name;
575 throw(
"You must provide a cache type.") unless $type;
576 throw(
"You must provide a cache key (e.g. a gene dbID).") unless $key;
578 # transparently load cache from file unless already loaded
579 unless ($self->{
'instance'}->{
'loaded'}->{
"$type"}) {
580 $self->read_and_merge($type);
583 return $self->{
'cache'}->{$type}->{$name}->{$key};
591 throw(
"You must provide a cache name (e.g. genes_by_id.") unless $name;
592 throw(
"You must provide a cache type.") unless $type;
594 # transparently load cache from file unless already loaded
595 unless ($self->{
'instance'}->{
'loaded'}->{$type}) {
596 $self->read_and_merge($type);
599 return $self->{
'cache'}->{$type}->{$name} || {};
603 sub get_count_by_name {
# Returns the number of keys held in the cache $name for the given $type.
608 throw(
"You must provide a cache name (e.g. genes_by_id.") unless $name;
609 throw(
"You must provide a cache type.") unless $type;
611 # transparently load cache from file unless already loaded
612 unless ($self->{
'instance'}->{
'loaded'}->{$type}) {
613 $self->read_and_merge($type);
# the result of get_by_name() is dereferenced as a hash, so it has to be a
# hash reference
616 return scalar(keys %{ $self->get_by_name($name, $type) });
620 sub find_common_coord_systems {
# Compares the default coord systems of the source and target databases,
# registers every compatible pair through add_common_cs() and returns
# $found_highest.
623 # get adaptors for source db
624 my $s_dba = $self->get_DBAdaptor(
'source');
625 my $s_csa = $s_dba->get_CoordSystemAdaptor;
626 my $s_sa = $s_dba->get_SliceAdaptor;
628 # get adaptors for target db
629 my $t_dba = $self->get_DBAdaptor(
'target');
630 my $t_csa = $t_dba->get_CoordSystemAdaptor;
631 my $t_sa = $t_dba->get_SliceAdaptor;
633 # find common coord_systems
# presumably fetch_all returns coord systems ordered by rank, which is what
# makes the first match the highest one - TODO confirm
634 my @s_coord_systems = @{ $s_csa->fetch_all };
635 my @t_coord_systems = @{ $t_csa->fetch_all };
636 my $found_highest = 0;
639 foreach my $s_cs (@s_coord_systems) {
# non-default coord systems are skipped on both sides
640 if ( !$s_cs->is_default() ) { next SOURCE }
643 foreach my $t_cs (@t_coord_systems) {
644 if ( !$t_cs->is_default() ) { next TARGET }
646 if ( $s_cs->name eq $t_cs->name ) {
648 # test for identical coord_system version
# the version comparison only applies when the source version is a true value
649 if ( $s_cs->version and ( $s_cs->version ne $t_cs->version ) ) {
653 # test for at least 50% identical seq_regions
654 if ( $self->seq_regions_compatible( $s_cs, $s_sa, $t_sa ) ) {
655 $self->add_common_cs($s_cs);
# name and version of the highest common coord system are recorded under
# the $found_highest guard
657 unless ($found_highest) {
658 $self->highest_common_cs( $s_cs->name );
659 $self->highest_common_cs_version( $s_cs->version );
667 } ## end
foreach my $t_cs (@t_coord_systems)
668 } ## end
foreach my $s_cs (@s_coord_systems)
670 return $found_highest;
671 } ## end sub find_common_coord_systems
674 sub seq_regions_compatible {
# Decides whether the seq_regions of one coord system agree well enough
# between the source and target SliceAdaptor.
680 unless ($cs and $cs->isa(
'Bio::EnsEMBL::CoordSystem')) {
681 throw(
'You must provide a CoordSystem');
684 unless ($s_sa and $t_sa and $s_sa->isa(
'Bio::EnsEMBL::DBSQL::SliceAdaptor')
685 and $t_sa->isa(
'Bio::EnsEMBL::DBSQL::SliceAdaptor')) {
686 throw(
'You must provide a source and target SliceAdaptor');
# both adaptors are queried with the name and version of the same $cs
692 my $s_seq_regions = $s_sa->fetch_all($cs->name, $cs->version);
693 my $t_seq_regions = $t_sa->fetch_all($cs->name, $cs->version);
695 # sanity check to prevent division by zero
696 my $s_count = scalar(@$s_seq_regions);
697 my $t_count = scalar(@$t_seq_regions);
698 return(0)
if ($s_count == 0 or $t_count == 0);
# index source seq_region lengths by name for the lookup in the target loop
700 foreach my $s_sr (@$s_seq_regions) {
701 $sr_match{$s_sr->seq_region_name} = $s_sr->length;
704 foreach my $t_sr (@$t_seq_regions) {
705 if (exists($sr_match{$t_sr->seq_region_name})) {
708 # return false if we have a region with same name but different length
709 return(0) unless ($sr_match{$t_sr->seq_region_name} == $t_sr->length);
# the threshold is strict: more than half of the source count and more than
# half of the target count
713 if ($equal/$s_count > 0.5 and $equal/$t_count > 0.5) {
716 $self->logger->info(
"Only $equal seq_regions identical for ".$cs->name.
" ".$cs->version.
"\n");
723 sub check_db_connection {
# Checks that a DBAdaptor for $dbtype can be obtained; a failure is logged
# as a warning that includes the captured $@.
730 my $dba = $self->get_DBAdaptor($dbtype);
735 $self->logger->warning(
"Can't connect to $dbtype db: $@\n");
738 $self->logger->debug(
"Connection to $dbtype db ok.\n");
# remember the success; the other check_* methods test this flag before
# they do any work
739 $self->{
'_db_conn_ok'}->{$dbtype} = 1;
746 sub check_db_read_permissions {
# Checks that the privileges reported by get_db_privs() for $dbtype include
# SELECT or ALL PRIVILEGES.
750 # skip this check if db connection failed (this prevents re-throwing
# note that the skip path returns 1, not a failure value
752 return 1 unless ($self->{
'_db_conn_ok'}->{$dbtype});
755 my %privs = %{ $self->get_db_privs($dbtype) };
757 unless ($privs{
'SELECT'} or $privs{
'ALL PRIVILEGES'}) {
758 $self->logger->warning(
"User doesn't have read permission on $dbtype db.\n");
761 $self->logger->debug(
"Read permission on $dbtype db ok.\n");
768 sub check_db_write_permissions {
# Checks that the privileges reported by get_db_privs() for $dbtype include
# INSERT or ALL PRIVILEGES.
772 # skip this check if db connection failed (this prevents re-throwing
774 return 1 unless ($self->{
'_db_conn_ok'}->{$dbtype});
# write permission is only needed when do_upload() is true
778 unless ($self->do_upload) {
779 $self->logger->debug(
"No uploads, so write permission on $dbtype db not required.\n");
783 my %privs = %{ $self->get_db_privs($dbtype) };
785 unless ($privs{
'INSERT'} or $privs{
'ALL PRIVILEGES'}) {
786 $self->logger->warning(
"User doesn't have write permission on $dbtype db.\n");
789 $self->logger->debug(
"Write permission on $dbtype db ok.\n");
799 if ($self->conf->param(
'dry_run') or
800 ! ($self->conf->param(
'upload_events') or
801 $self->conf->param(
'upload_stable_ids') or
802 $self->conf->param(
'upload_archive'))) {
811 my ( $self, $dbtype ) = @_;
816 # get privileges from mysql db
818 my $dbc = $self->get_DBAdaptor($dbtype)->dbc();
819 my $sql = qq(SHOW GRANTS FOR ) . $dbc->username();
820 my $sth = $dbc->prepare($sql);
822 $rs = $sth->fetchall_arrayref();
827 $self->logger->warning(
828 "Error obtaining privileges from $dbtype db: $@\n");
833 foreach my $r (
map { $_->[0] } @{$rs} ) {
834 $r =~ s/GRANT (.*) ON .*/$1/i;
835 foreach my $p ( split( ',', $r ) ) {
836 # trim leading and trailing whitespace
839 $privs{ uc($p) } = 1;
844 } ## end sub get_db_privs
847 sub check_empty_tables {
# Counts rows in the stable ID and archive tables of the $dbtype database
# and logs a warning for each table that is reported with a row count.
851 # skip this check if db connection failed (this prevents re-throwing
853 return 1 unless ($self->{
'_db_conn_ok'}->{$dbtype});
858 if ($self->conf->param(
'no_check_empty_tables') or !$self->do_upload) {
859 $self->logger->debug(
"Won't check for empty stable ID and archive tables in $dbtype db.\n");
868 translation_stable_id
876 my $dba = $self->get_DBAdaptor($dbtype);
877 foreach my $table (@tables) {
# tables named <something>_stable_id are counted by non-NULL stable_id;
# every other table gets a plain row count
878 if ( $table =~ /^([^_]+)_stable_id/ ) {
881 $self->fetch_value_from_db(
883 "SELECT COUNT(*) FROM $table WHERE stable_id IS NOT NULL"
886 $self->logger->warning(
887 "$table table in $dbtype db has $c stable IDs.\n");
893 $self->fetch_value_from_db(
894 $dba,
"SELECT COUNT(*) FROM $table"
897 $self->logger->warning(
898 "$table table in $dbtype db has $c entries.\n");
902 } ## end
foreach my $table (@tables)
906 $self->logger->warning(
907 "Error retrieving stable ID and archive table row counts from $dbtype db: $@\n"
912 $self->logger->debug(
913 "All stable ID and archive tables in $dbtype db are empty.\n");
920 my ( $self, $dbtype ) = @_;
922 # skip this check if db connection failed (this prevents re-throwing
924 return 1 unless ( $self->{
'_db_conn_ok'}->{$dbtype} );
930 my $dba = $self->get_DBAdaptor($dbtype);
932 $self->fetch_value_from_db(
933 $dba->dnadb(),
"SELECT COUNT(*) FROM dna"
941 $self->logger->warning(
"Error retrieving dna table row count "
942 .
"from $dbtype database: $@\n" );
945 $self->logger->warning(
"No sequence found in $dbtype database.\n");
947 $self->logger->debug(
948 ucfirst($dbtype) .
" db has sequence ($c entries).\n" );
952 } ## end sub check_sequence
955 sub check_meta_entries {
# Checks that the meta table of the $dbtype database has values for the
# assembly.default and schema_version keys.
959 # skip this check if db connection failed (this prevents re-throwing
961 return 1 unless ($self->{
'_db_conn_ok'}->{$dbtype});
964 my $assembly_default;
968 my $dba = $self->get_DBAdaptor($dbtype);
969 $assembly_default = $self->fetch_value_from_db($dba,
970 qq(SELECT meta_value FROM meta WHERE meta_key =
'assembly.default'));
971 $schema_version = $self->fetch_value_from_db($dba,
972 qq(SELECT meta_value FROM meta WHERE meta_key =
'schema_version'));
# NOTE(review): the warning text below mentions the dna table row count,
# but the queries in this sub read the meta table - the message looks
# copied from the sequence check and should probably name the meta entries
976 $self->logger->warning(
"Error retrieving dna table row count from $dbtype db: $@\n");
# both values are tested for truth, so an empty string or 0 counts as missing
980 unless ($assembly_default) {
981 $self->logger->warning(
"No meta.assembly.default value found in $dbtype db.\n");
984 $self->logger->debug(
"meta.assembly.default value found ($assembly_default).\n");
987 unless ($schema_version) {
988 $self->logger->warning(
"No meta.schema_version value found in $dbtype db.\n");
991 $self->logger->debug(
"meta.schema_version value found ($schema_version).\n");
998 sub fetch_value_from_db {
# Prepares the given SQL on the connection of $dba and reads a single value
# from the result.
999 my ( $self, $dba, $sql ) = @_;
# $dba must be a Bio::EnsEMBL::DBSQL::DBAdaptor and $sql must be defined
1001 assert_ref( $dba,
'Bio::EnsEMBL::DBSQL::DBAdaptor' );
1003 if ( !defined($sql) ) {
1004 throw(
"Need an SQL statement to execute.\n");
1007 my $sth = $dba->dbc->prepare($sql);
# the list assignment keeps only the first column of the fetched row
1010 my ($c) = $sth->fetchrow_array;
1015 my ( $self, $prefix ) = @_;
1017 unless ( $self->{
'_dba'}->{$prefix} ) {
1018 # connect to database
1021 -host => $self->conf->param(
"${prefix}host"),
1022 -port => $self->conf->param(
"${prefix}port"),
1023 -user => $self->conf->param(
"${prefix}user"),
1024 -pass => $self->conf->param(
"${prefix}pass"),
1025 -dbname => $self->conf->param(
"${prefix}dbname"),
1026 -group => $prefix, );
1028 if ( !defined( $self->conf->param(
"${prefix}host_dna") ) ) {
1029 # explicitly set the dnadb to itself - by default the Registry
1030 # assumes a group 'core' for this now
1035 -host => $self->conf->param(
"${prefix}host_dna"),
1036 -port => $self->conf->param(
"${prefix}port_dna"),
1037 -user => $self->conf->param(
"${prefix}user_dna"),
1038 -pass => $self->conf->param(
"${prefix}pass_dna"),
1039 -dbname => $self->conf->param(
"${prefix}dbname_dna"),
1040 -group => $prefix, );
1041 $dba->dnadb($dna_dba);
1044 $self->{
'_dba'}->{$prefix} = $dba;
1045 } ## end unless ( $self->{
'_dba'}->...)
1047 return $self->{
'_dba'}->{$prefix};
1048 } ## end sub get_DBAdaptor
1051 sub get_production_DBAdaptor() {
# Connection settings are read from the productionhost, productionport,
# productionuser, productionpass and productiondbname configuration
# parameters.
# NOTE(review): the empty prototype on the sub line has no effect on method
# calls and could be dropped - confirm nothing calls this as a plain function
1054 -host => $self->conf->param(
"productionhost"),
1055 -port => $self->conf->param(
"productionport"),
1056 -user => $self->conf->param(
"productionuser"),
1057 -pass => $self->conf->param(
"productionpass"),
1058 -dbname => $self->conf->param(
"productiondbname"));
1063 sub cache_file_exists {
# Reports through the logger whether the cache file for $type is present
# on disk.
1067 throw(
"You must provide a cache type.") unless $type;
1069 my $cache_file = $self->cache_file($type);
# NOTE(review): presence is tested with -e here while read_from_file()
# tests -s, so an empty file counts as found here but is rejected on read
1071 if (-e $cache_file) {
1072 $self->logger->info(
"Cache file found for $type.\n", 2);
1073 $self->logger->debug(
"Will read from $cache_file.\n", 2);
1076 $self->logger->info(
"No cache file found for $type.\n", 2);
1077 $self->logger->info(
"Will build cache from db.\n", 2);
1087 throw(
"You must provide a cache type.") unless $type;
1089 return $self->dump_path.
"/$type.object_cache.ser";
1096 return $self->dump_path.
"/cache_instance.ser";
1103 $self->{
'dump_path'} ||= path_append($self->conf->param(
'basedir'),
'cache');
1105 return $self->{
'dump_path'};
1109 sub write_all_to_file {
# Writes the object cache for $type and the instance data, and returns the
# combined size after it has been passed through parse_bytes().
1113 throw(
"You must provide a cache type.") unless $type;
# $size accumulates the return values of both writers
1116 $size += $self->write_to_file($type);
1117 $size += $self->write_instance_to_file;
1119 return parse_bytes($size);
1127 throw(
"You must provide a cache type.") unless $type;
1129 unless ($self->{
'cache'}->{$type}) {
1130 $self->logger->warning(
"No features found in $type. Won't write cache file.\n");
1134 my $cache_file = $self->cache_file($type);
1136 eval { nstore($self->{
'cache'}->{$type}, $cache_file) };
1138 throw(
"Unable to store $cache_file: $@\n");
1141 my $size = -s $cache_file;
1146 sub write_instance_to_file {
# Serialises the instance hash of this object with nstore() to the path
# given by instance_file().
1149 my $instance_file = $self->instance_file;
# nstore runs inside eval so that a failure can be reported through throw()
1151 eval { nstore($self->{
'instance'}, $instance_file) };
1153 throw(
"Unable to store $instance_file: $@\n");
# size in bytes of the file just written
1156 my $size = -s $instance_file;
1161 sub read_from_file {
# Loads the serialised cache for $type from its cache file into the cache
# hash and returns the entry for $type.
1165 throw(
"You must provide a cache type.") unless $type;
1167 my $cache_file = $self->cache_file($type);
# -s is true only for an existing file of non-zero size
1169 if (-s $cache_file) {
1171 #$self->logger->info("Reading cache from file...\n", 0, 'stamped');
1172 #$self->logger->info("Cache file $cache_file.\n", 1);
1173 eval { $self->{
'cache'}->{$type} = retrieve($cache_file); };
1175 throw(
"Unable to retrieve cache: $@");
1177 #$self->logger->info("Done.\n", 0, 'stamped');
1180 $self->logger->warning(
"Cache file $cache_file not found or empty.\n");
# the returned slot is whatever the cache hash holds for $type at this point
1184 return $self->{
'cache'}->{$type};
1188 sub read_and_merge {
# Reads the cache file or files for $dbtype according to the stored cache
# method, merges them through merge() and flags $dbtype as loaded.
1192 unless ($dbtype eq
'source' or $dbtype eq
'target') {
1193 throw(
"Db type must be 'source' or 'target'.");
1196 # read cache from single or multiple files, depending on caching strategy
1197 my $cache_method = $self->cache_method;
1198 if ($cache_method eq
'ALL') {
1199 $self->read_from_file(
"$dbtype.ALL");
1200 } elsif ($cache_method eq
'BY_SEQ_REGION') {
1201 foreach my $slice_name (@{ $self->slice_names($dbtype) }) {
1202 $self->read_from_file(
"$dbtype.$slice_name");
1205 throw(
"Unknown cache method: $cache_method.");
1208 $self->merge($dbtype);
1210 # flag as being loaded
1211 $self->{
'instance'}->{
'loaded'}->{$dbtype} = 1;
1219 unless ($dbtype eq
'source' or $dbtype eq
'target') {
1220 throw(
"Db type must be 'source' or 'target'.");
1223 foreach my $type (keys %{ $self->{
'cache'} || {} }) {
1224 next unless ($type =~ /^$dbtype/);
1226 foreach my $name (keys %{ $self->{
'cache'}->{$type} || {} }) {
1228 foreach my $key (keys %{ $self->{
'cache'}->{$type}->{$name} || {} }) {
1229 if (defined $self->{
'cache'}->{$dbtype}->{$name}->{$key}) {
1230 # warning("Duplicate key in cache: $name|$dbtype|$key. Skipping.\n");
1232 $self->{
'cache'}->{$dbtype}->{$name}->{$key} =
1233 $self->{
'cache'}->{$type}->{$name}->{$key};
1236 delete $self->{
'cache'}->{$type}->{$name}->{$key};
1239 delete $self->{
'cache'}->{$type}->{$name};
1242 delete $self->{
'cache'}->{$type};
1248 sub read_instance_from_file {
# Restores the instance hash from the file named by instance_file() and
# returns it.
1251 my $instance_file = $self->instance_file;
# a missing or zero-size instance file is reported through throw()
1253 unless (-s $instance_file) {
1254 throw(
"No valid cache instance file found at $instance_file.");
1257 eval { $self->{
'instance'} = retrieve($instance_file); };
1259 throw(
"Unable to retrieve cache instance: $@");
1262 return $self->{
'instance'};
1270 throw(
"You must provide a db type (source|target).") unless $dbtype;
1272 my $dba = $self->get_DBAdaptor($dbtype);
1273 my $sa = $dba->get_SliceAdaptor;
1275 my @slice_names = ();
1277 if ( $self->conf->param(
'chromosomes') ) {
1278 # Fetch the specified chromosomes.
1279 foreach my $chr ( $self->conf->param(
'chromosomes') ) {
1280 my $slice = $sa->fetch_by_region(
'chromosome', $chr );
1281 push @slice_names, $slice->name;
1285 elsif ( $self->conf->param(
'region') ) {
1286 # Fetch the slices on the specified regions. Don't use
1287 # SliceAdaptor->fetch_by_name() since this will fail if assembly
1288 # versions are different for source and target db.
1289 my ( $cs, $version, $name, $start, $end, $strand ) =
1290 split( /:/, $self->conf->param(
'region') );
1292 my $slice = $sa->fetch_by_region( $cs, $name, $start, $end );
1294 push @slice_names, $slice->name;
1298 # Fetch all slices that have genes on them.
1299 my $ga = $dba->get_GeneAdaptor;
1300 my $sa = $dba->get_SliceAdaptor;
1302 foreach my $srid ( @{ $ga->list_seq_region_ids } ) {
1303 my $slice = $sa->fetch_by_seq_region_id($srid);
1304 my $slices = $sa->fetch_by_region_unique( $slice->coord_system_name(), $slice->seq_region_name() );
1306 push( @slice_names,
map { $_->name() } @{$slices} );
1310 return \@slice_names;
1311 } ## end sub slice_names
1316 $self->{
'logger'} = shift
if (@_);
1317 return $self->{
'logger'};
1322 $self->{
'conf'} = shift
if (@_);
1323 return $self->{
'conf'};
1329 $self->{
'instance'}->{
'cache_method'} = shift
if (@_);
1330 return $self->{
'instance'}->{
'cache_method'};
1334 sub highest_common_cs {
# Combined getter and setter: stores the argument under the hccs key of the
# instance hash when one is passed, and always returns the stored value.
# The instance hash is what write_instance_to_file() serialises.
1336 $self->{
'instance'}->{
'hccs'} = shift
if (@_);
1337 return $self->{
'instance'}->{
'hccs'};
1341 sub highest_common_cs_version {
# Combined getter and setter: stores the argument under the hccsv key of the
# instance hash when one is passed, and always returns the stored value.
1343 $self->{
'instance'}->{
'hccsv'} = shift
if (@_);
1344 return $self->{
'instance'}->{
'hccsv'};
1352 unless ($cs and $cs->isa(
'Bio::EnsEMBL::CoordSystem')) {
1353 throw(
'You must provide a CoordSystem');
1356 my $csid = join(
':', $cs->name, $cs->version);
1358 $self->{
'instance'}->{
'ccs'}->{$csid} = 1;
1366 return $self->{
'instance'}->{
'ccs'}->{$csid};