3 See the NOTICE file distributed with
this work
for additional information
4 regarding copyright ownership.
6 Licensed under the Apache License, Version 2.0 (the
"License");
7 you may not use
this file except in compliance with the License.
8 You may obtain a copy of the License at
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an
"AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License
for the specific language governing permissions and
16 limitations under the License.
23 Please email comments or questions to the
public Ensembl
24 developers list at <http:
26 Questions may also be sent to the Ensembl help desk at
37 my $dfa = $database_adaptor->get_DensityFeatureAdaptor();
40 my $blocks_wanted = 50;
49 Density
Feature Adaptor - An adaptor responsible
for the creation of density
50 features from the database.
56 package Bio::EnsEMBL::DBSQL::DensityFeatureAdaptor;
71 our $DENSITY_FEATURE_CACHE_SIZE = 20;
76 Arg [1] : list of args @args
77 Superclass constructor arguments
79 Description: Constructor which just initializes
internal cache structures
82 Caller : implementing subclass constructors
90 my $class = ref($caller) || $caller;
92 my $self = $class->SUPER::new(@_);
94 #initialize an LRU cache
96 tie(%cache,
'Bio::EnsEMBL::Utils::Cache', $DENSITY_FEATURE_CACHE_SIZE);
97 $self->{
'_density_feature_cache'} = \%cache;
102 =head2 fetch_all_by_Slice
105 to retrieve density features from.
106 Arg [2] :
string $logic_name - The logic name of the density features to
108 Arg [3] :
int $num_blocks (optional;
default = 50) - The number of
109 features that are desired. The ratio between the size of these
110 features and the size of the features in the database will be
111 used to determine which database features will be used.
112 Arg [4] :
boolean $interpolate (optional;
default = 0) - A flag indicating
113 whether the features in the database should be interpolated to
114 fit them to the requested number of features. If
true the
115 features will be interpolated to provide $num_blocks features.
116 This will not guarantee that exactly $num_blocks features are
117 returned due to rounding etc. but it will be close.
118 Arg [5] :
float $max_ratio - The maximum ratio between the size of the
119 requested features (as determined by $num_blocks) and the actual
120 size of the features in the database. If
this value is exceeded
121 then an empty list will be returned. This can be used to
122 prevent excessive interpolation of the database values.
123 Example : #interpolate:
124 $feats = $dfa->fetch_all_by_Slice($slice,
'SNPDensity', 10, 1);
125 #do not interpoloate, get what is in the database:
126 $feats = $dfa->fetch_all_by_Slice($slice,
'SNPDensity', 50);
127 #interpolate, but not too much
128 $feats = $dfa->fetch_all_by_Slice($slice,
'SNPDensity',50,1,5.0);
129 Description: Retrieves a set of density features which overlap the region
130 of
this slice. Density features are a discrete representation
131 of a continuous value along a sequence, such as a density or
132 percent coverage. Density Features may be stored in chunks of
133 different sizes in the database, and interpolated to fit the
134 desired size
for the requested region. For example the database
135 may store a single density value
for each 1KB and also
for each
136 1MB. When fetching
for an entire chromosome the 1MB density
137 chunks will be used
if the requested number of blocks is not
139 Note that features which have been interpolated are not stored
140 in the database and as such will have no dbID or adaptor set.
142 Exceptions : warning on invalid $num_blocks argument
143 warning
if no features with logic_name $logic_name exist
144 warning
if density_type table has invalid block_size value
150 sub fetch_all_by_Slice {
151 my ($self, $slice, $logic_name, $num_blocks, $interpolate, $max_ratio) = @_;
153 if(defined($num_blocks) && $num_blocks < 1) {
154 warning(
"Invalid number of density blocks [$num_blocks] requested.\n" .
155 "Returning empty list.");
160 my $length = $slice->length();
162 my $wanted_block_size = POSIX::ceil($length/$num_blocks);
165 # get out all of the density types and choose the one with the
166 # block size closest to our desired block size
169 my $dta = $self->db()->get_DensityTypeAdaptor();
171 my @dtypes = @{$dta->fetch_all_by_logic_name($logic_name)};
173 my @all_dtypes = @{ $dta->fetch_all() };
174 @all_dtypes or warning(
"No DensityTypes in $dta" ) &&
return [];
175 my $valid_list = join(
", ",
map{$_->analysis->logic_name} @all_dtypes );
176 warning(
"No DensityTypes for logic name $logic_name. ".
177 "Select from $valid_list" );
181 my $best_ratio = undef;
182 my $density_type = undef;
183 my $best_ratio_large = undef;
184 my $density_type_large = undef;
187 foreach my $dt (@dtypes) {
190 if( $dt->block_size() > 0 ) {
191 $ratio = $wanted_block_size/$dt->block_size();
193 # This is only valid if the requested seq_region is the one the
194 # features are stored on. Please use sensibly. Or find better implementation.
196 my $block_size = $slice->seq_region_length() / $dt->region_features();
197 $ratio = $wanted_block_size / $block_size;
200 # we prefer to use a block size that's smaller than the required one
201 # (better results on interpolation).
202 # give larger bits a disadvantage and make them comparable
207 $dt_ratio_hash{ $ratio } = $dt;
210 $best_ratio = (sort {$a<=>$b} (keys %dt_ratio_hash))[0];
212 #the ratio was not good enough, or this logic name was not in the
213 #density type table, return an empty list
214 if(!defined($best_ratio) ||
215 (defined($max_ratio) && $best_ratio > $max_ratio)) {
219 $density_type = $dt_ratio_hash{$best_ratio};
221 my $constraint =
"df.density_type_id = " . $density_type->dbID();
224 @{$self->fetch_all_by_Slice_constraint($slice,$constraint)};
226 return \@features
if(!$interpolate);
228 #interpolate the features into new features of a different size
230 #sort the features on start position
231 @features = sort( { $a->start() <=> $b->start() } @features );
233 #resize the features that were returned
235 my $end = $start+$wanted_block_size-1;
237 my $value_type = $density_type->value_type();
239 # create a new density type object for the interpolated features that
240 # is not stored in the database
242 (-VALUE_TYPE => $value_type,
243 -ANALYSIS => $density_type->analysis(),
244 -BLOCK_SIZE => $wanted_block_size);
246 while($start < $length) {
247 # $end = $length if($end > $length);
249 my $density_value = 0.0;
250 my ($f, $fstart, $fend, $portion);
253 #construct a new feature using all of the old density features that
255 while(($f = shift(@features)) && $end > $f->{
'start'}) {
257 #what portion of this feature are we using to construct our new block?
258 $fstart = ($f->{
'start'} < $start) ? $start : $f->{
'start'};
259 $fend = ($f->{
'end'} > $end ) ? $end : $f->{
'end'};
260 $fend = $length
if($fend > $length);
262 if($value_type eq
'sum') {
264 $portion = ($fend-$fstart+1)/$f->length();
266 #take a percentage of density value, depending on how much of the
267 #feature we overlapped
268 $density_value += $portion * $f->{
'density_value'};
270 } elsif($value_type eq
'ratio') {
272 #maintain a running total of the length used from each feature
274 push(@dvalues,[$fend-$fstart+1,$f->{
'density_value'}]);
277 throw(
"Unknown density value type [$value_type].");
280 #do not want to look at next feature if we only used part of this one:
281 last
if($fend < $f->{
'end'});
284 #if we did not completely overlap the last feature, put it back on so
285 #it can be partially used by the next block
286 if(defined($f) && (!defined($fend) || $fend < $f->{
'end'})) {
287 unshift(@features, $f);
290 if($value_type eq
'ratio') {
291 #take a weighted average of the all the density values of the features
292 #used to construct this one
293 my $total_len = $end - $start + 1;
295 foreach my $pair (@dvalues) {
296 my ($dlen, $dval) = @$pair;
297 $density_value += $dval * ($dlen/$total_len);
302 # interpolated features are not stored in the db so do not set the dbID
305 (-seq_region => $slice,
308 -density_type => $density_type,
309 -density_value => $density_value);
312 $end += $wanted_block_size;
321 my $logic_name = shift;
323 my ($sth, $seq_region_id, $start, $end, $density_value, $density_type_id);
324 my ($slice, $density_type, @out);
325 my $sa = $self->db()->get_SliceAdaptor();
326 my $dta = $self->db()->get_DensityTypeAdaptor();
329 $sth = $self->prepare(
"SELECT df.seq_region_id, df.seq_region_start, df.seq_region_end, df.density_value, df.density_type_id "
330 .
"FROM density_feature df, density_type dt, analysis a "
331 .
"WHERE df.density_type_id = dt.density_type_id "
332 .
"AND dt.analysis_id = a.analysis_id "
333 .
"AND logic_name = ?" );
334 $sth->bind_param(1, $logic_name, SQL_VARCHAR);
337 $sth = $self->prepare(
"SELECT df.seq_region_id, df.seq_region_start, df.seq_region_end, df.density_value, df.density_type_id "
338 .
"FROM density_feature df, density_type dt "
339 .
"WHERE df.density_type_id = dt.density_type_id" );
344 $sth->bind_columns(\($seq_region_id, $start, $end, $density_value, $density_type_id));
347 while ($sth->fetch()) {
348 $slice = $sa->fetch_by_seq_region_id($seq_region_id);
349 $density_type = $dta->fetch_by_dbID($density_type_id);
351 (-seq_region => $slice,
354 -density_type => $density_type,
355 -density_value => $density_value));
361 =head2 fetch_all_by_Slice_constraint
364 the slice from which to obtain features
365 Arg [2] : (optional)
string $constraint
366 An SQL query constraint (i.e. part of the WHERE clause)
367 Example : $fs = $a->fetch_all_by_Slice_constraint($slc,
'density_type_id = 88');
368 Description: Returns a listref of features created from the database which
369 are on the Slice defined by $slice and fulfill the SQL
370 constraint defined by $constraint.
371 Note that
this is a re-implementation of a method with the same name
372 in the BaseFeatureAdaptor.
373 This was necessary to remove the use of symliked sequences
for density features
374 Returntype : listref of Bio::EnsEMBL::SeqFeatures in Slice coordinates
375 Exceptions : thrown
if $slice is not defined
382 sub fetch_all_by_Slice_constraint {
385 my $constraint = shift;
389 #Pull out here as we need to remember them & reset accordingly
390 my $bind_params = $self->bind_param_generic_fetch();
393 || !( $slice->isa(
'Bio::EnsEMBL::Slice')
394 or $slice->isa(
'Bio::EnsEMBL::LRGSlice') ) )
396 throw(
"Bio::EnsEMBL::Slice argument expected.");
401 # If the logic name was invalid, undef was returned
402 if ( !defined($constraint) ) {
return [] }
407 # Will only use feature_cache if hasn't been no_cache attribute set
409 !( defined( $self->db()->no_cache() ) && $self->db()->no_cache() ) )
412 #strain test and add to constraint if so to stop caching.
413 if ( $slice->isa(
'Bio::EnsEMBL::Variation::StrainSlice') ) {
415 $self->dbc()->db_handle()->quote( $slice->strain_name() );
417 if ( $constraint ne
"" ) {
418 $constraint .=
" AND $string = $string ";
420 $constraint .=
" $string = $string ";
424 # Check the cache and return the cached results if we have already
425 # done this query. The cache key is the made up from the slice
426 # name, the constraint, and the bound parameters (if there are any).
427 $key = uc( join(
':', $slice->name(), $constraint ) );
429 if ( defined($bind_params) ) {
431 . join(
':',
map { $_->[0] .
'/' . $_->[1] } @{$bind_params} );
434 $cache = $self->_slice_feature_cache();
435 if ( exists( $cache->{$key} ) ) {
436 # Clear the bound parameters and return the cached data.
437 $self->{
'_bind_param_generic_fetch'} = ();
438 return $cache->{$key};
440 } ## end
if ( !( defined( $self...)))
442 $self->_bind_param_generic_fetch($bind_params);
443 my $features = $self->_slice_fetch( $slice, $constraint );
445 if ( defined($key) ) {
446 $cache->{$key} = $features;
456 return ([
'density_feature',
'df']);
463 return qw( df.density_feature_id
464 df.seq_region_id df.seq_region_start df.seq_region_end
465 df.density_value df.density_type_id );
471 my ($self, $sth, $mapper, $dest_slice) = @_;
474 # This code is ugly because an attempt has been made to remove as many
475 # function calls as possible for speed purposes. Thus many caches,
476 # and a fair bit of gymnastics have been used.
479 my $sa = $self->db()->get_SliceAdaptor();
480 my $dta = $self->db()->get_DensityTypeAdaptor();
489 $density_feature_id, $seq_region_id, $seq_region_start,
490 $seq_region_end, $density_value, $density_type_id );
492 $sth->bind_columns(\(
493 $density_feature_id, $seq_region_id, $seq_region_start,
494 $seq_region_end, $density_value, $density_type_id));
496 my $dest_slice_start;
498 my $dest_slice_strand;
499 my $dest_slice_length;
501 my $dest_slice_sr_name;
502 my $dest_slice_sr_id;
506 $dest_slice_start = $dest_slice->start();
507 $dest_slice_end = $dest_slice->end();
508 $dest_slice_strand = $dest_slice->strand();
509 $dest_slice_length = $dest_slice->length();
510 $dest_slice_cs = $dest_slice->coord_system();
511 $dest_slice_sr_name = $dest_slice->seq_region_name();
512 $dest_slice_sr_id = $dest_slice->get_seq_region_id();
513 $asma = $self->db->get_AssemblyMapperAdaptor();
516 FEATURE:
while($sth->fetch()) {
518 #get the density type object
519 my $density_type = $dtype_hash{$density_type_id} ||= $dta->fetch_by_dbID($density_type_id);
521 #need to get the internal_seq_region, if present
522 $seq_region_id = $self->get_seq_region_id_internal($seq_region_id);
523 my $slice = $slice_hash{
"ID:".$seq_region_id};
526 $slice = $sa->fetch_by_seq_region_id($seq_region_id);
527 $slice_hash{
"ID:".$seq_region_id} = $slice;
528 $sr_name_hash{$seq_region_id} = $slice->seq_region_name();
529 $sr_cs_hash{$seq_region_id} = $slice->coord_system();
532 #obtain a mapper if none was defined, but a dest_seq_region was
533 if(!$mapper && $dest_slice && !$dest_slice_cs->equals($slice->coord_system)) {
534 $mapper = $asma->fetch_by_CoordSystems($dest_slice_cs, $slice->coord_system);
537 my $sr_name = $sr_name_hash{$seq_region_id};
538 my $sr_cs = $sr_cs_hash{$seq_region_id};
541 # remap the feature coordinates to another coord system
542 # if a mapper was provided
547 my $len = $seq_region_end - $seq_region_start + 1;
549 if (defined $dest_slice && $mapper->isa(
'Bio::EnsEMBL::ChainedAssemblyMapper') ) {
550 ($seq_region_id, $seq_region_start, $seq_region_end) =
551 $mapper->map( $sr_name, $seq_region_start, $seq_region_end, 1, $sr_cs, 0, $dest_slice);
554 ($seq_region_id, $seq_region_start, $seq_region_end) =
555 $mapper->map($sr_name, $seq_region_start, $seq_region_end, 1, $sr_cs);
558 #skip features that map to gaps or coord system boundaries
559 next FEATURE
if (!defined($seq_region_id));
561 if($density_type->value_type() eq
'sum') {
562 #adjust density value so it reflects length of feature actually used
563 my $newlen = $seq_region_end - $seq_region_start +1;
564 $density_value *= $newlen/$len
if($newlen != $len);
567 #get a slice in the coord system we just mapped to
568 $slice = $slice_hash{
"ID:".$seq_region_id} ||= $sa->fetch_by_seq_region_id($seq_region_id);
572 # If a destination slice was provided convert the coords.
574 if (defined($dest_slice)) {
575 my $seq_region_len = $dest_slice->seq_region_length();
577 if ( $dest_slice_strand == 1 ) {
578 $seq_region_start = $seq_region_start - $dest_slice_start + 1;
579 $seq_region_end = $seq_region_end - $dest_slice_start + 1;
581 if ( $dest_slice->is_circular ) {
582 # Handle circular chromosomes.
584 if ( $seq_region_start > $seq_region_end ) {
585 # Looking at a feature overlapping the chromosome origin.
587 if ( $seq_region_end > $dest_slice_start ) {
588 # Looking at the region in the beginning of the chromosome
589 $seq_region_start -= $seq_region_len;
591 if ( $seq_region_end < 0 ) {
592 $seq_region_end += $seq_region_len;
595 if ($dest_slice_start > $dest_slice_end && $seq_region_end < 0) {
596 # Looking at the region overlapping the chromosome
597 # origin and a feature which is at the beginning of the
599 $seq_region_start += $seq_region_len;
600 $seq_region_end += $seq_region_len;
606 my $start = $dest_slice_end - $seq_region_end + 1;
607 my $end = $dest_slice_end - $seq_region_start + 1;
609 if ($dest_slice->is_circular()) {
611 if ($dest_slice_start > $dest_slice_end) {
612 # slice spans origin or replication
614 if ($seq_region_start >= $dest_slice_start) {
615 $end += $seq_region_len;
616 $start += $seq_region_len
if $seq_region_end > $dest_slice_start;
618 } elsif ($seq_region_start <= $dest_slice_end) {
620 } elsif ($seq_region_end >= $dest_slice_start) {
621 $start += $seq_region_len;
622 $end += $seq_region_len;
624 } elsif ($seq_region_end <= $dest_slice_end) {
625 $end += $seq_region_len
if $end < 0;
627 } elsif ($seq_region_start > $seq_region_end) {
628 $end += $seq_region_len;
633 if ($seq_region_start <= $dest_slice_end and $seq_region_end >= $dest_slice_start) {
635 } elsif ($seq_region_start > $seq_region_end) {
636 if ($seq_region_start <= $dest_slice_end) {
637 $start -= $seq_region_len;
638 } elsif ($seq_region_end >= $dest_slice_start) {
639 $end += $seq_region_len;
645 $seq_region_start = $start;
646 $seq_region_end = $end;
649 # Throw away features off the end of the requested slice or on
650 # different seq_region.
651 if ($seq_region_end < 1
652 || $seq_region_start > $dest_slice_length
653 || ($dest_slice_sr_id != $seq_region_id)) {
656 $slice = $dest_slice;
661 $self->_create_feature(
662 'Bio::EnsEMBL::DensityFeature', {
663 -dbID => $density_feature_id,
665 -start => $seq_region_start,
666 -end => $seq_region_end,
667 -seq_region => $slice,
668 -density_value => $density_value,
669 -density_type => $density_type
681 Example : @feature_ids = @{$density_feature_adaptor->list_dbIDs()};
682 Description: Gets an array of
internal ids
for all density features in the
684 Arg[1] : <optional>
int. not set to 0
for the ids to be sorted by the seq_region.
685 Returntype : list of ints
693 my ($self, $ordered) = @_;
695 return $self->_list_dbIDs(
"density_feature",undef, $ordered);
701 Arg [1] : list of Bio::EnsEMBL::DensityFeatures @df
702 the simple features to store in the database
703 Example : $density_feature_adaptor->store(1234, @density_feats);
704 Description: Stores a list of density feature objects in the database
706 Exceptions : thrown
if @df is not defined,
if any of the features
do not
707 have an attached slice.
708 or
if any elements of @df are not Bio::EnsEMBL::SeqFeatures
717 if( scalar(@df) == 0 ) {
718 throw(
"Must call store with list of DensityFeatures");
720 #mysql> desc density_feature;
721 #+--------------------+---------+------+-----+---------+----------------+
722 #| Field | Type | Null | Key | Default | Extra |
723 #+--------------------+---------+------+-----+---------+----------------+
724 #| density_feature_id | int(11) | | PRI | NULL | auto_increment |
725 #| density_type_id | int(11) | | MUL | 0 | |
726 #| seq_region_id | int(11) | | | 0 | |
727 #| seq_region_start | int(11) | | | 0 | |
728 #| seq_region_end | int(11) | | | 0 | |
729 #| density_value | float | | | 0 | |
730 #+--------------------+---------+------+-----+---------+----------------+
732 my $sth = $self->prepare
733 (
"INSERT INTO density_feature (seq_region_id, seq_region_start, " .
734 "seq_region_end, density_type_id, " .
736 "VALUES (?,?,?,?,?)");
738 my $db = $self->db();
739 my $analysis_adaptor = $db->get_AnalysisAdaptor();
741 FEATURE:
foreach my $df ( @df ) {
743 if( !ref $df || !$df->isa(
"Bio::EnsEMBL::DensityFeature") ) {
744 throw(
"DensityFeature must be an Ensembl DensityFeature, " .
745 "not a [".ref($df).
"]");
748 # we dont store 0 value density features
749 next
if( $df->density_value == 0 );
750 if($df->is_stored($db)) {
751 warning(
"DensityFeature [".$df->dbID.
"] is already stored" .
752 " in this database.");
756 if(!defined($df->density_type)) {
757 throw(
"A density type must be attached to the features to be stored.");
760 #store the density_type if it has not been stored yet
762 if(!$df->density_type->is_stored($db)) {
763 my $dta = $db->get_DensityTypeAdaptor();
764 $dta->store($df->density_type());
769 ($df, $seq_region_id) = $self->_pre_store($df);
771 $sth->bind_param(1,$seq_region_id,SQL_INTEGER);
772 $sth->bind_param(2,$df->start,SQL_INTEGER);
773 $sth->bind_param(3,$df->end,SQL_INTEGER);
774 $sth->bind_param(4,$df->density_type->dbID,SQL_INTEGER);
775 $sth->bind_param(5,$df->density_value,SQL_FLOAT);
778 $original->dbID($self->last_insert_id(
'density_feature_id', undef,
'density_feature'));
779 $original->adaptor($self);
783 =head2 fetch_Featureset_by_Slice
787 for argument documentation
788 Example : $featureset = $dfa->fetch_FeatureSet_by_Slice($slice,'SNPDensity', 10, 1);
789 Description: wrapper around
790 Bio::
EnsEMBL::DBSQL::DensityFeatureAdaptor::fetch_all_by_Slice()
791 which returns a
Bio::
EnsEMBL::DensityFeatureSet and also caches
800 sub fetch_Featureset_by_Slice {
801 my ($self, $slice, $logic_name, $num_blocks, $interpolate, $max_ratio) = @_;
803 my $key = join(
":", $slice->name,
809 unless ($self->{
'_density_feature_cache'}->{$key}) {
810 my $dfeats = $self->fetch_all_by_Slice($slice, $logic_name, $num_blocks,
811 $interpolate, $max_ratio);
812 $self->{
'_density_feature_cache'}->{$key} =
815 return $self->{
'_density_feature_cache'}->{$key};