3 See the NOTICE file distributed with
this work
for additional information
4 regarding copyright ownership.
6 Licensed under the Apache License, Version 2.0 (the
"License");
7 you may not use
this file except in compliance with the License.
8 You may obtain a copy of the License at
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an
"AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License
for the specific language governing permissions and
16 limitations under the License.
23 Please email comments or questions to the
public Ensembl
24 developers list at <http:
26 Questions may also be sent to the Ensembl help desk at
37 my $dfa = $database_adaptor->get_DensityFeatureAdaptor();
40 my $blocks_wanted = 50;
49 Density
Feature Adaptor - An adaptor responsible
for the creation of density
50 features from the database.
56 package Bio::EnsEMBL::DBSQL::DensityFeatureAdaptor;
72 our $DENSITY_FEATURE_CACHE_SIZE = 20;
77 Arg [1] : list of args @args
78 Superclass constructor arguments
80 Description: Constructor which just initializes
internal cache structures
83 Caller : implementing subclass constructors
91 my $class = ref($caller) || $caller;
93 my $self = $class->SUPER::new(@_);
95 #initialize an LRU cache
97 tie(%cache,
'Bio::EnsEMBL::Utils::Cache', $DENSITY_FEATURE_CACHE_SIZE);
98 $self->{
'_density_feature_cache'} = \%cache;
103 =head2 fetch_all_by_Slice
106 to retrieve density features from.
107 Arg [2] :
string $logic_name - The logic name of the density features to
109 Arg [3] :
int $num_blocks (optional;
default = 50) - The number of
110 features that are desired. The ratio between the size of these
111 features and the size of the features in the database will be
112 used to determine which database features will be used.
113 Arg [4] :
boolean $interpolate (optional;
default = 0) - A flag indicating
114 whether the features in the database should be interpolated to
115 fit them to the requested number of features. If
true the
116 features will be interpolated to provide $num_blocks features.
117 This will not guarantee that exactly $num_blocks features are
118 returned due to rounding etc. but it will be close.
119 Arg [5] :
float $max_ratio - The maximum ratio between the size of the
120 requested features (as determined by $num_blocks) and the actual
121 size of the features in the database. If
this value is exceeded
122 then an empty list will be returned. This can be used to
123 prevent excessive interpolation of the database values.
124 Example : #interpolate:
125 $feats = $dfa->fetch_all_by_Slice($slice,
'SNPDensity', 10, 1);
126 #do not interpoloate, get what is in the database:
127 $feats = $dfa->fetch_all_by_Slice($slice,
'SNPDensity', 50);
128 #interpolate, but not too much
129 $feats = $dfa->fetch_all_by_Slice($slice,
'SNPDensity',50,1,5.0);
130 Description: Retrieves a set of density features which overlap the region
131 of
this slice. Density features are a discrete representation
132 of a continuous value along a sequence, such as a density or
133 percent coverage. Density Features may be stored in chunks of
134 different sizes in the database, and interpolated to fit the
135 desired size
for the requested region. For example the database
136 may store a single density value
for each 1KB and also
for each
137 1MB. When fetching
for an entire chromosome the 1MB density
138 chunks will be used
if the requested number of blocks is not
140 Note that features which have been interpolated are not stored
141 in the database and as such will have no dbID or adaptor set.
143 Exceptions : warning on invalid $num_blocks argument
144 warning
if no features with logic_name $logic_name exist
145 warning
if density_type table has invalid block_size value
151 sub fetch_all_by_Slice {
152 my ($self, $slice, $logic_name, $num_blocks, $interpolate, $max_ratio) = @_;
154 if(defined($num_blocks) && $num_blocks < 1) {
155 warning(
"Invalid number of density blocks [$num_blocks] requested.\n" .
156 "Returning empty list.");
161 my $length = $slice->length();
163 my $wanted_block_size = POSIX::ceil($length/$num_blocks);
166 # get out all of the density types and choose the one with the
167 # block size closest to our desired block size
170 my $dta = $self->db()->get_DensityTypeAdaptor();
172 my @dtypes = @{$dta->fetch_all_by_logic_name($logic_name)};
174 my @all_dtypes = @{ $dta->fetch_all() };
175 @all_dtypes or warning(
"No DensityTypes in $dta" ) &&
return [];
176 my $valid_list = join(
", ",
map{$_->analysis->logic_name} @all_dtypes );
177 warning(
"No DensityTypes for logic name $logic_name. ".
178 "Select from $valid_list" );
182 my $best_ratio = undef;
183 my $density_type = undef;
184 my $best_ratio_large = undef;
185 my $density_type_large = undef;
188 foreach my $dt (@dtypes) {
191 if( $dt->block_size() > 0 ) {
192 $ratio = $wanted_block_size/$dt->block_size();
194 # This is only valid if the requested seq_region is the one the
195 # features are stored on. Please use sensibly. Or find better implementation.
197 my $block_size = $slice->seq_region_length() / $dt->region_features();
198 $ratio = $wanted_block_size / $block_size;
201 # we prefer to use a block size that's smaller than the required one
202 # (better results on interpolation).
203 # give larger bits a disadvantage and make them comparable
208 $dt_ratio_hash{ $ratio } = $dt;
211 $best_ratio = (sort {$a<=>$b} (keys %dt_ratio_hash))[0];
213 #the ratio was not good enough, or this logic name was not in the
214 #density type table, return an empty list
215 if(!defined($best_ratio) ||
216 (defined($max_ratio) && $best_ratio > $max_ratio)) {
220 $density_type = $dt_ratio_hash{$best_ratio};
222 my $constraint =
"df.density_type_id = " . $density_type->dbID();
225 @{$self->fetch_all_by_Slice_constraint($slice,$constraint)};
227 return \@features
if(!$interpolate);
229 #interpolate the features into new features of a different size
231 #sort the features on start position
232 @features = sort( { $a->start() <=> $b->start() } @features );
234 #resize the features that were returned
236 my $end = $start+$wanted_block_size-1;
238 my $value_type = $density_type->value_type();
240 # create a new density type object for the interpolated features that
241 # is not stored in the database
243 (-VALUE_TYPE => $value_type,
244 -ANALYSIS => $density_type->analysis(),
245 -BLOCK_SIZE => $wanted_block_size);
247 while($start < $length) {
248 # $end = $length if($end > $length);
250 my $density_value = 0.0;
251 my ($f, $fstart, $fend, $portion);
254 #construct a new feature using all of the old density features that
256 while(($f = shift(@features)) && $end > $f->{
'start'}) {
258 #what portion of this feature are we using to construct our new block?
259 $fstart = ($f->{
'start'} < $start) ? $start : $f->{
'start'};
260 $fend = ($f->{
'end'} > $end ) ? $end : $f->{
'end'};
261 $fend = $length
if($fend > $length);
263 if($value_type eq
'sum') {
265 $portion = ($fend-$fstart+1)/$f->length();
267 #take a percentage of density value, depending on how much of the
268 #feature we overlapped
269 $density_value += $portion * $f->{
'density_value'};
271 } elsif($value_type eq
'ratio') {
273 #maintain a running total of the length used from each feature
275 push(@dvalues,[$fend-$fstart+1,$f->{
'density_value'}]);
278 throw(
"Unknown density value type [$value_type].");
281 #do not want to look at next feature if we only used part of this one:
282 last
if($fend < $f->{
'end'});
285 #if we did not completely overlap the last feature, put it back on so
286 #it can be partially used by the next block
287 if(defined($f) && (!defined($fend) || $fend < $f->{
'end'})) {
288 unshift(@features, $f);
291 if($value_type eq
'ratio') {
292 #take a weighted average of the all the density values of the features
293 #used to construct this one
294 my $total_len = $end - $start + 1;
296 foreach my $pair (@dvalues) {
297 my ($dlen, $dval) = @$pair;
298 $density_value += $dval * ($dlen/$total_len);
303 # interpolated features are not stored in the db so do not set the dbID
306 (-seq_region => $slice,
309 -density_type => $density_type,
310 -density_value => $density_value);
313 $end += $wanted_block_size;
322 my $logic_name = shift;
324 my ($sth, $seq_region_id, $start, $end, $density_value, $density_type_id);
325 my ($slice, $density_type, @out);
326 my $sa = $self->db()->get_SliceAdaptor();
327 my $dta = $self->db()->get_DensityTypeAdaptor();
330 $sth = $self->prepare(
"SELECT df.seq_region_id, df.seq_region_start, df.seq_region_end, df.density_value, df.density_type_id "
331 .
"FROM density_feature df, density_type dt, analysis a "
332 .
"WHERE df.density_type_id = dt.density_type_id "
333 .
"AND dt.analysis_id = a.analysis_id "
334 .
"AND logic_name = ?" );
335 $sth->bind_param(1, $logic_name, SQL_VARCHAR);
338 $sth = $self->prepare(
"SELECT df.seq_region_id, df.seq_region_start, df.seq_region_end, df.density_value, df.density_type_id "
339 .
"FROM density_feature df, density_type dt "
340 .
"WHERE df.density_type_id = dt.density_type_id" );
345 $sth->bind_columns(\($seq_region_id, $start, $end, $density_value, $density_type_id));
348 while ($sth->fetch()) {
349 $slice = $sa->fetch_by_seq_region_id($seq_region_id);
350 $density_type = $dta->fetch_by_dbID($density_type_id);
352 (-seq_region => $slice,
355 -density_type => $density_type,
356 -density_value => $density_value));
362 =head2 fetch_all_by_Slice_constraint
365 the slice from which to obtain features
366 Arg [2] : (optional)
string $constraint
367 An SQL query constraint (i.e. part of the WHERE clause)
368 Example : $fs = $a->fetch_all_by_Slice_constraint($slc,
'density_type_id = 88');
369 Description: Returns a listref of features created from the database which
370 are on the Slice defined by $slice and fulfill the SQL
371 constraint defined by $constraint.
372 Note that
this is a re-implementation of a method with the same name
373 in the BaseFeatureAdaptor.
374 This was necessary to remove the use of symliked sequences
for density features
375 Returntype : listref of Bio::EnsEMBL::SeqFeatures in Slice coordinates
376 Exceptions : thrown
if $slice is not defined
383 sub fetch_all_by_Slice_constraint {
386 my $constraint = shift;
390 #Pull out here as we need to remember them & reset accordingly
391 my $bind_params = $self->bind_param_generic_fetch();
394 || !( $slice->isa(
'Bio::EnsEMBL::Slice')
395 or $slice->isa(
'Bio::EnsEMBL::LRGSlice') ) )
397 throw(
"Bio::EnsEMBL::Slice argument expected.");
402 # If the logic name was invalid, undef was returned
403 if ( !defined($constraint) ) {
return [] }
408 # Will only use feature_cache if hasn't been no_cache attribute set
410 !( defined( $self->db()->no_cache() ) && $self->db()->no_cache() ) )
413 #strain test and add to constraint if so to stop caching.
414 if ( $slice->isa(
'Bio::EnsEMBL::Variation::StrainSlice') ) {
416 $self->dbc()->db_handle()->quote( $slice->strain_name() );
418 if ( $constraint ne
"" ) {
419 $constraint .=
" AND $string = $string ";
421 $constraint .=
" $string = $string ";
425 # Check the cache and return the cached results if we have already
426 # done this query. The cache key is the made up from the slice
427 # name, the constraint, and the bound parameters (if there are any).
428 $key = uc( join(
':', $slice->name(), $constraint ) );
430 if ( defined($bind_params) ) {
432 . join(
':',
map { $_->[0] .
'/' . $_->[1] } @{$bind_params} );
435 $cache = $self->_slice_feature_cache();
436 if ( exists( $cache->{$key} ) ) {
437 # Clear the bound parameters and return the cached data.
438 $self->{
'_bind_param_generic_fetch'} = ();
439 return $cache->{$key};
441 } ## end
if ( !( defined( $self...)))
443 $self->_bind_param_generic_fetch($bind_params);
444 my $features = $self->_slice_fetch( $slice, $constraint );
446 if ( defined($key) ) {
447 $cache->{$key} = $features;
457 return ([
'density_feature',
'df']);
464 return qw( df.density_feature_id
465 df.seq_region_id df.seq_region_start df.seq_region_end
466 df.density_value df.density_type_id );
472 my ($self, $sth, $mapper, $dest_slice) = @_;
475 # This code is ugly because an attempt has been made to remove as many
476 # function calls as possible for speed purposes. Thus many caches,
477 # and a fair bit of gymnastics have been used.
480 my $sa = $self->db()->get_SliceAdaptor();
481 my $dta = $self->db()->get_DensityTypeAdaptor();
490 $density_feature_id, $seq_region_id, $seq_region_start,
491 $seq_region_end, $density_value, $density_type_id );
493 $sth->bind_columns(\(
494 $density_feature_id, $seq_region_id, $seq_region_start,
495 $seq_region_end, $density_value, $density_type_id));
497 my $dest_slice_start;
499 my $dest_slice_strand;
500 my $dest_slice_length;
502 my $dest_slice_sr_name;
503 my $dest_slice_sr_id;
507 $dest_slice_start = $dest_slice->start();
508 $dest_slice_end = $dest_slice->end();
509 $dest_slice_strand = $dest_slice->strand();
510 $dest_slice_length = $dest_slice->length();
511 $dest_slice_cs = $dest_slice->coord_system();
512 $dest_slice_sr_name = $dest_slice->seq_region_name();
513 $dest_slice_sr_id = $dest_slice->get_seq_region_id();
514 $asma = $self->db->get_AssemblyMapperAdaptor();
517 FEATURE:
while($sth->fetch()) {
519 #get the density type object
520 my $density_type = $dtype_hash{$density_type_id} ||= $dta->fetch_by_dbID($density_type_id);
522 #need to get the internal_seq_region, if present
523 $seq_region_id = $self->get_seq_region_id_internal($seq_region_id);
524 my $slice = $slice_hash{
"ID:".$seq_region_id};
527 $slice = $sa->fetch_by_seq_region_id($seq_region_id);
528 $slice_hash{
"ID:".$seq_region_id} = $slice;
529 $sr_name_hash{$seq_region_id} = $slice->seq_region_name();
530 $sr_cs_hash{$seq_region_id} = $slice->coord_system();
533 #obtain a mapper if none was defined, but a dest_seq_region was
534 if(!$mapper && $dest_slice && !$dest_slice_cs->equals($slice->coord_system)) {
535 $mapper = $asma->fetch_by_CoordSystems($dest_slice_cs, $slice->coord_system);
538 my $sr_name = $sr_name_hash{$seq_region_id};
539 my $sr_cs = $sr_cs_hash{$seq_region_id};
542 # remap the feature coordinates to another coord system
543 # if a mapper was provided
548 my $len = $seq_region_end - $seq_region_start + 1;
550 if (defined $dest_slice && $mapper->isa(
'Bio::EnsEMBL::ChainedAssemblyMapper') ) {
551 ($seq_region_id, $seq_region_start, $seq_region_end) =
552 $mapper->map( $sr_name, $seq_region_start, $seq_region_end, 1, $sr_cs, 0, $dest_slice);
555 ($seq_region_id, $seq_region_start, $seq_region_end) =
556 $mapper->map($sr_name, $seq_region_start, $seq_region_end, 1, $sr_cs);
559 #skip features that map to gaps or coord system boundaries
560 next FEATURE
if (!defined($seq_region_id));
562 if($density_type->value_type() eq
'sum') {
563 #adjust density value so it reflects length of feature actually used
564 my $newlen = $seq_region_end - $seq_region_start +1;
565 $density_value *= $newlen/$len
if($newlen != $len);
568 #get a slice in the coord system we just mapped to
569 $slice = $slice_hash{
"ID:".$seq_region_id} ||= $sa->fetch_by_seq_region_id($seq_region_id);
573 # If a destination slice was provided convert the coords.
575 if (defined($dest_slice)) {
576 my $seq_region_len = $dest_slice->seq_region_length();
578 if ( $dest_slice_strand == 1 ) {
579 $seq_region_start = $seq_region_start - $dest_slice_start + 1;
580 $seq_region_end = $seq_region_end - $dest_slice_start + 1;
582 if ( $dest_slice->is_circular ) {
583 # Handle circular chromosomes.
585 if ( $seq_region_start > $seq_region_end ) {
586 # Looking at a feature overlapping the chromosome origin.
588 if ( $seq_region_end > $dest_slice_start ) {
589 # Looking at the region in the beginning of the chromosome
590 $seq_region_start -= $seq_region_len;
592 if ( $seq_region_end < 0 ) {
593 $seq_region_end += $seq_region_len;
596 if ($dest_slice_start > $dest_slice_end && $seq_region_end < 0) {
597 # Looking at the region overlapping the chromosome
598 # origin and a feature which is at the beginning of the
600 $seq_region_start += $seq_region_len;
601 $seq_region_end += $seq_region_len;
607 my $start = $dest_slice_end - $seq_region_end + 1;
608 my $end = $dest_slice_end - $seq_region_start + 1;
610 if ($dest_slice->is_circular()) {
612 if ($dest_slice_start > $dest_slice_end) {
613 # slice spans origin or replication
615 if ($seq_region_start >= $dest_slice_start) {
616 $end += $seq_region_len;
617 $start += $seq_region_len
if $seq_region_end > $dest_slice_start;
619 } elsif ($seq_region_start <= $dest_slice_end) {
621 } elsif ($seq_region_end >= $dest_slice_start) {
622 $start += $seq_region_len;
623 $end += $seq_region_len;
625 } elsif ($seq_region_end <= $dest_slice_end) {
626 $end += $seq_region_len
if $end < 0;
628 } elsif ($seq_region_start > $seq_region_end) {
629 $end += $seq_region_len;
634 if ($seq_region_start <= $dest_slice_end and $seq_region_end >= $dest_slice_start) {
636 } elsif ($seq_region_start > $seq_region_end) {
637 if ($seq_region_start <= $dest_slice_end) {
638 $start -= $seq_region_len;
639 } elsif ($seq_region_end >= $dest_slice_start) {
640 $end += $seq_region_len;
646 $seq_region_start = $start;
647 $seq_region_end = $end;
650 # Throw away features off the end of the requested slice or on
651 # different seq_region.
652 if ($seq_region_end < 1
653 || $seq_region_start > $dest_slice_length
654 || ($dest_slice_sr_id != $seq_region_id)) {
657 $slice = $dest_slice;
662 $self->_create_feature(
663 'Bio::EnsEMBL::DensityFeature', {
664 -dbID => $density_feature_id,
666 -start => $seq_region_start,
667 -end => $seq_region_end,
668 -seq_region => $slice,
669 -density_value => $density_value,
670 -density_type => $density_type
682 Example : @feature_ids = @{$density_feature_adaptor->list_dbIDs()};
683 Description: Gets an array of
internal ids
for all density features in the
685 Arg[1] : <optional>
int. not set to 0
for the ids to be sorted by the seq_region.
686 Returntype : list of ints
694 my ($self, $ordered) = @_;
696 return $self->_list_dbIDs(
"density_feature",undef, $ordered);
702 Arg [1] : list of Bio::EnsEMBL::DensityFeatures @df
703 the simple features to store in the database
704 Example : $density_feature_adaptor->store(1234, @density_feats);
705 Description: Stores a list of density feature objects in the database
707 Exceptions : thrown
if @df is not defined,
if any of the features
do not
708 have an attached slice.
709 or
if any elements of @df are not Bio::EnsEMBL::SeqFeatures
718 if( scalar(@df) == 0 ) {
719 throw(
"Must call store with list of DensityFeatures");
721 #mysql> desc density_feature;
722 #+--------------------+---------+------+-----+---------+----------------+
723 #| Field | Type | Null | Key | Default | Extra |
724 #+--------------------+---------+------+-----+---------+----------------+
725 #| density_feature_id | int(11) | | PRI | NULL | auto_increment |
726 #| density_type_id | int(11) | | MUL | 0 | |
727 #| seq_region_id | int(11) | | | 0 | |
728 #| seq_region_start | int(11) | | | 0 | |
729 #| seq_region_end | int(11) | | | 0 | |
730 #| density_value | float | | | 0 | |
731 #+--------------------+---------+------+-----+---------+----------------+
733 my $sth = $self->prepare
734 (
"INSERT INTO density_feature (seq_region_id, seq_region_start, " .
735 "seq_region_end, density_type_id, " .
737 "VALUES (?,?,?,?,?)");
739 my $db = $self->db();
740 my $analysis_adaptor = $db->get_AnalysisAdaptor();
742 FEATURE:
foreach my $df ( @df ) {
744 if( !ref $df || !$df->isa(
"Bio::EnsEMBL::DensityFeature") ) {
745 throw(
"DensityFeature must be an Ensembl DensityFeature, " .
746 "not a [".ref($df).
"]");
749 # we dont store 0 value density features
750 next
if( $df->density_value == 0 );
751 if($df->is_stored($db)) {
752 warning(
"DensityFeature [".$df->dbID.
"] is already stored" .
753 " in this database.");
757 if(!defined($df->density_type)) {
758 throw(
"A density type must be attached to the features to be stored.");
761 #store the density_type if it has not been stored yet
763 if(!$df->density_type->is_stored($db)) {
764 my $dta = $db->get_DensityTypeAdaptor();
765 $dta->store($df->density_type());
770 ($df, $seq_region_id) = $self->_pre_store($df);
772 $sth->bind_param(1,$seq_region_id,SQL_INTEGER);
773 $sth->bind_param(2,$df->start,SQL_INTEGER);
774 $sth->bind_param(3,$df->end,SQL_INTEGER);
775 $sth->bind_param(4,$df->density_type->dbID,SQL_INTEGER);
776 $sth->bind_param(5,$df->density_value,SQL_FLOAT);
779 $original->dbID($self->last_insert_id(
'density_feature_id', undef,
'density_feature'));
780 $original->adaptor($self);
784 =head2 fetch_Featureset_by_Slice
788 for argument documentation
789 Example : $featureset = $dfa->fetch_FeatureSet_by_Slice($slice,'SNPDensity', 10, 1);
790 Description: wrapper around
791 Bio::
EnsEMBL::DBSQL::DensityFeatureAdaptor::fetch_all_by_Slice()
792 which returns a
Bio::
EnsEMBL::DensityFeatureSet and also caches
801 sub fetch_Featureset_by_Slice {
802 my ($self, $slice, $logic_name, $num_blocks, $interpolate, $max_ratio) = @_;
804 my $key = join(
":", $slice->name,
810 unless ($self->{
'_density_feature_cache'}->{$key}) {
811 my $dfeats = $self->fetch_all_by_Slice($slice, $logic_name, $num_blocks,
812 $interpolate, $max_ratio);
813 $self->{
'_density_feature_cache'}->{$key} =
816 return $self->{
'_density_feature_cache'}->{$key};