ensembl-hive  2.8.1
BaseSequenceAdaptor.pm
Go to the documentation of this file.
1 =head1 LICENSE
2 
3 See the NOTICE file distributed with this work for additional information
4 regarding copyright ownership.
5 
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9 
10  http://www.apache.org/licenses/LICENSE-2.0
11 
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 
18 =head1 CONTACT
19 
20  Please email comments or questions to the public Ensembl
21  developers list at <dev@ensembl.org>.
22 
23  Questions may also be sent to the Ensembl help desk at
24  <helpdesk@ensembl.org>.
25 
26 =cut
27 
28 =head1 NAME
29 
31 
32 =head1 DESCRIPTION
33 
34 The BaseSequenceAdaptor is responsible for the conversion of calls from
35 C<fetch_by_Slice_start_end_strand()> for Sequence data into requests for a
36 backing data store. In Ensembl these are the B<seqlevel> sequence region
37 records held in the MySQL database.
38 
39 The base adaptor also provides sequence caching based on normalisation
40 technique similar to the UCSC and BAM binning indexes. The code works
41 by right-shifting the requested start and end by a seq chunk power
42 (by default 18 approx. 250,000bp) and then left-shifting by the same
43 value. This means any value within a given window will always result in
44 the same value. Please see the worked examples below:
45 
46  # Equation
47  p=position
48  o=seq chunk power
49  offset=( (p-1)>>o ) << o
50 
51  # Using real values
52  p=1340001
53  o=18
54  right_shifted = (1340001-1) >> 18 == 5
55  offset = 5 << 18 == 1310720
56 
57 To control the size of the cache and sequences stored you can provide
58 the seq chunk power and the number of sequences cached.
59 
60 =cut
61 
63 
64 use strict;
65 use warnings;
66 
68 use Bio::EnsEMBL::Utils::Exception qw(throw);
69 
70 our $SEQ_CHUNK_PWR = 18; # 2^18 = approx. 250KB
71 our $SEQ_CACHE_SIZE = 5;
72 
73 =head2 fetch_by_Slice_start_end_strand
74 
75  Arg [1] : Bio::EnsEMBL::Slice slice
76  The slice from which you want the sequence
77  Arg [2] : Integer; $strand (optional)
78  The start base pair relative to the start of the slice. Negative
79  values or values greater than the length of the slice are fine.
80  default = 1
81  Arg [3] : (optional) int endBasePair
82  The end base pair relative to the start of the slice. Negative
83  values or values greater than the length of the slice are fine,
84  but the end must be greater than or equal to the start
85  count from 1
86  default = the length of the slice
87  Arg [4] : Integer; $strand (optional)
88  Strand of DNA to fetch
89  Returntype : StringRef (DNA requested)
90  Description: Performs the fetching of DNA based upon a Slice. All fetches
91  should use this method and no-other.
92 
93  Implementing classes are responsible for converting the
94  given Slice and values into something which can be processed by
95  the underlying storage engine. Implementing class are also
96  responsible for the reverse complementing of sequence.
97  Exceptions : Thrown if not redefined
98 
99 =cut
100 
101 sub fetch_by_Slice_start_end_strand {
102  my ( $self, $slice, $start, $end, $strand ) = @_;
103  throw "fetch_by_Slice_start_end_strand() must be implemented";
104 }
105 
106 =head2 can_access_Slice
107 
108  Description : Returns a boolean indiciating if the adaptor understands
109  the given Slice.
110  Returntype : Boolean; if true you can get sequence for the given Slice
111  Exceptions : Thrown if not redefined
112 
113 =cut
114 
115 sub can_access_Slice {
116  my ($self, $slice) = @_;
117  throw "can_access_Slice() must be implemented";
118 }
119 
120 =head2 expand_Slice
121 
122  Arg [1] : Bio::EnsEMBL::Slice slice
123  The slice from which you want the sequence
124  Arg [2] : Integer; $strand (optional)
125  The start base pair relative to the start of the slice. Negative
126  values or values greater than the length of the slice are fine.
127  default = 1
128  Arg [3] : (optional) int endBasePair
129  The end base pair relative to the start of the slice. Negative
130  values or values greater than the length of the slice are fine,
131  but the end must be greater than or equal to the start
132  count from 1
133  default = the length of the slice
134  Arg [4] : Integer; $strand (optional)
135  Strand of DNA to fetch
136  Returntype : Bio::EnsEMBL::Slice
137  Description : Creates a new Slice which represents the requested region. Provides
138  logic applicable to all SliceAdaptor instance
139  Exceptions : Thrown if the Slice is circular (we currently do not support this as generic logic)
140 
141 =cut
142 
143 sub expand_Slice {
144  my ($self, $slice, $start, $end, $strand) = @_;
145 
146  $start = 1 if ! defined $start;
147  $end = ($slice->end() - $slice->start()) + 1 if ! defined $end;
148  $strand = 1 if ! defined $strand;
149 
150  my $right_expand = $end - $slice->length(); #negative is fine
151  my $left_expand = 1 - $start; #negative is fine
152 
153  my $new_slice = $slice;
154  if ( $right_expand || $left_expand ) {
155  $new_slice =
156  $slice->strand > 0
157  ? $slice->expand( $left_expand, $right_expand )
158  : $slice->expand( $right_expand, $left_expand );
159  }
160 
161  return $new_slice;
162 }
163 
164 =head2 new
165 
166  Arg [1] : Int $chunk_power; sets the size of each element of
167  the sequence cache. Defaults to 18 which gives
168  block sizes of ~250Kb (it is actually 2^18)
169  Arg [2] : Int $cache_size; size of the cache. Defaults to 5 meaning
170  a cache of 1Mb if you use default values
171  Example : my $sa = $db_adaptor->get_SequenceAdaptor();
172  Description: Constructor. Calls superclass constructor and initialises
173  internal cache structure.
175  Exceptions : none
176  Caller : DBAdaptor::get_SequenceAdaptor
177  Status : Stable
178 
179 =cut
180 
181 sub new {
182  my ($class, $chunk_power, $cache_size) = @_;
183  $class = ref($class) || $class;
184  my $self = bless({}, $class);
185  $self->_init_seq_instance($chunk_power, $cache_size);
186  return $self;
187 }
188 
189 sub _init_seq_instance {
190  my ($self, $chunk_power, $cache_size) = @_;
191  $chunk_power ||= $SEQ_CHUNK_PWR;
192  $cache_size ||= $SEQ_CACHE_SIZE;
193 
194  # use a LRU cache to limit the size
195  tie my %cache, 'Bio::EnsEMBL::Utils::Cache', $cache_size, {};
196 
197  $self->{cache_size} = $cache_size;
198  $self->{chunk_power} = $chunk_power;
199  $self->{seq_cache_max} = ((2 ** $chunk_power) * $cache_size);
200  $self->{seq_cache} = \%cache;
201  return;
202 }
203 
204 =head2 clear_cache
205 
206  Example : $sa->clear_cache();
207  Description : Removes all entries from the associcated sequence cache
208  Returntype : None
209  Exceptions : None
210 
211 =cut
212 
213 sub clear_cache {
214  my ($self) = @_;
215  %{$self->{seq_cache}} = ();
216  return;
217 }
218 
219 sub chunk_power {
220  my ($self) = @_;
221  return $self->{chunk_power};
222 }
223 
224 sub cache_size {
225  my ($self) = @_;
226  return $self->{cache_size};
227 }
228 
229 sub seq_cache_max {
230  my ($self) = @_;
231  return $self->{seq_cache_max};
232 }
233 
234 =head2 _fetch_raw_seq
235 
236  Arg [1] : String $id
237  The identifier of the sequence to fetch.
238  Arg [2] : Integer $start
239  Where to start fetching sequence from
240  Arg [2] : Integer $length
241  Total length of seuqence to fetch
242  Description : Performs the fetch of DNA from the backing storage
243  engine and provides it to the _fetch_seq() method
244  for optional caching.
245  Returntype : ScalarRef of DNA fetched. All bases should be uppercased
246  Exceptions : Thrown if the method is not reimplemented
247 
248 =cut
249 sub _fetch_raw_seq {
250  my ($self, $id, $start, $length) = @_;
251  throw "Need to implement _fetch_raw_seq(). Code must return a ScalarRef";
252 }
253 
254 =head2 _fetch_seq
255 
256  Arg [1] : String $id
257  The identifier of the sequence to fetch.
258  Arg [2] : Integer $start
259  Where to start fetching sequence from
260  Arg [2] : Integer $length
261  Total length of seuqence to fetch
262  Description : If the requested region is smaller than our maximum length
263  cachable region we will see if the cache already contains
264  this chunk. If not we will request the region from C<_fetch_raw_seq()>
265  and cache it. If the region requested is larger than
266  the maximum cacheable sequence length we pass the request
267  onto C<_fetch_raw_seq()> with no caching layer.
268 
269  This module is also responsible for the conversion of
270  requested regions into normalised region reuqests based
271  on C<chunk_power>.
272  Returntype : ScalarRef of DNA fetched. All bases should be uppercased
273  Exceptions : Thrown when C<_fetch_raw_seq()> is not re-implemented
274 
275 =cut
276 
277 sub _fetch_seq {
278  my ($self, $id, $start, $length) = @_;
279  my $cache = $self->{'seq_cache'};
280  my $cache_size = $self->cache_size();
281  my $seq_chunk_pwr = $self->chunk_power();
282  my $seq_cache_max = $self->seq_cache_max();
283 
284  my $seq_ref;
285 
286  # If the length of the region is less than the max size of the
287  # cache then we can use the cache. The region is converted into
288  # a min to max range of bins we have to query to get all the
289  # required sequence. Of these bins the cache may have all of them,
290  # none of them or a combination of the two states
291  if($length < $seq_cache_max) {
292  my $chunk_min = ($start-1) >> $seq_chunk_pwr;
293  my $chunk_max = ($start + $length - 1) >> $seq_chunk_pwr;
294 
295  # piece together sequence from cached component parts
296  my $entire_seq = q{};
297  for(my $i = $chunk_min; $i <= $chunk_max; $i++) {
298  my $key = "${id}:${i}";
299  #If it exists within the cache then add to the string. We will trim
300  #down to the requested region later on
301  my $cached_seq_ref = $cache->{$key};
302  if($cached_seq_ref) {
303  $entire_seq .= ${$cached_seq_ref};
304  }
305  #Otherwise it is uncached so fetch, concat and store in the cache
306  else {
307  my $min = ($i << $seq_chunk_pwr) + 1;
308  my $length = 1 << $seq_chunk_pwr;
309  my $tmp_seq_ref = $self->_fetch_raw_seq($id, $min, $length);
310  $entire_seq .= ${$tmp_seq_ref};
311  $cache->{$key} = $tmp_seq_ref;
312  }
313  }
314 
315  # now return only the requested portion of the entire sequence
316  my $min = ( $chunk_min << $seq_chunk_pwr ) + 1;
317  my $seq = substr( $entire_seq, $start - $min, $length );
318  $seq_ref = \$seq;
319  }
320  else {
321  # If it was too big then just ask it from the backing store. There's
322  # no use in caching this
323  $seq_ref = $self->_fetch_raw_seq($id, $start, $length);
324  }
325 
326  return $seq_ref;
327 }
328 
329 1;
Bio::EnsEMBL::Slice
Definition: Slice.pm:50
Bio::EnsEMBL::DBSQL::SequenceAdaptor
Definition: SequenceAdaptor.pm:22
Bio::EnsEMBL::Utils::Cache
Definition: Cache.pm:71
Bio::EnsEMBL::DBSQL::BaseSequenceAdaptor
Definition: BaseSequenceAdaptor.pm:35
Bio::EnsEMBL::DBSQL::BaseSequenceAdaptor::_init_seq_instance
protected _init_seq_instance()
Bio::EnsEMBL::Slice::length
public Int length()
Bio::EnsEMBL::Utils::Exception
Definition: Exception.pm:68