ensembl-hive  2.7.0
RangeRegistry.pm
Go to the documentation of this file.
1 =head1 LICENSE
2 
3 See the NOTICE file distributed with this work for additional information
4 regarding copyright ownership.
5 
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9 
10  http://www.apache.org/licenses/LICENSE-2.0
11 
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 
18 =cut
19 
20 
21 =head1 CONTACT
22 
23  Please email comments or questions to the public Ensembl
24  developers list at <http://lists.ensembl.org/mailman/listinfo/dev>.
25 
26  Questions may also be sent to the Ensembl help desk at
27  <http://www.ensembl.org/Help/Contact>.
28 
29 =cut
30 
31 =head1 NAME
32 
34 
35 =head1 SYNOPSIS
36 
38 
40 
41  # Get a fixed width chunk around the range of intereset. This
42  # will be used if any registration is actually necessary.
43  $chunk_start = ( $start >> 20 ) << 20 + 1;
44  $chunk_end = ( ( $end >> 20 ) + 1 ) << 20;
45 
46  # Check if any registration is necessary for the range. If it is
47  # register a large chunked area instead and return a listref of
48  # unregistered areas that need to be loaded.
49  if (
50  $pairs = $rr->check_and_register(
51  $id, $start, $end, $chunk_start, $chunk_end
52  ) )
53  {
54  foreach my $pair (@$pairs) {
55  my ( $pair_start, $pair_end ) = @$pair;
56  # Fetch mappings for these regions from the assembly table and
57  # load them into the mapper.
58  ...;
59  }
60  } else {
61  # The range ($start - $end) is already registered
62  ...;
63  }
64 
65  # Check if any registration is necessary. If it is register the
66  # region and return a listref of pairs that need to be loaded.
67  if ( $pairs = $rr->check_and_register( $id, $start, $end ) ) {
68  ...;
69  }
70 
71 =head1 DESCRIPTION
72 
73 This module maintains an internal list of registered regions and is
74 used to quickly ascertain if and what regions of a provided range need
75 registration.
76 
77 =head1 METHODS
78 
79 =cut
80 
81 package Bio::EnsEMBL::Mapper::RangeRegistry;
82 
83 use strict;
84 
85 use Bio::EnsEMBL::Utils::Exception qw(throw);
86 use integer;
87 
88 =head2 new
89 
90  Arg [1] : none
91  Example : my $rr = Bio::EnsEMBL::Mapper::RangeRegistry->new();
92  Description: Creates a new RangeRegistry object
94  Exceptions : none
95  Caller : AssemblyMapperAdaptor
96  Status : Stable
97 
98 =cut
99 
100 sub new {
101  my ($proto) = @_;
102 
103  my $class = ref($proto) || $proto;
104 
105  return bless( { 'registry' => {} }, $class );
106 }
107 
108 sub flush {
109  my ($self) = @_;
110  $self->{'registry'} = {};
111 }
112 
113 =head2 check_and_register
114 
115  Arg [1] : string $id
116  The id of the range to be checked/registered (e.g. a sequenceid)
117  Arg [2] : int $start
118  The start of the range to be checked
119  Arg [3] : int $end
120  The end of the range to be checked
121  Arg [4] : (optional) int $rstart
122  The start of the range to be registered
123  if the checked range was not fully registered
124  Arg [5] : (optional) int $rend
125  The end of the range to be registerd
126  if the checked range was not fully registered
127  Example : $ranges=$rr->check_and_register('X',500,600, 1,1_000_000));
128  Description: Checks the range registry to see if the entire range
129  denoted by ($id : $start-$end) is already registered. If
130  it already is, then undef is returned. If it is not then
131  the range specified by $rstart and $rend is registered, and
132  a list of regions that were required to completely register
133  $rstart-$rend is returned. If $rstart and $rend are not
134  defined they default to $start and $end respectively.
135 
136  The reason there is a single call to do both the checking and
137  registering is to reduce the overhead. Much of the work to
138  check if a range is registered is the same as registering a
139  region around that range.
140  Returntype : undef or listref of [start,end] range pairs
141  Exceptions : throw if rstart is greater than start
142  throw if rend is less than end
143  throw if end is less than start
144  throw if id, start, or end are not defined
145  Caller : AssemblyMapperAdaptor
146  Status : Stable
147 
148 =cut
149 
150 #"constants"
151 my $START = 0;
152 my $END = 1;
153 
154 sub check_and_register {
155  my ( $self, $id, $start, $end, $rstart, $rend ) = @_;
156 
157  $rstart = $start if ( !defined($rstart) );
158  $rend = $end if ( !defined($rend) );
159 
160  #
161  # Sanity checks
162  #
163  if ( !defined($id) || !defined($start) || !defined($end) ) {
164  throw("ID, start, end arguments are required");
165  }
166 
167  # The following was commented out due to Ensembl Genomes requirements
168  # for bacterial genomes.
169  #
170  ## if ( $start > $end ) {
171  ## throw( "start argument [$start] must be less than "
172  ## . "(or equal to) end argument [$end]" );
173  ## }
174  ##
175  ## if ( $rstart > $rend ) {
176  ## throw( "rstart argument [$rstart] must be less than "
177  ## . "(or equal to) rend argument [$rend] argument" );
178  ## }
179 
180  if ( $rstart > $start ) {
181  throw("rstart [$rstart] must be less than or equal to start [$start]");
182  }
183 
184  if ( $rend < $end ) {
185  throw("rend [$rend] must be greater than or equal to end [$end]");
186  }
187 
188  my $reg = $self->{'registry'};
189  my $list = $reg->{$id} ||= [];
190 
191  my @gap_pairs;
192 
193  my $len = scalar(@$list);
194 
195  if ( $len == 0 ) {
196  # this is the first request for this id, return a gap pair for the
197  # entire range and register it as seen
198  $list->[0] = [ $rstart, $rend ];
199  return [ [ $rstart, $rend ] ];
200  }
201 
202  #####
203  # loop through the list of existing ranges recording any "gaps" where
204  # the existing range does not cover part of the requested range
205  #
206 
207  my $start_idx = 0;
208  my $end_idx = $#$list;
209  my ( $mid_idx, $range );
210 
211  # binary search the relevant pairs
212  # helps if the list is big
213  while ( ( $end_idx - $start_idx ) > 1 ) {
214  $mid_idx = ( $start_idx + $end_idx ) >> 1;
215  $range = $list->[$mid_idx];
216 
217  if ( $range->[1] < $rstart ) {
218  $start_idx = $mid_idx;
219  } else {
220  $end_idx = $mid_idx;
221  }
222  }
223 
224  my ( $gap_start, $gap_end, $r_idx, $rstart_idx, $rend_idx );
225  $gap_start = $rstart;
226 
227  for ( my $CUR = $start_idx ; $CUR < $len ; $CUR++ ) {
228  my ( $pstart, $pend ) = @{ $list->[$CUR] };
229 
230  # no work needs to be done at all if we find a range pair that
231  # entirely overlaps the requested region
232  if ( $pstart <= $start && $pend >= $end ) {
233  return undef;
234  }
235 
236  # find adjacent or overlapping regions already registered
237  if ( $pend >= ( $rstart - 1 ) && $pstart <= ( $rend + 1 ) ) {
238  if ( !defined($rstart_idx) ) {
239  $rstart_idx = $CUR;
240  }
241  $rend_idx = $CUR;
242  }
243 
244  if ( $pstart > $rstart ) {
245  $gap_end = ( $rend < $pstart ) ? $rend : $pstart - 1;
246  push @gap_pairs, [ $gap_start, $gap_end ];
247  }
248 
249  $gap_start = ( $rstart > $pend ) ? $rstart : $pend + 1;
250 
251  # if($pstart > $rend && !defined($r_idx)) {
252  if ( $pend >= $rend && !defined($r_idx) ) {
253  $r_idx = $CUR;
254  last;
255  }
256  } ## end for ( my $CUR = $start_idx...
257 
258  # do we have to make another gap?
259  if ( $gap_start <= $rend ) {
260  push @gap_pairs, [ $gap_start, $rend ];
261  }
262 
263  #
264  # Merge the new range into the registered list
265  #
266  if ( defined($rstart_idx) ) {
267  my ( $new_start, $new_end );
268 
269  if ( $rstart < $list->[$rstart_idx]->[0] ) {
270  $new_start = $rstart;
271  } else {
272  $new_start = $list->[$rstart_idx]->[0];
273  }
274 
275  if ( $rend > $list->[$rend_idx]->[1] ) {
276  $new_end = $rend;
277  } else {
278  $new_end = $list->[$rend_idx]->[1];
279  }
280 
281  splice( @$list, $rstart_idx,
282  $rend_idx - $rstart_idx + 1,
283  [ $new_start, $new_end ] );
284 
285  } elsif ( defined($r_idx) ) {
286  splice( @$list, $r_idx, 0, [ $rstart, $rend ] );
287  } else {
288  push( @$list, [ $rstart, $rend ] );
289  }
290 
291  return \@gap_pairs;
292 } ## end sub check_and_register
293 
294 # overlap size is just added to make RangeRegistry generally more useful
295 
296 =head2 overlap_size
297 
298  Arg [1] : string $id
299  Arg [2] : int $start
300  Arg [3] : int $end
301  Example : my $overlap_size = $rr->( "chr1", 1, 100_000_000 )
302  Description: finds out how many bases of the given range are registered
303  Returntype : int
304  Exceptions : none
305  Caller : general
306  Status : Stable
307 
308 =cut
309 
310 sub overlap_size {
311  my ( $self, $id, $start, $end ) = @_;
312 
313  my $overlap = 0;
314 
315  if ( $start > $end ) { return 0 }
316 
317  my $reg = $self->{'registry'};
318  my $list = $reg->{$id} ||= [];
319 
320  my $len = scalar(@$list);
321 
322  if ( $len == 0 ) {
323  # this is the first request for this id, return a gap pair for the
324  # entire range and register it as seen
325  return 0;
326  }
327 
328  #####
329  # loop through the list of existing ranges recording any "gaps" where
330  # the existing range does not cover part of the requested range
331  #
332 
333  my $start_idx = 0;
334  my $end_idx = $#$list;
335  my ( $mid_idx, $range );
336 
337  # binary search the relevant pairs
338  # helps if the list is big
339  while ( ( $end_idx - $start_idx ) > 1 ) {
340  $mid_idx = ( $start_idx + $end_idx ) >> 1;
341  $range = $list->[$mid_idx];
342  if ( $range->[1] < $start ) {
343  $start_idx = $mid_idx;
344  } else {
345  $end_idx = $mid_idx;
346  }
347  }
348 
349  for ( my $CUR = $start_idx ; $CUR < $len ; $CUR++ ) {
350  my ( $pstart, $pend ) = @{ $list->[$CUR] };
351 
352  if ( $pstart > $end ) {
353  # No more interesting ranges here.
354  last;
355  }
356 
357  # no work needs to be done at all if we find a range pair that
358  # entirely overlaps the requested region
359  if ( $pstart <= $start && $pend >= $end ) {
360  $overlap = $end - $start + 1;
361  last;
362  }
363 
364  my $mstart = ( $start < $pstart ? $pstart : $start );
365  my $mend = ( $end < $pend ? $end : $pend );
366 
367  if ( $mend - $mstart >= 0 ) {
368  $overlap += ( $mend - $mstart + 1 );
369  }
370  }
371 
372  return $overlap;
373 } ## end sub overlap_size
374 
375 
376 # low level function to access the ranges
377 # only use for read access
378 
379 sub get_ranges {
380  my ( $self, $id ) = @_;
381 
382  return $self->{'registry'}->{$id};
383 }
384 
385 1;
386 
Bio::EnsEMBL::Mapper::RangeRegistry
Definition: RangeRegistry.pm:51
Bio::EnsEMBL::Mapper::RangeRegistry::new
public Bio::EnsEMBL::Mapper::RangeRegistry new()
Bio::EnsEMBL::Mapper::RangeRegistry::check_and_register
public Undef check_and_register()
Bio::EnsEMBL::Utils::Exception
Definition: Exception.pm:68