ensembl-hive  2.7.0
Immutable.pm
Go to the documentation of this file.
1 =head1 LICENSE
2 
3 See the NOTICE file distributed with this work for additional information
4 regarding copyright ownership.
5 
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9 
10  http://www.apache.org/licenses/LICENSE-2.0
11 
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 
18 =cut
19 
20 
21 =head1 CONTACT
22 
23  Please email comments or questions to the public Ensembl
24  developers list at <http://lists.ensembl.org/mailman/listinfo/dev>.
25 
26  Questions may also be sent to the Ensembl help desk at
27  <http://www.ensembl.org/Help/Contact>.
28 
29 =cut
30 
31 =head1 NAME
32 
34 
35 =head1 SYNOPSIS
36 
37  # define a set of intervals to be added to the tree
38  my $intervals = [ Bio::EnsEMBL::Utils::Interval->new(121626874, 122092717),
39  Bio::EnsEMBL::Utils::Interval->new(121637917, 121658918),
40  Bio::EnsEMBL::Utils::Interval->new(122096077, 124088369) ];
41 
42  # initialise the tree with the above intervals
43  my $tree = Bio::EnsEMBL::Utils::Tree::Interval::Immutable->new($intervals);
44 
45  # point query
46  my $results = $tree->query(121779004);
47  if (scalar @$results) {
48  print "Intervals contain 121779004\n";
49  }
50 
51  # same query, but use interval query
52  my $results = $tree->query(121779004, 121779004);
53  if (scalar @$results) {
54  print "Found containing interval: [", $result->[0]->start, ', ', $result->[0]->end, "\n";
55  }
56 
57 =head1 DESCRIPTION
58 
59 An implementation of an immutable interval tree. Immutable means the tree is
60 initialised with a fixed set of intervals at creation time. Intervals cannot
61 be added to or removed from the tree during its life cycle.
62 
63 Implementation heavily inspired by https://github.com/tylerkahn/intervaltree-python
64 
65 This implementation does not support Intervals having a start > end - i.e.
66 intervals spanning the origin of a circular chromosome.
67 
68 =head1 METHODS
69 
70 =cut
71 
72 package Bio::EnsEMBL::Utils::Tree::Interval::Immutable;
73 
74 use strict;
75 
76 use Tie::RefHash;
77 
78 use Bio::EnsEMBL::Utils::Scalar qw(assert_ref);
79 use Bio::EnsEMBL::Utils::Exception qw(throw);
82 
83 
84 =head2 new
85 
86  Arg [1] : Arrayref of Bio::EnsEMBL::Utils::Interval instances
87  Example : my $tree = Bio::EnsEMBL::Utils::Tree::Immutable([ $i1, $i2, $i3 ]);
88  Description : Constructor. Creates a new immutable tree instance
90  Exceptions : none
91  Caller : general
92 
93 =cut
94 
95 sub new {
96  my $caller = shift;
97  my $class = ref($caller) || $caller;
98 
99  my $intervals = shift;
100  if (defined $intervals ) {
101  assert_ref($intervals, 'ARRAY');
102  }
103 
104  my $self = bless({}, $class);
105 
106  $self->{top_node} = $self->_divide_intervals($intervals);
107 
108  return $self;
109 }
110 
111 =head2 root
112 
113  Arg [] : none
114  Example : my $root = $tree->root();
115  Description : Return the tree top node
117  Exceptions : none
118  Caller : general
119 
120 =cut
121 
122 sub root {
123  return shift->{top_node};
124 }
125 
126 =head2 query
127 
128  Arg [1] : scalar, $start
129  Where the query interval begins
130  Arg [2] : (optional) scalar, $end
131  Where the query interval ends
132  Example : my $results = $tree->query(121626874, 122092717);
133  Description : Query the tree if its intervals overlap the interval whose start
134  and end points are specified by the argument list.
135  If end is not specified, it is assumed to be the same as start
136  so effectively making a point query.
137  Returntype : An arrayref of Bio::EnsEMBL::Utils::Interval instances
138  Exceptions : none
139  Caller : general
140 
141 =cut
142 
143 sub query {
144  my ($self, $start, $end) = @_;
145 
146  my $interval;
147  if (defined $start) {
148  $end = $start unless defined $end;
149  $interval = Bio::EnsEMBL::Utils::Interval->new($start, $end);
150  }
151 
152  return [] unless $interval;
153  return $self->_query_point($self->root, $interval->start, []) if $interval->is_point;
154 
155  my $result = [];
156  return $result unless $self->root or $interval->is_empty;
157 
158  my $node = $self->root;
159  while ($node) {
160  if ($interval->contains($node->x_center)) {
161  push @{$result}, @{$node->s_center_beg};
162  $self->_range_query_left($node->left, $interval, $result);
163  $self->_range_query_right($node->right, $interval, $result);
164  last;
165  }
166  if ($interval->is_left_of($node->x_center)) {
167  foreach my $s_beg (@{$node->s_center_beg}) {
168  last unless $interval->intersects($s_beg);
169  push @{$result}, $s_beg;
170  }
171  $node = $node->left;
172  } else {
173  foreach my $s_end (@{$node->s_center_end}) {
174  last unless $interval->intersects($s_end);
175  push @{$result}, $s_end;
176  }
177  $node = $node->right;
178  }
179  }
180 
181  return sort_by_begin(uniq($result));
182 }
183 
184 sub _query_point {
185  my ($self, $node, $point, $result) = @_;
186 
187  return $result unless $node;
188 
189  # if x is less than x_center, the leftmost set of intervals, S_left, is considered
190  if ($point <= $node->x_center) {
191  # if x is less than x_center, we know that all intervals in S_center end after x,
192  # or they could not also overlap x_center. Therefore, we need only find those intervals
193  # in S_center that begin before x. We can consult the lists of S_center that have already
194  # been constructed. Since we only care about the interval beginnings in this scenario,
195  # we can consult the list sorted by beginnings.
196  # Suppose we find the closest number no greater than x in this list. All ranges from the
197  # beginning of the list to that found point overlap x because they begin before x and end
198  # after x (as we know because they overlap x_center which is larger than x).
199  # Thus, we can simply start enumerating intervals in the list until the startpoint value exceeds x.
200  foreach my $s_beg (@{$node->s_center_beg}) {
201  last if $s_beg->is_right_of($point);
202  push @{$result}, $s_beg;
203  }
204 
205  # since x < x_center, we also consider the leftmost set of intervals
206  return $self->_query_point($node->left, $point, $result);
207  } else {
208  # if x is greater than x_center, we know that all intervals in S_center must begin before x,
209  # so we find those intervals that end after x using the list sorted by interval endings.
210  foreach my $s_end (@{$node->s_center_end}) {
211  last if $s_end->is_left_of($point);
212  push @{$result}, $s_end;
213  }
214 
215  # since x > x_center, we also consider the rightmost set of intervals
216  return $self->_query_point($node->right, $point, $result);
217  }
218 
219  return sort_by_begin(uniq($result));
220 }
221 
222 # This corresponds to the left branch of the range search, once we find a node, whose
223 # midpoint is contained in the query interval. All intervals in the left subtree of that node
224 # are guaranteed to intersect with the query, if they have an endpoint greater or equal than
225 # the start of the query interval. Basically, this means that every time we branch to the left
226 # in the binary search, we need to add the whole right subtree to the result set.
227 
228 sub _range_query_left {
229  my ($self, $node, $interval, $result) = @_;
230 
231  while ($node) {
232  if ($interval->contains($node->x_center)) {
233  push @{$result}, @{$node->s_center_beg};
234  if ($node->right) {
235  # in-order traversal of the right subtree to add all its intervals
236  $self->_in_order_traversal($node->right, $result);
237  }
238  $node = $node->left;
239  } else {
240  foreach my $seg_end (@{$node->s_center_end}) {
241  last if $seg_end->is_left_of($interval);
242  push @{$result}, $seg_end;
243  }
244  $node = $node->right;
245  }
246  }
247 }
248 
249 # This corresponds to the right branch of the range search, once we find a node, whose
250 # midpoint is contained in the query interval. All intervals in the right subtree of that node
251 # are guaranteed to intersect with the query, if they have an endpoint smaller or equal than
252 # the end of the query interval. Basically, this means that every time we branch to the right
253 # in the binary search, we need to add the whole left subtree to the result set.
254 
255 sub _range_query_right {
256  my ($self, $node, $interval, $result) = @_;
257 
258  while ($node) {
259  if ($interval->contains($node->x_center)) {
260  push @{$result}, @{$node->s_center_beg};
261  if ($node->left) {
262  # in-order traversal of the left subtree to add all its intervals
263  $self->_in_order_traversal($node->left, $result);
264  }
265  $node = $node->right;
266  } else {
267  foreach my $seg_beg (@{$node->s_center_beg}) {
268  last if $seg_beg->is_right_of($interval);
269  push @{$result}, $seg_beg;
270  }
271  $node = $node->left;
272  }
273  }
274 }
275 
276 sub in_order_traversal {
277  my ($self) = @_;
278 
279  my $result = [];
280  $self->_in_order_traversal($self->root, $result);
281 
282  return $result;
283 }
284 
285 sub _in_order_traversal {
286  my ($self, $node, $result) = @_;
287 
288  return unless $node;
289  $result ||= [];
290 
291  $self->_in_order_traversal($node->left, $result);
292  push @{$result}, @{$node->s_center_beg};
293  $self->_in_order_traversal($node->right, $result);
294 }
295 
296 sub _divide_intervals {
297  my ($self, $intervals, $sorted) = @_;
298 
299  return undef unless scalar @{$intervals};
300 
301  my $sorted_intervals;
302  if ($sorted) {
303  $sorted_intervals = $intervals;
304  } else {
305  $sorted_intervals = sort_by_begin($intervals);
306  }
307 
308  my $x_center = $self->_center_sorted($sorted_intervals);
309  my ($s_center, $s_left, $s_right) = ([], [], []);
310 
311  foreach my $sorted_interval (@{$sorted_intervals}) {
312  if ($sorted_interval->spans_origin) {
313  throw "Cannot build a tree containing an interval that spans the origin";
314  }
315  if ($sorted_interval->end < $x_center) {
316  push @{$s_left}, $sorted_interval;
317  } elsif ($sorted_interval->start > $x_center) {
318  push @{$s_right}, $sorted_interval;
319  } else {
320  push @{$s_center}, $sorted_interval;
321  }
322  }
323 
325  $s_center,
326  $self->_divide_intervals($s_left, 1),
327  $self->_divide_intervals($s_right, 1));
328 }
329 
330 sub _center {
331  my ($self, $intervals) = @_;
332 
333  my $sorted_intervals = sort_by_begin($intervals);
334  my $len = scalar @{$sorted_intervals};
335 
336  return $sorted_intervals->[int($len/2)]->start;
337 }
338 
339 sub _center_sorted {
340  my ($self, $sorted_intervals) = @_;
341  my $len = scalar @{$sorted_intervals};
342 
343  return $sorted_intervals->[int($len/2)]->start;
344 }
345 
346 sub sort_by_begin {
347  my $intervals = shift;
348 
349  return [ map { $_->[1] } sort { $a->[0] <=> $b->[0] } map { [ $_->start, $_ ] } @{$intervals} ];
350 }
351 
352 sub uniq {
353  my $intervals = shift;
354 
355  tie my %seen, 'Tie::RefHash';
356 
357  return [ grep { ! $seen{ $_ }++ } @{$intervals} ];
358 }
359 
360 1;
361 
Bio::EnsEMBL::Utils::Interval
Definition: Interval.pm:41
map
public map()
Bio::EnsEMBL::Utils::Tree::Interval::Immutable::Node
Definition: Node.pm:14
Bio::EnsEMBL::Utils::Scalar
Definition: Scalar.pm:66
Bio::EnsEMBL::Utils::Interval::new
public Bio::EnsEMBL::Utils::Interval new()
Bio::EnsEMBL::Utils::Tree::Interval::Immutable
Definition: Node.pm:8
Bio::EnsEMBL::Utils::Tree::Interval::Immutable::Node::new
public Bio::EnsEMBL::Utils::Tree::Interval::Immutable::Node new()
Bio::EnsEMBL::Utils::Exception
Definition: Exception.pm:68
Bio::EnsEMBL::Utils::Interval::is_point
public Boolean is_point()