ensembl-hive  2.8.1
BaseFeatureAdaptor.pm
Go to the documentation of this file.
1 =head1 LICENSE
2 
3 See the NOTICE file distributed with this work for additional information
4 regarding copyright ownership.
5 
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9 
10  http://www.apache.org/licenses/LICENSE-2.0
11 
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 
18 =cut
19 
20 
21 =head1 CONTACT
22 
23  Please email comments or questions to the public Ensembl
24  developers list at <http://lists.ensembl.org/mailman/listinfo/dev>.
25 
26  Questions may also be sent to the Ensembl help desk at
27  <http://www.ensembl.org/Help/Contact>.
28 
29 =cut
30 
31 =head1 NAME
32 
33 Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor - An Abstract Base class for all
34 FeatureAdaptors
35 
36 =head1 SYNOPSIS
37 
38 Abstract class - should not be instantiated. Implementation of
39 abstract methods must be performed by subclasses.
40 
41 =head1 DESCRIPTION
42 
43 This is a base adaptor for feature adaptors. This base class is simply a way
44 of eliminating code duplication through the implementation of methods
45 common to all feature adaptors.
46 
47 =head1 METHODS
48 
49 =cut
50 
51 package Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor;
52 
53 use vars qw(@ISA @EXPORT);
54 use strict;
55 
58 use Bio::EnsEMBL::Utils::Exception qw(warning throw);
59 use Bio::EnsEMBL::Utils::Argument qw(rearrange);
61 use Bio::EnsEMBL::Utils::Scalar qw/assert_ref/;
62 
64 
65 @EXPORT = (@{$DBI::EXPORT_TAGS{'sql_types'}});
66 
67 our $SLICE_FEATURE_CACHE_SIZE = 4;
68 our $MAX_SPLIT_QUERY_SEQ_REGIONS = 3;
69 our $SILENCE_CACHE_WARNINGS = 0;
70 
71 =head2 new
72 
73  Arg [1] : list of args @args
74  Superclass constructor arguments
75  Example : none
76  Description: Constructor which warns if caching has been switched off
77  Returntype : Bio::EnsEMBL::BaseFeatureAdaptor
78  Exceptions : none
79  Caller : implementing subclass constructors
80  Status : Stable
81 
82 =cut
83 
84 sub new {
85  my ($class, @args) = @_;
86  my $self = $class->SUPER::new(@args);
87  if ( defined $self->db->no_cache() && $self->db->no_cache() && ! $SILENCE_CACHE_WARNINGS) {
88  warning( "You are using the API without caching most recent features. "
89  . "Performance might be affected." );
90  }
91  return $self;
92 }
93 
94 =head2 start_equals_end
95 
96  Arg [1] : (optional) boolean $newval
97  Example : $bfa->start_equals_end(1);
98  Description: Getter/Setter for the start_equals_end flag. If set
99  to true sub _slice_fetch will use a simplified sql to retrieve 1bp slices.
100  Returntype : boolean
101  Exceptions : none
102  Caller : EnsemblGenomes variation DB build
103  Status : Stable
104 
105 =cut
106 
107 sub start_equals_end {
108  my ( $self, $value ) = @_;
109 
110  if ( defined($value) ) {
111  $self->{'start_equals_end'} = $value;
112  }
113  return $self->{'start_equals_end'};
114 }
115 
116 
117 =head2 clear_cache
118 
119  Args : None
120  Example : my $sa =
121  $registry->get_adaptor( 'Mus musculus', 'Core',
122  'Slice' );
123  my $ga =
124  $registry->get_adaptor( 'Mus musculus', 'Core',
125  'Gene' );
126 
127  my $slice =
128  $sa->fetch_by_region( 'Chromosome', '1', 1e8,
129  1.05e8 );
130 
131  my $genes = $ga->fetch_all_by_Slice($slice);
132 
133  $ga->clear_cache();
134 
135  Description : Empties the feature cache associated with this
136  feature adaptor.
137  Return type : None
138  Exceptions : None
139  Caller : General
140  Status : At risk (under development)
141 
142 =cut
143 
144 sub clear_cache {
145  my ($self) = @_;
146  $self->_clear_slice_feature_cache();
147  if(!$self->_no_id_cache()) {
148  $self->_id_cache()->clear_cache();
149  }
150  return;
151 }
152 
153 sub _clear_slice_feature_cache {
154  my ($self) = @_;
155  %{$self->{_slice_feature_cache}} = ();
156  return;
157 }
158 
159 =head2 _slice_feature_cache
160 
161  Description : Returns the feature cache if we are allowed to cache and
162  will build it if we need to. We will never return a reference
163  to the hash to avoid unintentional auto-vivfying caching
164  Returntype : Bio::EnsEMBL::Utils::Cache
165  Exceptions : None
166  Caller : Internal
167 
168 =cut
169 
170 sub _slice_feature_cache {
171  my ($self) = @_;
172  return if $self->db()->no_cache();
173  if(! exists $self->{_slice_feature_cache}) {
174  tie my %cache, 'Bio::EnsEMBL::Utils::Cache', $SLICE_FEATURE_CACHE_SIZE;
175  $self->{_slice_feature_cache} = \%cache;
176  }
177  return $self->{_slice_feature_cache};
178 }
179 
180 =head2 fetch_all_by_Slice
181 
182  Arg [1] : Bio::EnsEMBL::Slice $slice
183  the slice from which to obtain features
184  Arg [2] : (optional) string $logic_name
185  the logic name of the type of features to obtain
186  Example : $fts = $a->fetch_all_by_Slice($slice, 'Swall');
187  Description: Returns a listref of features created from the database
188  which are on the Slice defined by $slice. If $logic_name is
189  defined only features with an analysis of type $logic_name
190  will be returned.
191  NOTE: only features that are entirely on the slice's seq_region
192  will be returned (i.e. if they hang off the start/end of a
193  seq_region they will be discarded). Features can extend over the
194  slice boundaries though (in cases where you have a slice that
195  doesn't span the whole seq_region).
196  Returntype : listref of Bio::EnsEMBL::SeqFeatures in Slice coordinates
197  Exceptions : none
198  Caller : Bio::EnsEMBL::Slice
199  Status : Stable
200 
201 =cut
202 
203 sub fetch_all_by_Slice {
204  my ($self, $slice, $logic_name) = @_;
205  #fetch by constraint with empty constraint
206  return $self->fetch_all_by_Slice_constraint($slice, '', $logic_name);
207 }
208 
209 
210 
211 =head2 fetch_Iterator_by_Slice_method
212 
213  Arg [1] : CODE ref of Slice fetch method
214  Arg [2] : ARRAY ref of parameters for Slice fetch method
215  Arg [3] : Optional int: Slice index in parameters array
216  Arg [4] : Optional int: Slice chunk size. Default=500000
217  Example : my $slice_iter = $feature_adaptor->fetch_Iterator_by_Slice_method
218  ($feature_adaptor->can('fetch_all_by_Slice_Arrays'),
219  \@fetch_method_params,
220  0,#Slice idx
221  );
222 
223  while(my $feature = $slice_iter->next && defined $feature){
224  #Do something here
225  }
226 
227  Description: Creates an Iterator which chunks the query Slice to facilitate
228  large Slice queries which would have previously run out of memory
229  Returntype : Bio::EnsEMBL::Utils::Iterator
230  Exceptions : Throws if mandatory params not valid
231  Caller : general
232  Status : at risk
233 
234 =cut
235 
236 #Does not support Collections. See Funcgen ResultFeatureAdaptor::fetch_collection_Iterator_by_Slice_method
237 
238 sub fetch_Iterator_by_Slice_method{
239  my ($self, $slice_method_ref, $params_ref, $slice_idx, $chunk_size) = @_;
240 
241  if(! ( defined $slice_method_ref &&
242  ref($slice_method_ref) eq 'CODE')
243  ){
244  throw('Must pass a valid Slice fetch method CODE ref');
245  }
246 
247  if (! ($params_ref &&
248  ref($params_ref) eq 'ARRAY')) {
249  #Don't need to check size here so long as we have valid Slice
250  throw('You must pass a method params ARRAYREF');
251  }
252 
253  $slice_idx = 0 if(! defined $slice_idx);
254  my $slice = $params_ref->[$slice_idx];
255  $chunk_size ||= 1000000;
256 
257  my @feat_cache;
258  my $finished = 0;
259  my $start = 1; #local coord for sub slice
260  my $end = $slice->length;
261  my $num_overlaps = 0;
262 
263  my $coderef =
264  sub {
265 
266  while (scalar(@feat_cache) == 0 &&
267  ! $finished) {
268 
269  my $new_end = ($start + $chunk_size - 1);
270 
271  if ($new_end >= $end) {
272  # this is our last chunk
273  $new_end = $end;
274  $finished = 1;
275  }
276 
277  #Chunk by sub slicing
278  my $sub_slice = $slice->sub_Slice($start, $new_end);
279  $params_ref->[$slice_idx] = $sub_slice;
280  @feat_cache = @{ $slice_method_ref->($self, @$params_ref)};
281 
282  #Remove & count overlapping features
283  splice(@feat_cache, 0, $num_overlaps) if($num_overlaps);
284  my $i;
285 
286  if (scalar(@feat_cache) > 0) {
287 
288  my $feat_end = $feat_cache[$#feat_cache]->seq_region_end;
289  my $slice_end = $sub_slice->end;
290  $num_overlaps = 0;
291  for ($i = $#feat_cache; $i >=0; $i--) {
292  $feat_end = $feat_cache[$i]->seq_region_end;
293 
294  if ($feat_end > $slice_end) {
295  $num_overlaps ++;
296  } else {
297  last;
298  }
299 
300  }
301  }
302 
303  # update the start coordinate
304  $start = $new_end + 1;
305  }
306 
307  #this maybe returning from an undef cache
308  #Need to sub this out even more?
309  return shift @feat_cache;
310  };
311 
312  return Bio::EnsEMBL::Utils::Iterator->new($coderef);
313 }
314 
315 
316 =head2 fetch_Iterator_by_Slice
317 
318  Arg [1] : Bio::EnsEMBL::Slice
319  Arg [2] : Optional string: logic name of analysis
320  Arg [3] : Optional int: Chunk size to iterate over. Default is 500000
321  Example : my $slice_iter = $feature_adaptor->fetch_Iterator_by_Slice($slice);
322 
323  while(my $feature = $slice_iter->next && defined $feature){
324  #Do something here
325  }
326 
327  Description: Creates an Iterator which chunks the query Slice to facilitate
328  large Slice queries which would have previously run out of memory
329  Returntype : Bio::EnsEMBL::Utils::Iterator
330  Exceptions : None
331  Caller : general
332  Status : at risk
333 
334 =cut
335 
336 sub fetch_Iterator_by_Slice{
337  my ($self, $slice, $logic_name, $chunk_size) = @_;
338 
339  my $method_ref = $self->can('fetch_all_by_Slice');
340 
341  return $self->fetch_Iterator_by_Slice_method($method_ref, [$slice, $logic_name], 0, $chunk_size);
342 }
343 
344 
345 =head2 fetch_all_by_Slice_and_score
346 
347  Arg [1] : Bio::EnsEMBL::Slice $slice
348  the slice from which to obtain features
349  Arg [2] : (optional) float $score
350  lower bound of the the score of the features retrieved
351  Arg [3] : (optional) string $logic_name
352  the logic name of the type of features to obtain
353  Example : $fts = $a->fetch_all_by_Slice_and_score($slice,90,'Swall');
354  Description: Returns a list of features created from the database which are
355  are on the Slice defined by $slice and which have a score
356  greater than $score. If $logic_name is defined,
357  only features with an analysis of type $logic_name will be
358  returned.
359  Returntype : listref of Bio::EnsEMBL::SeqFeatures in Slice coordinates
360  Exceptions : none
361  Caller : Bio::EnsEMBL::Slice
362  Status : Stable
363 
364 =cut
365 
366 sub fetch_all_by_Slice_and_score {
367  my ( $self, $slice, $score, $logic_name ) = @_;
368 
369  my $constraint;
370  if ( defined($score) ) {
371  # Get the synonym of the primary_table
372  my @tabs = $self->_tables();
373  my $syn = $tabs[0]->[1];
374 
375  $constraint = sprintf( "%s.score > %s",
376  $syn,
377  $self->dbc()->db_handle()->quote( $score, SQL_FLOAT ) );
378  }
379 
380  return
381  $self->fetch_all_by_Slice_constraint( $slice, $constraint,
382  $logic_name );
383 }
384 
385 
386 =head2 fetch_all_by_Slice_constraint
387 
388  Arg [1] : Bio::EnsEMBL::Slice $slice
389  the slice from which to obtain features
390  Arg [2] : (optional) string $constraint
391  An SQL query constraint (i.e. part of the WHERE clause)
392  Arg [3] : (optional) string $logic_name
393  the logic name of the type of features to obtain
394  Example : $fs = $a->fetch_all_by_Slice_constraint($slc, 'perc_ident > 5');
395  Description: Returns a listref of features created from the database which
396  are on the Slice defined by $slice and fulfill the SQL
397  constraint defined by $constraint. If logic name is defined,
398  only features with an analysis of type $logic_name will be
399  returned.
400  Returntype : listref of Bio::EnsEMBL::SeqFeatures in Slice coordinates
401  Exceptions : thrown if $slice is not defined
402  Caller : Bio::EnsEMBL::Slice
403  Status : Stable
404 
405 =cut
406 
407 sub fetch_all_by_Slice_constraint {
408  my ( $self, $slice, $constraint, $logic_name ) = @_;
409 
410  my @result;
411 
412  #Pull out here as we need to remember them & reset accordingly
413  my $bind_params = $self->bind_param_generic_fetch();
414 
415  if ( !ref($slice)
416  || !( $slice->isa('Bio::EnsEMBL::Slice')
417  or $slice->isa('Bio::EnsEMBL::LRGSlice') ) )
418  {
419  throw("Bio::EnsEMBL::Slice argument expected.");
420  }
421 
422  $constraint ||= '';
423  $constraint =
424  $self->_logic_name_to_constraint( $constraint, $logic_name );
425 
426  # If the logic name was invalid, undef was returned
427  if ( !defined($constraint) ) { return [] }
428 
429  my $key;
430  my $cache;
431 
432  # Will only use feature_cache if hasn't been no_cache attribute set
433  if (
434  !( defined( $self->db()->no_cache() ) && $self->db()->no_cache() ) )
435  {
436 
437  #strain test and add to constraint if so to stop caching.
438  if ( $slice->isa('Bio::EnsEMBL::Variation::StrainSlice') ) {
439  my $string =
440  $self->dbc()->db_handle()->quote( $slice->strain_name() );
441 
442  if ( $constraint ne "" ) {
443  $constraint .= " AND $string = $string ";
444  } else {
445  $constraint .= " $string = $string ";
446  }
447  }
448 
449  # Check the cache and return the cached results if we have already
450  # done this query. The cache key is the made up from the slice
451  # name, the constraint, and the bound parameters (if there are any).
452  $key = uc( join( ':', $slice->name(), $constraint ) );
453 
454  if ( defined($bind_params) ) {
455  $key .= ':'
456  . join( ':', map { $_->[0] . '/' . $_->[1] } @{$bind_params} );
457  }
458 
459  $cache = $self->_slice_feature_cache();
460  if ( exists( $cache->{$key} ) ) {
461  # Clear the bound parameters and return the cached data.
462  $self->{'_bind_param_generic_fetch'} = ();
463  #IMPORTANT: NEVER EVER RETURN A COPY OF THE DATA STRUCTURE.
464  # This will hold arrays of values. Since we've been doing
465  # this for so long people are expecting multiple calls
466  # to fetch_by_SliceXXXXX() methods to return the same
467  # array reference.
468  return $cache->{$key};
469  }
470  } ## end if ( !( defined( $self...)))
471 
472 
473  my $proj_ref = $self->_get_and_filter_Slice_projections($slice);
474  my $bounds = $self->_generate_feature_bounds($slice);
475 
476  # fetch features for the primary slice AND all symlinked slices
477  foreach my $seg (@{$proj_ref}) {
478  # re-bind the params
479  $self->_bind_param_generic_fetch($bind_params);
480  my $offset = $seg->from_start();
481  my $seg_slice = $seg->to_Slice();
482  my $features =
483  $self->_slice_fetch( $seg_slice, $constraint );
484 
485  # If this was a symlinked slice offset the feature coordinates as
486  # needed.
487  if ( $seg_slice->name() ne $slice->name() ) {
488 
489  FEATURE:
490  foreach my $f ( @{$features} ) {
491  if ( $offset != 1 ) {
492  $f->{'start'} += $offset - 1;
493  $f->{'end'} += $offset - 1;
494  }
495 
496  # discard boundary crossing features from symlinked regions
497  foreach my $bound (@{$bounds}) {
498  if ( $f->{'start'} < $bound && $f->{'end'} >= $bound ) {
499  next FEATURE;
500  }
501  }
502 
503  $f->{'slice'} = $slice;
504  push( @result, $f );
505  }
506  } else {
507  push( @result, @{$features} );
508  }
509  } ## end foreach my $seg (@proj)
510 
511  # Will only use feature_cache when set attribute no_cache in DBAdaptor
512  if ( defined($key) ) {
513  $cache->{$key} = \@result;
514  }
515 
516  return \@result;
517 } ## end sub fetch_all_by_Slice_constraint
518 
519 
520 =head2 fetch_all_by_logic_name
521 
522  Arg [1] : string $logic_name
523  the logic name of the type of features to obtain
524  Example : $fs = $a->fetch_all_by_logic_name('foobar');
525  Description: Returns a listref of features created from the database.
526  only features with an analysis of type $logic_name will
527  be returned. If the logic name is invalid (not in the
528  analysis table), a reference to an empty list will be
529  returned.
530  Returntype : listref of Bio::EnsEMBL::SeqFeatures
531  Exceptions : thrown if no $logic_name
532  Caller : General
533  Status : Stable
534 
535 =cut
536 
537 sub fetch_all_by_logic_name {
538  my ( $self, $logic_name ) = @_;
539 
540  if ( !defined($logic_name) ) {
541  throw("Need a logic_name");
542  }
543 
544  my $constraint = $self->_logic_name_to_constraint( '', $logic_name );
545 
546  if ( !defined($constraint) ) {
547  warning("Invalid logic name: $logic_name");
548  return [];
549  }
550 
551  return $self->generic_fetch($constraint);
552 }
553 
554 =head2 fetch_all_by_stable_id_list
555 
556  Arg [1] : string $logic_name
557  the logic name of the type of features to obtain
558  Arg [2] : Bio::EnsEMBL::Slice $slice
559  the slice from which to obtain features
560  Example : $fs = $a->fetch_all_by_stable_id_list(["ENSG00001","ENSG00002", ...]);
561  Description: Returns a listref of features identified by their stable IDs.
562  This method only fetches features of the same type as the calling
563  adaptor.
564  Results are constrained to a slice if the slice is provided.
565  Returntype : listref of Bio::EnsEMBL::Feature
566  Exceptions : thrown if no stable ID list is provided.
567  Caller : General
568  Status : Stable
569 
570 =cut
571 
572 # Adapted from BaseAdaptor->uncached_fetch_all_by_dbID_list
573 sub fetch_all_by_stable_id_list {
574  my ( $self, $id_list_ref, $slice ) = @_;
575 
576  return $self->_uncached_fetch_all_by_id_list($id_list_ref,$slice,"stable_id");
577 }
578 
579 # Method that creates an object. Called by the _objs_from_sth() method
580 # in the sub-classes (the various feature adaptors). Overridden by the
581 # feature collection classes.
582 
583 sub _create_feature {
584  my ( $self, $feature_type, $args ) = @_;
585  return $feature_type->new( %{$args} );
586 }
587 
588 # This is the same as the above, but calls the new_fast() constructor of
589 # the feature type.
590 
591 sub _create_feature_fast {
592  my ( $self, $feature_type, $args ) = @_;
593  return $feature_type->new_fast($args);
594 }
595 
596 =head2 count_by_Slice_constraint
597 
598  Arg [1] : Bio::EnsEMBL::Slice
599  Arg [2] : String Custom SQL constraint
600  Arg [3] : String Logic name to search by
601  Description : Finds all features with at least partial overlap to the given
602  slice and sums them up.
603  Explanation of workings with projections:
604 
605  |-------------------------Constraint Slice---------------------------------|
606  | | | |
607  |--Segment 1--| | |
608  | |--Segment 2, on desired Coordinate System--| |
609  | | |---Segment 3----|
610  #Feature 1# #Feature 2# #Feature 3# |
611  | #####################Feature 4#################### |
612  | #Feature 5# | | |
613 
614  Feature 1 is overlapping the original constraint. Counted in Segment 1
615  Feature 2,3 and 4 are counted when inspecting Segment 2
616  Feature 5 is counted in Segment 1
617 
618  Returntype : Integer
619 =cut
620 
621 sub count_by_Slice_constraint {
622  my ($self, $slice, $constraint, $logic_name) = @_;
623 
624  assert_ref($slice, 'Bio::EnsEMBL::Slice', 'slice');
625 
626  my $count = 0;
627 
628  #Remember the bind params
629  my $bind_params = $self->bind_param_generic_fetch();
630 
631  #Table synonym
632  my @tables = $self->_tables;
633  my ($table_name, $table_synonym) = @{ $tables[0] };
634 
635  # Basic constraints limit the feature hits to the starting Slice constraint
636  $constraint ||= '';
637  $constraint = $self->_logic_name_to_constraint( $constraint, $logic_name );
638 
639  return $count if ! defined $constraint;
640 
641  #Query logic
642  my $sa = $slice->adaptor();
643  my $projections = $self->_get_and_filter_Slice_projections($slice);
644 
645  my $segment_count = @{$projections};
646  #Manual loop to support look-ahead/behind
647  for (my $i = 0; $i < $segment_count; $i++) {
648  my $seg = $projections->[$i];
649  my $seg_slice = $seg->to_Slice();
650  $self->_bind_param_generic_fetch($bind_params);
651 
652  # We cannot filter boundary crossing features in code, so we constrain the
653  # SQL. We detect when we are not on the original query Slice and manually
654  # filter by the segment Slice's start and end. If we are on "earlier" (5')
655  # projection segments, we only limit by the *end* and when we are on the later
656  # projection segments, we filter by the *start*.
657 
658  # This is the same as fetch_all_by_Slice_constraint()'s in-memory filtering
659  # except we need to alter projected features in that code with an offset
660  my $local_constraint = $constraint;
661  if ( $seg_slice->name() ne $slice->name() ) {
662  my ($limit_start, $limit_end) = (1,1); #fully constrained by default, flags are reset every iteration
663  if($i == 0) {
664  $limit_start = 0;
665  } elsif ($i == ($segment_count - 1)) {
666  $limit_end = 0; #don't check end as we are on the final projection
667  }
668  $local_constraint .= ' AND ' if $local_constraint;
669  my @conditionals;
670 
671  if($limit_end) {
672  # Restrict features to the end of this projection segment when after the named segment
673  push(@conditionals, sprintf('%1$s.seq_region_end <= %2$d', $table_synonym, $seg_slice->end));
674  }
675  if($limit_start) {
676  #Do not cross the start boundary on this projection segment
677  push(@conditionals, sprintf('%1$s.seq_region_start >= %2$d', $table_synonym, $seg_slice->start));
678  }
679 
680  $local_constraint .= join(q{ AND }, @conditionals);
681  }
682 
683  my $count_array = $self->_get_by_Slice($seg_slice, $local_constraint, 'count');
684  $count += $_ for @$count_array;
685  }
686 
687  return $count;
688 }
689 
690 =head2 _get_and_filter_Slice_projections
691 
692  Arg [1] : Bio::EnsEMBL::Slice
693  Description : Delegates onto SliceAdaptor::fetch_normalized_slice_projection()
694  with filtering on
695  Returntype : ArrayRef Bio::EnsEMBL::ProjectionSegment; Returns an array
696  of projected segments
697 =cut
698 
699 sub _get_and_filter_Slice_projections {
700  my ($self, $slice) = @_;
701  my $sa = $slice->adaptor();
702  my $filter_projections = 1;
703  return $sa->fetch_normalized_slice_projection($slice, $filter_projections);
704 }
705 
706 =head2 _generate_feature_bounds
707 
708  Arg [1] : Bio::EnsEMBL::Slice
709  Description : Performs a projection of Slice and records the bounds
710  of that projection. This can be used later on to restrict
711  Features which overlap into unwanted areas such as
712  regions which exist on another HAP/PAR region.
713 
714  Bounds are defined as projection_start - slice_start + 1.
715  Example : my $bounds = $self->_generate_feature_bounds($slice);
716  Returntype : ArrayRef Integer; Returns the location of the bounds.
717 =cut
718 
719 sub _generate_feature_bounds {
720  my ($self, $slice) = @_;
721  my $sa = $slice->adaptor();
722  # construct list of Hap/PAR boundaries for entire seq region
723  my @bounds;
724 
725  my $sr_id = $slice->get_seq_region_id();
726  my $ent_slice = $sa->fetch_by_seq_region_id($sr_id);
727  if ( $slice->strand() == -1 ) {
728  $ent_slice = $ent_slice->invert();
729  }
730 
731  my @ent_proj = @{ $sa->fetch_normalized_slice_projection($ent_slice) };
732  shift(@ent_proj); # skip first; 1st does not have bounds normally; may change if we ever have a patch a pos 1
733 
734  @bounds = map { $_->from_start() - $slice->start() + 1 } @ent_proj;
735 
736  return \@bounds;
737 }
738 
739 =head2 _get_by_Slice
740  Arg [0] : Bio::EnsEMBL::Slice to find all the features within
741  Arg [1] : SQL constraint string
742  Arg [2] : Type of query to run. Default behaviour is to select, but
743  'count' is also valid
744  Description: Abstracted logic from _slice_fetch
745  Returntype : Listref of Bio::EnsEMBL::Feature, or integers for counting mode
746 =cut
747 
748 sub _get_by_Slice {
749  my ($self, $slice, $orig_constraint, $query_type) = @_;
750 
751  # features can be scattered across multiple coordinate systems
752  my @tables = $self->_tables;
753  my ($table_name, $table_synonym) = @{ $tables[0] };
754  my $mapper;
755  my @feature_coord_systems;
756 
757  my $meta_values = $self->db->get_MetaContainer->list_value_by_key( $table_name."build.level");
758  if ( @$meta_values and $slice->is_toplevel() ) {
759  push @feature_coord_systems, $slice->coord_system();
760  } else {
761  @feature_coord_systems = @{ $self->db->get_MetaCoordContainer->fetch_all_CoordSystems_by_feature_type($table_name)};
762  }
763 
764  my $assembly_mapper_adaptor = $self->db->get_AssemblyMapperAdaptor();
765  my @pan_coord_features;
766 
767 COORD_SYSTEM: foreach my $coord_system (@feature_coord_systems) {
768  my @query_accumulator;
769  # Build up a combination of query constraints that will quickly establish the result set
770  my $constraint = $orig_constraint;
771  if ( $coord_system->equals( $slice->coord_system ) ) {
772  my $max_len = $self->_max_feature_length
773  || $self->db->get_MetaCoordContainer
774  ->fetch_max_length_by_CoordSystem_feature_type( $coord_system,$table_name );
775 
776  my $seq_region_id;
777  if ( $slice->adaptor ) {
778  $seq_region_id = $slice->adaptor->get_seq_region_id($slice);
779  } else {
780  $seq_region_id = $self->db->get_SliceAdaptor->get_seq_region_id($slice);
781  }
782 
783  my @seq_region_ids = ($seq_region_id);
784  while (1) {
785  my $ext_seq_region_id = $self->get_seq_region_id_external($seq_region_id);
786 
787  if ( $ext_seq_region_id == $seq_region_id ) { last }
788 
789  push( @seq_region_ids, $ext_seq_region_id );
790  $seq_region_id = $ext_seq_region_id;
791  }
792 
793  $constraint .= " AND " if ($constraint);
794 
795  $constraint .= ${table_synonym}.".seq_region_id IN (". join( ',', @seq_region_ids ) . ") AND ";
796 
797  #faster query for 1bp slices where SNP data is not compressed
798  if ( $self->start_equals_end && $slice->start == $slice->end ) {
799  $constraint .= " AND ".$table_synonym.".seq_region_start = ".$slice->end .
800  " AND ".$table_synonym.".seq_region_end = ".$slice->start;
801 
802  } else {
803  if ( !$slice->is_circular() ) {
804  # Deal with the default case of a non-circular chromosome.
805  $constraint .= $table_synonym.".seq_region_start <= ".$slice->end." AND "
806  .$table_synonym.".seq_region_end >= ".$slice->start;
807 
808  if ( $max_len ) {
809  my $min_start = $slice->start - $max_len;
810  $constraint .= " AND ".$table_synonym.".seq_region_start >= ".$min_start;
811  }
812 
813  } else {
814  # Deal with the case of a circular chromosome.
815  if ( $slice->start > $slice->end ) {
816  $constraint .= " ( ".$table_synonym.".seq_region_start >= ".$slice->start
817  . " OR ".$table_synonym.".seq_region_start <= ".$slice->end
818  . " OR ".$table_synonym.".seq_region_end >= ".$slice->start
819  . " OR ".$table_synonym.".seq_region_end <= ".$slice->end
820  . " OR ".$table_synonym.".seq_region_start > ".$table_synonym.".seq_region_end)";
821  } else {
822  $constraint .= " ((".$table_synonym.".seq_region_start <= ".$slice->end
823  . " AND ".$table_synonym.".seq_region_end >= ".$slice->start.") "
824  . "OR (".$table_synonym.".seq_region_start > ".$table_synonym.".seq_region_end"
825  . " AND (".$table_synonym.".seq_region_start <= ".$slice->end
826  . " OR ".$table_synonym.".seq_region_end >= ".$slice->start.")))";
827  }
828  }
829  }
830  push @query_accumulator, [$constraint,undef,$slice]; # $mapper intentionally absent here.
831 
832  } else {
833  #coordinate systems do not match
834  $mapper = $assembly_mapper_adaptor->fetch_by_CoordSystems( $slice->coord_system(), $coord_system );
835  next unless defined $mapper;
836 
837  # Get list of coordinates and corresponding internal ids for
838  # regions the slice spans
839  my @coords = $mapper->map( $slice->seq_region_name, $slice->start, $slice->end,
840  $slice->strand, $slice->coord_system );
841 
842  @coords = grep { !$_->isa('Bio::EnsEMBL::Mapper::Gap') } @coords;
843 
844  next COORD_SYSTEM if ( !@coords );
845 
846  my @ids = map { $_->id() } @coords;
847  #coords are now id rather than name
848 
849  if ( @coords > $MAX_SPLIT_QUERY_SEQ_REGIONS && ! $slice->isa('Bio::EnsEMBL::LRGSlice')
850  && $slice->coord_system->name() ne 'lrg') {
851  $constraint = $orig_constraint;
852  my $id_str = join( ',', @ids );
853  $constraint .= " AND " if ($constraint);
854  $constraint .= $table_synonym.".seq_region_id IN ($id_str)";
855 
856  push @query_accumulator, [$constraint,$mapper,$slice];
857  } else {
858  my $max_len = (
859  $self->_max_feature_length()
860  || $self->db->get_MetaCoordContainer
861  ->fetch_max_length_by_CoordSystem_feature_type($coord_system, $table_name)
862  );
863 
864  my $length = @coords;
865  for ( my $i = 0; $i < $length; $i++ ) {
866  $constraint = $orig_constraint;
867  $constraint .= " AND " if ($constraint);
868  $constraint .= $table_synonym.".seq_region_id = "
869  . $ids[$i] . " AND "
870  . $table_synonym.".seq_region_start <= "
871  . $coords[$i]->end() . " AND "
872  . $table_synonym.".seq_region_end >= "
873  . $coords[$i]->start();
874 
875  if ($max_len) {
876  my $min_start = $coords[$i]->start() - $max_len;
877  $constraint .= " AND ".$table_synonym.".seq_region_start >= ".$min_start;
878  }
879 
880  push @query_accumulator, [$constraint,$mapper,$slice];
881  } # end multi-query cycle
882  } # end else
883 
884  } # end else (coord sytems not matching)
885 
886  #Record the bind params if we have to do multiple queries
887  my $bind_params = $self->bind_param_generic_fetch();
888 
889  foreach my $query (@query_accumulator) {
890  my ($local_constraint,$local_mapper,$local_slice) = @$query;
891  $self->_bind_param_generic_fetch($bind_params);
892  if ($query_type and $query_type eq 'count') {
893  push @pan_coord_features, $self->generic_count($local_constraint);
894  }
895  else {
896  my $features = $self->generic_fetch( $local_constraint, $local_mapper, $local_slice );
897  $features = $self->_remap( $features, $local_mapper, $local_slice );
898  push @pan_coord_features, @$features;
899  }
900  }
901  $mapper = undef;
902  } # End foreach
903  $self->{_bind_param_generic_fetch} = undef;
904  return \@pan_coord_features;
905 }
906 #
907 # helper function used by fetch_all_by_Slice_constraint method
908 #
909 sub _slice_fetch {
910  my ( $self, $slice, $orig_constraint ) = @_;
911 
912  my $features = $self->_get_by_Slice($slice,$orig_constraint);
913 
914  return $features;
915 } ## end sub _slice_fetch
916 
917 
918 #for a given seq_region_id, gets the one used in an external database, if present, otherwise, returns the internal one
919 sub get_seq_region_id_external {
920  my ( $self, $sr_id ) = @_;
921  my $cs_a = $self->db()->get_CoordSystemAdaptor();
922  return ( exists( $cs_a->{'_internal_seq_region_mapping'}->{$sr_id} )
923  ? $cs_a->{'_internal_seq_region_mapping'}->{$sr_id}
924  : $sr_id );
925 }
926 
927 #for a given seq_region_id and coord_system, gets the one used in the internal (core) database
928 sub get_seq_region_id_internal{
929  my ( $self, $sr_id ) = @_;
930  my $cs_a = $self->db()->get_CoordSystemAdaptor();
931  return ( exists $cs_a->{'_external_seq_region_mapping'}->{$sr_id}
932  ? $cs_a->{'_external_seq_region_mapping'}->{$sr_id}
933  : $sr_id);
934 }
935 
936 #
937 # Helper function containing some common feature storing functionality
938 #
939 # Given a Feature this will return a copy (or the same feature if no changes
940 # to the feature are needed) of the feature which is relative to the start
941 # of the seq_region it is on. The seq_region_id of the seq_region it is on
942 # is also returned.
943 #
944 # This method will also ensure that the database knows which coordinate
945 # systems that this feature is stored in.
946 #
947 
948 sub _pre_store {
949  my $self = shift;
950  my $feature = shift;
951 
952  if(!ref($feature) || !$feature->isa('Bio::EnsEMBL::Feature')) {
953  throw('Expected Feature argument.');
954  }
955  my $slice = $feature->slice();
956 
957  $self->_check_start_end_strand($feature->start(),$feature->end(),
958  $feature->strand(), $slice);
959 
960 
961  my $db = $self->db();
962 
963  my $slice_adaptor = $db->get_SliceAdaptor();
964 
965  if(!ref($slice) || !($slice->isa('Bio::EnsEMBL::Slice') or $slice->isa('Bio::EnsEMBL::LRGSlice')) ) {
966  throw('Feature must be attached to Slice to be stored.');
967  }
968 
969  # make sure feature coords are relative to start of entire seq_region
970 
971  if($slice->start != 1 || $slice->strand != 1) {
972  #move feature onto a slice of the entire seq_region
973  $slice = $slice_adaptor->fetch_by_region($slice->coord_system->name(),
974  $slice->seq_region_name(),
975  undef, #start
976  undef, #end
977  undef, #strand
978  $slice->coord_system->version());
979 
980  $feature = $feature->transfer($slice);
981 
982  if(!$feature) {
983  throw('Could not transfer Feature to slice of ' .
984  'entire seq_region prior to storing');
985  }
986  }
987 
988  # Ensure this type of feature is known to be stored in this coord system.
989  my $cs = $slice->coord_system;
990 
991  my ($tab) = $self->_tables();
992  my $tabname = $tab->[0];
993 
994  my $mcc = $db->get_MetaCoordContainer();
995 
996  $mcc->add_feature_type($cs, $tabname, $feature->length);
997 
998  my $seq_region_id = $slice_adaptor->get_seq_region_id($slice);
999 
1000  if(!$seq_region_id) {
1001  throw('Feature is associated with seq_region which is not in this DB.');
1002  }
1003 
1004  return ($feature, $seq_region_id);
1005 }
1006 
1007 
1008 # The same function as _pre_store
1009 # This one is used to store user uploaded features in XXX_userdata db
1010 
1011 sub _pre_store_userdata {
1012  my $self = shift;
1013  my $feature = shift;
1014 
1015  if(!ref($feature) || !$feature->isa('Bio::EnsEMBL::Feature')) {
1016  throw('Expected Feature argument.');
1017  }
1018 
1019  my $slice = $feature->slice();
1020  my $slice_adaptor = $slice->adaptor;
1021 
1022  $self->_check_start_end_strand($feature->start(),$feature->end(),
1023  $feature->strand(), $slice);
1024 
1025 
1026  if(!ref($slice) || !($slice->isa('Bio::EnsEMBL::Slice') or $slice->isa('Bio::EnsEMBL::LRGSlice')) ) {
1027  throw('Feature must be attached to Slice to be stored.');
1028  }
1029 
1030  # make sure feature coords are relative to start of entire seq_region
1031 
1032  if($slice->start != 1 || $slice->strand != 1) {
1033  #move feature onto a slice of the entire seq_region
1034  $slice = $slice_adaptor->fetch_by_region($slice->coord_system->name(),
1035  $slice->seq_region_name(),
1036  undef, #start
1037  undef, #end
1038  undef, #strand
1039  $slice->coord_system->version());
1040 
1041  $feature = $feature->transfer($slice);
1042 
1043  if(!$feature) {
1044  throw('Could not transfer Feature to slice of ' .
1045  'entire seq_region prior to storing');
1046  }
1047  }
1048 
1049  # Ensure this type of feature is known to be stored in this coord system.
1050  my $cs = $slice->coord_system;
1051 
1052  my ($tab) = $self->_tables();
1053  my $tabname = $tab->[0];
1054 
1055  my $db = $self->db;
1056  my $mcc = $db->get_MetaCoordContainer();
1057 
1058  $mcc->add_feature_type($cs, $tabname, $feature->length);
1059 
1060  my $seq_region_id = $slice_adaptor->get_seq_region_id($slice);
1061 
1062  if(!$seq_region_id) {
1063  throw('Feature is associated with seq_region which is not in this DB.');
1064  }
1065 
1066  return ($feature, $seq_region_id);
1067 }
1068 
1069 
1070 #
1071 # helper function used to validate start/end/strand and
1072 # hstart/hend/hstrand etc.
1073 #
1074 sub _check_start_end_strand {
1075  my ($self, $start, $end, $strand, $slice) = @_;
1076 
1077  #
1078  # Make sure that the start, end, strand are valid
1079  #
1080  if(int($start) != $start) {
1081  throw("Invalid Feature start [$start]. Must be integer.");
1082  }
1083  if(int($end) != $end) {
1084  throw("Invalid Feature end [$end]. Must be integer.");
1085  }
1086  if(int($strand) != $strand || $strand < -1 || $strand > 1) {
1087  throw("Invalid Feature strand [$strand]. Must be -1, 0 or 1.");
1088  }
1089  if($end < $start && ! (defined $slice && $slice->is_circular())) {
1090  throw("Invalid Feature start/end [$start/$end]. Start must be less " .
1091  "than or equal to end.");
1092  }
1093 
1094  return 1;
1095 }
1096 
1097 
1098 #
1099 # Given a list of features checks if they are in the correct coord system
1100 # by looking at the first features slice. If they are not then they are
1101 # converted and placed on the slice.
1102 #
1103 sub _remap {
1104  my ( $self, $features, $mapper, $slice ) = @_;
1105 
1106  #check if any remapping is actually needed
1107  if(@$features && (!$features->[0]->isa('Bio::EnsEMBL::Feature') ||
1108  $features->[0]->slice == $slice)) {
1109  return $features;
1110  }
1111 
1112  #remapping has not been done, we have to do our own conversion from
1113  #to slice coords
1114 
1115  my @out;
1116 
1117  my $slice_start = $slice->start();
1118  my $slice_end = $slice->end();
1119  my $slice_strand = $slice->strand();
1120  my $slice_cs = $slice->coord_system();
1121 
1122  my ($seq_region_id, $start, $end, $strand);
1123 
1124  my $slice_seq_region_id = $slice->get_seq_region_id();
1125 
1126  foreach my $f (@$features) {
1127  #since feats were obtained in contig coords, attached seq is a contig
1128  my $fslice = $f->slice();
1129  if(!$fslice) {
1130  throw("Feature does not have attached slice.\n");
1131  }
1132  my $fseq_region_id = $fslice->get_seq_region_id();
1133  my $fcs = $fslice->coord_system();
1134 
1135  if(!$slice_cs->equals($fcs)) {
1136  #slice of feature in different coord system, mapping required
1137  # $seq_region comes out as an integer not a string
1138  ($seq_region_id, $start, $end, $strand) =
1139  $mapper->fastmap($fslice->seq_region_name(),$f->start(),$f->end(),$f->strand(),$fcs);
1140 
1141  # undefined start means gap
1142  next if(!defined $start);
1143  } else {
1144  $start = $f->start();
1145  $end = $f->end();
1146  $strand = $f->strand();
1147  $seq_region_id = $fseq_region_id;
1148  }
1149 
1150  # maps to region outside desired area
1151  next if ($start > $slice_end) || ($end < $slice_start) ||
1152  ($slice_seq_region_id != $seq_region_id);
1153 
1154  #shift the feature start, end and strand in one call
1155  if($slice_strand == -1) {
1156  $f->move( $slice_end - $end + 1, $slice_end - $start + 1, $strand * -1 );
1157  } else {
1158  $f->move( $start - $slice_start + 1, $end - $slice_start + 1, $strand );
1159  }
1160 
1161  $f->slice($slice);
1162 
1163  push @out,$f;
1164  }
1165 
1166  return \@out;
1167 }
1168 
1169 #
1170 # get seq region boundary (start|end) for a feature
1171 # the method attempts to retrieve the boundary directly from the db
1172 # return undef if cannot determine the associated feature table
1173 #
1174 sub _seq_region_boundary_from_db {
1175  my ($self, $feature, $boundary) = @_;
1176 
1177  if(!ref($feature) || !$feature->isa('Bio::EnsEMBL::Feature')) {
1178  throw('Expected Feature argument.');
1179  }
1180 
1181  throw "Undefined boundary"
1182  unless defined $boundary;
1183 
1184  $boundary eq 'start' or $boundary eq 'end'
1185  or throw "Wrong boundary: select start|end";
1186 
1187  $boundary = 'seq_region_' . $boundary;
1188 
1189  my $sql_helper = $self->dbc->sql_helper;
1190  throw "Unable to get SqlHelper instance" unless defined $sql_helper;
1191 
1192  my $feature_table = ($self->_tables)[0]->[0];
1193  warning (sprintf "Unable to get %s for %s instance\nCould not find associated feature table, returning undef", $boundary, ref $feature)
1194  and return undef unless defined $feature_table;
1195 
1196  my $db_id = $feature->dbID;
1197  my $attrib_id = $feature_table . '_id';
1198  my $query = "SELECT ${boundary} from ${feature_table} WHERE ${attrib_id} = ${db_id}";
1199 
1200  return $sql_helper->execute_single_result(-SQL => $query);
1201 }
1202 
1203 =head2 store
1204 
1205  Arg [1] : list of Bio::EnsEMBL::SeqFeature
1206  Example : $adaptor->store(@feats);
1207  Description: ABSTRACT Subclasses are responsible for implementing this
1208  method. It should take a list of features and store them in
1209  the database.
1210  Returntype : none
1211  Exceptions : thrown method is not implemented by subclass
1212  Caller : general
1213  Status : At Risk
1214  : throws if called.
1215 
1216 =cut
1217 
1218 sub store{
1219  my $self = @_;
1220 
1221  throw("Abstract method store not defined by implementing subclass\n");
1222 }
1223 
1224 
1225 =head2 remove
1226 
1227  Arg [1] : A feature $feature
1228  Example : $feature_adaptor->remove($feature);
1229  Description: This removes a feature from the database. The table the
1230  feature is removed from is defined by the abstract method
1231  _tablename, and the primary key of the table is assumed
1232  to be _tablename() . '_id'. The feature argument must
1233  be an object implementing the dbID method, and for the
1234  feature to be removed from the database a dbID value must
1235  be returned.
1236  Returntype : none
1237  Exceptions : thrown if $feature arg does not implement dbID(), or if
1238  $feature->dbID is not a true value
1239  Caller : general
1240  Status : Stable
1241 
1242 =cut
1243 
1244 
1245 sub remove {
1246  my ($self, $feature) = @_;
1247 
1248  if(!$feature || !ref($feature) || !$feature->isa('Bio::EnsEMBL::Feature')) {
1249  throw('Feature argument is required');
1250  }
1251 
1252  if(!$feature->is_stored($self->db)) {
1253  throw("This feature is not stored in this database");
1254  }
1255 
1256  my @tabs = $self->_tables;
1257  my ($table) = @{$tabs[0]};
1258 
1259  my $sth = $self->prepare("DELETE FROM $table WHERE ${table}_id = ?");
1260  $sth->bind_param(1,$feature->dbID,SQL_INTEGER);
1261  $sth->execute();
1262 
1263  #unset the feature dbID ad adaptor
1264  $feature->dbID(undef);
1265  $feature->adaptor(undef);
1266 
1267  return;
1268 }
1269 
1270 
1271 =head2 remove_by_Slice
1272 
1273  Arg [1] : Bio::Ensembl::Slice $slice
1274  Example : $feature_adaptor->remove_by_Slice($slice);
1275  Description: This removes features from the database which lie on a region
1276  represented by the passed in slice. Only features which are
1277  fully contained by the slice are deleted; features which overlap
1278  the edge of the slice are not removed.
1279  The table the features are removed from is defined by
1280  the abstract method_tablename.
1281  Returntype : none
1282  Exceptions : thrown if no slice is supplied
1283  Caller : general
1284  Status : Stable
1285 
1286 =cut
1287 
1288 sub remove_by_Slice {
1289  my ($self, $slice) = @_;
1290 
1291  if(!$slice || !ref($slice) || !($slice->isa('Bio::EnsEMBL::Slice') or $slice->isa('Bio::EnsEMBL::LRGSlice')) ) {
1292  throw("Slice argument is required");
1293  }
1294 
1295  my @tabs = $self->_tables;
1296  my ($table_name) = @{$tabs[0]};
1297 
1298  my $seq_region_id = $self->db->get_SliceAdaptor->get_seq_region_id($slice);
1299  my $start = $slice->start();
1300  my $end = $slice->end();
1301 
1302  #
1303  # Delete only features fully on the slice, not overlapping ones
1304  #
1305  my $sth = $self->prepare("DELETE FROM $table_name " .
1306  "WHERE seq_region_id = ? " .
1307  "AND seq_region_start >= ? " .
1308  "AND seq_region_end <= ?");
1309 
1310  $sth->bind_param(1,$seq_region_id,SQL_INTEGER);
1311  $sth->bind_param(2,$start,SQL_INTEGER);
1312  $sth->bind_param(3,$end,SQL_INTEGER);
1313  $sth->execute();
1314  $sth->finish();
1315 }
1316 
1317 
1318 #
1319 # Internal function. Allows the max feature length which is normally
1320 # retrieved from the meta_coord table to be overridden. This allows
1321 # for some significant optimizations to be put in when it is known
1322 # that requested features will not be over a certain size.
1323 #
1324 sub _max_feature_length {
1325  my $self = shift;
1326  return $self->{'_max_feature_length'} = shift if(@_);
1327  return $self->{'_max_feature_length'};
1328 }
1329 
1330 
1331 #
1332 # Lists all seq_region_ids that a particular feature type is found on.
1333 # Useful e.g. for finding out which seq_regions have genes.
1334 # Returns a listref of seq_region_ids.
1335 #
1336 sub _list_seq_region_ids {
1337  my ($self, $table) = @_;
1338 
1339  my @out;
1340 
1341  my $sql = qq(
1342  SELECT DISTINCT
1343  sr.seq_region_id
1344  FROM seq_region sr,
1345  $table a,
1346  coord_system cs
1347  WHERE sr.seq_region_id = a.seq_region_id
1348  AND sr.coord_system_id = cs.coord_system_id
1349  AND cs.species_id = ?);
1350 
1351  my $sth = $self->prepare($sql);
1352 
1353  $sth->bind_param( 1, $self->species_id(), SQL_INTEGER );
1354 
1355  $sth->execute();
1356 
1357  while (my ($id) = $sth->fetchrow) {
1358  push(@out, $id);
1359  }
1360 
1361  $sth->finish;
1362 
1363  return \@out;
1364 }
1365 
1366 
1367 sub remove_by_analysis_id {
1368  my ($self, $analysis_id) = @_;
1369 
1370  $analysis_id or throw("Must call with analysis id");
1371 
1372  my @tabs = $self->_tables;
1373  my ($tablename) = @{$tabs[0]};
1374 
1375  my $sql = "DELETE FROM $tablename WHERE analysis_id = $analysis_id";
1376 # warn "SQL : $sql";
1377 
1378  my $sth = $self->prepare($sql);
1379  $sth->execute();
1380  $sth->finish();
1381 }
1382 
1383 sub remove_by_feature_id {
1384  my ($self, $features_list) = @_;
1385 
1386  my @feats = @$features_list or throw("Must call store with features");
1387 
1388  my @tabs = $self->_tables;
1389  my ($tablename) = @{$tabs[0]};
1390 
1391  my $sql = sprintf "DELETE FROM $tablename WHERE ${tablename}_id IN (%s)", join ', ', @feats;
1392 # warn "SQL : $sql";
1393 
1394  my $sth = $self->prepare($sql);
1395  $sth->execute();
1396  $sth->finish();
1397 }
1398 
1399 =head2 fetch_nearest_by_Feature
1400 
1401  Arg [1] : Reference Feature to start the search from
1402  Description: Searches iteratively outward from the starting feature until a nearby Feature is found
1403  If you require more than one result or more control of which features are returned, see
1404  fetch_all_nearest_by_Feature and fetch_all_by_outward_search. fetch_nearest_by_Feature
1405  is a convenience method.
1406  ReturnType : Bio::EnsEMBL::Feature
1407 =cut
1408 
1409 sub fetch_nearest_by_Feature {
1410  my $self = shift;
1411  my $feature = shift;
1412  my @hits = @{ $self->fetch_all_by_outward_search(-FEATURE => $feature, -MAX_RANGE => 1000000, -RANGE => 1000, -LIMIT => 1) };
1413  return $hits[0] if (scalar @hits > 0);
1414  return;
1415 }
1416 
1417 
1418 =head2 fetch_all_by_outward_search
1419 
1420  Arguments the same as fetch_all_nearest_by_Feature
1421  Arg [0] : -MAX_RANGE : Set an upper limit on the search range, defaults to 10000 bp
1422  Arg [1] : -FEATURE ,Bio::EnsEMBL::Feature : 'Source' Feature to anchor the search for nearest Features
1423  Arg [2] : -SAME_STRAND, Boolean (optional) : Respect the strand of the source Feature with ref, only
1424  returning Features on the same strand.
1425  Arg [3] : -OPPOSITE_STRAND, Boolean (optional) : Find features on the opposite strand of the same
1426  Arg [4] : -DOWNSTREAM/-UPSTREAM, (optional) : Search ONLY downstream or upstream from the source Feature.
1427  Can be omitted for searches in both directions.
1428  Arg [5] : -RANGE, Int : The size of the space to search for Features. Defaults to 1000 as a sensible starting point
1429  Arg [6] : -NOT_OVERLAPPING, Boolean (optional) : Do not return Features that overlap the source Feature
1430  Arg [7] : -FIVE_PRIME, Boolean (optional) : Determine range to a Feature by the 5' end, respecting strand
1431  Arg [8] : -THREE_PRIME, Boolean (optional): Determine range to a Feature by the 3' end, respecting strand
1432  Arg [9] : -LIMIT, Int : The maximum number of Features to return, defaulting to one. Equally near features are all returned
1433 
1434  Description: Searches for features within the suggested -RANGE, and if it finds none, expands the search area
1435  until it satisfies -LIMIT or hits -MAX_RANGE. Useful if you don't know how far away the features
1436  might be, or if dealing with areas of high feature density. In the case of Variation Features, it is
1437  conceivable that a 2000 bp window might contain very many features, resulting in a slow and bloated
1438  response, thus the ability to explore outward in smaller sections can be useful.
1439  Returntype : Listref of [$feature,$distance]
1440 =cut
1441 
1442 sub fetch_all_by_outward_search {
1443  my $self = shift;
1444  my ($ref_feature, $respect_strand, $opposite_strand, $downstream, $upstream, $start_search_range,
1445  $limit,$not_overlapping,$five_prime,$three_prime, $max_range) =
1446  rearrange([qw(FEATURE SAME_STRAND OPPOSITE_STRAND DOWNSTREAM UPSTREAM RANGE LIMIT NOT_OVERLAPPING FIVE_PRIME THREE_PRIME MAX_RANGE)], @_);
1447  my $factor = 1;
1448  $limit ||= 1;
1449  $start_search_range ||= 1000;
1450  my $current_search_range = $start_search_range;
1451  $max_range ||= 10000;
1452  my @results;
1453  while (scalar @results < $limit && $current_search_range < $max_range) {
1454  $current_search_range = $start_search_range * $factor;
1455  @results = @{
1456  $self->fetch_all_nearest_by_Feature(-RANGE => $current_search_range,
1457  -FEATURE => $ref_feature,
1458  -SAME_STRAND => $respect_strand,
1459  -OPPOSITE_STRAND => $opposite_strand,
1460  -DOWNSTREAM => $downstream,
1461  -UPSTREAM => $upstream,
1462  -LIMIT => $limit,
1463  -NOT_OVERLAPPING => $not_overlapping,
1464  -FIVE_PRIME => $five_prime,
1465  -THREE_PRIME => $three_prime,
1466  )
1467  };
1468  $factor++;
1469  }
1470  return \@results;
1471 }
1472 
1473 =head2 fetch_all_nearest_by_Feature
1474 
1475  Arg [1] : -FEATURE ,Bio::EnsEMBL::Feature : 'Source' Feature to anchor the search for nearest Features
1476  Arg [2] : -SAME_STRAND, Boolean (optional): Respect the strand of the source Feature with ref, only
1477  returning Features on the same strand
1478  Arg [3] : -OPPOSITE_STRAND, Boolean (optional) : Find features on the opposite strand of the same
1479  Arg [4] : -DOWNSTREAM/-UPSTREAM, (optional) : Search ONLY downstream or upstream from the source Feature.
1480  Can be omitted for searches in both directions.
1481  Arg [5] : -RANGE, Int : The size of the space to search for Features. Defaults to 1000 as a sensible starting point
1482  Arg [6] : -NOT_OVERLAPPING, Boolean (optional) : Do not return Features that overlap the source Feature
1483  Arg [7] : -FIVE_PRIME, Boolean (optional) : Determine range to a Feature by its 5' end, respecting strand
1484  Arg [8] : -THREE_PRIME, Boolean (optional): Determine range to a Feature by its 3' end, respecting strand
1485  Arg [9] : -LIMIT, Int : The maximum number of Features to return, defaulting to one. Equally near features are all returned
1486  Example : #To fetch the gene(s) with the nearest 5' end:
1487  $genes = $gene_adaptor->fetch_all_nearest_by_Feature(-FEATURE => $feat, -FIVE_PRIME => 1);
1488 
1489  Description: Gets the nearest Features to a given 'source' Feature. The Feature returned and the format of the result
1490  are non-obvious, please read on.
1491 
1492  When looking beyond the boundaries of the source Feature, the distance is measured to the nearest end
1493  of that Feature to the nearby Feature's nearest end.
1494  If Features overlap the source Feature, then they are given a distance of zero but ordered by
1495  their proximity to the centre of the Feature.
1496 
1497  Features are found and prioritised within 1000 base pairs unless a -RANGE is given to the method. Any overlap with
1498  the search region is included, and the results can be restricted to upstream, downstream, forward strand or reverse
1499 
1500  The -FIVE_PRIME and -THREE_PRIME options allow searching for specific ends of nearby features, but still needs
1501  a -DOWN/UPSTREAM value and/or -NOT_OVERLAPPING to fulfil its most common application.
1502 
1503 
1504  Returntype : Listref containing an Arrayref of Bio::EnsEMBL::Feature objects and the distance
1505  [ [$feature, $distance] ... ]
1506  Caller : general
1507 
1508 =cut
1509 sub fetch_all_nearest_by_Feature{
1510  my $self = shift;
1511  my ($ref_feature, $respect_strand, $opposite_strand, $downstream, $upstream, $search_range,$limit,$not_overlapping,$five_prime,$three_prime) =
1512  rearrange([qw(FEATURE SAME_STRAND OPPOSITE_STRAND DOWNSTREAM UPSTREAM RANGE LIMIT NOT_OVERLAPPING FIVE_PRIME THREE_PRIME)], @_);
1513  if ( !defined($search_range)) {
1514  $search_range ||= 1000;
1515  }
1516  $limit ||= 1;
1517 
1518  unless (defined($ref_feature) && $ref_feature->isa('Bio::EnsEMBL::Feature')) {
1519  throw ('fetch_all_nearest_by_Feature method requires a valid Ensembl Feature object to operate');
1520  }
1521  throw ('Do not specify both -upstream and -downstream. This is the same as no arguments') if ($upstream && $downstream);
1522  throw ('Do not use both -SAME_STRAND and -OPPOSITE_STRAND in one query') if ($opposite_strand && $respect_strand);
1523  my ($region_start,$region_end);
1524 
1525  # Define search box. Features overlapping the boundaries are found and considered
1526  my $slice = $ref_feature->feature_Slice;
1527 
1528  my $five_prime_expansion = 0;
1529  my $three_prime_expansion = 0;
1530  if ($upstream) {
1531  $five_prime_expansion = $search_range;
1532  $three_prime_expansion = $slice->length * -1;
1533  # shrinks the slice to exclude the features that do not exist beyond the end of the
1534  # considered feature
1535  }
1536  elsif ($downstream) {
1537  $three_prime_expansion = $search_range;
1538  $five_prime_expansion = $slice->length * -1;
1539  }
1540  else {
1541  $five_prime_expansion = $search_range;
1542  $three_prime_expansion = $search_range;
1543  }
1544 
1545  $slice = $slice->expand( $five_prime_expansion, $three_prime_expansion);
1546  my $sa = $self->db->get_SliceAdaptor;
1547  my @candidates; # Features in the search region
1548  @candidates = @{$self->fetch_all_by_Slice($slice)};
1549  if ($respect_strand) { @candidates = grep {$_->strand == $ref_feature->strand} @candidates }
1550  if ($opposite_strand) { @candidates = grep {$_->strand != $ref_feature->strand} @candidates }
1551  # When user chooses UPSTREAM/DOWNSTREAM && FIVE_PRIME/THREE_PRIME, discount any features with
1552  # their other end in the overlap region. fetch_all_by_Slice cannot filter these out.
1553  # fetch_all_by_Slice_constraint could in principle with the correct constraint.
1554  if (($upstream && $ref_feature->strand == 1) || ($downstream && $ref_feature->strand == -1)) {
1555  if ($five_prime) {
1556  @candidates = grep { ($_->strand == 1) ? $_->seq_region_start < $ref_feature->start : $_->seq_region_end < $ref_feature->start} @candidates;
1557  } elsif ($three_prime) {
1558  @candidates = grep { ($_->strand == 1) ? $_->seq_region_end < $ref_feature->start : $_->seq_region_start < $ref_feature->start} @candidates;
1559  }
1560  }
1561  if (($downstream && $ref_feature->strand == 1)|| ($upstream && $ref_feature->strand == -1)) {
1562  if ($five_prime) {
1563  @candidates = grep { ($_->strand == 1) ? $_->seq_region_start > $ref_feature->end : $_->seq_region_end > $ref_feature->end } @candidates;
1564  } elsif ($three_prime) {
1565  @candidates = grep { ($_->strand == 1) ? $_->seq_region_end > $ref_feature->end : $_->seq_region_start > $ref_feature->end } @candidates;
1566  }
1567  }
1568  # Then sort and prioritise the candidates
1569  my $finalists; # = [[feature, distance, centre-weighted distance, length, dbID],..]
1570  $finalists = $self->select_nearest($ref_feature,\@candidates,$limit,$not_overlapping,$five_prime,$three_prime);
1571  $finalists = [ map { [ splice @$_,0,2 ]} @$finalists ]; # Remove the ugly bits from the sight of users.
1572  return $finalists;
1573 }
1574 
1575 
1576 =head2 select_nearest
1577 
1578  Arg [1] : Bio::Ensembl::Feature, a Feature to find the nearest neighbouring feature to.
1579  Arg [2] : Listref of Features to be considered for nearness.
1580  Arg [3] : Integer, limited number of Features to return. Equally near features are all returned in spite of this limit
1581  Arg [4] : Boolean, Overlapping prohibition. Overlapped Features are forgotten
1582  Arg [5] : Boolean, use the 5' ends of the nearby features for distance calculation
1583  Arg [6] : Boolean, use the 3' ends of the nearby features for distance calculation
1584  Example : $feature_list = $feature_adaptor->select_nearest($ref_feature,\@candidates,$limit,$not_overlapping)
1585  Description: Take a list of possible features, and determine which is nearest. Nearness is a
1586  tricky concept. Beware of using the distance between Features, as it may not be the number you think
1587  it should be.
1588  Returntype : listref of Features ordered by proximity
1589  Caller : BaseFeatureAdaptor->fetch_all_nearest_by_Feature
1590 
1591 =cut
1592 
1593 sub select_nearest {
1594  my $self = shift;
1595  my $ref_feature = shift;
1596  my $candidates = shift;
1597  my $limit = shift;
1598  my $not_overlapping = shift;
1599  my $five_prime = shift;
1600  my $three_prime = shift;
1601 
1602  # Convert circular coordinates to linear ones for distance calculation
1603  my $ref_start = ($ref_feature->start < $ref_feature->end) ? $ref_feature->start : $ref_feature->start - $ref_feature->length; # Not ->end, in case circular
1604  my $ref_end = $ref_feature->end;
1605  my $ref_midpoint = $self->_compute_midpoint($ref_feature);
1606 
1607  my $position_matrix = [];
1608  my $shortest_distance;
1609  my $adjusted_distance; # for when features overlap and we need another metric to order by
1610 
1611  foreach my $neighbour (@$candidates) {
1612  # rearrange coordinates into forward-stranded orientation, plus flatten circular features.
1613  my $neigh_start = ($neighbour->seq_region_start < $neighbour->seq_region_end)
1614  ? $neighbour->seq_region_start : $neighbour->seq_region_start - $neighbour->length; # Not ->end, in case it is circular
1615  my $neigh_midpoint = $self->_compute_midpoint($neighbour);
1616  my $neigh_end = $neighbour->seq_region_end;
1617 
1618  # discard overlaps early if not required.
1619  next if ( $not_overlapping
1620  && (
1621  ( $neigh_start >= $ref_start && $neigh_end <= $ref_end )
1622  || ( $neigh_end <= $ref_end && $neigh_end >= $ref_start )
1623  )
1624  );
1625  my @args = ($ref_start,$ref_midpoint,$ref_end,$neigh_start,$neigh_midpoint,$neigh_end,$neighbour->strand);
1626  if ($five_prime || $three_prime) {
1627  my $five = $five_prime ? 1 : 0; # Choose 5' or 3' end
1628  ($shortest_distance,$adjusted_distance) = $self->_compute_prime_distance($five,@args);
1629  next unless ($shortest_distance || $adjusted_distance);
1630  }
1631  else {
1632  ($shortest_distance,$adjusted_distance) = $self->_compute_nearest_end(@args);
1633  }
1634  push @$position_matrix,[ $neighbour, $shortest_distance, $adjusted_distance, $neighbour->length, $neighbour->display_id] unless ($not_overlapping && $shortest_distance == 0);
1635  }
1636 
1637  # Order by distance, then centre-to-centre distance, then smallest feature first, then an arbitrary ID.
1638  # $position_matrix looks like this:
1639  # [ [ $feature, closest measure of distance, size, dbID ] ]
1640  # Not proud of this line. It re-emits the original array in sorted order to favour better measures of order.
1641  my @ordered_matrix = map { [$_->[0],$_->[1],$_->[2],$_->[3],$_->[4]] }
1642  sort { abs($a->[1]) <=> abs($b->[1]) || abs($a->[2]) <=> abs($b->[2]) || $a->[3] <=> $b->[3] || $a->[4] <=> $b->[4]} @$position_matrix;
1643  # Knock off unwanted hits. Equal distances are included, so more can be returned than asked for.
1644  @ordered_matrix = $self->_discard_excess_features_from_matrix(\@ordered_matrix,$limit);
1645  return \@ordered_matrix;
1646 }
1647 
1648 =head2 _compute_nearest_end
1649 
1650  Arg [1] : Reference feature start
1651  Arg [2] : Reference feature mid-point
1652  Arg [3] : Reference feature end
1653  Arg [4] : Considered feature start
1654  Arg [5] : Considered feature mid-point
1655  Arg [6] : Considered feature end
1656  Example : $distance = $feature_adaptor->_compute_nearest_end($ref_start,$ref_midpoint,$ref_end,$f_start,$f_midpoint,$f_end)
1657  Description: For a given feature, calculate the smallest legitimate distance to a reference feature
1658  Calculate by mid-points to accommodate overlaps
1659  Returntype : Integer distance in base pairs
1660  Caller : BaseFeatureAdaptor->select_nearest()
1661 
1662 =cut
1663 sub _compute_nearest_end {
1664  my ($self,$ref_start,$ref_midpoint,$ref_end,$f_start,$f_midpoint,$f_end) = @_;
1665  my ($effective_distance,$weighted_distance);
1666  if ($ref_start > $f_end ) {
1667  # upstream, no overlap
1668  $effective_distance = $f_end - $ref_start; # negative result for upstream
1669  } elsif ( $ref_end < $f_start ) {
1670  # downstream
1671  $effective_distance = $f_start - $ref_end; # positive result for downstream
1672  } else {
1673  # overlap,
1674  $effective_distance = 0;
1675  $weighted_distance = $f_midpoint - $ref_midpoint;
1676  }
1677  return $effective_distance, $weighted_distance;
1678 }
1679 
1680 =head2 _compute_prime_distance
1681 
1682  Arg [1] : Reference feature start
1683  Arg [2] : Reference feature mid-point
1684  Arg [3] : Reference feature end
1685  Arg [4] : Considered feature start
1686  Arg [5] : Considered feature mid-point
1687  Arg [6] : Considered feature end
1688  Arg [7] : Considered feature strand
1689  Example : $distance,$weighted_centre_distance = $feature_adaptor->_compute_prime_distance($ref_start,$ref_midpoint,$ref_end,$f_start,$f_midpoint,$f_end,$f_strand)
1690  Description: Calculate the smallest distance to the 5' end of the considered feature
1691  Returntype : Integer distance in base pairs or a string warning that the result doesn't mean anything.
1692  Nearest 5' and 3' features shouldn't reside inside the reference Feature
1693  Caller : BaseFeatureAdaptor->select_nearest()
1694 
1695 =cut
1696 sub _compute_prime_distance {
1697  my ($self,$five_prime,$ref_start,$ref_midpoint,$ref_end,$f_start,$f_midpoint,$f_end,$f_strand) = @_;
1698 
1699  my $f_coord;
1700  my $effective_distance;
1701  my $weighted_distance;
1702 
1703  if ($five_prime) {
1704  $f_coord = ($f_strand == 1) ? $f_start : $f_end;
1705  } else { # three prime end required
1706  $f_coord = ($f_strand ==1 ) ? $f_end : $f_start;
1707  }
1708 
1709  if ($ref_start > $f_coord) {
1710  # n' end of feature upstream of reference
1711  $effective_distance = $f_coord - $ref_start;
1712  if ($f_end < $ref_start) {
1713  $weighted_distance = $f_coord - $ref_midpoint;
1714  }
1715  } elsif ($ref_end < $f_coord && $f_start > $ref_end) {
1716  # n' end of feature downstream of reference
1717  $effective_distance = $ref_end - $f_coord;
1718  if ($f_start > $ref_end) {
1719  $weighted_distance = $f_coord - $ref_midpoint;
1720  }
1721  } elsif ($ref_start < $f_coord && $ref_end > $f_coord) {
1722  # total overlap condition
1723  $effective_distance = 0;
1724  $weighted_distance = $f_coord - $ref_midpoint;
1725  }
1726  # If feature's end falls outside of the reference feature, compute distance to reference midpoint.
1727  return $effective_distance, $weighted_distance;
1728 }
1729 
1730 =head2 _compute_midpoint
1731 
1732  Arg [1] : Bio::EnsEMBL::Feature
1733  Example : $middle = $feature_adaptor->_compute_midpoint($feature);
1734  Description: Calculate the mid-point of a Feature. Used for comparing Features that overlap each other
1735  and determining a canonical distance between two Features for the majority of use cases.
1736  Returntype : Integer coordinate rounded down.
1737  Caller : BaseFeatureAdaptor->select_nearest()
1738 
1739 =cut
1740 sub _compute_midpoint {
1741  my $self = shift;
1742  my $feature = shift;
1743  my $start = $feature->seq_region_start;
1744  my $end = $feature->seq_region_end;
1745  my $midpoint;
1746  # consider circular slice
1747  if ($start > $end) {
1748  $midpoint = int($end + ($start - $end) / 2);
1749  } else {
1750  $midpoint = int($start + ($end - $start) / 2);
1751  }
1752  return $midpoint;
1753 }
1754 
1755 sub _discard_excess_features_from_matrix {
1756  my $self = shift;
1757  my $list = shift;
1758  my @ordered_matrix = @$list;
1759  my $limit = shift;
1760  return @ordered_matrix if $#ordered_matrix == 0 || !defined($limit) || $limit > scalar @ordered_matrix;
1761  # cut off excess elements
1762  my @spares = splice @ordered_matrix, $limit, scalar @ordered_matrix - $limit;
1763  # Check nearest distance of the last member of ordered_matrix against spares and include those that match
1764  # This prevents first past the post.
1765  if (scalar @ordered_matrix > 0) {
1766  my $threshold_distance = $ordered_matrix[-1]->[1];
1767  my $i = 0;
1768  while ($i <= $#spares && $spares[$i]->[1] == $threshold_distance) {
1769  # printf "Considering option %s, %s\n",$spares[$i]->[0]->stable_id,$spares[$i]->[1];
1770  push @ordered_matrix, $spares[$i];
1771  $i++;
1772  }
1773 
1774  }
1775  return @ordered_matrix;
1776 }
1777 
1778 1;
EnsEMBL
Definition: Filter.pm:1
map
public map()
Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor
Definition: BaseFeatureAdaptor.pm:24
Bio::EnsEMBL::Feature
Definition: Feature.pm:47
Bio::EnsEMBL::Feature::seq_region_start
public Int seq_region_start()
Bio::EnsEMBL::Slice
Definition: Slice.pm:50
Bio::EnsEMBL::Feature::start
public Int start()
Bio::EnsEMBL::LRGSlice
Definition: LRGSlice.pm:37
Bio::EnsEMBL::Utils::Cache
Definition: Cache.pm:71
Bio::EnsEMBL::Utils::Iterator
Definition: Iterator.pm:44
Bio::EnsEMBL::DBSQL::BaseAdaptor
Definition: BaseAdaptor.pm:71
Bio::EnsEMBL::Utils::Scalar
Definition: Scalar.pm:66
run
public run()
Bio::EnsEMBL::Feature::new
public Bio::EnsEMBL::Feature new()
Bio::EnsEMBL::Mapper::Gap
Definition: Gap.pm:14
Bio::EnsEMBL::Utils::Iterator::new
public Bio::EnsEMBL::Utils::Iterator new()
Bio::EnsEMBL::Storable::new_fast
public Instance new_fast()
Bio
Definition: AltAlleleGroup.pm:4
Bio::EnsEMBL::Utils::Argument
Definition: Argument.pm:34
Bio::EnsEMBL::Utils::Exception
Definition: Exception.pm:68