ensembl-hive  2.7.0
BaseFeatureAdaptor.pm
Go to the documentation of this file.
1 =head1 LICENSE
2 
3 See the NOTICE file distributed with this work for additional information
4 regarding copyright ownership.
5 
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9 
10  http://www.apache.org/licenses/LICENSE-2.0
11 
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 
18 =cut
19 
20 
21 =head1 CONTACT
22 
23  Please email comments or questions to the public Ensembl
24  developers list at <http://lists.ensembl.org/mailman/listinfo/dev>.
25 
26  Questions may also be sent to the Ensembl help desk at
27  <http://www.ensembl.org/Help/Contact>.
28 
29 =cut
30 
31 =head1 NAME
32 
33 Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor - An Abstract Base class for all
34 FeatureAdaptors
35 
36 =head1 SYNOPSIS
37 
38 Abstract class - should not be instantiated. Implementation of
39 abstract methods must be performed by subclasses.
40 
41 =head1 DESCRIPTION
42 
43 This is a base adaptor for feature adaptors. This base class is simply a way
44 of eliminating code duplication through the implementation of methods
45 common to all feature adaptors.
46 
47 =head1 METHODS
48 
49 =cut
50 
51 package Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor;
52 use vars qw(@ISA @EXPORT);
53 use strict;
54 
57 use Bio::EnsEMBL::Utils::Exception qw(warning throw);
58 use Bio::EnsEMBL::Utils::Argument qw(rearrange);
60 use Bio::EnsEMBL::Utils::Scalar qw/assert_ref/;
61 
63 
64 @EXPORT = (@{$DBI::EXPORT_TAGS{'sql_types'}});
65 
66 our $SLICE_FEATURE_CACHE_SIZE = 4;
67 our $MAX_SPLIT_QUERY_SEQ_REGIONS = 3;
68 our $SILENCE_CACHE_WARNINGS = 0;
69 
70 =head2 new
71 
72  Arg [1] : list of args @args
73  Superclass constructor arguments
74  Example : none
75  Description: Constructor which warns if caching has been switched off
76  Returntype : Bio::EnsEMBL::BaseFeatureAdaptor
77  Exceptions : none
78  Caller : implementing subclass constructors
79  Status : Stable
80 
81 =cut
82 
83 sub new {
84  my ($class, @args) = @_;
85  my $self = $class->SUPER::new(@args);
86  if ( defined $self->db->no_cache() && $self->db->no_cache() && ! $SILENCE_CACHE_WARNINGS) {
87  warning( "You are using the API without caching most recent features. "
88  . "Performance might be affected." );
89  }
90  return $self;
91 }
92 
93 =head2 start_equals_end
94 
95  Arg [1] : (optional) boolean $newval
96  Example : $bfa->start_equals_end(1);
97  Description: Getter/Setter for the start_equals_end flag. If set
98  to true sub _slice_fetch will use a simplified sql to retrieve 1bp slices.
99  Returntype : boolean
100  Exceptions : none
101  Caller : EnsemblGenomes variation DB build
102  Status : Stable
103 
104 =cut
105 
106 sub start_equals_end {
107  my ( $self, $value ) = @_;
108 
109  if ( defined($value) ) {
110  $self->{'start_equals_end'} = $value;
111  }
112  return $self->{'start_equals_end'};
113 }
114 
115 
116 =head2 clear_cache
117 
118  Args : None
119  Example : my $sa =
120  $registry->get_adaptor( 'Mus musculus', 'Core',
121  'Slice' );
122  my $ga =
123  $registry->get_adaptor( 'Mus musculus', 'Core',
124  'Gene' );
125 
126  my $slice =
127  $sa->fetch_by_region( 'Chromosome', '1', 1e8,
128  1.05e8 );
129 
130  my $genes = $ga->fetch_all_by_Slice($slice);
131 
132  $ga->clear_cache();
133 
134  Description : Empties the feature cache associated with this
135  feature adaptor.
136  Return type : None
137  Exceptions : None
138  Caller : General
139  Status : At risk (under development)
140 
141 =cut
142 
143 sub clear_cache {
144  my ($self) = @_;
145  $self->_clear_slice_feature_cache();
146  if(!$self->_no_id_cache()) {
147  $self->_id_cache()->clear_cache();
148  }
149  return;
150 }
151 
152 sub _clear_slice_feature_cache {
153  my ($self) = @_;
154  %{$self->{_slice_feature_cache}} = ();
155  return;
156 }
157 
158 =head2 _slice_feature_cache
159 
160  Description : Returns the feature cache if we are allowed to cache and
161  will build it if we need to. We will never return a reference
162  to the hash to avoid unintentional auto-vivfying caching
163  Returntype : Bio::EnsEMBL::Utils::Cache
164  Exceptions : None
165  Caller : Internal
166 
167 =cut
168 
169 sub _slice_feature_cache {
170  my ($self) = @_;
171  return if $self->db()->no_cache();
172  if(! exists $self->{_slice_feature_cache}) {
173  tie my %cache, 'Bio::EnsEMBL::Utils::Cache', $SLICE_FEATURE_CACHE_SIZE;
174  $self->{_slice_feature_cache} = \%cache;
175  }
176  return $self->{_slice_feature_cache};
177 }
178 
179 =head2 fetch_all_by_Slice
180 
181  Arg [1] : Bio::EnsEMBL::Slice $slice
182  the slice from which to obtain features
183  Arg [2] : (optional) string $logic_name
184  the logic name of the type of features to obtain
185  Example : $fts = $a->fetch_all_by_Slice($slice, 'Swall');
186  Description: Returns a listref of features created from the database
187  which are on the Slice defined by $slice. If $logic_name is
188  defined only features with an analysis of type $logic_name
189  will be returned.
190  NOTE: only features that are entirely on the slice's seq_region
191  will be returned (i.e. if they hang off the start/end of a
192  seq_region they will be discarded). Features can extend over the
193  slice boundaries though (in cases where you have a slice that
194  doesn't span the whole seq_region).
195  Returntype : listref of Bio::EnsEMBL::SeqFeatures in Slice coordinates
196  Exceptions : none
197  Caller : Bio::EnsEMBL::Slice
198  Status : Stable
199 
200 =cut
201 
202 sub fetch_all_by_Slice {
203  my ($self, $slice, $logic_name) = @_;
204  #fetch by constraint with empty constraint
205  return $self->fetch_all_by_Slice_constraint($slice, '', $logic_name);
206 }
207 
208 
209 
210 =head2 fetch_Iterator_by_Slice_method
211 
212  Arg [1] : CODE ref of Slice fetch method
213  Arg [2] : ARRAY ref of parameters for Slice fetch method
214  Arg [3] : Optional int: Slice index in parameters array
215  Arg [4] : Optional int: Slice chunk size. Default=500000
216  Example : my $slice_iter = $feature_adaptor->fetch_Iterator_by_Slice_method
217  ($feature_adaptor->can('fetch_all_by_Slice_Arrays'),
218  \@fetch_method_params,
219  0,#Slice idx
220  );
221 
222  while(my $feature = $slice_iter->next && defined $feature){
223  #Do something here
224  }
225 
226  Description: Creates an Iterator which chunks the query Slice to facilitate
227  large Slice queries which would have previously run out of memory
228  Returntype : Bio::EnsEMBL::Utils::Iterator
229  Exceptions : Throws if mandatory params not valid
230  Caller : general
231  Status : at risk
232 
233 =cut
234 
235 #Does not support Collections. See Funcgen ResultFeatureAdaptor::fetch_collection_Iterator_by_Slice_method
236 
237 sub fetch_Iterator_by_Slice_method{
238  my ($self, $slice_method_ref, $params_ref, $slice_idx, $chunk_size) = @_;
239 
240  if(! ( defined $slice_method_ref &&
241  ref($slice_method_ref) eq 'CODE')
242  ){
243  throw('Must pass a valid Slice fetch method CODE ref');
244  }
245 
246  if (! ($params_ref &&
247  ref($params_ref) eq 'ARRAY')) {
248  #Don't need to check size here so long as we have valid Slice
249  throw('You must pass a method params ARRAYREF');
250  }
251 
252  $slice_idx = 0 if(! defined $slice_idx);
253  my $slice = $params_ref->[$slice_idx];
254  $chunk_size ||= 1000000;
255 
256  my @feat_cache;
257  my $finished = 0;
258  my $start = 1; #local coord for sub slice
259  my $end = $slice->length;
260  my $num_overlaps = 0;
261 
262  my $coderef =
263  sub {
264 
265  while (scalar(@feat_cache) == 0 &&
266  ! $finished) {
267 
268  my $new_end = ($start + $chunk_size - 1);
269 
270  if ($new_end >= $end) {
271  # this is our last chunk
272  $new_end = $end;
273  $finished = 1;
274  }
275 
276  #Chunk by sub slicing
277  my $sub_slice = $slice->sub_Slice($start, $new_end);
278  $params_ref->[$slice_idx] = $sub_slice;
279  @feat_cache = @{ $slice_method_ref->($self, @$params_ref)};
280 
281  #Remove & count overlapping features
282  splice(@feat_cache, 0, $num_overlaps) if($num_overlaps);
283  my $i;
284 
285  if (scalar(@feat_cache) > 0) {
286 
287  my $feat_end = $feat_cache[$#feat_cache]->seq_region_end;
288  my $slice_end = $sub_slice->end;
289  $num_overlaps = 0;
290  for ($i = $#feat_cache; $i >=0; $i--) {
291  $feat_end = $feat_cache[$i]->seq_region_end;
292 
293  if ($feat_end > $slice_end) {
294  $num_overlaps ++;
295  } else {
296  last;
297  }
298 
299  }
300  }
301 
302  # update the start coordinate
303  $start = $new_end + 1;
304  }
305 
306  #this maybe returning from an undef cache
307  #Need to sub this out even more?
308  return shift @feat_cache;
309  };
310 
311  return Bio::EnsEMBL::Utils::Iterator->new($coderef);
312 }
313 
314 
315 =head2 fetch_Iterator_by_Slice
316 
317  Arg [1] : Bio::EnsEMBL::Slice
318  Arg [2] : Optional string: logic name of analysis
319  Arg [3] : Optional int: Chunk size to iterate over. Default is 500000
320  Example : my $slice_iter = $feature_adaptor->fetch_Iterator_by_Slice($slice);
321 
322  while(my $feature = $slice_iter->next && defined $feature){
323  #Do something here
324  }
325 
326  Description: Creates an Iterator which chunks the query Slice to facilitate
327  large Slice queries which would have previously run out of memory
328  Returntype : Bio::EnsEMBL::Utils::Iterator
329  Exceptions : None
330  Caller : general
331  Status : at risk
332 
333 =cut
334 
335 sub fetch_Iterator_by_Slice{
336  my ($self, $slice, $logic_name, $chunk_size) = @_;
337 
338  my $method_ref = $self->can('fetch_all_by_Slice');
339 
340  return $self->fetch_Iterator_by_Slice_method($method_ref, [$slice, $logic_name], 0, $chunk_size);
341 }
342 
343 
344 =head2 fetch_all_by_Slice_and_score
345 
346  Arg [1] : Bio::EnsEMBL::Slice $slice
347  the slice from which to obtain features
348  Arg [2] : (optional) float $score
349  lower bound of the the score of the features retrieved
350  Arg [3] : (optional) string $logic_name
351  the logic name of the type of features to obtain
352  Example : $fts = $a->fetch_all_by_Slice_and_score($slice,90,'Swall');
353  Description: Returns a list of features created from the database which are
354  are on the Slice defined by $slice and which have a score
355  greater than $score. If $logic_name is defined,
356  only features with an analysis of type $logic_name will be
357  returned.
358  Returntype : listref of Bio::EnsEMBL::SeqFeatures in Slice coordinates
359  Exceptions : none
360  Caller : Bio::EnsEMBL::Slice
361  Status : Stable
362 
363 =cut
364 
365 sub fetch_all_by_Slice_and_score {
366  my ( $self, $slice, $score, $logic_name ) = @_;
367 
368  my $constraint;
369  if ( defined($score) ) {
370  # Get the synonym of the primary_table
371  my @tabs = $self->_tables();
372  my $syn = $tabs[0]->[1];
373 
374  $constraint = sprintf( "%s.score > %s",
375  $syn,
376  $self->dbc()->db_handle()->quote( $score, SQL_FLOAT ) );
377  }
378 
379  return
380  $self->fetch_all_by_Slice_constraint( $slice, $constraint,
381  $logic_name );
382 }
383 
384 
385 =head2 fetch_all_by_Slice_constraint
386 
387  Arg [1] : Bio::EnsEMBL::Slice $slice
388  the slice from which to obtain features
389  Arg [2] : (optional) string $constraint
390  An SQL query constraint (i.e. part of the WHERE clause)
391  Arg [3] : (optional) string $logic_name
392  the logic name of the type of features to obtain
393  Example : $fs = $a->fetch_all_by_Slice_constraint($slc, 'perc_ident > 5');
394  Description: Returns a listref of features created from the database which
395  are on the Slice defined by $slice and fulfill the SQL
396  constraint defined by $constraint. If logic name is defined,
397  only features with an analysis of type $logic_name will be
398  returned.
399  Returntype : listref of Bio::EnsEMBL::SeqFeatures in Slice coordinates
400  Exceptions : thrown if $slice is not defined
401  Caller : Bio::EnsEMBL::Slice
402  Status : Stable
403 
404 =cut
405 
406 sub fetch_all_by_Slice_constraint {
407  my ( $self, $slice, $constraint, $logic_name ) = @_;
408 
409  my @result;
410 
411  #Pull out here as we need to remember them & reset accordingly
412  my $bind_params = $self->bind_param_generic_fetch();
413 
414  if ( !ref($slice)
415  || !( $slice->isa('Bio::EnsEMBL::Slice')
416  or $slice->isa('Bio::EnsEMBL::LRGSlice') ) )
417  {
418  throw("Bio::EnsEMBL::Slice argument expected.");
419  }
420 
421  $constraint ||= '';
422  $constraint =
423  $self->_logic_name_to_constraint( $constraint, $logic_name );
424 
425  # If the logic name was invalid, undef was returned
426  if ( !defined($constraint) ) { return [] }
427 
428  my $key;
429  my $cache;
430 
431  # Will only use feature_cache if hasn't been no_cache attribute set
432  if (
433  !( defined( $self->db()->no_cache() ) && $self->db()->no_cache() ) )
434  {
435 
436  #strain test and add to constraint if so to stop caching.
437  if ( $slice->isa('Bio::EnsEMBL::Variation::StrainSlice') ) {
438  my $string =
439  $self->dbc()->db_handle()->quote( $slice->strain_name() );
440 
441  if ( $constraint ne "" ) {
442  $constraint .= " AND $string = $string ";
443  } else {
444  $constraint .= " $string = $string ";
445  }
446  }
447 
448  # Check the cache and return the cached results if we have already
449  # done this query. The cache key is the made up from the slice
450  # name, the constraint, and the bound parameters (if there are any).
451  $key = uc( join( ':', $slice->name(), $constraint ) );
452 
453  if ( defined($bind_params) ) {
454  $key .= ':'
455  . join( ':', map { $_->[0] . '/' . $_->[1] } @{$bind_params} );
456  }
457 
458  $cache = $self->_slice_feature_cache();
459  if ( exists( $cache->{$key} ) ) {
460  # Clear the bound parameters and return the cached data.
461  $self->{'_bind_param_generic_fetch'} = ();
462  #IMPORTANT: NEVER EVER RETURN A COPY OF THE DATA STRUCTURE.
463  # This will hold arrays of values. Since we've been doing
464  # this for so long people are expecting multiple calls
465  # to fetch_by_SliceXXXXX() methods to return the same
466  # array reference.
467  return $cache->{$key};
468  }
469  } ## end if ( !( defined( $self...)))
470 
471 
472  my $proj_ref = $self->_get_and_filter_Slice_projections($slice);
473  my $bounds = $self->_generate_feature_bounds($slice);
474 
475  # fetch features for the primary slice AND all symlinked slices
476  foreach my $seg (@{$proj_ref}) {
477  # re-bind the params
478  $self->_bind_param_generic_fetch($bind_params);
479  my $offset = $seg->from_start();
480  my $seg_slice = $seg->to_Slice();
481  my $features =
482  $self->_slice_fetch( $seg_slice, $constraint );
483 
484  # If this was a symlinked slice offset the feature coordinates as
485  # needed.
486  if ( $seg_slice->name() ne $slice->name() ) {
487 
488  FEATURE:
489  foreach my $f ( @{$features} ) {
490  if ( $offset != 1 ) {
491  $f->{'start'} += $offset - 1;
492  $f->{'end'} += $offset - 1;
493  }
494 
495  # discard boundary crossing features from symlinked regions
496  foreach my $bound (@{$bounds}) {
497  if ( $f->{'start'} < $bound && $f->{'end'} >= $bound ) {
498  next FEATURE;
499  }
500  }
501 
502  $f->{'slice'} = $slice;
503  push( @result, $f );
504  }
505  } else {
506  push( @result, @{$features} );
507  }
508  } ## end foreach my $seg (@proj)
509 
510  # Will only use feature_cache when set attribute no_cache in DBAdaptor
511  if ( defined($key) ) {
512  $cache->{$key} = \@result;
513  }
514 
515  return \@result;
516 } ## end sub fetch_all_by_Slice_constraint
517 
518 
519 =head2 fetch_all_by_logic_name
520 
521  Arg [1] : string $logic_name
522  the logic name of the type of features to obtain
523  Example : $fs = $a->fetch_all_by_logic_name('foobar');
524  Description: Returns a listref of features created from the database.
525  only features with an analysis of type $logic_name will
526  be returned. If the logic name is invalid (not in the
527  analysis table), a reference to an empty list will be
528  returned.
529  Returntype : listref of Bio::EnsEMBL::SeqFeatures
530  Exceptions : thrown if no $logic_name
531  Caller : General
532  Status : Stable
533 
534 =cut
535 
536 sub fetch_all_by_logic_name {
537  my ( $self, $logic_name ) = @_;
538 
539  if ( !defined($logic_name) ) {
540  throw("Need a logic_name");
541  }
542 
543  my $constraint = $self->_logic_name_to_constraint( '', $logic_name );
544 
545  if ( !defined($constraint) ) {
546  warning("Invalid logic name: $logic_name");
547  return [];
548  }
549 
550  return $self->generic_fetch($constraint);
551 }
552 
553 =head2 fetch_all_by_stable_id_list
554 
555  Arg [1] : string $logic_name
556  the logic name of the type of features to obtain
557  Arg [2] : Bio::EnsEMBL::Slice $slice
558  the slice from which to obtain features
559  Example : $fs = $a->fetch_all_by_stable_id_list(["ENSG00001","ENSG00002", ...]);
560  Description: Returns a listref of features identified by their stable IDs.
561  This method only fetches features of the same type as the calling
562  adaptor.
563  Results are constrained to a slice if the slice is provided.
564  Returntype : listref of Bio::EnsEMBL::Feature
565  Exceptions : thrown if no stable ID list is provided.
566  Caller : General
567  Status : Stable
568 
569 =cut
570 
571 # Adapted from BaseAdaptor->uncached_fetch_all_by_dbID_list
572 sub fetch_all_by_stable_id_list {
573  my ( $self, $id_list_ref, $slice ) = @_;
574 
575  return $self->_uncached_fetch_all_by_id_list($id_list_ref,$slice,"stable_id");
576 }
577 
578 # Method that creates an object. Called by the _objs_from_sth() method
579 # in the sub-classes (the various feature adaptors). Overridden by the
580 # feature collection classes.
581 
582 sub _create_feature {
583  my ( $self, $feature_type, $args ) = @_;
584  return $feature_type->new( %{$args} );
585 }
586 
587 # This is the same as the above, but calls the new_fast() constructor of
588 # the feature type.
589 
590 sub _create_feature_fast {
591  my ( $self, $feature_type, $args ) = @_;
592  return $feature_type->new_fast($args);
593 }
594 
595 =head2 count_by_Slice_constraint
596 
597  Arg [1] : Bio::EnsEMBL::Slice
598  Arg [2] : String Custom SQL constraint
599  Arg [3] : String Logic name to search by
600  Description : Finds all features with at least partial overlap to the given
601  slice and sums them up.
602  Explanation of workings with projections:
603 
604  |-------------------------Constraint Slice---------------------------------|
605  | | | |
606  |--Segment 1--| | |
607  | |--Segment 2, on desired Coordinate System--| |
608  | | |---Segment 3----|
609  #Feature 1# #Feature 2# #Feature 3# |
610  | #####################Feature 4#################### |
611  | #Feature 5# | | |
612 
613  Feature 1 is overlapping the original constraint. Counted in Segment 1
614  Feature 2,3 and 4 are counted when inspecting Segment 2
615  Feature 5 is counted in Segment 1
616 
617  Returntype : Integer
618 =cut
619 
620 sub count_by_Slice_constraint {
621  my ($self, $slice, $constraint, $logic_name) = @_;
622 
623  assert_ref($slice, 'Bio::EnsEMBL::Slice', 'slice');
624 
625  my $count = 0;
626 
627  #Remember the bind params
628  my $bind_params = $self->bind_param_generic_fetch();
629 
630  #Table synonym
631  my @tables = $self->_tables;
632  my ($table_name, $table_synonym) = @{ $tables[0] };
633 
634  # Basic constraints limit the feature hits to the starting Slice constraint
635  $constraint ||= '';
636  $constraint = $self->_logic_name_to_constraint( $constraint, $logic_name );
637 
638  return $count if ! defined $constraint;
639 
640  #Query logic
641  my $sa = $slice->adaptor();
642  my $projections = $self->_get_and_filter_Slice_projections($slice);
643 
644  my $segment_count = @{$projections};
645  #Manual loop to support look-ahead/behind
646  for (my $i = 0; $i < $segment_count; $i++) {
647  my $seg = $projections->[$i];
648  my $seg_slice = $seg->to_Slice();
649  $self->_bind_param_generic_fetch($bind_params);
650 
651  # We cannot filter boundary crossing features in code, so we constrain the
652  # SQL. We detect when we are not on the original query Slice and manually
653  # filter by the segment Slice's start and end. If we are on "earlier" (5')
654  # projection segments, we only limit by the *end* and when we are on the later
655  # projection segments, we filter by the *start*.
656 
657  # This is the same as fetch_all_by_Slice_constraint()'s in-memory filtering
658  # except we need to alter projected features in that code with an offset
659  my $local_constraint = $constraint;
660  if ( $seg_slice->name() ne $slice->name() ) {
661  my ($limit_start, $limit_end) = (1,1); #fully constrained by default, flags are reset every iteration
662  if($i == 0) {
663  $limit_start = 0;
664  } elsif ($i == ($segment_count - 1)) {
665  $limit_end = 0; #don't check end as we are on the final projection
666  }
667  $local_constraint .= ' AND ' if $local_constraint;
668  my @conditionals;
669 
670  if($limit_end) {
671  # Restrict features to the end of this projection segment when after the named segment
672  push(@conditionals, sprintf('%1$s.seq_region_end <= %2$d', $table_synonym, $seg_slice->end));
673  }
674  if($limit_start) {
675  #Do not cross the start boundary on this projection segment
676  push(@conditionals, sprintf('%1$s.seq_region_start >= %2$d', $table_synonym, $seg_slice->start));
677  }
678 
679  $local_constraint .= join(q{ AND }, @conditionals);
680  }
681 
682  my $count_array = $self->_get_by_Slice($seg_slice, $local_constraint, 'count');
683  $count += $_ for @$count_array;
684  }
685 
686  return $count;
687 }
688 
689 =head2 _get_and_filter_Slice_projections
690 
691  Arg [1] : Bio::EnsEMBL::Slice
692  Description : Delegates onto SliceAdaptor::fetch_normalized_slice_projection()
693  with filtering on
694  Returntype : ArrayRef Bio::EnsEMBL::ProjectionSegment; Returns an array
695  of projected segments
696 =cut
697 
698 sub _get_and_filter_Slice_projections {
699  my ($self, $slice) = @_;
700  my $sa = $slice->adaptor();
701  my $filter_projections = 1;
702  return $sa->fetch_normalized_slice_projection($slice, $filter_projections);
703 }
704 
705 =head2 _generate_feature_bounds
706 
707  Arg [1] : Bio::EnsEMBL::Slice
708  Description : Performs a projection of Slice and records the bounds
709  of that projection. This can be used later on to restrict
710  Features which overlap into unwanted areas such as
711  regions which exist on another HAP/PAR region.
712 
713  Bounds are defined as projection_start - slice_start + 1.
714  Example : my $bounds = $self->_generate_feature_bounds($slice);
715  Returntype : ArrayRef Integer; Returns the location of the bounds.
716 =cut
717 
718 sub _generate_feature_bounds {
719  my ($self, $slice) = @_;
720  my $sa = $slice->adaptor();
721  # construct list of Hap/PAR boundaries for entire seq region
722  my @bounds;
723 
724  my $sr_id = $slice->get_seq_region_id();
725  my $ent_slice = $sa->fetch_by_seq_region_id($sr_id);
726  if ( $slice->strand() == -1 ) {
727  $ent_slice = $ent_slice->invert();
728  }
729 
730  my @ent_proj = @{ $sa->fetch_normalized_slice_projection($ent_slice) };
731  shift(@ent_proj); # skip first; 1st does not have bounds normally; may change if we ever have a patch a pos 1
732 
733  @bounds = map { $_->from_start() - $slice->start() + 1 } @ent_proj;
734 
735  return \@bounds;
736 }
737 
738 =head2 _get_by_Slice
739  Arg [0] : Bio::EnsEMBL::Slice to find all the features within
740  Arg [1] : SQL constraint string
741  Arg [2] : Type of query to run. Default behaviour is to select, but
742  'count' is also valid
743  Description: Abstracted logic from _slice_fetch
744  Returntype : Listref of Bio::EnsEMBL::Feature, or integers for counting mode
745 =cut
746 
747 sub _get_by_Slice {
748  my ($self, $slice, $orig_constraint, $query_type) = @_;
749 
750  # features can be scattered across multiple coordinate systems
751  my @tables = $self->_tables;
752  my ($table_name, $table_synonym) = @{ $tables[0] };
753  my $mapper;
754  my @feature_coord_systems;
755 
756  my $meta_values = $self->db->get_MetaContainer->list_value_by_key( $table_name."build.level");
757  if ( @$meta_values and $slice->is_toplevel() ) {
758  push @feature_coord_systems, $slice->coord_system();
759  } else {
760  @feature_coord_systems = @{ $self->db->get_MetaCoordContainer->fetch_all_CoordSystems_by_feature_type($table_name)};
761  }
762 
763  my $assembly_mapper_adaptor = $self->db->get_AssemblyMapperAdaptor();
764  my @pan_coord_features;
765 
766 COORD_SYSTEM: foreach my $coord_system (@feature_coord_systems) {
767  my @query_accumulator;
768  # Build up a combination of query constraints that will quickly establish the result set
769  my $constraint = $orig_constraint;
770  if ( $coord_system->equals( $slice->coord_system ) ) {
771  my $max_len = $self->_max_feature_length
772  || $self->db->get_MetaCoordContainer
773  ->fetch_max_length_by_CoordSystem_feature_type( $coord_system,$table_name );
774 
775  my $seq_region_id;
776  if ( $slice->adaptor ) {
777  $seq_region_id = $slice->adaptor->get_seq_region_id($slice);
778  } else {
779  $seq_region_id = $self->db->get_SliceAdaptor->get_seq_region_id($slice);
780  }
781 
782  my @seq_region_ids = ($seq_region_id);
783  while (1) {
784  my $ext_seq_region_id = $self->get_seq_region_id_external($seq_region_id);
785 
786  if ( $ext_seq_region_id == $seq_region_id ) { last }
787 
788  push( @seq_region_ids, $ext_seq_region_id );
789  $seq_region_id = $ext_seq_region_id;
790  }
791 
792  $constraint .= " AND " if ($constraint);
793 
794  $constraint .= ${table_synonym}.".seq_region_id IN (". join( ',', @seq_region_ids ) . ") AND ";
795 
796  #faster query for 1bp slices where SNP data is not compressed
797  if ( $self->start_equals_end && $slice->start == $slice->end ) {
798  $constraint .= " AND ".$table_synonym.".seq_region_start = ".$slice->end .
799  " AND ".$table_synonym.".seq_region_end = ".$slice->start;
800 
801  } else {
802  if ( !$slice->is_circular() ) {
803  # Deal with the default case of a non-circular chromosome.
804  $constraint .= $table_synonym.".seq_region_start <= ".$slice->end." AND "
805  .$table_synonym.".seq_region_end >= ".$slice->start;
806 
807  if ( $max_len ) {
808  my $min_start = $slice->start - $max_len;
809  $constraint .= " AND ".$table_synonym.".seq_region_start >= ".$min_start;
810  }
811 
812  } else {
813  # Deal with the case of a circular chromosome.
814  if ( $slice->start > $slice->end ) {
815  $constraint .= " ( ".$table_synonym.".seq_region_start >= ".$slice->start
816  . " OR ".$table_synonym.".seq_region_start <= ".$slice->end
817  . " OR ".$table_synonym.".seq_region_end >= ".$slice->start
818  . " OR ".$table_synonym.".seq_region_end <= ".$slice->end
819  . " OR ".$table_synonym.".seq_region_start > ".$table_synonym.".seq_region_end)";
820  } else {
821  $constraint .= " ((".$table_synonym.".seq_region_start <= ".$slice->end
822  . " AND ".$table_synonym.".seq_region_end >= ".$slice->start.") "
823  . "OR (".$table_synonym.".seq_region_start > ".$table_synonym.".seq_region_end"
824  . " AND (".$table_synonym.".seq_region_start <= ".$slice->end
825  . " OR ".$table_synonym.".seq_region_end >= ".$slice->start.")))";
826  }
827  }
828  }
829  push @query_accumulator, [$constraint,undef,$slice]; # $mapper intentionally absent here.
830 
831  } else {
832  #coordinate systems do not match
833  $mapper = $assembly_mapper_adaptor->fetch_by_CoordSystems( $slice->coord_system(), $coord_system );
834  next unless defined $mapper;
835 
836  # Get list of coordinates and corresponding internal ids for
837  # regions the slice spans
838  my @coords = $mapper->map( $slice->seq_region_name, $slice->start, $slice->end,
839  $slice->strand, $slice->coord_system );
840 
841  @coords = grep { !$_->isa('Bio::EnsEMBL::Mapper::Gap') } @coords;
842 
843  next COORD_SYSTEM if ( !@coords );
844 
845  my @ids = map { $_->id() } @coords;
846  #coords are now id rather than name
847 
848  if ( @coords > $MAX_SPLIT_QUERY_SEQ_REGIONS && ! $slice->isa('Bio::EnsEMBL::LRGSlice')
849  && $slice->coord_system->name() ne 'lrg') {
850  $constraint = $orig_constraint;
851  my $id_str = join( ',', @ids );
852  $constraint .= " AND " if ($constraint);
853  $constraint .= $table_synonym.".seq_region_id IN ($id_str)";
854 
855  push @query_accumulator, [$constraint,$mapper,$slice];
856  } else {
857  my $max_len = (
858  $self->_max_feature_length()
859  || $self->db->get_MetaCoordContainer
860  ->fetch_max_length_by_CoordSystem_feature_type($coord_system, $table_name)
861  );
862 
863  my $length = @coords;
864  for ( my $i = 0; $i < $length; $i++ ) {
865  $constraint = $orig_constraint;
866  $constraint .= " AND " if ($constraint);
867  $constraint .= $table_synonym.".seq_region_id = "
868  . $ids[$i] . " AND "
869  . $table_synonym.".seq_region_start <= "
870  . $coords[$i]->end() . " AND "
871  . $table_synonym.".seq_region_end >= "
872  . $coords[$i]->start();
873 
874  if ($max_len) {
875  my $min_start = $coords[$i]->start() - $max_len;
876  $constraint .= " AND ".$table_synonym.".seq_region_start >= ".$min_start;
877  }
878 
879  push @query_accumulator, [$constraint,$mapper,$slice];
880  } # end multi-query cycle
881  } # end else
882 
883  } # end else (coord sytems not matching)
884 
885  #Record the bind params if we have to do multiple queries
886  my $bind_params = $self->bind_param_generic_fetch();
887 
888  foreach my $query (@query_accumulator) {
889  my ($local_constraint,$local_mapper,$local_slice) = @$query;
890  $self->_bind_param_generic_fetch($bind_params);
891  if ($query_type and $query_type eq 'count') {
892  push @pan_coord_features, $self->generic_count($local_constraint);
893  }
894  else {
895  my $features = $self->generic_fetch( $local_constraint, $local_mapper, $local_slice );
896  $features = $self->_remap( $features, $local_mapper, $local_slice );
897  push @pan_coord_features, @$features;
898  }
899  }
900  $mapper = undef;
901  } # End foreach
902  $self->{_bind_param_generic_fetch} = undef;
903  return \@pan_coord_features;
904 }
905 #
906 # helper function used by fetch_all_by_Slice_constraint method
907 #
908 sub _slice_fetch {
909  my ( $self, $slice, $orig_constraint ) = @_;
910 
911  my $features = $self->_get_by_Slice($slice,$orig_constraint);
912 
913  return $features;
914 } ## end sub _slice_fetch
915 
916 
917 #for a given seq_region_id, gets the one used in an external database, if present, otherwise, returns the internal one
918 sub get_seq_region_id_external {
919  my ( $self, $sr_id ) = @_;
920  my $cs_a = $self->db()->get_CoordSystemAdaptor();
921  return ( exists( $cs_a->{'_internal_seq_region_mapping'}->{$sr_id} )
922  ? $cs_a->{'_internal_seq_region_mapping'}->{$sr_id}
923  : $sr_id );
924 }
925 
926 #for a given seq_region_id and coord_system, gets the one used in the internal (core) database
927 sub get_seq_region_id_internal{
928  my ( $self, $sr_id ) = @_;
929  my $cs_a = $self->db()->get_CoordSystemAdaptor();
930  return ( exists $cs_a->{'_external_seq_region_mapping'}->{$sr_id}
931  ? $cs_a->{'_external_seq_region_mapping'}->{$sr_id}
932  : $sr_id);
933 }
934 
935 #
936 # Helper function containing some common feature storing functionality
937 #
938 # Given a Feature this will return a copy (or the same feature if no changes
939 # to the feature are needed) of the feature which is relative to the start
940 # of the seq_region it is on. The seq_region_id of the seq_region it is on
941 # is also returned.
942 #
943 # This method will also ensure that the database knows which coordinate
944 # systems that this feature is stored in.
945 #
946 
947 sub _pre_store {
948  my $self = shift;
949  my $feature = shift;
950 
951  if(!ref($feature) || !$feature->isa('Bio::EnsEMBL::Feature')) {
952  throw('Expected Feature argument.');
953  }
954  my $slice = $feature->slice();
955 
956  $self->_check_start_end_strand($feature->start(),$feature->end(),
957  $feature->strand(), $slice);
958 
959 
960  my $db = $self->db();
961 
962  my $slice_adaptor = $db->get_SliceAdaptor();
963 
964  if(!ref($slice) || !($slice->isa('Bio::EnsEMBL::Slice') or $slice->isa('Bio::EnsEMBL::LRGSlice')) ) {
965  throw('Feature must be attached to Slice to be stored.');
966  }
967 
968  # make sure feature coords are relative to start of entire seq_region
969 
970  if($slice->start != 1 || $slice->strand != 1) {
971  #move feature onto a slice of the entire seq_region
972  $slice = $slice_adaptor->fetch_by_region($slice->coord_system->name(),
973  $slice->seq_region_name(),
974  undef, #start
975  undef, #end
976  undef, #strand
977  $slice->coord_system->version());
978 
979  $feature = $feature->transfer($slice);
980 
981  if(!$feature) {
982  throw('Could not transfer Feature to slice of ' .
983  'entire seq_region prior to storing');
984  }
985  }
986 
987  # Ensure this type of feature is known to be stored in this coord system.
988  my $cs = $slice->coord_system;
989 
990  my ($tab) = $self->_tables();
991  my $tabname = $tab->[0];
992 
993  my $mcc = $db->get_MetaCoordContainer();
994 
995  $mcc->add_feature_type($cs, $tabname, $feature->length);
996 
997  my $seq_region_id = $slice_adaptor->get_seq_region_id($slice);
998 
999  if(!$seq_region_id) {
1000  throw('Feature is associated with seq_region which is not in this DB.');
1001  }
1002 
1003  return ($feature, $seq_region_id);
1004 }
1005 
1006 
1007 # The same function as _pre_store
1008 # This one is used to store user uploaded features in XXX_userdata db
1009 
1010 sub _pre_store_userdata {
1011  my $self = shift;
1012  my $feature = shift;
1013 
1014  if(!ref($feature) || !$feature->isa('Bio::EnsEMBL::Feature')) {
1015  throw('Expected Feature argument.');
1016  }
1017 
1018  my $slice = $feature->slice();
1019  my $slice_adaptor = $slice->adaptor;
1020 
1021  $self->_check_start_end_strand($feature->start(),$feature->end(),
1022  $feature->strand(), $slice);
1023 
1024 
1025  if(!ref($slice) || !($slice->isa('Bio::EnsEMBL::Slice') or $slice->isa('Bio::EnsEMBL::LRGSlice')) ) {
1026  throw('Feature must be attached to Slice to be stored.');
1027  }
1028 
1029  # make sure feature coords are relative to start of entire seq_region
1030 
1031  if($slice->start != 1 || $slice->strand != 1) {
1032  #move feature onto a slice of the entire seq_region
1033  $slice = $slice_adaptor->fetch_by_region($slice->coord_system->name(),
1034  $slice->seq_region_name(),
1035  undef, #start
1036  undef, #end
1037  undef, #strand
1038  $slice->coord_system->version());
1039 
1040  $feature = $feature->transfer($slice);
1041 
1042  if(!$feature) {
1043  throw('Could not transfer Feature to slice of ' .
1044  'entire seq_region prior to storing');
1045  }
1046  }
1047 
1048  # Ensure this type of feature is known to be stored in this coord system.
1049  my $cs = $slice->coord_system;
1050 
1051  my ($tab) = $self->_tables();
1052  my $tabname = $tab->[0];
1053 
1054  my $db = $self->db;
1055  my $mcc = $db->get_MetaCoordContainer();
1056 
1057  $mcc->add_feature_type($cs, $tabname, $feature->length);
1058 
1059  my $seq_region_id = $slice_adaptor->get_seq_region_id($slice);
1060 
1061  if(!$seq_region_id) {
1062  throw('Feature is associated with seq_region which is not in this DB.');
1063  }
1064 
1065  return ($feature, $seq_region_id);
1066 }
1067 
1068 
1069 #
1070 # helper function used to validate start/end/strand and
1071 # hstart/hend/hstrand etc.
1072 #
1073 sub _check_start_end_strand {
1074  my ($self, $start, $end, $strand, $slice) = @_;
1075 
1076  #
1077  # Make sure that the start, end, strand are valid
1078  #
1079  if(int($start) != $start) {
1080  throw("Invalid Feature start [$start]. Must be integer.");
1081  }
1082  if(int($end) != $end) {
1083  throw("Invalid Feature end [$end]. Must be integer.");
1084  }
1085  if(int($strand) != $strand || $strand < -1 || $strand > 1) {
1086  throw("Invalid Feature strand [$strand]. Must be -1, 0 or 1.");
1087  }
1088  if($end < $start && ! (defined $slice && $slice->is_circular())) {
1089  throw("Invalid Feature start/end [$start/$end]. Start must be less " .
1090  "than or equal to end.");
1091  }
1092 
1093  return 1;
1094 }
1095 
1096 
1097 #
1098 # Given a list of features checks if they are in the correct coord system
1099 # by looking at the first features slice. If they are not then they are
1100 # converted and placed on the slice.
1101 #
1102 sub _remap {
1103  my ( $self, $features, $mapper, $slice ) = @_;
1104 
1105  #check if any remapping is actually needed
1106  if(@$features && (!$features->[0]->isa('Bio::EnsEMBL::Feature') ||
1107  $features->[0]->slice == $slice)) {
1108  return $features;
1109  }
1110 
1111  #remapping has not been done, we have to do our own conversion from
1112  #to slice coords
1113 
1114  my @out;
1115 
1116  my $slice_start = $slice->start();
1117  my $slice_end = $slice->end();
1118  my $slice_strand = $slice->strand();
1119  my $slice_cs = $slice->coord_system();
1120 
1121  my ($seq_region_id, $start, $end, $strand);
1122 
1123  my $slice_seq_region_id = $slice->get_seq_region_id();
1124 
1125  foreach my $f (@$features) {
1126  #since feats were obtained in contig coords, attached seq is a contig
1127  my $fslice = $f->slice();
1128  if(!$fslice) {
1129  throw("Feature does not have attached slice.\n");
1130  }
1131  my $fseq_region_id = $fslice->get_seq_region_id();
1132  my $fcs = $fslice->coord_system();
1133 
1134  if(!$slice_cs->equals($fcs)) {
1135  #slice of feature in different coord system, mapping required
1136  # $seq_region comes out as an integer not a string
1137  ($seq_region_id, $start, $end, $strand) =
1138  $mapper->fastmap($fslice->seq_region_name(),$f->start(),$f->end(),$f->strand(),$fcs);
1139 
1140  # undefined start means gap
1141  next if(!defined $start);
1142  } else {
1143  $start = $f->start();
1144  $end = $f->end();
1145  $strand = $f->strand();
1146  $seq_region_id = $fseq_region_id;
1147  }
1148 
1149  # maps to region outside desired area
1150  next if ($start > $slice_end) || ($end < $slice_start) ||
1151  ($slice_seq_region_id != $seq_region_id);
1152 
1153  #shift the feature start, end and strand in one call
1154  if($slice_strand == -1) {
1155  $f->move( $slice_end - $end + 1, $slice_end - $start + 1, $strand * -1 );
1156  } else {
1157  $f->move( $start - $slice_start + 1, $end - $slice_start + 1, $strand );
1158  }
1159 
1160  $f->slice($slice);
1161 
1162  push @out,$f;
1163  }
1164 
1165  return \@out;
1166 }
1167 
1168 #
1169 # get seq region boundary (start|end) for a feature
1170 # the method attempts to retrieve the boundary directly from the db
1171 # return undef if cannot determine the associated feature table
1172 #
1173 sub _seq_region_boundary_from_db {
1174  my ($self, $feature, $boundary) = @_;
1175 
1176  if(!ref($feature) || !$feature->isa('Bio::EnsEMBL::Feature')) {
1177  throw('Expected Feature argument.');
1178  }
1179 
1180  throw "Undefined boundary"
1181  unless defined $boundary;
1182 
1183  $boundary eq 'start' or $boundary eq 'end'
1184  or throw "Wrong boundary: select start|end";
1185 
1186  $boundary = 'seq_region_' . $boundary;
1187 
1188  my $sql_helper = $self->dbc->sql_helper;
1189  throw "Unable to get SqlHelper instance" unless defined $sql_helper;
1190 
1191  my $feature_table = ($self->_tables)[0]->[0];
1192  warning (sprintf "Unable to get %s for %s instance\nCould not find associated feature table, returning undef", $boundary, ref $feature)
1193  and return undef unless defined $feature_table;
1194 
1195  my $db_id = $feature->dbID;
1196  my $attrib_id = $feature_table . '_id';
1197  my $query = "SELECT ${boundary} from ${feature_table} WHERE ${attrib_id} = ${db_id}";
1198 
1199  return $sql_helper->execute_single_result(-SQL => $query);
1200 }
1201 
1202 =head2 store
1203 
1204  Arg [1] : list of Bio::EnsEMBL::SeqFeature
1205  Example : $adaptor->store(@feats);
1206  Description: ABSTRACT Subclasses are responsible for implementing this
1207  method. It should take a list of features and store them in
1208  the database.
1209  Returntype : none
1210  Exceptions : thrown method is not implemented by subclass
1211  Caller : general
1212  Status : At Risk
1213  : throws if called.
1214 
1215 =cut
1216 
1217 sub store{
1218  my $self = @_;
1219 
1220  throw("Abstract method store not defined by implementing subclass\n");
1221 }
1222 
1223 
1224 =head2 remove
1225 
1226  Arg [1] : A feature $feature
1227  Example : $feature_adaptor->remove($feature);
1228  Description: This removes a feature from the database. The table the
1229  feature is removed from is defined by the abstract method
1230  _tablename, and the primary key of the table is assumed
1231  to be _tablename() . '_id'. The feature argument must
1232  be an object implementing the dbID method, and for the
1233  feature to be removed from the database a dbID value must
1234  be returned.
1235  Returntype : none
1236  Exceptions : thrown if $feature arg does not implement dbID(), or if
1237  $feature->dbID is not a true value
1238  Caller : general
1239  Status : Stable
1240 
1241 =cut
1242 
1243 
1244 sub remove {
1245  my ($self, $feature) = @_;
1246 
1247  if(!$feature || !ref($feature) || !$feature->isa('Bio::EnsEMBL::Feature')) {
1248  throw('Feature argument is required');
1249  }
1250 
1251  if(!$feature->is_stored($self->db)) {
1252  throw("This feature is not stored in this database");
1253  }
1254 
1255  my @tabs = $self->_tables;
1256  my ($table) = @{$tabs[0]};
1257 
1258  my $sth = $self->prepare("DELETE FROM $table WHERE ${table}_id = ?");
1259  $sth->bind_param(1,$feature->dbID,SQL_INTEGER);
1260  $sth->execute();
1261 
1262  #unset the feature dbID ad adaptor
1263  $feature->dbID(undef);
1264  $feature->adaptor(undef);
1265 
1266  return;
1267 }
1268 
1269 
1270 =head2 remove_by_Slice
1271 
1272  Arg [1] : Bio::Ensembl::Slice $slice
1273  Example : $feature_adaptor->remove_by_Slice($slice);
1274  Description: This removes features from the database which lie on a region
1275  represented by the passed in slice. Only features which are
1276  fully contained by the slice are deleted; features which overlap
1277  the edge of the slice are not removed.
1278  The table the features are removed from is defined by
1279  the abstract method_tablename.
1280  Returntype : none
1281  Exceptions : thrown if no slice is supplied
1282  Caller : general
1283  Status : Stable
1284 
1285 =cut
1286 
1287 sub remove_by_Slice {
1288  my ($self, $slice) = @_;
1289 
1290  if(!$slice || !ref($slice) || !($slice->isa('Bio::EnsEMBL::Slice') or $slice->isa('Bio::EnsEMBL::LRGSlice')) ) {
1291  throw("Slice argument is required");
1292  }
1293 
1294  my @tabs = $self->_tables;
1295  my ($table_name) = @{$tabs[0]};
1296 
1297  my $seq_region_id = $self->db->get_SliceAdaptor->get_seq_region_id($slice);
1298  my $start = $slice->start();
1299  my $end = $slice->end();
1300 
1301  #
1302  # Delete only features fully on the slice, not overlapping ones
1303  #
1304  my $sth = $self->prepare("DELETE FROM $table_name " .
1305  "WHERE seq_region_id = ? " .
1306  "AND seq_region_start >= ? " .
1307  "AND seq_region_end <= ?");
1308 
1309  $sth->bind_param(1,$seq_region_id,SQL_INTEGER);
1310  $sth->bind_param(2,$start,SQL_INTEGER);
1311  $sth->bind_param(3,$end,SQL_INTEGER);
1312  $sth->execute();
1313  $sth->finish();
1314 }
1315 
1316 
1317 #
1318 # Internal function. Allows the max feature length which is normally
1319 # retrieved from the meta_coord table to be overridden. This allows
1320 # for some significant optimizations to be put in when it is known
1321 # that requested features will not be over a certain size.
1322 #
1323 sub _max_feature_length {
1324  my $self = shift;
1325  return $self->{'_max_feature_length'} = shift if(@_);
1326  return $self->{'_max_feature_length'};
1327 }
1328 
1329 
1330 #
1331 # Lists all seq_region_ids that a particular feature type is found on.
1332 # Useful e.g. for finding out which seq_regions have genes.
1333 # Returns a listref of seq_region_ids.
1334 #
1335 sub _list_seq_region_ids {
1336  my ($self, $table) = @_;
1337 
1338  my @out;
1339 
1340  my $sql = qq(
1341  SELECT DISTINCT
1342  sr.seq_region_id
1343  FROM seq_region sr,
1344  $table a,
1345  coord_system cs
1346  WHERE sr.seq_region_id = a.seq_region_id
1347  AND sr.coord_system_id = cs.coord_system_id
1348  AND cs.species_id = ?);
1349 
1350  my $sth = $self->prepare($sql);
1351 
1352  $sth->bind_param( 1, $self->species_id(), SQL_INTEGER );
1353 
1354  $sth->execute();
1355 
1356  while (my ($id) = $sth->fetchrow) {
1357  push(@out, $id);
1358  }
1359 
1360  $sth->finish;
1361 
1362  return \@out;
1363 }
1364 
1365 
1366 sub remove_by_analysis_id {
1367  my ($self, $analysis_id) = @_;
1368 
1369  $analysis_id or throw("Must call with analysis id");
1370 
1371  my @tabs = $self->_tables;
1372  my ($tablename) = @{$tabs[0]};
1373 
1374  my $sql = "DELETE FROM $tablename WHERE analysis_id = $analysis_id";
1375 # warn "SQL : $sql";
1376 
1377  my $sth = $self->prepare($sql);
1378  $sth->execute();
1379  $sth->finish();
1380 }
1381 
1382 sub remove_by_feature_id {
1383  my ($self, $features_list) = @_;
1384 
1385  my @feats = @$features_list or throw("Must call store with features");
1386 
1387  my @tabs = $self->_tables;
1388  my ($tablename) = @{$tabs[0]};
1389 
1390  my $sql = sprintf "DELETE FROM $tablename WHERE ${tablename}_id IN (%s)", join ', ', @feats;
1391 # warn "SQL : $sql";
1392 
1393  my $sth = $self->prepare($sql);
1394  $sth->execute();
1395  $sth->finish();
1396 }
1397 
1398 =head2 fetch_nearest_by_Feature
1399 
1400  Arg [1] : Reference Feature to start the search from
1401  Description: Searches iteratively outward from the starting feature until a nearby Feature is found
1402  If you require more than one result or more control of which features are returned, see
1403  fetch_all_nearest_by_Feature and fetch_all_by_outward_search. fetch_nearest_by_Feature
1404  is a convenience method.
1405  ReturnType : Bio::EnsEMBL::Feature
1406 =cut
1407 
1408 sub fetch_nearest_by_Feature {
1409  my $self = shift;
1410  my $feature = shift;
1411  my @hits = @{ $self->fetch_all_by_outward_search(-FEATURE => $feature, -MAX_RANGE => 1000000, -RANGE => 1000, -LIMIT => 1) };
1412  return $hits[0] if (scalar @hits > 0);
1413  return;
1414 }
1415 
1416 
1417 =head2 fetch_all_by_outward_search
1418 
1419  Arguments the same as fetch_all_nearest_by_Feature
1420  Arg [0] : -MAX_RANGE : Set an upper limit on the search range, defaults to 10000 bp
1421  Arg [1] : -FEATURE ,Bio::EnsEMBL::Feature : 'Source' Feature to anchor the search for nearest Features
1422  Arg [2] : -SAME_STRAND, Boolean (optional) : Respect the strand of the source Feature with ref, only
1423  returning Features on the same strand.
1424  Arg [3] : -OPPOSITE_STRAND, Boolean (optional) : Find features on the opposite strand of the same
1425  Arg [4] : -DOWNSTREAM/-UPSTREAM, (optional) : Search ONLY downstream or upstream from the source Feature.
1426  Can be omitted for searches in both directions.
1427  Arg [5] : -RANGE, Int : The size of the space to search for Features. Defaults to 1000 as a sensible starting point
1428  Arg [6] : -NOT_OVERLAPPING, Boolean (optional) : Do not return Features that overlap the source Feature
1429  Arg [7] : -FIVE_PRIME, Boolean (optional) : Determine range to a Feature by the 5' end, respecting strand
1430  Arg [8] : -THREE_PRIME, Boolean (optional): Determine range to a Feature by the 3' end, respecting strand
1431  Arg [9] : -LIMIT, Int : The maximum number of Features to return, defaulting to one. Equally near features are all returned
1432 
1433  Description: Searches for features within the suggested -RANGE, and if it finds none, expands the search area
1434  until it satisfies -LIMIT or hits -MAX_RANGE. Useful if you don't know how far away the features
1435  might be, or if dealing with areas of high feature density. In the case of Variation Features, it is
1436  conceivable that a 2000 bp window might contain very many features, resulting in a slow and bloated
1437  response, thus the ability to explore outward in smaller sections can be useful.
1438  Returntype : Listref of [$feature,$distance]
1439 =cut
1440 
1441 sub fetch_all_by_outward_search {
1442  my $self = shift;
1443  my ($ref_feature, $respect_strand, $opposite_strand, $downstream, $upstream, $start_search_range,
1444  $limit,$not_overlapping,$five_prime,$three_prime, $max_range) =
1445  rearrange([qw(FEATURE SAME_STRAND OPPOSITE_STRAND DOWNSTREAM UPSTREAM RANGE LIMIT NOT_OVERLAPPING FIVE_PRIME THREE_PRIME MAX_RANGE)], @_);
1446  my $factor = 1;
1447  $limit ||= 1;
1448  $start_search_range ||= 1000;
1449  my $current_search_range = $start_search_range;
1450  $max_range ||= 10000;
1451  my @results;
1452  while (scalar @results < $limit && $current_search_range < $max_range) {
1453  $current_search_range = $start_search_range * $factor;
1454  @results = @{
1455  $self->fetch_all_nearest_by_Feature(-RANGE => $current_search_range,
1456  -FEATURE => $ref_feature,
1457  -SAME_STRAND => $respect_strand,
1458  -OPPOSITE_STRAND => $opposite_strand,
1459  -DOWNSTREAM => $downstream,
1460  -UPSTREAM => $upstream,
1461  -LIMIT => $limit,
1462  -NOT_OVERLAPPING => $not_overlapping,
1463  -FIVE_PRIME => $five_prime,
1464  -THREE_PRIME => $three_prime,
1465  )
1466  };
1467  $factor++;
1468  }
1469  return \@results;
1470 }
1471 
1472 =head2 fetch_all_nearest_by_Feature
1473 
1474  Arg [1] : -FEATURE ,Bio::EnsEMBL::Feature : 'Source' Feature to anchor the search for nearest Features
1475  Arg [2] : -SAME_STRAND, Boolean (optional): Respect the strand of the source Feature with ref, only
1476  returning Features on the same strand
1477  Arg [3] : -OPPOSITE_STRAND, Boolean (optional) : Find features on the opposite strand of the same
1478  Arg [4] : -DOWNSTREAM/-UPSTREAM, (optional) : Search ONLY downstream or upstream from the source Feature.
1479  Can be omitted for searches in both directions.
1480  Arg [5] : -RANGE, Int : The size of the space to search for Features. Defaults to 1000 as a sensible starting point
1481  Arg [6] : -NOT_OVERLAPPING, Boolean (optional) : Do not return Features that overlap the source Feature
1482  Arg [7] : -FIVE_PRIME, Boolean (optional) : Determine range to a Feature by its 5' end, respecting strand
1483  Arg [8] : -THREE_PRIME, Boolean (optional): Determine range to a Feature by its 3' end, respecting strand
1484  Arg [9] : -LIMIT, Int : The maximum number of Features to return, defaulting to one. Equally near features are all returned
1485  Example : #To fetch the gene(s) with the nearest 5' end:
1486  $genes = $gene_adaptor->fetch_all_nearest_by_Feature(-FEATURE => $feat, -FIVE_PRIME => 1);
1487 
1488  Description: Gets the nearest Features to a given 'source' Feature. The Feature returned and the format of the result
1489  are non-obvious, please read on.
1490 
1491  When looking beyond the boundaries of the source Feature, the distance is measured to the nearest end
1492  of that Feature to the nearby Feature's nearest end.
1493  If Features overlap the source Feature, then they are given a distance of zero but ordered by
1494  their proximity to the centre of the Feature.
1495 
1496  Features are found and prioritised within 1000 base pairs unless a -RANGE is given to the method. Any overlap with
1497  the search region is included, and the results can be restricted to upstream, downstream, forward strand or reverse
1498 
1499  The -FIVE_PRIME and -THREE_PRIME options allow searching for specific ends of nearby features, but still needs
1500  a -DOWN/UPSTREAM value and/or -NOT_OVERLAPPING to fulfil its most common application.
1501 
1502 
1503  Returntype : Listref containing an Arrayref of Bio::EnsEMBL::Feature objects and the distance
1504  [ [$feature, $distance] ... ]
1505  Caller : general
1506 
1507 =cut
1508 sub fetch_all_nearest_by_Feature{
1509  my $self = shift;
1510  my ($ref_feature, $respect_strand, $opposite_strand, $downstream, $upstream, $search_range,$limit,$not_overlapping,$five_prime,$three_prime) =
1511  rearrange([qw(FEATURE SAME_STRAND OPPOSITE_STRAND DOWNSTREAM UPSTREAM RANGE LIMIT NOT_OVERLAPPING FIVE_PRIME THREE_PRIME)], @_);
1512  if ( !defined($search_range)) {
1513  $search_range ||= 1000;
1514  }
1515  $limit ||= 1;
1516 
1517  unless (defined($ref_feature) && $ref_feature->isa('Bio::EnsEMBL::Feature')) {
1518  throw ('fetch_all_nearest_by_Feature method requires a valid Ensembl Feature object to operate');
1519  }
1520  throw ('Do not specify both -upstream and -downstream. This is the same as no arguments') if ($upstream && $downstream);
1521  throw ('Do not use both -SAME_STRAND and -OPPOSITE_STRAND in one query') if ($opposite_strand && $respect_strand);
1522  my ($region_start,$region_end);
1523 
1524  # Define search box. Features overlapping the boundaries are found and considered
1525  my $slice = $ref_feature->feature_Slice;
1526 
1527  my $five_prime_expansion = 0;
1528  my $three_prime_expansion = 0;
1529  if ($upstream) {
1530  $five_prime_expansion = $search_range;
1531  $three_prime_expansion = $slice->length * -1;
1532  # shrinks the slice to exclude the features that do not exist beyond the end of the
1533  # considered feature
1534  }
1535  elsif ($downstream) {
1536  $three_prime_expansion = $search_range;
1537  $five_prime_expansion = $slice->length * -1;
1538  }
1539  else {
1540  $five_prime_expansion = $search_range;
1541  $three_prime_expansion = $search_range;
1542  }
1543 
1544  $slice = $slice->expand( $five_prime_expansion, $three_prime_expansion);
1545  my $sa = $self->db->get_SliceAdaptor;
1546  my @candidates; # Features in the search region
1547  @candidates = @{$self->fetch_all_by_Slice($slice)};
1548  if ($respect_strand) { @candidates = grep {$_->strand == $ref_feature->strand} @candidates }
1549  if ($opposite_strand) { @candidates = grep {$_->strand != $ref_feature->strand} @candidates }
1550  # When user chooses UPSTREAM/DOWNSTREAM && FIVE_PRIME/THREE_PRIME, discount any features with
1551  # their other end in the overlap region. fetch_all_by_Slice cannot filter these out.
1552  # fetch_all_by_Slice_constraint could in principle with the correct constraint.
1553  if (($upstream && $ref_feature->strand == 1) || ($downstream && $ref_feature->strand == -1)) {
1554  if ($five_prime) {
1555  @candidates = grep { ($_->strand == 1) ? $_->seq_region_start < $ref_feature->start : $_->seq_region_end < $ref_feature->start} @candidates;
1556  } elsif ($three_prime) {
1557  @candidates = grep { ($_->strand == 1) ? $_->seq_region_end < $ref_feature->start : $_->seq_region_start < $ref_feature->start} @candidates;
1558  }
1559  }
1560  if (($downstream && $ref_feature->strand == 1)|| ($upstream && $ref_feature->strand == -1)) {
1561  if ($five_prime) {
1562  @candidates = grep { ($_->strand == 1) ? $_->seq_region_start > $ref_feature->end : $_->seq_region_end > $ref_feature->end } @candidates;
1563  } elsif ($three_prime) {
1564  @candidates = grep { ($_->strand == 1) ? $_->seq_region_end > $ref_feature->end : $_->seq_region_start > $ref_feature->end } @candidates;
1565  }
1566  }
1567  # Then sort and prioritise the candidates
1568  my $finalists; # = [[feature, distance, centre-weighted distance, length, dbID],..]
1569  $finalists = $self->select_nearest($ref_feature,\@candidates,$limit,$not_overlapping,$five_prime,$three_prime);
1570  $finalists = [ map { [ splice @$_,0,2 ]} @$finalists ]; # Remove the ugly bits from the sight of users.
1571  return $finalists;
1572 }
1573 
1574 
1575 =head2 select_nearest
1576 
1577  Arg [1] : Bio::Ensembl::Feature, a Feature to find the nearest neighbouring feature to.
1578  Arg [2] : Listref of Features to be considered for nearness.
1579  Arg [3] : Integer, limited number of Features to return. Equally near features are all returned in spite of this limit
1580  Arg [4] : Boolean, Overlapping prohibition. Overlapped Features are forgotten
1581  Arg [5] : Boolean, use the 5' ends of the nearby features for distance calculation
1582  Arg [6] : Boolean, use the 3' ends of the nearby features for distance calculation
1583  Example : $feature_list = $feature_adaptor->select_nearest($ref_feature,\@candidates,$limit,$not_overlapping)
1584  Description: Take a list of possible features, and determine which is nearest. Nearness is a
1585  tricky concept. Beware of using the distance between Features, as it may not be the number you think
1586  it should be.
1587  Returntype : listref of Features ordered by proximity
1588  Caller : BaseFeatureAdaptor->fetch_all_nearest_by_Feature
1589 
1590 =cut
1591 
1592 sub select_nearest {
1593  my $self = shift;
1594  my $ref_feature = shift;
1595  my $candidates = shift;
1596  my $limit = shift;
1597  my $not_overlapping = shift;
1598  my $five_prime = shift;
1599  my $three_prime = shift;
1600 
1601  # Convert circular coordinates to linear ones for distance calculation
1602  my $ref_start = ($ref_feature->start < $ref_feature->end) ? $ref_feature->start : $ref_feature->start - $ref_feature->length; # Not ->end, in case circular
1603  my $ref_end = $ref_feature->end;
1604  my $ref_midpoint = $self->_compute_midpoint($ref_feature);
1605 
1606  my $position_matrix = [];
1607  my $shortest_distance;
1608  my $adjusted_distance; # for when features overlap and we need another metric to order by
1609 
1610  foreach my $neighbour (@$candidates) {
1611  # rearrange coordinates into forward-stranded orientation, plus flatten circular features.
1612  my $neigh_start = ($neighbour->seq_region_start < $neighbour->seq_region_end)
1613  ? $neighbour->seq_region_start : $neighbour->seq_region_start - $neighbour->length; # Not ->end, in case it is circular
1614  my $neigh_midpoint = $self->_compute_midpoint($neighbour);
1615  my $neigh_end = $neighbour->seq_region_end;
1616 
1617  # discard overlaps early if not required.
1618  next if ( $not_overlapping
1619  && (
1620  ( $neigh_start >= $ref_start && $neigh_end <= $ref_end )
1621  || ( $neigh_end <= $ref_end && $neigh_end >= $ref_start )
1622  )
1623  );
1624  my @args = ($ref_start,$ref_midpoint,$ref_end,$neigh_start,$neigh_midpoint,$neigh_end,$neighbour->strand);
1625  if ($five_prime || $three_prime) {
1626  my $five = $five_prime ? 1 : 0; # Choose 5' or 3' end
1627  ($shortest_distance,$adjusted_distance) = $self->_compute_prime_distance($five,@args);
1628  next unless ($shortest_distance || $adjusted_distance);
1629  }
1630  else {
1631  ($shortest_distance,$adjusted_distance) = $self->_compute_nearest_end(@args);
1632  }
1633  push @$position_matrix,[ $neighbour, $shortest_distance, $adjusted_distance, $neighbour->length, $neighbour->display_id] unless ($not_overlapping && $shortest_distance == 0);
1634  }
1635 
1636  # Order by distance, then centre-to-centre distance, then smallest feature first, then an arbitrary ID.
1637  # $position_matrix looks like this:
1638  # [ [ $feature, closest measure of distance, size, dbID ] ]
1639  # Not proud of this line. It re-emits the original array in sorted order to favour better measures of order.
1640  my @ordered_matrix = map { [$_->[0],$_->[1],$_->[2],$_->[3],$_->[4]] }
1641  sort { abs($a->[1]) <=> abs($b->[1]) || abs($a->[2]) <=> abs($b->[2]) || $a->[3] <=> $b->[3] || $a->[4] <=> $b->[4]} @$position_matrix;
1642  # Knock off unwanted hits. Equal distances are included, so more can be returned than asked for.
1643  @ordered_matrix = $self->_discard_excess_features_from_matrix(\@ordered_matrix,$limit);
1644  return \@ordered_matrix;
1645 }
1646 
1647 =head2 _compute_nearest_end
1648 
1649  Arg [1] : Reference feature start
1650  Arg [2] : Reference feature mid-point
1651  Arg [3] : Reference feature end
1652  Arg [4] : Considered feature start
1653  Arg [5] : Considered feature mid-point
1654  Arg [6] : Considered feature end
1655  Example : $distance = $feature_adaptor->_compute_nearest_end($ref_start,$ref_midpoint,$ref_end,$f_start,$f_midpoint,$f_end)
1656  Description: For a given feature, calculate the smallest legitimate distance to a reference feature
1657  Calculate by mid-points to accommodate overlaps
1658  Returntype : Integer distance in base pairs
1659  Caller : BaseFeatureAdaptor->select_nearest()
1660 
1661 =cut
1662 sub _compute_nearest_end {
1663  my ($self,$ref_start,$ref_midpoint,$ref_end,$f_start,$f_midpoint,$f_end) = @_;
1664  my ($effective_distance,$weighted_distance);
1665  if ($ref_start > $f_end ) {
1666  # upstream, no overlap
1667  $effective_distance = $f_end - $ref_start; # negative result for upstream
1668  } elsif ( $ref_end < $f_start ) {
1669  # downstream
1670  $effective_distance = $f_start - $ref_end; # positive result for downstream
1671  } else {
1672  # overlap,
1673  $effective_distance = 0;
1674  $weighted_distance = $f_midpoint - $ref_midpoint;
1675  }
1676  return $effective_distance, $weighted_distance;
1677 }
1678 
1679 =head2 _compute_prime_distance
1680 
1681  Arg [1] : Reference feature start
1682  Arg [2] : Reference feature mid-point
1683  Arg [3] : Reference feature end
1684  Arg [4] : Considered feature start
1685  Arg [5] : Considered feature mid-point
1686  Arg [6] : Considered feature end
1687  Arg [7] : Considered feature strand
1688  Example : $distance,$weighted_centre_distance = $feature_adaptor->_compute_prime_distance($ref_start,$ref_midpoint,$ref_end,$f_start,$f_midpoint,$f_end,$f_strand)
1689  Description: Calculate the smallest distance to the 5' end of the considered feature
1690  Returntype : Integer distance in base pairs or a string warning that the result doesn't mean anything.
1691  Nearest 5' and 3' features shouldn't reside inside the reference Feature
1692  Caller : BaseFeatureAdaptor->select_nearest()
1693 
1694 =cut
1695 sub _compute_prime_distance {
1696  my ($self,$five_prime,$ref_start,$ref_midpoint,$ref_end,$f_start,$f_midpoint,$f_end,$f_strand) = @_;
1697 
1698  my $f_coord;
1699  my $effective_distance;
1700  my $weighted_distance;
1701 
1702  if ($five_prime) {
1703  $f_coord = ($f_strand == 1) ? $f_start : $f_end;
1704  } else { # three prime end required
1705  $f_coord = ($f_strand ==1 ) ? $f_end : $f_start;
1706  }
1707 
1708  if ($ref_start > $f_coord) {
1709  # n' end of feature upstream of reference
1710  $effective_distance = $f_coord - $ref_start;
1711  if ($f_end < $ref_start) {
1712  $weighted_distance = $f_coord - $ref_midpoint;
1713  }
1714  } elsif ($ref_end < $f_coord && $f_start > $ref_end) {
1715  # n' end of feature downstream of reference
1716  $effective_distance = $ref_end - $f_coord;
1717  if ($f_start > $ref_end) {
1718  $weighted_distance = $f_coord - $ref_midpoint;
1719  }
1720  } elsif ($ref_start < $f_coord && $ref_end > $f_coord) {
1721  # total overlap condition
1722  $effective_distance = 0;
1723  $weighted_distance = $f_coord - $ref_midpoint;
1724  }
1725  # If feature's end falls outside of the reference feature, compute distance to reference midpoint.
1726  return $effective_distance, $weighted_distance;
1727 }
1728 
1729 =head2 _compute_midpoint
1730 
1731  Arg [1] : Bio::EnsEMBL::Feature
1732  Example : $middle = $feature_adaptor->_compute_midpoint($feature);
1733  Description: Calculate the mid-point of a Feature. Used for comparing Features that overlap each other
1734  and determining a canonical distance between two Features for the majority of use cases.
1735  Returntype : Integer coordinate rounded down.
1736  Caller : BaseFeatureAdaptor->select_nearest()
1737 
1738 =cut
1739 sub _compute_midpoint {
1740  my $self = shift;
1741  my $feature = shift;
1742  my $start = $feature->seq_region_start;
1743  my $end = $feature->seq_region_end;
1744  my $midpoint;
1745  # consider circular slice
1746  if ($start > $end) {
1747  $midpoint = int($end + ($start - $end) / 2);
1748  } else {
1749  $midpoint = int($start + ($end - $start) / 2);
1750  }
1751  return $midpoint;
1752 }
1753 
1754 sub _discard_excess_features_from_matrix {
1755  my $self = shift;
1756  my $list = shift;
1757  my @ordered_matrix = @$list;
1758  my $limit = shift;
1759  return @ordered_matrix if $#ordered_matrix == 0 || !defined($limit) || $limit > scalar @ordered_matrix;
1760  # cut off excess elements
1761  my @spares = splice @ordered_matrix, $limit, scalar @ordered_matrix - $limit;
1762  # Check nearest distance of the last member of ordered_matrix against spares and include those that match
1763  # This prevents first past the post.
1764  if (scalar @ordered_matrix > 0) {
1765  my $threshold_distance = $ordered_matrix[-1]->[1];
1766  my $i = 0;
1767  while ($i <= $#spares && $spares[$i]->[1] == $threshold_distance) {
1768  # printf "Considering option %s, %s\n",$spares[$i]->[0]->stable_id,$spares[$i]->[1];
1769  push @ordered_matrix, $spares[$i];
1770  $i++;
1771  }
1772 
1773  }
1774  return @ordered_matrix;
1775 }
1776 
1777 1;
EnsEMBL
Definition: Filter.pm:1
map
public map()
Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor
Definition: BaseFeatureAdaptor.pm:24
Bio::EnsEMBL::Feature
Definition: Feature.pm:47
Bio::EnsEMBL::Feature::seq_region_start
public Int seq_region_start()
Bio::EnsEMBL::Slice
Definition: Slice.pm:50
Bio::EnsEMBL::Feature::start
public Int start()
Bio::EnsEMBL::LRGSlice
Definition: LRGSlice.pm:37
Bio::EnsEMBL::Utils::Cache
Definition: Cache.pm:71
Bio::EnsEMBL::Utils::Iterator
Definition: Iterator.pm:44
Bio::EnsEMBL::DBSQL::BaseAdaptor
Definition: BaseAdaptor.pm:71
Bio::EnsEMBL::Utils::Scalar
Definition: Scalar.pm:66
run
public run()
Bio::EnsEMBL::Feature::new
public Bio::EnsEMBL::Feature new()
Bio::EnsEMBL::Mapper::Gap
Definition: Gap.pm:14
Bio::EnsEMBL::Utils::Iterator::new
public Bio::EnsEMBL::Utils::Iterator new()
Bio::EnsEMBL::Storable::new_fast
public Instance new_fast()
Bio
Definition: AltAlleleGroup.pm:4
Bio::EnsEMBL::Utils::Argument
Definition: Argument.pm:34
Bio::EnsEMBL::Utils::Exception
Definition: Exception.pm:68