ensembl-hive  2.7.0
SliceAdaptor.pm
Go to the documentation of this file.
1 =head1 LICENSE
2 
3 See the NOTICE file distributed with this work for additional information
4 regarding copyright ownership.
5 
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9 
10  http://www.apache.org/licenses/LICENSE-2.0
11 
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 
18 =cut
19 
20 
21 =head1 CONTACT
22 
23  Please email comments or questions to the public Ensembl
24  developers list at <http://lists.ensembl.org/mailman/listinfo/dev>.
25 
26  Questions may also be sent to the Ensembl help desk at
27  <http://www.ensembl.org/Help/Contact>.
28 
29 =cut
30 
31 =head1 NAME
32 
33 Bio::EnsEMBL::DBSQL::SliceAdaptor - A database aware adaptor responsible for
34 the creation of Slice objects.
35 
36 =head1 SYNOPSIS
37 
38  use Bio::EnsEMBL::Utils::Slice qw(split_Slices);
40 
42  -host => 'ensembldb.ensembl.org',
43  -user => 'anonymous'
44  );
45 
46  $slice_adaptor =
47  Bio::EnsEMBL::Registry->get_adaptor( "human", "core", "slice" );
48 
49  # get a slice on the entire chromosome X
50  $chr_slice = $slice_adaptor->fetch_by_region( 'chromosome', 'X' );
51 
52  # get a slice for each clone in the database
53  foreach $cln_slice ( @{ $slice_adaptor->fetch_all('clone') } ) {
54  # do something with clone
55  }
56 
57  # get a slice which is part of NT_004321
58  $spctg_slice =
59  $slice_adaptor->fetch_by_region( 'supercontig', 'NT_004321',
60  200_000, 600_000 );
61 
62  # get all non-redundant slices from the highest possible coordinate
63  # systems
64  $slices = $slice_adaptor->fetch_all('toplevel');
65 
66  # include non-reference regions
67  $slices = $slice_adaptor->fetch_all( 'toplevel', undef, 1 );
68 
69  # include duplicate regions
70  $slices = $slice_adaptor->fetch_all( 'toplevel', undef, 0, 1 );
71 
72  # split up a list of slices into smaller slices
73  $overlap = 1000;
74  $max_length = 1e6;
75  $slices = split_Slices( $slices, $max_length, $overlap );
76 
77  # store a list of slice names in a file
78  open( FILE, ">$filename" ) or die("Could not open file $filename");
79  foreach my $slice (@$slices) {
80  print FILE $slice->name(), "\n";
81  }
82  close FILE;
83 
84  # retrieve a list of slices from a file
85  open( FILE, $filename ) or die("Could not open file $filename");
86  while ( $name = <FILE> ) {
87  chomp($name);
88  $slice = $slice_adaptor->fetch_by_name($name);
89  # do something with slice
90  }
91 
92 =head1 DESCRIPTION
93 
94 This module is responsible for fetching Slices representing genomic
95 regions from a database. A Details on how slices can be used are in the
96 Bio::EnsEMBL::Slice module.
97 
98 =head1 METHODS
99 
100 =cut
101 
102 
103 package Bio::EnsEMBL::DBSQL::SliceAdaptor;
104 use vars qw(@ISA);
105 use strict;
106 
107 
113 use Bio::EnsEMBL::Utils::Exception qw(throw warning);
115 use Scalar::Util qw/looks_like_number/;
116 use Bio::EnsEMBL::Utils::Scalar qw/assert_integer/;
117 
118 @ISA = ('Bio::EnsEMBL::DBSQL::BaseAdaptor');
119 
120 sub new {
121  my $caller = shift;
122 
123  my $class = ref($caller) || $caller;
124 
125  my $self = $class->SUPER::new(@_);
126 
127  # use a cache which is shared and also used by the assembly
128  # mapper adaptor
129 
130  my $seq_region_cache = $self->db->get_SeqRegionCache();
131 
132  $self->{'sr_name_cache'} = $seq_region_cache->{'name_cache'};
133  $self->{'sr_id_cache'} = $seq_region_cache->{'id_cache'};
134 
135  $self->{'lrg_region_test'} = undef;
136  my $meta_container = $self->db->get_MetaContainer();
137  my @values = $meta_container->list_value_by_key("LRG");
138  if(scalar(@values) and $values[0]->[0]){
139  $self->{'lrg_region_test'} = $values[0]->[0];
140  }
141  return $self;
142 }
143 
144 
145 =head2 fetch_by_region
146 
147  Arg [1] : string $coord_system_name (optional)
148  The name of the coordinate system of the slice to be created
149  This may be a name of an actual coordinate system or an alias
150  to a coordinate system. Valid aliases are 'seqlevel' or
151  'toplevel'.
152  Arg [2] : string $seq_region_name
153  The name of the sequence region that the slice will be
154  created on.
155  Arg [3] : int $start (optional, default = 1)
156  The start of the slice on the sequence region
157  Arg [4] : int $end (optional, default = seq_region length)
158  The end of the slice on the sequence region
159  Arg [5] : int $strand (optional, default = 1)
160  The orientation of the slice on the sequence region
161  Arg [6] : string $version (optional, default = default version)
162  The version of the coordinate system to use (e.g. NCBI33)
163  Arg [7] : boolean $no_fuzz (optional, default = undef (false))
164  If true (non-zero), do not use "fuzzy matching" (see below).
165  Example : $slice = $slice_adaptor->fetch_by_region('chromosome', 'X');
166  $slice = $slice_adaptor->fetch_by_region('clone', 'AC008066.4');
167  Description: Retrieves a slice on the requested region. At a minimum the
168  name the name of the seq_region to fetch must be provided.
169 
170  If no coordinate system name is provided than a slice on the
171  highest ranked coordinate system with a matching
172  seq_region_name will be returned. If a version but no
173  coordinate system name is provided, the same behaviour will
174  apply, but only coordinate systems of the appropriate version
175  are considered. The same applies if the 'toplevel' coordinate
176  system is specified, however in this case the version is
177  ignored. The coordinate system should always be specified if
178  it is known, since this is unambiguous and faster.
179 
180  Some fuzzy matching is performed if no exact match for
181  the provided name is found. This allows clones to be
182  fetched even when their version is not known. For
183  example fetch_by_region('clone', 'AC008066') will
184  retrieve the sequence_region with name 'AC008066.4'.
185 
186  The fuzzy matching can be turned off by setting the
187  $no_fuzz argument to a true value.
188 
189  If the requested seq_region is not found in the database undef
190  is returned.
191 
192  Returntype : Bio::EnsEMBL::Slice or undef
193  Exceptions : throw if no seq_region_name is provided
194  throw if invalid coord_system_name is provided
195  throw if start > end is provided
196  Caller : general
197  Status : Stable
198 
199 =cut
200 
201 
202 #
203 # ARNE: This subroutine needs simplification!!
204 #
205 sub fetch_by_region {
206  my ( $self, $coord_system_name, $seq_region_name, $start, $end,
207  $strand, $version, $no_fuzz )
208  = @_;
209 
210  assert_integer($start, 'start') if defined $start;
211  assert_integer($end, 'end') if defined $end;
212 
213  if ( !defined($start) ) { $start = 1 }
214  if ( !defined($strand) ) { $strand = 1 }
215 
216  if ( !defined($seq_region_name) ) {
217  throw('seq_region_name argument is required');
218  }
219 
220  my $cs;
221  my $csa = $self->db->get_CoordSystemAdaptor();
222 
223  if ( defined($coord_system_name) ) {
224  $cs = $csa->fetch_by_name( $coord_system_name, $version );
225 
226  if ( !defined($cs) ) {
227  # deal with cases where undefined coordsystem name requested is 'chromosome'
228  if ( $coord_system_name eq "chromosome" ) {
229 
230  # set the 'chromosome' alias, if not yet set
231  $cs = $self->_create_chromosome_alias();
232 
233  } else {
234  throw( sprintf( "Unknown coordinate system:\n"
235  . "name='%s' version='%s' seq region name='%s'\n",
236  $coord_system_name, $version, $seq_region_name ) );
237  }
238  }
239 
240  # fetching by toplevel is same as fetching w/o name or version
241  if ( $cs->is_top_level() ) {
242  $cs = undef;
243  $version = undef;
244  }
245 
246  } ## end if ( defined($coord_system_name...))
247 
248  my $constraint;
249  my $sql;
250  my @bind_params;
251  my $key;
252 
253  if ( defined($cs) ) {
254 
255  # if chromosome alias is defined, use karyotype_cache to access seq region data
256  # rather than a database query
257  if ( defined($cs->alias_to()) && $cs->alias_to() eq "chromosome" ) {
258 
259  $key = "karyotype_cache";
260 
261  } else { # otherwise, search database to access seq region data, etc.
262 
263  $sql = sprintf( "SELECT sr.name, sr.seq_region_id, sr.length, %d "
264  . "FROM seq_region sr ",
265  $cs->dbID() );
266 
267  $constraint = "AND sr.coord_system_id = ?";
268  push( @bind_params, [ $cs->dbID(), SQL_INTEGER ] );
269 
270  $key = "$seq_region_name:" . $cs->dbID();
271 
272  }
273  } else {
274  $sql =
275  "SELECT sr.name, sr.seq_region_id, sr.length, cs.coord_system_id "
276  . "FROM seq_region sr, coord_system cs ";
277 
278  $constraint = "AND sr.coord_system_id = cs.coord_system_id "
279  . "AND cs.species_id = ? ";
280  push( @bind_params, [ $self->species_id(), SQL_INTEGER ] );
281 
282  if ( defined($version) ) {
283  $constraint .= "AND cs.version = ? ";
284  push( @bind_params, [ $version, SQL_VARCHAR ] );
285  }
286 
287  $constraint .= "ORDER BY cs.rank ASC";
288  }
289 
290  # check the cache so we only go to the db if necessary
291  my $length;
292  my $arr;
293 
294  # use $key to access karyotype cache and define $arr
295  if ( defined($key) && ($key eq "karyotype_cache") ) {
296 
297  my $coord_system_id = $cs->dbID();
298 
299  # retrieve values from karyotype cache
300  my $seq_region_id = $cs->{$key}{$seq_region_name}{$coord_system_id}{'seq_region_id'};
301  $length = $cs->{$key}{$seq_region_name}{$coord_system_id}{'length'};
302 
303  # populate $arr with values from karyotype cache
304  $arr = [ $seq_region_name, $seq_region_id, $length, $coord_system_id ];
305 
306  # define $end, if not yet defined
307  if ( !defined($end) ) { $end = $length }
308 
309  # add in check that $arr is populated
310  throw("Unable to popular $arr with values from karyotype cache") unless $arr;
311 
312  } else {
313 
314  if ( defined($key) ) { $arr = $self->{'sr_name_cache'}->{$key} }
315 
316  if ( defined($arr) ) {
317  $length = $arr->[3];
318  } else {
319  my $sth =
320  $self->prepare( $sql . "WHERE sr.name = ? " . $constraint );
321 
322  unshift( @bind_params, [ $seq_region_name, SQL_VARCHAR ] );
323 
324  my $pos = 0;
325  foreach my $param (@bind_params) {
326  $sth->bind_param( ++$pos, $param->[0], $param->[1] );
327  }
328 
329  $sth->execute();
330  my @row = $sth->fetchrow_array();
331  $sth->finish();
332 
333  unless ( @row ) {
334 
335  # deal with cases where a coordsystem might not be defined by user
336  my $slice;
337  if (!defined $cs) {
338  $slice = $self->_fetch_by_seq_region_synonym( undef, $seq_region_name, $start, $end, $strand, $version, $no_fuzz );
339  } else {
340  $slice = $self->_fetch_by_seq_region_synonym( $cs, $seq_region_name, $start, $end, $strand, $version, $no_fuzz );
341  }
342 
343  # check whether any slice data has been returned
344  if ( $slice && $slice->seq_region_name ) {
345  my $matched_name = $slice->seq_region_name;
346 
347  # if matched name is different to query name, skip fuzzy matching
348  if ( $matched_name ne $seq_region_name ) {
349  $seq_region_name = $matched_name;
350 
351  # define $arr
352  my $tmp_key_string = "$seq_region_name:" . $slice->coord_system()->dbID();
353  $arr = $self->{'sr_name_cache'}->{$tmp_key_string};
354  $length = $arr->[3];
355  $cs = $slice->coord_system() if (!$cs);
356  }
357 
358  } else { # if no slice object with a match returned, try using a fuzzy match
359  if (!$no_fuzz) {
360 
361  my ($fuzzy_matched_name, $cs_new) = $self->_fetch_by_fuzzy_matching( $cs, $seq_region_name, $sql, $constraint, \@bind_params );
362  $cs = $cs_new;
363 
364  if (!$fuzzy_matched_name) {
365  return;
366  }
367 
368  # define arr
369  my $tmp_key_string = $fuzzy_matched_name . ":" . $cs_new->dbID();
370  if (exists $self->{'sr_name_cache'}->{$tmp_key_string}) {
371  $seq_region_name = $fuzzy_matched_name;
372  $arr = $self->{'sr_name_cache'}->{$tmp_key_string};
373  $length = $arr->[3];
374  } else {
375  return;
376  }
377  } else {
378  return;
379  }
380  }
381  } else {
382 
383  my ( $id, $cs_id );
384  ( $seq_region_name, $id, $length, $cs_id ) = @row;
385 
386  # cache to speed up for future queries
387  my $arr = [ $id, $seq_region_name, $cs_id, $length ];
388  $self->{'sr_name_cache'}->{"$seq_region_name:$cs_id"} = $arr;
389  $self->{'sr_id_cache'}->{"$id"} = $arr;
390  $cs = $csa->fetch_by_dbID($cs_id);
391  }
392  } ## end else [ if ( defined($arr) ) ]
393  }
394 
395  if ( !defined($end) ) { $end = $length }
396 
397  #If this was given then check if we've got a circular seq region otherwise
398  #let it fall through to the normal Slice method
399  if ( $end + 1 < $start ) {
400  my $cs_id = $cs->dbID();
401  my $seq_region_id = $self->{'sr_name_cache'}->{"$seq_region_name:$cs_id"}->[0];
402  if($self->is_circular($seq_region_id)) {
403  my $new_sl =
405  -COORD_SYSTEM => $cs,
406  -SEQ_REGION_NAME => $seq_region_name,
407  -SEQ_REGION_LENGTH => $length,
408  -START => $start,
409  -END => $end,
410  -STRAND => 1,
411  -ADAPTOR => $self );
412 
413  return $new_sl;
414  }
415  }
416 
417  if ( defined( $self->{'lrg_region_test'} )
418  and substr( $cs->name, 0, 3 ) eq $self->{'lrg_region_test'} )
419  {
420  return
421  Bio::EnsEMBL::LRGSlice->new( -COORD_SYSTEM => $cs,
422  -SEQ_REGION_NAME => $seq_region_name,
423  -SEQ_REGION_LENGTH => $length,
424  -START => $start,
425  -END => $end,
426  -STRAND => $strand,
427  -ADAPTOR => $self );
428  } else {
429  return
431  'coord_system' => $cs,
432  'seq_region_name' => $seq_region_name,
433  'seq_region_length' => $length,
434  'start' => $start,
435  'end' => $end,
436  'strand' => $strand,
437  'adaptor' => $self } );
438  }
439 } ## end sub fetch_by_region
440 
441 =head2 fetch_by_toplevel_location
442 
443  Arg [1] : string $location
444  Ensembl formatted location. Can be a format like
445  C<name:start-end>, C<name:start..end>, C<name:start:end>,
446  C<name:start>, C<name>. We can also support strand
447  specification as a +/- or 1/-1.
448 
449  Location names must be separated by a C<:>. All others can be
450  separated by C<..>, C<:> or C<->.
451  Arg[2] : boolean $no_warnings
452  Suppress warnings from this method
453  Arg[3] : boolean $no_fuzz
454  Stop fuzzy matching of sequence regions from occuring
455  Arg[4] : boolean $ucsc
456  If we are unsuccessful at retriving a location retry taking any
457  possible chr prefix into account e.g. chrX and X are treated as
458  equivalents
459  Example : my $slice = $sa->fetch_by_toplevel_location('X:1-10000')
460  my $slice = $sa->fetch_by_toplevel_location('X:1-10000:-1')
461  Description : Converts an Ensembl location/region into the sequence region
462  name, start and end and passes them onto C<fetch_by_region()>.
463  The code assumes that the required slice is on the top level
464  coordinate system. The code assumes that location formatting
465  is not perfect and will perform basic cleanup before parsing.
466  Returntype : Bio::EnsEMBL::Slice
467  Exceptions : If $location is false otherwise see C<fetch_by_location()>
468  or C<fetch_by_region()>
469  Caller : General
470  Status : Beta
471 
472 =cut
473 
474 sub fetch_by_toplevel_location {
475  my ($self, $location, $no_warnings, $no_fuzz, $ucsc) = @_;
476  return $self->fetch_by_location($location, 'toplevel', undef, $no_warnings, $no_fuzz, $ucsc);
477 }
478 
479 =head2 fetch_by_location
480 
481  Arg [1] : string $location
482  Ensembl formatted location. Can be a format like
483  C<name:start-end>, C<name:start..end>, C<name:start:end>,
484  C<name:start>, C<name>. We can also support strand
485  specification as a +/- or 1/-1.
486 
487  Location names must be separated by a C<:>. All others can be
488  separated by C<..>, C<:>, C<_> or C<->.
489  Arg[2] : String $coord_system_name
490  The coordinate system to retrieve
491  Arg[3] : String $coord_system_version
492  Optional parameter. Version of the coordinate system to fetch
493  Arg[4] : boolean $no_warnings
494  Suppress warnings from this method
495  Arg[5] : boolean $no_fuzz
496  Stop fuzzy matching of sequence regions from occuring
497  Arg[6] : boolean $ucsc
498  If we are unsuccessful at retriving a location retry taking any
499  possible chr prefix into account e.g. chrX and X are treated as
500  equivalents
501  Example : my $slice = $sa->fetch_by_location('X:1-10000','chromosome')
502  my $slice = $sa->fetch_by_location('X:1-10000:-1','toplevel')
503  Description : Converts an Ensembl location/region into the sequence region
504  name, start and end and passes them onto C<fetch_by_region()>.
505  The code assumes that location formatting is not perfect and
506  will perform basic cleanup before parsing.
507  Returntype : Bio::EnsEMBL::Slice
508  Exceptions : If $location or coordinate system is false otherwise
509  see C<fetch_by_region()>
510  Caller : General
511  Status : Beta
512 
513 =cut
514 
515 sub fetch_by_location {
516  my ($self, $location, $coord_system_name, $coord_system_version, $no_warnings, $no_fuzz, $ucsc) = @_;
517  throw "No coordinate system name specified" unless $coord_system_name;
518 
519  my ($seq_region_name, $start, $end, $strand) = $self->parse_location_to_values($location, $no_warnings);
520 
521  if(! $seq_region_name) {
522  return;
523  }
524 
525  if(defined $start && defined $end && $start > $end) {
526  throw "Cannot request a slice whose start is greater than its end. Start: $start. End: $end";
527  }
528 
529  $seq_region_name =~ s/^chr// if ($ucsc);
530  my $slice = $self->fetch_by_region($coord_system_name, $seq_region_name, $start, $end, $strand, $coord_system_version, $no_fuzz);
531  if(! defined $slice) {
532  if($ucsc) {
533  my $ucsc_seq_region_name = $seq_region_name;
534  $ucsc_seq_region_name =~ s/^chr//;
535  if($ucsc_seq_region_name ne $seq_region_name) {
536  $slice = $self->fetch_by_region($coord_system_name, $ucsc_seq_region_name, $start, $end, $strand, $coord_system_version, $no_fuzz);
537  return if ! defined $slice; #if we had no slice still then bail
538  }
539  else {
540  return; #If it was not different then we didn't have the prefix so just return (same bail as before)
541  }
542  }
543  else {
544  return; #We didn't have a slice and no UCSC specifics are being triggered
545  }
546  }
547 
548  my $srl = $slice->seq_region_length();
549  my $name = $slice->seq_region_name();
550  if(defined $start && $start > $srl) {
551  throw "Cannot request a slice whose start ($start) is greater than $srl for $name.";
552  }
553  if(defined $end && $end > $srl) {
554  warning "Requested end ($end) is greater than $srl for $name. Resetting to $srl" if ! $no_warnings;
555  $slice->{end} = $srl;
556  }
557 
558  return $slice;
559 }
560 
561 =head2 parse_location_to_values
562 
563  Arg [1] : string $location
564  Ensembl formatted location. Can be a format like
565  C<name:start-end>, C<name:start..end>, C<name:start:end>,
566  C<name:start>, C<name>. We can also support strand
567  specification as a +/- or 1/-1.
568 
569  Location names must be separated by a C<:>. All others can be
570  separated by C<..>, C<:> C<_>, or C<->.
571 
572  If the start is negative, start will be reset to 1 (e.g.: 1: -10-1,000')
573  If both start and end are negative, returns undef (e.g.: 1: -10--1,000')
574  Arg[2] : boolean $no_warnings
575  Suppress warnings from this method
576  Arg[3] : boolean $no_errors
577  Supress errors being thrown from this method
578  Example : my ($name, $start, $end, $strand) = $sa->parse_location_to_values('X:1..100:1);
579  Description : Takes in an Ensembl location String and returns the parsed
580  values
581  Returntype : List. Contains name, start, end and strand
582 
583 =cut
584 
585 
586 sub parse_location_to_values {
587  my ($self, $location, $no_warnings, $no_errors) = @_;
588 
589  throw 'You must specify a location' if ! $location;
590 
591  #cleanup any nomenclature like 1 000 or 1,000
592  my $number_seps_regex = qr/\s+|,/;
593  my $separator_regex = qr/(?:-|[.]{2}|\:|_)?/; # support -, .., : and _ as separators
594  my $hgvs_nomenclature_regex = qr/(?:g\.)?/; # check for HGVS looking locations e.g. X:g.1-100
595  my $number_regex = qr/[0-9, EMKG]+/xmsi;
596  my $number_regex_signed = qr/-?[0-9, EMKG]+/xmsi; # to capture negative locations as sometimes we end up in negative location if the location is padded
597  my $strand_regex = qr/[+-1]|-1/xms;
598 
599  my $regex = qr/^((?:\w|\.|_|-)+) \s* :? \s* $hgvs_nomenclature_regex ($number_regex_signed)? $separator_regex ($number_regex)? $separator_regex ($strand_regex)? $/xms;
600  my ($seq_region_name, $start, $end, $strand);
601  if(($seq_region_name, $start, $end, $strand) = $location =~ $regex) {
602 
603  if(defined $strand) {
604  if(!looks_like_number($strand)) {
605  $strand = ($strand eq '+') ? 1 : -1;
606  }
607  }
608 
609  if(defined $start) {
610  $start =~ s/$number_seps_regex//g;
611  if($start < 1) {
612  warning "Start was less than 1 (${start}) which is not allowed. Resetting to 1" if ! $no_warnings;
613 
614  unless(defined $end) {
615  # We will reach here only when the location is given without start and '-' is used as seperator eg: 1:-10 (expected to return 1:1-10)
616  $end = abs($start);
617  }
618  $start = 1;
619  }
620  }
621  if(defined $end) {
622  $end =~ s/$number_seps_regex//g;
623  if($end < 1) {
624  throw "Cannot request negative or 0 end indexes through this interface. Given $end but expected something greater than 0" unless $no_errors;
625  }
626  }
627 
628  }
629 
630  return ($seq_region_name, $start, $end, $strand);
631 }
632 
633 =head2 fetch_by_region_unique
634 
635  Arg [1] : string $coord_system_name (optional)
636  The name of the coordinate system of the slice to be created
637  This may be a name of an actual coordinate system or an alias
638  to a coordinate system. Valid aliases are 'seqlevel' or
639  'toplevel'.
640  Arg [2] : string $seq_region_name
641  The name of the sequence region that the slice will be
642  created on.
643  Arg [3] : int $start (optional, default = 1)
644  The start of the slice on the sequence region
645  Arg [4] : int $end (optional, default = seq_region length)
646  The end of the slice on the sequence region
647  Arg [5] : int $strand (optional, default = 1)
648  The orientation of the slice on the sequence region
649  Arg [6] : string $version (optional, default = default version)
650  The version of the coordinate system to use (e.g. NCBI33)
651  Arg [7] : boolean $no_fuzz (optional, default = undef (false))
652  If true (non-zero), do not use "fuzzy matching" (see below).
653  Example : $slice = $slice_adaptor->fetch_by_region_unique('chromosome', 'HSCHR6_MHC_COX');
654  Description: Retrieves a slice on the requested region but returns only the unique
655  parts of the slice. At a minimum the
656  name the name of the seq_region to fetch must be provided.
657 
658  If no coordinate system name is provided than a slice on the
659  highest ranked coordinate system with a matching
660  seq_region_name will be returned. If a version but no
661  coordinate system name is provided, the same behaviour will
662  apply, but only coordinate systems of the appropriate version
663  are considered. The same applies if the 'toplevel' coordinate
664  system is specified, however in this case the version is
665  ignored. The coordinate system should always be specified if
666  it is known, since this is unambiguous and faster.
667 
668  Some fuzzy matching is performed if no exact match for
669  the provided name is found. This allows clones to be
670  fetched even when their version is not known. For
671  example fetch_by_region('clone', 'AC008066') will
672  retrieve the sequence_region with name 'AC008066.4'.
673 
674  The fuzzy matching can be turned off by setting the
675  $no_fuzz argument to a true value.
676 
677  If the requested seq_region is not found in the database undef
678  is returned.
679 
680  Returntype : listref Bio::EnsEMBL::Slice
681  Exceptions : throw if no seq_region_name is provided
682  throw if invalid coord_system_name is provided
683  throw if start > end is provided
684  Caller : general
685  Status : Stable
686 
687 =cut
688 
689 sub fetch_by_region_unique {
690  my $self = shift;
691 
692  my @out = ();
693  my $slice = $self->fetch_by_region(@_);
694 
695 
696  if ( !exists( $self->{'asm_exc_cache'} ) ) {
697  $self->_build_exception_cache();
698  }
699 
700  if ( exists(
701  $self->{'asm_exc_cache'}->{ $self->get_seq_region_id($slice) }
702  ) )
703  {
704  # Dereference symlinked assembly regions. Take out any regions
705  # which are symlinked because these are duplicates.
706  my @projection =
707  @{ $self->fetch_normalized_slice_projection($slice) };
708 
709  foreach my $segment (@projection) {
710  if ( $segment->[2]->seq_region_name() eq $slice->seq_region_name()
711  && $segment->[2]->coord_system->equals( $slice->coord_system ) )
712  {
713  push( @out, $segment->[2] );
714  }
715  }
716  } else {
717  @out = ($slice);
718  }
719 
720  return \@out;
721 } ## end sub fetch_by_region_unique
722 
723 =head2 fetch_by_name
724 
725  Arg [1] : string $name
726  Example : $name = 'chromosome:NCBI34:X:1000000:2000000:1';
727  $slice = $slice_adaptor->fetch_by_name($name);
728  $slice2 = $slice_adaptor->fetch_by_name($slice3->name());
729  Description: Fetches a slice using a slice name (i.e. the value returned by
730  the Slice::name method). This is useful if you wish to
731  store a unique identifier for a slice in a file or database or
732  pass a slice over a network.
733  Slice::name allows you to serialise/marshall a slice and this
734  method allows you to deserialise/unmarshal it.
735 
736  Returns undef if no seq_region with the provided name exists in
737  the database.
738 
739  Returntype : Bio::EnsEMBL::Slice or undef
740  Exceptions : throw if incorrent arg provided
741  Caller : Pipeline
742  Status : Stable
743 
744 =cut
745 
746 sub fetch_by_name {
747  my $self = shift;
748  my $name = shift;
749 
750  if(!$name) {
751  throw("name argument is required");
752  }
753 
754  my @array = split(/:/,$name);
755 
756  if(scalar(@array) < 3 || scalar(@array) > 6) {
757  throw("Malformed slice name [$name]. Format is " .
758  "coord_system:version:name:start:end:strand");
759  }
760 
761  # Rearrange arguments to suit fetch_by_region
762 
763  my @targetarray;
764 
765  $targetarray[0]=$array[0];
766  $targetarray[5]=(($array[1]&&$array[1] ne "")?$array[1]:undef);
767  $targetarray[1]=(($array[2]&&$array[2] ne "")?$array[2]:undef);
768  $targetarray[2]=(($array[3]&&$array[3] ne "")?$array[3]:undef);
769  $targetarray[3]=(($array[4]&&$array[4] ne "")?$array[4]:undef);
770  $targetarray[4]=(($array[5]&&$array[5] ne "")?$array[5]:undef);
771  return $self->fetch_by_region(@targetarray);
772 }
773 
774 
775 
776 =head2 fetch_by_seq_region_id
777 
778  Arg [1] : string $seq_region_id
779  The internal identifier of the seq_region to create this slice
780  on
781  Arg [2] : optional start
782  Arg [3] : optional end
783  Arg [4] : optional strand
784  Example : $slice = $slice_adaptor->fetch_by_seq_region_id(34413);
785  Description: Creates a slice object of an entire seq_region using the
786  seq_region internal identifier to resolve the seq_region.
787  Returns undef if no such slice exists.
788  Returntype : Bio::EnsEMBL::Slice or undef
789  Exceptions : none
790  Caller : general
791  Status : Stable
792 
793 =cut
794 
795 sub fetch_by_seq_region_id {
796  my ( $self, $seq_region_id, $start, $end, $strand, $check_prior_ids ) = @_;
797 
798  my $csa = $self->db->get_CoordSystemAdaptor();
799  my $arr = $self->{'sr_id_cache'}->{$seq_region_id};
800  my ( $name, $length, $cs, $cs_id );
801 
802 
803  if ( $arr && defined( $arr->[2] ) ) {
804  ( $name, $cs_id, $length ) = ( $arr->[1], $arr->[2], $arr->[3] );
805  $cs = $self->db->get_CoordSystemAdaptor->fetch_by_dbID($cs_id);
806  } else {
807  my $sth =
808  $self->prepare( "SELECT sr.name, sr.coord_system_id, sr.length "
809  . "FROM seq_region sr "
810  . "WHERE sr.seq_region_id = ? " );
811 
812  $sth->bind_param( 1, $seq_region_id, SQL_INTEGER );
813  $sth->execute();
814 
815  my @row = $sth->fetchrow_array();
816  unless ( @row ) {
817  # This could have been an old seq region id so see if we can
818  # translate it into a more recent version.
819  if($check_prior_ids) {
820  if(exists $csa->{_external_seq_region_mapping}->{$seq_region_id}) {
821  my $new_seq_region_id = $csa->{_external_seq_region_mapping}->{$seq_region_id};
822  # No need to pass check prior ids flag because it's a 1 step relationship
823  return $self->fetch_by_seq_region_id($new_seq_region_id, $start, $end, $strand);
824  }
825  }
826  return undef;
827  }
828 
829  ( $name, $cs_id, $length ) = @row;
830  $sth->finish();
831 
832  $cs = $self->db->get_CoordSystemAdaptor->fetch_by_dbID($cs_id);
833 
834  #cache results to speed up repeated queries
835  my $arr = [ $seq_region_id, $name, $cs_id, $length ];
836 
837  $self->{'sr_name_cache'}->{"$name:$cs_id"} = $arr;
838  $self->{'sr_id_cache'}->{"$seq_region_id"} = $arr;
839  }
840 
841  return
843  'coord_system' => $cs,
844  'seq_region_name' => $name,
845  'seq_region_length'=> $length,
846  'start' => $start || 1,
847  'end' => $end || $length,
848  'strand' => $strand || 1,
849  'adaptor' => $self} );
850 } ## end sub fetch_by_seq_region_id
851 
852 
853 
854 =head2 get_seq_region_id
855 
856  Arg [1] : Bio::EnsEMBL::Slice $slice
857  The slice to fetch a seq_region_id for
858  Example : $srid = $slice_adaptor->get_seq_region_id($slice);
859  Description: Retrieves the seq_region id (in this database) given a slice
860  Seq region ids are not stored on the slices themselves
861  because they are intended to be somewhat database independant
862  and seq_region_ids vary accross databases.
863  Returntype : int
864  Exceptions : throw if the seq_region of the slice is not in the db
865  throw if incorrect arg provided
866  Caller : BaseFeatureAdaptor
867  Status : Stable
868 
869 =cut
870 
871 sub get_seq_region_id {
872  my $self = shift;
873  my $slice = shift;
874 
875  if(!$slice || !ref($slice) || !($slice->isa('Bio::EnsEMBL::Slice') or $slice->isa('Bio::EnsEMBL::LRGSlice'))) {
876  throw('Slice argument is required');
877  }
878 
879  my $seq_region_name = $slice->seq_region_name();
880  my $key = $seq_region_name.":".$slice->coord_system->dbID();
881  my $arr = $self->{'sr_name_cache'}->{"$key"};
882 
883  if( $arr ) {
884  return $arr->[0];
885  }
886 
887  my $cs_id = $slice->coord_system->dbID();
888 
889  my $sth = $self->prepare("SELECT seq_region_id, length " .
890  "FROM seq_region " .
891  "WHERE name = ? AND coord_system_id = ?");
892 
893  #force seq_region_name cast to string so mysql cannot treat as int
894  $sth->bind_param(1,"$seq_region_name",SQL_VARCHAR);
895  $sth->bind_param(2,$cs_id,SQL_INTEGER);
896  $sth->execute();
897 
898  my @row = $sth->fetchrow_array();
899  unless ( @row ) {
900  throw("No-existent seq_region [$seq_region_name] in coord system [$cs_id]");
901  }
902  my @more = $sth->fetchrow_array();
903  if ( @more ) {
904  throw("Ambiguous seq_region [$seq_region_name] in coord system [$cs_id]");
905  }
906 
907  my($seq_region_id, $length) = @row;
908  $sth->finish();
909 
910  #cache information for future requests
911  $arr = [ $seq_region_id, $seq_region_name, $cs_id, $length ];
912 
913  $self->{'sr_name_cache'}->{"$seq_region_name:$cs_id"} = $arr;
914  $self->{'sr_id_cache'}->{"$seq_region_id"} = $arr;
915 
916  return $seq_region_id;
917 }
918 
919 
920 
921 =head2 fetch_all
922 
923  Arg [1] : string $coord_system_name
924  The name of the coordinate system to retrieve slices of.
925  This may be a name of an acutal coordinate system or an alias
926  to a coordinate system. Valid aliases are 'seqlevel' or
927  'toplevel'.
928  Arg [2] : string $coord_system_version (optional)
929  The version of the coordinate system to retrieve slices of
930  Arg [3] : bool $include_non_reference (optional)
931  If this argument is not provided then only reference slices
932  will be returned. If set, both reference and non reference
933  slices will be returned.
934  Arg [4] : int $include_duplicates (optional)
935  If set duplicate regions will be returned.
936 
937  NOTE: if you do not use this option and you have a PAR
938  (pseudo-autosomal region) at the beginning of your seq_region
939  then your slice will not start at position 1, so coordinates
940  retrieved from this slice might not be what you expected.
941 
942  Arg[5] : bool $include_lrg (optional) (default 0)
943  If set lrg regions will be returned aswell.
944 
945 
946  Example : @chromos = @{$slice_adaptor->fetch_all('chromosome','NCBI33')};
947  @contigs = @{$slice_adaptor->fetch_all('contig')};
948 
949  # get even non-reference regions
950  @slices = @{$slice_adaptor->fetch_all('toplevel',undef,1)};
951 
952  # include duplicate regions (such as pseudo autosomal regions)
953  @slices = @{$slice_adaptor->fetch_all('toplevel', undef,0,1)};
954 
955  Description: Retrieves slices of all seq_regions for a given coordinate
956  system. This is analagous to the methods fetch_all which were
957  formerly on the ChromosomeAdaptor, RawContigAdaptor and
958  CloneAdaptor classes. Slices fetched span the entire
959  seq_regions and are on the forward strand.
960  If the coordinate system with the provided name and version
961  does not exist an empty list is returned.
962  If the coordinate system name provided is 'toplevel', all
963  non-redundant toplevel slices are returned (note that any
964  coord_system_version argument is ignored in that case).
965 
966  Retrieved slices can be broken into smaller slices using the
968 
969  Returntype : listref of Bio::EnsEMBL::Slices
970  Exceptions : none
971  Caller : general
972  Status : Stable
973 
974 =cut
975 
976 sub fetch_all {
977  my $self = shift;
978  my $cs_name = shift;
979  my $cs_version = shift || '';
980 
981  my ($include_non_reference, $include_duplicates, $include_lrg) = @_;
982 
983  #
984  # verify existance of requested coord system and get its id
985  #
986  my $csa = $self->db->get_CoordSystemAdaptor();
987  my $orig_cs = $csa->fetch_by_name($cs_name, $cs_version);
988 
989  return [] if ( !$orig_cs );
990 
991  my %bad_vals=();
992 
993 
994  #
995  # Get a hash of non reference seq regions
996  #
997  if ( !$include_non_reference ) {
998  my $sth =
999  $self->prepare( 'SELECT sr.seq_region_id '
1000  . 'FROM seq_region sr, seq_region_attrib sra, '
1001  . 'attrib_type at, coord_system cs '
1002  . 'WHERE at.code = "non_ref" '
1003  . 'AND sra.seq_region_id = sr.seq_region_id '
1004  . 'AND at.attrib_type_id = sra.attrib_type_id '
1005  . 'AND sr.coord_system_id = cs.coord_system_id '
1006  . 'AND cs.species_id = ?' );
1007 
1008  $sth->bind_param( 1, $self->species_id(), SQL_INTEGER );
1009  $sth->execute();
1010 
1011  my ($seq_region_id);
1012  $sth->bind_columns( \$seq_region_id );
1013 
1014  while ( $sth->fetch() ) {
1015  $bad_vals{$seq_region_id} = 1;
1016  }
1017  }
1018 
1019  #
1020  # if we do not want lrg's then add them to the bad list;
1021  #
1022  if ( !$include_lrg ) {
1023  my $sth =
1024  $self->prepare( 'SELECT sr.seq_region_id '
1025  . 'FROM seq_region sr, seq_region_attrib sra, '
1026  . 'attrib_type at, coord_system cs '
1027  . 'WHERE at.code = "LRG" '
1028  . 'AND sra.seq_region_id = sr.seq_region_id '
1029  . 'AND at.attrib_type_id = sra.attrib_type_id '
1030  . 'AND sr.coord_system_id = cs.coord_system_id '
1031  . 'AND cs.species_id = ?' );
1032 
1033  $sth->bind_param( 1, $self->species_id(), SQL_INTEGER );
1034  $sth->execute();
1035 
1036  my ($seq_region_id);
1037  $sth->bind_columns( \$seq_region_id );
1038 
1039  while ( $sth->fetch() ) {
1040  $bad_vals{$seq_region_id} = 1;
1041  }
1042  }
1043 
1044  #
1045  # Retrieve the seq_regions from the database
1046  #
1047 
1048  my $sth;
1049  if ( $orig_cs->is_top_level() ) {
1050  $sth =
1051  $self->prepare( 'SELECT sr.seq_region_id, sr.name, '
1052  . 'sr.length, sr.coord_system_id '
1053  . 'FROM seq_region sr, seq_region_attrib sra, '
1054  . 'attrib_type at, coord_system cs '
1055  . 'WHERE at.code = "toplevel" '
1056  . 'AND at.attrib_type_id = sra.attrib_type_id '
1057  . 'AND sra.seq_region_id = sr.seq_region_id '
1058  . 'AND sr.coord_system_id = cs.coord_system_id '
1059  . 'AND cs.species_id = ?' );
1060 
1061  $sth->bind_param( 1, $self->species_id(), SQL_INTEGER );
1062  $sth->execute();
1063  } else {
1064  $sth =
1065  $self->prepare( 'SELECT sr.seq_region_id, sr.name, '
1066  . 'sr.length, sr.coord_system_id '
1067  . 'FROM seq_region sr '
1068  . 'WHERE sr.coord_system_id = ?' );
1069 
1070  $sth->bind_param( 1, $orig_cs->dbID, SQL_INTEGER );
1071  $sth->execute();
1072  }
1073 
1074  my ( $seq_region_id, $name, $length, $cs_id );
1075  $sth->bind_columns( \( $seq_region_id, $name, $length, $cs_id ) );
1076 
1077  my $cache_count = 0;
1078 
1079  my @out;
1080  while($sth->fetch()) {
1081  if(!defined($bad_vals{$seq_region_id})){
1082  my $cs = $csa->fetch_by_dbID($cs_id);
1083 
1084  if(!$cs) {
1085  throw("seq_region $name references non-existent coord_system $cs_id.");
1086  }
1087 
1088  #cache values for future reference, but stop adding to the cache once we
1089  #we know we have filled it up
1090  if($cache_count < $Bio::EnsEMBL::Utils::SeqRegionCache::SEQ_REGION_CACHE_SIZE) {
1091  my $arr = [ $seq_region_id, $name, $cs_id, $length ];
1092 
1093  $self->{'sr_name_cache'}->{"$name:$cs_id"} = $arr;
1094  $self->{'sr_id_cache'}->{"$seq_region_id"} = $arr;
1095 
1096  $cache_count++;
1097  }
1098 
1099  my $slice = Bio::EnsEMBL::Slice->new_fast({
1100  'start' => 1,
1101  'end' => $length,
1102  'strand' => 1,
1103  'seq_region_name' => $name,
1104  'seq_region_length'=> $length,
1105  'coord_system' => $cs,
1106  'adaptor' => $self});
1107 
1108  if(!defined($include_duplicates) or !$include_duplicates){
1109  # test if this slice *could* have a duplicate (exception) region
1110  $self->_build_exception_cache() if(!exists $self->{'asm_exc_cache'});
1111  if(exists $self->{asm_exc_cache}->{$seq_region_id}) {
1112 
1113  # Dereference symlinked assembly regions. Take out
1114  # any regions which are symlinked because these are duplicates
1115  my @projection = @{$self->fetch_normalized_slice_projection($slice)};
1116  foreach my $segment ( @projection) {
1117  if($segment->[2]->seq_region_name() eq $slice->seq_region_name() &&
1118  $segment->[2]->coord_system->equals($slice->coord_system)) {
1119  push @out, $segment->[2];
1120  }
1121  }
1122  } else {
1123  # no duplicate regions
1124  push @out, $slice;
1125  }
1126  } else {
1127  # we want duplicates anyway so do not do any checks
1128  push @out, $slice;
1129  }
1130  }
1131  }
1132 
1133  return \@out;
1134 }
1135 
1136 =head2 fetch_all_by_genome_component
1137 
1138  Arg [1] : string $genome_component_name
1139  The name of the genome component to retrieve slices of.
1140  Example : @slices = @{$slice_adaptor->fetch_all_by_genome_component('A')};
1141  Description: Returns the list of all top level slices for a a given
1142  genome component
1143  Returntype : listref of Bio::EnsEMBL::Slices
1144  Exceptions : If argument is not provided or is not a valid genome
1145  component
1146  Caller : general
1147  Status : Stable
1148 
1149 =cut
1150 
1151 sub fetch_all_by_genome_component {
1152  my $self = shift;
1153  my $genome_component = shift;
1154  defined $genome_component or
1155  throw "Undefined genome component";
1156 
1157  # check the provided genome component is valid
1158  my $gc = $self->db->get_adaptor('GenomeContainer');
1159  my $is_valid_component = grep { $_ eq $genome_component }
1160  @{$gc->get_genome_components};
1161  throw "Invalid genome component"
1162  unless $is_valid_component;
1163 
1164  #
1165  # Retrieve the toplevel seq_regions from the database
1166  #
1167  my $sth =
1168  $self->prepare( "SELECT sr.seq_region_id, sr.name, sr.length, sr.coord_system_id "
1169  . "FROM seq_region sr "
1170  . "JOIN seq_region_attrib sa1 USING (seq_region_id) "
1171  . "JOIN attrib_type a1 ON sa1.attrib_type_id = a1.attrib_type_id "
1172  . "JOIN seq_region_attrib sa2 USING (seq_region_id) "
1173  . "JOIN attrib_type a2 ON sa2.attrib_type_id = a2.attrib_type_id "
1174  . "WHERE sa2.value=? and a1.code='toplevel' and a2.code='genome_component'"
1175  );
1176 
1177  if (looks_like_number($genome_component)) {
1178  $sth->bind_param( 1, $genome_component, SQL_INTEGER );
1179  } else {
1180  $sth->bind_param( 1, $genome_component, SQL_VARCHAR );
1181  }
1182 
1183  $sth->execute();
1184 
1185  my ( $seq_region_id, $name, $length, $cs_id );
1186  $sth->bind_columns( \( $seq_region_id, $name, $length, $cs_id ) );
1187 
1188  my $cache_count = 0;
1189 
1190  my @out;
1191  my $csa = $self->db->get_CoordSystemAdaptor();
1192 
1193  while($sth->fetch()) {
1194  my $cs = $csa->fetch_by_dbID($cs_id);
1195  throw("seq_region $name references non-existent coord_system $cs_id.")
1196  unless $cs;
1197 
1198  # cache values for future reference, but stop adding to the cache once we
1199  # we know we have filled it up
1200  if($cache_count < $Bio::EnsEMBL::Utils::SeqRegionCache::SEQ_REGION_CACHE_SIZE) {
1201  my $arr = [ $seq_region_id, $name, $cs_id, $length ];
1202 
1203  $self->{'sr_name_cache'}->{"$name:$cs_id"} = $arr;
1204  $self->{'sr_id_cache'}->{"$seq_region_id"} = $arr;
1205 
1206  $cache_count++;
1207  }
1208 
1209  push @out, Bio::EnsEMBL::Slice->new_fast({
1210  'start' => 1,
1211  'end' => $length,
1212  'strand' => 1,
1213  'seq_region_name' => $name,
1214  'seq_region_length'=> $length,
1215  'coord_system' => $cs,
1216  'adaptor' => $self});
1217  }
1218 
1219  return \@out;
1220 }
1221 
1222 =head2 get_genome_component_for_slice
1223 
1224  Arg [1] : An object of type Bio::EnsEMBL::Slice
1225  Example : my $component = $slice->get_genome_component();
1226  Description: Returns the genome component of a slice
1227  Returntype : Scalar; the identifier of the genome component of the slice
1228  Exceptions : none
1229  Caller : general
1230  Status : Stable
1231 
1232 =cut
1233 
1234 sub get_genome_component_for_slice {
1235  my ($self, $slice) = @_;
1236 
1237  throw "Undefined slice" unless defined $slice;
1238  throw "Argument is not a slice"
1239  unless $slice->isa("Bio::EnsEMBL::Slice");
1240 
1241  my $seq_region_id = $self->get_seq_region_id($slice);
1242 
1243  my $sth =
1244  $self->prepare( "SELECT sa.value "
1245  . "FROM seq_region_attrib sa "
1246  . "JOIN seq_region sr USING (seq_region_id) "
1247  . "JOIN attrib_type at ON sa.attrib_type_id = at.attrib_type_id "
1248  . "WHERE sr.seq_region_id=? AND at.code='genome_component'"
1249  );
1250  $sth->bind_param( 1, $seq_region_id, SQL_INTEGER );
1251  $sth->execute();
1252 
1253  my $genome_component;
1254  $sth->bind_columns( \( $genome_component ) );
1255  $sth->fetch();
1256 
1257  return $genome_component;
1258 }
1259 
1260 =head2 fetch_all_karyotype
1261 
1262  Example : my $top = $slice_adptor->fetch_all_karyotype()
1263  Description: returns the list of all slices which are part of the karyotype
1264  Returntype : listref of Bio::EnsEMBL::Slices
1265  Caller : general
1266  Status : At Risk
1267 
1268 =cut
1269 
1270 sub fetch_all_karyotype {
1271  my $self = shift;
1272 
1273  my $csa = $self->db->get_CoordSystemAdaptor();
1274 
1275  my $sth =
1276  $self->prepare( 'SELECT sr.seq_region_id, sr.name, '
1277  . 'sr.length, sr.coord_system_id, sra.value '
1278  . 'FROM seq_region sr, seq_region_attrib sra, '
1279  . 'attrib_type at, coord_system cs '
1280  . 'WHERE at.code = "karyotype_rank" '
1281  . 'AND at.attrib_type_id = sra.attrib_type_id '
1282  . 'AND sra.seq_region_id = sr.seq_region_id '
1283  . 'AND sr.coord_system_id = cs.coord_system_id '
1284  . 'AND cs.species_id = ?');
1285  $sth->bind_param( 1, $self->species_id(), SQL_INTEGER );
1286  $sth->execute();
1287  my ( $seq_region_id, $name, $length, $cs_id, $rank );
1288  $sth->bind_columns( \( $seq_region_id, $name, $length, $cs_id, $rank ) );
1289 
1290  my @out;
1291  while($sth->fetch()) {
1292  my $cs = $csa->fetch_by_dbID($cs_id);
1293 
1294  my $slice = Bio::EnsEMBL::Slice->new_fast({
1295  'start' => 1,
1296  'end' => ($length+0),
1297  'strand' => 1,
1298  'seq_region_name' => $name,
1299  'seq_region_length'=> ($length+0),
1300  'coord_system' => $cs,
1301  'adaptor' => $self,
1302  'karyotype' => 1,
1303  'karyotype_rank' => ($rank+0),
1304  });
1305 
1306  push @out, $slice;
1307  }
1308 
1309  #Sort using Perl as value in MySQL is a text field and not portable
1310  @out = sort { $a->{karyotype_rank} <=> $b->{karyotype_rank} } @out;
1311 
1312  return \@out;
1313 }
1314 
1315 =head2 is_toplevel
1316  Arg : int seq_region_id
1317  Example : my $top = $slice_adptor->is_toplevel($seq_region_id)
1318  Description: Returns 1 if slice is a toplevel slice else 0
1319  Returntype : int
1320  Caller : Slice method is_toplevel
1321  Status : At Risk
1322 
1323 =cut
1324 
1325 sub is_toplevel {
1326  my $self = shift;
1327  my $id = shift;
1328 
1329  my $sth = $self->prepare(
1330  "SELECT at.code from seq_region_attrib sra, attrib_type at "
1331  . "WHERE sra.seq_region_id = ? "
1332  . "AND at.attrib_type_id = sra.attrib_type_id "
1333  . "AND at.code = 'toplevel'" );
1334 
1335  $sth->bind_param( 1, $id, SQL_INTEGER );
1336  $sth->execute();
1337 
1338  my $code;
1339  $sth->bind_columns( \$code );
1340 
1341  while ( $sth->fetch ) {
1342  $sth->finish;
1343  return 1;
1344  }
1345 
1346  $sth->finish;
1347  return 0;
1348 }
1349 
1350 
1351 =head2 has_karyotype
1352  Arg : int seq_region_id
1353  Example : my $karyotype = $slice_adptor->has_karyotype($seq_region_id)
1354  Description: Returns 1 if slice is a part of a karyotype else 0
1355  Returntype : int
1356  Caller : Slice method has_karyotype
1357  Status : At Risk
1358 
1359 =cut
1360 
1361 sub has_karyotype {
1362  my $self = shift;
1363  my $id = shift;
1364 
1365  my $sth = $self->prepare(
1366  "SELECT at.code from seq_region_attrib sra, attrib_type at "
1367  . "WHERE sra.seq_region_id = ? "
1368  . "AND at.attrib_type_id = sra.attrib_type_id "
1369  . "AND at.code = 'karyotype_rank'" );
1370 
1371  $sth->bind_param( 1, $id, SQL_INTEGER );
1372  $sth->execute();
1373 
1374  my $code;
1375  $sth->bind_columns( \$code );
1376 
1377  while ( $sth->fetch ) {
1378  $sth->finish;
1379  return 1;
1380  }
1381 
1382  $sth->finish;
1383  return 0;
1384 }
1385 
1386 =head2 get_karyotype_rank
1387  Arg : int seq_region_id
1388  Example : my $rank = $slice_adptor->get_karyotype_rank($seq_region_id)
1389  Description: Returns the rank of a slice if it is part of the karyotype else 0
1390  Returntype : int
1391  Caller : Slice method get_karyotype_rank
1392  Status : At Risk
1393 
1394 =cut
1395 
1396 sub get_karyotype_rank {
1397  my $self = shift;
1398  my $id = shift;
1399 
1400  my $sth = $self->prepare(
1401  "SELECT sra.value from seq_region_attrib sra, attrib_type at "
1402  . "WHERE sra.seq_region_id = ? "
1403  . "AND at.attrib_type_id = sra.attrib_type_id "
1404  . "AND at.code = 'karyotype_rank'" );
1405 
1406  $sth->bind_param( 1, $id, SQL_INTEGER );
1407  $sth->execute();
1408 
1409  my $code;
1410  $sth->bind_columns( \$code );
1411 
1412  my $rank = $sth->fetchrow_array();
1413  $sth->finish();
1414 
1415  return $rank;
1416 }
1417 
1418 
1419 
1420 =head2 is_reference
1421  Arg : int seq_region_id
1422  Example : my $reference = $slice_adptor->is_reference($seq_region_id)
1423  Description: Returns 1 if slice is a reference slice else 0
1424  Returntype : int
1425  Caller : Slice method is_reference
1426  Status : At Risk
1427 
1428 =cut
1429 
1430 sub is_reference {
1431  my $self = shift;
1432  my $id = shift;
1433 
1434  my $sth = $self->prepare(
1435  "SELECT at.code from seq_region_attrib sra, attrib_type at "
1436  . "WHERE sra.seq_region_id = ? "
1437  . "AND at.attrib_type_id = sra.attrib_type_id "
1438  . "AND at.code = 'non_ref'" );
1439 
1440  $sth->bind_param( 1, $id, SQL_INTEGER );
1441  $sth->execute();
1442 
1443  my $code;
1444  $sth->bind_columns( \$code );
1445 
1446  while ( $sth->fetch ) {
1447  $sth->finish;
1448  return 0;
1449  }
1450 
1451  $sth->finish;
1452  return 1;
1453 }
1454 
1455 =head2 is_circular
1456 
1457  Arg[1] : int seq_region_id
1458  Example : my $circular = $slice_adptor->is_circular($seq_region_id);
1459  Description : Indicates if the sequence region was circular or not
1460  Returntype : Boolean
1461 
1462 =cut
1463 
1464 sub is_circular {
1465  my ($self, $id) = @_;
1466 
1467  if (! defined $self->{is_circular}) {
1468  $self->_build_circular_slice_cache();
1469  }
1470 
1471  return 0 if $self->{is_circular} == 0;
1472  return (exists $self->{circular_sr_id_cache}->{$id}) ? 1 : 0;
1473 }
1474 
1475 
1476 =head2 fetch_by_chr_band
1477 
1478  Title : fetch_by_chr_band
1479  Usage :
1480  Function: create a Slice representing a series of bands
1481  Example :
1482  Returns : Bio::EnsEMBL::Slice
1483  Args : the band name
1484  Status : Stable
1485 
1486 =cut
1487 
1488 sub fetch_by_chr_band {
1489  my ( $self, $chr, $band ) = @_;
1490 
1491  my $chr_slice = $self->fetch_by_region( 'toplevel', $chr );
1492  my $seq_region_id = $self->get_seq_region_id($chr_slice);
1493 
1494  my $sth =
1495  $self->prepare( 'SELECT MIN(k.seq_region_start), '
1496  . 'MAX(k.seq_region_end) '
1497  . 'FROM karyotype k '
1498  . 'WHERE k.seq_region_id = ? '
1499  . 'AND k.band LIKE ?' );
1500 
1501  $sth->bind_param( 1, $seq_region_id, SQL_INTEGER );
1502  $sth->bind_param( 2, "$band%", SQL_VARCHAR );
1503  $sth->execute();
1504 
1505  my ( $slice_start, $slice_end ) = $sth->fetchrow_array;
1506 
1507  if ( defined $slice_start ) {
1508  return
1509  $self->fetch_by_region( 'toplevel', $chr,
1510  $slice_start, $slice_end );
1511  }
1512 
1513  throw("Band not recognised in database");
1514 } ## end sub fetch_by_chr_band
1515 
1516 
1517 
1518 =head2 fetch_by_exon_stable_id
1519 
1520  Arg [1] : string $exonid
1521  The stable id of the exon around which the slice is
1522  desired
1523  Arg [2] : (optional) int $size
1524  The length of the flanking regions the slice should encompass
1525  on either side of the exon (0 by default)
1526  Example : $slc = $sa->fetch_by_exon_stable_id('ENSE00000302930',10);
1527  Description: Creates a slice around the region of the specified exon.
1528  If a context size is given, the slice is extended by that
1529  number of basepairs on either side of the exon.
1530 
1531  The slice will be created in the exon's native coordinate system
1532  and in the forward orientation.
1533  Returntype : Bio::EnsEMBL::Slice
1534  Exceptions : Thrown if the exon is not in the database.
1535  Caller : general
1536  Status : Stable
1537 
1538 =cut
1539 
1540 sub fetch_by_exon_stable_id{
1541  my ($self,$exonid,$size) = @_;
1542 
1543  throw('Exon argument is required.') if(!$exonid);
1544 
1545  my $ea = $self->db->get_ExonAdaptor;
1546  my $exon = $ea->fetch_by_stable_id($exonid);
1547 
1548  throw("Exon [$exonid] does not exist in DB.") if(!$exon);
1549 
1550  return $self->fetch_by_Feature($exon, $size);
1551 }
1552 
1553 =head2 fetch_by_transcript_stable_id
1554 
1555  Arg [1] : string $transcriptid
1556  The stable id of the transcript around which the slice is
1557  desired
1558  Arg [2] : (optional) int $size
1559  The length of the flanking regions the slice should encompass
1560  on either side of the transcript (0 by default)
1561  Example : $slc = $sa->fetch_by_transcript_stable_id('ENST00000302930',10);
1562  Description: Creates a slice around the region of the specified transcript.
1563  If a context size is given, the slice is extended by that
1564  number of basepairs on either side of the
1565  transcript.
1566 
1567  The slice will be created in the transcript's native coordinate
1568  system and in the forward orientation.
1569  Returntype : Bio::EnsEMBL::Slice
1570  Exceptions : Thrown if the transcript is not in the database.
1571  Caller : general
1572  Status : Stable
1573 
1574 =cut
1575 
1576 sub fetch_by_transcript_stable_id{
1577  my ($self,$transcriptid,$size) = @_;
1578 
1579  throw('Transcript argument is required.') if(!$transcriptid);
1580 
1581  my $ta = $self->db->get_TranscriptAdaptor;
1582  my $transcript = $ta->fetch_by_stable_id($transcriptid);
1583 
1584  throw("Transcript [$transcriptid] does not exist in DB.") if(!$transcript);
1585 
1586  return $self->fetch_by_Feature($transcript, $size);
1587 }
1588 
1589 =head2 fetch_by_transcript_id
1590 
1591  Arg [1] : int $transcriptid
1592  The unique database identifier of the transcript around which
1593  the slice is desired
1594  Arg [2] : (optional) int $size
1595  The length of the flanking regions the slice should encompass
1596  on either side of the transcript (0 by default)
1597  Example : $slc = $sa->fetch_by_transcript_id(24, 1000);
1598  Description: Creates a slice around the region of the specified transcript.
1599  If a context size is given, the slice is extended by that
1600  number of basepairs on either side of the
1601  transcript.
1602 
1603  The slice will be created in the transcript's native coordinate
1604  system and in the forward orientation.
1605  Returntype : Bio::EnsEMBL::Slice
1606  Exceptions : throw on incorrect args
1607  throw if transcript is not in database
1608  Caller : general
1609  Status : Stable
1610 
1611 =cut
1612 
1613 sub fetch_by_transcript_id {
1614  my ($self,$transcriptid,$size) = @_;
1615 
1616  throw('Transcript id argument is required.') if(!$transcriptid);
1617 
1618  my $transcript_adaptor = $self->db()->get_TranscriptAdaptor();
1619  my $transcript = $transcript_adaptor->fetch_by_dbID($transcriptid);
1620 
1621  throw("Transcript [$transcriptid] does not exist in DB.") if(!$transcript);
1622 
1623  return $self->fetch_by_Feature($transcript, $size);
1624 }
1625 
1626 
1627 
1628 =head2 fetch_by_gene_stable_id
1629 
1630  Arg [1] : string $geneid
1631  The stable id of the gene around which the slice is
1632  desired
1633  Arg [2] : (optional) int $size
1634  The length of the flanking regions the slice should encompass
1635  on either side of the gene (0 by default)
1636  Example : $slc = $sa->fetch_by_gene_stable_id('ENSG00000012123',10);
1637  Description: Creates a slice around the region of the specified gene.
1638  If a context size is given, the slice is extended by that
1639  number of basepairs on either side of the gene.
1640 
1641  The slice will be created in the gene's native coordinate system
1642  and in the forward orientation.
1643  Returntype : Bio::EnsEMBL::Slice
1644  Exceptions : throw on incorrect args
1645  throw if transcript does not exist
1646  Caller : general
1647  Status : Stable
1648 
1649 =cut
1650 
1651 sub fetch_by_gene_stable_id {
1652  my ($self,$geneid,$size) = @_;
1653 
1654  throw('Gene argument is required.') if(!$geneid);
1655 
1656  my $gene_adaptor = $self->db->get_GeneAdaptor();
1657  my $gene = $gene_adaptor->fetch_by_stable_id($geneid);
1658 
1659  throw("Gene [$geneid] does not exist in DB.") if(!$gene);
1660 
1661  return $self->fetch_by_Feature($gene, $size);
1662 }
1663 
1664 
1665 
1666 =head2 fetch_by_Feature
1667 
1668  Arg [1] : Bio::EnsEMBL::Feature $feat
1669  The feature to fetch the slice around
1670  Arg [2] : int size (optional)
1671  The desired number of flanking basepairs around the feature.
1672  The size may also be provided as a percentage of the feature
1673  size such as 200% or 80.5%.
1674  Example : $slice = $slice_adaptor->fetch_by_Feature($feat, 100);
1675  Description: Retrieves a slice around a specific feature. All this really
1676  does is return a resized version of the slice that the feature
1677  is already on. Note that slices returned from this method
1678  are always on the forward strand of the seq_region regardless of
1679  the strandedness of the feature passed in.
1680  Returntype : Bio::EnsEMBL::Slice
1681  Exceptions : throw if the feature does not have an attached slice
1682  throw if feature argument is not provided
1683  Caller : fetch_by_gene_stable_id, fetch_by_transcript_stable_id,
1684  fetch_by_gene_id, fetch_by_transcript_id
1685  Status : Stable
1686 
1687 =cut
1688 
1689 sub fetch_by_Feature{
1690  my ($self, $feature, $size) = @_;
1691 
1692  $size ||= 0;
1693 
1694  if(!ref($feature) || !$feature->isa('Bio::EnsEMBL::Feature')) {
1695  throw('Feature argument expected.');
1696  }
1697 
1698  my $slice = $feature->slice();
1699  if(!$slice || !($slice->isa('Bio::EnsEMBL::Slice') or $slice->isa('Bio::EnsEMBL::LRGSlice') )) {
1700  throw('Feature must be attached to a valid slice.');
1701  }
1702 
1703 
1704  my $fstart = $feature->start();
1705  my $fend = $feature->end();
1706  if(!defined($fstart) || !defined($fend)) {
1707  throw('Feature must have defined start and end.');
1708  }
1709 
1710  #convert the feature slice coordinates to seq_region coordinates
1711  my $slice_start = $slice->start();
1712  my $slice_end = $slice->end();
1713  my $slice_strand = $slice->strand();
1714  if($slice_start != 1 || $slice_strand != 1) {
1715  if($slice_strand == 1) {
1716  $fstart = $fstart + $slice_start - 1;
1717  $fend = $fend + $slice_start - 1;
1718  } else {
1719  my $tmp_start = $fstart;
1720  $fstart = $slice_end - $fend + 1;
1721  $fend = $slice_end - $tmp_start + 1;
1722  }
1723  }
1724 
1725  ## Size may be stored as a %age of the length of the feature
1726  ## Size = 100% gives no context
1727  ## Size = 200% gives context - 50% the size of the feature either side of
1728  ## feature
1729 
1730  $size = int( ($1-100)/200 * ($fend-$fstart+1) ) if( $size =~/([\d+\.]+)%/ );
1731 
1732  #return a new slice covering the region of the feature
1733  my $S = Bio::EnsEMBL::Slice->new_fast({
1734  'seq_region_name' => $slice->seq_region_name,
1735  'seq_region_length' => $slice->seq_region_length,
1736  'coord_system' => $slice->coord_system,
1737  'start' => $fstart - $size,
1738  'end' => $fend + $size,
1739  'strand' => 1,
1740  'adaptor' => $self});
1741  $S->{'_raw_feature_strand'} = $feature->strand * $slice_strand if $feature->can('strand');
1742  return $S;
1743 }
1744 
1745 
1746 
1747 =head2 fetch_by_misc_feature_attribute
1748 
1749  Arg [1] : string $attribute_type
1750  The code of the attribute type
1751  Arg [2] : (optional) string $attribute_value
1752  The value of the attribute to fetch by
1753  Arg [3] : (optional) int $size
1754  The amount of flanking region around the misc feature desired.
1755  Example : $slice = $sa->fetch_by_misc_feature_attribute('superctg',
1756  'NT_030871');
1757  $slice = $sa->fetch_by_misc_feature_attribute('synonym',
1758  'AL00012311',
1759  $flanking);
1760  Description: Fetches a slice around a MiscFeature with a particular
1761  attribute type and value. If no value is specified then
1762  the feature with the particular attribute is used.
1763  If no size is specified then 0 is used.
1764  Returntype : Bio::EnsEMBL::Slice
1765  Exceptions : Throw if no feature with the specified attribute type and value
1766  exists in the database
1767  Warning if multiple features with the specified attribute type
1768  and value exist in the database.
1769  Caller : webcode
1770  Status : Stable
1771 
1772 =cut
1773 
1774 sub fetch_by_misc_feature_attribute {
1775  my ($self, $attrib_type_code, $attrib_value, $size) = @_;
1776 
1777  my $mfa = $self->db()->get_MiscFeatureAdaptor();
1778 
1779  my $feats = $mfa->fetch_all_by_attribute_type_value($attrib_type_code,
1780  $attrib_value);
1781 
1782  if(@$feats == 0) {
1783  throw("MiscFeature with $attrib_type_code=$attrib_value does " .
1784  "not exist in DB.");
1785  }
1786 
1787  if(@$feats > 1) {
1788  warning("MiscFeature with $attrib_type_code=$attrib_value is " .
1789  "ambiguous - using first one found.");
1790  }
1791 
1792  my ($feat) = @$feats;
1793 
1794  return $self->fetch_by_Feature($feat, $size);
1795 }
1796 
1797 =head2 fetch_by_misc_feature_set
1798 
1799  Arg [1] : string $attribute_type
1800  The code of the attribute type
1801  Arg [2] : (optional) string $attribute_value
1802  The value of the attribute to fetch by
1803  Arg [3] : (optional) the name of the set
1804  Arg [4] : (optional) int $size
1805  The amount of flanking region around the misc feature desired.
1806  Example : $slice = $sa->fetch_by_misc_feature_set('clone',
1807  'RP11-411G9'
1808  'tilepath');
1809  Description: Fetches a slice around a MiscFeature with a particular
1810  attribute type, value and set. If no value is specified then
1811  the feature with the particular attribute is used.
1812  A size can be specified to include flanking region
1813  If no size is specified then 0 is used.
1814  Returntype : Bio::EnsEMBL::Slice
1815  Exceptions : Throw if no feature with the specified attribute type, value and set
1816  exists in the database
1817  Warning if multiple features with the specified attribute type, set
1818  and value exist in the database.
1819  Caller : webcode
1820  Status : Stable
1821 
1822 =cut
1823 
1824 sub fetch_by_misc_feature_set {
1825  my ($self, $attrib_type_code, $attrib_value, $misc_set, $size) = @_;
1826 
1827  my $mfa = $self->db()->get_MiscFeatureAdaptor();
1828 
1829  my $feat = $mfa->fetch_by_attribute_set_value($attrib_type_code,
1830  $attrib_value,
1831  $misc_set);
1832 
1833  return $self->fetch_by_Feature($feat, $size);
1834 }
1835 
1836 =head2 fetch_normalized_slice_projection
1837 
1838  Arg [1] : Bio::EnsEMBL::Slice $slice
1839  Arg [2] : boolean $filter_projections
1840  Optionally filter the projections to remove anything
1841  which is the same sequence region as the given slice
1842  Example : ( optional )
1843  Description: gives back a project style result. The returned slices
1844  represent the areas to which there are symlinks for the
1845  given slice. start, end show which area on given slice is
1846  symlinked
1847  Returntype : [[start,end,$slice][]]
1848  Exceptions : none
1849  Caller : BaseFeatureAdaptor
1850  Status : Stable
1851 
1852 =cut
1853 
1854 
1855 sub fetch_normalized_slice_projection {
1856  my $self = shift;
1857  my $slice = shift;
1858  my $filter_projections = shift;
1859 
1860  my $slice_seq_region_id = $self->get_seq_region_id( $slice );
1861 
1862  $self->_build_exception_cache() if(!exists($self->{'asm_exc_cache'}));
1863 
1864  my $result = $self->{'asm_exc_cache'}->{$slice_seq_region_id};
1865 
1866  $result ||= [];
1867 
1868  my (@haps, @pars);
1869 
1870  foreach my $row (@$result) {
1871  my ( $seq_region_id, $seq_region_start, $seq_region_end,
1872  $exc_type, $exc_seq_region_id, $exc_seq_region_start,
1873  $exc_seq_region_end ) = @$row;
1874 
1875  # need overlapping PAR and all HAPs if any
1876  if( $exc_type eq "PAR" ) {
1877  if( $seq_region_start <= $slice->end() &&
1878  $seq_region_end >= $slice->start() ) {
1879  push( @pars, [ $seq_region_start, $seq_region_end, $exc_seq_region_id,
1880  $exc_seq_region_start, $exc_seq_region_end ] );
1881  }
1882  } else {
1883  push( @haps, [ $seq_region_start, $seq_region_end, $exc_seq_region_id,
1884  $exc_seq_region_start, $exc_seq_region_end ] );
1885  }
1886  }
1887 
1888  if(!@pars && !@haps) {
1889  #just return this slice, there were no haps or pars
1890  return [bless ( [1,$slice->length, $slice], "Bio::EnsEMBL::ProjectionSegment")];
1891  }
1892 
1893  my @syms;
1894  if( @haps >= 1 ) {
1895  my @sort_haps = sort { $a->[1] <=> $b->[1] } @haps;
1896 
1897  my $count =0;
1898  my $chr_start = 1;
1899  my $hap_start = 1;
1900  my $last = 0;
1901 
1902  my $seq_region_slice = $self->fetch_by_seq_region_id($slice_seq_region_id);
1903  my $exc_slice = $self->fetch_by_seq_region_id( $sort_haps[0][2] );
1904  my $len1 = $seq_region_slice->length();
1905  my $len2 = $exc_slice->length();
1906  my $max_len = ($len1 > $len2) ? $len1 : $len2;
1907 
1908  while($count <= scalar(@sort_haps) and !$last){
1909  my $chr_end;
1910  my $hap_end;
1911  if(defined($sort_haps[$count]) and defined($sort_haps[$count][0]) ){
1912  $hap_end = $sort_haps[$count][0]-1;
1913  $chr_end = $sort_haps[$count][3]-1
1914  }
1915  else{
1916  $last = 1;
1917  $hap_end = $len1;
1918  $chr_end = $len2;
1919  my $diff = ($hap_end-$hap_start)-($chr_end-$chr_start);
1920  if($diff > 0){
1921  push( @syms, [ $hap_start, $hap_end, $sort_haps[0][2], $chr_start, $chr_end+$diff] );
1922  }
1923  elsif($diff < 0){
1924  push( @syms, [ $hap_start, $hap_end - $diff, $sort_haps[0][2], $chr_start, $chr_end] );
1925  }
1926  else{
1927  push( @syms, [ $hap_start, $hap_end, $sort_haps[0][2], $chr_start, $chr_end] );
1928  }
1929  next;
1930  }
1931  if($hap_end and $hap_start < $len1){ # if hap at start or end of chromosome
1932  push( @syms, [ $hap_start, $hap_end, $sort_haps[0][2], $chr_start, $chr_end] );
1933  }
1934  $chr_start = $chr_end + ($sort_haps[$count][4]-$sort_haps[$count][3]) + 2;
1935  $hap_start = $hap_end + ($sort_haps[$count][1]-$sort_haps[$count][0]) + 2;
1936  $count++;
1937  }
1938 
1939 
1940  }
1941 
1942 
1943  # for now haps and pars should not be both there, but in theory we
1944  # could handle it here by cleverly merging the pars into the existing syms,
1945  # for now just:
1946  push( @syms, @pars );
1947 
1948  my $mapper = Bio::EnsEMBL::Mapper->new( "sym", "org" );
1949  my $count = 0;
1950  for my $sym ( @syms ) {
1951  $mapper->add_map_coordinates( $slice_seq_region_id, $sym->[0], $sym->[1],
1952  1, $sym->[2], $sym->[3], $sym->[4] );
1953  }
1954 
1955  my @linked = $mapper->map_coordinates( $slice_seq_region_id,
1956  $slice->start(), $slice->end(),
1957  $slice->strand(), "sym" );
1958 
1959  # gaps are regions where there is no mapping to another region
1960  my $rel_start = 1;
1961 
1962  #if there was only one coord and it is a gap, we know it is just the
1963  #same slice with no overlapping symlinks
1964  if(@linked == 1 && $linked[0]->isa('Bio::EnsEMBL::Mapper::Gap')) {
1965  return [bless( [1,$slice->length, $slice], "Bio::EnsEMBL::ProjectionSegment" )];
1966  }
1967 
1968  my @out;
1969  for my $coord ( @linked ) {
1970  if( $coord->isa( "Bio::EnsEMBL::Mapper::Gap" )) {
1971  my $exc_slice = Bio::EnsEMBL::Slice->new_fast({
1972  'start' => $coord->start(),
1973  'end' => $coord->end(),
1974  'strand' => $slice->strand(),
1975  'coord_system' => $slice->coord_system(),
1976  'adaptor' => $self,
1977  'seq_region_name' => $slice->seq_region_name(),
1978  'seq_region_length' => $slice->seq_region_length()});
1979  push( @out, bless ( [ $rel_start, $coord->length()+$rel_start-1,
1980  $exc_slice ], "Bio::EnsEMBL::ProjectionSegment") );
1981  } else {
1982  my $exc_slice = $self->fetch_by_seq_region_id( $coord->id() );
1983  my $exc2_slice = Bio::EnsEMBL::Slice->new_fast({
1984 
1985  'start' => $coord->start(),
1986  'end' => $coord->end(),
1987  'strand' => $coord->strand(),
1988  'seq_region_name' => $exc_slice->seq_region_name(),
1989  'seq_region_length' => $exc_slice->seq_region_length(),
1990  'coord_system' => $exc_slice->coord_system(),
1991  'adaptor' => $self
1992  });
1993 
1994  push( @out, bless( [ $rel_start, $coord->length() + $rel_start - 1,
1995  $exc2_slice ], "Bio::EnsEMBL::ProjectionSegment") );
1996  }
1997  $rel_start += $coord->length();
1998  }
1999 
2000  if($filter_projections) {
2001  return $self->_filter_Slice_projections($slice, \@out);
2002  }
2003  return \@out;
2004 }
2005 
2006 =head2 _filter_Slice_projections
2007 
2008  Arg [1] : Bio::EnsEMBL::Slice The slice the projections were made from
2009  Arg [2] : Array The projections which were fetched from the previous slice
2010  Description : Removes any projections which occur within the same sequence
2011  region as the given Slice object
2012  Returntype : ArrayRef Bio::EnsEMBL::ProjectionSegment; Returns an array
2013  of projected segments
2014 =cut
2015 
2016 sub _filter_Slice_projections {
2017  my ($self, $slice, $projections) = @_;
2018  my @proj = @{ $projections };
2019  if ( !@proj ) {
2020  throw('Was not given any projections to filter. Database may have incorrect assembly_exception information loaded');
2021  }
2022 
2023  # Want to get features on the FULL original slice as well as any
2024  # symlinked slices.
2025 
2026  # Filter out partial slices from projection that are on same
2027  # seq_region as original slice.
2028 
2029  my $sr_id = $slice->get_seq_region_id();
2030 
2031  @proj = grep { $_->to_Slice->get_seq_region_id() != $sr_id } @proj;
2032 
2033  my $segment = bless( [ 1, $slice->length(), $slice ],
2034  'Bio::EnsEMBL::ProjectionSegment' );
2035  push( @proj, $segment );
2036  return \@proj;
2037 }
2038 
2039 
2040 =head2 store
2041 
2042  Arg [1] : Bio::EnsEMBL::Slice $slice
2043  Arg [2] : (optional) $seqref reference to a string
2044  The sequence associated with the slice to be stored.
2045  Example : $slice = Bio::EnsEMBL::Slice->new(...);
2046  $seq_region_id = $slice_adaptor->store($slice, \$sequence);
2047  Description: This stores a slice as a sequence region in the database
2048  and returns the seq region id. The passed in slice must
2049  start at 1, and must have a valid seq_region name and coordinate
2050  system. The attached coordinate system must already be stored in
2051  the database. The sequence region is assumed to start at 1 and
2052  to have a length equalling the length of the slice. The end of
2053  the slice must equal the seq_region_length.
2054  If the slice coordinate system is the sequence level coordinate
2055  system then the seqref argument must also be passed. If the
2056  slice coordinate system is NOT a sequence level coordinate
2057  system then the sequence argument cannot be passed.
2058  Returntype : int
2059  Exceptions : throw if slice has no coord system.
2060  throw if slice coord system is not already stored.
2061  throw if slice coord system is seqlevel and no sequence is
2062  provided.
2063  throw if slice coord system is not seqlevel and sequence is
2064  provided.
2065  throw if slice does not start at 1
2066  throw if sequence is provided and the sequence length does not
2067  match the slice length.
2068  throw if the SQL insert fails (e.g. on duplicate seq region)
2069  throw if slice argument is not passed
2070  throw if the slice end is not equal to seq_region_length
2071  Caller : database loading scripts
2072  Status : Stable
2073 
2074 =cut
2075 
2076 
2077 
2078 sub store {
2079  my $self = shift;
2080  my $slice = shift;
2081  my $seqref = shift;
2082  my $not_dna = shift;
2083 
2084  #
2085  # Get all of the sanity checks out of the way before storing anything
2086  #
2087 
2088  if(!ref($slice) || !($slice->isa('Bio::EnsEMBL::Slice') or $slice->isa('Bio::EnsEMBL::LRGSlice'))) {
2089  throw('Slice argument is required');
2090  }
2091 
2092  my $cs = $slice->coord_system();
2093  throw("Slice must have attached CoordSystem.") if(!$cs);
2094 
2095  my $db = $self->db();
2096  if(!$cs->is_stored($db)) {
2097  throw("Slice CoordSystem must already be stored in DB.")
2098  }
2099 
2100  if($slice->start != 1 || $slice->strand != 1) {
2101  throw("Slice must have start==1 and strand==1.");
2102  }
2103 
2104  if($slice->end() != $slice->seq_region_length()) {
2105  throw("Slice must have end==seq_region_length");
2106  }
2107 
2108  my $sr_len = $slice->length();
2109  my $sr_name = $slice->seq_region_name();
2110 
2111  if($sr_name eq '') {
2112  throw("Slice must have valid seq region name.");
2113  }
2114 
2115  if($cs->is_sequence_level() && !$not_dna) {
2116  if(!$seqref) {
2117  throw("Must provide sequence for sequence level coord system.");
2118  }
2119  if(ref($seqref) ne 'SCALAR') {
2120  throw("Sequence must be a scalar reference.");
2121  }
2122  my $seq_len = length($$seqref);
2123 
2124  if($seq_len != $sr_len) {
2125  throw("Sequence length ($seq_len) must match slice length ($sr_len).");
2126  }
2127  } else {
2128  if($seqref) {
2129  throw("Cannot provide sequence for non-sequence level seq regions.");
2130  }
2131  }
2132 
2133  #store the seq_region
2134 
2135  my $sth = $db->dbc->prepare("INSERT INTO seq_region " .
2136  " ( name, length, coord_system_id ) " .
2137  " VALUES ( ?, ?, ? )"
2138  );
2139 
2140  $sth->bind_param(1,$sr_name,SQL_VARCHAR);
2141  $sth->bind_param(2,$sr_len,SQL_INTEGER);
2142  $sth->bind_param(3,$cs->dbID,SQL_INTEGER);
2143 
2144  $sth->execute();
2145 
2146  my $seq_region_id = $self->last_insert_id('seq_region_id', undef, 'seq_region');
2147 
2148  if(!$seq_region_id) {
2149  throw("Database seq_region insertion failed.");
2150  }
2151 
2152  if($cs->is_sequence_level() && !$not_dna) {
2153  #store sequence if it was provided
2154  my $seq_adaptor = $db->get_SequenceAdaptor();
2155  $seq_adaptor->store($seq_region_id, $$seqref);
2156  }
2157 
2158  #synonyms
2159  if(defined($slice->{'synonym'})){
2160  foreach my $syn (@{$slice->{'synonym'}} ){
2161  $syn->seq_region_id($seq_region_id); # set the seq_region_id
2162  $syn->adaptor->store($syn);
2163  }
2164  }
2165 
2166 
2167  $slice->adaptor($self);
2168 
2169  return $seq_region_id;
2170 }
2171 
2172 
2173 sub update {
2174  my $self = shift;
2175  my $slice = shift;
2176 
2177  #
2178  # Get all of the sanity checks out of the way before storing anything
2179  #
2180 
2181  if(!ref($slice) || !($slice->isa('Bio::EnsEMBL::Slice') or $slice->isa('Bio::EnsEMBL::LRGSlice'))) {
2182  throw('Slice argument is required');
2183  }
2184 
2185  my $cs = $slice->coord_system();
2186  throw("Slice must have attached CoordSystem.") if(!$cs);
2187 
2188  my $db = $self->db();
2189 
2190  if($slice->start != 1 || $slice->strand != 1) {
2191  throw("Slice must have start==1 and strand==1.");
2192  }
2193 
2194  if($slice->end() != $slice->seq_region_length()) {
2195  throw("Slice must have end==seq_region_length");
2196  }
2197 
2198  my $sr_len = $slice->length();
2199  my $sr_name = $slice->seq_region_name();
2200 
2201  if(!$sr_name) {
2202  throw("Slice must have valid seq region name.");
2203  }
2204 
2205  #update the seq_region
2206 
2207  my $seq_region_id = $slice->get_seq_region_id();
2208  my $update_sql = qq(
2209  UPDATE seq_region
2210  SET name = ?,
2211  length = ?,
2212  coord_system_id = ?
2213  WHERE seq_region_id = ?
2214  );
2215 
2216  my $sth = $db->dbc->prepare($update_sql);
2217 
2218  $sth->bind_param(1,$sr_name,SQL_VARCHAR);
2219  $sth->bind_param(2,$sr_len,SQL_INTEGER);
2220  $sth->bind_param(3,$cs->dbID,SQL_INTEGER);
2221 
2222  $sth->bind_param(4, $seq_region_id, SQL_INTEGER);
2223 
2224  $sth->execute();
2225 
2226  #synonyms
2227  if(defined($slice->{'synonym'})){
2228  foreach my $syn (@{$slice->{'synonym'}} ){
2229  $syn->seq_region_id($seq_region_id); # set the seq_region_id
2230  my $syn_adaptor = $db->get_SeqRegionSynonymAdaptor();
2231  $syn_adaptor->store($syn);
2232  }
2233  }
2234 }
2235 
2236 =head2 remove
2237 
2238  Arg [1] : Bio::EnsEMBL::Slice $slice
2239  The slice to remove from the database
2240  Example : $slice_adaptor->remove($slice);
2241  Description: Removes a slice completely from the database.
2242  All associated seq_region_attrib are removed as well.
2243  If dna is attached to the slice, it is also removed.
2244  Returntype : none
2245  Exceptions : throw if slice has no coord system.
2246  throw if slice argument is not passed
2247  warning if slice is not stored in this database
2248  Caller : general
2249  Status : Stable
2250 
2251 =cut
2252 
2253 
2254 sub remove {
2255  my $self = shift;
2256  my $slice = shift;
2257 
2258  #
2259  # Get all of the sanity checks out of the way before storing anything
2260  #
2261 
2262  if(!ref($slice) || !($slice->isa('Bio::EnsEMBL::Slice') or $slice->isa('Bio::EnsEMBL::LRGSlice'))) {
2263  throw('Slice argument is required');
2264  }
2265 
2266  my $cs = $slice->coord_system();
2267  throw("Slice must have attached CoordSystem.") if(!$cs);
2268 
2269  my $db = $self->db();
2270  my $seq_region_id = $slice->get_seq_region_id();
2271 
2272  if ($cs->is_sequence_level()) {
2273  my $seq_adaptor = $db->get_SequenceAdaptor();
2274  $seq_adaptor->remove($seq_region_id);
2275  }
2276 
2277  if(defined($slice->{'synonym'})){
2278  foreach my $syn (@{$slice->{'synonym'}} ){
2279  $syn->seq_region_id($seq_region_id); # set the seq_region_id
2280  $syn->adaptor->remove($syn);
2281  }
2282  }
2283 
2284  my $attrib_adaptor = $self->db->get_AttributeAdaptor();
2285  $attrib_adaptor->remove_from_Slice($slice);
2286  $self->remove_assembly($slice);
2287 
2288  my $sr_name = $slice->seq_region_name();
2289 
2290  #remove the seq_region
2291 
2292  my $sth = $db->dbc->prepare("DELETE FROM seq_region " .
2293  "WHERE name = ? AND " .
2294  " coord_system_id = ?" );
2295 
2296  $sth->bind_param(1,$sr_name,SQL_VARCHAR);
2297  $sth->bind_param(2,$cs->dbID,SQL_INTEGER);
2298 
2299  $sth->execute();
2300 
2301  return;
2302 }
2303 
2304 
2305 =head2 remove_assembly
2306 
2307  Arg [1] : Bio::EnsEMBL::Slice $asm_slice or $cmp_slice
2308  Example : $slice_adaptor->remove_assembly( $slice );
2309  Description: Deletes from the assembly table
2310  where asm or cmp corresponds to slice
2311  Do not call this method unless you really know what you are doing
2312  Returntype : none
2313  Exceptions : throw if slice has no coord system (cs).
2314  throw if no slice provided, or argument is not a slice
2315  Caller : Internal
2316  Status : Experimental
2317 
2318 =cut
2319 
2320 
2321 sub remove_assembly {
2322  my $self = shift;
2323  my $slice = shift;
2324 
2325  if(!ref($slice) || !($slice->isa('Bio::EnsEMBL::Slice') or $slice->isa('Bio::EnsEMBL::LRGSlice'))) {
2326  throw('Slice argument is required');
2327  }
2328 
2329  my $cs = $slice->coord_system();
2330  throw("Slice must have attached CoordSystem.") if(!$cs);
2331 
2332  #
2333  # Checks complete. Delete the data
2334  #
2335  my $sth = $self->db->dbc->prepare
2336  ("DELETE FROM assembly " .
2337  "WHERE asm_seq_region_id = ? OR " .
2338  " cmp_seq_region_id = ? ");
2339 
2340  my $asm_seq_region_id = $self->get_seq_region_id( $slice );
2341  my $cmp_seq_region_id = $self->get_seq_region_id( $slice );
2342 
2343  $sth->bind_param(1,$asm_seq_region_id,SQL_INTEGER);
2344  $sth->bind_param(2,$cmp_seq_region_id,SQL_INTEGER);
2345 
2346  $sth->execute();
2347  $sth->finish();
2348 
2349  return;
2350 
2351 }
2352 
2353 =head2 fetch_assembly
2354 
2355  Arg [1] : Bio::EnsEMBL::Slice $asm_slice
2356  Arg [2] : Bio::EnsEMBL::Slice $cmp_slice
2357  Example : $asm = $slice_adaptor->fetch_assembly( $slice1, $slice2 );
2358  Description: Fetches from the assembly table based on the
2359  coordinates of the two slices supplied.
2360  Returns a mapper object mapping the two slices
2361  Do not call this method unless you really know what you are doing
2362  Returntype : Bio::EnsEMBL::Mapper
2363  Exceptions : throw if either slice has no coord system (cs).
2364  throw if there is no mapping path between coord systems
2365  throw if there are existing mappings between either slice
2366  and the oposite cs
2367  Caller : Internal
2368  Status : Experimental
2369 
2370 =cut
2371 
2372 
2373 sub fetch_assembly {
2374  my $self = shift;
2375  my $asm_slice = shift;
2376  my $cmp_slice = shift;
2377 
2378  if(!ref($asm_slice) || !($asm_slice->isa('Bio::EnsEMBL::Slice') or $asm_slice->isa('Bio::EnsEMBL::LRGSlice'))) {
2379  throw('Assembled Slice argument is required');
2380  }
2381  if(!ref($cmp_slice) || !($cmp_slice->isa('Bio::EnsEMBL::Slice') or $cmp_slice->isa('Bio::EnsEMBL::LRGSlice')) ) {
2382  throw('Assembled Slice argument is required');
2383  }
2384 
2385  my $asm_cs = $asm_slice->coord_system();
2386  throw("Slice must have attached CoordSystem.") if(!$asm_cs);
2387  my $cmp_cs = $cmp_slice->coord_system();
2388  throw("Slice must have attached CoordSystem.") if(!$cmp_cs);
2389 
2390  my @path =
2391  @{ $asm_cs->adaptor()->get_mapping_path( $asm_cs, $cmp_cs ) };
2392 
2393  if ( !@path ) {
2394  throw("No mapping path defined between "
2395  . $asm_cs->name() . " and "
2396  . $cmp_cs->name() );
2397  }
2398 
2399  #
2400  # Checks complete. Fetch the data
2401  #
2402  my $sth = $self->db->dbc->prepare
2403  ("SELECT * FROM assembly " .
2404  "WHERE asm_seq_region_id = ? AND " .
2405  " cmp_seq_region_id = ? AND " .
2406  " asm_start = ? AND " .
2407  " asm_end = ? AND " .
2408  " cmp_start = ? AND " .
2409  " cmp_end = ? AND " .
2410  " ori = ?" );
2411 
2412  my $asm_seq_region_id = $self->get_seq_region_id( $asm_slice );
2413  my $cmp_seq_region_id = $self->get_seq_region_id( $cmp_slice );
2414  my $ori = $asm_slice->strand * $cmp_slice->strand;
2415 
2416  $sth->bind_param(1,$asm_seq_region_id,SQL_INTEGER);
2417  $sth->bind_param(2,$cmp_seq_region_id,SQL_INTEGER);
2418  $sth->bind_param(3,$asm_slice->start,SQL_INTEGER);
2419  $sth->bind_param(4,$asm_slice->end,SQL_INTEGER);
2420  $sth->bind_param(5,$cmp_slice->start,SQL_INTEGER);
2421  $sth->bind_param(6,$cmp_slice->end,SQL_INTEGER);
2422  $sth->bind_param(7,$ori,SQL_INTEGER);
2423 
2424  $sth->execute();
2425 
2426  my $results = $sth->fetchrow_array();
2427  $sth->finish();
2428 
2429  my $mapper;
2430  if ($results) {
2431  $mapper = Bio::EnsEMBL::Mapper->new($asm_slice, $cmp_slice, $asm_cs, $cmp_cs);
2432  }
2433 
2434  return $mapper;
2435 
2436 }
2437 
2438 =head2 store_assembly
2439 
2440  Arg [1] : Bio::EnsEMBL::Slice $asm_slice
2441  Arg [2] : Bio::EnsEMBL::Slice $cmp_slice
2442  Example : $asm = $slice_adaptor->store_assembly( $slice1, $slice2 );
2443  Description: Creates an entry in the assembly table based on the
2444  coordinates of the two slices supplied. Returns a string
2445  representation of the assembly that gets created.
2446  Returntype : string
2447  Exceptions : throw if either slice has no coord system (cs).
2448  throw unless the cs rank of the asm_slice is lower than the
2449  cmp_slice.
2450  throw if there is no mapping path between coord systems
2451  throw if the lengths of each slice are not equal
2452  throw if there are existing mappings between either slice
2453  and the oposite cs
2454  Caller : database loading scripts
2455  Status : Experimental
2456 
2457 =cut
2458 
2459 sub store_assembly{
2460  my $self = shift;
2461  my $asm_slice = shift;
2462  my $cmp_slice = shift;
2463 
2464  #
2465  # Get all of the sanity checks out of the way before storing anything
2466  #
2467 
2468  if(!ref($asm_slice) || !($asm_slice->isa('Bio::EnsEMBL::Slice') or $asm_slice->isa('Bio::EnsEMBL::LRGSlice'))) {
2469  throw('Assembled Slice argument is required');
2470  }
2471  if(!ref($cmp_slice) || !($cmp_slice->isa('Bio::EnsEMBL::Slice') or $cmp_slice->isa('Bio::EnsEMBL::LRGSlice')) ) {
2472  throw('Assembled Slice argument is required');
2473  }
2474 
2475  my $asm_cs = $asm_slice->coord_system();
2476  throw("Slice must have attached CoordSystem.") if(!$asm_cs);
2477  my $cmp_cs = $cmp_slice->coord_system();
2478  throw("Slice must have attached CoordSystem.") if(!$cmp_cs);
2479 
2480  my @path =
2481  @{ $asm_cs->adaptor()->get_mapping_path( $asm_cs, $cmp_cs ) };
2482 
2483  if ( !@path ) {
2484  throw("No mapping path defined between "
2485  . $asm_cs->name() . " and "
2486  . $cmp_cs->name() );
2487  }
2488 
2489  if( $asm_slice->length != $cmp_slice->length ){
2490  throw("The lengths of the assembled and component slices are not equal" );
2491  }
2492 
2493  # For now we disallow any existing mappings between the asm slice and cmp
2494  # CoordSystem and vice-versa.
2495  # Some cases of multiple mappings may be allowable by the API, but their
2496  # logic needs to be coded below.
2497 
2498  my $asm_proj = $asm_slice->project( $cmp_cs->name, $cmp_cs->version );
2499  if( @$asm_proj && $cmp_cs->name ne 'lrg' && $asm_cs->name ne 'lrg'){
2500  throw("Regions of the assembled slice are already assembled ".
2501  "into the component CoordSystem" );
2502  }
2503  my $cmp_proj = $cmp_slice->project( $asm_cs->name, $asm_cs->version );
2504  if( @$cmp_proj && $cmp_cs->name ne 'lrg' && $asm_cs->name ne 'lrg'){
2505  throw("Regions of the component slice are already assembled ".
2506  "into the assembled CoordSystem" );
2507  }
2508 
2509  #
2510  # Checks complete. Store the data
2511  #
2512  my $sth = $self->db->dbc->prepare
2513  ("INSERT INTO assembly " .
2514  " ( asm_seq_region_id, " .
2515  " cmp_seq_region_id, " .
2516  " asm_start, " .
2517  " asm_end , " .
2518  " cmp_start, " .
2519  " cmp_end , " .
2520  " ori )" .
2521  "VALUES ( ?, ?, ?, ?, ?, ?, ? )");
2522 
2523  my $asm_seq_region_id = $self->get_seq_region_id( $asm_slice );
2524  my $cmp_seq_region_id = $self->get_seq_region_id( $cmp_slice );
2525  my $ori = $asm_slice->strand * $cmp_slice->strand;
2526 
2527  $sth->bind_param(1,$asm_seq_region_id,SQL_INTEGER);
2528  $sth->bind_param(2,$cmp_seq_region_id,SQL_INTEGER);
2529  $sth->bind_param(3,$asm_slice->start,SQL_INTEGER);
2530  $sth->bind_param(4,$asm_slice->end,SQL_INTEGER);
2531  $sth->bind_param(5,$cmp_slice->start,SQL_INTEGER);
2532  $sth->bind_param(6,$cmp_slice->end,SQL_INTEGER);
2533  $sth->bind_param(7,$ori,SQL_INTEGER);
2534 
2535  $sth->execute();
2536 
2537  #use Data::Dumper qw( Dumper );
2538  #warn Dumper( $self->db->{seq_region_cache} );
2539  #$self->db->{seq_region_cache} = undef;
2540  #$self->_cache_seq_regions();
2541 
2542  my $ama = $self->db->get_AssemblyMapperAdaptor();
2543  $ama->delete_cache();
2544 
2545 
2546  return $asm_slice->name . "<>" . $cmp_slice->name;
2547 
2548 }
2549 
2550 
2551 =head2 prepare
2552 
2553  Arg [1] : String $sql
2554  Example : ( optional )
2555  Description: overrides the default adaptor prepare method.
2556  All slice sql will usually use the dna_db.
2557  Returntype : DBD::sth
2558  Exceptions : none
2559  Caller : internal, convenience method
2560  Status : Stable
2561 
2562 =cut
2563 
2564 sub prepare {
2565  my ( $self, $sql ) = @_;
2566  return $self->db()->dnadb()->dbc->prepare($sql);
2567 }
2568 
2569 sub _build_exception_cache {
2570  my $self = shift;
2571 
2572  # build up a cache of the entire assembly exception table
2573  # it should be small anyway
2574  my $sth =
2575  $self->prepare( 'SELECT ae.seq_region_id, ae.seq_region_start, '
2576  . 'ae.seq_region_end, ae.exc_type, ae.exc_seq_region_id, '
2577  . 'ae.exc_seq_region_start, ae.exc_seq_region_end '
2578  . 'FROM assembly_exception ae, '
2579  . 'seq_region sr, coord_system cs '
2580  . 'WHERE sr.seq_region_id = ae.seq_region_id '
2581  . 'AND sr.coord_system_id = cs.coord_system_id '
2582  . 'AND cs.species_id = ?' );
2583 
2584  $sth->bind_param( 1, $self->species_id(), SQL_INTEGER );
2585  $sth->execute();
2586 
2587  my %hash;
2588  $self->{'asm_exc_cache'} = \%hash;
2589 
2590  my $row;
2591  while ( $row = $sth->fetchrow_arrayref() ) {
2592  my @result = @$row;
2593  $hash{ $result[0] } ||= [];
2594  push( @{ $hash{ $result[0] } }, \@result );
2595  }
2596  $sth->finish();
2597 } ## end sub _build_exception_cache
2598 
2599 =head2 cache_toplevel_seq_mappings
2600 
2601  Args : none
2602  Example : $slice_adaptor->cache_toplevel_seq_mappings();
2603  Description: caches all the assembly mappings needed for genes
2604  Returntype : None
2605  Exceptions : None
2606  Caller : general
2607  Status : At Risk
2608  : New experimental code
2609 
2610 =cut
2611 
2612 sub cache_toplevel_seq_mappings {
2613  my ($self) = @_;
2614 
2615  # Get the sequence level to map too
2616 
2617  my $sql = (<<SSQL);
2618  SELECT name
2619  FROM coord_system
2620  WHERE attrib like "%sequence_level%"
2621  AND species_id = ?
2622 SSQL
2623 
2624  my $sth = $self->prepare($sql);
2625  $sth->bind_param( 1, $self->species_id(), SQL_INTEGER );
2626  $sth->execute();
2627 
2628  my $sequence_level = $sth->fetchrow_array();
2629 
2630  $sth->finish();
2631 
2632  my $csa = $self->db->get_CoordSystemAdaptor();
2633  my $ama = $self->db->get_AssemblyMapperAdaptor();
2634 
2635  my $cs1 = $csa->fetch_by_name($sequence_level);
2636 
2637  #get level to map too.
2638 
2639  $sql = (<<LSQL);
2640  SELECT DISTINCT(cs.name)
2641  FROM seq_region sr,
2642  seq_region_attrib sra,
2643  attrib_type at,
2644  coord_system cs
2645  WHERE sra.seq_region_id = sr.seq_region_id
2646  AND sra.attrib_type_id = at.attrib_type_id
2647  AND at.code = "toplevel"
2648  AND cs.coord_system_id = sr.coord_system_id
2649  AND cs.species_id = ?
2650 LSQL
2651 
2652  $sth = $self->prepare($sql);
2653  $sth->bind_param( 1, $self->species_id(), SQL_INTEGER );
2654  $sth->execute();
2655 
2656  while ( my $csn = $sth->fetchrow_array() ) {
2657  if ( $csn eq $sequence_level ) { next }
2658  my $cs2 = $csa->fetch_by_name($csn);
2659  my $am = $ama->fetch_by_CoordSystems( $cs1, $cs2 );
2660  $am->register_all();
2661  }
2662 
2663 } ## end sub cache_toplevel_seq_mappings
2664 
2665 
2666 sub _build_circular_slice_cache {
2667  my $self = shift;
2668 
2669  # build up a cache of circular sequence region ids
2670  my $sth =
2671  $self->prepare( "SELECT sra.seq_region_id FROM seq_region_attrib sra "
2672  . "INNER JOIN attrib_type at ON sra.attrib_type_id = at.attrib_type_id "
2673  . "INNER JOIN seq_region sr ON sra.seq_region_id = sr.seq_region_id "
2674  . "INNER JOIN coord_system cs ON sr.coord_system_id = cs.coord_system_id "
2675  . "WHERE code = 'circular_seq' and cs.species_id = ?");
2676 
2677  $sth->bind_param( 1, $self->species_id(), SQL_INTEGER );
2678  $sth->execute();
2679 
2680  my $id;
2681  my %hash;
2682  if ( ($id) = $sth->fetchrow_array() ) {
2683  $self->{'circular_sr_id_cache'} = \%hash;
2684  $self->{'is_circular'} = 1;
2685  $hash{ $id } = $id;
2686  while ( ($id) = $sth->fetchrow_array() ) {
2687  $hash{ $id } = $id;
2688  }
2689  } else {
2690  $self->{'is_circular'} = 0;
2691  }
2692  $sth->finish();
2693 } ## end _build_circular_slice_cache
2694 
2695 =head2 _create_chromosome_alias
2696 
2697  Args : none
2698  Example : $self->create_chromosome_alias();
2699  Description: create chromosome alias in coordinate system with karyotype attributes
2700  Returntype : Bio::EnsEMBL::CoordSystem, or none
2701  Exceptions : None
2702  Caller : general
2703  Status : in testing
2704 
2705 =cut
2706 
2707 sub _create_chromosome_alias {
2708  my $self = shift;
2709  my $csa = $self->db->get_CoordSystemAdaptor();
2710 
2711  # to store reformatted db query results
2712  my $karyotype_seq_regions;
2713 
2714  my $karyotype_rank_string = "karyotype_rank";
2715 
2716  # to store coord_system_id to alias
2717  my $cs_id;
2718  my %cs_id_rank;
2719 
2720  # check whether seq region cache exists, otherwise run database query
2721  if ( $self->{'karyotype_cache'} ) {
2722  $karyotype_seq_regions = $self->{'karyotype_cache'};
2723 
2724  # get candidate coordSystem(s) to alias
2725  foreach my $seq_region_name ( keys %{$karyotype_seq_regions} ) {
2726  foreach my $coord_system_id ( keys %{ $karyotype_seq_regions->{ $seq_region_name } } ) {
2727  my $rank = $karyotype_seq_regions->{ $seq_region_name }->{ $coord_system_id }->{ rank };
2728  $cs_id_rank{ $coord_system_id } = $rank;
2729  }
2730  }
2731  } else {
2732  # cannot find a coordssystem called chromosome
2733  # look through attribs to find seq_regions with karyotype
2734 
2735  my $sth =
2736  $self->prepare( "SELECT sr.seq_region_id, sr.name, sr.coord_system_id, sr.length, a.code, cs.rank "
2737  . "FROM seq_region sr, seq_region_attrib sra, attrib_type a, coord_system cs "
2738  . "WHERE sr.seq_region_id = sra.seq_region_id "
2739  . "AND a.attrib_type_id = sra.attrib_type_id "
2740  . "AND cs.coord_system_id = sr.coord_system_id "
2741  . "AND a.code = ?" );
2742 
2743  $sth->bind_param( 1, $karyotype_rank_string );
2744  $sth->execute();
2745 
2746  # fetch SQL results and identify coord_system_id(s) that
2747  # has(have) karyotype attribs
2748  while ( my $hashref = $sth->fetchrow_hashref() ) {
2749  my $seq_region_id = $hashref->{ seq_region_id };
2750  my $seq_region_name = $hashref->{ name };
2751  my $coord_system_id = $hashref->{ coord_system_id };
2752  my $length = $hashref->{ length };
2753  my $code = $hashref->{ code };
2754  my $rank = $hashref->{ rank };
2755 
2756  # account for possibility of multiple coord_system_ids returned from db query
2757  $karyotype_seq_regions->{ $seq_region_name }->{ $coord_system_id } = {
2758  code => $code,
2759  length => $length,
2760  seq_region_id => $seq_region_id,
2761  rank => $rank
2762  };
2763 
2764  # hash to help decide which cs id to use, if multiple
2765  $cs_id_rank{ $coord_system_id } = $rank;
2766  }
2767  $sth->finish();
2768  }
2769 
2770  my $cs_id_count = keys %cs_id_rank;
2771  # if number of coordSystem ids retrieved is one, simply use this coordSystem id
2772  # otherwise, choose the coordSystem with the best-rank (i.e. lowest number)
2773  if ( $cs_id_count == 1 ) {
2774  $cs_id = (keys %cs_id_rank)[0]; # get only key name in hash
2775  } else {
2776  foreach my $id (sort { $cs_id_rank{$a} <=> $cs_id_rank{$b} } keys %cs_id_rank) {
2777  $cs_id = $id;
2778  last; # exit loop after getting lowest-ranked coordSystem id
2779  }
2780  }
2781 
2782  if ( !$cs_id ) {
2783  throw("No coordinate system to create a chromosome slice from (because there is no suitable coordinate system to create an alias to).\n");
2784  } else {
2785 
2786  # use appropriate 'coord_system_id' to set 'alias_to' variable
2787  # first check that a chromosome object does not already exist
2788  my $cs = $csa->fetch_by_dbID( $cs_id );
2789  if ( !$cs ) {
2790  throw("Unable to retrieve CoordSystem object using dbID: $cs_id");
2791  }
2792  if ( $cs->name eq "chromosome" ) {
2793  throw("A chromosome CoordSystem object already exists. Cannot create chromosome alias.");
2794  }
2795  $cs->alias_to('chromosome');
2796 
2797  # create karyotype cache in retrieved coordsystem
2798  $cs->{'karyotype_cache'} = $karyotype_seq_regions;
2799 
2800  return $cs;
2801  }
2802 } ## end create_chromosome_alias
2803 
2804 
2805 =head2 _fetch_by_seq_region_synonym
2806  Args : $cs, $seq_region_name, $start, $end, $strand, $version, $no_fuzz
2807  Example : $self->fetch_by_seq_region_synonym( $cs, $seq_region_name, $start, $end, $strand, $version, $no_fuzz );
2808  Description: fetches all the seq region synonyms (or uses wildcard) when requested
2809  Returntype : Bio::EnsEMBL::SliceAdaptor, or none
2810  Exceptions : None
2811  Caller : general
2812  Status : (refactored from fetch_by_region)
2813 =cut
2814 
2815 sub _fetch_by_seq_region_synonym {
2816 
2817  my ( $self, $cs, $seq_region_name, $start, $end, $strand, $version, $no_fuzz ) = @_;
2818 
2819  # check input
2820  assert_integer($start, 'start') if defined $start;
2821  assert_integer($end, 'end') if defined $end;
2822 
2823  if ( !defined($start) ) { $start = 1 }
2824  if ( !defined($strand) ) { $strand = 1 }
2825 
2826  if ( !defined($seq_region_name) ) {
2827  throw('seq_region_name argument is required');
2828  }
2829 
2830  my $coord_system_name;
2831  # try synonyms
2832  my $syn_sql = "select s.name, cs.name, cs.version from seq_region s join seq_region_synonym ss using (seq_region_id) join coord_system cs using (coord_system_id) where ss.synonym like ? and cs.species_id =? ";
2833  if (defined $cs) {
2834  $coord_system_name = $cs->name;
2835  $syn_sql .= "AND cs.name = '" . $coord_system_name . "' ";
2836  }
2837  if (defined $version) {
2838  $syn_sql .= "AND cs.version = '" . $version . "' ";
2839  }
2840  $syn_sql .= "ORDER BY cs.rank ASC";
2841  my $syn_sql_sth = $self->prepare($syn_sql);
2842  $syn_sql_sth->bind_param(1, $seq_region_name, SQL_VARCHAR);
2843  $syn_sql_sth->bind_param(2, $self->species_id(), SQL_INTEGER);
2844  $syn_sql_sth->execute();
2845  my ($new_name, $new_coord_system, $new_version);
2846  $syn_sql_sth->bind_columns( \$new_name, \$new_coord_system, \$new_version);
2847 
2848  # we want to see if there was an exact match for the seq_region_synonym.
2849  # The overall logic is:
2850  # Case 1 - A synonym exactly matches the query, and either:
2851  # a - the coord_system matches
2852  # or
2853  # b - the user is looking for toplevel
2854  # In this case, we now have the real seq_region name, so we can re-run
2855  # fetch_by_region using that name, coord_system, and version
2856  # Case 2 - A synonym exactly matches, but none of the coord_systems match
2857  # and the user is not looking for toplevel.
2858  # In this case, we return nothing, and move on to try fuzzy-matching
2859  # if requested.
2860  # Case 3 - No synonyms match, so repeat the query using wildcards
2861  my $matched_synonym = 0;
2862  while ($syn_sql_sth->fetch){
2863  $matched_synonym = 1;
2864  if ((not defined($cs)) || ($cs->name eq $new_coord_system && $cs->version eq $new_version)) {
2865  return $self->fetch_by_region($new_coord_system, $new_name, $start, $end, $strand, $new_version, $no_fuzz);
2866  }
2867  }
2868 
2869  if ($matched_synonym == 0) {
2870  # Try wildcard searching if no exact synonym was found
2871  $syn_sql_sth = $self->prepare($syn_sql);
2872  my $escaped_seq_region_name = $seq_region_name;
2873  my $escape_char = $self->dbc->db_handle->get_info(14);
2874  $escaped_seq_region_name =~ s/([_%])/$escape_char$1/g;
2875  $syn_sql_sth->bind_param(1, "$escaped_seq_region_name%", SQL_VARCHAR);
2876  $syn_sql_sth->bind_param(2, $self->species_id(), SQL_INTEGER);
2877  $syn_sql_sth->execute();
2878  $syn_sql_sth->bind_columns( \$new_name, \$new_coord_system, \$new_version);
2879 
2880  my @matched_syn_wrong_cs;
2881  while ($syn_sql_sth->fetch){
2882  if ((not defined($cs)) || ($cs->name eq $new_coord_system && $cs->version eq $new_version)) {
2883  return $self->fetch_by_region($new_coord_system, $new_name, $start, $end, $strand, $new_version, $no_fuzz);
2884  } else {
2885  push(@matched_syn_wrong_cs, [$new_coord_system, $new_version]);
2886  }
2887  }
2888 
2889  if (scalar(@matched_syn_wrong_cs) > 0) {
2890  my $cs_warning_text = "Searched for a known feature on coordniate system: " .
2891  $cs->dbID .
2892  " but found it on " .
2893  join(", ", map {if ($_->[1]) {
2894  $_->[0] . ":" . $_->[1]
2895  } else {
2896  $_->[0]
2897  }
2898  } @matched_syn_wrong_cs) .
2899  "\n No result returned, consider searching without coordinate system or use toplevel.";
2900  warning($cs_warning_text);
2901  return;
2902  }
2903  }
2904  $syn_sql_sth->finish;
2905  return;
2906 }
2907 
2908 
2909 =head2 _fetch_by_fuzzy_matching
2910  Args : $cs, $seq_region_name, $sql, $constraint, $bind_params
2911  Example : my $fuzzy_matched_name = $self->fetch_by_fuzzy_matching( $cs, $seq_region_name, $sql, $constraint, $bind_params );
2912  Description: fetches all the fuzzy matches for a given seq_region_name when requested
2913  Returntype : string, Bio::EnsEMBL::CoordSystem
2914  Exceptions : None
2915  Caller : general
2916  Status : (refactored from fetch_by_region)
2917 =cut
2918 
2919 sub _fetch_by_fuzzy_matching {
2920 
2921  my ($self, $cs, $seq_region_name, $sql, $constraint, $bind_params) = @_;
2922 
2923  my $csa = $self->db->get_CoordSystemAdaptor();
2924  my $length;
2925 
2926  # Do fuzzy matching, assuming that we are just missing a version
2927  # on the end of the seq_region name.
2928 
2929  my $sth =
2930  $self->prepare( $sql . " WHERE sr.name LIKE ? " . $constraint );
2931 
2932  @$bind_params[0] = [ sprintf( '%s.%%', $seq_region_name ), SQL_VARCHAR ];
2933 
2934  my $pos = 0;
2935  foreach my $param (@$bind_params) {
2936  $sth->bind_param( ++$pos, $param->[0], $param->[1] );
2937  }
2938 
2939  $sth->execute();
2940 
2941  my $prefix_len = length($seq_region_name) + 1;
2942  my $high_ver = undef;
2943  my $high_cs = $cs;
2944 
2945  # Find the fuzzy-matched seq_region with the highest postfix
2946  # (which ought to be a version).
2947 
2948  my ( $tmp_name, $id, $tmp_length, $cs_id );
2949  $sth->bind_columns( \( $tmp_name, $id, $tmp_length, $cs_id ) );
2950 
2951  my $i = 0;
2952 
2953  while ( $sth->fetch ) {
2954  my $tmp_cs =
2955  ( defined($cs) ? $cs : $csa->fetch_by_dbID($cs_id) );
2956 
2957  # cache values for future reference
2958  my $arr = [ $id, $tmp_name, $cs_id, $tmp_length ];
2959  $self->{'sr_name_cache'}->{"$tmp_name:$cs_id"} = $arr;
2960  $self->{'sr_id_cache'}->{"$id"} = $arr;
2961 
2962  my $tmp_ver = substr( $tmp_name, $prefix_len );
2963 
2964  # skip versions which are non-numeric and apparently not
2965  # versions
2966  if ( $tmp_ver !~ /^\d+$/ ) { next }
2967 
2968  # take version with highest num, if two versions match take one
2969  # with highest ranked coord system (lowest num)
2970  if ( !defined($high_ver)
2971  || $tmp_ver > $high_ver
2972  || ( $tmp_ver == $high_ver && $tmp_cs->rank < $high_cs->rank )
2973  )
2974  {
2975  $seq_region_name = $tmp_name;
2976  $length = $tmp_length;
2977  $high_ver = $tmp_ver;
2978  $high_cs = $tmp_cs;
2979  }
2980 
2981  $i++;
2982  } ## end while ( $sth->fetch )
2983  $sth->finish();
2984 
2985  # warn if fuzzy matching found more than one result
2986  if ( $i > 1 ) {
2987  warning(
2988  sprintf(
2989  "Fuzzy matching of seq_region_name "
2990  . "returned more than one result.\n"
2991  . "You might want to check whether the returned seq_region\n"
2992  . "(%s:%s) is the one you intended to fetch.\n",
2993  $high_cs->name(), $seq_region_name ) );
2994  }
2995 
2996  $cs = $high_cs;
2997 
2998  # return if we did not find any appropriate match:
2999  if ( !defined($high_ver) ) { return; }
3000 
3001  return ($seq_region_name, $cs);
3002 }
3003 
3004 1;
transcript
public transcript()
Bio::EnsEMBL::CircularSlice
Definition: CircularSlice.pm:48
Bio::EnsEMBL::Registry::get_adaptor
public Adaptor get_adaptor()
EnsEMBL
Definition: Filter.pm:1
Bio::EnsEMBL::ProjectionSegment
Definition: ProjectionSegment.pm:27
Bio::EnsEMBL::LRGSlice::new
public new()
map
public map()
Bio::EnsEMBL::Utils::Slice
Definition: Slice.pm:21
Bio::EnsEMBL::CircularSlice::new
public Bio::EnsEMBL::CircularSlice new()
Bio::EnsEMBL::CoordSystem::name
public String name()
Bio::EnsEMBL::Feature
Definition: Feature.pm:47
Bio::EnsEMBL::DBSQL::DBAdaptor::dbc
public Bio::EnsEMBL::DBSQL::DBConnection dbc()
Bio::EnsEMBL::Slice::get_genome_component
public Scalar get_genome_component()
Bio::EnsEMBL::Slice::slice
public slice()
cleanup
public cleanup()
Bio::EnsEMBL::CoordSystem
Definition: CoordSystem.pm:40
Bio::EnsEMBL::DBSQL::SliceAdaptor
Definition: SliceAdaptor.pm:78
Bio::EnsEMBL::Slice
Definition: Slice.pm:50
Bio::EnsEMBL::CoordSystem::alias_to
public Bio::EnsEMBL::CoordSystem alias_to()
Bio::EnsEMBL::Registry
Definition: Registry.pm:113
Bio::EnsEMBL::Slice::new
public Bio::EnsEMBL::Slice new()
exon
public exon()
Bio::EnsEMBL::DBSQL::DBConnection::prepare
public DBI prepare()
Bio::EnsEMBL::Slice::seq_region_name
public String seq_region_name()
Bio::EnsEMBL::Slice::seq_region_length
public Int seq_region_length()
Bio::EnsEMBL::LRGSlice
Definition: LRGSlice.pm:37
Bio::EnsEMBL::Slice::get_seq_region_id
public Int get_seq_region_id()
has_karyotype
public has_karyotype()
Bio::EnsEMBL::Slice::new_fast
public Bio::EnsEMBL::Slice new_fast()
Bio::EnsEMBL::DBSQL::BaseAdaptor
Definition: BaseAdaptor.pm:71
Bio::EnsEMBL::Utils::Scalar
Definition: Scalar.pm:66
Bio::EnsEMBL::DBSQL::BaseAdaptor::db
public Bio::EnsEMBL::DBSQL::DBAdaptor db()
get_seq_region_id
public get_seq_region_id()
Bio::EnsEMBL::Slice::is_toplevel
public Int is_toplevel()
Bio::EnsEMBL::Registry::load_registry_from_db
public Int load_registry_from_db()
Bio::EnsEMBL::Slice::coord_system
public Bio::EnsEMBL::CoordSystem coord_system()
Bio::EnsEMBL::ProjectionSegment::to_Slice
public Bio::EnsEMBL::Slice to_Slice()
Bio
Definition: AltAlleleGroup.pm:4
Bio::EnsEMBL::Utils::Exception
Definition: Exception.pm:68
Bio::EnsEMBL::Slice::strand
public Int strand()
Bio::EnsEMBL::Mapper
Definition: Coordinate.pm:3
Bio::EnsEMBL::Storable::adaptor
public Bio::EnsEMBL::DBSQL::BaseAdaptor adaptor()