ensembl-hive  2.8.1
SliceAdaptor.pm
Go to the documentation of this file.
1 =head1 LICENSE
2 
3 See the NOTICE file distributed with this work for additional information
4 regarding copyright ownership.
5 
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9 
10  http://www.apache.org/licenses/LICENSE-2.0
11 
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 
18 =cut
19 
20 
21 =head1 CONTACT
22 
23  Please email comments or questions to the public Ensembl
24  developers list at <http://lists.ensembl.org/mailman/listinfo/dev>.
25 
26  Questions may also be sent to the Ensembl help desk at
27  <http://www.ensembl.org/Help/Contact>.
28 
29 =cut
30 
31 =head1 NAME
32 
33 Bio::EnsEMBL::DBSQL::SliceAdaptor - A database aware adaptor responsible for
34 the creation of Slice objects.
35 
36 =head1 SYNOPSIS
37 
38  use Bio::EnsEMBL::Utils::Slice qw(split_Slices);
40 
42  -host => 'ensembldb.ensembl.org',
43  -user => 'anonymous'
44  );
45 
46  $slice_adaptor =
47  Bio::EnsEMBL::Registry->get_adaptor( "human", "core", "slice" );
48 
49  # get a slice on the entire chromosome X
50  $chr_slice = $slice_adaptor->fetch_by_region( 'chromosome', 'X' );
51 
52  # get a slice for each clone in the database
53  foreach $cln_slice ( @{ $slice_adaptor->fetch_all('clone') } ) {
54  # do something with clone
55  }
56 
57  # get a slice which is part of NT_004321
58  $spctg_slice =
59  $slice_adaptor->fetch_by_region( 'supercontig', 'NT_004321',
60  200_000, 600_000 );
61 
62  # get all non-redundant slices from the highest possible coordinate
63  # systems
64  $slices = $slice_adaptor->fetch_all('toplevel');
65 
66  # include non-reference regions
67  $slices = $slice_adaptor->fetch_all( 'toplevel', undef, 1 );
68 
69  # include duplicate regions
70  $slices = $slice_adaptor->fetch_all( 'toplevel', undef, 0, 1 );
71 
72  # split up a list of slices into smaller slices
73  $overlap = 1000;
74  $max_length = 1e6;
75  $slices = split_Slices( $slices, $max_length, $overlap );
76 
77  # store a list of slice names in a file
78  open( FILE, ">$filename" ) or die("Could not open file $filename");
79  foreach my $slice (@$slices) {
80  print FILE $slice->name(), "\n";
81  }
82  close FILE;
83 
84  # retrieve a list of slices from a file
85  open( FILE, $filename ) or die("Could not open file $filename");
86  while ( $name = <FILE> ) {
87  chomp($name);
88  $slice = $slice_adaptor->fetch_by_name($name);
89  # do something with slice
90  }
91 
92 =head1 DESCRIPTION
93 
94 This module is responsible for fetching Slices representing genomic
95 regions from a database. A Details on how slices can be used are in the
96 Bio::EnsEMBL::Slice module.
97 
98 =head1 METHODS
99 
100 =cut
101 
102 
103 package Bio::EnsEMBL::DBSQL::SliceAdaptor;
104 
105 use vars qw(@ISA);
106 use strict;
107 
108 
114 use Bio::EnsEMBL::Utils::Exception qw(throw warning);
116 use Scalar::Util qw/looks_like_number/;
117 use Bio::EnsEMBL::Utils::Scalar qw/assert_integer/;
118 
119 @ISA = ('Bio::EnsEMBL::DBSQL::BaseAdaptor');
120 
121 sub new {
122  my $caller = shift;
123 
124  my $class = ref($caller) || $caller;
125 
126  my $self = $class->SUPER::new(@_);
127 
128  # use a cache which is shared and also used by the assembly
129  # mapper adaptor
130 
131  my $seq_region_cache = $self->db->get_SeqRegionCache();
132 
133  $self->{'sr_name_cache'} = $seq_region_cache->{'name_cache'};
134  $self->{'sr_id_cache'} = $seq_region_cache->{'id_cache'};
135 
136  $self->{'lrg_region_test'} = undef;
137  my $meta_container = $self->db->get_MetaContainer();
138  my @values = $meta_container->list_value_by_key("LRG");
139  if(scalar(@values) and $values[0]->[0]){
140  $self->{'lrg_region_test'} = $values[0]->[0];
141  }
142  return $self;
143 }
144 
145 
146 =head2 fetch_by_region
147 
148  Arg [1] : string $coord_system_name (optional)
149  The name of the coordinate system of the slice to be created
150  This may be a name of an actual coordinate system or an alias
151  to a coordinate system. Valid aliases are 'seqlevel' or
152  'toplevel'.
153  Arg [2] : string $seq_region_name
154  The name of the sequence region that the slice will be
155  created on.
156  Arg [3] : int $start (optional, default = 1)
157  The start of the slice on the sequence region
158  Arg [4] : int $end (optional, default = seq_region length)
159  The end of the slice on the sequence region
160  Arg [5] : int $strand (optional, default = 1)
161  The orientation of the slice on the sequence region
162  Arg [6] : string $version (optional, default = default version)
163  The version of the coordinate system to use (e.g. NCBI33)
164  Arg [7] : boolean $no_fuzz (optional, default = undef (false))
165  If true (non-zero), do not use "fuzzy matching" (see below).
166  Example : $slice = $slice_adaptor->fetch_by_region('chromosome', 'X');
167  $slice = $slice_adaptor->fetch_by_region('clone', 'AC008066.4');
168  Description: Retrieves a slice on the requested region. At a minimum the
169  name the name of the seq_region to fetch must be provided.
170 
171  If no coordinate system name is provided than a slice on the
172  highest ranked coordinate system with a matching
173  seq_region_name will be returned. If a version but no
174  coordinate system name is provided, the same behaviour will
175  apply, but only coordinate systems of the appropriate version
176  are considered. The same applies if the 'toplevel' coordinate
177  system is specified, however in this case the version is
178  ignored. The coordinate system should always be specified if
179  it is known, since this is unambiguous and faster.
180 
181  Some fuzzy matching is performed if no exact match for
182  the provided name is found. This allows clones to be
183  fetched even when their version is not known. For
184  example fetch_by_region('clone', 'AC008066') will
185  retrieve the sequence_region with name 'AC008066.4'.
186 
187  The fuzzy matching can be turned off by setting the
188  $no_fuzz argument to a true value.
189 
190  If the requested seq_region is not found in the database undef
191  is returned.
192 
193  Returntype : Bio::EnsEMBL::Slice or undef
194  Exceptions : throw if no seq_region_name is provided
195  throw if invalid coord_system_name is provided
196  throw if start > end is provided
197  Caller : general
198  Status : Stable
199 
200 =cut
201 
202 
203 #
204 # ARNE: This subroutine needs simplification!!
205 #
206 sub fetch_by_region {
207  my ( $self, $coord_system_name, $seq_region_name, $start, $end,
208  $strand, $version, $no_fuzz )
209  = @_;
210 
211  assert_integer($start, 'start') if defined $start;
212  assert_integer($end, 'end') if defined $end;
213 
214  if ( !defined($start) ) { $start = 1 }
215  if ( !defined($strand) ) { $strand = 1 }
216 
217  if ( !defined($seq_region_name) ) {
218  throw('seq_region_name argument is required');
219  }
220 
221  my $cs;
222  my $csa = $self->db->get_CoordSystemAdaptor();
223 
224  if ( defined($coord_system_name) ) {
225  $cs = $csa->fetch_by_name( $coord_system_name, $version );
226 
227  if ( !defined($cs) ) {
228  # deal with cases where undefined coordsystem name requested is 'chromosome'
229  if ( $coord_system_name eq "chromosome" ) {
230 
231  # set the 'chromosome' alias, if not yet set
232  $cs = $self->_create_chromosome_alias();
233 
234  } else {
235  throw( sprintf( "Unknown coordinate system:\n"
236  . "name='%s' version='%s' seq region name='%s'\n",
237  $coord_system_name, $version, $seq_region_name ) );
238  }
239  }
240 
241  # fetching by toplevel is same as fetching w/o name or version
242  if ( $cs->is_top_level() ) {
243  $cs = undef;
244  $version = undef;
245  }
246 
247  } ## end if ( defined($coord_system_name...))
248 
249  my $constraint;
250  my $sql;
251  my @bind_params;
252  my $key;
253 
254  if ( defined($cs) ) {
255 
256  # if chromosome alias is defined, use karyotype_cache to access seq region data
257  # rather than a database query
258  if ( defined($cs->alias_to()) && $cs->alias_to() eq "chromosome" ) {
259 
260  $key = "karyotype_cache";
261 
262  } else { # otherwise, search database to access seq region data, etc.
263 
264  $sql = sprintf( "SELECT sr.name, sr.seq_region_id, sr.length, %d "
265  . "FROM seq_region sr ",
266  $cs->dbID() );
267 
268  $constraint = "AND sr.coord_system_id = ?";
269  push( @bind_params, [ $cs->dbID(), SQL_INTEGER ] );
270 
271  $key = "$seq_region_name:" . $cs->dbID();
272 
273  }
274  } else {
275  $sql =
276  "SELECT sr.name, sr.seq_region_id, sr.length, cs.coord_system_id "
277  . "FROM seq_region sr, coord_system cs ";
278 
279  $constraint = "AND sr.coord_system_id = cs.coord_system_id "
280  . "AND cs.species_id = ? ";
281  push( @bind_params, [ $self->species_id(), SQL_INTEGER ] );
282 
283  if ( defined($version) ) {
284  $constraint .= "AND cs.version = ? ";
285  push( @bind_params, [ $version, SQL_VARCHAR ] );
286  }
287 
288  $constraint .= "ORDER BY cs.rank ASC";
289  }
290 
291  # check the cache so we only go to the db if necessary
292  my $length;
293  my $arr;
294 
295  # use $key to access karyotype cache and define $arr
296  if ( defined($key) && ($key eq "karyotype_cache") ) {
297 
298  my $coord_system_id = $cs->dbID();
299 
300  # retrieve values from karyotype cache
301  my $seq_region_id = $cs->{$key}{$seq_region_name}{$coord_system_id}{'seq_region_id'};
302  $length = $cs->{$key}{$seq_region_name}{$coord_system_id}{'length'};
303 
304  # populate $arr with values from karyotype cache
305  $arr = [ $seq_region_name, $seq_region_id, $length, $coord_system_id ];
306 
307  # define $end, if not yet defined
308  if ( !defined($end) ) { $end = $length }
309 
310  # add in check that $arr is populated
311  throw("Unable to popular $arr with values from karyotype cache") unless $arr;
312 
313  } else {
314 
315  if ( defined($key) ) { $arr = $self->{'sr_name_cache'}->{$key} }
316 
317  if ( defined($arr) ) {
318  $length = $arr->[3];
319  } else {
320  my $sth =
321  $self->prepare( $sql . "WHERE sr.name = ? " . $constraint );
322 
323  unshift( @bind_params, [ $seq_region_name, SQL_VARCHAR ] );
324 
325  my $pos = 0;
326  foreach my $param (@bind_params) {
327  $sth->bind_param( ++$pos, $param->[0], $param->[1] );
328  }
329 
330  $sth->execute();
331  my @row = $sth->fetchrow_array();
332  $sth->finish();
333 
334  unless ( @row ) {
335 
336  # deal with cases where a coordsystem might not be defined by user
337  my $slice;
338  if (!defined $cs) {
339  $slice = $self->_fetch_by_seq_region_synonym( undef, $seq_region_name, $start, $end, $strand, $version, $no_fuzz );
340  } else {
341  $slice = $self->_fetch_by_seq_region_synonym( $cs, $seq_region_name, $start, $end, $strand, $version, $no_fuzz );
342  }
343 
344  # check whether any slice data has been returned
345  if ( $slice && $slice->seq_region_name ) {
346  my $matched_name = $slice->seq_region_name;
347 
348  # if matched name is different to query name, skip fuzzy matching
349  if ( $matched_name ne $seq_region_name ) {
350  $seq_region_name = $matched_name;
351 
352  # define $arr
353  my $tmp_key_string = "$seq_region_name:" . $slice->coord_system()->dbID();
354  $arr = $self->{'sr_name_cache'}->{$tmp_key_string};
355  $length = $arr->[3];
356  $cs = $slice->coord_system() if (!$cs);
357  }
358 
359  } else { # if no slice object with a match returned, try using a fuzzy match
360  if (!$no_fuzz) {
361 
362  my ($fuzzy_matched_name, $cs_new) = $self->_fetch_by_fuzzy_matching( $cs, $seq_region_name, $sql, $constraint, \@bind_params );
363  $cs = $cs_new;
364 
365  if (!$fuzzy_matched_name) {
366  return;
367  }
368 
369  # define arr
370  my $tmp_key_string = $fuzzy_matched_name . ":" . $cs_new->dbID();
371  if (exists $self->{'sr_name_cache'}->{$tmp_key_string}) {
372  $seq_region_name = $fuzzy_matched_name;
373  $arr = $self->{'sr_name_cache'}->{$tmp_key_string};
374  $length = $arr->[3];
375  } else {
376  return;
377  }
378  } else {
379  return;
380  }
381  }
382  } else {
383 
384  my ( $id, $cs_id );
385  ( $seq_region_name, $id, $length, $cs_id ) = @row;
386 
387  # cache to speed up for future queries
388  my $arr = [ $id, $seq_region_name, $cs_id, $length ];
389  $self->{'sr_name_cache'}->{"$seq_region_name:$cs_id"} = $arr;
390  $self->{'sr_id_cache'}->{"$id"} = $arr;
391  $cs = $csa->fetch_by_dbID($cs_id);
392  }
393  } ## end else [ if ( defined($arr) ) ]
394  }
395 
396  if ( !defined($end) ) { $end = $length }
397 
398  #If this was given then check if we've got a circular seq region otherwise
399  #let it fall through to the normal Slice method
400  if ( $end + 1 < $start ) {
401  my $cs_id = $cs->dbID();
402  my $seq_region_id = $self->{'sr_name_cache'}->{"$seq_region_name:$cs_id"}->[0];
403  if($self->is_circular($seq_region_id)) {
404  my $new_sl =
406  -COORD_SYSTEM => $cs,
407  -SEQ_REGION_NAME => $seq_region_name,
408  -SEQ_REGION_LENGTH => $length,
409  -START => $start,
410  -END => $end,
411  -STRAND => 1,
412  -ADAPTOR => $self );
413 
414  return $new_sl;
415  }
416  }
417 
418  if ( defined( $self->{'lrg_region_test'} )
419  and substr( $cs->name, 0, 3 ) eq $self->{'lrg_region_test'} )
420  {
421  return
422  Bio::EnsEMBL::LRGSlice->new( -COORD_SYSTEM => $cs,
423  -SEQ_REGION_NAME => $seq_region_name,
424  -SEQ_REGION_LENGTH => $length,
425  -START => $start,
426  -END => $end,
427  -STRAND => $strand,
428  -ADAPTOR => $self );
429  } else {
430  return
432  'coord_system' => $cs,
433  'seq_region_name' => $seq_region_name,
434  'seq_region_length' => $length,
435  'start' => $start,
436  'end' => $end,
437  'strand' => $strand,
438  'adaptor' => $self } );
439  }
440 } ## end sub fetch_by_region
441 
442 =head2 fetch_by_toplevel_location
443 
444  Arg [1] : string $location
445  Ensembl formatted location. Can be a format like
446  C<name:start-end>, C<name:start..end>, C<name:start:end>,
447  C<name:start>, C<name>. We can also support strand
448  specification as a +/- or 1/-1.
449 
450  Location names must be separated by a C<:>. All others can be
451  separated by C<..>, C<:> or C<->.
452  Arg[2] : boolean $no_warnings
453  Suppress warnings from this method
454  Arg[3] : boolean $no_fuzz
455  Stop fuzzy matching of sequence regions from occuring
456  Arg[4] : boolean $ucsc
457  If we are unsuccessful at retriving a location retry taking any
458  possible chr prefix into account e.g. chrX and X are treated as
459  equivalents
460  Example : my $slice = $sa->fetch_by_toplevel_location('X:1-10000')
461  my $slice = $sa->fetch_by_toplevel_location('X:1-10000:-1')
462  Description : Converts an Ensembl location/region into the sequence region
463  name, start and end and passes them onto C<fetch_by_region()>.
464  The code assumes that the required slice is on the top level
465  coordinate system. The code assumes that location formatting
466  is not perfect and will perform basic cleanup before parsing.
467  Returntype : Bio::EnsEMBL::Slice
468  Exceptions : If $location is false otherwise see C<fetch_by_location()>
469  or C<fetch_by_region()>
470  Caller : General
471  Status : Beta
472 
473 =cut
474 
475 sub fetch_by_toplevel_location {
476  my ($self, $location, $no_warnings, $no_fuzz, $ucsc) = @_;
477  return $self->fetch_by_location($location, 'toplevel', undef, $no_warnings, $no_fuzz, $ucsc);
478 }
479 
480 =head2 fetch_by_location
481 
482  Arg [1] : string $location
483  Ensembl formatted location. Can be a format like
484  C<name:start-end>, C<name:start..end>, C<name:start:end>,
485  C<name:start>, C<name>. We can also support strand
486  specification as a +/- or 1/-1.
487 
488  Location names must be separated by a C<:>. All others can be
489  separated by C<..>, C<:>, C<_> or C<->.
490  Arg[2] : String $coord_system_name
491  The coordinate system to retrieve
492  Arg[3] : String $coord_system_version
493  Optional parameter. Version of the coordinate system to fetch
494  Arg[4] : boolean $no_warnings
495  Suppress warnings from this method
496  Arg[5] : boolean $no_fuzz
497  Stop fuzzy matching of sequence regions from occuring
498  Arg[6] : boolean $ucsc
499  If we are unsuccessful at retriving a location retry taking any
500  possible chr prefix into account e.g. chrX and X are treated as
501  equivalents
502  Example : my $slice = $sa->fetch_by_location('X:1-10000','chromosome')
503  my $slice = $sa->fetch_by_location('X:1-10000:-1','toplevel')
504  Description : Converts an Ensembl location/region into the sequence region
505  name, start and end and passes them onto C<fetch_by_region()>.
506  The code assumes that location formatting is not perfect and
507  will perform basic cleanup before parsing.
508  Returntype : Bio::EnsEMBL::Slice
509  Exceptions : If $location or coordinate system is false otherwise
510  see C<fetch_by_region()>
511  Caller : General
512  Status : Beta
513 
514 =cut
515 
516 sub fetch_by_location {
517  my ($self, $location, $coord_system_name, $coord_system_version, $no_warnings, $no_fuzz, $ucsc) = @_;
518  throw "No coordinate system name specified" unless $coord_system_name;
519 
520  my ($seq_region_name, $start, $end, $strand) = $self->parse_location_to_values($location, $no_warnings);
521 
522  if(! $seq_region_name) {
523  return;
524  }
525 
526  if(defined $start && defined $end && $start > $end) {
527  throw "Cannot request a slice whose start is greater than its end. Start: $start. End: $end";
528  }
529 
530  $seq_region_name =~ s/^chr// if ($ucsc);
531  my $slice = $self->fetch_by_region($coord_system_name, $seq_region_name, $start, $end, $strand, $coord_system_version, $no_fuzz);
532  if(! defined $slice) {
533  if($ucsc) {
534  my $ucsc_seq_region_name = $seq_region_name;
535  $ucsc_seq_region_name =~ s/^chr//;
536  if($ucsc_seq_region_name ne $seq_region_name) {
537  $slice = $self->fetch_by_region($coord_system_name, $ucsc_seq_region_name, $start, $end, $strand, $coord_system_version, $no_fuzz);
538  return if ! defined $slice; #if we had no slice still then bail
539  }
540  else {
541  return; #If it was not different then we didn't have the prefix so just return (same bail as before)
542  }
543  }
544  else {
545  return; #We didn't have a slice and no UCSC specifics are being triggered
546  }
547  }
548 
549  my $srl = $slice->seq_region_length();
550  my $name = $slice->seq_region_name();
551  if(defined $start && $start > $srl) {
552  throw "Cannot request a slice whose start ($start) is greater than $srl for $name.";
553  }
554  if(defined $end && $end > $srl) {
555  warning "Requested end ($end) is greater than $srl for $name. Resetting to $srl" if ! $no_warnings;
556  $slice->{end} = $srl;
557  }
558 
559  return $slice;
560 }
561 
562 =head2 parse_location_to_values
563 
564  Arg [1] : string $location
565  Ensembl formatted location. Can be a format like
566  C<name:start-end>, C<name:start..end>, C<name:start:end>,
567  C<name:start>, C<name>. We can also support strand
568  specification as a +/- or 1/-1.
569 
570  Location names must be separated by a C<:>. All others can be
571  separated by C<..>, C<:> C<_>, or C<->.
572 
573  If the start is negative, start will be reset to 1 (e.g.: 1: -10-1,000')
574  If both start and end are negative, returns undef (e.g.: 1: -10--1,000')
575  Arg[2] : boolean $no_warnings
576  Suppress warnings from this method
577  Arg[3] : boolean $no_errors
578  Supress errors being thrown from this method
579  Example : my ($name, $start, $end, $strand) = $sa->parse_location_to_values('X:1..100:1);
580  Description : Takes in an Ensembl location String and returns the parsed
581  values
582  Returntype : List. Contains name, start, end and strand
583 
584 =cut
585 
586 
587 sub parse_location_to_values {
588  my ($self, $location, $no_warnings, $no_errors) = @_;
589 
590  throw 'You must specify a location' if ! $location;
591 
592  #cleanup any nomenclature like 1 000 or 1,000
593  my $number_seps_regex = qr/\s+|,/;
594  my $separator_regex = qr/(?:-|[.]{2}|\:|_)?/; # support -, .., : and _ as separators
595  my $hgvs_nomenclature_regex = qr/(?:g\.)?/; # check for HGVS looking locations e.g. X:g.1-100
596  my $number_regex = qr/[0-9, EMKG]+/xmsi;
597  my $number_regex_signed = qr/-?[0-9, EMKG]+/xmsi; # to capture negative locations as sometimes we end up in negative location if the location is padded
598  my $strand_regex = qr/[+-1]|-1/xms;
599 
600  my $regex = qr/^((?:\w|\.|_|-)+) \s* :? \s* $hgvs_nomenclature_regex ($number_regex_signed)? $separator_regex ($number_regex)? $separator_regex ($strand_regex)? $/xms;
601  my ($seq_region_name, $start, $end, $strand);
602  if(($seq_region_name, $start, $end, $strand) = $location =~ $regex) {
603 
604  if(defined $strand) {
605  if(!looks_like_number($strand)) {
606  $strand = ($strand eq '+') ? 1 : -1;
607  }
608  }
609 
610  if(defined $start) {
611  $start =~ s/$number_seps_regex//g;
612  if($start < 1) {
613  warning "Start was less than 1 (${start}) which is not allowed. Resetting to 1" if ! $no_warnings;
614 
615  unless(defined $end) {
616  # We will reach here only when the location is given without start and '-' is used as seperator eg: 1:-10 (expected to return 1:1-10)
617  $end = abs($start);
618  }
619  $start = 1;
620  }
621  }
622  if(defined $end) {
623  $end =~ s/$number_seps_regex//g;
624  if($end < 1) {
625  throw "Cannot request negative or 0 end indexes through this interface. Given $end but expected something greater than 0" unless $no_errors;
626  }
627  }
628 
629  }
630 
631  return ($seq_region_name, $start, $end, $strand);
632 }
633 
634 =head2 fetch_by_region_unique
635 
636  Arg [1] : string $coord_system_name (optional)
637  The name of the coordinate system of the slice to be created
638  This may be a name of an actual coordinate system or an alias
639  to a coordinate system. Valid aliases are 'seqlevel' or
640  'toplevel'.
641  Arg [2] : string $seq_region_name
642  The name of the sequence region that the slice will be
643  created on.
644  Arg [3] : int $start (optional, default = 1)
645  The start of the slice on the sequence region
646  Arg [4] : int $end (optional, default = seq_region length)
647  The end of the slice on the sequence region
648  Arg [5] : int $strand (optional, default = 1)
649  The orientation of the slice on the sequence region
650  Arg [6] : string $version (optional, default = default version)
651  The version of the coordinate system to use (e.g. NCBI33)
652  Arg [7] : boolean $no_fuzz (optional, default = undef (false))
653  If true (non-zero), do not use "fuzzy matching" (see below).
654  Example : $slice = $slice_adaptor->fetch_by_region_unique('chromosome', 'HSCHR6_MHC_COX');
655  Description: Retrieves a slice on the requested region but returns only the unique
656  parts of the slice. At a minimum the
657  name the name of the seq_region to fetch must be provided.
658 
659  If no coordinate system name is provided than a slice on the
660  highest ranked coordinate system with a matching
661  seq_region_name will be returned. If a version but no
662  coordinate system name is provided, the same behaviour will
663  apply, but only coordinate systems of the appropriate version
664  are considered. The same applies if the 'toplevel' coordinate
665  system is specified, however in this case the version is
666  ignored. The coordinate system should always be specified if
667  it is known, since this is unambiguous and faster.
668 
669  Some fuzzy matching is performed if no exact match for
670  the provided name is found. This allows clones to be
671  fetched even when their version is not known. For
672  example fetch_by_region('clone', 'AC008066') will
673  retrieve the sequence_region with name 'AC008066.4'.
674 
675  The fuzzy matching can be turned off by setting the
676  $no_fuzz argument to a true value.
677 
678  If the requested seq_region is not found in the database undef
679  is returned.
680 
681  Returntype : listref Bio::EnsEMBL::Slice
682  Exceptions : throw if no seq_region_name is provided
683  throw if invalid coord_system_name is provided
684  throw if start > end is provided
685  Caller : general
686  Status : Stable
687 
688 =cut
689 
690 sub fetch_by_region_unique {
691  my $self = shift;
692 
693  my @out = ();
694  my $slice = $self->fetch_by_region(@_);
695 
696 
697  if ( !exists( $self->{'asm_exc_cache'} ) ) {
698  $self->_build_exception_cache();
699  }
700 
701  if ( exists(
702  $self->{'asm_exc_cache'}->{ $self->get_seq_region_id($slice) }
703  ) )
704  {
705  # Dereference symlinked assembly regions. Take out any regions
706  # which are symlinked because these are duplicates.
707  my @projection =
708  @{ $self->fetch_normalized_slice_projection($slice) };
709 
710  foreach my $segment (@projection) {
711  if ( $segment->[2]->seq_region_name() eq $slice->seq_region_name()
712  && $segment->[2]->coord_system->equals( $slice->coord_system ) )
713  {
714  push( @out, $segment->[2] );
715  }
716  }
717  } else {
718  @out = ($slice);
719  }
720 
721  return \@out;
722 } ## end sub fetch_by_region_unique
723 
724 =head2 fetch_by_name
725 
726  Arg [1] : string $name
727  Example : $name = 'chromosome:NCBI34:X:1000000:2000000:1';
728  $slice = $slice_adaptor->fetch_by_name($name);
729  $slice2 = $slice_adaptor->fetch_by_name($slice3->name());
730  Description: Fetches a slice using a slice name (i.e. the value returned by
731  the Slice::name method). This is useful if you wish to
732  store a unique identifier for a slice in a file or database or
733  pass a slice over a network.
734  Slice::name allows you to serialise/marshall a slice and this
735  method allows you to deserialise/unmarshal it.
736 
737  Returns undef if no seq_region with the provided name exists in
738  the database.
739 
740  Returntype : Bio::EnsEMBL::Slice or undef
741  Exceptions : throw if incorrent arg provided
742  Caller : Pipeline
743  Status : Stable
744 
745 =cut
746 
747 sub fetch_by_name {
748  my $self = shift;
749  my $name = shift;
750 
751  if(!$name) {
752  throw("name argument is required");
753  }
754 
755  my @array = split(/:/,$name);
756 
757  if(scalar(@array) < 3 || scalar(@array) > 6) {
758  throw("Malformed slice name [$name]. Format is " .
759  "coord_system:version:name:start:end:strand");
760  }
761 
762  # Rearrange arguments to suit fetch_by_region
763 
764  my @targetarray;
765 
766  $targetarray[0]=$array[0];
767  $targetarray[5]=(($array[1]&&$array[1] ne "")?$array[1]:undef);
768  $targetarray[1]=(($array[2]&&$array[2] ne "")?$array[2]:undef);
769  $targetarray[2]=(($array[3]&&$array[3] ne "")?$array[3]:undef);
770  $targetarray[3]=(($array[4]&&$array[4] ne "")?$array[4]:undef);
771  $targetarray[4]=(($array[5]&&$array[5] ne "")?$array[5]:undef);
772  return $self->fetch_by_region(@targetarray);
773 }
774 
775 
776 
777 =head2 fetch_by_seq_region_id
778 
779  Arg [1] : string $seq_region_id
780  The internal identifier of the seq_region to create this slice
781  on
782  Arg [2] : optional start
783  Arg [3] : optional end
784  Arg [4] : optional strand
785  Example : $slice = $slice_adaptor->fetch_by_seq_region_id(34413);
786  Description: Creates a slice object of an entire seq_region using the
787  seq_region internal identifier to resolve the seq_region.
788  Returns undef if no such slice exists.
789  Returntype : Bio::EnsEMBL::Slice or undef
790  Exceptions : none
791  Caller : general
792  Status : Stable
793 
794 =cut
795 
796 sub fetch_by_seq_region_id {
797  my ( $self, $seq_region_id, $start, $end, $strand, $check_prior_ids ) = @_;
798 
799  my $csa = $self->db->get_CoordSystemAdaptor();
800  my $arr = $self->{'sr_id_cache'}->{$seq_region_id};
801  my ( $name, $length, $cs, $cs_id );
802 
803 
804  if ( $arr && defined( $arr->[2] ) ) {
805  ( $name, $cs_id, $length ) = ( $arr->[1], $arr->[2], $arr->[3] );
806  $cs = $self->db->get_CoordSystemAdaptor->fetch_by_dbID($cs_id);
807  } else {
808  my $sth =
809  $self->prepare( "SELECT sr.name, sr.coord_system_id, sr.length "
810  . "FROM seq_region sr "
811  . "WHERE sr.seq_region_id = ? " );
812 
813  $sth->bind_param( 1, $seq_region_id, SQL_INTEGER );
814  $sth->execute();
815 
816  my @row = $sth->fetchrow_array();
817  unless ( @row ) {
818  # This could have been an old seq region id so see if we can
819  # translate it into a more recent version.
820  if($check_prior_ids) {
821  if(exists $csa->{_external_seq_region_mapping}->{$seq_region_id}) {
822  my $new_seq_region_id = $csa->{_external_seq_region_mapping}->{$seq_region_id};
823  # No need to pass check prior ids flag because it's a 1 step relationship
824  return $self->fetch_by_seq_region_id($new_seq_region_id, $start, $end, $strand);
825  }
826  }
827  return undef;
828  }
829 
830  ( $name, $cs_id, $length ) = @row;
831  $sth->finish();
832 
833  $cs = $self->db->get_CoordSystemAdaptor->fetch_by_dbID($cs_id);
834 
835  #cache results to speed up repeated queries
836  my $arr = [ $seq_region_id, $name, $cs_id, $length ];
837 
838  $self->{'sr_name_cache'}->{"$name:$cs_id"} = $arr;
839  $self->{'sr_id_cache'}->{"$seq_region_id"} = $arr;
840  }
841 
842  return
844  'coord_system' => $cs,
845  'seq_region_name' => $name,
846  'seq_region_length'=> $length,
847  'start' => $start || 1,
848  'end' => $end || $length,
849  'strand' => $strand || 1,
850  'adaptor' => $self} );
851 } ## end sub fetch_by_seq_region_id
852 
853 
854 
855 =head2 get_seq_region_id
856 
857  Arg [1] : Bio::EnsEMBL::Slice $slice
858  The slice to fetch a seq_region_id for
859  Example : $srid = $slice_adaptor->get_seq_region_id($slice);
860  Description: Retrieves the seq_region id (in this database) given a slice
861  Seq region ids are not stored on the slices themselves
862  because they are intended to be somewhat database independant
863  and seq_region_ids vary accross databases.
864  Returntype : int
865  Exceptions : throw if the seq_region of the slice is not in the db
866  throw if incorrect arg provided
867  Caller : BaseFeatureAdaptor
868  Status : Stable
869 
870 =cut
871 
872 sub get_seq_region_id {
873  my $self = shift;
874  my $slice = shift;
875 
876  if(!$slice || !ref($slice) || !($slice->isa('Bio::EnsEMBL::Slice') or $slice->isa('Bio::EnsEMBL::LRGSlice'))) {
877  throw('Slice argument is required');
878  }
879 
880  my $seq_region_name = $slice->seq_region_name();
881  my $key = $seq_region_name.":".$slice->coord_system->dbID();
882  my $arr = $self->{'sr_name_cache'}->{"$key"};
883 
884  if( $arr ) {
885  return $arr->[0];
886  }
887 
888  my $cs_id = $slice->coord_system->dbID();
889 
890  my $sth = $self->prepare("SELECT seq_region_id, length " .
891  "FROM seq_region " .
892  "WHERE name = ? AND coord_system_id = ?");
893 
894  #force seq_region_name cast to string so mysql cannot treat as int
895  $sth->bind_param(1,"$seq_region_name",SQL_VARCHAR);
896  $sth->bind_param(2,$cs_id,SQL_INTEGER);
897  $sth->execute();
898 
899  my @row = $sth->fetchrow_array();
900  unless ( @row ) {
901  throw("No-existent seq_region [$seq_region_name] in coord system [$cs_id]");
902  }
903  my @more = $sth->fetchrow_array();
904  if ( @more ) {
905  throw("Ambiguous seq_region [$seq_region_name] in coord system [$cs_id]");
906  }
907 
908  my($seq_region_id, $length) = @row;
909  $sth->finish();
910 
911  #cache information for future requests
912  $arr = [ $seq_region_id, $seq_region_name, $cs_id, $length ];
913 
914  $self->{'sr_name_cache'}->{"$seq_region_name:$cs_id"} = $arr;
915  $self->{'sr_id_cache'}->{"$seq_region_id"} = $arr;
916 
917  return $seq_region_id;
918 }
919 
920 
921 
922 =head2 fetch_all
923 
924  Arg [1] : string $coord_system_name
925  The name of the coordinate system to retrieve slices of.
926  This may be a name of an acutal coordinate system or an alias
927  to a coordinate system. Valid aliases are 'seqlevel' or
928  'toplevel'.
929  Arg [2] : string $coord_system_version (optional)
930  The version of the coordinate system to retrieve slices of
931  Arg [3] : bool $include_non_reference (optional)
932  If this argument is not provided then only reference slices
933  will be returned. If set, both reference and non reference
934  slices will be returned.
935  Arg [4] : int $include_duplicates (optional)
936  If set duplicate regions will be returned.
937 
938  NOTE: if you do not use this option and you have a PAR
939  (pseudo-autosomal region) at the beginning of your seq_region
940  then your slice will not start at position 1, so coordinates
941  retrieved from this slice might not be what you expected.
942 
943  Arg[5] : bool $include_lrg (optional) (default 0)
944  If set lrg regions will be returned aswell.
945 
946 
947  Example : @chromos = @{$slice_adaptor->fetch_all('chromosome','NCBI33')};
948  @contigs = @{$slice_adaptor->fetch_all('contig')};
949 
950  # get even non-reference regions
951  @slices = @{$slice_adaptor->fetch_all('toplevel',undef,1)};
952 
953  # include duplicate regions (such as pseudo autosomal regions)
954  @slices = @{$slice_adaptor->fetch_all('toplevel', undef,0,1)};
955 
956  Description: Retrieves slices of all seq_regions for a given coordinate
957  system. This is analagous to the methods fetch_all which were
958  formerly on the ChromosomeAdaptor, RawContigAdaptor and
959  CloneAdaptor classes. Slices fetched span the entire
960  seq_regions and are on the forward strand.
961  If the coordinate system with the provided name and version
962  does not exist an empty list is returned.
963  If the coordinate system name provided is 'toplevel', all
964  non-redundant toplevel slices are returned (note that any
965  coord_system_version argument is ignored in that case).
966 
967  Retrieved slices can be broken into smaller slices using the
969 
970  Returntype : listref of Bio::EnsEMBL::Slices
971  Exceptions : none
972  Caller : general
973  Status : Stable
974 
975 =cut
976 
977 sub fetch_all {
978  my $self = shift;
979  my $cs_name = shift;
980  my $cs_version = shift || '';
981 
982  my ($include_non_reference, $include_duplicates, $include_lrg) = @_;
983 
984  #
985  # verify existance of requested coord system and get its id
986  #
987  my $csa = $self->db->get_CoordSystemAdaptor();
988  my $orig_cs = $csa->fetch_by_name($cs_name, $cs_version);
989 
990  return [] if ( !$orig_cs );
991 
992  my %bad_vals=();
993 
994 
995  #
996  # Get a hash of non reference seq regions
997  #
998  if ( !$include_non_reference ) {
999  my $sth =
1000  $self->prepare( 'SELECT sr.seq_region_id '
1001  . 'FROM seq_region sr, seq_region_attrib sra, '
1002  . 'attrib_type at, coord_system cs '
1003  . 'WHERE at.code = "non_ref" '
1004  . 'AND sra.seq_region_id = sr.seq_region_id '
1005  . 'AND at.attrib_type_id = sra.attrib_type_id '
1006  . 'AND sr.coord_system_id = cs.coord_system_id '
1007  . 'AND cs.species_id = ?' );
1008 
1009  $sth->bind_param( 1, $self->species_id(), SQL_INTEGER );
1010  $sth->execute();
1011 
1012  my ($seq_region_id);
1013  $sth->bind_columns( \$seq_region_id );
1014 
1015  while ( $sth->fetch() ) {
1016  $bad_vals{$seq_region_id} = 1;
1017  }
1018  }
1019 
1020  #
1021  # if we do not want lrg's then add them to the bad list;
1022  #
1023  if ( !$include_lrg ) {
1024  my $sth =
1025  $self->prepare( 'SELECT sr.seq_region_id '
1026  . 'FROM seq_region sr, seq_region_attrib sra, '
1027  . 'attrib_type at, coord_system cs '
1028  . 'WHERE at.code = "LRG" '
1029  . 'AND sra.seq_region_id = sr.seq_region_id '
1030  . 'AND at.attrib_type_id = sra.attrib_type_id '
1031  . 'AND sr.coord_system_id = cs.coord_system_id '
1032  . 'AND cs.species_id = ?' );
1033 
1034  $sth->bind_param( 1, $self->species_id(), SQL_INTEGER );
1035  $sth->execute();
1036 
1037  my ($seq_region_id);
1038  $sth->bind_columns( \$seq_region_id );
1039 
1040  while ( $sth->fetch() ) {
1041  $bad_vals{$seq_region_id} = 1;
1042  }
1043  }
1044 
1045  #
1046  # Retrieve the seq_regions from the database
1047  #
1048 
1049  my $sth;
1050  if ( $orig_cs->is_top_level() ) {
1051  $sth =
1052  $self->prepare( 'SELECT sr.seq_region_id, sr.name, '
1053  . 'sr.length, sr.coord_system_id '
1054  . 'FROM seq_region sr, seq_region_attrib sra, '
1055  . 'attrib_type at, coord_system cs '
1056  . 'WHERE at.code = "toplevel" '
1057  . 'AND at.attrib_type_id = sra.attrib_type_id '
1058  . 'AND sra.seq_region_id = sr.seq_region_id '
1059  . 'AND sr.coord_system_id = cs.coord_system_id '
1060  . 'AND cs.species_id = ?' );
1061 
1062  $sth->bind_param( 1, $self->species_id(), SQL_INTEGER );
1063  $sth->execute();
1064  } else {
1065  $sth =
1066  $self->prepare( 'SELECT sr.seq_region_id, sr.name, '
1067  . 'sr.length, sr.coord_system_id '
1068  . 'FROM seq_region sr '
1069  . 'WHERE sr.coord_system_id = ?' );
1070 
1071  $sth->bind_param( 1, $orig_cs->dbID, SQL_INTEGER );
1072  $sth->execute();
1073  }
1074 
1075  my ( $seq_region_id, $name, $length, $cs_id );
1076  $sth->bind_columns( \( $seq_region_id, $name, $length, $cs_id ) );
1077 
1078  my $cache_count = 0;
1079 
1080  my @out;
1081  while($sth->fetch()) {
1082  if(!defined($bad_vals{$seq_region_id})){
1083  my $cs = $csa->fetch_by_dbID($cs_id);
1084 
1085  if(!$cs) {
1086  throw("seq_region $name references non-existent coord_system $cs_id.");
1087  }
1088 
1089  #cache values for future reference, but stop adding to the cache once we
1090  #we know we have filled it up
1091  if($cache_count < $Bio::EnsEMBL::Utils::SeqRegionCache::SEQ_REGION_CACHE_SIZE) {
1092  my $arr = [ $seq_region_id, $name, $cs_id, $length ];
1093 
1094  $self->{'sr_name_cache'}->{"$name:$cs_id"} = $arr;
1095  $self->{'sr_id_cache'}->{"$seq_region_id"} = $arr;
1096 
1097  $cache_count++;
1098  }
1099 
1100  my $slice = Bio::EnsEMBL::Slice->new_fast({
1101  'start' => 1,
1102  'end' => $length,
1103  'strand' => 1,
1104  'seq_region_name' => $name,
1105  'seq_region_length'=> $length,
1106  'coord_system' => $cs,
1107  'adaptor' => $self});
1108 
1109  if(!defined($include_duplicates) or !$include_duplicates){
1110  # test if this slice *could* have a duplicate (exception) region
1111  $self->_build_exception_cache() if(!exists $self->{'asm_exc_cache'});
1112  if(exists $self->{asm_exc_cache}->{$seq_region_id}) {
1113 
1114  # Dereference symlinked assembly regions. Take out
1115  # any regions which are symlinked because these are duplicates
1116  my @projection = @{$self->fetch_normalized_slice_projection($slice)};
1117  foreach my $segment ( @projection) {
1118  if($segment->[2]->seq_region_name() eq $slice->seq_region_name() &&
1119  $segment->[2]->coord_system->equals($slice->coord_system)) {
1120  push @out, $segment->[2];
1121  }
1122  }
1123  } else {
1124  # no duplicate regions
1125  push @out, $slice;
1126  }
1127  } else {
1128  # we want duplicates anyway so do not do any checks
1129  push @out, $slice;
1130  }
1131  }
1132  }
1133 
1134  return \@out;
1135 }
1136 
1137 =head2 fetch_all_by_genome_component
1138 
1139  Arg [1] : string $genome_component_name
1140  The name of the genome component to retrieve slices of.
1141  Example : @slices = @{$slice_adaptor->fetch_all_by_genome_component('A')};
1142  Description: Returns the list of all top level slices for a a given
1143  genome component
1144  Returntype : listref of Bio::EnsEMBL::Slices
1145  Exceptions : If argument is not provided or is not a valid genome
1146  component
1147  Caller : general
1148  Status : Stable
1149 
1150 =cut
1151 
1152 sub fetch_all_by_genome_component {
1153  my $self = shift;
1154  my $genome_component = shift;
1155  defined $genome_component or
1156  throw "Undefined genome component";
1157 
1158  # check the provided genome component is valid
1159  my $gc = $self->db->get_adaptor('GenomeContainer');
1160  my $is_valid_component = grep { $_ eq $genome_component }
1161  @{$gc->get_genome_components};
1162  throw "Invalid genome component"
1163  unless $is_valid_component;
1164 
1165  #
1166  # Retrieve the toplevel seq_regions from the database
1167  #
1168  my $sth =
1169  $self->prepare( "SELECT sr.seq_region_id, sr.name, sr.length, sr.coord_system_id "
1170  . "FROM seq_region sr "
1171  . "JOIN seq_region_attrib sa1 USING (seq_region_id) "
1172  . "JOIN attrib_type a1 ON sa1.attrib_type_id = a1.attrib_type_id "
1173  . "JOIN seq_region_attrib sa2 USING (seq_region_id) "
1174  . "JOIN attrib_type a2 ON sa2.attrib_type_id = a2.attrib_type_id "
1175  . "WHERE sa2.value=? and a1.code='toplevel' and a2.code='genome_component'"
1176  );
1177 
1178  if (looks_like_number($genome_component)) {
1179  $sth->bind_param( 1, $genome_component, SQL_INTEGER );
1180  } else {
1181  $sth->bind_param( 1, $genome_component, SQL_VARCHAR );
1182  }
1183 
1184  $sth->execute();
1185 
1186  my ( $seq_region_id, $name, $length, $cs_id );
1187  $sth->bind_columns( \( $seq_region_id, $name, $length, $cs_id ) );
1188 
1189  my $cache_count = 0;
1190 
1191  my @out;
1192  my $csa = $self->db->get_CoordSystemAdaptor();
1193 
1194  while($sth->fetch()) {
1195  my $cs = $csa->fetch_by_dbID($cs_id);
1196  throw("seq_region $name references non-existent coord_system $cs_id.")
1197  unless $cs;
1198 
1199  # cache values for future reference, but stop adding to the cache once we
1200  # we know we have filled it up
1201  if($cache_count < $Bio::EnsEMBL::Utils::SeqRegionCache::SEQ_REGION_CACHE_SIZE) {
1202  my $arr = [ $seq_region_id, $name, $cs_id, $length ];
1203 
1204  $self->{'sr_name_cache'}->{"$name:$cs_id"} = $arr;
1205  $self->{'sr_id_cache'}->{"$seq_region_id"} = $arr;
1206 
1207  $cache_count++;
1208  }
1209 
1210  push @out, Bio::EnsEMBL::Slice->new_fast({
1211  'start' => 1,
1212  'end' => $length,
1213  'strand' => 1,
1214  'seq_region_name' => $name,
1215  'seq_region_length'=> $length,
1216  'coord_system' => $cs,
1217  'adaptor' => $self});
1218  }
1219 
1220  return \@out;
1221 }
1222 
1223 =head2 get_genome_component_for_slice
1224 
1225  Arg [1] : An object of type Bio::EnsEMBL::Slice
1226  Example : my $component = $slice->get_genome_component();
1227  Description: Returns the genome component of a slice
1228  Returntype : Scalar; the identifier of the genome component of the slice
1229  Exceptions : none
1230  Caller : general
1231  Status : Stable
1232 
1233 =cut
1234 
1235 sub get_genome_component_for_slice {
1236  my ($self, $slice) = @_;
1237 
1238  throw "Undefined slice" unless defined $slice;
1239  throw "Argument is not a slice"
1240  unless $slice->isa("Bio::EnsEMBL::Slice");
1241 
1242  my $seq_region_id = $self->get_seq_region_id($slice);
1243 
1244  my $sth =
1245  $self->prepare( "SELECT sa.value "
1246  . "FROM seq_region_attrib sa "
1247  . "JOIN seq_region sr USING (seq_region_id) "
1248  . "JOIN attrib_type at ON sa.attrib_type_id = at.attrib_type_id "
1249  . "WHERE sr.seq_region_id=? AND at.code='genome_component'"
1250  );
1251  $sth->bind_param( 1, $seq_region_id, SQL_INTEGER );
1252  $sth->execute();
1253 
1254  my $genome_component;
1255  $sth->bind_columns( \( $genome_component ) );
1256  $sth->fetch();
1257 
1258  return $genome_component;
1259 }
1260 
1261 =head2 fetch_all_karyotype
1262 
1263  Example : my $top = $slice_adptor->fetch_all_karyotype()
1264  Description: returns the list of all slices which are part of the karyotype
1265  Returntype : listref of Bio::EnsEMBL::Slices
1266  Caller : general
1267  Status : At Risk
1268 
1269 =cut
1270 
1271 sub fetch_all_karyotype {
1272  my $self = shift;
1273 
1274  my $csa = $self->db->get_CoordSystemAdaptor();
1275 
1276  my $sth =
1277  $self->prepare( 'SELECT sr.seq_region_id, sr.name, '
1278  . 'sr.length, sr.coord_system_id, sra.value '
1279  . 'FROM seq_region sr, seq_region_attrib sra, '
1280  . 'attrib_type at, coord_system cs '
1281  . 'WHERE at.code = "karyotype_rank" '
1282  . 'AND at.attrib_type_id = sra.attrib_type_id '
1283  . 'AND sra.seq_region_id = sr.seq_region_id '
1284  . 'AND sr.coord_system_id = cs.coord_system_id '
1285  . 'AND cs.species_id = ?');
1286  $sth->bind_param( 1, $self->species_id(), SQL_INTEGER );
1287  $sth->execute();
1288  my ( $seq_region_id, $name, $length, $cs_id, $rank );
1289  $sth->bind_columns( \( $seq_region_id, $name, $length, $cs_id, $rank ) );
1290 
1291  my @out;
1292  while($sth->fetch()) {
1293  my $cs = $csa->fetch_by_dbID($cs_id);
1294 
1295  my $slice = Bio::EnsEMBL::Slice->new_fast({
1296  'start' => 1,
1297  'end' => ($length+0),
1298  'strand' => 1,
1299  'seq_region_name' => $name,
1300  'seq_region_length'=> ($length+0),
1301  'coord_system' => $cs,
1302  'adaptor' => $self,
1303  'karyotype' => 1,
1304  'karyotype_rank' => ($rank+0),
1305  });
1306 
1307  push @out, $slice;
1308  }
1309 
1310  #Sort using Perl as value in MySQL is a text field and not portable
1311  @out = sort { $a->{karyotype_rank} <=> $b->{karyotype_rank} } @out;
1312 
1313  return \@out;
1314 }
1315 
1316 =head2 is_toplevel
1317  Arg : int seq_region_id
1318  Example : my $top = $slice_adptor->is_toplevel($seq_region_id)
1319  Description: Returns 1 if slice is a toplevel slice else 0
1320  Returntype : int
1321  Caller : Slice method is_toplevel
1322  Status : At Risk
1323 
1324 =cut
1325 
1326 sub is_toplevel {
1327  my $self = shift;
1328  my $id = shift;
1329 
1330  my $sth = $self->prepare(
1331  "SELECT at.code from seq_region_attrib sra, attrib_type at "
1332  . "WHERE sra.seq_region_id = ? "
1333  . "AND at.attrib_type_id = sra.attrib_type_id "
1334  . "AND at.code = 'toplevel'" );
1335 
1336  $sth->bind_param( 1, $id, SQL_INTEGER );
1337  $sth->execute();
1338 
1339  my $code;
1340  $sth->bind_columns( \$code );
1341 
1342  while ( $sth->fetch ) {
1343  $sth->finish;
1344  return 1;
1345  }
1346 
1347  $sth->finish;
1348  return 0;
1349 }
1350 
1351 
1352 =head2 has_karyotype
1353  Arg : int seq_region_id
1354  Example : my $karyotype = $slice_adptor->has_karyotype($seq_region_id)
1355  Description: Returns 1 if slice is a part of a karyotype else 0
1356  Returntype : int
1357  Caller : Slice method has_karyotype
1358  Status : At Risk
1359 
1360 =cut
1361 
1362 sub has_karyotype {
1363  my $self = shift;
1364  my $id = shift;
1365 
1366  my $sth = $self->prepare(
1367  "SELECT at.code from seq_region_attrib sra, attrib_type at "
1368  . "WHERE sra.seq_region_id = ? "
1369  . "AND at.attrib_type_id = sra.attrib_type_id "
1370  . "AND at.code = 'karyotype_rank'" );
1371 
1372  $sth->bind_param( 1, $id, SQL_INTEGER );
1373  $sth->execute();
1374 
1375  my $code;
1376  $sth->bind_columns( \$code );
1377 
1378  while ( $sth->fetch ) {
1379  $sth->finish;
1380  return 1;
1381  }
1382 
1383  $sth->finish;
1384  return 0;
1385 }
1386 
1387 =head2 get_karyotype_rank
1388  Arg : int seq_region_id
1389  Example : my $rank = $slice_adptor->get_karyotype_rank($seq_region_id)
1390  Description: Returns the rank of a slice if it is part of the karyotype else 0
1391  Returntype : int
1392  Caller : Slice method get_karyotype_rank
1393  Status : At Risk
1394 
1395 =cut
1396 
1397 sub get_karyotype_rank {
1398  my $self = shift;
1399  my $id = shift;
1400 
1401  my $sth = $self->prepare(
1402  "SELECT sra.value from seq_region_attrib sra, attrib_type at "
1403  . "WHERE sra.seq_region_id = ? "
1404  . "AND at.attrib_type_id = sra.attrib_type_id "
1405  . "AND at.code = 'karyotype_rank'" );
1406 
1407  $sth->bind_param( 1, $id, SQL_INTEGER );
1408  $sth->execute();
1409 
1410  my $code;
1411  $sth->bind_columns( \$code );
1412 
1413  my $rank = $sth->fetchrow_array();
1414  $sth->finish();
1415 
1416  return $rank;
1417 }
1418 
1419 
1420 
1421 =head2 is_reference
1422  Arg : int seq_region_id
1423  Example : my $reference = $slice_adptor->is_reference($seq_region_id)
1424  Description: Returns 1 if slice is a reference slice else 0
1425  Returntype : int
1426  Caller : Slice method is_reference
1427  Status : At Risk
1428 
1429 =cut
1430 
1431 sub is_reference {
1432  my $self = shift;
1433  my $id = shift;
1434 
1435  my $sth = $self->prepare(
1436  "SELECT at.code from seq_region_attrib sra, attrib_type at "
1437  . "WHERE sra.seq_region_id = ? "
1438  . "AND at.attrib_type_id = sra.attrib_type_id "
1439  . "AND at.code = 'non_ref'" );
1440 
1441  $sth->bind_param( 1, $id, SQL_INTEGER );
1442  $sth->execute();
1443 
1444  my $code;
1445  $sth->bind_columns( \$code );
1446 
1447  while ( $sth->fetch ) {
1448  $sth->finish;
1449  return 0;
1450  }
1451 
1452  $sth->finish;
1453  return 1;
1454 }
1455 
1456 =head2 is_circular
1457 
1458  Arg[1] : int seq_region_id
1459  Example : my $circular = $slice_adptor->is_circular($seq_region_id);
1460  Description : Indicates if the sequence region was circular or not
1461  Returntype : Boolean
1462 
1463 =cut
1464 
1465 sub is_circular {
1466  my ($self, $id) = @_;
1467 
1468  if (! defined $self->{is_circular}) {
1469  $self->_build_circular_slice_cache();
1470  }
1471 
1472  return 0 if $self->{is_circular} == 0;
1473  return (exists $self->{circular_sr_id_cache}->{$id}) ? 1 : 0;
1474 }
1475 
1476 
1477 =head2 fetch_by_chr_band
1478 
1479  Title : fetch_by_chr_band
1480  Usage :
1481  Function: create a Slice representing a series of bands
1482  Example :
1483  Returns : Bio::EnsEMBL::Slice
1484  Args : the band name
1485  Status : Stable
1486 
1487 =cut
1488 
1489 sub fetch_by_chr_band {
1490  my ( $self, $chr, $band ) = @_;
1491 
1492  my $chr_slice = $self->fetch_by_region( 'toplevel', $chr );
1493  my $seq_region_id = $self->get_seq_region_id($chr_slice);
1494 
1495  my $sth =
1496  $self->prepare( 'SELECT MIN(k.seq_region_start), '
1497  . 'MAX(k.seq_region_end) '
1498  . 'FROM karyotype k '
1499  . 'WHERE k.seq_region_id = ? '
1500  . 'AND k.band LIKE ?' );
1501 
1502  $sth->bind_param( 1, $seq_region_id, SQL_INTEGER );
1503  $sth->bind_param( 2, "$band%", SQL_VARCHAR );
1504  $sth->execute();
1505 
1506  my ( $slice_start, $slice_end ) = $sth->fetchrow_array;
1507 
1508  if ( defined $slice_start ) {
1509  return
1510  $self->fetch_by_region( 'toplevel', $chr,
1511  $slice_start, $slice_end );
1512  }
1513 
1514  throw("Band not recognised in database");
1515 } ## end sub fetch_by_chr_band
1516 
1517 
1518 
1519 =head2 fetch_by_exon_stable_id
1520 
1521  Arg [1] : string $exonid
1522  The stable id of the exon around which the slice is
1523  desired
1524  Arg [2] : (optional) int $size
1525  The length of the flanking regions the slice should encompass
1526  on either side of the exon (0 by default)
1527  Example : $slc = $sa->fetch_by_exon_stable_id('ENSE00000302930',10);
1528  Description: Creates a slice around the region of the specified exon.
1529  If a context size is given, the slice is extended by that
1530  number of basepairs on either side of the exon.
1531 
1532  The slice will be created in the exon's native coordinate system
1533  and in the forward orientation.
1534  Returntype : Bio::EnsEMBL::Slice
1535  Exceptions : Thrown if the exon is not in the database.
1536  Caller : general
1537  Status : Stable
1538 
1539 =cut
1540 
1541 sub fetch_by_exon_stable_id{
1542  my ($self,$exonid,$size) = @_;
1543 
1544  throw('Exon argument is required.') if(!$exonid);
1545 
1546  my $ea = $self->db->get_ExonAdaptor;
1547  my $exon = $ea->fetch_by_stable_id($exonid);
1548 
1549  throw("Exon [$exonid] does not exist in DB.") if(!$exon);
1550 
1551  return $self->fetch_by_Feature($exon, $size);
1552 }
1553 
1554 =head2 fetch_by_transcript_stable_id
1555 
1556  Arg [1] : string $transcriptid
1557  The stable id of the transcript around which the slice is
1558  desired
1559  Arg [2] : (optional) int $size
1560  The length of the flanking regions the slice should encompass
1561  on either side of the transcript (0 by default)
1562  Example : $slc = $sa->fetch_by_transcript_stable_id('ENST00000302930',10);
1563  Description: Creates a slice around the region of the specified transcript.
1564  If a context size is given, the slice is extended by that
1565  number of basepairs on either side of the
1566  transcript.
1567 
1568  The slice will be created in the transcript's native coordinate
1569  system and in the forward orientation.
1570  Returntype : Bio::EnsEMBL::Slice
1571  Exceptions : Thrown if the transcript is not in the database.
1572  Caller : general
1573  Status : Stable
1574 
1575 =cut
1576 
1577 sub fetch_by_transcript_stable_id{
1578  my ($self,$transcriptid,$size) = @_;
1579 
1580  throw('Transcript argument is required.') if(!$transcriptid);
1581 
1582  my $ta = $self->db->get_TranscriptAdaptor;
1583  my $transcript = $ta->fetch_by_stable_id($transcriptid);
1584 
1585  throw("Transcript [$transcriptid] does not exist in DB.") if(!$transcript);
1586 
1587  return $self->fetch_by_Feature($transcript, $size);
1588 }
1589 
1590 =head2 fetch_by_transcript_id
1591 
1592  Arg [1] : int $transcriptid
1593  The unique database identifier of the transcript around which
1594  the slice is desired
1595  Arg [2] : (optional) int $size
1596  The length of the flanking regions the slice should encompass
1597  on either side of the transcript (0 by default)
1598  Example : $slc = $sa->fetch_by_transcript_id(24, 1000);
1599  Description: Creates a slice around the region of the specified transcript.
1600  If a context size is given, the slice is extended by that
1601  number of basepairs on either side of the
1602  transcript.
1603 
1604  The slice will be created in the transcript's native coordinate
1605  system and in the forward orientation.
1606  Returntype : Bio::EnsEMBL::Slice
1607  Exceptions : throw on incorrect args
1608  throw if transcript is not in database
1609  Caller : general
1610  Status : Stable
1611 
1612 =cut
1613 
1614 sub fetch_by_transcript_id {
1615  my ($self,$transcriptid,$size) = @_;
1616 
1617  throw('Transcript id argument is required.') if(!$transcriptid);
1618 
1619  my $transcript_adaptor = $self->db()->get_TranscriptAdaptor();
1620  my $transcript = $transcript_adaptor->fetch_by_dbID($transcriptid);
1621 
1622  throw("Transcript [$transcriptid] does not exist in DB.") if(!$transcript);
1623 
1624  return $self->fetch_by_Feature($transcript, $size);
1625 }
1626 
1627 
1628 
1629 =head2 fetch_by_gene_stable_id
1630 
1631  Arg [1] : string $geneid
1632  The stable id of the gene around which the slice is
1633  desired
1634  Arg [2] : (optional) int $size
1635  The length of the flanking regions the slice should encompass
1636  on either side of the gene (0 by default)
1637  Example : $slc = $sa->fetch_by_gene_stable_id('ENSG00000012123',10);
1638  Description: Creates a slice around the region of the specified gene.
1639  If a context size is given, the slice is extended by that
1640  number of basepairs on either side of the gene.
1641 
1642  The slice will be created in the gene's native coordinate system
1643  and in the forward orientation.
1644  Returntype : Bio::EnsEMBL::Slice
1645  Exceptions : throw on incorrect args
1646  throw if transcript does not exist
1647  Caller : general
1648  Status : Stable
1649 
1650 =cut
1651 
1652 sub fetch_by_gene_stable_id {
1653  my ($self,$geneid,$size) = @_;
1654 
1655  throw('Gene argument is required.') if(!$geneid);
1656 
1657  my $gene_adaptor = $self->db->get_GeneAdaptor();
1658  my $gene = $gene_adaptor->fetch_by_stable_id($geneid);
1659 
1660  throw("Gene [$geneid] does not exist in DB.") if(!$gene);
1661 
1662  return $self->fetch_by_Feature($gene, $size);
1663 }
1664 
1665 
1666 
1667 =head2 fetch_by_Feature
1668 
1669  Arg [1] : Bio::EnsEMBL::Feature $feat
1670  The feature to fetch the slice around
1671  Arg [2] : int size (optional)
1672  The desired number of flanking basepairs around the feature.
1673  The size may also be provided as a percentage of the feature
1674  size such as 200% or 80.5%.
1675  Example : $slice = $slice_adaptor->fetch_by_Feature($feat, 100);
1676  Description: Retrieves a slice around a specific feature. All this really
1677  does is return a resized version of the slice that the feature
1678  is already on. Note that slices returned from this method
1679  are always on the forward strand of the seq_region regardless of
1680  the strandedness of the feature passed in.
1681  Returntype : Bio::EnsEMBL::Slice
1682  Exceptions : throw if the feature does not have an attached slice
1683  throw if feature argument is not provided
1684  Caller : fetch_by_gene_stable_id, fetch_by_transcript_stable_id,
1685  fetch_by_gene_id, fetch_by_transcript_id
1686  Status : Stable
1687 
1688 =cut
1689 
1690 sub fetch_by_Feature{
1691  my ($self, $feature, $size) = @_;
1692 
1693  $size ||= 0;
1694 
1695  if(!ref($feature) || !$feature->isa('Bio::EnsEMBL::Feature')) {
1696  throw('Feature argument expected.');
1697  }
1698 
1699  my $slice = $feature->slice();
1700  if(!$slice || !($slice->isa('Bio::EnsEMBL::Slice') or $slice->isa('Bio::EnsEMBL::LRGSlice') )) {
1701  throw('Feature must be attached to a valid slice.');
1702  }
1703 
1704 
1705  my $fstart = $feature->start();
1706  my $fend = $feature->end();
1707  if(!defined($fstart) || !defined($fend)) {
1708  throw('Feature must have defined start and end.');
1709  }
1710 
1711  #convert the feature slice coordinates to seq_region coordinates
1712  my $slice_start = $slice->start();
1713  my $slice_end = $slice->end();
1714  my $slice_strand = $slice->strand();
1715  if($slice_start != 1 || $slice_strand != 1) {
1716  if($slice_strand == 1) {
1717  $fstart = $fstart + $slice_start - 1;
1718  $fend = $fend + $slice_start - 1;
1719  } else {
1720  my $tmp_start = $fstart;
1721  $fstart = $slice_end - $fend + 1;
1722  $fend = $slice_end - $tmp_start + 1;
1723  }
1724  }
1725 
1726  ## Size may be stored as a %age of the length of the feature
1727  ## Size = 100% gives no context
1728  ## Size = 200% gives context - 50% the size of the feature either side of
1729  ## feature
1730 
1731  $size = int( ($1-100)/200 * ($fend-$fstart+1) ) if( $size =~/([\d+\.]+)%/ );
1732 
1733  #return a new slice covering the region of the feature
1734  my $S = Bio::EnsEMBL::Slice->new_fast({
1735  'seq_region_name' => $slice->seq_region_name,
1736  'seq_region_length' => $slice->seq_region_length,
1737  'coord_system' => $slice->coord_system,
1738  'start' => $fstart - $size,
1739  'end' => $fend + $size,
1740  'strand' => 1,
1741  'adaptor' => $self});
1742  $S->{'_raw_feature_strand'} = $feature->strand * $slice_strand if $feature->can('strand');
1743  return $S;
1744 }
1745 
1746 
1747 
1748 =head2 fetch_by_misc_feature_attribute
1749 
1750  Arg [1] : string $attribute_type
1751  The code of the attribute type
1752  Arg [2] : (optional) string $attribute_value
1753  The value of the attribute to fetch by
1754  Arg [3] : (optional) int $size
1755  The amount of flanking region around the misc feature desired.
1756  Example : $slice = $sa->fetch_by_misc_feature_attribute('superctg',
1757  'NT_030871');
1758  $slice = $sa->fetch_by_misc_feature_attribute('synonym',
1759  'AL00012311',
1760  $flanking);
1761  Description: Fetches a slice around a MiscFeature with a particular
1762  attribute type and value. If no value is specified then
1763  the feature with the particular attribute is used.
1764  If no size is specified then 0 is used.
1765  Returntype : Bio::EnsEMBL::Slice
1766  Exceptions : Throw if no feature with the specified attribute type and value
1767  exists in the database
1768  Warning if multiple features with the specified attribute type
1769  and value exist in the database.
1770  Caller : webcode
1771  Status : Stable
1772 
1773 =cut
1774 
1775 sub fetch_by_misc_feature_attribute {
1776  my ($self, $attrib_type_code, $attrib_value, $size) = @_;
1777 
1778  my $mfa = $self->db()->get_MiscFeatureAdaptor();
1779 
1780  my $feats = $mfa->fetch_all_by_attribute_type_value($attrib_type_code,
1781  $attrib_value);
1782 
1783  if(@$feats == 0) {
1784  throw("MiscFeature with $attrib_type_code=$attrib_value does " .
1785  "not exist in DB.");
1786  }
1787 
1788  if(@$feats > 1) {
1789  warning("MiscFeature with $attrib_type_code=$attrib_value is " .
1790  "ambiguous - using first one found.");
1791  }
1792 
1793  my ($feat) = @$feats;
1794 
1795  return $self->fetch_by_Feature($feat, $size);
1796 }
1797 
1798 =head2 fetch_by_misc_feature_set
1799 
1800  Arg [1] : string $attribute_type
1801  The code of the attribute type
1802  Arg [2] : (optional) string $attribute_value
1803  The value of the attribute to fetch by
1804  Arg [3] : (optional) the name of the set
1805  Arg [4] : (optional) int $size
1806  The amount of flanking region around the misc feature desired.
1807  Example : $slice = $sa->fetch_by_misc_feature_set('clone',
1808  'RP11-411G9'
1809  'tilepath');
1810  Description: Fetches a slice around a MiscFeature with a particular
1811  attribute type, value and set. If no value is specified then
1812  the feature with the particular attribute is used.
1813  A size can be specified to include flanking region
1814  If no size is specified then 0 is used.
1815  Returntype : Bio::EnsEMBL::Slice
1816  Exceptions : Throw if no feature with the specified attribute type, value and set
1817  exists in the database
1818  Warning if multiple features with the specified attribute type, set
1819  and value exist in the database.
1820  Caller : webcode
1821  Status : Stable
1822 
1823 =cut
1824 
1825 sub fetch_by_misc_feature_set {
1826  my ($self, $attrib_type_code, $attrib_value, $misc_set, $size) = @_;
1827 
1828  my $mfa = $self->db()->get_MiscFeatureAdaptor();
1829 
1830  my $feat = $mfa->fetch_by_attribute_set_value($attrib_type_code,
1831  $attrib_value,
1832  $misc_set);
1833 
1834  return $self->fetch_by_Feature($feat, $size);
1835 }
1836 
1837 =head2 fetch_normalized_slice_projection
1838 
1839  Arg [1] : Bio::EnsEMBL::Slice $slice
1840  Arg [2] : boolean $filter_projections
1841  Optionally filter the projections to remove anything
1842  which is the same sequence region as the given slice
1843  Example : ( optional )
1844  Description: gives back a project style result. The returned slices
1845  represent the areas to which there are symlinks for the
1846  given slice. start, end show which area on given slice is
1847  symlinked
1848  Returntype : [[start,end,$slice][]]
1849  Exceptions : none
1850  Caller : BaseFeatureAdaptor
1851  Status : Stable
1852 
1853 =cut
1854 
1855 
1856 sub fetch_normalized_slice_projection {
1857  my $self = shift;
1858  my $slice = shift;
1859  my $filter_projections = shift;
1860 
1861  my $slice_seq_region_id = $self->get_seq_region_id( $slice );
1862 
1863  $self->_build_exception_cache() if(!exists($self->{'asm_exc_cache'}));
1864 
1865  my $result = $self->{'asm_exc_cache'}->{$slice_seq_region_id};
1866 
1867  $result ||= [];
1868 
1869  my (@haps, @pars);
1870 
1871  foreach my $row (@$result) {
1872  my ( $seq_region_id, $seq_region_start, $seq_region_end,
1873  $exc_type, $exc_seq_region_id, $exc_seq_region_start,
1874  $exc_seq_region_end ) = @$row;
1875 
1876  # need overlapping PAR and all HAPs if any
1877  if( $exc_type eq "PAR" ) {
1878  if( $seq_region_start <= $slice->end() &&
1879  $seq_region_end >= $slice->start() ) {
1880  push( @pars, [ $seq_region_start, $seq_region_end, $exc_seq_region_id,
1881  $exc_seq_region_start, $exc_seq_region_end ] );
1882  }
1883  } else {
1884  push( @haps, [ $seq_region_start, $seq_region_end, $exc_seq_region_id,
1885  $exc_seq_region_start, $exc_seq_region_end ] );
1886  }
1887  }
1888 
1889  if(!@pars && !@haps) {
1890  #just return this slice, there were no haps or pars
1891  return [bless ( [1,$slice->length, $slice], "Bio::EnsEMBL::ProjectionSegment")];
1892  }
1893 
1894  my @syms;
1895  if( @haps >= 1 ) {
1896  my @sort_haps = sort { $a->[1] <=> $b->[1] } @haps;
1897 
1898  my $count =0;
1899  my $chr_start = 1;
1900  my $hap_start = 1;
1901  my $last = 0;
1902 
1903  my $seq_region_slice = $self->fetch_by_seq_region_id($slice_seq_region_id);
1904  my $exc_slice = $self->fetch_by_seq_region_id( $sort_haps[0][2] );
1905  my $len1 = $seq_region_slice->length();
1906  my $len2 = $exc_slice->length();
1907  my $max_len = ($len1 > $len2) ? $len1 : $len2;
1908 
1909  while($count <= scalar(@sort_haps) and !$last){
1910  my $chr_end;
1911  my $hap_end;
1912  if(defined($sort_haps[$count]) and defined($sort_haps[$count][0]) ){
1913  $hap_end = $sort_haps[$count][0]-1;
1914  $chr_end = $sort_haps[$count][3]-1
1915  }
1916  else{
1917  $last = 1;
1918  $hap_end = $len1;
1919  $chr_end = $len2;
1920  my $diff = ($hap_end-$hap_start)-($chr_end-$chr_start);
1921  if($diff > 0){
1922  push( @syms, [ $hap_start, $hap_end, $sort_haps[0][2], $chr_start, $chr_end+$diff] );
1923  }
1924  elsif($diff < 0){
1925  push( @syms, [ $hap_start, $hap_end - $diff, $sort_haps[0][2], $chr_start, $chr_end] );
1926  }
1927  else{
1928  push( @syms, [ $hap_start, $hap_end, $sort_haps[0][2], $chr_start, $chr_end] );
1929  }
1930  next;
1931  }
1932  if($hap_end and $hap_start < $len1){ # if hap at start or end of chromosome
1933  push( @syms, [ $hap_start, $hap_end, $sort_haps[0][2], $chr_start, $chr_end] );
1934  }
1935  $chr_start = $chr_end + ($sort_haps[$count][4]-$sort_haps[$count][3]) + 2;
1936  $hap_start = $hap_end + ($sort_haps[$count][1]-$sort_haps[$count][0]) + 2;
1937  $count++;
1938  }
1939 
1940 
1941  }
1942 
1943 
1944  # for now haps and pars should not be both there, but in theory we
1945  # could handle it here by cleverly merging the pars into the existing syms,
1946  # for now just:
1947  push( @syms, @pars );
1948 
1949  my $mapper = Bio::EnsEMBL::Mapper->new( "sym", "org" );
1950  my $count = 0;
1951  for my $sym ( @syms ) {
1952  $mapper->add_map_coordinates( $slice_seq_region_id, $sym->[0], $sym->[1],
1953  1, $sym->[2], $sym->[3], $sym->[4] );
1954  }
1955 
1956  my @linked = $mapper->map_coordinates( $slice_seq_region_id,
1957  $slice->start(), $slice->end(),
1958  $slice->strand(), "sym" );
1959 
1960  # gaps are regions where there is no mapping to another region
1961  my $rel_start = 1;
1962 
1963  #if there was only one coord and it is a gap, we know it is just the
1964  #same slice with no overlapping symlinks
1965  if(@linked == 1 && $linked[0]->isa('Bio::EnsEMBL::Mapper::Gap')) {
1966  return [bless( [1,$slice->length, $slice], "Bio::EnsEMBL::ProjectionSegment" )];
1967  }
1968 
1969  my @out;
1970  for my $coord ( @linked ) {
1971  if( $coord->isa( "Bio::EnsEMBL::Mapper::Gap" )) {
1972  my $exc_slice = Bio::EnsEMBL::Slice->new_fast({
1973  'start' => $coord->start(),
1974  'end' => $coord->end(),
1975  'strand' => $slice->strand(),
1976  'coord_system' => $slice->coord_system(),
1977  'adaptor' => $self,
1978  'seq_region_name' => $slice->seq_region_name(),
1979  'seq_region_length' => $slice->seq_region_length()});
1980  push( @out, bless ( [ $rel_start, $coord->length()+$rel_start-1,
1981  $exc_slice ], "Bio::EnsEMBL::ProjectionSegment") );
1982  } else {
1983  my $exc_slice = $self->fetch_by_seq_region_id( $coord->id() );
1984  my $exc2_slice = Bio::EnsEMBL::Slice->new_fast({
1985 
1986  'start' => $coord->start(),
1987  'end' => $coord->end(),
1988  'strand' => $coord->strand(),
1989  'seq_region_name' => $exc_slice->seq_region_name(),
1990  'seq_region_length' => $exc_slice->seq_region_length(),
1991  'coord_system' => $exc_slice->coord_system(),
1992  'adaptor' => $self
1993  });
1994 
1995  push( @out, bless( [ $rel_start, $coord->length() + $rel_start - 1,
1996  $exc2_slice ], "Bio::EnsEMBL::ProjectionSegment") );
1997  }
1998  $rel_start += $coord->length();
1999  }
2000 
2001  if($filter_projections) {
2002  return $self->_filter_Slice_projections($slice, \@out);
2003  }
2004  return \@out;
2005 }
2006 
2007 =head2 _filter_Slice_projections
2008 
2009  Arg [1] : Bio::EnsEMBL::Slice The slice the projections were made from
2010  Arg [2] : Array The projections which were fetched from the previous slice
2011  Description : Removes any projections which occur within the same sequence
2012  region as the given Slice object
2013  Returntype : ArrayRef Bio::EnsEMBL::ProjectionSegment; Returns an array
2014  of projected segments
2015 =cut
2016 
2017 sub _filter_Slice_projections {
2018  my ($self, $slice, $projections) = @_;
2019  my @proj = @{ $projections };
2020  if ( !@proj ) {
2021  throw('Was not given any projections to filter. Database may have incorrect assembly_exception information loaded');
2022  }
2023 
2024  # Want to get features on the FULL original slice as well as any
2025  # symlinked slices.
2026 
2027  # Filter out partial slices from projection that are on same
2028  # seq_region as original slice.
2029 
2030  my $sr_id = $slice->get_seq_region_id();
2031 
2032  @proj = grep { $_->to_Slice->get_seq_region_id() != $sr_id } @proj;
2033 
2034  my $segment = bless( [ 1, $slice->length(), $slice ],
2035  'Bio::EnsEMBL::ProjectionSegment' );
2036  push( @proj, $segment );
2037  return \@proj;
2038 }
2039 
2040 
2041 =head2 store
2042 
2043  Arg [1] : Bio::EnsEMBL::Slice $slice
2044  Arg [2] : (optional) $seqref reference to a string
2045  The sequence associated with the slice to be stored.
2046  Example : $slice = Bio::EnsEMBL::Slice->new(...);
2047  $seq_region_id = $slice_adaptor->store($slice, \$sequence);
2048  Description: This stores a slice as a sequence region in the database
2049  and returns the seq region id. The passed in slice must
2050  start at 1, and must have a valid seq_region name and coordinate
2051  system. The attached coordinate system must already be stored in
2052  the database. The sequence region is assumed to start at 1 and
2053  to have a length equalling the length of the slice. The end of
2054  the slice must equal the seq_region_length.
2055  If the slice coordinate system is the sequence level coordinate
2056  system then the seqref argument must also be passed. If the
2057  slice coordinate system is NOT a sequence level coordinate
2058  system then the sequence argument cannot be passed.
2059  Returntype : int
2060  Exceptions : throw if slice has no coord system.
2061  throw if slice coord system is not already stored.
2062  throw if slice coord system is seqlevel and no sequence is
2063  provided.
2064  throw if slice coord system is not seqlevel and sequence is
2065  provided.
2066  throw if slice does not start at 1
2067  throw if sequence is provided and the sequence length does not
2068  match the slice length.
2069  throw if the SQL insert fails (e.g. on duplicate seq region)
2070  throw if slice argument is not passed
2071  throw if the slice end is not equal to seq_region_length
2072  Caller : database loading scripts
2073  Status : Stable
2074 
2075 =cut
2076 
2077 
2078 
2079 sub store {
2080  my $self = shift;
2081  my $slice = shift;
2082  my $seqref = shift;
2083  my $not_dna = shift;
2084 
2085  #
2086  # Get all of the sanity checks out of the way before storing anything
2087  #
2088 
2089  if(!ref($slice) || !($slice->isa('Bio::EnsEMBL::Slice') or $slice->isa('Bio::EnsEMBL::LRGSlice'))) {
2090  throw('Slice argument is required');
2091  }
2092 
2093  my $cs = $slice->coord_system();
2094  throw("Slice must have attached CoordSystem.") if(!$cs);
2095 
2096  my $db = $self->db();
2097  if(!$cs->is_stored($db)) {
2098  throw("Slice CoordSystem must already be stored in DB.")
2099  }
2100 
2101  if($slice->start != 1 || $slice->strand != 1) {
2102  throw("Slice must have start==1 and strand==1.");
2103  }
2104 
2105  if($slice->end() != $slice->seq_region_length()) {
2106  throw("Slice must have end==seq_region_length");
2107  }
2108 
2109  my $sr_len = $slice->length();
2110  my $sr_name = $slice->seq_region_name();
2111 
2112  if($sr_name eq '') {
2113  throw("Slice must have valid seq region name.");
2114  }
2115 
2116  if($cs->is_sequence_level() && !$not_dna) {
2117  if(!$seqref) {
2118  throw("Must provide sequence for sequence level coord system.");
2119  }
2120  if(ref($seqref) ne 'SCALAR') {
2121  throw("Sequence must be a scalar reference.");
2122  }
2123  my $seq_len = length($$seqref);
2124 
2125  if($seq_len != $sr_len) {
2126  throw("Sequence length ($seq_len) must match slice length ($sr_len).");
2127  }
2128  } else {
2129  if($seqref) {
2130  throw("Cannot provide sequence for non-sequence level seq regions.");
2131  }
2132  }
2133 
2134  #store the seq_region
2135 
2136  my $sth = $db->dbc->prepare("INSERT INTO seq_region " .
2137  " ( name, length, coord_system_id ) " .
2138  " VALUES ( ?, ?, ? )"
2139  );
2140 
2141  $sth->bind_param(1,$sr_name,SQL_VARCHAR);
2142  $sth->bind_param(2,$sr_len,SQL_INTEGER);
2143  $sth->bind_param(3,$cs->dbID,SQL_INTEGER);
2144 
2145  $sth->execute();
2146 
2147  my $seq_region_id = $self->last_insert_id('seq_region_id', undef, 'seq_region');
2148 
2149  if(!$seq_region_id) {
2150  throw("Database seq_region insertion failed.");
2151  }
2152 
2153  if($cs->is_sequence_level() && !$not_dna) {
2154  #store sequence if it was provided
2155  my $seq_adaptor = $db->get_SequenceAdaptor();
2156  $seq_adaptor->store($seq_region_id, $$seqref);
2157  }
2158 
2159  #synonyms
2160  if(defined($slice->{'synonym'})){
2161  foreach my $syn (@{$slice->{'synonym'}} ){
2162  $syn->seq_region_id($seq_region_id); # set the seq_region_id
2163  $syn->adaptor->store($syn);
2164  }
2165  }
2166 
2167 
2168  $slice->adaptor($self);
2169 
2170  return $seq_region_id;
2171 }
2172 
2173 
2174 sub update {
2175  my $self = shift;
2176  my $slice = shift;
2177 
2178  #
2179  # Get all of the sanity checks out of the way before storing anything
2180  #
2181 
2182  if(!ref($slice) || !($slice->isa('Bio::EnsEMBL::Slice') or $slice->isa('Bio::EnsEMBL::LRGSlice'))) {
2183  throw('Slice argument is required');
2184  }
2185 
2186  my $cs = $slice->coord_system();
2187  throw("Slice must have attached CoordSystem.") if(!$cs);
2188 
2189  my $db = $self->db();
2190 
2191  if($slice->start != 1 || $slice->strand != 1) {
2192  throw("Slice must have start==1 and strand==1.");
2193  }
2194 
2195  if($slice->end() != $slice->seq_region_length()) {
2196  throw("Slice must have end==seq_region_length");
2197  }
2198 
2199  my $sr_len = $slice->length();
2200  my $sr_name = $slice->seq_region_name();
2201 
2202  if(!$sr_name) {
2203  throw("Slice must have valid seq region name.");
2204  }
2205 
2206  #update the seq_region
2207 
2208  my $seq_region_id = $slice->get_seq_region_id();
2209  my $update_sql = qq(
2210  UPDATE seq_region
2211  SET name = ?,
2212  length = ?,
2213  coord_system_id = ?
2214  WHERE seq_region_id = ?
2215  );
2216 
2217  my $sth = $db->dbc->prepare($update_sql);
2218 
2219  $sth->bind_param(1,$sr_name,SQL_VARCHAR);
2220  $sth->bind_param(2,$sr_len,SQL_INTEGER);
2221  $sth->bind_param(3,$cs->dbID,SQL_INTEGER);
2222 
2223  $sth->bind_param(4, $seq_region_id, SQL_INTEGER);
2224 
2225  $sth->execute();
2226 
2227  #synonyms
2228  if(defined($slice->{'synonym'})){
2229  foreach my $syn (@{$slice->{'synonym'}} ){
2230  $syn->seq_region_id($seq_region_id); # set the seq_region_id
2231  my $syn_adaptor = $db->get_SeqRegionSynonymAdaptor();
2232  $syn_adaptor->store($syn);
2233  }
2234  }
2235 }
2236 
2237 =head2 remove
2238 
2239  Arg [1] : Bio::EnsEMBL::Slice $slice
2240  The slice to remove from the database
2241  Example : $slice_adaptor->remove($slice);
2242  Description: Removes a slice completely from the database.
2243  All associated seq_region_attrib are removed as well.
2244  If dna is attached to the slice, it is also removed.
2245  Returntype : none
2246  Exceptions : throw if slice has no coord system.
2247  throw if slice argument is not passed
2248  warning if slice is not stored in this database
2249  Caller : general
2250  Status : Stable
2251 
2252 =cut
2253 
2254 
2255 sub remove {
2256  my $self = shift;
2257  my $slice = shift;
2258 
2259  #
2260  # Get all of the sanity checks out of the way before storing anything
2261  #
2262 
2263  if(!ref($slice) || !($slice->isa('Bio::EnsEMBL::Slice') or $slice->isa('Bio::EnsEMBL::LRGSlice'))) {
2264  throw('Slice argument is required');
2265  }
2266 
2267  my $cs = $slice->coord_system();
2268  throw("Slice must have attached CoordSystem.") if(!$cs);
2269 
2270  my $db = $self->db();
2271  my $seq_region_id = $slice->get_seq_region_id();
2272 
2273  if ($cs->is_sequence_level()) {
2274  my $seq_adaptor = $db->get_SequenceAdaptor();
2275  $seq_adaptor->remove($seq_region_id);
2276  }
2277 
2278  if(defined($slice->{'synonym'})){
2279  foreach my $syn (@{$slice->{'synonym'}} ){
2280  $syn->seq_region_id($seq_region_id); # set the seq_region_id
2281  $syn->adaptor->remove($syn);
2282  }
2283  }
2284 
2285  my $attrib_adaptor = $self->db->get_AttributeAdaptor();
2286  $attrib_adaptor->remove_from_Slice($slice);
2287  $self->remove_assembly($slice);
2288 
2289  my $sr_name = $slice->seq_region_name();
2290 
2291  #remove the seq_region
2292 
2293  my $sth = $db->dbc->prepare("DELETE FROM seq_region " .
2294  "WHERE name = ? AND " .
2295  " coord_system_id = ?" );
2296 
2297  $sth->bind_param(1,$sr_name,SQL_VARCHAR);
2298  $sth->bind_param(2,$cs->dbID,SQL_INTEGER);
2299 
2300  $sth->execute();
2301 
2302  return;
2303 }
2304 
2305 
2306 =head2 remove_assembly
2307 
2308  Arg [1] : Bio::EnsEMBL::Slice $asm_slice or $cmp_slice
2309  Example : $slice_adaptor->remove_assembly( $slice );
2310  Description: Deletes from the assembly table
2311  where asm or cmp corresponds to slice
2312  Do not call this method unless you really know what you are doing
2313  Returntype : none
2314  Exceptions : throw if slice has no coord system (cs).
2315  throw if no slice provided, or argument is not a slice
2316  Caller : Internal
2317  Status : Experimental
2318 
2319 =cut
2320 
2321 
2322 sub remove_assembly {
2323  my $self = shift;
2324  my $slice = shift;
2325 
2326  if(!ref($slice) || !($slice->isa('Bio::EnsEMBL::Slice') or $slice->isa('Bio::EnsEMBL::LRGSlice'))) {
2327  throw('Slice argument is required');
2328  }
2329 
2330  my $cs = $slice->coord_system();
2331  throw("Slice must have attached CoordSystem.") if(!$cs);
2332 
2333  #
2334  # Checks complete. Delete the data
2335  #
2336  my $sth = $self->db->dbc->prepare
2337  ("DELETE FROM assembly " .
2338  "WHERE asm_seq_region_id = ? OR " .
2339  " cmp_seq_region_id = ? ");
2340 
2341  my $asm_seq_region_id = $self->get_seq_region_id( $slice );
2342  my $cmp_seq_region_id = $self->get_seq_region_id( $slice );
2343 
2344  $sth->bind_param(1,$asm_seq_region_id,SQL_INTEGER);
2345  $sth->bind_param(2,$cmp_seq_region_id,SQL_INTEGER);
2346 
2347  $sth->execute();
2348  $sth->finish();
2349 
2350  return;
2351 
2352 }
2353 
2354 =head2 fetch_assembly
2355 
2356  Arg [1] : Bio::EnsEMBL::Slice $asm_slice
2357  Arg [2] : Bio::EnsEMBL::Slice $cmp_slice
2358  Example : $asm = $slice_adaptor->fetch_assembly( $slice1, $slice2 );
2359  Description: Fetches from the assembly table based on the
2360  coordinates of the two slices supplied.
2361  Returns a mapper object mapping the two slices
2362  Do not call this method unless you really know what you are doing
2363  Returntype : Bio::EnsEMBL::Mapper
2364  Exceptions : throw if either slice has no coord system (cs).
2365  throw if there is no mapping path between coord systems
2366  throw if there are existing mappings between either slice
2367  and the oposite cs
2368  Caller : Internal
2369  Status : Experimental
2370 
2371 =cut
2372 
2373 
2374 sub fetch_assembly {
2375  my $self = shift;
2376  my $asm_slice = shift;
2377  my $cmp_slice = shift;
2378 
2379  if(!ref($asm_slice) || !($asm_slice->isa('Bio::EnsEMBL::Slice') or $asm_slice->isa('Bio::EnsEMBL::LRGSlice'))) {
2380  throw('Assembled Slice argument is required');
2381  }
2382  if(!ref($cmp_slice) || !($cmp_slice->isa('Bio::EnsEMBL::Slice') or $cmp_slice->isa('Bio::EnsEMBL::LRGSlice')) ) {
2383  throw('Assembled Slice argument is required');
2384  }
2385 
2386  my $asm_cs = $asm_slice->coord_system();
2387  throw("Slice must have attached CoordSystem.") if(!$asm_cs);
2388  my $cmp_cs = $cmp_slice->coord_system();
2389  throw("Slice must have attached CoordSystem.") if(!$cmp_cs);
2390 
2391  my @path =
2392  @{ $asm_cs->adaptor()->get_mapping_path( $asm_cs, $cmp_cs ) };
2393 
2394  if ( !@path ) {
2395  throw("No mapping path defined between "
2396  . $asm_cs->name() . " and "
2397  . $cmp_cs->name() );
2398  }
2399 
2400  #
2401  # Checks complete. Fetch the data
2402  #
2403  my $sth = $self->db->dbc->prepare
2404  ("SELECT * FROM assembly " .
2405  "WHERE asm_seq_region_id = ? AND " .
2406  " cmp_seq_region_id = ? AND " .
2407  " asm_start = ? AND " .
2408  " asm_end = ? AND " .
2409  " cmp_start = ? AND " .
2410  " cmp_end = ? AND " .
2411  " ori = ?" );
2412 
2413  my $asm_seq_region_id = $self->get_seq_region_id( $asm_slice );
2414  my $cmp_seq_region_id = $self->get_seq_region_id( $cmp_slice );
2415  my $ori = $asm_slice->strand * $cmp_slice->strand;
2416 
2417  $sth->bind_param(1,$asm_seq_region_id,SQL_INTEGER);
2418  $sth->bind_param(2,$cmp_seq_region_id,SQL_INTEGER);
2419  $sth->bind_param(3,$asm_slice->start,SQL_INTEGER);
2420  $sth->bind_param(4,$asm_slice->end,SQL_INTEGER);
2421  $sth->bind_param(5,$cmp_slice->start,SQL_INTEGER);
2422  $sth->bind_param(6,$cmp_slice->end,SQL_INTEGER);
2423  $sth->bind_param(7,$ori,SQL_INTEGER);
2424 
2425  $sth->execute();
2426 
2427  my $results = $sth->fetchrow_array();
2428  $sth->finish();
2429 
2430  my $mapper;
2431  if ($results) {
2432  $mapper = Bio::EnsEMBL::Mapper->new($asm_slice, $cmp_slice, $asm_cs, $cmp_cs);
2433  }
2434 
2435  return $mapper;
2436 
2437 }
2438 
2439 =head2 store_assembly
2440 
2441  Arg [1] : Bio::EnsEMBL::Slice $asm_slice
2442  Arg [2] : Bio::EnsEMBL::Slice $cmp_slice
2443  Example : $asm = $slice_adaptor->store_assembly( $slice1, $slice2 );
2444  Description: Creates an entry in the assembly table based on the
2445  coordinates of the two slices supplied. Returns a string
2446  representation of the assembly that gets created.
2447  Returntype : string
2448  Exceptions : throw if either slice has no coord system (cs).
2449  throw unless the cs rank of the asm_slice is lower than the
2450  cmp_slice.
2451  throw if there is no mapping path between coord systems
2452  throw if the lengths of each slice are not equal
2453  throw if there are existing mappings between either slice
2454  and the oposite cs
2455  Caller : database loading scripts
2456  Status : Experimental
2457 
2458 =cut
2459 
2460 sub store_assembly{
2461  my $self = shift;
2462  my $asm_slice = shift;
2463  my $cmp_slice = shift;
2464 
2465  #
2466  # Get all of the sanity checks out of the way before storing anything
2467  #
2468 
2469  if(!ref($asm_slice) || !($asm_slice->isa('Bio::EnsEMBL::Slice') or $asm_slice->isa('Bio::EnsEMBL::LRGSlice'))) {
2470  throw('Assembled Slice argument is required');
2471  }
2472  if(!ref($cmp_slice) || !($cmp_slice->isa('Bio::EnsEMBL::Slice') or $cmp_slice->isa('Bio::EnsEMBL::LRGSlice')) ) {
2473  throw('Assembled Slice argument is required');
2474  }
2475 
2476  my $asm_cs = $asm_slice->coord_system();
2477  throw("Slice must have attached CoordSystem.") if(!$asm_cs);
2478  my $cmp_cs = $cmp_slice->coord_system();
2479  throw("Slice must have attached CoordSystem.") if(!$cmp_cs);
2480 
2481  my @path =
2482  @{ $asm_cs->adaptor()->get_mapping_path( $asm_cs, $cmp_cs ) };
2483 
2484  if ( !@path ) {
2485  throw("No mapping path defined between "
2486  . $asm_cs->name() . " and "
2487  . $cmp_cs->name() );
2488  }
2489 
2490  if( $asm_slice->length != $cmp_slice->length ){
2491  throw("The lengths of the assembled and component slices are not equal" );
2492  }
2493 
2494  # For now we disallow any existing mappings between the asm slice and cmp
2495  # CoordSystem and vice-versa.
2496  # Some cases of multiple mappings may be allowable by the API, but their
2497  # logic needs to be coded below.
2498 
2499  my $asm_proj = $asm_slice->project( $cmp_cs->name, $cmp_cs->version );
2500  if( @$asm_proj && $cmp_cs->name ne 'lrg' && $asm_cs->name ne 'lrg'){
2501  throw("Regions of the assembled slice are already assembled ".
2502  "into the component CoordSystem" );
2503  }
2504  my $cmp_proj = $cmp_slice->project( $asm_cs->name, $asm_cs->version );
2505  if( @$cmp_proj && $cmp_cs->name ne 'lrg' && $asm_cs->name ne 'lrg'){
2506  throw("Regions of the component slice are already assembled ".
2507  "into the assembled CoordSystem" );
2508  }
2509 
2510  #
2511  # Checks complete. Store the data
2512  #
2513  my $sth = $self->db->dbc->prepare
2514  ("INSERT INTO assembly " .
2515  " ( asm_seq_region_id, " .
2516  " cmp_seq_region_id, " .
2517  " asm_start, " .
2518  " asm_end , " .
2519  " cmp_start, " .
2520  " cmp_end , " .
2521  " ori )" .
2522  "VALUES ( ?, ?, ?, ?, ?, ?, ? )");
2523 
2524  my $asm_seq_region_id = $self->get_seq_region_id( $asm_slice );
2525  my $cmp_seq_region_id = $self->get_seq_region_id( $cmp_slice );
2526  my $ori = $asm_slice->strand * $cmp_slice->strand;
2527 
2528  $sth->bind_param(1,$asm_seq_region_id,SQL_INTEGER);
2529  $sth->bind_param(2,$cmp_seq_region_id,SQL_INTEGER);
2530  $sth->bind_param(3,$asm_slice->start,SQL_INTEGER);
2531  $sth->bind_param(4,$asm_slice->end,SQL_INTEGER);
2532  $sth->bind_param(5,$cmp_slice->start,SQL_INTEGER);
2533  $sth->bind_param(6,$cmp_slice->end,SQL_INTEGER);
2534  $sth->bind_param(7,$ori,SQL_INTEGER);
2535 
2536  $sth->execute();
2537 
2538  #use Data::Dumper qw( Dumper );
2539  #warn Dumper( $self->db->{seq_region_cache} );
2540  #$self->db->{seq_region_cache} = undef;
2541  #$self->_cache_seq_regions();
2542 
2543  my $ama = $self->db->get_AssemblyMapperAdaptor();
2544  $ama->delete_cache();
2545 
2546 
2547  return $asm_slice->name . "<>" . $cmp_slice->name;
2548 
2549 }
2550 
2551 
2552 =head2 prepare
2553 
2554  Arg [1] : String $sql
2555  Example : ( optional )
2556  Description: overrides the default adaptor prepare method.
2557  All slice sql will usually use the dna_db.
2558  Returntype : DBD::sth
2559  Exceptions : none
2560  Caller : internal, convenience method
2561  Status : Stable
2562 
2563 =cut
2564 
2565 sub prepare {
2566  my ( $self, $sql ) = @_;
2567  return $self->db()->dnadb()->dbc->prepare($sql);
2568 }
2569 
2570 sub _build_exception_cache {
2571  my $self = shift;
2572 
2573  # build up a cache of the entire assembly exception table
2574  # it should be small anyway
2575  my $sth =
2576  $self->prepare( 'SELECT ae.seq_region_id, ae.seq_region_start, '
2577  . 'ae.seq_region_end, ae.exc_type, ae.exc_seq_region_id, '
2578  . 'ae.exc_seq_region_start, ae.exc_seq_region_end '
2579  . 'FROM assembly_exception ae, '
2580  . 'seq_region sr, coord_system cs '
2581  . 'WHERE sr.seq_region_id = ae.seq_region_id '
2582  . 'AND sr.coord_system_id = cs.coord_system_id '
2583  . 'AND cs.species_id = ?' );
2584 
2585  $sth->bind_param( 1, $self->species_id(), SQL_INTEGER );
2586  $sth->execute();
2587 
2588  my %hash;
2589  $self->{'asm_exc_cache'} = \%hash;
2590 
2591  my $row;
2592  while ( $row = $sth->fetchrow_arrayref() ) {
2593  my @result = @$row;
2594  $hash{ $result[0] } ||= [];
2595  push( @{ $hash{ $result[0] } }, \@result );
2596  }
2597  $sth->finish();
2598 } ## end sub _build_exception_cache
2599 
2600 =head2 cache_toplevel_seq_mappings
2601 
2602  Args : none
2603  Example : $slice_adaptor->cache_toplevel_seq_mappings();
2604  Description: caches all the assembly mappings needed for genes
2605  Returntype : None
2606  Exceptions : None
2607  Caller : general
2608  Status : At Risk
2609  : New experimental code
2610 
2611 =cut
2612 
2613 sub cache_toplevel_seq_mappings {
2614  my ($self) = @_;
2615 
2616  # Get the sequence level to map too
2617 
2618  my $sql = (<<SSQL);
2619  SELECT name
2620  FROM coord_system
2621  WHERE attrib like "%sequence_level%"
2622  AND species_id = ?
2623 SSQL
2624 
2625  my $sth = $self->prepare($sql);
2626  $sth->bind_param( 1, $self->species_id(), SQL_INTEGER );
2627  $sth->execute();
2628 
2629  my $sequence_level = $sth->fetchrow_array();
2630 
2631  $sth->finish();
2632 
2633  my $csa = $self->db->get_CoordSystemAdaptor();
2634  my $ama = $self->db->get_AssemblyMapperAdaptor();
2635 
2636  my $cs1 = $csa->fetch_by_name($sequence_level);
2637 
2638  #get level to map too.
2639 
2640  $sql = (<<LSQL);
2641  SELECT DISTINCT(cs.name)
2642  FROM seq_region sr,
2643  seq_region_attrib sra,
2644  attrib_type at,
2645  coord_system cs
2646  WHERE sra.seq_region_id = sr.seq_region_id
2647  AND sra.attrib_type_id = at.attrib_type_id
2648  AND at.code = "toplevel"
2649  AND cs.coord_system_id = sr.coord_system_id
2650  AND cs.species_id = ?
2651 LSQL
2652 
2653  $sth = $self->prepare($sql);
2654  $sth->bind_param( 1, $self->species_id(), SQL_INTEGER );
2655  $sth->execute();
2656 
2657  while ( my $csn = $sth->fetchrow_array() ) {
2658  if ( $csn eq $sequence_level ) { next }
2659  my $cs2 = $csa->fetch_by_name($csn);
2660  my $am = $ama->fetch_by_CoordSystems( $cs1, $cs2 );
2661  $am->register_all();
2662  }
2663 
2664 } ## end sub cache_toplevel_seq_mappings
2665 
2666 
2667 sub _build_circular_slice_cache {
2668  my $self = shift;
2669 
2670  # build up a cache of circular sequence region ids
2671  my $sth =
2672  $self->prepare( "SELECT sra.seq_region_id FROM seq_region_attrib sra "
2673  . "INNER JOIN attrib_type at ON sra.attrib_type_id = at.attrib_type_id "
2674  . "INNER JOIN seq_region sr ON sra.seq_region_id = sr.seq_region_id "
2675  . "INNER JOIN coord_system cs ON sr.coord_system_id = cs.coord_system_id "
2676  . "WHERE code = 'circular_seq' and cs.species_id = ?");
2677 
2678  $sth->bind_param( 1, $self->species_id(), SQL_INTEGER );
2679  $sth->execute();
2680 
2681  my $id;
2682  my %hash;
2683  if ( ($id) = $sth->fetchrow_array() ) {
2684  $self->{'circular_sr_id_cache'} = \%hash;
2685  $self->{'is_circular'} = 1;
2686  $hash{ $id } = $id;
2687  while ( ($id) = $sth->fetchrow_array() ) {
2688  $hash{ $id } = $id;
2689  }
2690  } else {
2691  $self->{'is_circular'} = 0;
2692  }
2693  $sth->finish();
2694 } ## end _build_circular_slice_cache
2695 
2696 =head2 _create_chromosome_alias
2697 
2698  Args : none
2699  Example : $self->create_chromosome_alias();
2700  Description: create chromosome alias in coordinate system with karyotype attributes
2701  Returntype : Bio::EnsEMBL::CoordSystem, or none
2702  Exceptions : None
2703  Caller : general
2704  Status : in testing
2705 
2706 =cut
2707 
2708 sub _create_chromosome_alias {
2709  my $self = shift;
2710  my $csa = $self->db->get_CoordSystemAdaptor();
2711 
2712  # to store reformatted db query results
2713  my $karyotype_seq_regions;
2714 
2715  my $karyotype_rank_string = "karyotype_rank";
2716 
2717  # to store coord_system_id to alias
2718  my $cs_id;
2719  my %cs_id_rank;
2720 
2721  # check whether seq region cache exists, otherwise run database query
2722  if ( $self->{'karyotype_cache'} ) {
2723  $karyotype_seq_regions = $self->{'karyotype_cache'};
2724 
2725  # get candidate coordSystem(s) to alias
2726  foreach my $seq_region_name ( keys %{$karyotype_seq_regions} ) {
2727  foreach my $coord_system_id ( keys %{ $karyotype_seq_regions->{ $seq_region_name } } ) {
2728  my $rank = $karyotype_seq_regions->{ $seq_region_name }->{ $coord_system_id }->{ rank };
2729  $cs_id_rank{ $coord_system_id } = $rank;
2730  }
2731  }
2732  } else {
2733  # cannot find a coordssystem called chromosome
2734  # look through attribs to find seq_regions with karyotype
2735 
2736  my $sth =
2737  $self->prepare( "SELECT sr.seq_region_id, sr.name, sr.coord_system_id, sr.length, a.code, cs.rank "
2738  . "FROM seq_region sr, seq_region_attrib sra, attrib_type a, coord_system cs "
2739  . "WHERE sr.seq_region_id = sra.seq_region_id "
2740  . "AND a.attrib_type_id = sra.attrib_type_id "
2741  . "AND cs.coord_system_id = sr.coord_system_id "
2742  . "AND a.code = ?" );
2743 
2744  $sth->bind_param( 1, $karyotype_rank_string );
2745  $sth->execute();
2746 
2747  # fetch SQL results and identify coord_system_id(s) that
2748  # has(have) karyotype attribs
2749  while ( my $hashref = $sth->fetchrow_hashref() ) {
2750  my $seq_region_id = $hashref->{ seq_region_id };
2751  my $seq_region_name = $hashref->{ name };
2752  my $coord_system_id = $hashref->{ coord_system_id };
2753  my $length = $hashref->{ length };
2754  my $code = $hashref->{ code };
2755  my $rank = $hashref->{ rank };
2756 
2757  # account for possibility of multiple coord_system_ids returned from db query
2758  $karyotype_seq_regions->{ $seq_region_name }->{ $coord_system_id } = {
2759  code => $code,
2760  length => $length,
2761  seq_region_id => $seq_region_id,
2762  rank => $rank
2763  };
2764 
2765  # hash to help decide which cs id to use, if multiple
2766  $cs_id_rank{ $coord_system_id } = $rank;
2767  }
2768  $sth->finish();
2769  }
2770 
2771  my $cs_id_count = keys %cs_id_rank;
2772  # if number of coordSystem ids retrieved is one, simply use this coordSystem id
2773  # otherwise, choose the coordSystem with the best-rank (i.e. lowest number)
2774  if ( $cs_id_count == 1 ) {
2775  $cs_id = (keys %cs_id_rank)[0]; # get only key name in hash
2776  } else {
2777  foreach my $id (sort { $cs_id_rank{$a} <=> $cs_id_rank{$b} } keys %cs_id_rank) {
2778  $cs_id = $id;
2779  last; # exit loop after getting lowest-ranked coordSystem id
2780  }
2781  }
2782 
2783  if ( !$cs_id ) {
2784  throw("No coordinate system to create a chromosome slice from (because there is no suitable coordinate system to create an alias to).\n");
2785  } else {
2786 
2787  # use appropriate 'coord_system_id' to set 'alias_to' variable
2788  # first check that a chromosome object does not already exist
2789  my $cs = $csa->fetch_by_dbID( $cs_id );
2790  if ( !$cs ) {
2791  throw("Unable to retrieve CoordSystem object using dbID: $cs_id");
2792  }
2793  if ( $cs->name eq "chromosome" ) {
2794  throw("A chromosome CoordSystem object already exists. Cannot create chromosome alias.");
2795  }
2796  $cs->alias_to('chromosome');
2797 
2798  # create karyotype cache in retrieved coordsystem
2799  $cs->{'karyotype_cache'} = $karyotype_seq_regions;
2800 
2801  return $cs;
2802  }
2803 } ## end create_chromosome_alias
2804 
2805 
2806 =head2 _fetch_by_seq_region_synonym
2807  Args : $cs, $seq_region_name, $start, $end, $strand, $version, $no_fuzz
2808  Example : $self->fetch_by_seq_region_synonym( $cs, $seq_region_name, $start, $end, $strand, $version, $no_fuzz );
2809  Description: fetches all the seq region synonyms (or uses wildcard) when requested
2810  Returntype : Bio::EnsEMBL::SliceAdaptor, or none
2811  Exceptions : None
2812  Caller : general
2813  Status : (refactored from fetch_by_region)
2814 =cut
2815 
2816 sub _fetch_by_seq_region_synonym {
2817 
2818  my ( $self, $cs, $seq_region_name, $start, $end, $strand, $version, $no_fuzz ) = @_;
2819 
2820  # check input
2821  assert_integer($start, 'start') if defined $start;
2822  assert_integer($end, 'end') if defined $end;
2823 
2824  if ( !defined($start) ) { $start = 1 }
2825  if ( !defined($strand) ) { $strand = 1 }
2826 
2827  if ( !defined($seq_region_name) ) {
2828  throw('seq_region_name argument is required');
2829  }
2830 
2831  my $coord_system_name;
2832  # try synonyms
2833  my $syn_sql = "select s.name, cs.name, cs.version from seq_region s join seq_region_synonym ss using (seq_region_id) join coord_system cs using (coord_system_id) where ss.synonym like ? and cs.species_id =? ";
2834  if (defined $cs) {
2835  $coord_system_name = $cs->name;
2836  $syn_sql .= "AND cs.name = '" . $coord_system_name . "' ";
2837  }
2838  if (defined $version) {
2839  $syn_sql .= "AND cs.version = '" . $version . "' ";
2840  }
2841  $syn_sql .= "ORDER BY cs.rank ASC";
2842  my $syn_sql_sth = $self->prepare($syn_sql);
2843  $syn_sql_sth->bind_param(1, $seq_region_name, SQL_VARCHAR);
2844  $syn_sql_sth->bind_param(2, $self->species_id(), SQL_INTEGER);
2845  $syn_sql_sth->execute();
2846  my ($new_name, $new_coord_system, $new_version);
2847  $syn_sql_sth->bind_columns( \$new_name, \$new_coord_system, \$new_version);
2848 
2849  # we want to see if there was an exact match for the seq_region_synonym.
2850  # The overall logic is:
2851  # Case 1 - A synonym exactly matches the query, and either:
2852  # a - the coord_system matches
2853  # or
2854  # b - the user is looking for toplevel
2855  # In this case, we now have the real seq_region name, so we can re-run
2856  # fetch_by_region using that name, coord_system, and version
2857  # Case 2 - A synonym exactly matches, but none of the coord_systems match
2858  # and the user is not looking for toplevel.
2859  # In this case, we return nothing, and move on to try fuzzy-matching
2860  # if requested.
2861  # Case 3 - No synonyms match, so repeat the query using wildcards
2862  my $matched_synonym = 0;
2863  while ($syn_sql_sth->fetch){
2864  $matched_synonym = 1;
2865  if ((not defined($cs)) || ($cs->name eq $new_coord_system && $cs->version eq $new_version)) {
2866  return $self->fetch_by_region($new_coord_system, $new_name, $start, $end, $strand, $new_version, $no_fuzz);
2867  }
2868  }
2869 
2870  if ($matched_synonym == 0) {
2871  # Try wildcard searching if no exact synonym was found
2872  $syn_sql_sth = $self->prepare($syn_sql);
2873  my $escaped_seq_region_name = $seq_region_name;
2874  my $escape_char = $self->dbc->db_handle->get_info(14);
2875  $escaped_seq_region_name =~ s/([_%])/$escape_char$1/g;
2876  $syn_sql_sth->bind_param(1, "$escaped_seq_region_name%", SQL_VARCHAR);
2877  $syn_sql_sth->bind_param(2, $self->species_id(), SQL_INTEGER);
2878  $syn_sql_sth->execute();
2879  $syn_sql_sth->bind_columns( \$new_name, \$new_coord_system, \$new_version);
2880 
2881  my @matched_syn_wrong_cs;
2882  while ($syn_sql_sth->fetch){
2883  if ((not defined($cs)) || ($cs->name eq $new_coord_system && $cs->version eq $new_version)) {
2884  return $self->fetch_by_region($new_coord_system, $new_name, $start, $end, $strand, $new_version, $no_fuzz);
2885  } else {
2886  push(@matched_syn_wrong_cs, [$new_coord_system, $new_version]);
2887  }
2888  }
2889 
2890  if (scalar(@matched_syn_wrong_cs) > 0) {
2891  my $cs_warning_text = "Searched for a known feature on coordniate system: " .
2892  $cs->dbID .
2893  " but found it on " .
2894  join(", ", map {if ($_->[1]) {
2895  $_->[0] . ":" . $_->[1]
2896  } else {
2897  $_->[0]
2898  }
2899  } @matched_syn_wrong_cs) .
2900  "\n No result returned, consider searching without coordinate system or use toplevel.";
2901  warning($cs_warning_text);
2902  return;
2903  }
2904  }
2905  $syn_sql_sth->finish;
2906  return;
2907 }
2908 
2909 
2910 =head2 _fetch_by_fuzzy_matching
2911  Args : $cs, $seq_region_name, $sql, $constraint, $bind_params
2912  Example : my $fuzzy_matched_name = $self->fetch_by_fuzzy_matching( $cs, $seq_region_name, $sql, $constraint, $bind_params );
2913  Description: fetches all the fuzzy matches for a given seq_region_name when requested
2914  Returntype : string, Bio::EnsEMBL::CoordSystem
2915  Exceptions : None
2916  Caller : general
2917  Status : (refactored from fetch_by_region)
2918 =cut
2919 
2920 sub _fetch_by_fuzzy_matching {
2921 
2922  my ($self, $cs, $seq_region_name, $sql, $constraint, $bind_params) = @_;
2923 
2924  my $csa = $self->db->get_CoordSystemAdaptor();
2925  my $length;
2926 
2927  # Do fuzzy matching, assuming that we are just missing a version
2928  # on the end of the seq_region name.
2929 
2930  my $sth =
2931  $self->prepare( $sql . " WHERE sr.name LIKE ? " . $constraint );
2932 
2933  @$bind_params[0] = [ sprintf( '%s.%%', $seq_region_name ), SQL_VARCHAR ];
2934 
2935  my $pos = 0;
2936  foreach my $param (@$bind_params) {
2937  $sth->bind_param( ++$pos, $param->[0], $param->[1] );
2938  }
2939 
2940  $sth->execute();
2941 
2942  my $prefix_len = length($seq_region_name) + 1;
2943  my $high_ver = undef;
2944  my $high_cs = $cs;
2945 
2946  # Find the fuzzy-matched seq_region with the highest postfix
2947  # (which ought to be a version).
2948 
2949  my ( $tmp_name, $id, $tmp_length, $cs_id );
2950  $sth->bind_columns( \( $tmp_name, $id, $tmp_length, $cs_id ) );
2951 
2952  my $i = 0;
2953 
2954  while ( $sth->fetch ) {
2955  my $tmp_cs =
2956  ( defined($cs) ? $cs : $csa->fetch_by_dbID($cs_id) );
2957 
2958  # cache values for future reference
2959  my $arr = [ $id, $tmp_name, $cs_id, $tmp_length ];
2960  $self->{'sr_name_cache'}->{"$tmp_name:$cs_id"} = $arr;
2961  $self->{'sr_id_cache'}->{"$id"} = $arr;
2962 
2963  my $tmp_ver = substr( $tmp_name, $prefix_len );
2964 
2965  # skip versions which are non-numeric and apparently not
2966  # versions
2967  if ( $tmp_ver !~ /^\d+$/ ) { next }
2968 
2969  # take version with highest num, if two versions match take one
2970  # with highest ranked coord system (lowest num)
2971  if ( !defined($high_ver)
2972  || $tmp_ver > $high_ver
2973  || ( $tmp_ver == $high_ver && $tmp_cs->rank < $high_cs->rank )
2974  )
2975  {
2976  $seq_region_name = $tmp_name;
2977  $length = $tmp_length;
2978  $high_ver = $tmp_ver;
2979  $high_cs = $tmp_cs;
2980  }
2981 
2982  $i++;
2983  } ## end while ( $sth->fetch )
2984  $sth->finish();
2985 
2986  # warn if fuzzy matching found more than one result
2987  if ( $i > 1 ) {
2988  warning(
2989  sprintf(
2990  "Fuzzy matching of seq_region_name "
2991  . "returned more than one result.\n"
2992  . "You might want to check whether the returned seq_region\n"
2993  . "(%s:%s) is the one you intended to fetch.\n",
2994  $high_cs->name(), $seq_region_name ) );
2995  }
2996 
2997  $cs = $high_cs;
2998 
2999  # return if we did not find any appropriate match:
3000  if ( !defined($high_ver) ) { return; }
3001 
3002  return ($seq_region_name, $cs);
3003 }
3004 
3005 1;
transcript
public transcript()
Bio::EnsEMBL::CircularSlice
Definition: CircularSlice.pm:48
Bio::EnsEMBL::Registry::get_adaptor
public Adaptor get_adaptor()
EnsEMBL
Definition: Filter.pm:1
Bio::EnsEMBL::ProjectionSegment
Definition: ProjectionSegment.pm:27
Bio::EnsEMBL::LRGSlice::new
public new()
map
public map()
Bio::EnsEMBL::Utils::Slice
Definition: Slice.pm:21
Bio::EnsEMBL::CircularSlice::new
public Bio::EnsEMBL::CircularSlice new()
Bio::EnsEMBL::CoordSystem::name
public String name()
Bio::EnsEMBL::Feature
Definition: Feature.pm:47
Bio::EnsEMBL::DBSQL::DBAdaptor::dbc
public Bio::EnsEMBL::DBSQL::DBConnection dbc()
Bio::EnsEMBL::Slice::get_genome_component
public Scalar get_genome_component()
Bio::EnsEMBL::Slice::slice
public slice()
cleanup
public cleanup()
Bio::EnsEMBL::CoordSystem
Definition: CoordSystem.pm:40
Bio::EnsEMBL::DBSQL::SliceAdaptor
Definition: SliceAdaptor.pm:78
Bio::EnsEMBL::Slice
Definition: Slice.pm:50
Bio::EnsEMBL::CoordSystem::alias_to
public Bio::EnsEMBL::CoordSystem alias_to()
Bio::EnsEMBL::Registry
Definition: Registry.pm:113
Bio::EnsEMBL::Slice::new
public Bio::EnsEMBL::Slice new()
exon
public exon()
Bio::EnsEMBL::DBSQL::DBConnection::prepare
public DBI prepare()
Bio::EnsEMBL::Slice::seq_region_name
public String seq_region_name()
Bio::EnsEMBL::Slice::seq_region_length
public Int seq_region_length()
Bio::EnsEMBL::LRGSlice
Definition: LRGSlice.pm:37
Bio::EnsEMBL::Slice::get_seq_region_id
public Int get_seq_region_id()
has_karyotype
public has_karyotype()
Bio::EnsEMBL::Slice::new_fast
public Bio::EnsEMBL::Slice new_fast()
Bio::EnsEMBL::DBSQL::BaseAdaptor
Definition: BaseAdaptor.pm:71
Bio::EnsEMBL::Utils::Scalar
Definition: Scalar.pm:66
Bio::EnsEMBL::DBSQL::BaseAdaptor::db
public Bio::EnsEMBL::DBSQL::DBAdaptor db()
get_seq_region_id
public get_seq_region_id()
Bio::EnsEMBL::Slice::is_toplevel
public Int is_toplevel()
Bio::EnsEMBL::Registry::load_registry_from_db
public Int load_registry_from_db()
Bio::EnsEMBL::Slice::coord_system
public Bio::EnsEMBL::CoordSystem coord_system()
Bio::EnsEMBL::ProjectionSegment::to_Slice
public Bio::EnsEMBL::Slice to_Slice()
Bio
Definition: AltAlleleGroup.pm:4
Bio::EnsEMBL::Utils::Exception
Definition: Exception.pm:68
Bio::EnsEMBL::Slice::strand
public Int strand()
Bio::EnsEMBL::Mapper
Definition: Coordinate.pm:3
Bio::EnsEMBL::Storable::adaptor
public Bio::EnsEMBL::DBSQL::BaseAdaptor adaptor()