ensembl-hive  2.7.0
AssemblyMapperAdaptor.pm
Go to the documentation of this file.
1 =head1 LICENSE
2 
3 See the NOTICE file distributed with this work for additional information
4 regarding copyright ownership.
5 
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9 
10  http://www.apache.org/licenses/LICENSE-2.0
11 
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 
18 =cut
19 
20 
21 =head1 CONTACT
22 
23  Please email comments or questions to the public Ensembl
24  developers list at <http://lists.ensembl.org/mailman/listinfo/dev>.
25 
26  Questions may also be sent to the Ensembl help desk at
27  <http://www.ensembl.org/Help/Contact>.
28 
29 =cut
30 
31 =head1 NAME
32 
34 
35 =head1 SYNOPSIS
36 
38 
40  -host => 'ensembldb.ensembl.org',
41  -user => 'anonymous'
42  );
43 
44  $asma = Bio::EnsEMBL::Registry->get_adaptor( "human", "core",
45  "assemblymapper" );
46 
47  $csa = Bio::EnsEMBL::Registry->get_adaptor( "human", "core",
48  "coordsystem" );
49 
50  my $chr33_cs = $csa->fetch_by_name( 'chromosome', 'NCBI33' );
51  my $chr34_cs = $csa->fetch_by_name( 'chromosome', 'NCBI34' );
52  my $ctg_cs = $csa->fetch_by_name('contig');
53  my $clone_cs = $csa->fetch_by_name('clone');
54 
55  my $chr_ctg_mapper =
56  $asma->fetch_by_CoordSystems( $chr33_cs, $ctg_cs );
57 
58  my $ncbi33_ncbi34_mapper =
59  $asm_adptr->fetch_by_CoordSystems( $chr33, $chr34 );
60 
61  my $ctg_clone_mapper =
62  $asm_adptr->fetch_by_CoordSystems( $ctg_cs, $clone_cs );
63 
64 
65 =head1 DESCRIPTION
66 
67 Adaptor for handling Assembly mappers. This is a I<Singleton> class.
68 ie: There is only one per database (C<DBAdaptor>).
69 
70 This is used to retrieve mappers between any two coordinate systems
71 whose makeup is described by the assembly table. Currently one step
72 (explicit) and two step (implicit) pairwise mapping is supported. In
73 one-step mapping an explicit relationship between the coordinate systems
74 is defined in the assembly table. In two-step 'chained' mapping no
75 explicit mapping is present but the coordinate systems must share a
76 common mapping to an intermediate coordinate system.
77 
78 =head1 METHODS
79 
80 =cut
81 
82 package Bio::EnsEMBL::DBSQL::AssemblyMapperAdaptor;
83 use vars qw(@ISA);
84 use strict;
85 
86 use Bio::EnsEMBL::DBSQL::BaseAdaptor;
90 
91 use Bio::EnsEMBL::Utils::Cache; #CPAN LRU cache
92 use Bio::EnsEMBL::Utils::Exception qw(throw);
94 
95 use integer; #do proper arithmetic bitshifts
96 
98 
99 
100 my $CHUNKFACTOR = 20; # 2^20 = approx. 10^6
101 
102 =head2 new
103 
104  Arg [1] : Bio::EnsEMBL::DBAdaptor $dbadaptor the adaptor for
105  the database this assembly mapper is using.
106  Example : my $asma = new Bio::EnsEMBL::AssemblyMapperAdaptor($dbadaptor);
107  Description: Creates a new AssemblyMapperAdaptor object
109  Exceptions : none
111  Status : Stable
112 
113 =cut
114 
115 sub new {
116  my($class, $dbadaptor) = @_;
117 
118  my $self = $class->SUPER::new($dbadaptor);
119 
120  $self->{'_asm_mapper_cache'} = {};
121 
122  # use a shared cache (for this database) that contains info about
123  # seq regions
124  my $seq_region_cache = $self->db->get_SeqRegionCache();
125  $self->{'sr_name_cache'} = $seq_region_cache->{'name_cache'};
126  $self->{'sr_id_cache'} = $seq_region_cache->{'id_cache'};
127 
128  return $self;
129 }
130 
131 
132 
133 =head2 cache_seq_ids_with_mult_assemblys
134 
135  Example : $self->adaptor->cache_seq_ids_with_mult_assemblys();
136  Description: Creates a hash of the component seq region ids that
137  map to more than one assembly from the assembly table.
138  Retruntype : none
139  Exceptions : none
140  Caller : AssemblyMapper, ChainedAssemblyMapper
141  Status : At Risk
142 
143 =cut
144 
145 sub cache_seq_ids_with_mult_assemblys{
146  my $self = shift;
147  my %multis;
148 
149  return if (defined($self->{'multi_seq_ids'}));
150 
151  $self->{'multi_seq_ids'} = {};
152 
153  my $sql = qq(
154  SELECT sra.seq_region_id
155  FROM seq_region_attrib sra,
156  attrib_type at,
157  seq_region sr,
158  coord_system cs
159  WHERE sra.attrib_type_id = at.attrib_type_id
160  AND code = "MultAssem"
161  AND sra.seq_region_id = sr.seq_region_id
162  AND sr.coord_system_id = cs.coord_system_id
163  AND cs.species_id = ?);
164 
165  my $sth = $self->prepare($sql);
166 
167  $sth->bind_param( 1, $self->species_id(), SQL_INTEGER );
168 
169  $sth->execute();
170 
171  my ($seq_region_id);
172 
173  $sth->bind_columns(\$seq_region_id);
174 
175  while($sth->fetch()) {
176  $self->{'multi_seq_ids'}->{$seq_region_id} = 1;
177  }
178  $sth->finish;
179 }
180 
181 
182 =head2 fetch_by_CoordSystems
183 
184  Arg [1] : Bio::EnsEMBL::CoordSystem $cs1
185  One of the coordinate systems to retrieve the mapper
186  between
187  Arg [2] : Bio::EnsEMBL::CoordSystem $cs2
188  The other coordinate system to map between
189  Description: Retrieves an Assembly mapper for two coordinate
190  systems whose relationship is described in the
191  assembly table.
192 
193  The ordering of the coodinate systems is arbitrary.
194  The following two statements are equivalent:
195  $mapper = $asma->fetch_by_CoordSystems($cs1,$cs2);
196  $mapper = $asma->fetch_by_CoordSystems($cs2,$cs1);
197  Returntype : Bio::EnsEMBL::AssemblyMapper
198  Exceptions : wrong argument types
199  Caller : general
200  Status : Stable
201 
202 =cut
203 
204 sub fetch_by_CoordSystems {
205  my $self = shift;
206  my $cs1 = shift;
207  my $cs2 = shift;
208 
209  if(!ref($cs1) || !$cs1->isa('Bio::EnsEMBL::CoordSystem')) {
210  throw("cs1 argument must be a Bio::EnsEMBL::CoordSystem.");
211  }
212  if(!ref($cs2) || !$cs2->isa('Bio::EnsEMBL::CoordSystem')) {
213  throw("cs2 argument must be a Bio::EnsEMBL::CoordSystem.");
214  }
215 
216 # if($cs1->equals($cs2)) {
217 # throw("Cannot create mapper between same coord systems: " .
218 # $cs1->name . " " . $cs1->version);
219 # }
220 
221  if($cs1->is_top_level()) {
222  return Bio::EnsEMBL::TopLevelAssemblyMapper->new($self, $cs1, $cs2);
223  }
224  if($cs2->is_top_level()) {
225  return Bio::EnsEMBL::TopLevelAssemblyMapper->new($self, $cs2, $cs1);
226  }
227 
228  my $csa = $self->db->get_CoordSystemAdaptor();
229 
230  #retrieve the shortest possible mapping path between these systems
231  my @mapping_path = @{$csa->get_mapping_path($cs1,$cs2)};
232 
233  if(!@mapping_path) {
234 
235  return undef;
236  }
237 
238  my $key = join(':', map({defined($_)?$_->dbID():"-"} @mapping_path));
239 
240  my $asm_mapper = $self->{'_asm_mapper_cache'}->{$key};
241 
242  return $asm_mapper if($asm_mapper);
243 
244  if(@mapping_path == 1) {
245  throw("Incorrect mapping path defined in meta table. " .
246  "0 step mapping encountered between:\n" .
247  $cs1->name() . " " . $cs1->version() . " and " . $cs2->name . " " .
248  $cs2->version());
249  }
250 
251  if(@mapping_path == 2) {
252  #1 step regular mapping
253  $asm_mapper = Bio::EnsEMBL::AssemblyMapper->new($self, @mapping_path);
254 
255 # If you want multiple pieces on two seqRegions to map to each other
256 # you need to make an assembly.mapping entry that is seperated with a #
257 # instead of an |.
258 
259  $self->{'_asm_mapper_cache'}->{$key} = $asm_mapper;
260  return $asm_mapper;
261  }
262 
263  if(@mapping_path == 3) {
264  #two step chained mapping
265  $asm_mapper = Bio::EnsEMBL::ChainedAssemblyMapper->new($self,@mapping_path);
266  #in multi-step mapping it is possible get requests with the
267  #coordinate system ordering reversed since both mappings directions
268  #cache on both orderings just in case
269  #e.g. chr <-> contig <-> clone and clone <-> contig <-> chr
270 
271  $self->{'_asm_mapper_cache'}->{$key} = $asm_mapper;
272  $key = join(':', map({defined($_)?$_->dbID():"-"} reverse(@mapping_path)));
273  $self->{'_asm_mapper_cache'}->{$key} = $asm_mapper;
274  return $asm_mapper;
275  }
276 
277  throw("Only 1 and 2 step coordinate system mapping is currently\n" .
278  "supported. Mapping between " .
279  $cs1->name() . " " . $cs1->version() . " and " .
280  $cs2->name() . " " . $cs2->version() .
281  " requires ". (scalar(@mapping_path)-1) . " steps.");
282 }
283 
284 
285 
286 =head2 register_assembled
287 
288  Arg [1] : Bio::EnsEMBL::AssemblyMapper $asm_mapper
289  A valid AssemblyMapper object
290  Arg [2] : integer $asm_seq_region
291  The dbID of the seq_region to be registered
292  Arg [3] : int $asm_start
293  The start of the region to be registered
294  Arg [4] : int $asm_end
295  The end of the region to be registered
296  Description: Declares an assembled region to the AssemblyMapper.
297  This extracts the relevant data from the assembly
298  table and stores it in Mapper internal to the $asm_mapper.
299  It therefore must be called before any mapping is
300  attempted on that region. Otherwise only gaps will
301  be returned. Note that the AssemblyMapper automatically
302  calls this method when the need arises.
303  Returntype : none
304  Exceptions : throw if the seq_region to be registered does not exist
305  or if it associated with multiple assembled pieces (bad data
306  in assembly table)
308  Status : Stable
309 
310 =cut
311 
312 
313 sub register_assembled {
314  my $self = shift;
315  my $asm_mapper = shift;
316  my $asm_seq_region = shift;
317  my $asm_start = shift;
318  my $asm_end = shift;
319 
320  if(!ref($asm_mapper) || !$asm_mapper->isa('Bio::EnsEMBL::AssemblyMapper')) {
321  throw("Bio::EnsEMBL::AssemblyMapper argument expected");
322  }
323 
324  throw("asm_seq_region argument expected") if(!defined($asm_seq_region));
325  throw("asm_start argument expected") if(!defined($asm_start));
326  throw("asm_end argument expected") if(!defined($asm_end));
327 
328  my $asm_cs_id = $asm_mapper->assembled_CoordSystem->dbID();
329  my $cmp_cs_id = $asm_mapper->component_CoordSystem->dbID();
330 
331  #split up the region to be registered into fixed chunks
332  #this allows us to keep track of regions that have already been
333  #registered and also works under the assumption that if a small region
334  #is requested it is likely that other requests will be made in the
335  #vicinity (the minimum size registered the chunksize (2^chunkfactor)
336 
337  my @chunk_regions;
338  #determine span of chunks
339  #bitwise shift right is fast and easy integer division
340 
341  my($start_chunk, $end_chunk);
342 
343  $start_chunk = $asm_start >> $CHUNKFACTOR;
344  $end_chunk = $asm_end >> $CHUNKFACTOR;
345 
346  # inserts have start = end + 1, on boundary condition start_chunk
347  # could be less than end chunk
348  if($asm_start == $asm_end + 1) {
349  ($start_chunk, $end_chunk) = ($end_chunk, $start_chunk);
350  }
351 
352  #find regions of continuous unregistered chunks
353  my $i;
354  my ($begin_chunk_region,$end_chunk_region);
355  for ($i = $start_chunk; $i <= $end_chunk; $i++) {
356  if($asm_mapper->have_registered_assembled($asm_seq_region, $i)) {
357  if(defined($begin_chunk_region)) {
358  #this is the end of an unregistered region.
359  my $region = [($begin_chunk_region << $CHUNKFACTOR),
360  (($end_chunk_region+1) << $CHUNKFACTOR)-1];
361  push @chunk_regions, $region;
362  $begin_chunk_region = $end_chunk_region = undef;
363  }
364  } else {
365  $begin_chunk_region = $i if(!defined($begin_chunk_region));
366  $end_chunk_region = $i+1;
367  $asm_mapper->register_assembled($asm_seq_region,$i);
368  }
369  }
370 
371  #the last part may have been an unregistered region too
372  if(defined($begin_chunk_region)) {
373  my $region = [($begin_chunk_region << $CHUNKFACTOR),
374  (($end_chunk_region+1) << $CHUNKFACTOR) -1];
375  push @chunk_regions, $region;
376  }
377 
378  return if(!@chunk_regions);
379 
380  # keep the Mapper to a reasonable size
381  if( $asm_mapper->size() > $asm_mapper->max_pair_count() ) {
382  $asm_mapper->flush();
383  #we now have to go and register the entire requested region since we
384  #just flushed everything
385 
386  @chunk_regions = ( [ ( $start_chunk << $CHUNKFACTOR)
387  , (($end_chunk+1) << $CHUNKFACTOR)-1 ] );
388 
389  for( my $i = $start_chunk; $i <= $end_chunk; $i++ ) {
390  $asm_mapper->register_assembled( $asm_seq_region, $i );
391  }
392  }
393 
394 # my $asm_seq_region_id =
395 # $self->_seq_region_name_to_id($asm_seq_region,$asm_cs_id);
396 
397  # Retrieve the description of how the assembled region is made from
398  # component regions for each of the continuous blocks of unregistered,
399  # chunked regions
400 
401  my $q = qq{
402  SELECT
403  asm.cmp_start,
404  asm.cmp_end,
405  asm.cmp_seq_region_id,
406  sr.name,
407  sr.length,
408  asm.ori,
409  asm.asm_start,
410  asm.asm_end
411  FROM
412  assembly asm, seq_region sr
413  WHERE
414  asm.asm_seq_region_id = ? AND
415  ? <= asm.asm_end AND
416  ? >= asm.asm_start AND
417  asm.cmp_seq_region_id = sr.seq_region_id AND
418  sr.coord_system_id = ?
419  };
420 
421  my $sth = $self->prepare($q);
422 
423  foreach my $region (@chunk_regions) {
424  my($region_start, $region_end) = @$region;
425  $sth->bind_param(1,$asm_seq_region,SQL_INTEGER);
426  $sth->bind_param(2,$region_start,SQL_INTEGER);
427  $sth->bind_param(3,$region_end,SQL_INTEGER);
428  $sth->bind_param(4,$cmp_cs_id,SQL_INTEGER);
429 
430  $sth->execute();
431 
432  my($cmp_start, $cmp_end, $cmp_seq_region_id, $cmp_seq_region, $ori,
433  $asm_start, $asm_end, $cmp_seq_region_length);
434 
435  $sth->bind_columns(\$cmp_start, \$cmp_end, \$cmp_seq_region_id,
436  \$cmp_seq_region, \$cmp_seq_region_length, \$ori,
437  \$asm_start, \$asm_end);
438 
439  #
440  # Load the unregistered regions of the mapper
441  #
442  while($sth->fetch()) {
443  next if($asm_mapper->have_registered_component($cmp_seq_region_id)
444  and !defined($self->{'multi_seq_ids'}->{$cmp_seq_region_id}));
445  $asm_mapper->register_component($cmp_seq_region_id);
446  $asm_mapper->mapper->add_map_coordinates(
447  $asm_seq_region, $asm_start, $asm_end,
448  $ori,
449  $cmp_seq_region_id, $cmp_start, $cmp_end);
450 
451  my $arr = [ $cmp_seq_region_id, $cmp_seq_region,
452  $cmp_cs_id, $cmp_seq_region_length ];
453 
454  $self->{'sr_name_cache'}->{"$cmp_seq_region:$cmp_cs_id"} = $arr;
455  $self->{'sr_id_cache'}->{"$cmp_seq_region_id"} = $arr;
456  }
457  }
458 
459  $sth->finish();
460 }
461 
462 
463 
464 sub _seq_region_name_to_id {
465  my $self = shift;
466  my $sr_name = shift;
467  my $cs_id = shift;
468 
469  if(!defined($sr_name) or
470  !defined($cs_id)){
471  throw('seq_region_name and coord_system_id args are required');
472  }
473 
474  my $arr = $self->{'sr_name_cache'}->{"$sr_name:$cs_id"};
475  if( $arr ) {
476  return $arr->[0];
477  }
478 
479  # Get the seq_region_id via the name. This would be quicker if we just
480  # used internal ids instead but stored but then we lose the ability
481  # the transform accross databases with different internal ids
482 
483  my $sth = $self->prepare("SELECT seq_region_id, length " .
484  "FROM seq_region " .
485  "WHERE name = ? AND coord_system_id = ?");
486 
487  $sth->bind_param(1,$sr_name,SQL_VARCHAR);
488  $sth->bind_param(2,$cs_id,SQL_INTEGER);
489  $sth->execute();
490 
491  my @row = $sth->fetchrow_array();
492  unless ( @row ) {
493  throw("No-existent seq_region [$sr_name] in coord system $cs_id");
494  }
495  my @more = $sth->fetchrow_array();
496  if ( @more ) {
497  throw("Ambiguous seq_region [$sr_name] in coord system $cs_id");
498  }
499 
500  my ($sr_id, $sr_length) = @row;
501  $sth->finish();
502 
503  $arr = [ $sr_id, $sr_name, $cs_id, $sr_length ];
504 
505  $self->{'sr_name_cache'}->{"$sr_name:$cs_id"} = $arr;
506  $self->{'sr_id_cache'}->{"$sr_id"} = $arr;
507 
508  return $sr_id;
509 }
510 
511 sub _seq_region_id_to_name {
512  my $self = shift;
513  my $sr_id = shift;
514 
515  ($sr_id) || throw('seq_region_is required');
516 
517  my $arr = $self->{'sr_id_cache'}->{"$sr_id"};
518  if( $arr ) {
519  return $arr->[1];
520  }
521 
522  # Get the seq_region name via the id. This would be quicker if we just
523  # used internal ids instead but stored but then we lose the ability
524  # the transform accross databases with different internal ids
525 
526  my $sth = $self->prepare("SELECT name, length ,coord_system_id " .
527  "FROM seq_region " .
528  "WHERE seq_region_id = ? ");
529 
530  $sth->bind_param(1,$sr_id,SQL_INTEGER);
531  $sth->execute();
532 
533  my @row = $sth->fetchrow_array();
534  if(!$sth->rows() == 1) {
535  throw("non-existant seq_region [$sr_id]");
536  }
537 
538  my ($sr_name, $sr_length, $cs_id) = @row;
539  $sth->finish();
540 
541  $arr = [ $sr_id, $sr_name, $cs_id, $sr_length ];
542 
543  $self->{'sr_name_cache'}->{"$sr_name:$cs_id"} = $arr;
544  $self->{'sr_id_cache'}->{"$sr_id"} = $arr;
545 
546  return $sr_name;
547 }
548 
549 
550 =head2 register_component
551 
552  Arg [1] : Bio::EnsEMBL::AssemblyMapper $asm_mapper
553  A valid AssemblyMapper object
554  Arg [2] : integer $cmp_seq_region
555  The dbID of the seq_region to be registered
556  Description: Declares a component region to the AssemblyMapper.
557  This extracts the relevant data from the assembly
558  table and stores it in Mapper internal to the $asm_mapper.
559  It therefore must be called before any mapping is
560  attempted on that region. Otherwise only gaps will
561  be returned. Note that the AssemblyMapper automatically
562  calls this method when the need arises.
563  Returntype : none
564  Exceptions : throw if the seq_region to be registered does not exist
565  or if it associated with multiple assembled pieces (bad data
566  in assembly table)
568  Status : Stable
569 
570 =cut
571 
572 sub register_component {
573  my $self = shift;
574  my $asm_mapper = shift;
575  my $cmp_seq_region = shift;
576 
577  if(!ref($asm_mapper) || !$asm_mapper->isa('Bio::EnsEMBL::AssemblyMapper')) {
578  throw("Bio::EnsEMBL::AssemblyMapper argument expected");
579  }
580 
581  if(!defined($cmp_seq_region)) {
582  throw("cmp_seq_region argument expected");
583  }
584 
585  my $cmp_cs_id = $asm_mapper->component_CoordSystem()->dbID();
586  my $asm_cs_id = $asm_mapper->assembled_CoordSystem()->dbID();
587 
588  #do nothing if this region is already registered or special case
589  return if($asm_mapper->have_registered_component($cmp_seq_region)
590  and !defined($self->{'multi_seq_ids'}->{$cmp_seq_region}));
591 
592 # my $cmp_seq_region_id =
593 # $self->_seq_region_name_to_id($cmp_seq_region, $cmp_cs_id);
594 
595  # Determine what part of the assembled region this component region makes up
596 
597  my $q = qq{
598  SELECT
599  asm.asm_start,
600  asm.asm_end,
601  asm.asm_seq_region_id,
602  sr.name,
603  sr.length
604  FROM
605  assembly asm, seq_region sr
606  WHERE
607  asm.cmp_seq_region_id = ? AND
608  asm.asm_seq_region_id = sr.seq_region_id AND
609  sr.coord_system_id = ?
610  };
611 
612  my $sth = $self->prepare($q);
613  $sth->bind_param(1,$cmp_seq_region,SQL_INTEGER);
614  $sth->bind_param(2,$asm_cs_id,SQL_INTEGER);
615  $sth->execute();
616 
617  my @rows = $sth->fetchrow_array();
618 
619  if($sth->rows() == 0) {
620  #this component is not used in the assembled part i.e. gap
621  $asm_mapper->register_component($cmp_seq_region);
622  $sth->finish();
623  return;
624  }
625 
626  #we do not currently support components mapping to multiple assembled
627  # make sure that you've got the correct mapping in the meta-table :
628  # chromosome:EquCab2#contig ( use'#' for multiple mappings )
629  # chromosome:EquCab2|contig ( use '|' delimiter for 1-1 mappings )
630  #
631  my @more = $sth->fetchrow_array();
632  if($sth->rows() != 1) {
633  $sth->finish();
634  throw("Multiple assembled regions for single " .
635  "component region cmp_seq_region_id=[$cmp_seq_region]\n".
636  "Remember that multiple mappings use the \#-operaator".
637  " in the meta-table (i.e. chromosome:EquCab2\#contig\n");
638  }
639 
640  my ($asm_start, $asm_end, $asm_seq_region_id,
641  $asm_seq_region, $asm_seq_region_length) = @rows;
642 
643  my $arr = [ $asm_seq_region_id, $asm_seq_region,
644  $asm_cs_id, $asm_seq_region_length ];
645 
646  $self->{'sr_name_cache'}->{"$asm_seq_region:$asm_cs_id"} = $arr;
647  $self->{'sr_id_cache'}->{"$asm_seq_region_id"} = $arr;
648 
649  $sth->finish();
650 
651  # Register the corresponding assembled region. This allows a us to
652  # register things in assembled chunks which allows us to:
653  # (1) Keep track of what assembled regions are registered
654  # (2) Use locality of reference (if they want something in same general
655  # region it will already be registered).
656 
657  $self->register_assembled($asm_mapper,$asm_seq_region_id,$asm_start,$asm_end);
658 }
659 
660 
661 
662 =head2 register_chained
663 
664  Arg [1] : Bio::EnsEMBL::ChainedAssemblyMapper $casm_mapper
665  The chained assembly mapper to register regions on
666  Arg [2] : string $from ('first' or 'last')
667  The direction we are registering from, and the name of the
668  internal mapper.
669  Arg [3] : string $seq_region_name
670  The name of the seqregion we are registering on
671  Arg [4] : listref $ranges
672  A list of ranges to register (in [$start,$end] tuples).
673  Arg [5] : (optional) $to_slice
674  Only register those on this Slice.
675  Description: Registers a set of ranges on a chained assembly mapper.
676  This function is at the heart of the chained mapping process.
677  It retrieves information from the assembly table and
678  dynamically constructs the mappings between two coordinate
679  systems which are 2 mapping steps apart. It does this by using
680  two internal mappers to load up a third mapper which is
681  actually used by the ChainedAssemblyMapper to perform the
682  mapping.
683 
684  This method must be called before any mapping is
685  attempted on regions of interest, otherwise only gaps will
686  be returned. Note that the ChainedAssemblyMapper automatically
687  calls this method when the need arises.
688  Returntype : none
689  Exceptions : throw if the seq_region to be registered does not exist
690  or if it associated with multiple assembled pieces (bad data
691  in assembly table)
692 
693  throw if the mapping between the coordinate systems cannot
694  be performed in two steps, which means there is an internal
695  error in the data in the meta table or in the code that creates
696  the mapping paths.
698  Status : Stable
699 
700 =cut
701 
702 sub register_chained {
703  my $self = shift;
704  my $casm_mapper = shift;
705  my $from = shift;
706  my $seq_region_id = shift;
707  my $ranges = shift;
708  my $to_slice = shift;
709 
710  my $to_seq_region_id;
711  if(defined($to_slice)){
712  if($casm_mapper->first_CoordSystem()->equals($casm_mapper->last_CoordSystem())){
713  return $self->_register_chained_special($casm_mapper, $from, $seq_region_id, $ranges, $to_slice);
714  }
715  $to_seq_region_id = $to_slice->get_seq_region_id();
716  if(!defined($to_seq_region_id)){
717  die "Could not get seq_region_id for to_slice".$to_slice->seq_region_name."\n";
718  }
719  }
720 
721  my ($start_name, $start_mid_mapper, $start_cs, $start_registry,
722  $end_name, $end_mid_mapper, $end_cs, $end_registry);
723 
724  if($from eq 'first') {
725  $start_name = 'first';
726  $start_mid_mapper = $casm_mapper->first_middle_mapper();
727  $start_cs = $casm_mapper->first_CoordSystem();
728  $start_registry = $casm_mapper->first_registry();
729  $end_mid_mapper = $casm_mapper->last_middle_mapper();
730  $end_cs = $casm_mapper->last_CoordSystem();
731  $end_registry = $casm_mapper->last_registry();
732  $end_name = 'last';
733  } elsif($from eq 'last') {
734  $start_name = 'last';
735  $start_mid_mapper = $casm_mapper->last_middle_mapper();
736  $start_cs = $casm_mapper->last_CoordSystem();
737  $start_registry = $casm_mapper->last_registry();
738  $end_mid_mapper = $casm_mapper->first_middle_mapper();
739  $end_cs = $casm_mapper->first_CoordSystem();
740  $end_registry = $casm_mapper->first_registry();
741  $end_name = 'first';
742  } else {
743  throw("Invalid from argument: [$from], must be 'first' or 'last'");
744  }
745 
746  my $combined_mapper = $casm_mapper->first_last_mapper();
747  my $mid_cs = $casm_mapper->middle_CoordSystem();
748  my $mid_name = 'middle';
749  my $csa = $self->db->get_CoordSystemAdaptor();
750 
751  # Check for the simple case where the ChainedMapper is short
752  if( ! defined $mid_cs ) {
753  $start_mid_mapper = $combined_mapper;
754  }
755 
756 
757  ##############
758  # obtain the first half of the mappings and load them into the start mapper
759  #
760 
761  #ascertain which is component and which is actually assembled coord system
762  my @path;
763 
764  # check for the simple case, where the ChainedMapper is short
765  if( defined $mid_cs ) {
766  @path = @{$csa->get_mapping_path($start_cs, $mid_cs)};
767  } else {
768  @path = @{$csa->get_mapping_path( $start_cs, $end_cs )};
769  }
770 
771  if(@path != 2 && defined( $path[1] )) {
772  my $path = join(',', map({$_->name .' '. $_->version} @path));
773  my $len = scalar(@path) - 1;
774  throw("Unexpected mapping path between start and intermediate " .
775  "coord systems (". $start_cs->name . " " . $start_cs->version .
776  " and " . $mid_cs->name . " " . $mid_cs->version . ")." .
777  "\nExpected path length 1, got $len. " .
778  "(path=$path)");
779  }
780 
781  my $sth;
782  my ($asm_cs,$cmp_cs);
783  $asm_cs = $path[0];
784  $cmp_cs = $path[-1];
785 
786  #the SQL varies depending on whether we are coming from assembled or
787  #component coordinate system
788 
789 my $asm2cmp = (<<ASMCMP);
790  SELECT
791  asm.cmp_start,
792  asm.cmp_end,
793  asm.cmp_seq_region_id,
794  sr.name,
795  sr.length,
796  asm.ori,
797  asm.asm_start,
798  asm.asm_end
799  FROM
800  assembly asm, seq_region sr
801  WHERE
802  asm.asm_seq_region_id = ? AND
803  ? <= asm.asm_end AND
804  ? >= asm.asm_start AND
805  asm.cmp_seq_region_id = sr.seq_region_id AND
806  sr.coord_system_id = ?
807 ASMCMP
808 
809 
810 my $cmp2asm = (<<CMPASM);
811  SELECT
812  asm.asm_start,
813  asm.asm_end,
814  asm.asm_seq_region_id,
815  sr.name,
816  sr.length,
817  asm.ori,
818  asm.cmp_start,
819  asm.cmp_end
820  FROM
821  assembly asm, seq_region sr
822  WHERE
823  asm.cmp_seq_region_id = ? AND
824  ? <= asm.cmp_end AND
825  ? >= asm.cmp_start AND
826  asm.asm_seq_region_id = sr.seq_region_id AND
827  sr.coord_system_id = ?
828 CMPASM
829 
830  my $asm2cmp_sth;
831  my $cmp2asm_sth;
832  if(defined($to_slice)){
833  my $to_cs = $to_slice->coord_system;
834  if($asm_cs->equals($to_cs)){
835  $asm2cmp_sth = $self->prepare($asm2cmp);
836  $cmp2asm_sth = $self->prepare($cmp2asm." AND asm.asm_seq_region_id = $to_seq_region_id");
837  }
838  elsif($cmp_cs->equals($to_cs)){
839  $asm2cmp_sth = $self->prepare($asm2cmp." AND asm.cmp_seq_region_id = $to_seq_region_id");
840  $cmp2asm_sth = $self->prepare($cmp2asm);
841  }
842  else{
843  $asm2cmp_sth = $self->prepare($asm2cmp);
844  $cmp2asm_sth = $self->prepare($cmp2asm);
845  }
846  }
847  else{
848  $asm2cmp_sth = $self->prepare($asm2cmp);
849  $cmp2asm_sth = $self->prepare($cmp2asm);
850  }
851 
852 
853 
854  $sth = ($asm_cs->equals($start_cs)) ? $asm2cmp_sth : $cmp2asm_sth;
855 
856  my $mid_cs_id;
857 
858  # check for the simple case where the ChainedMapper is short
859  if( defined $mid_cs ) {
860  $mid_cs_id = $mid_cs->dbID();
861  } else {
862  $mid_cs_id = $end_cs->dbID();
863  }
864 
865  my @mid_ranges;
866  my @start_ranges;
867 
868  #need to perform the query for each unregistered range
869  foreach my $range (@$ranges) {
870  my ($start, $end) = @$range;
871  $sth->bind_param(1,$seq_region_id,SQL_INTEGER);
872  $sth->bind_param(2,$start,SQL_INTEGER);
873  $sth->bind_param(3,$end,SQL_INTEGER);
874  $sth->bind_param(4,$mid_cs_id,SQL_INTEGER);
875  $sth->execute();
876 
877  #load the start <-> mid mapper with the results and record the mid cs
878  #ranges we just added to the mapper
879 
880  my ($mid_start, $mid_end, $mid_seq_region_id, $mid_seq_region, $mid_length,
881  $ori, $start_start, $start_end);
882 
883  $sth->bind_columns(\$mid_start, \$mid_end, \$mid_seq_region_id,
884  \$mid_seq_region, \$mid_length, \$ori, \$start_start,
885  \$start_end);
886 
887  while($sth->fetch()) {
888 
889  if( defined $mid_cs ) {
890  $start_mid_mapper->add_map_coordinates
891  (
892  $seq_region_id,$start_start, $start_end, $ori,
893  $mid_seq_region_id, $mid_start, $mid_end
894  );
895  } else {
896  if( $from eq "first" ) {
897  $combined_mapper->add_map_coordinates
898  (
899  $seq_region_id,$start_start, $start_end, $ori,
900  $mid_seq_region_id, $mid_start, $mid_end
901  );
902  } else {
903  $combined_mapper->add_map_coordinates
904  (
905  $mid_seq_region_id, $mid_start, $mid_end, $ori,
906  $seq_region_id,$start_start, $start_end
907  );
908  }
909  }
910 
911  #update sr_name cache
912  my $arr = [ $mid_seq_region_id, $mid_seq_region,
913  $mid_cs_id, $mid_length ];
914 
915  $self->{'sr_name_cache'}->{"$mid_seq_region:$mid_cs_id"} = $arr;
916  $self->{'sr_id_cache'}->{"$mid_seq_region_id"} = $arr;
917 
918  push @mid_ranges,[$mid_seq_region_id,$mid_seq_region,
919  $mid_start,$mid_end];
920  push @start_ranges, [ $seq_region_id, $start_start, $start_end ];
921 
922  #the region that we actually register may actually be larger or smaller
923  #than the region that we wanted to register.
924  #register the intersection of the region so we do not end up doing
925  #extra work later
926 
927  if($start_start < $start || $start_end > $end) {
928  $start_registry->check_and_register($seq_region_id,$start_start,
929  $start_end);
930  }
931  }
932  $sth->finish();
933  }
934 
935  # in the one step case, we load the mid ranges in the
936  # last_registry and we are done
937  if( ! defined $mid_cs ) {
938  for my $range ( @mid_ranges ) {
939  $end_registry->check_and_register( $range->[0], $range->[2],
940  $range->[3] );
941  }
942 
943  # and thats it for the simple case ...
944  return;
945  }
946 
947 
948  ###########
949  # now the second half of the mapping
950  # perform another query and load the mid <-> end mapper using the mid cs
951  # ranges
952  #
953 
954  #ascertain which is component and which is actually assembled coord system
955  @path = @{$csa->get_mapping_path($mid_cs, $end_cs)};
956  if(@path == 2 || ( @path == 3 && !defined $path[1])) {
957 
958  $asm_cs = $path[0];
959  $cmp_cs = $path[-1];
960  } else {
961  my $path = join(',', map({$_->name .' '. $_->version} @path));
962  my $len = scalar(@path)-1;
963  throw("Unexpected mapping path between intermediate and last" .
964  "coord systems (". $mid_cs->name . " " . $mid_cs->version .
965  " and " . $end_cs->name . " " . $end_cs->version . ")." .
966  "\nExpected path length 1, got $len. " .
967  "(path=$path)");
968  }
969 
970  if(defined($to_slice)){
971  my $to_cs = $to_slice->coord_system;
972  if($asm_cs->equals($to_cs)){
973  $asm2cmp_sth = $self->prepare($asm2cmp);
974  $cmp2asm_sth = $self->prepare($cmp2asm." AND asm.asm_seq_region_id = $to_seq_region_id");
975  }
976  elsif($cmp_cs->equals($to_cs)){
977  $asm2cmp_sth = $self->prepare($asm2cmp." AND asm.cmp_seq_region_id = $to_seq_region_id");
978  $cmp2asm_sth = $self->prepare($cmp2asm);
979  }
980  else{
981  $asm2cmp_sth = $self->prepare($asm2cmp);
982  $cmp2asm_sth = $self->prepare($cmp2asm);
983  }
984  }
985 
986  $sth = ($asm_cs->equals($mid_cs)) ? $asm2cmp_sth : $cmp2asm_sth;
987 
988  my $end_cs_id = $end_cs->dbID();
989  foreach my $mid_range (@mid_ranges) {
990  my ($mid_seq_region_id, $mid_seq_region,$start, $end) = @$mid_range;
991  $sth->bind_param(1,$mid_seq_region_id,SQL_INTEGER);
992  $sth->bind_param(2,$start,SQL_INTEGER);
993  $sth->bind_param(3,$end,SQL_INTEGER);
994  $sth->bind_param(4,$end_cs_id,SQL_INTEGER);
995  $sth->execute();
996 
997  #load the end <-> mid mapper with the results and record the mid cs
998  #ranges we just added to the mapper
999 
1000  my ($end_start, $end_end, $end_seq_region_id, $end_seq_region, $end_length,
1001  $ori, $mid_start, $mid_end);
1002 
1003  $sth->bind_columns(\$end_start, \$end_end, \$end_seq_region_id,
1004  \$end_seq_region, \$end_length, \$ori, \$mid_start,
1005  \$mid_end);
1006 
1007  while($sth->fetch()) {
1008  $end_mid_mapper->add_map_coordinates
1009  (
1010  $end_seq_region_id, $end_start, $end_end, $ori,
1011  $mid_seq_region_id, $mid_start, $mid_end
1012  );
1013 
1014  #update sr_name cache
1015  my $arr = [ $end_seq_region_id,$end_seq_region,$end_cs_id,$end_length ];
1016 
1017  $self->{'sr_name_cache'}->{"$end_seq_region:$end_cs_id"} = $arr;
1018  $self->{'sr_id_cache'}->{"$end_seq_region_id"} = $arr;
1019 
1020  #register this region on the end coord system
1021  $end_registry->check_and_register($end_seq_region_id, $end_start, $end_end);
1022  }
1023  $sth->finish();
1024  }
1025 
1026  #########
1027  # Now that both halves are loaded
1028  # Do stepwise mapping using both of the loaded mappers to load
1029  # the final start <-> end mapper
1030  #
1031 
1032  _build_combined_mapper(\@start_ranges, $start_mid_mapper, $end_mid_mapper,
1033  $combined_mapper, $start_name);
1034  #all done!
1035  return;
1036 }
1037 
1038 
1039 =head2 _register_chained_special
1040 
1041  Arg [1] : Bio::EnsEMBL::ChainedAssemblyMapper $casm_mapper
1042  The chained assembly mapper to register regions on
1043  Arg [2] : string $from ('first' or 'last')
1044  The direction we are registering from, and the name of the
1045  internal mapper.
1046  Arg [3] : string $seq_region_name
1047  The name of the seqregion we are registering on
1048  Arg [4] : listref $ranges
1049  A list of ranges to register (in [$start,$end] tuples).
1050  Arg [5] : (optional) $to_slice
1051  Only register those on this Slice.
1052  Description: Registers a set of ranges on a chained assembly mapper.
1053  This function is at the heart of the chained mapping process.
1054  It retrieves information from the assembly table and
1055  dynamically constructs the mappings between two coordinate
1056  systems which are 2 mapping steps apart. It does this by using
1057  two internal mappers to load up a third mapper which is
1058  actually used by the ChainedAssemblyMapper to perform the
1059  mapping.
1060 
1061  This method must be called before any mapping is
1062  attempted on regions of interest, otherwise only gaps will
1063  be returned. Note that the ChainedAssemblyMapper automatically
1064  calls this method when the need arises.
1065  Returntype : none
1066  Exceptions : throw if the seq_region to be registered does not exist
1067  or if it associated with multiple assembled pieces (bad data
1068  in assembly table)
1069 
1070  throw if the mapping between the coordinate systems cannot
1071  be performed in two steps, which means there is an internal
1072  error in the data in the meta table or in the code that creates
1073  the mapping paths.
1075  Status : Stable
1076 
1077 =cut
1078 
1079 sub _register_chained_special {
1080  my $self = shift;
1081  my $casm_mapper = shift;
1082  my $from = shift;
1083  my $seq_region_id = shift;
1084  my $ranges = shift;
1085  my $to_slice = shift;
1086  my $found = 0;
1087 
1088  my $sth = $self->prepare("SELECT
1089  asm.cmp_start,
1090  asm.cmp_end,
1091  asm.cmp_seq_region_id,
1092  sr.name,
1093  sr.length,
1094  asm.ori,
1095  asm.asm_start,
1096  asm.asm_end
1097  FROM
1098  assembly asm, seq_region sr
1099  WHERE
1100  asm.asm_seq_region_id = ? AND
1101  ? <= asm.asm_end AND
1102  ? >= asm.asm_start AND
1103  asm.cmp_seq_region_id = sr.seq_region_id AND
1104  sr.coord_system_id = ? AND
1105  asm.cmp_seq_region_id = ?");
1106 
1107 
1108  my ($start_name, $start_mid_mapper, $start_cs, $start_registry,
1109  $end_name, $end_mid_mapper, $end_cs, $end_registry);
1110 
1111  if($from eq 'first') {
1112  $start_name = 'first';
1113  $start_mid_mapper = $casm_mapper->first_middle_mapper();
1114  $start_cs = $casm_mapper->first_CoordSystem();
1115  $start_registry = $casm_mapper->first_registry();
1116  $end_mid_mapper = $casm_mapper->last_middle_mapper();
1117  $end_cs = $casm_mapper->last_CoordSystem();
1118  $end_registry = $casm_mapper->last_registry();
1119  $end_name = 'last';
1120  } elsif($from eq 'last') {
1121  $start_name = 'last';
1122  $start_mid_mapper = $casm_mapper->last_middle_mapper();
1123  $start_cs = $casm_mapper->last_CoordSystem();
1124  $start_registry = $casm_mapper->last_registry();
1125  $end_mid_mapper = $casm_mapper->first_middle_mapper();
1126  $end_cs = $casm_mapper->first_CoordSystem();
1127  $end_registry = $casm_mapper->first_registry();
1128  $end_name = 'first';
1129  } else {
1130  throw("Invalid from argument: [$from], must be 'first' or 'last'");
1131  }
1132 
1133  my $combined_mapper = $casm_mapper->first_last_mapper();
1134  my $mid_cs = $casm_mapper->middle_CoordSystem();
1135  my $mid_name = 'middle';
1136  my $csa = $self->db->get_CoordSystemAdaptor();
1137 
1138  # Check for the simple case where the ChainedMapper is short
1139  if( ! defined $mid_cs ) {
1140  $start_mid_mapper = $combined_mapper;
1141  }
1142 
1143 
1144  my @path;
1145  if( defined $mid_cs ) {
1146  @path = @{$csa->get_mapping_path($start_cs, $mid_cs)};
1147  } else {
1148  @path = @{$csa->get_mapping_path( $start_cs, $end_cs )};
1149  }
1150  if( ! defined $mid_cs ) {
1151  $start_mid_mapper = $combined_mapper;
1152  }
1153 
1154  if(@path != 2 && defined( $path[1] )) {
1155  my $path = join(',', map({$_->name .' '. $_->version} @path));
1156  my $len = scalar(@path) - 1;
1157  throw("Unexpected mapping path between start and intermediate " .
1158  "coord systems (". $start_cs->name . " " . $start_cs->version .
1159  " and " . $mid_cs->name . " " . $mid_cs->version . ")." .
1160  "\nExpected path length 1, got $len. " .
1161  "(path=$path)");
1162  }
1163 
1164  my ($asm_cs,$cmp_cs);
1165  $asm_cs = $path[0];
1166  $cmp_cs = $path[-1];
1167 
1168  $combined_mapper = $casm_mapper->first_last_mapper();
1169  $mid_cs = $casm_mapper->middle_CoordSystem();
1170  $mid_name = 'middle';
1171  $csa = $self->db->get_CoordSystemAdaptor();
1172 
1173  my $mid_cs_id;
1174 
1175  # Check for the simple case where the ChainedMapper is short
1176  if ( !defined $mid_cs ) {
1177  $start_mid_mapper = $combined_mapper;
1178  } else {
1179  $mid_cs_id = $mid_cs->dbID();
1180  }
1181 
1182  my @mid_ranges;
1183  my @start_ranges;
1184 
1185  my $to_cs = $to_slice->coord_system;
1186  foreach my $direction (1, 0){
1187  my $id1;
1188  my $id2;
1189  if($direction){
1190  $id1 = $seq_region_id;
1191  $id2 = $to_slice->get_seq_region_id();
1192  }
1193  else{
1194  $id2 = $seq_region_id;
1195  $id1 = $to_slice->get_seq_region_id();
1196  }
1197 
1198  foreach my $range (@$ranges) {
1199  my ($start, $end) = @$range;
1200  $sth->bind_param(1,$id1,SQL_INTEGER);
1201  $sth->bind_param(2,$start,SQL_INTEGER);
1202  $sth->bind_param(3,$end,SQL_INTEGER);
1203  $sth->bind_param(4,$to_cs->dbID,SQL_INTEGER);
1204  $sth->bind_param(5,$id2,SQL_INTEGER);
1205  $sth->execute();
1206 
1207  my ($mid_start, $mid_end, $mid_seq_region_id, $mid_seq_region, $mid_length,
1208  $ori, $start_start, $start_end);
1209 
1210  $sth->bind_columns(\$mid_start, \$mid_end, \$mid_seq_region_id,
1211  \$mid_seq_region, \$mid_length, \$ori, \$start_start,
1212  \$start_end);
1213 
1214  while($sth->fetch()) {
1215  $found = 1;
1216 
1217  if( defined $mid_cs ) {
1218  $start_mid_mapper->add_map_coordinates
1219  (
1220  $id1,$start_start, $start_end, $ori,
1221  $mid_seq_region_id, $mid_start, $mid_end
1222  );
1223  } else {
1224  if( $from eq "first") {
1225  if($direction){
1226  $combined_mapper->add_map_coordinates
1227  (
1228  $id1,$start_start, $start_end, $ori,
1229  $mid_seq_region_id, $mid_start, $mid_end
1230  );
1231  }
1232  else{
1233  $combined_mapper->add_map_coordinates
1234  (
1235  $mid_seq_region_id, $mid_start, $mid_end, $ori,
1236  $id1,$start_start, $start_end
1237  );
1238  }
1239  } else {
1240  if($direction){
1241  $combined_mapper->add_map_coordinates
1242  (
1243  $mid_seq_region_id, $mid_start, $mid_end, $ori,
1244  $id1,$start_start, $start_end
1245  );
1246  }
1247  else{
1248  $combined_mapper->add_map_coordinates
1249  (
1250  $id1,$start_start, $start_end, $ori,
1251  $mid_seq_region_id, $mid_start, $mid_end
1252  );
1253  }
1254  }
1255  }
1256 
1257  #update sr_name cache
1258  my $arr = [ $mid_seq_region_id, $mid_seq_region,
1259  $mid_cs_id, $mid_length ];
1260 
1261  $self->{'sr_name_cache'}->{"$mid_seq_region:$mid_cs_id"} = $arr;
1262  $self->{'sr_id_cache'}->{"$mid_seq_region_id"} = $arr;
1263 
1264  push @mid_ranges,[$mid_seq_region_id,$mid_seq_region,
1265  $mid_start,$mid_end];
1266  push @start_ranges, [ $id1, $start_start, $start_end ];
1267 
1268  #the region that we actually register may actually be larger or smaller
1269  #than the region that we wanted to register.
1270  #register the intersection of the region so we do not end up doing
1271  #extra work later
1272 
1273  if($start_start < $start || $start_end > $end) {
1274  $start_registry->check_and_register($id1,$start_start,
1275  $start_end);
1276  }
1277  }
1278  $sth->finish();
1279  }
1280  if($found){
1281  if( ! defined $mid_cs ) {
1282  for my $range ( @mid_ranges ) {
1283  $end_registry->check_and_register( $range->[0], $range->[2],
1284  $range->[3] );
1285  }
1286 
1287  # and thats it for the simple case ...
1288  return;
1289  }
1290  }
1291  }
1292 }
1293 
1294 
1295 =head2 register_all
1296 
1297  Arg [1] : Bio::EnsEMBL::AssemblyMapper $mapper
1298  Example : $mapper = $asm_mapper_adaptor->fetch_by_CoordSystems($cs1,$cs2);
1299 
1300  # make cache large enough to hold all of the mappings
1301  $mapper->max_pair_count(10e6);
1302  $asm_mapper_adaptor->register_all($mapper);
1303 
1304  # perform mappings as normal
1305  $mapper->map($slice->seq_region_name(), $sr_start, $sr_end,
1306  $sr_strand, $cs1);
1307  ...
1308  Description: This function registers the entire set of mappings between
1309  two coordinate systems in an assembly mapper.
1310  This will use a lot of memory but will be much more efficient
1311  when doing a lot of mapping which is spread over the entire
1312  genome.
1313  Returntype : none
1314  Exceptions : none
1315  Caller : specialised prograhsm
1316  Status : Stable
1317 
1318 =cut
1319 
1320 sub register_all {
1321  my $self = shift;
1322  my $mapper = shift;
1323 
1324  my $asm_cs_id = $mapper->assembled_CoordSystem()->dbID();
1325  my $cmp_cs_id = $mapper->component_CoordSystem()->dbID();
1326 
1327  # retrieve every relevant assembled/component pair from the assembly table
1328 
1329  my $q = qq{
1330  SELECT
1331  asm.cmp_start,
1332  asm.cmp_end,
1333  asm.cmp_seq_region_id,
1334  cmp_sr.name,
1335  cmp_sr.length,
1336  asm.ori,
1337  asm.asm_start,
1338  asm.asm_end,
1339  asm.asm_seq_region_id,
1340  asm_sr.name,
1341  asm_sr.length
1342  FROM
1343  assembly asm, seq_region asm_sr, seq_region cmp_sr
1344  WHERE
1345  asm.cmp_seq_region_id = cmp_sr.seq_region_id AND
1346  asm.asm_seq_region_id = asm_sr.seq_region_id AND
1347  cmp_sr.coord_system_id = ? AND
1348  asm_sr.coord_system_id = ?
1349  };
1350 
1351  my $sth = $self->prepare($q);
1352 
1353  $sth->bind_param(1,$cmp_cs_id,SQL_INTEGER);
1354  $sth->bind_param(2,$asm_cs_id,SQL_INTEGER);
1355  $sth->execute();
1356 
1357  # load the mapper with the assembly information
1358 
1359  my ($cmp_start, $cmp_end, $cmp_seq_region_id, $cmp_seq_region, $cmp_length,
1360  $ori,
1361  $asm_start, $asm_end, $asm_seq_region_id, $asm_seq_region, $asm_length);
1362 
1363  $sth->bind_columns(\$cmp_start, \$cmp_end, \$cmp_seq_region_id,
1364  \$cmp_seq_region, \$cmp_length, \$ori,
1365  \$asm_start, \$asm_end, \$asm_seq_region_id,
1366  \$asm_seq_region, \$asm_length);
1367 
1368  my %asm_registered;
1369 
1370  while($sth->fetch()) {
1371  $mapper->register_component($cmp_seq_region_id);
1372  $mapper->mapper->add_map_coordinates(
1373  $asm_seq_region_id, $asm_start, $asm_end,
1374  $ori,
1375  $cmp_seq_region_id, $cmp_start, $cmp_end);
1376 
1377  my $arr = [$cmp_seq_region_id, $cmp_seq_region, $cmp_cs_id, $cmp_length];
1378 
1379  $self->{'sr_name_cache'}->{"$cmp_seq_region:$cmp_cs_id"} = $arr;
1380  $self->{'sr_id_cache'}->{"$cmp_seq_region_id"} = $arr;
1381 
1382  # only register each asm seq_region once since it requires some work
1383  if(!$asm_registered{$asm_seq_region_id}) {
1384  $asm_registered{$asm_seq_region_id} = 1;
1385 
1386  # register all chunks from start of seq region to end
1387  my $end_chunk = $asm_length >> $CHUNKFACTOR;
1388  for(my $i = 0; $i <= $end_chunk; $i++) {
1389  $mapper->register_assembled($asm_seq_region_id, $i);
1390  }
1391 
1392  $arr = [$asm_seq_region_id, $asm_seq_region, $asm_cs_id, $asm_length];
1393 
1394  $self->{'sr_name_cache'}->{"$asm_seq_region:$asm_cs_id"} = $arr;
1395  $self->{'sr_id_cache'}->{"$asm_seq_region_id"} = $arr;
1396  }
1397  }
1398 
1399  $sth->finish();
1400 
1401  return;
1402 }
1403 
1404 
1405 
1406 =head2 register_all_chained
1407 
1408  Arg [1] : Bio::EnsEMBL::ChainedAssemblyMapper $casm_mapper
1409  Example : $mapper = $asm_mapper_adaptor->fetch_by_CoordSystems($cs1,$cs2);
1410 
1411  # make the cache large enough to hold all of the mappings
1412  $mapper->max_pair_count(10e6);
1413  # load all of the mapping data
1414  $asm_mapper_adaptor->register_all_chained($mapper);
1415 
1416  # perform mappings as normal
1417  $mapper->map($slice->seq_region_name(), $sr_start, $sr_end,
1418  $sr_strand, $cs1);
1419  ...
1420  Description: This function registers the entire set of mappings between
1421  two coordinate systems in a chained mapper. This will use a lot
1422  of memory but will be much more efficient when doing a lot of
1423  mapping which is spread over the entire genome.
1424  Returntype : none
1425  Exceptions : throw if mapper is between coord systems with unexpected
1426  mapping paths
1427  Caller : specialised programs doing a lot of genome-wide mapping
1428  Status : Stable
1429 
1430 =cut
1431 
1432 sub register_all_chained {
1433  my $self = shift;
1434  my $casm_mapper = shift;
1435 
1436  my $first_cs = $casm_mapper->first_CoordSystem();
1437  my $mid_cs = $casm_mapper->middle_CoordSystem();
1438  my $last_cs = $casm_mapper->last_CoordSystem();
1439 
1440  my $start_mid_mapper = $casm_mapper->first_middle_mapper();
1441  my $end_mid_mapper = $casm_mapper->last_middle_mapper();
1442  my $combined_mapper = $casm_mapper->first_last_mapper();
1443 
1444  my @ranges;
1445 
1446  my $sth = $self->prepare(
1447  'SELECT
1448  asm.cmp_start,
1449  asm.cmp_end,
1450  asm.cmp_seq_region_id,
1451  sr_cmp.name,
1452  sr_cmp.length,
1453  asm.ori,
1454  asm.asm_start,
1455  asm.asm_end,
1456  asm.asm_seq_region_id,
1457  sr_asm.name,
1458  sr_asm.length
1459  FROM
1460  assembly asm, seq_region sr_asm, seq_region sr_cmp
1461  WHERE
1462  sr_asm.seq_region_id = asm.asm_seq_region_id AND
1463  sr_cmp.seq_region_id = asm.cmp_seq_region_id AND
1464  sr_asm.coord_system_id = ? AND
1465  sr_cmp.coord_system_id = ?');
1466 
1467  my $csa = $self->db()->get_CoordSystemAdaptor();
1468 
1469  my @path;
1470 
1471  if ( !defined $mid_cs ) {
1472  @path = @{ $csa->get_mapping_path( $first_cs, $last_cs ) };
1473  if ( !defined( $path[1] ) ) {
1474  splice( @path, 1, 1 );
1475  }
1476  } else {
1477  @path = @{ $csa->get_mapping_path( $first_cs, $mid_cs ) };
1478  # fix for when we have something like supercontig#contig#chromosome
1479  if ( !defined( $path[1] ) ) {
1480  splice( @path, 1, 1 );
1481  }
1482  }
1483 
1484  if ( @path != 2 ) {
1485  my $path =
1486  join( ',', map( { $_->name . ' ' . $_->version } @path ) );
1487  my $len = scalar(@path) - 1;
1488  throw( "Unexpected mapping path between start and intermediate "
1489  . "coord systems ("
1490  . $first_cs->name . " "
1491  . $first_cs->version . " and "
1492  . $mid_cs->name . " "
1493  . $mid_cs->version . ")."
1494  . "\nExpected path length 1, got $len. "
1495  . "(path=$path)" );
1496  }
1497 
1498  my ($asm_cs,$cmp_cs) = @path;
1499 
1500  $sth->{mysql_use_result} = 1;
1501  $sth->bind_param(1,$asm_cs->dbID,SQL_INTEGER);
1502  $sth->bind_param(2,$cmp_cs->dbID,SQL_INTEGER);
1503  $sth->execute();
1504 
1505 
1506  my ($mid_start, $mid_end, $mid_seq_region_id, $mid_seq_region, $mid_length,
1507  $ori, $start_start, $start_end, $start_seq_region_id, $start_seq_region,
1508  $start_length);
1509 
1510  if($asm_cs->equals($first_cs)) {
1511  $sth->bind_columns(\$mid_start, \$mid_end, \$mid_seq_region_id,
1512  \$mid_seq_region, \$mid_length, \$ori, \$start_start,
1513  \$start_end, \$start_seq_region_id, \$start_seq_region,
1514  \$start_length);
1515  } else {
1516  $sth->bind_columns(\$start_start, \$start_end, \$start_seq_region_id,
1517  \$start_seq_region, \$start_length, \$ori,
1518  \$mid_start, \$mid_end, \$mid_seq_region_id,
1519  \$mid_seq_region, \$mid_length);
1520 
1521  }
1522 
1523  my ( $mid_cs_id, $start_cs_id, $registry, $mapper );
1524  if ( !defined $mid_cs ) {
1525  $mid_cs_id = $last_cs->dbID();
1526  $start_cs_id = $first_cs->dbID();
1527  $mapper = $combined_mapper;
1528  } else {
1529  $mid_cs_id = $mid_cs->dbID();
1530  $start_cs_id = $first_cs->dbID();
1531  $mapper = $start_mid_mapper;
1532  }
1533 
1534  $registry = $casm_mapper->first_registry();
1535 
1536  while($sth->fetch()) {
1537  $mapper->add_map_coordinates
1538  (
1539  $start_seq_region_id, $start_start, $start_end, $ori,
1540  $mid_seq_region_id, $mid_start, $mid_end
1541  );
1542  push( @ranges, [$start_seq_region_id, $start_start, $start_end ] );
1543 
1544  $registry->check_and_register( $start_seq_region_id, 1, $start_length );
1545  if( ! defined $mid_cs ) {
1546  $casm_mapper->last_registry()->check_and_register
1547  ( $mid_seq_region_id, $mid_start, $mid_end );
1548  }
1549 
1550  my $arr = [ $mid_seq_region_id, $mid_seq_region,
1551  $mid_cs_id, $mid_length ];
1552 
1553  $self->{'sr_name_cache'}->{"$mid_seq_region:$mid_cs_id"} = $arr;
1554  $self->{'sr_id_cache'}->{"$mid_seq_region_id"} = $arr;
1555 
1556  $arr = [ $start_seq_region_id, $start_seq_region,
1557  $start_cs_id, $start_length ];
1558 
1559  $self->{'sr_name_cache'}->{"$start_seq_region:$start_cs_id"} = $arr;
1560  $self->{'sr_id_cache'}->{"$start_seq_region_id"} = $arr;
1561  }
1562 
1563  if( ! defined $mid_cs ) {
1564  # thats it for the simple case
1565  return;
1566  }
1567 
1568 
1569  @path = @{ $csa->get_mapping_path( $last_cs, $mid_cs ) };
1570  if ( defined($mid_cs) ) {
1571  if ( !defined( $path[1] ) ) {
1572  splice( @path, 1, 1 );
1573  }
1574  }
1575 
1576  if ( @path != 2 ) {
1577  my $path =
1578  join( ',', map( { $_->name . ' ' . $_->version } @path ) );
1579  my $len = scalar(@path) - 1;
1580  throw( "Unexpected mapping path between intermediate and last "
1581  . "coord systems ("
1582  . $last_cs->name . " "
1583  . $last_cs->version . " and "
1584  . $mid_cs->name . " "
1585  . $mid_cs->version . ")."
1586  . "\nExpected path length 1, got $len. "
1587  . "(path=$path)" );
1588  }
1589 
1590  ($asm_cs,$cmp_cs) = @path;
1591 
1592  $sth->bind_param(1,$asm_cs->dbID,SQL_INTEGER);
1593  $sth->bind_param(2,$cmp_cs->dbID,SQL_INTEGER);
1594  $sth->execute();
1595 
1596 
1597  my ($end_start, $end_end, $end_seq_region_id, $end_seq_region,
1598  $end_length);
1599 
1600  if($asm_cs->equals($mid_cs)) {
1601  $sth->bind_columns(\$end_start, \$end_end, \$end_seq_region_id,
1602  \$end_seq_region, \$end_length, \$ori,
1603  \$mid_start, \$mid_end, \$mid_seq_region_id,
1604  \$mid_seq_region, \$mid_length);
1605  } else {
1606  $sth->bind_columns(\$mid_start, \$mid_end, \$mid_seq_region_id,
1607  \$mid_seq_region, \$mid_length, \$ori, \$end_start,
1608  \$end_end, \$end_seq_region_id, \$end_seq_region,
1609  \$end_length);
1610  }
1611 
1612  my $end_cs_id = $last_cs->dbID();
1613  $registry = $casm_mapper->last_registry();
1614 
1615  while($sth->fetch()) {
1616  $end_mid_mapper->add_map_coordinates
1617  (
1618  $end_seq_region_id, $end_start, $end_end, $ori,
1619  $mid_seq_region_id, $mid_start, $mid_end
1620  );
1621 
1622  $registry->check_and_register( $end_seq_region_id, 1, $end_length );
1623 
1624  my $arr = [ $end_seq_region_id, $end_seq_region,
1625  $end_cs_id, $end_length ];
1626  $self->{'sr_name_cache'}->{"$end_seq_region:$end_cs_id"} = $arr;
1627  $self->{'sr_id_cache'}->{"$end_seq_region_id"} = $arr;
1628  }
1629 
1630  _build_combined_mapper( \@ranges, $start_mid_mapper, $end_mid_mapper,
1631  $combined_mapper, "first" );
1632 
1633  return;
1634 }
1635 
1636 
1637 
1638 # after both halves of a chained mapper are loaded
1639 # this function maps all ranges in $ranges and loads the
1640 # results into the combined mapper
1641 sub _build_combined_mapper {
1642  my $ranges = shift;
1643  my $start_mid_mapper = shift;
1644  my $end_mid_mapper = shift;
1645  my $combined_mapper = shift;
1646  my $start_name = shift;
1647 
1648  my $mid_name = "middle";
1649 
1650  foreach my $range (@$ranges) {
1651  my ( $seq_region_id, $start, $end) = @$range;
1652 
1653  my $sum = 0;
1654 
1655  my @initial_coords = $start_mid_mapper->map_coordinates($seq_region_id,
1656  $start,$end,1,
1657  $start_name);
1658 
1659  foreach my $icoord (@initial_coords) {
1660  #skip gaps
1661  if($icoord->isa('Bio::EnsEMBL::Mapper::Gap')) {
1662  $sum += $icoord->length();
1663  next;
1664  }
1665 
1666 
1667  #feed the results of the first mapping into the second mapper
1668  my @final_coords =
1669  $end_mid_mapper->map_coordinates($icoord->id, $icoord->start,
1670  $icoord->end,
1671  $icoord->strand, $mid_name);
1672 
1673 
1674  foreach my $fcoord (@final_coords) {
1675  #load up the final mapper
1676 
1677  if($fcoord->isa('Bio::EnsEMBL::Mapper::Coordinate')) {
1678  my $total_start = $start + $sum;
1679  my $total_end = $total_start + $fcoord->length - 1;
1680  my $ori = $fcoord->strand();
1681 
1682  if($start_name eq 'first') { # add coords in consistant order
1683  $combined_mapper->add_map_coordinates(
1684  $seq_region_id, $total_start, $total_end, $ori,
1685  $fcoord->id(), $fcoord->start(), $fcoord->end());
1686  } else {
1687  $combined_mapper->add_map_coordinates(
1688  $fcoord->id(), $fcoord->start(), $fcoord->end(),$ori,
1689  $seq_region_id, $total_start, $total_end);
1690  }
1691 
1692  }
1693  $sum += $fcoord->length();
1694  }
1695  }
1696  }
1697  #all done!
1698 }
1699 
1700 
1701 =head2 seq_regions_to_ids
1702 
1703  Arg [1] : Bio::EnsEMBL::CoordSystem $coord_system
1704  Arg [2] : listref of strings $seq_regions
1705  Example : my @ids = @{$asma->seq_regions_to_ids($coord_sys, \@seq_regs)};
1706  Description: Converts a list of seq_region names to internal identifiers
1707  using the internal cache that has accumulated while registering
1708  regions for AssemblyMappers. If any requested regions are
1709  not found in the cache an attempt is made to retrieve them
1710  from the database.
1711  Returntype : listref of ints
1712  Exceptions : throw if a non-existant seqregion is provided
1713  Caller : general
1714  Status : Stable
1715 
1716 =cut
1717 
1718 sub seq_regions_to_ids {
1719  my $self = shift;
1720  my $coord_system = shift;
1721  my $seq_regions = shift;
1722 
1723  my $cs_id = $coord_system->dbID();
1724 
1725  my @out;
1726 
1727  foreach my $sr (@$seq_regions) {
1728  my $arr = $self->{'sr_name_cache'}->{"$sr:$cs_id"};
1729  if( $arr ) {
1730  push( @out, $arr->[0] );
1731  } else {
1732  push @out, $self->_seq_region_name_to_id($sr,$cs_id);
1733  }
1734  }
1735 
1736  return \@out;
1737 }
1738 
1739 
1740 =head2 seq_ids_to_regions
1741 
1742  Arg [1] : listref of seq_region ids
1743  Example : my @ids = @{$asma->ids_to_seq_regions(\@seq_ids)};
1744  Description: Converts a list of seq_region ids to seq region names
1745  using the internal cache that has accumulated while registering
1746  regions for AssemblyMappers. If any requested regions are
1747  not found in the cache an attempt is made to retrieve them
1748  from the database.
1749  Returntype : listref of strings
1750  Exceptions : throw if a non-existant seq_region_id is provided
1751  Caller : general
1752  Status : Stable
1753 
1754 =cut
1755 
1756 sub seq_ids_to_regions {
1757  my $self = shift;
1758  my $seq_region_ids = shift;
1759 
1760  my @out;
1761 
1762  foreach my $sr (@$seq_region_ids) {
1763  my $arr = $self->{'sr_id_cache'}->{"$sr"};
1764  if( $arr ) {
1765  push( @out, $arr->[1] );
1766  } else {
1767  push @out, $self->_seq_region_id_to_name($sr);
1768  }
1769  }
1770 
1771  return \@out;
1772 }
1773 
1774 =head2 delete_cache
1775 
1776  Description: Delete all the caches for the mappings/seq_regions
1777  Returntype : none
1778  Exceptions : none
1779  Caller : General
1780  Status : At risk
1781 
1782 =cut
1783 
1784 sub delete_cache{
1785  my ($self) = @_;
1786 
1787  %{$self->{'sr_name_cache'}} = ();
1788  %{$self->{'sr_id_cache'}} = ();
1789 
1790  foreach my $key (keys %{$self->{'_asm_mapper_cache'}}){
1791  $self->{'_asm_mapper_cache'}->{$key}->flush();
1792  }
1793  %{$self->{'_asm_mapper_cache'}} = ();
1794  return;
1795 }
1796 
1797 
1798 1;
Bio::EnsEMBL::Registry::get_adaptor
public Adaptor get_adaptor()
Bio::EnsEMBL::Storable::dbID
public Int dbID()
Bio::EnsEMBL::ChainedAssemblyMapper::max_pair_count
public Int max_pair_count()
EnsEMBL
Definition: Filter.pm:1
Bio::EnsEMBL::DBSQL::DBAdaptor
Definition: DBAdaptor.pm:40
Bio::EnsEMBL::AssemblyMapper::component_CoordSystem
public Bio::EnsEMBL::CoordSystem component_CoordSystem()
Bio::EnsEMBL::AssemblyMapper
Definition: AssemblyMapper.pm:49
map
public map()
AssemblyMapper
Definition: BlastzAligner.pm:7
Bio::EnsEMBL::AssemblyMapper::max_pair_count
public Int max_pair_count()
Bio::EnsEMBL::ChainedAssemblyMapper
Definition: ChainedAssemblyMapper.pm:54
Bio::EnsEMBL::CoordSystem
Definition: CoordSystem.pm:40
Bio::EnsEMBL::Registry
Definition: Registry.pm:113
Bio::EnsEMBL::Utils::SeqRegionCache
Definition: SeqRegionCache.pm:40
Bio::EnsEMBL::DBSQL::AssemblyMapperAdaptor
Definition: AssemblyMapperAdaptor.pm:58
Bio::EnsEMBL::DBSQL::DBAdaptor::get_SeqRegionCache
public Bio::EnsEMBL::Utils::SeqRegionCache get_SeqRegionCache()
Bio::EnsEMBL::TopLevelAssemblyMapper
Definition: TopLevelAssemblyMapper.pm:40
Bio::EnsEMBL::DBSQL::BaseAdaptor
Definition: BaseAdaptor.pm:71
Bio::EnsEMBL::AssemblyMapper::new
public Bio::EnsEMBL::DBSQL::AssemblyMapperAdaptor new()
Bio::EnsEMBL::Registry::load_registry_from_db
public Int load_registry_from_db()
Bio::EnsEMBL::AssemblyMapper::assembled_CoordSystem
public Bio::EnsEMBL::CoordSystem assembled_CoordSystem()
Bio::EnsEMBL::TopLevelAssemblyMapper::new
public Bio::EnsEMBL::DBSQL::TopLevelAssemblyMapper new()
Bio
Definition: AltAlleleGroup.pm:4
Bio::EnsEMBL::Utils::Exception
Definition: Exception.pm:68
Bio::EnsEMBL::ChainedAssemblyMapper::new
public Bio::EnsEMBL::DBSQL::AssemblyMapperAdaptor new()