ensembl-hive  2.8.1
AssemblyMapperAdaptor.pm
Go to the documentation of this file.
1 =head1 LICENSE
2 
3 See the NOTICE file distributed with this work for additional information
4 regarding copyright ownership.
5 
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9 
10  http://www.apache.org/licenses/LICENSE-2.0
11 
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 
18 =cut
19 
20 
21 =head1 CONTACT
22 
23  Please email comments or questions to the public Ensembl
24  developers list at <http://lists.ensembl.org/mailman/listinfo/dev>.
25 
26  Questions may also be sent to the Ensembl help desk at
27  <http://www.ensembl.org/Help/Contact>.
28 
29 =cut
30 
31 =head1 NAME
32 
34 
35 =head1 SYNOPSIS
36 
38 
40  -host => 'ensembldb.ensembl.org',
41  -user => 'anonymous'
42  );
43 
44  $asma = Bio::EnsEMBL::Registry->get_adaptor( "human", "core",
45  "assemblymapper" );
46 
47  $csa = Bio::EnsEMBL::Registry->get_adaptor( "human", "core",
48  "coordsystem" );
49 
50  my $chr33_cs = $csa->fetch_by_name( 'chromosome', 'NCBI33' );
51  my $chr34_cs = $csa->fetch_by_name( 'chromosome', 'NCBI34' );
52  my $ctg_cs = $csa->fetch_by_name('contig');
53  my $clone_cs = $csa->fetch_by_name('clone');
54 
55  my $chr_ctg_mapper =
56  $asma->fetch_by_CoordSystems( $chr33_cs, $ctg_cs );
57 
58  my $ncbi33_ncbi34_mapper =
59  $asm_adptr->fetch_by_CoordSystems( $chr33, $chr34 );
60 
61  my $ctg_clone_mapper =
62  $asm_adptr->fetch_by_CoordSystems( $ctg_cs, $clone_cs );
63 
64 
65 =head1 DESCRIPTION
66 
67 Adaptor for handling Assembly mappers. This is a I<Singleton> class.
68 ie: There is only one per database (C<DBAdaptor>).
69 
70 This is used to retrieve mappers between any two coordinate systems
71 whose makeup is described by the assembly table. Currently one step
72 (explicit) and two step (implicit) pairwise mapping is supported. In
73 one-step mapping an explicit relationship between the coordinate systems
74 is defined in the assembly table. In two-step 'chained' mapping no
75 explicit mapping is present but the coordinate systems must share a
76 common mapping to an intermediate coordinate system.
77 
78 =head1 METHODS
79 
80 =cut
81 
82 package Bio::EnsEMBL::DBSQL::AssemblyMapperAdaptor;
83 
84 use vars qw(@ISA);
85 use strict;
86 
87 use Bio::EnsEMBL::DBSQL::BaseAdaptor;
91 
92 use Bio::EnsEMBL::Utils::Cache; #CPAN LRU cache
93 use Bio::EnsEMBL::Utils::Exception qw(throw);
95 
96 use integer; #do proper arithmetic bitshifts
97 
99 
100 
101 my $CHUNKFACTOR = 20; # 2^20 = approx. 10^6
102 
103 =head2 new
104 
105  Arg [1] : Bio::EnsEMBL::DBAdaptor $dbadaptor the adaptor for
106  the database this assembly mapper is using.
107  Example : my $asma = new Bio::EnsEMBL::AssemblyMapperAdaptor($dbadaptor);
108  Description: Creates a new AssemblyMapperAdaptor object
110  Exceptions : none
112  Status : Stable
113 
114 =cut
115 
116 sub new {
117  my($class, $dbadaptor) = @_;
118 
119  my $self = $class->SUPER::new($dbadaptor);
120 
121  $self->{'_asm_mapper_cache'} = {};
122 
123  # use a shared cache (for this database) that contains info about
124  # seq regions
125  my $seq_region_cache = $self->db->get_SeqRegionCache();
126  $self->{'sr_name_cache'} = $seq_region_cache->{'name_cache'};
127  $self->{'sr_id_cache'} = $seq_region_cache->{'id_cache'};
128 
129  return $self;
130 }
131 
132 
133 
134 =head2 cache_seq_ids_with_mult_assemblys
135 
136  Example : $self->adaptor->cache_seq_ids_with_mult_assemblys();
137  Description: Creates a hash of the component seq region ids that
138  map to more than one assembly from the assembly table.
139  Retruntype : none
140  Exceptions : none
141  Caller : AssemblyMapper, ChainedAssemblyMapper
142  Status : At Risk
143 
144 =cut
145 
146 sub cache_seq_ids_with_mult_assemblys{
147  my $self = shift;
148  my %multis;
149 
150  return if (defined($self->{'multi_seq_ids'}));
151 
152  $self->{'multi_seq_ids'} = {};
153 
154  my $sql = qq(
155  SELECT sra.seq_region_id
156  FROM seq_region_attrib sra,
157  attrib_type at,
158  seq_region sr,
159  coord_system cs
160  WHERE sra.attrib_type_id = at.attrib_type_id
161  AND code = "MultAssem"
162  AND sra.seq_region_id = sr.seq_region_id
163  AND sr.coord_system_id = cs.coord_system_id
164  AND cs.species_id = ?);
165 
166  my $sth = $self->prepare($sql);
167 
168  $sth->bind_param( 1, $self->species_id(), SQL_INTEGER );
169 
170  $sth->execute();
171 
172  my ($seq_region_id);
173 
174  $sth->bind_columns(\$seq_region_id);
175 
176  while($sth->fetch()) {
177  $self->{'multi_seq_ids'}->{$seq_region_id} = 1;
178  }
179  $sth->finish;
180 }
181 
182 
183 =head2 fetch_by_CoordSystems
184 
185  Arg [1] : Bio::EnsEMBL::CoordSystem $cs1
186  One of the coordinate systems to retrieve the mapper
187  between
188  Arg [2] : Bio::EnsEMBL::CoordSystem $cs2
189  The other coordinate system to map between
190  Description: Retrieves an Assembly mapper for two coordinate
191  systems whose relationship is described in the
192  assembly table.
193 
194  The ordering of the coodinate systems is arbitrary.
195  The following two statements are equivalent:
196  $mapper = $asma->fetch_by_CoordSystems($cs1,$cs2);
197  $mapper = $asma->fetch_by_CoordSystems($cs2,$cs1);
198  Returntype : Bio::EnsEMBL::AssemblyMapper
199  Exceptions : wrong argument types
200  Caller : general
201  Status : Stable
202 
203 =cut
204 
205 sub fetch_by_CoordSystems {
206  my $self = shift;
207  my $cs1 = shift;
208  my $cs2 = shift;
209 
210  if(!ref($cs1) || !$cs1->isa('Bio::EnsEMBL::CoordSystem')) {
211  throw("cs1 argument must be a Bio::EnsEMBL::CoordSystem.");
212  }
213  if(!ref($cs2) || !$cs2->isa('Bio::EnsEMBL::CoordSystem')) {
214  throw("cs2 argument must be a Bio::EnsEMBL::CoordSystem.");
215  }
216 
217 # if($cs1->equals($cs2)) {
218 # throw("Cannot create mapper between same coord systems: " .
219 # $cs1->name . " " . $cs1->version);
220 # }
221 
222  if($cs1->is_top_level()) {
223  return Bio::EnsEMBL::TopLevelAssemblyMapper->new($self, $cs1, $cs2);
224  }
225  if($cs2->is_top_level()) {
226  return Bio::EnsEMBL::TopLevelAssemblyMapper->new($self, $cs2, $cs1);
227  }
228 
229  my $csa = $self->db->get_CoordSystemAdaptor();
230 
231  #retrieve the shortest possible mapping path between these systems
232  my @mapping_path = @{$csa->get_mapping_path($cs1,$cs2)};
233 
234  if(!@mapping_path) {
235 
236  return undef;
237  }
238 
239  my $key = join(':', map({defined($_)?$_->dbID():"-"} @mapping_path));
240 
241  my $asm_mapper = $self->{'_asm_mapper_cache'}->{$key};
242 
243  return $asm_mapper if($asm_mapper);
244 
245  if(@mapping_path == 1) {
246  throw("Incorrect mapping path defined in meta table. " .
247  "0 step mapping encountered between:\n" .
248  $cs1->name() . " " . $cs1->version() . " and " . $cs2->name . " " .
249  $cs2->version());
250  }
251 
252  if(@mapping_path == 2) {
253  #1 step regular mapping
254  $asm_mapper = Bio::EnsEMBL::AssemblyMapper->new($self, @mapping_path);
255 
256 # If you want multiple pieces on two seqRegions to map to each other
257 # you need to make an assembly.mapping entry that is seperated with a #
258 # instead of an |.
259 
260  $self->{'_asm_mapper_cache'}->{$key} = $asm_mapper;
261  return $asm_mapper;
262  }
263 
264  if(@mapping_path == 3) {
265  #two step chained mapping
266  $asm_mapper = Bio::EnsEMBL::ChainedAssemblyMapper->new($self,@mapping_path);
267  #in multi-step mapping it is possible get requests with the
268  #coordinate system ordering reversed since both mappings directions
269  #cache on both orderings just in case
270  #e.g. chr <-> contig <-> clone and clone <-> contig <-> chr
271 
272  $self->{'_asm_mapper_cache'}->{$key} = $asm_mapper;
273  $key = join(':', map({defined($_)?$_->dbID():"-"} reverse(@mapping_path)));
274  $self->{'_asm_mapper_cache'}->{$key} = $asm_mapper;
275  return $asm_mapper;
276  }
277 
278  throw("Only 1 and 2 step coordinate system mapping is currently\n" .
279  "supported. Mapping between " .
280  $cs1->name() . " " . $cs1->version() . " and " .
281  $cs2->name() . " " . $cs2->version() .
282  " requires ". (scalar(@mapping_path)-1) . " steps.");
283 }
284 
285 
286 
287 =head2 register_assembled
288 
289  Arg [1] : Bio::EnsEMBL::AssemblyMapper $asm_mapper
290  A valid AssemblyMapper object
291  Arg [2] : integer $asm_seq_region
292  The dbID of the seq_region to be registered
293  Arg [3] : int $asm_start
294  The start of the region to be registered
295  Arg [4] : int $asm_end
296  The end of the region to be registered
297  Description: Declares an assembled region to the AssemblyMapper.
298  This extracts the relevant data from the assembly
299  table and stores it in Mapper internal to the $asm_mapper.
300  It therefore must be called before any mapping is
301  attempted on that region. Otherwise only gaps will
302  be returned. Note that the AssemblyMapper automatically
303  calls this method when the need arises.
304  Returntype : none
305  Exceptions : throw if the seq_region to be registered does not exist
306  or if it associated with multiple assembled pieces (bad data
307  in assembly table)
309  Status : Stable
310 
311 =cut
312 
313 
314 sub register_assembled {
315  my $self = shift;
316  my $asm_mapper = shift;
317  my $asm_seq_region = shift;
318  my $asm_start = shift;
319  my $asm_end = shift;
320 
321  if(!ref($asm_mapper) || !$asm_mapper->isa('Bio::EnsEMBL::AssemblyMapper')) {
322  throw("Bio::EnsEMBL::AssemblyMapper argument expected");
323  }
324 
325  throw("asm_seq_region argument expected") if(!defined($asm_seq_region));
326  throw("asm_start argument expected") if(!defined($asm_start));
327  throw("asm_end argument expected") if(!defined($asm_end));
328 
329  my $asm_cs_id = $asm_mapper->assembled_CoordSystem->dbID();
330  my $cmp_cs_id = $asm_mapper->component_CoordSystem->dbID();
331 
332  #split up the region to be registered into fixed chunks
333  #this allows us to keep track of regions that have already been
334  #registered and also works under the assumption that if a small region
335  #is requested it is likely that other requests will be made in the
336  #vicinity (the minimum size registered the chunksize (2^chunkfactor)
337 
338  my @chunk_regions;
339  #determine span of chunks
340  #bitwise shift right is fast and easy integer division
341 
342  my($start_chunk, $end_chunk);
343 
344  $start_chunk = $asm_start >> $CHUNKFACTOR;
345  $end_chunk = $asm_end >> $CHUNKFACTOR;
346 
347  # inserts have start = end + 1, on boundary condition start_chunk
348  # could be less than end chunk
349  if($asm_start == $asm_end + 1) {
350  ($start_chunk, $end_chunk) = ($end_chunk, $start_chunk);
351  }
352 
353  #find regions of continuous unregistered chunks
354  my $i;
355  my ($begin_chunk_region,$end_chunk_region);
356  for ($i = $start_chunk; $i <= $end_chunk; $i++) {
357  if($asm_mapper->have_registered_assembled($asm_seq_region, $i)) {
358  if(defined($begin_chunk_region)) {
359  #this is the end of an unregistered region.
360  my $region = [($begin_chunk_region << $CHUNKFACTOR),
361  (($end_chunk_region+1) << $CHUNKFACTOR)-1];
362  push @chunk_regions, $region;
363  $begin_chunk_region = $end_chunk_region = undef;
364  }
365  } else {
366  $begin_chunk_region = $i if(!defined($begin_chunk_region));
367  $end_chunk_region = $i+1;
368  $asm_mapper->register_assembled($asm_seq_region,$i);
369  }
370  }
371 
372  #the last part may have been an unregistered region too
373  if(defined($begin_chunk_region)) {
374  my $region = [($begin_chunk_region << $CHUNKFACTOR),
375  (($end_chunk_region+1) << $CHUNKFACTOR) -1];
376  push @chunk_regions, $region;
377  }
378 
379  return if(!@chunk_regions);
380 
381  # keep the Mapper to a reasonable size
382  if( $asm_mapper->size() > $asm_mapper->max_pair_count() ) {
383  $asm_mapper->flush();
384  #we now have to go and register the entire requested region since we
385  #just flushed everything
386 
387  @chunk_regions = ( [ ( $start_chunk << $CHUNKFACTOR)
388  , (($end_chunk+1) << $CHUNKFACTOR)-1 ] );
389 
390  for( my $i = $start_chunk; $i <= $end_chunk; $i++ ) {
391  $asm_mapper->register_assembled( $asm_seq_region, $i );
392  }
393  }
394 
395 # my $asm_seq_region_id =
396 # $self->_seq_region_name_to_id($asm_seq_region,$asm_cs_id);
397 
398  # Retrieve the description of how the assembled region is made from
399  # component regions for each of the continuous blocks of unregistered,
400  # chunked regions
401 
402  my $q = qq{
403  SELECT
404  asm.cmp_start,
405  asm.cmp_end,
406  asm.cmp_seq_region_id,
407  sr.name,
408  sr.length,
409  asm.ori,
410  asm.asm_start,
411  asm.asm_end
412  FROM
413  assembly asm, seq_region sr
414  WHERE
415  asm.asm_seq_region_id = ? AND
416  ? <= asm.asm_end AND
417  ? >= asm.asm_start AND
418  asm.cmp_seq_region_id = sr.seq_region_id AND
419  sr.coord_system_id = ?
420  };
421 
422  my $sth = $self->prepare($q);
423 
424  foreach my $region (@chunk_regions) {
425  my($region_start, $region_end) = @$region;
426  $sth->bind_param(1,$asm_seq_region,SQL_INTEGER);
427  $sth->bind_param(2,$region_start,SQL_INTEGER);
428  $sth->bind_param(3,$region_end,SQL_INTEGER);
429  $sth->bind_param(4,$cmp_cs_id,SQL_INTEGER);
430 
431  $sth->execute();
432 
433  my($cmp_start, $cmp_end, $cmp_seq_region_id, $cmp_seq_region, $ori,
434  $asm_start, $asm_end, $cmp_seq_region_length);
435 
436  $sth->bind_columns(\$cmp_start, \$cmp_end, \$cmp_seq_region_id,
437  \$cmp_seq_region, \$cmp_seq_region_length, \$ori,
438  \$asm_start, \$asm_end);
439 
440  #
441  # Load the unregistered regions of the mapper
442  #
443  while($sth->fetch()) {
444  next if($asm_mapper->have_registered_component($cmp_seq_region_id)
445  and !defined($self->{'multi_seq_ids'}->{$cmp_seq_region_id}));
446  $asm_mapper->register_component($cmp_seq_region_id);
447  $asm_mapper->mapper->add_map_coordinates(
448  $asm_seq_region, $asm_start, $asm_end,
449  $ori,
450  $cmp_seq_region_id, $cmp_start, $cmp_end);
451 
452  my $arr = [ $cmp_seq_region_id, $cmp_seq_region,
453  $cmp_cs_id, $cmp_seq_region_length ];
454 
455  $self->{'sr_name_cache'}->{"$cmp_seq_region:$cmp_cs_id"} = $arr;
456  $self->{'sr_id_cache'}->{"$cmp_seq_region_id"} = $arr;
457  }
458  }
459 
460  $sth->finish();
461 }
462 
463 
464 
465 sub _seq_region_name_to_id {
466  my $self = shift;
467  my $sr_name = shift;
468  my $cs_id = shift;
469 
470  if(!defined($sr_name) or
471  !defined($cs_id)){
472  throw('seq_region_name and coord_system_id args are required');
473  }
474 
475  my $arr = $self->{'sr_name_cache'}->{"$sr_name:$cs_id"};
476  if( $arr ) {
477  return $arr->[0];
478  }
479 
480  # Get the seq_region_id via the name. This would be quicker if we just
481  # used internal ids instead but stored but then we lose the ability
482  # the transform accross databases with different internal ids
483 
484  my $sth = $self->prepare("SELECT seq_region_id, length " .
485  "FROM seq_region " .
486  "WHERE name = ? AND coord_system_id = ?");
487 
488  $sth->bind_param(1,$sr_name,SQL_VARCHAR);
489  $sth->bind_param(2,$cs_id,SQL_INTEGER);
490  $sth->execute();
491 
492  my @row = $sth->fetchrow_array();
493  unless ( @row ) {
494  throw("No-existent seq_region [$sr_name] in coord system $cs_id");
495  }
496  my @more = $sth->fetchrow_array();
497  if ( @more ) {
498  throw("Ambiguous seq_region [$sr_name] in coord system $cs_id");
499  }
500 
501  my ($sr_id, $sr_length) = @row;
502  $sth->finish();
503 
504  $arr = [ $sr_id, $sr_name, $cs_id, $sr_length ];
505 
506  $self->{'sr_name_cache'}->{"$sr_name:$cs_id"} = $arr;
507  $self->{'sr_id_cache'}->{"$sr_id"} = $arr;
508 
509  return $sr_id;
510 }
511 
512 sub _seq_region_id_to_name {
513  my $self = shift;
514  my $sr_id = shift;
515 
516  ($sr_id) || throw('seq_region_is required');
517 
518  my $arr = $self->{'sr_id_cache'}->{"$sr_id"};
519  if( $arr ) {
520  return $arr->[1];
521  }
522 
523  # Get the seq_region name via the id. This would be quicker if we just
524  # used internal ids instead but stored but then we lose the ability
525  # the transform accross databases with different internal ids
526 
527  my $sth = $self->prepare("SELECT name, length ,coord_system_id " .
528  "FROM seq_region " .
529  "WHERE seq_region_id = ? ");
530 
531  $sth->bind_param(1,$sr_id,SQL_INTEGER);
532  $sth->execute();
533 
534  my @row = $sth->fetchrow_array();
535  if(!$sth->rows() == 1) {
536  throw("non-existant seq_region [$sr_id]");
537  }
538 
539  my ($sr_name, $sr_length, $cs_id) = @row;
540  $sth->finish();
541 
542  $arr = [ $sr_id, $sr_name, $cs_id, $sr_length ];
543 
544  $self->{'sr_name_cache'}->{"$sr_name:$cs_id"} = $arr;
545  $self->{'sr_id_cache'}->{"$sr_id"} = $arr;
546 
547  return $sr_name;
548 }
549 
550 
551 =head2 register_component
552 
553  Arg [1] : Bio::EnsEMBL::AssemblyMapper $asm_mapper
554  A valid AssemblyMapper object
555  Arg [2] : integer $cmp_seq_region
556  The dbID of the seq_region to be registered
557  Description: Declares a component region to the AssemblyMapper.
558  This extracts the relevant data from the assembly
559  table and stores it in Mapper internal to the $asm_mapper.
560  It therefore must be called before any mapping is
561  attempted on that region. Otherwise only gaps will
562  be returned. Note that the AssemblyMapper automatically
563  calls this method when the need arises.
564  Returntype : none
565  Exceptions : throw if the seq_region to be registered does not exist
566  or if it associated with multiple assembled pieces (bad data
567  in assembly table)
569  Status : Stable
570 
571 =cut
572 
573 sub register_component {
574  my $self = shift;
575  my $asm_mapper = shift;
576  my $cmp_seq_region = shift;
577 
578  if(!ref($asm_mapper) || !$asm_mapper->isa('Bio::EnsEMBL::AssemblyMapper')) {
579  throw("Bio::EnsEMBL::AssemblyMapper argument expected");
580  }
581 
582  if(!defined($cmp_seq_region)) {
583  throw("cmp_seq_region argument expected");
584  }
585 
586  my $cmp_cs_id = $asm_mapper->component_CoordSystem()->dbID();
587  my $asm_cs_id = $asm_mapper->assembled_CoordSystem()->dbID();
588 
589  #do nothing if this region is already registered or special case
590  return if($asm_mapper->have_registered_component($cmp_seq_region)
591  and !defined($self->{'multi_seq_ids'}->{$cmp_seq_region}));
592 
593 # my $cmp_seq_region_id =
594 # $self->_seq_region_name_to_id($cmp_seq_region, $cmp_cs_id);
595 
596  # Determine what part of the assembled region this component region makes up
597 
598  my $q = qq{
599  SELECT
600  asm.asm_start,
601  asm.asm_end,
602  asm.asm_seq_region_id,
603  sr.name,
604  sr.length
605  FROM
606  assembly asm, seq_region sr
607  WHERE
608  asm.cmp_seq_region_id = ? AND
609  asm.asm_seq_region_id = sr.seq_region_id AND
610  sr.coord_system_id = ?
611  };
612 
613  my $sth = $self->prepare($q);
614  $sth->bind_param(1,$cmp_seq_region,SQL_INTEGER);
615  $sth->bind_param(2,$asm_cs_id,SQL_INTEGER);
616  $sth->execute();
617 
618  my @rows = $sth->fetchrow_array();
619 
620  if($sth->rows() == 0) {
621  #this component is not used in the assembled part i.e. gap
622  $asm_mapper->register_component($cmp_seq_region);
623  $sth->finish();
624  return;
625  }
626 
627  #we do not currently support components mapping to multiple assembled
628  # make sure that you've got the correct mapping in the meta-table :
629  # chromosome:EquCab2#contig ( use'#' for multiple mappings )
630  # chromosome:EquCab2|contig ( use '|' delimiter for 1-1 mappings )
631  #
632  my @more = $sth->fetchrow_array();
633  if($sth->rows() != 1) {
634  $sth->finish();
635  throw("Multiple assembled regions for single " .
636  "component region cmp_seq_region_id=[$cmp_seq_region]\n".
637  "Remember that multiple mappings use the \#-operaator".
638  " in the meta-table (i.e. chromosome:EquCab2\#contig\n");
639  }
640 
641  my ($asm_start, $asm_end, $asm_seq_region_id,
642  $asm_seq_region, $asm_seq_region_length) = @rows;
643 
644  my $arr = [ $asm_seq_region_id, $asm_seq_region,
645  $asm_cs_id, $asm_seq_region_length ];
646 
647  $self->{'sr_name_cache'}->{"$asm_seq_region:$asm_cs_id"} = $arr;
648  $self->{'sr_id_cache'}->{"$asm_seq_region_id"} = $arr;
649 
650  $sth->finish();
651 
652  # Register the corresponding assembled region. This allows a us to
653  # register things in assembled chunks which allows us to:
654  # (1) Keep track of what assembled regions are registered
655  # (2) Use locality of reference (if they want something in same general
656  # region it will already be registered).
657 
658  $self->register_assembled($asm_mapper,$asm_seq_region_id,$asm_start,$asm_end);
659 }
660 
661 
662 
663 =head2 register_chained
664 
665  Arg [1] : Bio::EnsEMBL::ChainedAssemblyMapper $casm_mapper
666  The chained assembly mapper to register regions on
667  Arg [2] : string $from ('first' or 'last')
668  The direction we are registering from, and the name of the
669  internal mapper.
670  Arg [3] : string $seq_region_name
671  The name of the seqregion we are registering on
672  Arg [4] : listref $ranges
673  A list of ranges to register (in [$start,$end] tuples).
674  Arg [5] : (optional) $to_slice
675  Only register those on this Slice.
676  Description: Registers a set of ranges on a chained assembly mapper.
677  This function is at the heart of the chained mapping process.
678  It retrieves information from the assembly table and
679  dynamically constructs the mappings between two coordinate
680  systems which are 2 mapping steps apart. It does this by using
681  two internal mappers to load up a third mapper which is
682  actually used by the ChainedAssemblyMapper to perform the
683  mapping.
684 
685  This method must be called before any mapping is
686  attempted on regions of interest, otherwise only gaps will
687  be returned. Note that the ChainedAssemblyMapper automatically
688  calls this method when the need arises.
689  Returntype : none
690  Exceptions : throw if the seq_region to be registered does not exist
691  or if it associated with multiple assembled pieces (bad data
692  in assembly table)
693 
694  throw if the mapping between the coordinate systems cannot
695  be performed in two steps, which means there is an internal
696  error in the data in the meta table or in the code that creates
697  the mapping paths.
699  Status : Stable
700 
701 =cut
702 
703 sub register_chained {
704  my $self = shift;
705  my $casm_mapper = shift;
706  my $from = shift;
707  my $seq_region_id = shift;
708  my $ranges = shift;
709  my $to_slice = shift;
710 
711  my $to_seq_region_id;
712  if(defined($to_slice)){
713  if($casm_mapper->first_CoordSystem()->equals($casm_mapper->last_CoordSystem())){
714  return $self->_register_chained_special($casm_mapper, $from, $seq_region_id, $ranges, $to_slice);
715  }
716  $to_seq_region_id = $to_slice->get_seq_region_id();
717  if(!defined($to_seq_region_id)){
718  die "Could not get seq_region_id for to_slice".$to_slice->seq_region_name."\n";
719  }
720  }
721 
722  my ($start_name, $start_mid_mapper, $start_cs, $start_registry,
723  $end_name, $end_mid_mapper, $end_cs, $end_registry);
724 
725  if($from eq 'first') {
726  $start_name = 'first';
727  $start_mid_mapper = $casm_mapper->first_middle_mapper();
728  $start_cs = $casm_mapper->first_CoordSystem();
729  $start_registry = $casm_mapper->first_registry();
730  $end_mid_mapper = $casm_mapper->last_middle_mapper();
731  $end_cs = $casm_mapper->last_CoordSystem();
732  $end_registry = $casm_mapper->last_registry();
733  $end_name = 'last';
734  } elsif($from eq 'last') {
735  $start_name = 'last';
736  $start_mid_mapper = $casm_mapper->last_middle_mapper();
737  $start_cs = $casm_mapper->last_CoordSystem();
738  $start_registry = $casm_mapper->last_registry();
739  $end_mid_mapper = $casm_mapper->first_middle_mapper();
740  $end_cs = $casm_mapper->first_CoordSystem();
741  $end_registry = $casm_mapper->first_registry();
742  $end_name = 'first';
743  } else {
744  throw("Invalid from argument: [$from], must be 'first' or 'last'");
745  }
746 
747  my $combined_mapper = $casm_mapper->first_last_mapper();
748  my $mid_cs = $casm_mapper->middle_CoordSystem();
749  my $mid_name = 'middle';
750  my $csa = $self->db->get_CoordSystemAdaptor();
751 
752  # Check for the simple case where the ChainedMapper is short
753  if( ! defined $mid_cs ) {
754  $start_mid_mapper = $combined_mapper;
755  }
756 
757 
758  ##############
759  # obtain the first half of the mappings and load them into the start mapper
760  #
761 
762  #ascertain which is component and which is actually assembled coord system
763  my @path;
764 
765  # check for the simple case, where the ChainedMapper is short
766  if( defined $mid_cs ) {
767  @path = @{$csa->get_mapping_path($start_cs, $mid_cs)};
768  } else {
769  @path = @{$csa->get_mapping_path( $start_cs, $end_cs )};
770  }
771 
772  if(@path != 2 && defined( $path[1] )) {
773  my $path = join(',', map({$_->name .' '. $_->version} @path));
774  my $len = scalar(@path) - 1;
775  throw("Unexpected mapping path between start and intermediate " .
776  "coord systems (". $start_cs->name . " " . $start_cs->version .
777  " and " . $mid_cs->name . " " . $mid_cs->version . ")." .
778  "\nExpected path length 1, got $len. " .
779  "(path=$path)");
780  }
781 
782  my $sth;
783  my ($asm_cs,$cmp_cs);
784  $asm_cs = $path[0];
785  $cmp_cs = $path[-1];
786 
787  #the SQL varies depending on whether we are coming from assembled or
788  #component coordinate system
789 
790 my $asm2cmp = (<<ASMCMP);
791  SELECT
792  asm.cmp_start,
793  asm.cmp_end,
794  asm.cmp_seq_region_id,
795  sr.name,
796  sr.length,
797  asm.ori,
798  asm.asm_start,
799  asm.asm_end
800  FROM
801  assembly asm, seq_region sr
802  WHERE
803  asm.asm_seq_region_id = ? AND
804  ? <= asm.asm_end AND
805  ? >= asm.asm_start AND
806  asm.cmp_seq_region_id = sr.seq_region_id AND
807  sr.coord_system_id = ?
808 ASMCMP
809 
810 
811 my $cmp2asm = (<<CMPASM);
812  SELECT
813  asm.asm_start,
814  asm.asm_end,
815  asm.asm_seq_region_id,
816  sr.name,
817  sr.length,
818  asm.ori,
819  asm.cmp_start,
820  asm.cmp_end
821  FROM
822  assembly asm, seq_region sr
823  WHERE
824  asm.cmp_seq_region_id = ? AND
825  ? <= asm.cmp_end AND
826  ? >= asm.cmp_start AND
827  asm.asm_seq_region_id = sr.seq_region_id AND
828  sr.coord_system_id = ?
829 CMPASM
830 
831  my $asm2cmp_sth;
832  my $cmp2asm_sth;
833  if(defined($to_slice)){
834  my $to_cs = $to_slice->coord_system;
835  if($asm_cs->equals($to_cs)){
836  $asm2cmp_sth = $self->prepare($asm2cmp);
837  $cmp2asm_sth = $self->prepare($cmp2asm." AND asm.asm_seq_region_id = $to_seq_region_id");
838  }
839  elsif($cmp_cs->equals($to_cs)){
840  $asm2cmp_sth = $self->prepare($asm2cmp." AND asm.cmp_seq_region_id = $to_seq_region_id");
841  $cmp2asm_sth = $self->prepare($cmp2asm);
842  }
843  else{
844  $asm2cmp_sth = $self->prepare($asm2cmp);
845  $cmp2asm_sth = $self->prepare($cmp2asm);
846  }
847  }
848  else{
849  $asm2cmp_sth = $self->prepare($asm2cmp);
850  $cmp2asm_sth = $self->prepare($cmp2asm);
851  }
852 
853 
854 
855  $sth = ($asm_cs->equals($start_cs)) ? $asm2cmp_sth : $cmp2asm_sth;
856 
857  my $mid_cs_id;
858 
859  # check for the simple case where the ChainedMapper is short
860  if( defined $mid_cs ) {
861  $mid_cs_id = $mid_cs->dbID();
862  } else {
863  $mid_cs_id = $end_cs->dbID();
864  }
865 
866  my @mid_ranges;
867  my @start_ranges;
868 
869  #need to perform the query for each unregistered range
870  foreach my $range (@$ranges) {
871  my ($start, $end) = @$range;
872  $sth->bind_param(1,$seq_region_id,SQL_INTEGER);
873  $sth->bind_param(2,$start,SQL_INTEGER);
874  $sth->bind_param(3,$end,SQL_INTEGER);
875  $sth->bind_param(4,$mid_cs_id,SQL_INTEGER);
876  $sth->execute();
877 
878  #load the start <-> mid mapper with the results and record the mid cs
879  #ranges we just added to the mapper
880 
881  my ($mid_start, $mid_end, $mid_seq_region_id, $mid_seq_region, $mid_length,
882  $ori, $start_start, $start_end);
883 
884  $sth->bind_columns(\$mid_start, \$mid_end, \$mid_seq_region_id,
885  \$mid_seq_region, \$mid_length, \$ori, \$start_start,
886  \$start_end);
887 
888  while($sth->fetch()) {
889 
890  if( defined $mid_cs ) {
891  $start_mid_mapper->add_map_coordinates
892  (
893  $seq_region_id,$start_start, $start_end, $ori,
894  $mid_seq_region_id, $mid_start, $mid_end
895  );
896  } else {
897  if( $from eq "first" ) {
898  $combined_mapper->add_map_coordinates
899  (
900  $seq_region_id,$start_start, $start_end, $ori,
901  $mid_seq_region_id, $mid_start, $mid_end
902  );
903  } else {
904  $combined_mapper->add_map_coordinates
905  (
906  $mid_seq_region_id, $mid_start, $mid_end, $ori,
907  $seq_region_id,$start_start, $start_end
908  );
909  }
910  }
911 
912  #update sr_name cache
913  my $arr = [ $mid_seq_region_id, $mid_seq_region,
914  $mid_cs_id, $mid_length ];
915 
916  $self->{'sr_name_cache'}->{"$mid_seq_region:$mid_cs_id"} = $arr;
917  $self->{'sr_id_cache'}->{"$mid_seq_region_id"} = $arr;
918 
919  push @mid_ranges,[$mid_seq_region_id,$mid_seq_region,
920  $mid_start,$mid_end];
921  push @start_ranges, [ $seq_region_id, $start_start, $start_end ];
922 
923  #the region that we actually register may actually be larger or smaller
924  #than the region that we wanted to register.
925  #register the intersection of the region so we do not end up doing
926  #extra work later
927 
928  if($start_start < $start || $start_end > $end) {
929  $start_registry->check_and_register($seq_region_id,$start_start,
930  $start_end);
931  }
932  }
933  $sth->finish();
934  }
935 
936  # in the one step case, we load the mid ranges in the
937  # last_registry and we are done
938  if( ! defined $mid_cs ) {
939  for my $range ( @mid_ranges ) {
940  $end_registry->check_and_register( $range->[0], $range->[2],
941  $range->[3] );
942  }
943 
944  # and thats it for the simple case ...
945  return;
946  }
947 
948 
949  ###########
950  # now the second half of the mapping
951  # perform another query and load the mid <-> end mapper using the mid cs
952  # ranges
953  #
954 
955  #ascertain which is component and which is actually assembled coord system
956  @path = @{$csa->get_mapping_path($mid_cs, $end_cs)};
957  if(@path == 2 || ( @path == 3 && !defined $path[1])) {
958 
959  $asm_cs = $path[0];
960  $cmp_cs = $path[-1];
961  } else {
962  my $path = join(',', map({$_->name .' '. $_->version} @path));
963  my $len = scalar(@path)-1;
964  throw("Unexpected mapping path between intermediate and last" .
965  "coord systems (". $mid_cs->name . " " . $mid_cs->version .
966  " and " . $end_cs->name . " " . $end_cs->version . ")." .
967  "\nExpected path length 1, got $len. " .
968  "(path=$path)");
969  }
970 
971  if(defined($to_slice)){
972  my $to_cs = $to_slice->coord_system;
973  if($asm_cs->equals($to_cs)){
974  $asm2cmp_sth = $self->prepare($asm2cmp);
975  $cmp2asm_sth = $self->prepare($cmp2asm." AND asm.asm_seq_region_id = $to_seq_region_id");
976  }
977  elsif($cmp_cs->equals($to_cs)){
978  $asm2cmp_sth = $self->prepare($asm2cmp." AND asm.cmp_seq_region_id = $to_seq_region_id");
979  $cmp2asm_sth = $self->prepare($cmp2asm);
980  }
981  else{
982  $asm2cmp_sth = $self->prepare($asm2cmp);
983  $cmp2asm_sth = $self->prepare($cmp2asm);
984  }
985  }
986 
987  $sth = ($asm_cs->equals($mid_cs)) ? $asm2cmp_sth : $cmp2asm_sth;
988 
989  my $end_cs_id = $end_cs->dbID();
990  foreach my $mid_range (@mid_ranges) {
991  my ($mid_seq_region_id, $mid_seq_region,$start, $end) = @$mid_range;
992  $sth->bind_param(1,$mid_seq_region_id,SQL_INTEGER);
993  $sth->bind_param(2,$start,SQL_INTEGER);
994  $sth->bind_param(3,$end,SQL_INTEGER);
995  $sth->bind_param(4,$end_cs_id,SQL_INTEGER);
996  $sth->execute();
997 
998  #load the end <-> mid mapper with the results and record the mid cs
999  #ranges we just added to the mapper
1000 
1001  my ($end_start, $end_end, $end_seq_region_id, $end_seq_region, $end_length,
1002  $ori, $mid_start, $mid_end);
1003 
1004  $sth->bind_columns(\$end_start, \$end_end, \$end_seq_region_id,
1005  \$end_seq_region, \$end_length, \$ori, \$mid_start,
1006  \$mid_end);
1007 
1008  while($sth->fetch()) {
1009  $end_mid_mapper->add_map_coordinates
1010  (
1011  $end_seq_region_id, $end_start, $end_end, $ori,
1012  $mid_seq_region_id, $mid_start, $mid_end
1013  );
1014 
1015  #update sr_name cache
1016  my $arr = [ $end_seq_region_id,$end_seq_region,$end_cs_id,$end_length ];
1017 
1018  $self->{'sr_name_cache'}->{"$end_seq_region:$end_cs_id"} = $arr;
1019  $self->{'sr_id_cache'}->{"$end_seq_region_id"} = $arr;
1020 
1021  #register this region on the end coord system
1022  $end_registry->check_and_register($end_seq_region_id, $end_start, $end_end);
1023  }
1024  $sth->finish();
1025  }
1026 
1027  #########
1028  # Now that both halves are loaded
1029  # Do stepwise mapping using both of the loaded mappers to load
1030  # the final start <-> end mapper
1031  #
1032 
1033  _build_combined_mapper(\@start_ranges, $start_mid_mapper, $end_mid_mapper,
1034  $combined_mapper, $start_name);
1035  #all done!
1036  return;
1037 }
1038 
1039 
1040 =head2 _register_chained_special
1041 
1042  Arg [1] : Bio::EnsEMBL::ChainedAssemblyMapper $casm_mapper
1043  The chained assembly mapper to register regions on
1044  Arg [2] : string $from ('first' or 'last')
1045  The direction we are registering from, and the name of the
1046  internal mapper.
1047  Arg [3] : string $seq_region_name
1048  The name of the seqregion we are registering on
1049  Arg [4] : listref $ranges
1050  A list of ranges to register (in [$start,$end] tuples).
1051  Arg [5] : (optional) $to_slice
1052  Only register those on this Slice.
1053  Description: Registers a set of ranges on a chained assembly mapper.
1054  This function is at the heart of the chained mapping process.
1055  It retrieves information from the assembly table and
1056  dynamically constructs the mappings between two coordinate
1057  systems which are 2 mapping steps apart. It does this by using
1058  two internal mappers to load up a third mapper which is
1059  actually used by the ChainedAssemblyMapper to perform the
1060  mapping.
1061 
1062  This method must be called before any mapping is
1063  attempted on regions of interest, otherwise only gaps will
1064  be returned. Note that the ChainedAssemblyMapper automatically
1065  calls this method when the need arises.
1066  Returntype : none
1067  Exceptions : throw if the seq_region to be registered does not exist
1068  or if it associated with multiple assembled pieces (bad data
1069  in assembly table)
1070 
1071  throw if the mapping between the coordinate systems cannot
1072  be performed in two steps, which means there is an internal
1073  error in the data in the meta table or in the code that creates
1074  the mapping paths.
1076  Status : Stable
1077 
1078 =cut
1079 
1080 sub _register_chained_special {
1081  my $self = shift;
1082  my $casm_mapper = shift;
1083  my $from = shift;
1084  my $seq_region_id = shift;
1085  my $ranges = shift;
1086  my $to_slice = shift;
1087  my $found = 0;
1088 
1089  my $sth = $self->prepare("SELECT
1090  asm.cmp_start,
1091  asm.cmp_end,
1092  asm.cmp_seq_region_id,
1093  sr.name,
1094  sr.length,
1095  asm.ori,
1096  asm.asm_start,
1097  asm.asm_end
1098  FROM
1099  assembly asm, seq_region sr
1100  WHERE
1101  asm.asm_seq_region_id = ? AND
1102  ? <= asm.asm_end AND
1103  ? >= asm.asm_start AND
1104  asm.cmp_seq_region_id = sr.seq_region_id AND
1105  sr.coord_system_id = ? AND
1106  asm.cmp_seq_region_id = ?");
1107 
1108 
1109  my ($start_name, $start_mid_mapper, $start_cs, $start_registry,
1110  $end_name, $end_mid_mapper, $end_cs, $end_registry);
1111 
1112  if($from eq 'first') {
1113  $start_name = 'first';
1114  $start_mid_mapper = $casm_mapper->first_middle_mapper();
1115  $start_cs = $casm_mapper->first_CoordSystem();
1116  $start_registry = $casm_mapper->first_registry();
1117  $end_mid_mapper = $casm_mapper->last_middle_mapper();
1118  $end_cs = $casm_mapper->last_CoordSystem();
1119  $end_registry = $casm_mapper->last_registry();
1120  $end_name = 'last';
1121  } elsif($from eq 'last') {
1122  $start_name = 'last';
1123  $start_mid_mapper = $casm_mapper->last_middle_mapper();
1124  $start_cs = $casm_mapper->last_CoordSystem();
1125  $start_registry = $casm_mapper->last_registry();
1126  $end_mid_mapper = $casm_mapper->first_middle_mapper();
1127  $end_cs = $casm_mapper->first_CoordSystem();
1128  $end_registry = $casm_mapper->first_registry();
1129  $end_name = 'first';
1130  } else {
1131  throw("Invalid from argument: [$from], must be 'first' or 'last'");
1132  }
1133 
1134  my $combined_mapper = $casm_mapper->first_last_mapper();
1135  my $mid_cs = $casm_mapper->middle_CoordSystem();
1136  my $mid_name = 'middle';
1137  my $csa = $self->db->get_CoordSystemAdaptor();
1138 
1139  # Check for the simple case where the ChainedMapper is short
1140  if( ! defined $mid_cs ) {
1141  $start_mid_mapper = $combined_mapper;
1142  }
1143 
1144 
1145  my @path;
1146  if( defined $mid_cs ) {
1147  @path = @{$csa->get_mapping_path($start_cs, $mid_cs)};
1148  } else {
1149  @path = @{$csa->get_mapping_path( $start_cs, $end_cs )};
1150  }
1151  if( ! defined $mid_cs ) {
1152  $start_mid_mapper = $combined_mapper;
1153  }
1154 
1155  if(@path != 2 && defined( $path[1] )) {
1156  my $path = join(',', map({$_->name .' '. $_->version} @path));
1157  my $len = scalar(@path) - 1;
1158  throw("Unexpected mapping path between start and intermediate " .
1159  "coord systems (". $start_cs->name . " " . $start_cs->version .
1160  " and " . $mid_cs->name . " " . $mid_cs->version . ")." .
1161  "\nExpected path length 1, got $len. " .
1162  "(path=$path)");
1163  }
1164 
1165  my ($asm_cs,$cmp_cs);
1166  $asm_cs = $path[0];
1167  $cmp_cs = $path[-1];
1168 
1169  $combined_mapper = $casm_mapper->first_last_mapper();
1170  $mid_cs = $casm_mapper->middle_CoordSystem();
1171  $mid_name = 'middle';
1172  $csa = $self->db->get_CoordSystemAdaptor();
1173 
1174  my $mid_cs_id;
1175 
1176  # Check for the simple case where the ChainedMapper is short
1177  if ( !defined $mid_cs ) {
1178  $start_mid_mapper = $combined_mapper;
1179  } else {
1180  $mid_cs_id = $mid_cs->dbID();
1181  }
1182 
1183  my @mid_ranges;
1184  my @start_ranges;
1185 
1186  my $to_cs = $to_slice->coord_system;
1187  foreach my $direction (1, 0){
1188  my $id1;
1189  my $id2;
1190  if($direction){
1191  $id1 = $seq_region_id;
1192  $id2 = $to_slice->get_seq_region_id();
1193  }
1194  else{
1195  $id2 = $seq_region_id;
1196  $id1 = $to_slice->get_seq_region_id();
1197  }
1198 
1199  foreach my $range (@$ranges) {
1200  my ($start, $end) = @$range;
1201  $sth->bind_param(1,$id1,SQL_INTEGER);
1202  $sth->bind_param(2,$start,SQL_INTEGER);
1203  $sth->bind_param(3,$end,SQL_INTEGER);
1204  $sth->bind_param(4,$to_cs->dbID,SQL_INTEGER);
1205  $sth->bind_param(5,$id2,SQL_INTEGER);
1206  $sth->execute();
1207 
1208  my ($mid_start, $mid_end, $mid_seq_region_id, $mid_seq_region, $mid_length,
1209  $ori, $start_start, $start_end);
1210 
1211  $sth->bind_columns(\$mid_start, \$mid_end, \$mid_seq_region_id,
1212  \$mid_seq_region, \$mid_length, \$ori, \$start_start,
1213  \$start_end);
1214 
1215  while($sth->fetch()) {
1216  $found = 1;
1217 
1218  if( defined $mid_cs ) {
1219  $start_mid_mapper->add_map_coordinates
1220  (
1221  $id1,$start_start, $start_end, $ori,
1222  $mid_seq_region_id, $mid_start, $mid_end
1223  );
1224  } else {
1225  if( $from eq "first") {
1226  if($direction){
1227  $combined_mapper->add_map_coordinates
1228  (
1229  $id1,$start_start, $start_end, $ori,
1230  $mid_seq_region_id, $mid_start, $mid_end
1231  );
1232  }
1233  else{
1234  $combined_mapper->add_map_coordinates
1235  (
1236  $mid_seq_region_id, $mid_start, $mid_end, $ori,
1237  $id1,$start_start, $start_end
1238  );
1239  }
1240  } else {
1241  if($direction){
1242  $combined_mapper->add_map_coordinates
1243  (
1244  $mid_seq_region_id, $mid_start, $mid_end, $ori,
1245  $id1,$start_start, $start_end
1246  );
1247  }
1248  else{
1249  $combined_mapper->add_map_coordinates
1250  (
1251  $id1,$start_start, $start_end, $ori,
1252  $mid_seq_region_id, $mid_start, $mid_end
1253  );
1254  }
1255  }
1256  }
1257 
1258  #update sr_name cache
1259  my $arr = [ $mid_seq_region_id, $mid_seq_region,
1260  $mid_cs_id, $mid_length ];
1261 
1262  $self->{'sr_name_cache'}->{"$mid_seq_region:$mid_cs_id"} = $arr;
1263  $self->{'sr_id_cache'}->{"$mid_seq_region_id"} = $arr;
1264 
1265  push @mid_ranges,[$mid_seq_region_id,$mid_seq_region,
1266  $mid_start,$mid_end];
1267  push @start_ranges, [ $id1, $start_start, $start_end ];
1268 
1269  #the region that we actually register may actually be larger or smaller
1270  #than the region that we wanted to register.
1271  #register the intersection of the region so we do not end up doing
1272  #extra work later
1273 
1274  if($start_start < $start || $start_end > $end) {
1275  $start_registry->check_and_register($id1,$start_start,
1276  $start_end);
1277  }
1278  }
1279  $sth->finish();
1280  }
1281  if($found){
1282  if( ! defined $mid_cs ) {
1283  for my $range ( @mid_ranges ) {
1284  $end_registry->check_and_register( $range->[0], $range->[2],
1285  $range->[3] );
1286  }
1287 
1288  # and thats it for the simple case ...
1289  return;
1290  }
1291  }
1292  }
1293 }
1294 
1295 
1296 =head2 register_all
1297 
1298  Arg [1] : Bio::EnsEMBL::AssemblyMapper $mapper
1299  Example : $mapper = $asm_mapper_adaptor->fetch_by_CoordSystems($cs1,$cs2);
1300 
1301  # make cache large enough to hold all of the mappings
1302  $mapper->max_pair_count(10e6);
1303  $asm_mapper_adaptor->register_all($mapper);
1304 
1305  # perform mappings as normal
1306  $mapper->map($slice->seq_region_name(), $sr_start, $sr_end,
1307  $sr_strand, $cs1);
1308  ...
1309  Description: This function registers the entire set of mappings between
1310  two coordinate systems in an assembly mapper.
1311  This will use a lot of memory but will be much more efficient
1312  when doing a lot of mapping which is spread over the entire
1313  genome.
1314  Returntype : none
1315  Exceptions : none
1316  Caller : specialised prograhsm
1317  Status : Stable
1318 
1319 =cut
1320 
1321 sub register_all {
1322  my $self = shift;
1323  my $mapper = shift;
1324 
1325  my $asm_cs_id = $mapper->assembled_CoordSystem()->dbID();
1326  my $cmp_cs_id = $mapper->component_CoordSystem()->dbID();
1327 
1328  # retrieve every relevant assembled/component pair from the assembly table
1329 
1330  my $q = qq{
1331  SELECT
1332  asm.cmp_start,
1333  asm.cmp_end,
1334  asm.cmp_seq_region_id,
1335  cmp_sr.name,
1336  cmp_sr.length,
1337  asm.ori,
1338  asm.asm_start,
1339  asm.asm_end,
1340  asm.asm_seq_region_id,
1341  asm_sr.name,
1342  asm_sr.length
1343  FROM
1344  assembly asm, seq_region asm_sr, seq_region cmp_sr
1345  WHERE
1346  asm.cmp_seq_region_id = cmp_sr.seq_region_id AND
1347  asm.asm_seq_region_id = asm_sr.seq_region_id AND
1348  cmp_sr.coord_system_id = ? AND
1349  asm_sr.coord_system_id = ?
1350  };
1351 
1352  my $sth = $self->prepare($q);
1353 
1354  $sth->bind_param(1,$cmp_cs_id,SQL_INTEGER);
1355  $sth->bind_param(2,$asm_cs_id,SQL_INTEGER);
1356  $sth->execute();
1357 
1358  # load the mapper with the assembly information
1359 
1360  my ($cmp_start, $cmp_end, $cmp_seq_region_id, $cmp_seq_region, $cmp_length,
1361  $ori,
1362  $asm_start, $asm_end, $asm_seq_region_id, $asm_seq_region, $asm_length);
1363 
1364  $sth->bind_columns(\$cmp_start, \$cmp_end, \$cmp_seq_region_id,
1365  \$cmp_seq_region, \$cmp_length, \$ori,
1366  \$asm_start, \$asm_end, \$asm_seq_region_id,
1367  \$asm_seq_region, \$asm_length);
1368 
1369  my %asm_registered;
1370 
1371  while($sth->fetch()) {
1372  $mapper->register_component($cmp_seq_region_id);
1373  $mapper->mapper->add_map_coordinates(
1374  $asm_seq_region_id, $asm_start, $asm_end,
1375  $ori,
1376  $cmp_seq_region_id, $cmp_start, $cmp_end);
1377 
1378  my $arr = [$cmp_seq_region_id, $cmp_seq_region, $cmp_cs_id, $cmp_length];
1379 
1380  $self->{'sr_name_cache'}->{"$cmp_seq_region:$cmp_cs_id"} = $arr;
1381  $self->{'sr_id_cache'}->{"$cmp_seq_region_id"} = $arr;
1382 
1383  # only register each asm seq_region once since it requires some work
1384  if(!$asm_registered{$asm_seq_region_id}) {
1385  $asm_registered{$asm_seq_region_id} = 1;
1386 
1387  # register all chunks from start of seq region to end
1388  my $end_chunk = $asm_length >> $CHUNKFACTOR;
1389  for(my $i = 0; $i <= $end_chunk; $i++) {
1390  $mapper->register_assembled($asm_seq_region_id, $i);
1391  }
1392 
1393  $arr = [$asm_seq_region_id, $asm_seq_region, $asm_cs_id, $asm_length];
1394 
1395  $self->{'sr_name_cache'}->{"$asm_seq_region:$asm_cs_id"} = $arr;
1396  $self->{'sr_id_cache'}->{"$asm_seq_region_id"} = $arr;
1397  }
1398  }
1399 
1400  $sth->finish();
1401 
1402  return;
1403 }
1404 
1405 
1406 
1407 =head2 register_all_chained
1408 
1409  Arg [1] : Bio::EnsEMBL::ChainedAssemblyMapper $casm_mapper
1410  Example : $mapper = $asm_mapper_adaptor->fetch_by_CoordSystems($cs1,$cs2);
1411 
1412  # make the cache large enough to hold all of the mappings
1413  $mapper->max_pair_count(10e6);
1414  # load all of the mapping data
1415  $asm_mapper_adaptor->register_all_chained($mapper);
1416 
1417  # perform mappings as normal
1418  $mapper->map($slice->seq_region_name(), $sr_start, $sr_end,
1419  $sr_strand, $cs1);
1420  ...
1421  Description: This function registers the entire set of mappings between
1422  two coordinate systems in a chained mapper. This will use a lot
1423  of memory but will be much more efficient when doing a lot of
1424  mapping which is spread over the entire genome.
1425  Returntype : none
1426  Exceptions : throw if mapper is between coord systems with unexpected
1427  mapping paths
1428  Caller : specialised programs doing a lot of genome-wide mapping
1429  Status : Stable
1430 
1431 =cut
1432 
1433 sub register_all_chained {
1434  my $self = shift;
1435  my $casm_mapper = shift;
1436 
1437  my $first_cs = $casm_mapper->first_CoordSystem();
1438  my $mid_cs = $casm_mapper->middle_CoordSystem();
1439  my $last_cs = $casm_mapper->last_CoordSystem();
1440 
1441  my $start_mid_mapper = $casm_mapper->first_middle_mapper();
1442  my $end_mid_mapper = $casm_mapper->last_middle_mapper();
1443  my $combined_mapper = $casm_mapper->first_last_mapper();
1444 
1445  my @ranges;
1446 
1447  my $sth = $self->prepare(
1448  'SELECT
1449  asm.cmp_start,
1450  asm.cmp_end,
1451  asm.cmp_seq_region_id,
1452  sr_cmp.name,
1453  sr_cmp.length,
1454  asm.ori,
1455  asm.asm_start,
1456  asm.asm_end,
1457  asm.asm_seq_region_id,
1458  sr_asm.name,
1459  sr_asm.length
1460  FROM
1461  assembly asm, seq_region sr_asm, seq_region sr_cmp
1462  WHERE
1463  sr_asm.seq_region_id = asm.asm_seq_region_id AND
1464  sr_cmp.seq_region_id = asm.cmp_seq_region_id AND
1465  sr_asm.coord_system_id = ? AND
1466  sr_cmp.coord_system_id = ?');
1467 
1468  my $csa = $self->db()->get_CoordSystemAdaptor();
1469 
1470  my @path;
1471 
1472  if ( !defined $mid_cs ) {
1473  @path = @{ $csa->get_mapping_path( $first_cs, $last_cs ) };
1474  if ( !defined( $path[1] ) ) {
1475  splice( @path, 1, 1 );
1476  }
1477  } else {
1478  @path = @{ $csa->get_mapping_path( $first_cs, $mid_cs ) };
1479  # fix for when we have something like supercontig#contig#chromosome
1480  if ( !defined( $path[1] ) ) {
1481  splice( @path, 1, 1 );
1482  }
1483  }
1484 
1485  if ( @path != 2 ) {
1486  my $path =
1487  join( ',', map( { $_->name . ' ' . $_->version } @path ) );
1488  my $len = scalar(@path) - 1;
1489  throw( "Unexpected mapping path between start and intermediate "
1490  . "coord systems ("
1491  . $first_cs->name . " "
1492  . $first_cs->version . " and "
1493  . $mid_cs->name . " "
1494  . $mid_cs->version . ")."
1495  . "\nExpected path length 1, got $len. "
1496  . "(path=$path)" );
1497  }
1498 
1499  my ($asm_cs,$cmp_cs) = @path;
1500 
1501  $sth->{mysql_use_result} = 1;
1502  $sth->bind_param(1,$asm_cs->dbID,SQL_INTEGER);
1503  $sth->bind_param(2,$cmp_cs->dbID,SQL_INTEGER);
1504  $sth->execute();
1505 
1506 
1507  my ($mid_start, $mid_end, $mid_seq_region_id, $mid_seq_region, $mid_length,
1508  $ori, $start_start, $start_end, $start_seq_region_id, $start_seq_region,
1509  $start_length);
1510 
1511  if($asm_cs->equals($first_cs)) {
1512  $sth->bind_columns(\$mid_start, \$mid_end, \$mid_seq_region_id,
1513  \$mid_seq_region, \$mid_length, \$ori, \$start_start,
1514  \$start_end, \$start_seq_region_id, \$start_seq_region,
1515  \$start_length);
1516  } else {
1517  $sth->bind_columns(\$start_start, \$start_end, \$start_seq_region_id,
1518  \$start_seq_region, \$start_length, \$ori,
1519  \$mid_start, \$mid_end, \$mid_seq_region_id,
1520  \$mid_seq_region, \$mid_length);
1521 
1522  }
1523 
1524  my ( $mid_cs_id, $start_cs_id, $registry, $mapper );
1525  if ( !defined $mid_cs ) {
1526  $mid_cs_id = $last_cs->dbID();
1527  $start_cs_id = $first_cs->dbID();
1528  $mapper = $combined_mapper;
1529  } else {
1530  $mid_cs_id = $mid_cs->dbID();
1531  $start_cs_id = $first_cs->dbID();
1532  $mapper = $start_mid_mapper;
1533  }
1534 
1535  $registry = $casm_mapper->first_registry();
1536 
1537  while($sth->fetch()) {
1538  $mapper->add_map_coordinates
1539  (
1540  $start_seq_region_id, $start_start, $start_end, $ori,
1541  $mid_seq_region_id, $mid_start, $mid_end
1542  );
1543  push( @ranges, [$start_seq_region_id, $start_start, $start_end ] );
1544 
1545  $registry->check_and_register( $start_seq_region_id, 1, $start_length );
1546  if( ! defined $mid_cs ) {
1547  $casm_mapper->last_registry()->check_and_register
1548  ( $mid_seq_region_id, $mid_start, $mid_end );
1549  }
1550 
1551  my $arr = [ $mid_seq_region_id, $mid_seq_region,
1552  $mid_cs_id, $mid_length ];
1553 
1554  $self->{'sr_name_cache'}->{"$mid_seq_region:$mid_cs_id"} = $arr;
1555  $self->{'sr_id_cache'}->{"$mid_seq_region_id"} = $arr;
1556 
1557  $arr = [ $start_seq_region_id, $start_seq_region,
1558  $start_cs_id, $start_length ];
1559 
1560  $self->{'sr_name_cache'}->{"$start_seq_region:$start_cs_id"} = $arr;
1561  $self->{'sr_id_cache'}->{"$start_seq_region_id"} = $arr;
1562  }
1563 
1564  if( ! defined $mid_cs ) {
1565  # thats it for the simple case
1566  return;
1567  }
1568 
1569 
1570  @path = @{ $csa->get_mapping_path( $last_cs, $mid_cs ) };
1571  if ( defined($mid_cs) ) {
1572  if ( !defined( $path[1] ) ) {
1573  splice( @path, 1, 1 );
1574  }
1575  }
1576 
1577  if ( @path != 2 ) {
1578  my $path =
1579  join( ',', map( { $_->name . ' ' . $_->version } @path ) );
1580  my $len = scalar(@path) - 1;
1581  throw( "Unexpected mapping path between intermediate and last "
1582  . "coord systems ("
1583  . $last_cs->name . " "
1584  . $last_cs->version . " and "
1585  . $mid_cs->name . " "
1586  . $mid_cs->version . ")."
1587  . "\nExpected path length 1, got $len. "
1588  . "(path=$path)" );
1589  }
1590 
1591  ($asm_cs,$cmp_cs) = @path;
1592 
1593  $sth->bind_param(1,$asm_cs->dbID,SQL_INTEGER);
1594  $sth->bind_param(2,$cmp_cs->dbID,SQL_INTEGER);
1595  $sth->execute();
1596 
1597 
1598  my ($end_start, $end_end, $end_seq_region_id, $end_seq_region,
1599  $end_length);
1600 
1601  if($asm_cs->equals($mid_cs)) {
1602  $sth->bind_columns(\$end_start, \$end_end, \$end_seq_region_id,
1603  \$end_seq_region, \$end_length, \$ori,
1604  \$mid_start, \$mid_end, \$mid_seq_region_id,
1605  \$mid_seq_region, \$mid_length);
1606  } else {
1607  $sth->bind_columns(\$mid_start, \$mid_end, \$mid_seq_region_id,
1608  \$mid_seq_region, \$mid_length, \$ori, \$end_start,
1609  \$end_end, \$end_seq_region_id, \$end_seq_region,
1610  \$end_length);
1611  }
1612 
1613  my $end_cs_id = $last_cs->dbID();
1614  $registry = $casm_mapper->last_registry();
1615 
1616  while($sth->fetch()) {
1617  $end_mid_mapper->add_map_coordinates
1618  (
1619  $end_seq_region_id, $end_start, $end_end, $ori,
1620  $mid_seq_region_id, $mid_start, $mid_end
1621  );
1622 
1623  $registry->check_and_register( $end_seq_region_id, 1, $end_length );
1624 
1625  my $arr = [ $end_seq_region_id, $end_seq_region,
1626  $end_cs_id, $end_length ];
1627  $self->{'sr_name_cache'}->{"$end_seq_region:$end_cs_id"} = $arr;
1628  $self->{'sr_id_cache'}->{"$end_seq_region_id"} = $arr;
1629  }
1630 
1631  _build_combined_mapper( \@ranges, $start_mid_mapper, $end_mid_mapper,
1632  $combined_mapper, "first" );
1633 
1634  return;
1635 }
1636 
1637 
1638 
1639 # after both halves of a chained mapper are loaded
1640 # this function maps all ranges in $ranges and loads the
1641 # results into the combined mapper
1642 sub _build_combined_mapper {
1643  my $ranges = shift;
1644  my $start_mid_mapper = shift;
1645  my $end_mid_mapper = shift;
1646  my $combined_mapper = shift;
1647  my $start_name = shift;
1648 
1649  my $mid_name = "middle";
1650 
1651  foreach my $range (@$ranges) {
1652  my ( $seq_region_id, $start, $end) = @$range;
1653 
1654  my $sum = 0;
1655 
1656  my @initial_coords = $start_mid_mapper->map_coordinates($seq_region_id,
1657  $start,$end,1,
1658  $start_name);
1659 
1660  foreach my $icoord (@initial_coords) {
1661  #skip gaps
1662  if($icoord->isa('Bio::EnsEMBL::Mapper::Gap')) {
1663  $sum += $icoord->length();
1664  next;
1665  }
1666 
1667 
1668  #feed the results of the first mapping into the second mapper
1669  my @final_coords =
1670  $end_mid_mapper->map_coordinates($icoord->id, $icoord->start,
1671  $icoord->end,
1672  $icoord->strand, $mid_name);
1673 
1674 
1675  foreach my $fcoord (@final_coords) {
1676  #load up the final mapper
1677 
1678  if($fcoord->isa('Bio::EnsEMBL::Mapper::Coordinate')) {
1679  my $total_start = $start + $sum;
1680  my $total_end = $total_start + $fcoord->length - 1;
1681  my $ori = $fcoord->strand();
1682 
1683  if($start_name eq 'first') { # add coords in consistant order
1684  $combined_mapper->add_map_coordinates(
1685  $seq_region_id, $total_start, $total_end, $ori,
1686  $fcoord->id(), $fcoord->start(), $fcoord->end());
1687  } else {
1688  $combined_mapper->add_map_coordinates(
1689  $fcoord->id(), $fcoord->start(), $fcoord->end(),$ori,
1690  $seq_region_id, $total_start, $total_end);
1691  }
1692 
1693  }
1694  $sum += $fcoord->length();
1695  }
1696  }
1697  }
1698  #all done!
1699 }
1700 
1701 
1702 =head2 seq_regions_to_ids
1703 
1704  Arg [1] : Bio::EnsEMBL::CoordSystem $coord_system
1705  Arg [2] : listref of strings $seq_regions
1706  Example : my @ids = @{$asma->seq_regions_to_ids($coord_sys, \@seq_regs)};
1707  Description: Converts a list of seq_region names to internal identifiers
1708  using the internal cache that has accumulated while registering
1709  regions for AssemblyMappers. If any requested regions are
1710  not found in the cache an attempt is made to retrieve them
1711  from the database.
1712  Returntype : listref of ints
1713  Exceptions : throw if a non-existant seqregion is provided
1714  Caller : general
1715  Status : Stable
1716 
1717 =cut
1718 
1719 sub seq_regions_to_ids {
1720  my $self = shift;
1721  my $coord_system = shift;
1722  my $seq_regions = shift;
1723 
1724  my $cs_id = $coord_system->dbID();
1725 
1726  my @out;
1727 
1728  foreach my $sr (@$seq_regions) {
1729  my $arr = $self->{'sr_name_cache'}->{"$sr:$cs_id"};
1730  if( $arr ) {
1731  push( @out, $arr->[0] );
1732  } else {
1733  push @out, $self->_seq_region_name_to_id($sr,$cs_id);
1734  }
1735  }
1736 
1737  return \@out;
1738 }
1739 
1740 
1741 =head2 seq_ids_to_regions
1742 
1743  Arg [1] : listref of seq_region ids
1744  Example : my @ids = @{$asma->ids_to_seq_regions(\@seq_ids)};
1745  Description: Converts a list of seq_region ids to seq region names
1746  using the internal cache that has accumulated while registering
1747  regions for AssemblyMappers. If any requested regions are
1748  not found in the cache an attempt is made to retrieve them
1749  from the database.
1750  Returntype : listref of strings
1751  Exceptions : throw if a non-existant seq_region_id is provided
1752  Caller : general
1753  Status : Stable
1754 
1755 =cut
1756 
1757 sub seq_ids_to_regions {
1758  my $self = shift;
1759  my $seq_region_ids = shift;
1760 
1761  my @out;
1762 
1763  foreach my $sr (@$seq_region_ids) {
1764  my $arr = $self->{'sr_id_cache'}->{"$sr"};
1765  if( $arr ) {
1766  push( @out, $arr->[1] );
1767  } else {
1768  push @out, $self->_seq_region_id_to_name($sr);
1769  }
1770  }
1771 
1772  return \@out;
1773 }
1774 
1775 =head2 delete_cache
1776 
1777  Description: Delete all the caches for the mappings/seq_regions
1778  Returntype : none
1779  Exceptions : none
1780  Caller : General
1781  Status : At risk
1782 
1783 =cut
1784 
1785 sub delete_cache{
1786  my ($self) = @_;
1787 
1788  %{$self->{'sr_name_cache'}} = ();
1789  %{$self->{'sr_id_cache'}} = ();
1790 
1791  foreach my $key (keys %{$self->{'_asm_mapper_cache'}}){
1792  $self->{'_asm_mapper_cache'}->{$key}->flush();
1793  }
1794  %{$self->{'_asm_mapper_cache'}} = ();
1795  return;
1796 }
1797 
1798 
1799 1;
Bio::EnsEMBL::Registry::get_adaptor
public Adaptor get_adaptor()
Bio::EnsEMBL::Storable::dbID
public Int dbID()
Bio::EnsEMBL::ChainedAssemblyMapper::max_pair_count
public Int max_pair_count()
EnsEMBL
Definition: Filter.pm:1
Bio::EnsEMBL::DBSQL::DBAdaptor
Definition: DBAdaptor.pm:40
Bio::EnsEMBL::AssemblyMapper::component_CoordSystem
public Bio::EnsEMBL::CoordSystem component_CoordSystem()
Bio::EnsEMBL::AssemblyMapper
Definition: AssemblyMapper.pm:49
map
public map()
AssemblyMapper
Definition: BlastzAligner.pm:7
Bio::EnsEMBL::AssemblyMapper::max_pair_count
public Int max_pair_count()
Bio::EnsEMBL::ChainedAssemblyMapper
Definition: ChainedAssemblyMapper.pm:54
Bio::EnsEMBL::CoordSystem
Definition: CoordSystem.pm:40
Bio::EnsEMBL::Registry
Definition: Registry.pm:113
Bio::EnsEMBL::Utils::SeqRegionCache
Definition: SeqRegionCache.pm:40
Bio::EnsEMBL::DBSQL::AssemblyMapperAdaptor
Definition: AssemblyMapperAdaptor.pm:58
Bio::EnsEMBL::DBSQL::DBAdaptor::get_SeqRegionCache
public Bio::EnsEMBL::Utils::SeqRegionCache get_SeqRegionCache()
Bio::EnsEMBL::TopLevelAssemblyMapper
Definition: TopLevelAssemblyMapper.pm:40
Bio::EnsEMBL::DBSQL::BaseAdaptor
Definition: BaseAdaptor.pm:71
Bio::EnsEMBL::AssemblyMapper::new
public Bio::EnsEMBL::DBSQL::AssemblyMapperAdaptor new()
Bio::EnsEMBL::Registry::load_registry_from_db
public Int load_registry_from_db()
Bio::EnsEMBL::AssemblyMapper::assembled_CoordSystem
public Bio::EnsEMBL::CoordSystem assembled_CoordSystem()
Bio::EnsEMBL::TopLevelAssemblyMapper::new
public Bio::EnsEMBL::DBSQL::TopLevelAssemblyMapper new()
Bio
Definition: AltAlleleGroup.pm:4
Bio::EnsEMBL::Utils::Exception
Definition: Exception.pm:68
Bio::EnsEMBL::ChainedAssemblyMapper::new
public Bio::EnsEMBL::DBSQL::AssemblyMapperAdaptor new()