ensembl-hive  2.7.0
Mapper.pm
Go to the documentation of this file.
1 =head1 LICENSE
2 
3 See the NOTICE file distributed with this work for additional information
4 regarding copyright ownership.
5 
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9 
10  http://www.apache.org/licenses/LICENSE-2.0
11 
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 
18 =cut
19 
20 
21 =head1 CONTACT
22 
23  Please email comments or questions to the public Ensembl
24  developers list at <http://lists.ensembl.org/mailman/listinfo/dev>.
25 
26  Questions may also be sent to the Ensembl help desk at
27  <http://www.ensembl.org/Help/Contact>.
28 
29 =cut
30 
31 =head1 NAME
32 
34 
35 =head1 SYNOPSIS
36 
37  $map = Bio::EnsEMBL::Mapper->new( 'rawcontig', 'chromosome' );
38 
39  # add a coodinate mapping - supply two pairs or coordinates
40  $map->add_map_coordinates(
41  $contig_id, $contig_start, $contig_end, $contig_ori,
42  $chr_name, chr_start, $chr_end
43  );
44 
45  # map from one coordinate system to another
46  my @coordlist =
47  $mapper->map_coordinates( 627012, 2, 5, -1, "rawcontig" );
48 
49 =head1 DESCRIPTION
50 
51 Generic mapper to provide coordinate transforms between two disjoint
52 coordinate systems. This mapper is intended to be 'context neutral' - in
53 that it does not contain any code relating to any particular coordinate
54 system. This is provided in, for example, Bio::EnsEMBL::AssemblyMapper.
55 
56 Mappings consist of pairs of 'to-' and 'from-' contigs with coordinates on each.
57 Orientation is abbreviated to 'ori',
58 
59 The contig pair hash is divided into mappings per seq_region, the code
60 below makes assumptions about how to filter these results, thus the comparisons
61 for some properties are absent in the code but implicit by data structure.
62 
63 The assembly mapping hash '_pair_last' orders itself by the target seq region and looks like this:
64 
65  1 => ARRAY(0x1024c79c0)
66  0 Bio::EnsEMBL::Mapper::Pair=HASH(0x1024d6198)
67  'from' => Bio::EnsEMBL::Mapper::Unit=HASH(0x1025edf98)
68  'end' => 4
69  'id' => 4
70  'start' => 1
71  'ori' => 1
72  'to' => Bio::EnsEMBL::Mapper::Unit=HASH(0x1025edf68)
73  'end' => 4
74  'id' => 1
75  'start' => 1
76  1 Bio::EnsEMBL::Mapper::Pair=HASH(0x1026c20f0)
77  'from' => Bio::EnsEMBL::Mapper::Unit=HASH(0x1025ee3a0)
78  'end' => 12
79  'id' => 4
80  'start' => 9
81  'ori' => 1
82  'to' => Bio::EnsEMBL::Mapper::Unit=HASH(0x1025ee370)
83  'end' => 4
84  'id' => 1
85  'start' => 1
86  2 => ARRAY(0x1025ee460)
87  0 Bio::EnsEMBL::Mapper::Pair=HASH(0x1025ee400)
88  'from' => Bio::EnsEMBL::Mapper::Unit=HASH(0x1025ee2c8)
89  'end' => 8
90  'id' => 4
91  'start' => 5
92  'ori' => 1
93  'to' => Bio::EnsEMBL::Mapper::Unit=HASH(0x1025ee2b0)
94  'end' => 4
95  'id' => 2
96  'start' => 1
97  1 Bio::EnsEMBL::Mapper::Pair=HASH(0x1025ee658)
98  'from' => Bio::EnsEMBL::Mapper::Unit=HASH(0x1025eea48)
99  'end' => 16
100  'id' => 4
101  'start' => 13
102  'ori' => 1
103  'to' => Bio::EnsEMBL::Mapper::Unit=HASH(0x1025eea18)
104  'end' => 4
105  'id' => 2
106  'start' => 1
107 
108 The other mapping hash available is the reverse sense, putting the 'from' seq_region as the
109 sorting key. Here is an excerpt.
110 
111 0 HASH(0x102690bb8)
112  4 => ARRAY(0x1025ee028)
113  0 Bio::EnsEMBL::Mapper::Pair=HASH(0x1024d6198)
114  'from' => Bio::EnsEMBL::Mapper::Unit=HASH(0x1025edf98)
115  'end' => 4
116  'id' => 4
117  'start' => 1
118  'ori' => 1
119  'to' => Bio::EnsEMBL::Mapper::Unit=HASH(0x1025edf68)
120  'end' => 4
121  'id' => 1
122  'start' => 1
123  1 Bio::EnsEMBL::Mapper::Pair=HASH(0x1025ee400)
124  'from' => Bio::EnsEMBL::Mapper::Unit=HASH(0x1025ee2c8)
125  'end' => 8
126  'id' => 4
127  'start' => 5
128  'ori' => 1
129  'to' => Bio::EnsEMBL::Mapper::Unit=HASH(0x1025ee2b0)
130  'end' => 4
131  'id' => 2
132  'start' => 1
133 
134 
135 =head1 METHODS
136 
137 =cut
138 
139 package Bio::EnsEMBL::Mapper;
140 use strict;
141 use integer;
142 
143 use Bio::EnsEMBL::Utils::Exception qw(throw warning);
150 
151 use Bio::EnsEMBL::Utils::Exception qw(throw);
154 
155 =head2 new
156 
157  Arg [1] : string $from
158  The name of the 'from' coordinate system
159  Arg [2] : string $to
160  The name of the 'to' coordinate system
161  Arg [3] : (optional) Bio::EnsEMBL::CoordSystem $from_cs
162  The 'from' coordinate system
163  Arg [4] : (optional) Bio::EnsEMBL::CoordSystem $to_cs
164  Example : my $mapper = Bio::EnsEMBL::Mapper->new('FROM', 'TO');
165  Description: Constructor. Creates a new Bio::EnsEMBL::Mapper object.
166  Returntype : Bio::EnsEMBL::Mapper
167  Exceptions : none
168  Caller : general
169 
170 =cut
171 
172 sub new {
173  my ( $proto, $from, $to, $from_cs, $to_cs ) = @_;
174 
175  if ( !defined($to) || !defined($from) ) {
176  throw("Must supply 'to' and 'from' tags");
177  }
178 
179  my $class = ref($proto) || $proto;
180 
181  my $self = bless( { "_pair_$from" => {},
182  "_pair_$to" => {},
183  "_tree_$from" => {},
184  "_tree_$to" => {},
185  'pair_count' => 0,
186  'to' => $to,
187  'from' => $from,
188  'to_cs' => $to_cs,
189  'from_cs' => $from_cs
190  },
191  $class );
192 
193  # do sql to get any componente with muliple assemblys.
194 
195  return $self;
196 }
197 
198 =head2 flush
199 
200  Args : none
201  Example : none
202  Description: removes all cached information out of this mapper
203  Returntype : none
204  Exceptions : none
206 
207 =cut
208 
209 sub flush {
210  my $self = shift;
211  my $from = $self->from();
212  my $to = $self->to();
213 
214  $self->{"_pair_$from"} = {};
215  $self->{"_pair_$to"} = {};
216  $self->{"_tree_$from"} = {};
217  $self->{"_tree_$to"} = {};
218 
219  $self->{'pair_count'} = 0;
220 }
221 
222 
223 
224 =head2 map_coordinates
225 
226  Arg 1 string $id
227  id of 'source' sequence
228  Arg 2 int $start
229  start coordinate of 'source' sequence
230  Arg 3 int $end
231  end coordinate of 'source' sequence
232  Arg 4 int $strand
233  raw contig orientation (+/- 1)
234  Arg 5 string $type
235  nature of transform - gives the type of
236  coordinates to be transformed *from*
237  Arg 6 boolean (0 or 1) $include_original_region
238  option to include original input coordinate region mappings in the result
239  Arg 7 int $cdna_coding_start
240  cdna coding start
241  Function generic map method
242  Returntype if $include_original_region == 0
243  array of mappped Bio::EnsEMBL::Mapper::Coordinate
244  and/or Bio::EnsEMBL::Mapper::Gap
245  if $include_original_region == 1
246  hash of mapped and original Bio::EnsEMBL::Mapper::Coordinate
247  and/or Bio::EnsEMBL::Mapper::Gap
248  Exceptions none
249  Caller Bio::EnsEMBL::Mapper
250 
251 =cut
252 
253 sub map_coordinates {
254  my ( $self, $id, $start, $end, $strand, $type, $include_original_region, $cdna_coding_start ) = @_;
255  unless ( defined($id)
256  && defined($start)
257  && defined($end)
258  && defined($strand)
259  && defined($type) )
260  {
261  throw("Expecting at least 5 arguments");
262  }
263 
264  $cdna_coding_start = defined $cdna_coding_start ? $cdna_coding_start : 1;
265 
266  # special case for handling inserts:
267  if ( $start == $end + 1 ) {
268  return $self->map_insert( $id, $start, $end, $strand, $type, undef, $include_original_region);
269  }
270 
271  if ( !$self->{'_is_sorted'} ) { $self->_sort() }
272 
273  my $hash = $self->{"_pair_$type"};
274 
275  my ( $from, $to, $cs );
276 
277  if ( $type eq $self->{'to'} ) {
278  $from = 'to';
279  $to = 'from';
280  $cs = $self->{'from_cs'};
281  } else {
282  $from = 'from';
283  $to = 'to';
284  $cs = $self->{'to_cs'};
285  }
286 
287  unless ( defined $hash ) {
288  throw("Type $type is neither to or from coordinate systems");
289  }
290 
291  my @result;
292  my @paired_result;
293 
294  if ( !defined $hash->{ uc($id) } ) {
295  # one big gap!
296  my $gap = Bio::EnsEMBL::Mapper::Gap->new( $start, $end );
297  if ($include_original_region) {
298  push @paired_result, { 'original' => $gap, 'mapped' => $gap };
299  return @paired_result;
300  } else {
301  push @result, $gap;
302  return @result;
303  }
304  }
305 
306  my $last_used_pair;
307 
308  # if we don't already have an interval tree built for this id,
309  # build one now
310  if ( !defined( $self->{"_tree_$type"}->{ uc($id) } )) {
311  $self->{"_tree_$type"}->{ uc($id) } = _build_immutable_tree($from,
312  $hash->{ uc($id) });
313  }
314  # query the interval tree (either cached or created new) for overlapping intervals
315  my $overlap = $self->{"_tree_$type"}->{ uc($id) }->query($start, $end);
316 
317  my $rank = 0;
318  my $orig_start = $start;
319  my $last_target_coord = undef;
320  foreach my $i (@{$overlap}) {
321  my $pair = $i->data;
322  my $self_coord = $pair->{$from};
323  my $target_coord = $pair->{$to};
324 
325  #
326  # But not the case for haplotypes!! need to test for this case???
327  # so removing this till a better solution is found
328  #
329  #
330  # if($self_coord->{'start'} < $start){
331  # $start = $orig_start;
332  # $rank++;
333  # }
334 
335  if ( defined($last_target_coord) && $target_coord->{'id'} ne $last_target_coord ) {
336  if ( $self_coord->{'start'} < $start ) {
337  # i.e. the same bit is being mapped to another assembled bit
338  $start = $orig_start;
339  }
340  } else {
341  $last_target_coord = $target_coord->{'id'};
342  }
343 
344  if ( $start < $self_coord->{'start'} ) {
345  # gap detected
346  my $gap = Bio::EnsEMBL::Mapper::Gap->new( $start, $self_coord->{'start'} - 1, $rank );
347  push( @result, $gap );
348  push(@paired_result, { 'original' => $gap, 'mapped' => $gap });
349 
350  $start = $gap->{'end'} + 1;
351  }
352  my ( $target_start, $target_end);
353  my ( $ori_start, $ori_end);
354 
355  my $res;
356  if ( exists $pair->{'indel'} ) {
357  # When next pair is an IndelPair and not a Coordinate, create the
358  # new mapping Coordinate, the IndelCoordinate.
359  $target_start = $target_coord->{'start'};
360  $target_end = $target_coord->{'end'};
361 
362  #original coordinates
363  $ori_start = $self_coord->{'start'};
364  $ori_end = $self_coord->{'end'};
365 
366  #create a Gap object
367  my $gap = Bio::EnsEMBL::Mapper::Gap->new( $start,
368  ( $self_coord->{'end'} < $end ? $self_coord->{'end'} : $end ) );
369  #create the Coordinate object
370  my $coord =
371  Bio::EnsEMBL::Mapper::Coordinate->new( $target_coord->{'id'},
372  $target_start, $target_end, $pair->{'ori'}*$strand, $cs );
373  #and finally, the IndelCoordinate object with
374  $res = Bio::EnsEMBL::Mapper::IndelCoordinate->new( $gap, $coord );
375 
376  } else {
377  # start is somewhere inside the region
378  if ( $pair->{'ori'} == 1 ) {
379  $target_start = $target_coord->{'start'} + ( $start - $self_coord->{'start'} );
380  } else {
381  $target_end = $target_coord->{'end'} - ( $start - $self_coord->{'start'} );
382  }
383 
384  # Either we are enveloping this map or not. If yes, then end
385  # point (self perspective) is determined solely by target. If
386  # not we need to adjust.
387 
388  if ( $end > $self_coord->{'end'} ) {
389  # enveloped
390  if ( $pair->{'ori'} == 1 ) {
391  $target_end = $target_coord->{'end'};
392  } else {
393  $target_start = $target_coord->{'start'};
394  }
395  } else {
396  # need to adjust end
397  if ( $pair->{'ori'} == 1 ) {
398  $target_end =
399  $target_coord->{'start'} +
400  ( $end - $self_coord->{'start'} );
401  } else {
402 
403  $target_start =
404  $target_coord->{'end'} - ( $end - $self_coord->{'start'} );
405  }
406 
407  }
408 
409  $res = Bio::EnsEMBL::Mapper::Coordinate->new( $target_coord->{'id'},
410  $target_start, $target_end, $pair->{'ori'}*$strand,$cs, $rank );
411 
412  #save the ori coordinate info
413  $ori_start = ($start - $cdna_coding_start) + 1;
414  $ori_end = $ori_start + ($target_end - $target_start);
415 
416 
417  } ## end else [ if ( exists $pair->{'indel'...})]
418 
419  push( @result, $res );
420  my $res_ori = Bio::EnsEMBL::Mapper::Coordinate->new( $self_coord->{'id'},
421  $ori_start, $ori_end, $pair->{'ori'}*$strand,$cs, $rank);
422  push(@paired_result, { 'original' => $res_ori, 'mapped' => $res });
423 
424 
425  $last_used_pair = $pair;
426  $start = $self_coord->{'end'} + 1;
427 
428  } ## end for ( my $i = $start_idx...)
429 
430  if ( !defined $last_used_pair ) {
431  my $gap = Bio::EnsEMBL::Mapper::Gap->new( $start, $end );
432  push( @result, $gap );
433  push(@paired_result, { 'original' => $gap, 'mapped' => $gap });
434 
435 
436  } elsif ( $last_used_pair->{$from}->{'end'} < $end ) {
437  # gap at the end
438  my $gap = Bio::EnsEMBL::Mapper::Gap->new(
439  $last_used_pair->{$from}->{'end'} + 1,
440  $end, $rank );
441  push( @result, $gap );
442 
443  push(@paired_result, { 'original' => $gap, 'mapped' => $gap });
444  }
445 
446  if ( $strand == -1 ) {
447  @result = reverse(@result);
448  @paired_result = reverse(@paired_result);
449  }
450 
451  if ($include_original_region){
452  return @paired_result;
453  }else{
454  return @result;
455  }
456 
457 } ## end sub map_coordinates
458 
459 
460 
461 =head2 map_insert
462 
463  Arg [1] : string $id
464  Arg [2] : int $start - start coord. Since this is an insert should always
465  be one greater than end.
466  Arg [3] : int $end - end coord. Since this is an insert should always
467  be one less than start.
468  Arg [4] : int $strand (0, 1, -1)
469  Arg [5] : string $type - the coordinate system name the coords are from.
470  Arg [6] : boolean $fastmap - if specified, this is being called from
471  the fastmap call. The mapping done is not any faster for
472  inserts, but the return value is different.
473  Example :
474  Description: This is in internal function which handles the special mapping
475  case for inserts (start = end +1). This function will be called
476  automatically by the map function so there is no reason to
477  call it directly.
478  Returntype : list of Bio::EnsEMBL::Mapper::Coordinate and/or Gap objects
479  Exceptions : none
480  Caller : map_coordinates()
481 
482 =cut
483 
484 sub map_insert {
485  my ($self, $id, $start, $end, $strand, $type, $fastmap, $include_original_region) = @_;
486 
487  # swap start/end and map the resultant 2bp coordinate
488  ($start, $end) =($end,$start);
489 
490  my @coords = $self->map_coordinates($id, $start, $end, $strand, $type, $include_original_region);
491 
492  if(@coords == 1) {
493  my $c = $coords[0];
494  if ($include_original_region) {
495  my $m = $c->{'mapped'};
496  my $orig = $c->{'original'};
497  ($m->{'start'}, $m->{'end'}) = ($m->{'end'}, $m->{'start'});
498  ($orig->{'start'}, $orig->{'end'}) = ($orig->{'end'}, $orig->{'start'});
499  } else {
500  ($c->{'start'}, $c->{'end'}) = ($c->{'end'}, $c->{'start'});
501  }
502  } else {
503  throw("Unexpected: Got ",scalar(@coords)," expected 2.") if(@coords != 2);
504 
505  # adjust coordinates, remove gaps
506  my ($c1, $c2);
507 
508  if($strand == -1) {
509  ($c2,$c1) = @coords;
510  } else {
511  ($c1, $c2) = @coords;
512  }
513  @coords = ();
514 
515  my ($m1, $m2);
516  if ($include_original_region) {
517  ($m1, $m2) = ($c1->{'mapped'}, $c2->{'mapped'});
518  } else {
519  ($m1, $m2) = ($c1, $c2);
520  }
521  if(ref($m1) eq 'Bio::EnsEMBL::Mapper::Coordinate') {
522  # insert is after first coord
523  if($m1->{'strand'} * $strand == -1) {
524  $m1->{'end'}--;
525  } else {
526  $m1->{'start'}++;
527  }
528  if ($include_original_region) {
529  $c1->{'mapped'} = $m1;
530  @coords = ($c1);
531  } else {
532  @coords = ($m1);
533  }
534  }
535  if(ref($m2) eq 'Bio::EnsEMBL::Mapper::Coordinate') {
536  # insert is before second coord
537  if($m2->{'strand'} * $strand == -1) {
538  $m2->{'start'}++;
539  } else {
540  $m2->{'end'}--;
541  }
542  if($strand == -1) {
543  if ($include_original_region) {
544  $c2->{'mapped'} = $m2;
545  unshift @coords, $c2;
546  } else {
547  unshift @coords, $m2;
548  }
549  } else {
550  if ($include_original_region) {
551  $c2->{'mapped'} = $m2;
552  push @coords, $c2;
553  } else {
554  push @coords, $m2;
555  }
556  }
557  }
558 
559  # as with @coords == 1 above, if we are in
560  # include_original_region mode, then we have
561  # to flip back the original region as well
562  if ($include_original_region) {
563  foreach my $coord (@coords) {
564  my $orig = $coord->{'original'};
565  ($orig->{'start'}, $orig->{'end'}) =
566  ($orig->{'end'}, $orig->{'start'});
567  }
568  }
569  }
570 
571  if($fastmap) {
572  return undef if(@coords != 1);
573  my $c = $include_original_region ? $coords[0]->{'mapped'} : $coords[0];
574  return ($c->{'id'}, $c->{'start'}, $c->{'end'},
575  $c->{'strand'}, $c->{'coord_system'});
576  }
577 
578  return @coords;
579 }
580 
581 
582 
583 
584 
585 
586 =head2 fastmap
587 
588  Arg 1 string $id
589  id of 'source' sequence
590  Arg 2 int $start
591  start coordinate of 'source' sequence
592  Arg 3 int $end
593  end coordinate of 'source' sequence
594  Arg 4 int $strand
595  raw contig orientation (+/- 1)
596  Arg 5 int $type
597  nature of transform - gives the type of
598  coordinates to be transformed *from*
599  Function inferior map method. Will only do ungapped unsplit mapping.
600  Will return id, start, end strand in a list.
601  Returntype list of results
602  Exceptions none
603  Caller Bio::EnsEMBL::AssemblyMapper
604 
605 =cut
606 
607 sub fastmap {
608  my ($self, $id, $start, $end, $strand, $type) = @_;
609 
610  my ($from, $to, $cs);
611 
612  if($end+1 == $start) {
613  return $self->map_insert($id, $start, $end, $strand, $type, 1);
614  }
615 
616  if( ! $self->{'_is_sorted'} ) { $self->_sort() }
617 
618  if($type eq $self->{'to'}) {
619  $from = 'to';
620  $to = 'from';
621  $cs = $self->{'from_cs'};
622  } else {
623  $from = 'from';
624  $to = 'to';
625  $cs = $self->{'to_cs'};
626  }
627 
628  my $hash = $self->{"_pair_$type"} or
629  throw("Type $type is neither to or from coordinate systems");
630 
631 
632  my $pairs = $hash->{uc($id)};
633 
634  foreach my $pair (@$pairs) {
635  my $self_coord = $pair->{$from};
636  my $target_coord = $pair->{$to};
637 
638  # only super easy mapping is done
639  if( $start < $self_coord->{'start'} ||
640  $end > $self_coord->{'end'} ) {
641  next;
642  }
643 
644  if( $pair->{'ori'} == 1 ) {
645  return ( $target_coord->{'id'},
646  $target_coord->{'start'}+$start-$self_coord->{'start'},
647  $target_coord->{'start'}+$end-$self_coord->{'start'},
648  $strand, $cs );
649  } else {
650  return ( $target_coord->{'id'},
651  $target_coord->{'end'} - ($end - $self_coord->{'start'}),
652  $target_coord->{'end'} - ($start - $self_coord->{'start'}),
653  -$strand, $cs );
654  }
655  }
656 
657  return ();
658 }
659 
660 
661 
662 =head2 add_map_coordinates
663 
664  Arg 1 int $id
665  id of 'source' sequence
666  Arg 2 int $start
667  start coordinate of 'source' sequence
668  Arg 3 int $end
669  end coordinate of 'source' sequence
670  Arg 4 int $strand
671  relative orientation of source and target (+/- 1)
672  Arg 5 int $id
673  id of 'target' sequence
674  Arg 6 int $start
675  start coordinate of 'target' sequence
676  Arg 7 int $end
677  end coordinate of 'target' sequence
678  Function Stores details of mapping between
679  'source' and 'target' regions.
680  Returntype none
681  Exceptions none
682  Caller Bio::EnsEMBL::Mapper
683 
684 =cut
685 
686 sub add_map_coordinates {
687  my ( $self, $contig_id, $contig_start, $contig_end, $contig_ori,
688  $chr_name, $chr_start, $chr_end )
689  = @_;
690 
691  unless ( defined($contig_id)
692  && defined($contig_start)
693  && defined($contig_end)
694  && defined($contig_ori)
695  && defined($chr_name)
696  && defined($chr_start)
697  && defined($chr_end) )
698  {
699  throw("7 arguments expected");
700  }
701 
702  if ( ( $chr_end > $chr_start ) and ( $contig_end - $contig_start ) != ( $chr_end - $chr_start ) ) {
703  throw("Cannot deal with mis-lengthed mappings so far");
704  }
705 
706  my $from = Bio::EnsEMBL::Mapper::Unit->new( $contig_id, $contig_start,
707  $contig_end );
708  my $to =
709  Bio::EnsEMBL::Mapper::Unit->new( $chr_name, $chr_start, $chr_end );
710 
711  my $pair = Bio::EnsEMBL::Mapper::Pair->new( $from, $to, $contig_ori );
712 
713  # place into hash on both ids
714  my $map_to = $self->{'to'};
715  my $map_from = $self->{'from'};
716 
717  push( @{ $self->{"_pair_$map_to"}->{ uc($chr_name) } }, $pair );
718  push( @{ $self->{"_pair_$map_from"}->{ uc($contig_id) } }, $pair );
719 
720  # any interval trees for this set of intervals are now invalid
721  # so remove them
722  $self->{"_tree_$map_to"}->{ uc($contig_id) } = undef;
723  $self->{"_tree_$map_from"}->{ uc($contig_id) } = undef;
724 
725  $self->{'pair_count'}++;
726  $self->{'_is_sorted'} = 0;
727 } ## end sub add_map_coordinates
728 
729 
730 =head2 add_indel_coordinates
731 
732  Arg 1 int $id
733  id of 'source' sequence
734  Arg 2 int $start
735  start coordinate of 'source' sequence
736  Arg 3 int $end
737  end coordinate of 'source' sequence
738  Arg 4 int $strand
739  relative orientation of source and target (+/- 1)
740  Arg 5 int $id
741  id of 'targe' sequence
742  Arg 6 int $start
743  start coordinate of 'targe' sequence
744  Arg 7 int $end
745  end coordinate of 'targe' sequence
746  Function stores details of mapping between two regions:
747  'source' and 'target'. Returns 1 if the pair was added, 0 if it
748  was already in. Used when adding an indel
749  Returntype int 0,1
750  Exceptions none
751  Caller Bio::EnsEMBL::Mapper
752 
753 =cut
754 
755 sub add_indel_coordinates{
756  my ($self, $contig_id, $contig_start, $contig_end,
757  $contig_ori, $chr_name, $chr_start, $chr_end) = @_;
758 
759  unless(defined($contig_id) && defined($contig_start) && defined($contig_end)
760  && defined($contig_ori) && defined($chr_name) && defined($chr_start)
761  && defined($chr_end)) {
762  throw("7 arguments expected");
763  }
764 
765  #we need to create the IndelPair object to add to both lists, to and from
766  my $from =
767  Bio::EnsEMBL::Mapper::Unit->new($contig_id, $contig_start, $contig_end);
768  my $to =
769  Bio::EnsEMBL::Mapper::Unit->new($chr_name, $chr_start, $chr_end);
770 
771  my $pair = Bio::EnsEMBL::Mapper::IndelPair->new($from, $to, $contig_ori);
772 
773  # place into hash on both ids
774  my $map_to = $self->{'to'};
775  my $map_from = $self->{'from'};
776 
777  push( @{$self->{"_pair_$map_to"}->{uc($chr_name)}}, $pair );
778  push( @{$self->{"_pair_$map_from"}->{uc($contig_id)}}, $pair );
779 
780  # any interval trees for this set of intervals are now invalid
781  # so remove them
782  $self->{"_tree_$map_to"}->{ uc($chr_name) } = undef;
783  $self->{"_tree_$map_from"}->{ uc($contig_id) } = undef;
784 
785  $self->{'pair_count'}++;
786 
787  $self->{'_is_sorted'} = 0;
788  return 1;
789 }
790 
791 
792 =head2 map_indel
793 
794  Arg [1] : string $id
795  Arg [2] : int $start - start coord. Since this is an indel should always
796  be one greater than end.
797  Arg [3] : int $end - end coord. Since this is an indel should always
798  be one less than start.
799  Arg [4] : int $strand (0, 1, -1)
800  Arg [5] : string $type - the coordinate system name the coords are from.
801  Example : @coords = $mapper->map_indel();
802  Description: This is in internal function which handles the special mapping
803  case for indels (start = end +1). It will be used to map from
804  a coordinate system with a gap to another that contains an
805  insertion. It will be mainly used by the Variation API.
806  Returntype : Bio::EnsEMBL::Mapper::Unit objects
807  Exceptions : none
808  Caller : general
809 
810 =cut
811 
812 sub map_indel {
813  my ( $self, $id, $start, $end, $strand, $type ) = @_;
814 
815  # swap start/end and map the resultant 2bp coordinate
816  ( $start, $end ) = ( $end, $start );
817 
818  if ( !$self->{'_is_sorted'} ) { $self->_sort() }
819 
820  my $hash = $self->{"_pair_$type"};
821 
822  my ( $from, $to, $cs );
823 
824  if ( $type eq $self->{'to'} ) {
825  $from = 'to';
826  $to = 'from';
827  $cs = $self->{'from_cs'};
828  } else {
829  $from = 'from';
830  $to = 'to';
831  $cs = $self->{'to_cs'};
832  }
833 
834  unless ( defined $hash ) {
835  throw("Type $type is neither to or from coordinate systems");
836  }
837  my @indel_coordinates;
838 
839  # if we don't already have an interval tree built for this id,
840  # build one now
841  if ( !defined $self->{"_tree_$type"}->{ uc($id) } ) {
842  $self->{"_tree_$type"}->{ uc($id) } = _build_immutable_tree($from,
843  $hash->{ uc($id) });
844  }
845  # query the interval tree (either cached or created new) for overlapping intervals
846  my $overlap = $self->{"_tree_$type"}->{ uc($id) }->query($start, $end);
847 
848  foreach my $i (@{$overlap}) {
849  my $pair = $i->data;
850  my $self_coord = $pair->{$from};
851  my $target_coord = $pair->{$to};
852 
853  if ( exists $pair->{'indel'} ) {
854  #need to return unit coordinate
855  my $to =
856  Bio::EnsEMBL::Mapper::Unit->new( $target_coord->{'id'},
857  $target_coord->{'start'},
858  $target_coord->{'end'}, );
859  push @indel_coordinates, $to;
860  last;
861  }
862  }
863 
864  return @indel_coordinates;
865 } ## end sub map_indel
866 
867 
868 =head2 add_Mapper
869 
870  Arg 1 Bio::EnsEMBL::Mapper $mapper2
871  Example $mapper->add_Mapper($mapper2)
872  Function add all the map coordinates from $mapper to this mapper.
873  This object will contain mapping pairs from both the old
874  object and $mapper2.
875  Returntype int 0,1
876  Exceptions throw if 'to' and 'from' from both Bio::EnsEMBL::Mappers
877  are incompatible
878  Caller $mapper->methodname()
879 
880 =cut
881 
882 sub add_Mapper{
883  my ($self, $mapper) = @_;
884 
885  my $mapper_to = $mapper->{'to'};
886  my $mapper_from = $mapper->{'from'};
887  if ($mapper_to ne $self->{'to'} or $mapper_from ne $self->{'from'}) {
888  throw("Trying to add an incompatible Mapper");
889  }
890 
891  my $count_a = 0;
892  foreach my $seq_name (keys %{$mapper->{"_pair_$mapper_to"}}) {
893  push(@{$self->{"_pair_$mapper_to"}->{$seq_name}},
894  @{$mapper->{"_pair_$mapper_to"}->{$seq_name}});
895  $self->{"_tree_$mapper_to"}->{ uc($seq_name) } = undef;
896  $count_a += scalar(@{$mapper->{"_pair_$mapper_to"}->{$seq_name}});
897  }
898  my $count_b = 0;
899  foreach my $seq_name (keys %{$mapper->{"_pair_$mapper_from"}}) {
900  push(@{$self->{"_pair_$mapper_from"}->{$seq_name}},
901  @{$mapper->{"_pair_$mapper_from"}->{$seq_name}});
902  $self->{"_tree_$mapper_from"}->{ uc($seq_name) } = undef;
903  $count_b += scalar(@{$mapper->{"_pair_$mapper_from"}->{$seq_name}});
904  }
905 
906  if ($count_a == $count_b) {
907  $self->{'pair_count'} += $count_a;
908  } else {
909  throw("Trying to add a funny Mapper");
910  }
911 
912  $self->{'_is_sorted'} = 0;
913  return 1;
914 }
915 
916 
917 
918 =head2 list_pairs
919 
920  Arg 1 int $id
921  id of 'source' sequence
922  Arg 2 int $start
923  start coordinate of 'source' sequence
924  Arg 3 int $end
925  end coordinate of 'source' sequence
926  Arg 4 string $type
927  nature of transform - gives the type of
928  coordinates to be transformed *from*
929  Function list all pairs of mappings in a region
930  Returntype list of Bio::EnsEMBL::Mapper::Pair
931  Exceptions none
932  Caller Bio::EnsEMBL::Mapper
933 
934 =cut
935 
936 sub list_pairs {
937  my ( $self, $id, $start, $end, $type ) = @_;
938 
939  if ( !$self->{'_is_sorted'} ) { $self->_sort() }
940 
941  if ( !defined $type ) {
942  throw("Expected 4 arguments");
943  }
944 
945  if ( $start > $end ) {
946  throw( "Start is greater than end "
947  . "for id $id, start $start, end $end\n" );
948  }
949 
950  my $hash = $self->{"_pair_$type"};
951 
952  my ( $from, $to );
953 
954  if ( $type eq $self->{'to'} ) {
955  $from = 'to';
956  $to = 'from';
957  } else {
958  $from = 'from';
959  $to = 'to';
960  }
961 
962  unless ( defined $hash ) {
963  throw("Type $type is neither to or from coordinate systems");
964  }
965 
966  my @list;
967 
968  unless ( exists $hash->{ uc($id) } ) {
969  return ();
970  }
971 
972  @list = @{ $hash->{ uc($id) } };
973 
974  my @output;
975  if ( $start == -1 && $end == -1 ) {
976  return @list;
977  } else {
978 
979  foreach my $p (@list) {
980 
981  if ( $p->{$from}->{'end'} < $start ) {
982  next;
983  }
984  if ( $p->{$from}->{'start'} > $end ) {
985  last;
986  }
987  push( @output, $p );
988  }
989  return @output;
990  }
991 } ## end sub list_pairs
992 
993 
994 =head2 to
995 
997  id of 'source' sequence
998  Function accessor method form the 'source'
999  and 'target' in a Mapper::Pair
1000  Returntype Bio::EnsEMBL::Mapper::Unit
1001  Exceptions none
1002  Caller Bio::EnsEMBL::Mapper
1003 
1004 =cut
1005 
1006 sub to {
1007  my ( $self, $value ) = @_;
1008 
1009  if ( defined($value) ) {
1010  $self->{'to'} = $value;
1011  }
1012 
1013  return $self->{'to'};
1014 }
1015 
1016 =head2 from
1017 
1018  Arg 1 Bio::EnsEMBL::Mapper::Unit $id
1019  id of 'source' sequence
1020  Function accessor method form the 'source'
1021  and 'target' in a Mapper::Pair
1022  Returntype Bio::EnsEMBL::Mapper::Unit
1023  Exceptions none
1024  Caller Bio::EnsEMBL::Mapper
1025 
1026 =cut
1027 sub from {
1028  my ( $self, $value ) = @_;
1029 
1030  if ( defined($value) ) {
1031  $self->{'from'} = $value;
1032  }
1033 
1034  return $self->{'from'};
1035 }
1036 
1037 
1038 # _dump
1039 #
1040 # Arg 1 *FileHandle $fh
1041 # Function convenience dump function
1042 # possibly useful for debugging
1043 # Returntype none
1044 # Exceptions none
1045 # Caller internal
1046 #
1047 
1048 sub _dump{
1049  my ($self,$fh) = @_;
1050 
1051  if( !defined $fh ) {
1052  $fh = \*STDERR;
1053  }
1054 
1055  my $from = $self->{'from'};
1056  my $to = $self->{'to'};
1057 
1058  print $fh "dumping from-hash _pair_$from\n";
1059  foreach my $id ( keys %{$self->{"_pair_$from"}} ) {
1060  print $fh "{_pair_$from}->{" . uc($id) . "}:\n";
1061  foreach my $pair ( @{$self->{"_pair_$from"}->{uc($id)}} ) {
1062  print $fh " ",$pair->from->start," ",$pair->from->end,":",$pair->to->start," ",$pair->to->end," ",$pair->to->id,"\n";
1063  }
1064  if (defined($self->{"_tree_$from"}->{uc($id)})) {
1065  print $fh "{_tree_$from}->{" . uc($id) . "} instantiated\n";
1066  } else {
1067  print $fh "{_tree_$from}->{" . uc($id) . "} empty\n";
1068  }
1069  }
1070  print $fh "dumping to-hash _pair_$to\n";
1071  foreach my $id ( keys %{$self->{"_pair_$to"}} ) {
1072  print $fh "{_pair_$to}->{" . uc($id) . "}:\n";
1073  foreach my $pair ( @{$self->{"_pair_$to"}->{uc($id)}} ) {
1074  print $fh " ",$pair->to->start," ",$pair->to->end,":",$pair->from->start," ",$pair->from->end," ",$pair->from->id,"\n";
1075  }
1076  if (defined($self->{"_tree_$to"}->{uc($id)})) {
1077  print $fh "{_tree_$to}->{" . uc($id) . "} instantiated\n";
1078  } else {
1079  print $fh "{_tree_$to}->{" . uc($id) . "} empty\n";
1080  }
1081  }
1082 }
1083 
1084 
1085 # _sort
1086 #
1087 # Function sort function so that all
1088 # mappings are sorted by
1089 # chromosome start
1090 # Returntype none
1091 # Exceptions none
1092 # Caller internal
1093 #
1094 
1095 sub _sort {
1096  my ($self) = @_;
1097 
1098  my $to = $self->{'to'};
1099  my $from = $self->{'from'};
1100 
1101  foreach my $id ( keys %{ $self->{"_pair_$from"} } ) {
1102  @{ $self->{"_pair_$from"}->{$id} } =
1103  sort { $a->{'from'}->{'start'} <=> $b->{'from'}->{'start'} }
1104  @{ $self->{"_pair_$from"}->{$id} };
1105  }
1106 
1107  foreach my $id ( keys %{ $self->{"_pair_$to"} } ) {
1108  @{ $self->{"_pair_$to"}->{$id} } =
1109  sort { $a->{'to'}->{'start'} <=> $b->{'to'}->{'start'} }
1110  @{ $self->{"_pair_$to"}->{$id} };
1111  }
1112 
1113  $self->_merge_pairs();
1114  $self->_is_sorted(1);
1115 }
1116 
1117 # this function merges pairs that are adjacent into one
1118 sub _merge_pairs {
1119  my $self = shift;
1120 
1121  my ( $lr, $lr_from, $del_pair, $next_pair, $current_pair );
1122 
1123  my $map_to = $self->{'to'};
1124  my $map_from = $self->{'from'};
1125  $self->{'pair_count'} = 0;
1126 
1127  for my $key ( keys %{$self->{"_pair_$map_to"}} ) {
1128  # merging pairs means all interval trees for
1129  # this set of intervals are now invalid, so delete them
1130  $self->{"_tree_$map_to"}->{ uc($key) } = undef;
1131  $lr = $self->{"_pair_$map_to"}->{$key};
1132 
1133  my $i = 0;
1134  my $next = 1;
1135  my $length = $#{$lr};
1136  while( $next <= $length ) {
1137  $current_pair = $lr->[$i];
1138  $next_pair = $lr->[$next];
1139  $del_pair = undef;
1140 
1141  if(exists $current_pair->{'indel'} || exists $next_pair->{'indel'}){
1142  #necessary to modify the merge function to not merge indels
1143  $next++;
1144  $i++;
1145  }
1146  else{
1147  # duplicate filter
1148  if( $current_pair->{'to'}->{'start'} == $next_pair->{'to'}->{'start'}
1149  && $current_pair->{'from'}->{'id'} == $next_pair->{'from'}->{'id'}
1150  && $current_pair->{'from'}->{'start'} == $next_pair->{'from'}->{'start'}) {
1151  # Modified in e75 to support GRCh38. Contigs used repeatedly in one seq region were
1152  # being pre-emptively deleted as copies. Extra from-start condition above ensures copy
1153  # deletion is restricted to same-location copies. Even more stringent checks can be made
1154  # at cost of speed.
1155  $del_pair = $next_pair;
1156  } elsif ( ( $current_pair->{'from'}->{'id'} eq $next_pair->{'from'}->{'id'} ) &&
1157  ( $next_pair->{'ori'} == $current_pair->{'ori'} ) &&
1158  ( $next_pair->{'to'}->{'start'} -1 == $current_pair->{'to'}->{'end'} )) {
1159 
1160  if( $current_pair->{'ori'} == 1 ) {
1161  # check forward strand merge
1162  if( $next_pair->{'from'}->{'start'} - 1 == $current_pair->{'from'}->{'end'} ) {
1163  # normal merge with previous element
1164  $current_pair->{'to'}->{'end'} = $next_pair->{'to'}->{'end'};
1165  $current_pair->{'from'}->{'end'} = $next_pair->{'from'}->{'end'};
1166  $del_pair = $next_pair;
1167  }
1168  } else {
1169  # check backward strand merge
1170  if( $next_pair->{'from'}->{'end'} + 1 == $current_pair->{'from'}->{'start'} ) {
1171  # yes its a merge
1172  $current_pair->{'to'}->{'end'} = $next_pair->{'to'}->{'end'};
1173  $current_pair->{'from'}->{'start'} = $next_pair->{'from'}->{'start'};
1174  $del_pair = $next_pair;
1175  }
1176  }
1177  }
1178 
1179  if( defined $del_pair ) {
1180  splice( @$lr, $next, 1 );
1181  $lr_from = $self->{"_pair_$map_from"}->{uc($del_pair->{'from'}->{'id'})};
1182  for( my $j=0; $j <= $#{$lr_from}; $j++ ) {
1183  if( $lr_from->[$j] == $del_pair ) {
1184  splice( @$lr_from, $j, 1 );
1185  last;
1186  }
1187  }
1188  $length--;
1189  if( $length < $next ) { last; }
1190  }
1191  else {
1192  $next++;
1193  $i++;
1194  }
1195  }
1196 
1197  }
1198  $self->{'pair_count'} += scalar( @$lr );
1199  }
1200 }
1201 
1202 
1203 # _is_sorted
1204 #
1205 # Arg 1 int $sorted
1206 # Function toggle for whether the (internal)
1207 # map data are sorted
1208 # Returntype int
1209 # Exceptions none
1210 # Caller internal
1211 #
1212 
1213 sub _is_sorted {
1214  my ($self, $value) = @_;
1215  $self->{'_is_sorted'} = $value if (defined($value));
1216  return $self->{'_is_sorted'};
1217 }
1218 
1219 # _build_immutable_tree
1220 #
1221 # Arg 1 string $pair_side - the from or to half of each pair to be
1222 # the source of intervals
1223 # Arg 2 listref $pair_list - a list of Bio::EnsEMBL::Mapper::Pair
1224 # Function builds a Bio::EnsEMBL::Utils::Tree::Interval::Immutable with
1225 # intervals corresponding to the chosen side (from or to) of
1226 # each Pair in $pair_list and a pointer to each Pair
1227 # Returntype Bio::EnsEMBL::Utils::Tree::Interval::Immutable
1228 # Exceptions none
1229 # Caller internal
1230 
1231 sub _build_immutable_tree {
1232  my ($pair_side, $pair_list) = @_;
1233  # create set of intervals for the tree
1234  my $from_intervals;
1235 
1236  foreach my $i (@{$pair_list}) {
1237  my $start = $i->{$pair_side}{start};
1238  my $end = $i->{$pair_side}{end};
1239 
1240  if ($end < $start) {
1241  ($end, $start) = ($start, $end);
1242  }
1243 
1244  push @{$from_intervals}, Bio::EnsEMBL::Utils::Interval->new($start, $end, $i);
1245  }
1246 
1247  # Create the interval tree defined on the above set of intervals
1248  #
1249  # As of release/99, we have more experience with the immutable
1250  # interval tree, so we will stick with this one. A future
1251  # refactoring effort may wish to replace this with a dynamically
1252  # maintained mutable interval tree, rather than simply throwing
1253  # trees away and rebuilding when the underlying interval set changes
1254  return Bio::EnsEMBL::Utils::Tree::Interval::Immutable->new($from_intervals);
1255 }
1256 
1257 1;
Bio::EnsEMBL::Utils::Interval
Definition: Interval.pm:41
Bio::EnsEMBL::Mapper::Unit
Definition: Unit.pm:15
Bio::EnsEMBL::Mapper::IndelCoordinate
Definition: IndelCoordinate.pm:16
EnsEMBL
Definition: Filter.pm:1
Bio::EnsEMBL::AssemblyMapper
Definition: AssemblyMapper.pm:49
Bio::EnsEMBL::Mapper::Coordinate
Definition: Coordinate.pm:14
map
public map()
Bio::EnsEMBL::Mapper::Gap::new
public Bio::EnsEMBL::Mapper::Gap new()
AssemblyMapper
Definition: BlastzAligner.pm:7
Bio::EnsEMBL::Mapper::IndelPair
Definition: IndelPair.pm:15
Bio::EnsEMBL::ChainedAssemblyMapper
Definition: ChainedAssemblyMapper.pm:54
Bio::EnsEMBL::CoordSystem
Definition: CoordSystem.pm:40
Bio::EnsEMBL::Mapper::Pair
Definition: Pair.pm:15
about
public about()
Bio::EnsEMBL::Mapper::Pair::new
public new()
Bio::EnsEMBL::Mapper::Coordinate::new
public new()
Bio::EnsEMBL::Mapper::Unit::new
public new()
Bio::EnsEMBL::Mapper::Gap
Definition: Gap.pm:14
Bio::EnsEMBL::Utils::Interval::new
public Bio::EnsEMBL::Utils::Interval new()
Bio::EnsEMBL::Mapper::IndelPair::new
public new()
Bio::EnsEMBL::Utils::Tree::Interval::Immutable
Definition: Node.pm:8
Bio
Definition: AltAlleleGroup.pm:4
Bio::EnsEMBL::Utils::Exception
Definition: Exception.pm:68
Bio::EnsEMBL::Mapper::IndelCoordinate::new
public Bio::EnsEMBL::Mapper::IndelCoordinate new()
Bio::EnsEMBL::Mapper
Definition: Coordinate.pm:3