ensembl-hive  2.8.1
Mapper.pm
Go to the documentation of this file.
1 =head1 LICENSE
2 
3 See the NOTICE file distributed with this work for additional information
4 regarding copyright ownership.
5 
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9 
10  http://www.apache.org/licenses/LICENSE-2.0
11 
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 
18 =cut
19 
20 
21 =head1 CONTACT
22 
23  Please email comments or questions to the public Ensembl
24  developers list at <http://lists.ensembl.org/mailman/listinfo/dev>.
25 
26  Questions may also be sent to the Ensembl help desk at
27  <http://www.ensembl.org/Help/Contact>.
28 
29 =cut
30 
31 =head1 NAME
32 
34 
35 =head1 SYNOPSIS
36 
37  $map = Bio::EnsEMBL::Mapper->new( 'rawcontig', 'chromosome' );
38 
39  # add a coodinate mapping - supply two pairs or coordinates
40  $map->add_map_coordinates(
41  $contig_id, $contig_start, $contig_end, $contig_ori,
42  $chr_name, chr_start, $chr_end
43  );
44 
45  # map from one coordinate system to another
46  my @coordlist =
47  $mapper->map_coordinates( 627012, 2, 5, -1, "rawcontig" );
48 
49 =head1 DESCRIPTION
50 
51 Generic mapper to provide coordinate transforms between two disjoint
52 coordinate systems. This mapper is intended to be 'context neutral' - in
53 that it does not contain any code relating to any particular coordinate
54 system. This is provided in, for example, Bio::EnsEMBL::AssemblyMapper.
55 
56 Mappings consist of pairs of 'to-' and 'from-' contigs with coordinates on each.
57 Orientation is abbreviated to 'ori',
58 
59 The contig pair hash is divided into mappings per seq_region, the code
60 below makes assumptions about how to filter these results, thus the comparisons
61 for some properties are absent in the code but implicit by data structure.
62 
63 The assembly mapping hash '_pair_last' orders itself by the target seq region and looks like this:
64 
65  1 => ARRAY(0x1024c79c0)
66  0 Bio::EnsEMBL::Mapper::Pair=HASH(0x1024d6198)
67  'from' => Bio::EnsEMBL::Mapper::Unit=HASH(0x1025edf98)
68  'end' => 4
69  'id' => 4
70  'start' => 1
71  'ori' => 1
72  'to' => Bio::EnsEMBL::Mapper::Unit=HASH(0x1025edf68)
73  'end' => 4
74  'id' => 1
75  'start' => 1
76  1 Bio::EnsEMBL::Mapper::Pair=HASH(0x1026c20f0)
77  'from' => Bio::EnsEMBL::Mapper::Unit=HASH(0x1025ee3a0)
78  'end' => 12
79  'id' => 4
80  'start' => 9
81  'ori' => 1
82  'to' => Bio::EnsEMBL::Mapper::Unit=HASH(0x1025ee370)
83  'end' => 4
84  'id' => 1
85  'start' => 1
86  2 => ARRAY(0x1025ee460)
87  0 Bio::EnsEMBL::Mapper::Pair=HASH(0x1025ee400)
88  'from' => Bio::EnsEMBL::Mapper::Unit=HASH(0x1025ee2c8)
89  'end' => 8
90  'id' => 4
91  'start' => 5
92  'ori' => 1
93  'to' => Bio::EnsEMBL::Mapper::Unit=HASH(0x1025ee2b0)
94  'end' => 4
95  'id' => 2
96  'start' => 1
97  1 Bio::EnsEMBL::Mapper::Pair=HASH(0x1025ee658)
98  'from' => Bio::EnsEMBL::Mapper::Unit=HASH(0x1025eea48)
99  'end' => 16
100  'id' => 4
101  'start' => 13
102  'ori' => 1
103  'to' => Bio::EnsEMBL::Mapper::Unit=HASH(0x1025eea18)
104  'end' => 4
105  'id' => 2
106  'start' => 1
107 
108 The other mapping hash available is the reverse sense, putting the 'from' seq_region as the
109 sorting key. Here is an excerpt.
110 
111 0 HASH(0x102690bb8)
112  4 => ARRAY(0x1025ee028)
113  0 Bio::EnsEMBL::Mapper::Pair=HASH(0x1024d6198)
114  'from' => Bio::EnsEMBL::Mapper::Unit=HASH(0x1025edf98)
115  'end' => 4
116  'id' => 4
117  'start' => 1
118  'ori' => 1
119  'to' => Bio::EnsEMBL::Mapper::Unit=HASH(0x1025edf68)
120  'end' => 4
121  'id' => 1
122  'start' => 1
123  1 Bio::EnsEMBL::Mapper::Pair=HASH(0x1025ee400)
124  'from' => Bio::EnsEMBL::Mapper::Unit=HASH(0x1025ee2c8)
125  'end' => 8
126  'id' => 4
127  'start' => 5
128  'ori' => 1
129  'to' => Bio::EnsEMBL::Mapper::Unit=HASH(0x1025ee2b0)
130  'end' => 4
131  'id' => 2
132  'start' => 1
133 
134 
135 =head1 METHODS
136 
137 =cut
138 
139 package Bio::EnsEMBL::Mapper;
140 
141 use strict;
142 use integer;
143 
144 use Bio::EnsEMBL::Utils::Exception qw(throw warning);
151 
152 use Bio::EnsEMBL::Utils::Exception qw(throw);
155 
156 =head2 new
157 
158  Arg [1] : string $from
159  The name of the 'from' coordinate system
160  Arg [2] : string $to
161  The name of the 'to' coordinate system
162  Arg [3] : (optional) Bio::EnsEMBL::CoordSystem $from_cs
163  The 'from' coordinate system
164  Arg [4] : (optional) Bio::EnsEMBL::CoordSystem $to_cs
165  Example : my $mapper = Bio::EnsEMBL::Mapper->new('FROM', 'TO');
166  Description: Constructor. Creates a new Bio::EnsEMBL::Mapper object.
167  Returntype : Bio::EnsEMBL::Mapper
168  Exceptions : none
169  Caller : general
170 
171 =cut
172 
173 sub new {
174  my ( $proto, $from, $to, $from_cs, $to_cs ) = @_;
175 
176  if ( !defined($to) || !defined($from) ) {
177  throw("Must supply 'to' and 'from' tags");
178  }
179 
180  my $class = ref($proto) || $proto;
181 
182  my $self = bless( { "_pair_$from" => {},
183  "_pair_$to" => {},
184  "_tree_$from" => {},
185  "_tree_$to" => {},
186  'pair_count' => 0,
187  'to' => $to,
188  'from' => $from,
189  'to_cs' => $to_cs,
190  'from_cs' => $from_cs
191  },
192  $class );
193 
194  # do sql to get any componente with muliple assemblys.
195 
196  return $self;
197 }
198 
199 =head2 flush
200 
201  Args : none
202  Example : none
203  Description: removes all cached information out of this mapper
204  Returntype : none
205  Exceptions : none
207 
208 =cut
209 
210 sub flush {
211  my $self = shift;
212  my $from = $self->from();
213  my $to = $self->to();
214 
215  $self->{"_pair_$from"} = {};
216  $self->{"_pair_$to"} = {};
217  $self->{"_tree_$from"} = {};
218  $self->{"_tree_$to"} = {};
219 
220  $self->{'pair_count'} = 0;
221 }
222 
223 
224 
225 =head2 map_coordinates
226 
227  Arg 1 string $id
228  id of 'source' sequence
229  Arg 2 int $start
230  start coordinate of 'source' sequence
231  Arg 3 int $end
232  end coordinate of 'source' sequence
233  Arg 4 int $strand
234  raw contig orientation (+/- 1)
235  Arg 5 string $type
236  nature of transform - gives the type of
237  coordinates to be transformed *from*
238  Arg 6 boolean (0 or 1) $include_original_region
239  option to include original input coordinate region mappings in the result
240  Arg 7 int $cdna_coding_start
241  cdna coding start
242  Function generic map method
243  Returntype if $include_original_region == 0
244  array of mappped Bio::EnsEMBL::Mapper::Coordinate
245  and/or Bio::EnsEMBL::Mapper::Gap
246  if $include_original_region == 1
247  hash of mapped and original Bio::EnsEMBL::Mapper::Coordinate
248  and/or Bio::EnsEMBL::Mapper::Gap
249  Exceptions none
250  Caller Bio::EnsEMBL::Mapper
251 
252 =cut
253 
254 sub map_coordinates {
255  my ( $self, $id, $start, $end, $strand, $type, $include_original_region, $cdna_coding_start ) = @_;
256  unless ( defined($id)
257  && defined($start)
258  && defined($end)
259  && defined($strand)
260  && defined($type) )
261  {
262  throw("Expecting at least 5 arguments");
263  }
264 
265  $cdna_coding_start = defined $cdna_coding_start ? $cdna_coding_start : 1;
266 
267  # special case for handling inserts:
268  if ( $start == $end + 1 ) {
269  return $self->map_insert( $id, $start, $end, $strand, $type, undef, $include_original_region);
270  }
271 
272  if ( !$self->{'_is_sorted'} ) { $self->_sort() }
273 
274  my $hash = $self->{"_pair_$type"};
275 
276  my ( $from, $to, $cs );
277 
278  if ( $type eq $self->{'to'} ) {
279  $from = 'to';
280  $to = 'from';
281  $cs = $self->{'from_cs'};
282  } else {
283  $from = 'from';
284  $to = 'to';
285  $cs = $self->{'to_cs'};
286  }
287 
288  unless ( defined $hash ) {
289  throw("Type $type is neither to or from coordinate systems");
290  }
291 
292  my @result;
293  my @paired_result;
294 
295  if ( !defined $hash->{ uc($id) } ) {
296  # one big gap!
297  my $gap = Bio::EnsEMBL::Mapper::Gap->new( $start, $end );
298  if ($include_original_region) {
299  push @paired_result, { 'original' => $gap, 'mapped' => $gap };
300  return @paired_result;
301  } else {
302  push @result, $gap;
303  return @result;
304  }
305  }
306 
307  my $last_used_pair;
308 
309  # if we don't already have an interval tree built for this id,
310  # build one now
311  if ( !defined( $self->{"_tree_$type"}->{ uc($id) } )) {
312  $self->{"_tree_$type"}->{ uc($id) } = _build_immutable_tree($from,
313  $hash->{ uc($id) });
314  }
315  # query the interval tree (either cached or created new) for overlapping intervals
316  my $overlap = $self->{"_tree_$type"}->{ uc($id) }->query($start, $end);
317 
318  my $rank = 0;
319  my $orig_start = $start;
320  my $last_target_coord = undef;
321  foreach my $i (@{$overlap}) {
322  my $pair = $i->data;
323  my $self_coord = $pair->{$from};
324  my $target_coord = $pair->{$to};
325 
326  #
327  # But not the case for haplotypes!! need to test for this case???
328  # so removing this till a better solution is found
329  #
330  #
331  # if($self_coord->{'start'} < $start){
332  # $start = $orig_start;
333  # $rank++;
334  # }
335 
336  if ( defined($last_target_coord) && $target_coord->{'id'} ne $last_target_coord ) {
337  if ( $self_coord->{'start'} < $start ) {
338  # i.e. the same bit is being mapped to another assembled bit
339  $start = $orig_start;
340  }
341  } else {
342  $last_target_coord = $target_coord->{'id'};
343  }
344 
345  if ( $start < $self_coord->{'start'} ) {
346  # gap detected
347  my $gap = Bio::EnsEMBL::Mapper::Gap->new( $start, $self_coord->{'start'} - 1, $rank );
348  push( @result, $gap );
349  push(@paired_result, { 'original' => $gap, 'mapped' => $gap });
350 
351  $start = $gap->{'end'} + 1;
352  }
353  my ( $target_start, $target_end);
354  my ( $ori_start, $ori_end);
355 
356  my $res;
357  if ( exists $pair->{'indel'} ) {
358  # When next pair is an IndelPair and not a Coordinate, create the
359  # new mapping Coordinate, the IndelCoordinate.
360  $target_start = $target_coord->{'start'};
361  $target_end = $target_coord->{'end'};
362 
363  #original coordinates
364  $ori_start = $self_coord->{'start'};
365  $ori_end = $self_coord->{'end'};
366 
367  #create a Gap object
368  my $gap = Bio::EnsEMBL::Mapper::Gap->new( $start,
369  ( $self_coord->{'end'} < $end ? $self_coord->{'end'} : $end ) );
370  #create the Coordinate object
371  my $coord =
372  Bio::EnsEMBL::Mapper::Coordinate->new( $target_coord->{'id'},
373  $target_start, $target_end, $pair->{'ori'}*$strand, $cs );
374  #and finally, the IndelCoordinate object with
375  $res = Bio::EnsEMBL::Mapper::IndelCoordinate->new( $gap, $coord );
376 
377  } else {
378  # start is somewhere inside the region
379  if ( $pair->{'ori'} == 1 ) {
380  $target_start = $target_coord->{'start'} + ( $start - $self_coord->{'start'} );
381  } else {
382  $target_end = $target_coord->{'end'} - ( $start - $self_coord->{'start'} );
383  }
384 
385  # Either we are enveloping this map or not. If yes, then end
386  # point (self perspective) is determined solely by target. If
387  # not we need to adjust.
388 
389  if ( $end > $self_coord->{'end'} ) {
390  # enveloped
391  if ( $pair->{'ori'} == 1 ) {
392  $target_end = $target_coord->{'end'};
393  } else {
394  $target_start = $target_coord->{'start'};
395  }
396  } else {
397  # need to adjust end
398  if ( $pair->{'ori'} == 1 ) {
399  $target_end =
400  $target_coord->{'start'} +
401  ( $end - $self_coord->{'start'} );
402  } else {
403 
404  $target_start =
405  $target_coord->{'end'} - ( $end - $self_coord->{'start'} );
406  }
407 
408  }
409 
410  $res = Bio::EnsEMBL::Mapper::Coordinate->new( $target_coord->{'id'},
411  $target_start, $target_end, $pair->{'ori'}*$strand,$cs, $rank );
412 
413  #save the ori coordinate info
414  $ori_start = ($start - $cdna_coding_start) + 1;
415  $ori_end = $ori_start + ($target_end - $target_start);
416 
417 
418  } ## end else [ if ( exists $pair->{'indel'...})]
419 
420  push( @result, $res );
421  my $res_ori = Bio::EnsEMBL::Mapper::Coordinate->new( $self_coord->{'id'},
422  $ori_start, $ori_end, $pair->{'ori'}*$strand,$cs, $rank);
423  push(@paired_result, { 'original' => $res_ori, 'mapped' => $res });
424 
425 
426  $last_used_pair = $pair;
427  $start = $self_coord->{'end'} + 1;
428 
429  } ## end for ( my $i = $start_idx...)
430 
431  if ( !defined $last_used_pair ) {
432  my $gap = Bio::EnsEMBL::Mapper::Gap->new( $start, $end );
433  push( @result, $gap );
434  push(@paired_result, { 'original' => $gap, 'mapped' => $gap });
435 
436 
437  } elsif ( $last_used_pair->{$from}->{'end'} < $end ) {
438  # gap at the end
439  my $gap = Bio::EnsEMBL::Mapper::Gap->new(
440  $last_used_pair->{$from}->{'end'} + 1,
441  $end, $rank );
442  push( @result, $gap );
443 
444  push(@paired_result, { 'original' => $gap, 'mapped' => $gap });
445  }
446 
447  if ( $strand == -1 ) {
448  @result = reverse(@result);
449  @paired_result = reverse(@paired_result);
450  }
451 
452  if ($include_original_region){
453  return @paired_result;
454  }else{
455  return @result;
456  }
457 
458 } ## end sub map_coordinates
459 
460 
461 
462 =head2 map_insert
463 
464  Arg [1] : string $id
465  Arg [2] : int $start - start coord. Since this is an insert should always
466  be one greater than end.
467  Arg [3] : int $end - end coord. Since this is an insert should always
468  be one less than start.
469  Arg [4] : int $strand (0, 1, -1)
470  Arg [5] : string $type - the coordinate system name the coords are from.
471  Arg [6] : boolean $fastmap - if specified, this is being called from
472  the fastmap call. The mapping done is not any faster for
473  inserts, but the return value is different.
474  Example :
475  Description: This is in internal function which handles the special mapping
476  case for inserts (start = end +1). This function will be called
477  automatically by the map function so there is no reason to
478  call it directly.
479  Returntype : list of Bio::EnsEMBL::Mapper::Coordinate and/or Gap objects
480  Exceptions : none
481  Caller : map_coordinates()
482 
483 =cut
484 
485 sub map_insert {
486  my ($self, $id, $start, $end, $strand, $type, $fastmap, $include_original_region) = @_;
487 
488  # swap start/end and map the resultant 2bp coordinate
489  ($start, $end) =($end,$start);
490 
491  my @coords = $self->map_coordinates($id, $start, $end, $strand, $type, $include_original_region);
492 
493  if(@coords == 1) {
494  my $c = $coords[0];
495  if ($include_original_region) {
496  my $m = $c->{'mapped'};
497  my $orig = $c->{'original'};
498  ($m->{'start'}, $m->{'end'}) = ($m->{'end'}, $m->{'start'});
499  ($orig->{'start'}, $orig->{'end'}) = ($orig->{'end'}, $orig->{'start'});
500  } else {
501  ($c->{'start'}, $c->{'end'}) = ($c->{'end'}, $c->{'start'});
502  }
503  } else {
504  throw("Unexpected: Got ",scalar(@coords)," expected 2.") if(@coords != 2);
505 
506  # adjust coordinates, remove gaps
507  my ($c1, $c2);
508 
509  if($strand == -1) {
510  ($c2,$c1) = @coords;
511  } else {
512  ($c1, $c2) = @coords;
513  }
514  @coords = ();
515 
516  my ($m1, $m2);
517  if ($include_original_region) {
518  ($m1, $m2) = ($c1->{'mapped'}, $c2->{'mapped'});
519  } else {
520  ($m1, $m2) = ($c1, $c2);
521  }
522  if(ref($m1) eq 'Bio::EnsEMBL::Mapper::Coordinate') {
523  # insert is after first coord
524  if($m1->{'strand'} * $strand == -1) {
525  $m1->{'end'}--;
526  } else {
527  $m1->{'start'}++;
528  }
529  if ($include_original_region) {
530  $c1->{'mapped'} = $m1;
531  @coords = ($c1);
532  } else {
533  @coords = ($m1);
534  }
535  }
536  if(ref($m2) eq 'Bio::EnsEMBL::Mapper::Coordinate') {
537  # insert is before second coord
538  if($m2->{'strand'} * $strand == -1) {
539  $m2->{'start'}++;
540  } else {
541  $m2->{'end'}--;
542  }
543  if($strand == -1) {
544  if ($include_original_region) {
545  $c2->{'mapped'} = $m2;
546  unshift @coords, $c2;
547  } else {
548  unshift @coords, $m2;
549  }
550  } else {
551  if ($include_original_region) {
552  $c2->{'mapped'} = $m2;
553  push @coords, $c2;
554  } else {
555  push @coords, $m2;
556  }
557  }
558  }
559 
560  # as with @coords == 1 above, if we are in
561  # include_original_region mode, then we have
562  # to flip back the original region as well
563  if ($include_original_region) {
564  foreach my $coord (@coords) {
565  my $orig = $coord->{'original'};
566  ($orig->{'start'}, $orig->{'end'}) =
567  ($orig->{'end'}, $orig->{'start'});
568  }
569  }
570  }
571 
572  if($fastmap) {
573  return undef if(@coords != 1);
574  my $c = $include_original_region ? $coords[0]->{'mapped'} : $coords[0];
575  return ($c->{'id'}, $c->{'start'}, $c->{'end'},
576  $c->{'strand'}, $c->{'coord_system'});
577  }
578 
579  return @coords;
580 }
581 
582 
583 
584 
585 
586 
587 =head2 fastmap
588 
589  Arg 1 string $id
590  id of 'source' sequence
591  Arg 2 int $start
592  start coordinate of 'source' sequence
593  Arg 3 int $end
594  end coordinate of 'source' sequence
595  Arg 4 int $strand
596  raw contig orientation (+/- 1)
597  Arg 5 int $type
598  nature of transform - gives the type of
599  coordinates to be transformed *from*
600  Function inferior map method. Will only do ungapped unsplit mapping.
601  Will return id, start, end strand in a list.
602  Returntype list of results
603  Exceptions none
604  Caller Bio::EnsEMBL::AssemblyMapper
605 
606 =cut
607 
608 sub fastmap {
609  my ($self, $id, $start, $end, $strand, $type) = @_;
610 
611  my ($from, $to, $cs);
612 
613  if($end+1 == $start) {
614  return $self->map_insert($id, $start, $end, $strand, $type, 1);
615  }
616 
617  if( ! $self->{'_is_sorted'} ) { $self->_sort() }
618 
619  if($type eq $self->{'to'}) {
620  $from = 'to';
621  $to = 'from';
622  $cs = $self->{'from_cs'};
623  } else {
624  $from = 'from';
625  $to = 'to';
626  $cs = $self->{'to_cs'};
627  }
628 
629  my $hash = $self->{"_pair_$type"} or
630  throw("Type $type is neither to or from coordinate systems");
631 
632 
633  my $pairs = $hash->{uc($id)};
634 
635  foreach my $pair (@$pairs) {
636  my $self_coord = $pair->{$from};
637  my $target_coord = $pair->{$to};
638 
639  # only super easy mapping is done
640  if( $start < $self_coord->{'start'} ||
641  $end > $self_coord->{'end'} ) {
642  next;
643  }
644 
645  if( $pair->{'ori'} == 1 ) {
646  return ( $target_coord->{'id'},
647  $target_coord->{'start'}+$start-$self_coord->{'start'},
648  $target_coord->{'start'}+$end-$self_coord->{'start'},
649  $strand, $cs );
650  } else {
651  return ( $target_coord->{'id'},
652  $target_coord->{'end'} - ($end - $self_coord->{'start'}),
653  $target_coord->{'end'} - ($start - $self_coord->{'start'}),
654  -$strand, $cs );
655  }
656  }
657 
658  return ();
659 }
660 
661 
662 
663 =head2 add_map_coordinates
664 
665  Arg 1 int $id
666  id of 'source' sequence
667  Arg 2 int $start
668  start coordinate of 'source' sequence
669  Arg 3 int $end
670  end coordinate of 'source' sequence
671  Arg 4 int $strand
672  relative orientation of source and target (+/- 1)
673  Arg 5 int $id
674  id of 'target' sequence
675  Arg 6 int $start
676  start coordinate of 'target' sequence
677  Arg 7 int $end
678  end coordinate of 'target' sequence
679  Function Stores details of mapping between
680  'source' and 'target' regions.
681  Returntype none
682  Exceptions none
683  Caller Bio::EnsEMBL::Mapper
684 
685 =cut
686 
687 sub add_map_coordinates {
688  my ( $self, $contig_id, $contig_start, $contig_end, $contig_ori,
689  $chr_name, $chr_start, $chr_end )
690  = @_;
691 
692  unless ( defined($contig_id)
693  && defined($contig_start)
694  && defined($contig_end)
695  && defined($contig_ori)
696  && defined($chr_name)
697  && defined($chr_start)
698  && defined($chr_end) )
699  {
700  throw("7 arguments expected");
701  }
702 
703  if ( ( $chr_end > $chr_start ) and ( $contig_end - $contig_start ) != ( $chr_end - $chr_start ) ) {
704  throw("Cannot deal with mis-lengthed mappings so far");
705  }
706 
707  my $from = Bio::EnsEMBL::Mapper::Unit->new( $contig_id, $contig_start,
708  $contig_end );
709  my $to =
710  Bio::EnsEMBL::Mapper::Unit->new( $chr_name, $chr_start, $chr_end );
711 
712  my $pair = Bio::EnsEMBL::Mapper::Pair->new( $from, $to, $contig_ori );
713 
714  # place into hash on both ids
715  my $map_to = $self->{'to'};
716  my $map_from = $self->{'from'};
717 
718  push( @{ $self->{"_pair_$map_to"}->{ uc($chr_name) } }, $pair );
719  push( @{ $self->{"_pair_$map_from"}->{ uc($contig_id) } }, $pair );
720 
721  # any interval trees for this set of intervals are now invalid
722  # so remove them
723  $self->{"_tree_$map_to"}->{ uc($contig_id) } = undef;
724  $self->{"_tree_$map_from"}->{ uc($contig_id) } = undef;
725 
726  $self->{'pair_count'}++;
727  $self->{'_is_sorted'} = 0;
728 } ## end sub add_map_coordinates
729 
730 
731 =head2 add_indel_coordinates
732 
733  Arg 1 int $id
734  id of 'source' sequence
735  Arg 2 int $start
736  start coordinate of 'source' sequence
737  Arg 3 int $end
738  end coordinate of 'source' sequence
739  Arg 4 int $strand
740  relative orientation of source and target (+/- 1)
741  Arg 5 int $id
742  id of 'targe' sequence
743  Arg 6 int $start
744  start coordinate of 'targe' sequence
745  Arg 7 int $end
746  end coordinate of 'targe' sequence
747  Function stores details of mapping between two regions:
748  'source' and 'target'. Returns 1 if the pair was added, 0 if it
749  was already in. Used when adding an indel
750  Returntype int 0,1
751  Exceptions none
752  Caller Bio::EnsEMBL::Mapper
753 
754 =cut
755 
756 sub add_indel_coordinates{
757  my ($self, $contig_id, $contig_start, $contig_end,
758  $contig_ori, $chr_name, $chr_start, $chr_end) = @_;
759 
760  unless(defined($contig_id) && defined($contig_start) && defined($contig_end)
761  && defined($contig_ori) && defined($chr_name) && defined($chr_start)
762  && defined($chr_end)) {
763  throw("7 arguments expected");
764  }
765 
766  #we need to create the IndelPair object to add to both lists, to and from
767  my $from =
768  Bio::EnsEMBL::Mapper::Unit->new($contig_id, $contig_start, $contig_end);
769  my $to =
770  Bio::EnsEMBL::Mapper::Unit->new($chr_name, $chr_start, $chr_end);
771 
772  my $pair = Bio::EnsEMBL::Mapper::IndelPair->new($from, $to, $contig_ori);
773 
774  # place into hash on both ids
775  my $map_to = $self->{'to'};
776  my $map_from = $self->{'from'};
777 
778  push( @{$self->{"_pair_$map_to"}->{uc($chr_name)}}, $pair );
779  push( @{$self->{"_pair_$map_from"}->{uc($contig_id)}}, $pair );
780 
781  # any interval trees for this set of intervals are now invalid
782  # so remove them
783  $self->{"_tree_$map_to"}->{ uc($chr_name) } = undef;
784  $self->{"_tree_$map_from"}->{ uc($contig_id) } = undef;
785 
786  $self->{'pair_count'}++;
787 
788  $self->{'_is_sorted'} = 0;
789  return 1;
790 }
791 
792 
793 =head2 map_indel
794 
795  Arg [1] : string $id
796  Arg [2] : int $start - start coord. Since this is an indel should always
797  be one greater than end.
798  Arg [3] : int $end - end coord. Since this is an indel should always
799  be one less than start.
800  Arg [4] : int $strand (0, 1, -1)
801  Arg [5] : string $type - the coordinate system name the coords are from.
802  Example : @coords = $mapper->map_indel();
803  Description: This is in internal function which handles the special mapping
804  case for indels (start = end +1). It will be used to map from
805  a coordinate system with a gap to another that contains an
806  insertion. It will be mainly used by the Variation API.
807  Returntype : Bio::EnsEMBL::Mapper::Unit objects
808  Exceptions : none
809  Caller : general
810 
811 =cut
812 
813 sub map_indel {
814  my ( $self, $id, $start, $end, $strand, $type ) = @_;
815 
816  # swap start/end and map the resultant 2bp coordinate
817  ( $start, $end ) = ( $end, $start );
818 
819  if ( !$self->{'_is_sorted'} ) { $self->_sort() }
820 
821  my $hash = $self->{"_pair_$type"};
822 
823  my ( $from, $to, $cs );
824 
825  if ( $type eq $self->{'to'} ) {
826  $from = 'to';
827  $to = 'from';
828  $cs = $self->{'from_cs'};
829  } else {
830  $from = 'from';
831  $to = 'to';
832  $cs = $self->{'to_cs'};
833  }
834 
835  unless ( defined $hash ) {
836  throw("Type $type is neither to or from coordinate systems");
837  }
838  my @indel_coordinates;
839 
840  # if we don't already have an interval tree built for this id,
841  # build one now
842  if ( !defined $self->{"_tree_$type"}->{ uc($id) } ) {
843  $self->{"_tree_$type"}->{ uc($id) } = _build_immutable_tree($from,
844  $hash->{ uc($id) });
845  }
846  # query the interval tree (either cached or created new) for overlapping intervals
847  my $overlap = $self->{"_tree_$type"}->{ uc($id) }->query($start, $end);
848 
849  foreach my $i (@{$overlap}) {
850  my $pair = $i->data;
851  my $self_coord = $pair->{$from};
852  my $target_coord = $pair->{$to};
853 
854  if ( exists $pair->{'indel'} ) {
855  #need to return unit coordinate
856  my $to =
857  Bio::EnsEMBL::Mapper::Unit->new( $target_coord->{'id'},
858  $target_coord->{'start'},
859  $target_coord->{'end'}, );
860  push @indel_coordinates, $to;
861  last;
862  }
863  }
864 
865  return @indel_coordinates;
866 } ## end sub map_indel
867 
868 
869 =head2 add_Mapper
870 
871  Arg 1 Bio::EnsEMBL::Mapper $mapper2
872  Example $mapper->add_Mapper($mapper2)
873  Function add all the map coordinates from $mapper to this mapper.
874  This object will contain mapping pairs from both the old
875  object and $mapper2.
876  Returntype int 0,1
877  Exceptions throw if 'to' and 'from' from both Bio::EnsEMBL::Mappers
878  are incompatible
879  Caller $mapper->methodname()
880 
881 =cut
882 
883 sub add_Mapper{
884  my ($self, $mapper) = @_;
885 
886  my $mapper_to = $mapper->{'to'};
887  my $mapper_from = $mapper->{'from'};
888  if ($mapper_to ne $self->{'to'} or $mapper_from ne $self->{'from'}) {
889  throw("Trying to add an incompatible Mapper");
890  }
891 
892  my $count_a = 0;
893  foreach my $seq_name (keys %{$mapper->{"_pair_$mapper_to"}}) {
894  push(@{$self->{"_pair_$mapper_to"}->{$seq_name}},
895  @{$mapper->{"_pair_$mapper_to"}->{$seq_name}});
896  $self->{"_tree_$mapper_to"}->{ uc($seq_name) } = undef;
897  $count_a += scalar(@{$mapper->{"_pair_$mapper_to"}->{$seq_name}});
898  }
899  my $count_b = 0;
900  foreach my $seq_name (keys %{$mapper->{"_pair_$mapper_from"}}) {
901  push(@{$self->{"_pair_$mapper_from"}->{$seq_name}},
902  @{$mapper->{"_pair_$mapper_from"}->{$seq_name}});
903  $self->{"_tree_$mapper_from"}->{ uc($seq_name) } = undef;
904  $count_b += scalar(@{$mapper->{"_pair_$mapper_from"}->{$seq_name}});
905  }
906 
907  if ($count_a == $count_b) {
908  $self->{'pair_count'} += $count_a;
909  } else {
910  throw("Trying to add a funny Mapper");
911  }
912 
913  $self->{'_is_sorted'} = 0;
914  return 1;
915 }
916 
917 
918 
919 =head2 list_pairs
920 
921  Arg 1 int $id
922  id of 'source' sequence
923  Arg 2 int $start
924  start coordinate of 'source' sequence
925  Arg 3 int $end
926  end coordinate of 'source' sequence
927  Arg 4 string $type
928  nature of transform - gives the type of
929  coordinates to be transformed *from*
930  Function list all pairs of mappings in a region
931  Returntype list of Bio::EnsEMBL::Mapper::Pair
932  Exceptions none
933  Caller Bio::EnsEMBL::Mapper
934 
935 =cut
936 
937 sub list_pairs {
938  my ( $self, $id, $start, $end, $type ) = @_;
939 
940  if ( !$self->{'_is_sorted'} ) { $self->_sort() }
941 
942  if ( !defined $type ) {
943  throw("Expected 4 arguments");
944  }
945 
946  if ( $start > $end ) {
947  throw( "Start is greater than end "
948  . "for id $id, start $start, end $end\n" );
949  }
950 
951  my $hash = $self->{"_pair_$type"};
952 
953  my ( $from, $to );
954 
955  if ( $type eq $self->{'to'} ) {
956  $from = 'to';
957  $to = 'from';
958  } else {
959  $from = 'from';
960  $to = 'to';
961  }
962 
963  unless ( defined $hash ) {
964  throw("Type $type is neither to or from coordinate systems");
965  }
966 
967  my @list;
968 
969  unless ( exists $hash->{ uc($id) } ) {
970  return ();
971  }
972 
973  @list = @{ $hash->{ uc($id) } };
974 
975  my @output;
976  if ( $start == -1 && $end == -1 ) {
977  return @list;
978  } else {
979 
980  foreach my $p (@list) {
981 
982  if ( $p->{$from}->{'end'} < $start ) {
983  next;
984  }
985  if ( $p->{$from}->{'start'} > $end ) {
986  last;
987  }
988  push( @output, $p );
989  }
990  return @output;
991  }
992 } ## end sub list_pairs
993 
994 
995 =head2 to
996 
998  id of 'source' sequence
999  Function accessor method form the 'source'
1000  and 'target' in a Mapper::Pair
1001  Returntype Bio::EnsEMBL::Mapper::Unit
1002  Exceptions none
1003  Caller Bio::EnsEMBL::Mapper
1004 
1005 =cut
1006 
1007 sub to {
1008  my ( $self, $value ) = @_;
1009 
1010  if ( defined($value) ) {
1011  $self->{'to'} = $value;
1012  }
1013 
1014  return $self->{'to'};
1015 }
1016 
1017 =head2 from
1018 
1019  Arg 1 Bio::EnsEMBL::Mapper::Unit $id
1020  id of 'source' sequence
1021  Function accessor method form the 'source'
1022  and 'target' in a Mapper::Pair
1023  Returntype Bio::EnsEMBL::Mapper::Unit
1024  Exceptions none
1025  Caller Bio::EnsEMBL::Mapper
1026 
1027 =cut
1028 sub from {
1029  my ( $self, $value ) = @_;
1030 
1031  if ( defined($value) ) {
1032  $self->{'from'} = $value;
1033  }
1034 
1035  return $self->{'from'};
1036 }
1037 
1038 
1039 # _dump
1040 #
1041 # Arg 1 *FileHandle $fh
1042 # Function convenience dump function
1043 # possibly useful for debugging
1044 # Returntype none
1045 # Exceptions none
1046 # Caller internal
1047 #
1048 
1049 sub _dump{
1050  my ($self,$fh) = @_;
1051 
1052  if( !defined $fh ) {
1053  $fh = \*STDERR;
1054  }
1055 
1056  my $from = $self->{'from'};
1057  my $to = $self->{'to'};
1058 
1059  print $fh "dumping from-hash _pair_$from\n";
1060  foreach my $id ( keys %{$self->{"_pair_$from"}} ) {
1061  print $fh "{_pair_$from}->{" . uc($id) . "}:\n";
1062  foreach my $pair ( @{$self->{"_pair_$from"}->{uc($id)}} ) {
1063  print $fh " ",$pair->from->start," ",$pair->from->end,":",$pair->to->start," ",$pair->to->end," ",$pair->to->id,"\n";
1064  }
1065  if (defined($self->{"_tree_$from"}->{uc($id)})) {
1066  print $fh "{_tree_$from}->{" . uc($id) . "} instantiated\n";
1067  } else {
1068  print $fh "{_tree_$from}->{" . uc($id) . "} empty\n";
1069  }
1070  }
1071  print $fh "dumping to-hash _pair_$to\n";
1072  foreach my $id ( keys %{$self->{"_pair_$to"}} ) {
1073  print $fh "{_pair_$to}->{" . uc($id) . "}:\n";
1074  foreach my $pair ( @{$self->{"_pair_$to"}->{uc($id)}} ) {
1075  print $fh " ",$pair->to->start," ",$pair->to->end,":",$pair->from->start," ",$pair->from->end," ",$pair->from->id,"\n";
1076  }
1077  if (defined($self->{"_tree_$to"}->{uc($id)})) {
1078  print $fh "{_tree_$to}->{" . uc($id) . "} instantiated\n";
1079  } else {
1080  print $fh "{_tree_$to}->{" . uc($id) . "} empty\n";
1081  }
1082  }
1083 }
1084 
1085 
1086 # _sort
1087 #
1088 # Function sort function so that all
1089 # mappings are sorted by
1090 # chromosome start
1091 # Returntype none
1092 # Exceptions none
1093 # Caller internal
1094 #
1095 
1096 sub _sort {
1097  my ($self) = @_;
1098 
1099  my $to = $self->{'to'};
1100  my $from = $self->{'from'};
1101 
1102  foreach my $id ( keys %{ $self->{"_pair_$from"} } ) {
1103  @{ $self->{"_pair_$from"}->{$id} } =
1104  sort { $a->{'from'}->{'start'} <=> $b->{'from'}->{'start'} }
1105  @{ $self->{"_pair_$from"}->{$id} };
1106  }
1107 
1108  foreach my $id ( keys %{ $self->{"_pair_$to"} } ) {
1109  @{ $self->{"_pair_$to"}->{$id} } =
1110  sort { $a->{'to'}->{'start'} <=> $b->{'to'}->{'start'} }
1111  @{ $self->{"_pair_$to"}->{$id} };
1112  }
1113 
1114  $self->_merge_pairs();
1115  $self->_is_sorted(1);
1116 }
1117 
1118 # this function merges pairs that are adjacent into one
1119 sub _merge_pairs {
1120  my $self = shift;
1121 
1122  my ( $lr, $lr_from, $del_pair, $next_pair, $current_pair );
1123 
1124  my $map_to = $self->{'to'};
1125  my $map_from = $self->{'from'};
1126  $self->{'pair_count'} = 0;
1127 
1128  for my $key ( keys %{$self->{"_pair_$map_to"}} ) {
1129  # merging pairs means all interval trees for
1130  # this set of intervals are now invalid, so delete them
1131  $self->{"_tree_$map_to"}->{ uc($key) } = undef;
1132  $lr = $self->{"_pair_$map_to"}->{$key};
1133 
1134  my $i = 0;
1135  my $next = 1;
1136  my $length = $#{$lr};
1137  while( $next <= $length ) {
1138  $current_pair = $lr->[$i];
1139  $next_pair = $lr->[$next];
1140  $del_pair = undef;
1141 
1142  if(exists $current_pair->{'indel'} || exists $next_pair->{'indel'}){
1143  #necessary to modify the merge function to not merge indels
1144  $next++;
1145  $i++;
1146  }
1147  else{
1148  # duplicate filter
1149  if( $current_pair->{'to'}->{'start'} == $next_pair->{'to'}->{'start'}
1150  && $current_pair->{'from'}->{'id'} == $next_pair->{'from'}->{'id'}
1151  && $current_pair->{'from'}->{'start'} == $next_pair->{'from'}->{'start'}) {
1152  # Modified in e75 to support GRCh38. Contigs used repeatedly in one seq region were
1153  # being pre-emptively deleted as copies. Extra from-start condition above ensures copy
1154  # deletion is restricted to same-location copies. Even more stringent checks can be made
1155  # at cost of speed.
1156  $del_pair = $next_pair;
1157  } elsif ( ( $current_pair->{'from'}->{'id'} eq $next_pair->{'from'}->{'id'} ) &&
1158  ( $next_pair->{'ori'} == $current_pair->{'ori'} ) &&
1159  ( $next_pair->{'to'}->{'start'} -1 == $current_pair->{'to'}->{'end'} )) {
1160 
1161  if( $current_pair->{'ori'} == 1 ) {
1162  # check forward strand merge
1163  if( $next_pair->{'from'}->{'start'} - 1 == $current_pair->{'from'}->{'end'} ) {
1164  # normal merge with previous element
1165  $current_pair->{'to'}->{'end'} = $next_pair->{'to'}->{'end'};
1166  $current_pair->{'from'}->{'end'} = $next_pair->{'from'}->{'end'};
1167  $del_pair = $next_pair;
1168  }
1169  } else {
1170  # check backward strand merge
1171  if( $next_pair->{'from'}->{'end'} + 1 == $current_pair->{'from'}->{'start'} ) {
1172  # yes its a merge
1173  $current_pair->{'to'}->{'end'} = $next_pair->{'to'}->{'end'};
1174  $current_pair->{'from'}->{'start'} = $next_pair->{'from'}->{'start'};
1175  $del_pair = $next_pair;
1176  }
1177  }
1178  }
1179 
1180  if( defined $del_pair ) {
1181  splice( @$lr, $next, 1 );
1182  $lr_from = $self->{"_pair_$map_from"}->{uc($del_pair->{'from'}->{'id'})};
1183  for( my $j=0; $j <= $#{$lr_from}; $j++ ) {
1184  if( $lr_from->[$j] == $del_pair ) {
1185  splice( @$lr_from, $j, 1 );
1186  last;
1187  }
1188  }
1189  $length--;
1190  if( $length < $next ) { last; }
1191  }
1192  else {
1193  $next++;
1194  $i++;
1195  }
1196  }
1197 
1198  }
1199  $self->{'pair_count'} += scalar( @$lr );
1200  }
1201 }
1202 
1203 
1204 # _is_sorted
1205 #
1206 # Arg 1 int $sorted
1207 # Function toggle for whether the (internal)
1208 # map data are sorted
1209 # Returntype int
1210 # Exceptions none
1211 # Caller internal
1212 #
1213 
1214 sub _is_sorted {
1215  my ($self, $value) = @_;
1216  $self->{'_is_sorted'} = $value if (defined($value));
1217  return $self->{'_is_sorted'};
1218 }
1219 
1220 # _build_immutable_tree
1221 #
1222 # Arg 1 string $pair_side - the from or to half of each pair to be
1223 # the source of intervals
1224 # Arg 2 listref $pair_list - a list of Bio::EnsEMBL::Mapper::Pair
1225 # Function builds a Bio::EnsEMBL::Utils::Tree::Interval::Immutable with
1226 # intervals corresponding to the chosen side (from or to) of
1227 # each Pair in $pair_list and a pointer to each Pair
1228 # Returntype Bio::EnsEMBL::Utils::Tree::Interval::Immutable
1229 # Exceptions none
1230 # Caller internal
1231 
1232 sub _build_immutable_tree {
1233  my ($pair_side, $pair_list) = @_;
1234  # create set of intervals for the tree
1235  my $from_intervals;
1236 
1237  foreach my $i (@{$pair_list}) {
1238  my $start = $i->{$pair_side}{start};
1239  my $end = $i->{$pair_side}{end};
1240 
1241  if ($end < $start) {
1242  ($end, $start) = ($start, $end);
1243  }
1244 
1245  push @{$from_intervals}, Bio::EnsEMBL::Utils::Interval->new($start, $end, $i);
1246  }
1247 
1248  # Create the interval tree defined on the above set of intervals
1249  #
1250  # As of release/99, we have more experience with the immutable
1251  # interval tree, so we will stick with this one. A future
1252  # refactoring effort may wish to replace this with a dynamically
1253  # maintained mutable interval tree, rather than simply throwing
1254  # trees away and rebuilding when the underlying interval set changes
1255  return Bio::EnsEMBL::Utils::Tree::Interval::Immutable->new($from_intervals);
1256 }
1257 
1258 1;
Bio::EnsEMBL::Utils::Interval
Definition: Interval.pm:41
Bio::EnsEMBL::Mapper::Unit
Definition: Unit.pm:15
Bio::EnsEMBL::Mapper::IndelCoordinate
Definition: IndelCoordinate.pm:16
EnsEMBL
Definition: Filter.pm:1
Bio::EnsEMBL::AssemblyMapper
Definition: AssemblyMapper.pm:49
Bio::EnsEMBL::Mapper::Coordinate
Definition: Coordinate.pm:14
map
public map()
Bio::EnsEMBL::Mapper::Gap::new
public Bio::EnsEMBL::Mapper::Gap new()
AssemblyMapper
Definition: BlastzAligner.pm:7
Bio::EnsEMBL::Mapper::IndelPair
Definition: IndelPair.pm:15
Bio::EnsEMBL::ChainedAssemblyMapper
Definition: ChainedAssemblyMapper.pm:54
Bio::EnsEMBL::CoordSystem
Definition: CoordSystem.pm:40
Bio::EnsEMBL::Mapper::Pair
Definition: Pair.pm:15
about
public about()
Bio::EnsEMBL::Mapper::Pair::new
public new()
Bio::EnsEMBL::Mapper::Coordinate::new
public new()
Bio::EnsEMBL::Mapper::Unit::new
public new()
Bio::EnsEMBL::Mapper::Gap
Definition: Gap.pm:14
Bio::EnsEMBL::Utils::Interval::new
public Bio::EnsEMBL::Utils::Interval new()
Bio::EnsEMBL::Mapper::IndelPair::new
public new()
Bio::EnsEMBL::Utils::Tree::Interval::Immutable
Definition: Node.pm:8
Bio
Definition: AltAlleleGroup.pm:4
Bio::EnsEMBL::Utils::Exception
Definition: Exception.pm:68
Bio::EnsEMBL::Mapper::IndelCoordinate::new
public Bio::EnsEMBL::Mapper::IndelCoordinate new()
Bio::EnsEMBL::Mapper
Definition: Coordinate.pm:3