ensembl-hive  2.8.1
TranscriptMapper.pm
Go to the documentation of this file.
1 =head1 LICENSE
2 
3 See the NOTICE file distributed with this work for additional information
4 regarding copyright ownership.
5 
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9 
10  http://www.apache.org/licenses/LICENSE-2.0
11 
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 
18 =cut
19 
20 
21 =head1 CONTACT
22 
23  Please email comments or questions to the public Ensembl
24  developers list at <http://lists.ensembl.org/mailman/listinfo/dev>.
25 
26  Questions may also be sent to the Ensembl help desk at
27  <http://www.ensembl.org/Help/Contact>.
28 
29 =cut
30 
31 =head1 NAME
32 
33 TranscriptMapper - A utility class used to perform coordinate conversions
34 between a number of coordinate systems relating to transcripts
35 
36 =head1 SYNOPSIS
37 
38  my $trmapper = Bio::EnsEMBL::TranscriptMapper->new($transcript);
39 
40  @coords = $trmapper->cdna2genomic( 123, 554 );
41 
42  @coords = $trmapper->genomic2cdna( 141, 500, -1 );
43 
44  @coords = $trmapper->genomic2cds( 141, 500, -1 );
45 
46  @coords = $trmapper->pep2genomic( 10, 60 );
47 
48  @coords = $trmapper->genomic2pep( 123, 400, 1 );
49 
50 =head1 DESCRIPTION
51 
52 This is a utility class which can be used to perform coordinate conversions
53 between a number of coordinate systems relating to transcripts.
54 
55 Any transcript object given to TranscriptMapper should have a proper Slice
56 object attached. The Slice provides vital information for the mapping process.
57 After TranscriptMapper has been instantiated, changes to the supplied
58 Transcript will not affect the Mapper's results.
59 
60 =head1 METHODS
61 
62 =cut
63 
64 package Bio::EnsEMBL::TranscriptMapper;
65 
66 use strict;
67 use warnings;
68 
69 use Bio::EnsEMBL::Utils::Exception qw(throw);
70 
74 
75 
76 
77 =head2 new
78 
79  Arg [1] : Bio::EnsEMBL::Transcript $transcript
80  The transcript for which a TranscriptMapper should be created.
81  Example : $trans_mapper = Bio::EnsEMBL::TranscriptMapper->new($transcript)
82  Description: Creates a TranscriptMapper object which can be used to perform
83  various coordinate transformations relating to transcripts.
84  Note that the TranscriptMapper uses the transcript state at the
85  time of creation to perform the conversions, and that a new
86  TranscriptMapper must be created if the Transcript is altered.
87  'Genomic' coordinates are coordinates which are relative to the
88  slice that the Transcript is on.
90  Exceptions : throws if a transcript is not an argument
91  Caller : Transcript::get_TranscriptMapper
92  Status : Stable
93 
94 =cut
95 
96 sub new {
97  my $caller = shift;
98  my $transcript = shift;
99 
100  my $class = ref($caller) || $caller;
101 
102  if(!ref($transcript) || !$transcript->isa('Bio::EnsEMBL::Transcript')) {
103  throw("Transcript argument is required.");
104  }
105 
106 
107  my $exons = $transcript->get_all_Exons();
108  my $start_phase;
109  if(@$exons) {
110  $start_phase = $exons->[0]->phase;
111  } else {
112  $start_phase = -1;
113  }
114 
115  # Create a cdna <-> genomic mapper and load it with exon coords
116  my $mapper = _load_mapper($transcript);
117 
118  my $self = bless({'exon_coord_mapper' => $mapper,
119  'start_phase' => $start_phase,
120  'cdna_coding_start' => $transcript->cdna_coding_start(),
121  'cdna_coding_end' => $transcript->cdna_coding_end()},
122  $class);
123 }
124 
125 
126 =head2 _load_mapper
127 
128  Arg [1] : Bio::EnsEMBL::Transcript $transcript
129  The transcript for which a mapper should be created.
130  Example : my $mapper = _load_mapper($transcript);
131  Description: loads the mapper
132  Returntype : Bio::EnsEMBL::Mapper
133  Exceptions : none
134  Caller : Internal
135  Status : Stable
136 
137 =cut
138 
139 sub _load_mapper {
140  my $transcript = shift;
141 
142  my $mapper = Bio::EnsEMBL::Mapper->new( 'cdna', 'genomic');
143 
144  my $edits_on = $transcript->edits_enabled();
145  my @edits;
146 
147  if($edits_on) {
148  @edits = @{$transcript->get_all_SeqEdits()};
149  @edits = sort {$a->start() <=> $b->start()} @edits;
150  }
151 
152  my $edit_shift = 0;
153 
154  my $cdna_start = undef;
155 
156  my $cdna_end = 0;
157 
158 
159  foreach my $ex (@{$transcript->get_all_Exons}) {
160  my $gen_start = $ex->seq_region_start();
161  my $gen_end = $ex->seq_region_end();
162 
163  $cdna_start = $cdna_end + 1;
164  $cdna_end = $cdna_start + $ex->length() - 1;
165 
166  my $strand = $ex->strand();
167 
168  # add deletions and insertions into pairs when SeqEdits turned on
169  # ignore mismatches (i.e. treat as matches)
170  if($edits_on) {
171  while(@edits && $edits[0]->start() + $edit_shift <= $cdna_end) {
172 
173  my $edit = shift(@edits);
174  my $len_diff = $edit->length_diff();
175 
176  if($len_diff) {
177  # break pair into two parts, finish first pair just before edit
178 
179  my $prev_cdna_end = $edit->start() + $edit_shift - 1;
180  my $prev_cdna_start = $cdna_start;
181  my $prev_len = $prev_cdna_end - $prev_cdna_start + 1;
182 
183  my $prev_gen_end;
184  my $prev_gen_start;
185  if($strand == 1) {
186  $prev_gen_start = $gen_start;
187  $prev_gen_end = $gen_start + $prev_len - 1;
188  } else {
189  $prev_gen_start = $gen_end - $prev_len + 1;
190  $prev_gen_end = $gen_end;
191  }
192 
193  if($prev_len > 0) { # only create map pair if not boundary case
194  $mapper->add_map_coordinates
195  ('cdna', $prev_cdna_start, $prev_cdna_end, $strand,
196  'genome', $prev_gen_start,$prev_gen_end);
197  }
198 
199  $cdna_start = $prev_cdna_end + 1;
200 
201  if($strand == 1) {
202  $gen_start = $prev_gen_end + 1;
203  } else {
204  $gen_end = $prev_gen_start - 1;
205  }
206 
207  $cdna_end += $len_diff;
208 
209  if($len_diff > 0) {
210  $cdna_start += $len_diff;
211  }
212  else {
213  if($strand == 1) {
214  $gen_start -= $len_diff;
215  } else {
216  $gen_end += $len_diff;
217  }
218  }
219 
220  $edit_shift += $len_diff;
221  }
222  }
223  }
224 
225  my $pair_len = $cdna_end - $cdna_start + 1;
226 
227  if($pair_len > 0) {
228  $mapper->add_map_coordinates('cdna', $cdna_start, $cdna_end, $strand,
229  'genome', $gen_start, $gen_end);
230  }
231  }
232 
233  return $mapper;
234 }
235 
236 
237 =head2 cdna2genomic
238 
239  Arg [1] : $start
240  The start position in cdna coordinates
241  Arg [2] : $end
242  The end position in cdna coordinates
243  Example : @cdna_coords = $transcript_mapper->cdna2genomic($start, $end);
244  Description: Converts cdna coordinates to genomic coordinates. The
245  return value is a list of coordinates and gaps, where the coordinates
246  are relative to the sequence region that the transcript used to construct
247  this mapper was on.
248  Returntype : list of Bio::EnsEMBL::Mapper::Coordinate and
250  Exceptions : throws if no start or end
251  Caller : general
252  Status : Stable
253 
254 =cut
255 
256 
257 sub cdna2genomic {
258  my ($self, $start, $end, $include_original_region, $cdna_coding_start) = @_;
259 
260  if( !defined $end ) {
261  throw("Must call with start/end");
262  }
263 
264  $cdna_coding_start = defined $cdna_coding_start ? $cdna_coding_start : 1;
265  my $mapper = $self->{'exon_coord_mapper'};
266 
267  return $mapper->map_coordinates('cdna', $start, $end, 1, "cdna", $include_original_region, $cdna_coding_start);
268 
269 }
270 
271 
272 =head2 genomic2cdna
273 
274  Arg [1] : $start
275  The start position in genomic coordinates
276  Arg [2] : $end
277  The end position in genomic coordinates
278  Arg [3] : $strand
279  The strand of the genomic coordinates (default value 1)
280  Example : @coords = $trans_mapper->genomic2cdna($start, $end, $strnd);
281  Description: Converts genomic coordinates to cdna coordinates. The
282  return value is a list of coordinates and gaps. Gaps
283  represent intronic or upstream/downstream regions which do
284  not comprise this transcripts cdna. Coordinate objects
285  represent genomic regions which map to exons (utrs included).
286  Note: A poorly formed Transcript will cause this method to
287  malfunction.
288  Returntype : list of Bio::EnsEMBL::Mapper::Coordinate and
290  Exceptions : throws if start, end or strand not defined
291  Caller : general
292  Status : Stable
293 
294 =cut
295 
296 sub genomic2cdna {
297  my ($self, $start, $end, $strand) = @_;
298 
299  unless(defined $start && defined $end && defined $strand) {
300  throw("start, end and strand arguments are required\n");
301  }
302 
303  my $mapper = $self->{'exon_coord_mapper'};
304 
305  return $mapper->map_coordinates("genome", $start, $end, $strand,"genomic");
306 
307 }
308 
309 
310 =head2 cds2genomic
311 
312  Arg [1] : int $start
313  start position in cds coords
314  Arg [2] : int $end
315  end position in cds coords
316  Arg [3] boolean (0 or 1) $include_original_region
317  option to include original input coordinate region mappings in the result
318  Example : @genomic_coords = $transcript_mapper->cds2genomic(69, 306);
319  Description: Converts cds coordinates into genomic coordinates. Coordinates returned
320  are relative to sequence region that the transcript used to construct
321  this mapper was on.
322  Returntype : list of Bio::EnsEMBL::Mapper::Gap and
324  Exceptions : throws if no end
325  Caller : general
326  Status : at risk
327 
328 =cut
329 
330 sub cds2genomic {
331  my ( $self, $start, $end, $include_original_region ) = @_;
332 
333  if ( !( defined($start) && defined($end) ) ) {
334  throw("Must call with start and end");
335  }
336 
337  # Move start end into translate cDNA coordinates now.
338  $start = $start +( $self->{'cdna_coding_start'} - 1 ) ;
339  $end = $end + ( $self->{'cdna_coding_start'} - 1 );
340 
341  #Check if the start exceeds the cdna_coding_end, if yes, return
342  if($start > $self->{'cdna_coding_end'}){
343  return undef;
344  }
345 
346  #Check if the end exceeds the cdna_coding_end, if yes, truncate it otherwise we will be including gaps
347  if($end > $self->{'cdna_coding_end'}){
348  $end = $self->{'cdna_coding_end'};
349  }
350 
351  return $self->cdna2genomic( $start, $end, $include_original_region, $self->{'cdna_coding_start'} );
352 }
353 
354 =head2 pep2genomic
355 
356  Arg [1] : int $start
357  start position in peptide coords
358  Arg [2] : int $end
359  end position in peptide coords
360  Example : @genomic_coords = $transcript_mapper->pep2genomic(23, 102);
361  Description: Converts peptide coordinates into genomic coordinates. The
362  coordinates returned are relative to the sequence region that the
363  transcript used to construct this TranscriptMapper was on.
364  Returntype : list of Bio::EnsEMBL::Mapper::Gap and
366  Exceptions : throws if no end
367  Caller : general
368  Status : Stable
369 
370 =cut
371 
372 sub pep2genomic {
373  my ( $self, $start, $end ) = @_;
374 
375  if ( !( defined($start) && defined($end) ) ) {
376  throw("Must call with start and end");
377  }
378 
379  # Take possible N-padding at beginning of CDS into account.
380  my $start_phase = $self->{'start_phase'};
381  my $shift = ( $start_phase > 0 ) ? $start_phase : 0;
382 
383  # Move start end into translate cDNA coordinates now.
384  $start = 3*$start - 2 + ( $self->{'cdna_coding_start'} - 1 ) - $shift;
385  $end = 3*$end + ( $self->{'cdna_coding_start'} - 1 ) - $shift;
386 
387  return $self->cdna2genomic( $start, $end );
388 }
389 
390 
391 =head2 genomic2cds
392 
393  Arg [1] : int $start
394  The genomic start position
395  Arg [2] : int $end
396  The genomic end position
397  Arg [3] : int $strand
398  The genomic strand
399  Example : @cds_coords = $trans_mapper->genomic2cds($start, $end, $strand);
400  Description: Converts genomic coordinates into CDS coordinates of the
401  transcript that was used to create this transcript mapper.
402  Returntype : list of Bio::EnsEMBL::Mapper::Coordinate and
404  Exceptions : throw if start, end or strand not defined
405  Caller : general
406  Status : Stable
407 
408 =cut
409 
410 sub genomic2cds {
411  my ($self, $start, $end, $strand) = @_;
412 
413  if(!defined($start) || !defined($end) || !defined($strand)) {
414  throw("start, end and strand arguments are required");
415  }
416 
417  if($start > $end + 1) {
418  throw("start arg must be less than or equal to end arg + 1");
419  }
420 
421  my $cdna_cstart = $self->{'cdna_coding_start'};
422  my $cdna_cend = $self->{'cdna_coding_end'};
423 
424  #this is a pseudogene if there is no coding region
425  if(!defined($cdna_cstart)) {
426  #return a gap of the entire requested region, there is no CDS
427  return Bio::EnsEMBL::Mapper::Gap->new($start,$end);
428  }
429 
430  my @coords = $self->genomic2cdna($start, $end, $strand);
431 
432  my @out;
433 
434  foreach my $coord (@coords) {
435  if($coord->isa('Bio::EnsEMBL::Mapper::Gap')) {
436  push @out, $coord;
437  } else {
438  my $start = $coord->start;
439  my $end = $coord->end;
440 
441  if($coord->strand == -1 || $end < $cdna_cstart || $start > $cdna_cend) {
442  #is all gap - does not map to peptide
443  push @out, Bio::EnsEMBL::Mapper::Gap->new($start,$end);
444  } else {
445  #we know area is at least partially overlapping CDS
446 
447  my $cds_start = $start - $cdna_cstart + 1;
448  my $cds_end = $end - $cdna_cstart + 1;
449 
450  if($start < $cdna_cstart) {
451  #start of coordinates are in the 5prime UTR
452  push @out, Bio::EnsEMBL::Mapper::Gap->new($start, $cdna_cstart-1);
453 
454  #start is now relative to start of CDS
455  $cds_start = 1;
456  }
457 
458  my $end_gap = undef;
459  if($end > $cdna_cend) {
460  #end of coordinates are in the 3prime UTR
461  $end_gap = Bio::EnsEMBL::Mapper::Gap->new($cdna_cend + 1, $end);
462  #adjust end to relative to CDS start
463  $cds_end = $cdna_cend - $cdna_cstart + 1;
464  }
465 
466  #start and end are now entirely in CDS and relative to CDS start
467  $coord->start($cds_start);
468  $coord->end($cds_end);
469 
470  push @out, $coord;
471 
472  if($end_gap) {
473  #push out the region which was in the 3prime utr
474  push @out, $end_gap;
475  }
476  }
477  }
478  }
479 
480  return @out;
481 
482 }
483 
484 
485 =head2 genomic2pep
486 
487  Arg [1] : $start
488  The start position in genomic coordinates
489  Arg [2] : $end
490  The end position in genomic coordinates
491  Arg [3] : $strand
492  The strand of the genomic coordinates
493  Example : @pep_coords = $transcript->genomic2pep($start, $end, $strand);
494  Description: Converts genomic coordinates to peptide coordinates. The
495  return value is a list of coordinates and gaps.
496  Returntype : list of Bio::EnsEMBL::Mapper::Coordinate and
498  Exceptions : throw if start, end or strand not defined
499  Caller : general
500  Status : Stable
501 
502 =cut
503 
504 sub genomic2pep {
505  my ($self, $start, $end, $strand) = @_;
506 
507  unless(defined $start && defined $end && defined $strand) {
508  throw("start, end and strand arguments are required");
509  }
510 
511  my @coords = $self->genomic2cds($start, $end, $strand);
512 
513  my @out;
514 
515  my $start_phase = $self->{'start_phase'};
516 
517  #take into account possible N padding at beginning of CDS
518  my $shift = ($start_phase > 0) ? $start_phase : 0;
519 
520  foreach my $coord (@coords) {
521  if($coord->isa('Bio::EnsEMBL::Mapper::Gap')) {
522  push @out, $coord;
523  } else {
524 
525  #start and end are now entirely in CDS and relative to CDS start
526 
527  #convert to peptide coordinates
528  my $pep_start = int(($coord->start + $shift + 2) / 3);
529  my $pep_end = int(($coord->end + $shift + 2) / 3);
530  $coord->start($pep_start);
531  $coord->end($pep_end);
532 
533  push @out, $coord;
534  }
535  }
536 
537  return @out;
538 }
539 
540 
541 1;
transcript
public transcript()
Bio::EnsEMBL::Mapper::Coordinate
Definition: Coordinate.pm:14
Bio::EnsEMBL::TranscriptMapper::cdna2genomic
public List cdna2genomic()
map
public map()
Bio::EnsEMBL::Mapper::Gap::new
public Bio::EnsEMBL::Mapper::Gap new()
Bio::EnsEMBL::Mapper
Definition: Mapper.pm:117
Bio::EnsEMBL::TranscriptMapper::new
public Bio::EnsEMBL::TranscriptMapper new()
Bio::EnsEMBL::Slice
Definition: Slice.pm:50
Bio::EnsEMBL::Mapper::Gap::start
public Int start()
Bio::EnsEMBL::TranscriptMapper
Definition: TranscriptMapper.pm:34
Bio::EnsEMBL::Transcript
Definition: Transcript.pm:44
Bio::EnsEMBL::Mapper::Gap
Definition: Gap.pm:14
Bio::EnsEMBL::Utils::Exception
Definition: Exception.pm:68
Bio::EnsEMBL::Mapper
Definition: Coordinate.pm:3