ensembl-hive  2.7.0
Upstream.pm
Go to the documentation of this file.
1 =head1 LICENSE
2 
3 See the NOTICE file distributed with this work for additional information
4 regarding copyright ownership.
5 
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9 
10  http://www.apache.org/licenses/LICENSE-2.0
11 
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 
18 =cut
19 
20 
21 =head1 CONTACT
22 
23  Please email comments or questions to the public Ensembl
24  developers list at <http://lists.ensembl.org/mailman/listinfo/dev>.
25 
26  Questions may also be sent to the Ensembl help desk at
27  <http://www.ensembl.org/Help/Contact>.
28 
29 =cut
30 
31 =head1 NAME
32 
33 Bio::EnsEMBL::Upstream - Object that defines an upstream region
34 
35 =head1 SYNOPSIS
36 
38 
39  my $upstream = Bio::EnsEMBL::Upstream->new(
40  -transcript => $transcript,
41  -length => 2000 # bp
42  );
43 
44  # Retrieve coordinates of upstream region
45  my $upstream_region_start = $upstream->upstart;
46  my $upstream_region_end = $upstream->upend;
47 
48  # Retrieve coordinates in 'downstream' first intron
49  my $intron_region_start = $upstream->downstart;
50  my $intron_region_end = $upstream->downend;
51 
52  # Coordinates are returned in the same scheme as the input transcript.
53  # However, the coordinates of an upstream region can be transformed to
54  # any other scheme using a slice
55 
56  $upstream->transform($slice);
57 
58  # Coordinates can be retrieved in scheme in the same manner as the
59  # above.
60 
61 =head1 DESCRIPTION
62 
63 An object that determines the upstream region of a transcript. Such a
64 region is non-coding and ensures that other genes or transcripts are
65 not present. Ultimately, these objects can be used to looking for
66 promoter elements. To this end, it is also possible to derive a region
67 downstream of the first exon, within the first intron and where promoter
68 elements sometimes are found.
69 
70 =head1 METHODS
71 
72 =cut
73 
74 package Bio::EnsEMBL::Upstream;
75 
76 use strict;
77 use vars qw(@ISA);
80 
81 use Bio::EnsEMBL::Utils::Exception qw(throw);
82 use Bio::EnsEMBL::Utils::Argument qw(rearrange);
83 
84 @ISA = qw(Bio::EnsEMBL::Feature);
85 
86 
87 =head2 new
88 
89  Arg [transcript] : (optional) Bio::EnsEMBL::Transcript
90  Arg [length] : (optional) int $length
91  Example : $upstream = Bio::EnsEMBL::Upstream->new(-transcript => $transcript,
92  -length => 2000);
93  Description: Creates a new upstream object
94  Returntype : Bio::EnsEMBL::Upstream
95  Exceptions : none
96  Caller : Bio::EnsEMBL::Transcript, general
97  Status : Stable
98 
99 =cut
100 
101 sub new {
102  my ($class, @args) = @_;
103  my $self = {};
104 
105  bless $self, $class;
106 
107  my ($transcript,
108  $length) = rearrange([qw(TRANSCRIPT
109  LENGTH
110  )],@args);
111 
112  $self->transcript($transcript) if defined $transcript;
113  $self->length($length) if $length;
114 
115  return $self
116 }
117 
118 =head2 transcript
119 
120  Arg : (optional) Bio::EnsEMBL::Transcript
121  Example : $self->transcript($transcript);
122  Description: Getter/setter for transcript object
123  Returntype : Bio::EnsEMBL::Transcript
124  Exceptions : Throws if argument is not undefined
126  Caller : $self->new, $self->_derive_coords,
127  $self->_first_coding_Exon
128  Status : Stable
129 
130 =cut
131 
132 
133 sub transcript {
134  my $self = shift;
135 
136  if (@_){
137  $self->{_transcript} = shift;
138 
139  if (defined $self->{_transcript}) {
140  throw("Transcript is not a Bio::EnsEMBL::Transcript")
141  if (! $self->{_transcript}->isa("Bio::EnsEMBL::Transcript"));
142  $self->_flush_cache;
143  }
144  }
145 
146  return $self->{_transcript}
147 }
148 
149 =head2 length
150 
151  Arg : (optional) int $length
152  Example : $self->length(2000); # bp
153  Description: Getter/setter for upstream region length.
154  Returntype : int
155  Exceptions : Throws if length is requested before it has been set.
156  Caller : $self->new, $self->_derive_coords
157  Status : Stable
158 
159 =cut
160 
161 sub length {
162  my $self = shift;
163 
164  if (@_){
165  $self->{_length} = shift;
166  $self->_flush_cache;
167  }
168 
169  throw("Region length has not been set.")
170  unless $self->{_length};
171 
172  return $self->{_length}
173 }
174 
175 =head2 _flush_cache
176 
177  Arg : none
178  Example : $self->_flush_cache;
179  Description: Empties cached coordinates (called when
180  coordinate scheme or region length has changed).
181  Returntype : none
182  Exceptions : none
183  Caller : $self->length, $self->transform
184  Status : Stable
185 
186 =cut
187 
188 sub _flush_cache {
189  my $self = shift;
190 
191  $self->upstart(undef);
192  $self->upend(undef);
193  $self->downstart(undef);
194  $self->downend(undef);
195 }
196 
197 =head2 upstart
198 
199  Arg : none
200  Example : $self->upstart;
201  Description: Returns the start coordinate of the region
202  upstream of the transcript. This coordinate
203  is always the furthest from the translation
204  initiation codon, whereas upend always abutts
205  the translation initiation codon.
206  Returntype : int
207  Exceptions : none
208  Caller : general
209  Status : Stable
210 
211 =cut
212 
213 sub upstart {
214  my $self = shift;
215 
216  if (@_) {
217  $self->{_upstart} = shift @_;
218  return
219  }
220 
221  if (! defined $self->{_upstart}) {
222  $self->_derive_coords('up');
223  }
224 
225  return $self->{_upstart}
226 }
227 
228 =head2 upend
229 
230  Arg : none
231  Example : $self->upend;
232  Description: Returns the end coordinate of the region
233  upstream of the transcript. This coordinate
234  always always abutts the translation
235  initiation codon, whereas upstart always
236  returns the coorindate furthest from the
237  translation initiation codon.
238  Returntype : int
239  Exceptions : none
240  Caller : general
241  Status : Stable
242 
243 =cut
244 
245 sub upend {
246  my $self = shift;
247 
248  if (@_) {
249  $self->{_upend} = shift @_;
250  return
251  }
252 
253  if (! defined $self->{_upend}) {
254  $self->_derive_coords('up');
255  }
256 
257  return $self->{_upend}
258 }
259 
260 =head2 downstart
261 
262  Arg : none
263  Example : $self->downstart;
264  Description: Returns the start coordinate of the region
265  in the first intron of the transcript. This
266  coordinate is always closest to the first
267  exon (irregardless of strand).
268  Returntype : int
269  Exceptions : none
270  Caller : general
271  Status : Stable
272 
273 =cut
274 
275 sub downstart {
276  my $self = shift;
277 
278  if (@_) {
279  $self->{_downstart} = shift @_;
280  return
281  }
282 
283  if (! defined $self->{_downstart}) {
284  $self->_derive_coords('down');
285  }
286 
287  return $self->{_downstart}
288 }
289 
290 =head2 downend
291 
292  Arg : none
293  Example : $self->downend;
294  Description: Returns the end coordinate of the region
295  in the first intron of the transcript. This
296  coordinate is always furthest from the first
297  exon (irregardless of strand).
298  Returntype : int
299  Exceptions : none
300  Caller : general
301  Status : Stable
302 
303 =cut
304 
305 sub downend {
306  my $self = shift;
307 
308  if (@_) {
309  $self->{_downend} = shift @_;
310  return
311  }
312 
313  if (! defined $self->{_downend}) {
314  $self->_derive_coords('down');
315  }
316 
317  return $self->{_downend}
318 }
319 
320 =head2 transform
321 
322  Arg :
323  Example :
324  Description: Not yet implemented
325  Returntype :
326  Exceptions :
327  Caller :
328  Status : At Risk
329 
330 =cut
331 
332 
333 # Over-riding inherited class. As yet unimplemented.
334 
335 sub transform {
336  my $self = shift;
337 
338  throw("No transform method implemented for " . $self);
339 }
340 
341 =head2 derive_upstream_coords
342 
343  Arg : none
344  Example : my ($upstart, $upend)
345  = $self->derive_upstream_coords;
346  Description: Derives upstream coordinates (for
347  compatability with older scripts).
348  Returntype : arrayref
349  Exceptions : none
350  Caller : general
351  Status : Stable
352 
353 =cut
354 
355 sub derive_upstream_coords {
356  my $self = shift;
357 
358  return [$self->upstart, $self->upend]
359 }
360 
361 =head2 derive_downstream_coords
362 
363  Arg : none
364  Example : my ($downstart, $downend)
365  = $self->derive_downstream_coords;
366  Description: Derives downstream coordinates (for
367  compatability with older scripts).
368  Returntype : arrayref
369  Exceptions : none
370  Caller : general
371  Status : Stable
372 
373 =cut
374 
375 sub derive_downstream_coords {
376  my $self = shift;
377 
378  return [$self->downstart, $self->downend]
379 }
380 
381 =head2 _derive_coords
382 
383  Arg : string $direction (either 'up' or 'down').
384  Example : $self->_derive_coords('up');
385  Description: Determines the coordinates of either upstream
386  or downstream region.
387  Returntype : none
388  Exceptions : Throws if argument is not either 'up' or 'down'
389  Caller : $self->upstart, $self->upend, $self->downstart,
390  $self->downend
391  Status : Stable
392 
393 =cut
394 
395 sub _derive_coords {
396  my ($self, $direction) = @_;
397 
398  # Check direction
399  throw("Must specify either \'up\' of \'down\'-stream direction to derive coords.")
400  unless (($direction eq 'up')||($direction eq 'down'));
401 
402  # Put things in easily accessible places.
403  my $core_db_slice_adaptor = $self->transcript->slice->adaptor;
404  my $region_length = $self->length;
405 
406  # Whatever coord system the gene is currently is, transform to the toplevel.
407  my $transcript = $self->transcript->transform('toplevel');
408 
409  # Use our transformed transcript to determine the upstream region coords.
410  # End should always be just before the coding start (like ATG), including 3' UTR.
411  # Start is the outer limit of the region upstream (furthest from ATG).
412 
413  my $region_start;
414  my $region_end;
415 
416  if ($transcript->strand == 1){
417  if ($direction eq 'up'){
418  $region_end = $transcript->coding_region_start - 1;
419  $region_start = $region_end - $region_length;
420  } elsif ($direction eq 'down'){
421  $region_end = $self->_first_coding_Exon->end + 1;
422  $region_start = $region_end + $region_length;
423  }
424  } elsif ($transcript->strand == -1) {
425  if ($direction eq 'up'){
426  $region_end = $transcript->coding_region_end + 1;
427  $region_start = $region_end + $region_length;
428 
429  } elsif ($direction eq 'down'){
430  $region_end = $self->_first_coding_Exon->start - 1;
431  $region_start = $region_end - $region_length;
432  }
433  }
434 
435  # Trim the upstream/downstream region to remove extraneous coding sequences
436  # from other genes and/or transcripts.
437 
438  my ($slice_low_coord, $slice_high_coord) = sort {$a <=> $b} ($region_start, $region_end);
439 
440  my $region_slice
441  = $core_db_slice_adaptor->fetch_by_region($transcript->slice->coord_system->name,
442  $transcript->slice->seq_region_name,
443  $slice_low_coord,
444  $slice_high_coord);
445 
446  if ($transcript->strand == 1) {
447  if ($direction eq 'up') {
448  $region_start += $self->_bases_to_trim('left_end', $region_slice);
449  } elsif ($direction eq 'down') {
450  $region_start -= $self->_bases_to_trim('right_end', $region_slice);
451  }
452  } elsif ($transcript->strand == -1) {
453  if ($direction eq 'up') {
454  $region_start -= $self->_bases_to_trim('right_end', $region_slice);
455  } elsif ($direction eq 'down') {
456  $region_start += $self->_bases_to_trim('left_end', $region_slice);
457  }
458  }
459 
460  # Always return start < end
461 
462  ($region_start, $region_end) = sort {$a <=> $b} ($region_start, $region_end);
463 
464  if ($direction eq 'up') {
465  $self->upstart($region_start);
466  $self->upend($region_end);
467  } elsif ($direction eq 'down') {
468  $self->downstart($region_start);
469  $self->downend($region_end);
470  }
471 }
472 
473 =head2 _bases_to_trim
474 
475  Arg : string $end_to_trim (either 'right_end' or
476  'left_end').
477  Arg : Bio::EnsEMBL::Slice
478  Example : $self->_derive_coords('right_end', $slice);
479  Description: Finds exons from other genes/transcripts that
480  invade our upstream/downstream slice and
481  returns the number of bases that should be
482  truncated from the appropriate end of the
483  upstream/downstream region.
484  Returntype : in
485  Exceptions : Throws if argument is not either 'right_end'
486  or 'left_end'
487  Caller : $self->_derive_coords
488  Status : Stable
489 
490 =cut
491 
492 # Method to look for coding regions that invade the upstream region. For
493 # now, this method returns the number of bases to trim. I doesn't yet
494 # do anything special if an exon is completely swallowed (truncates at
495 # the end of the overlapping exon and discards any non-coding sequence
496 # further upstream) or overlaps the 'wrong' end of the region (cases where
497 # two alternate exons share one end of sequence - does this happen?).
498 
499 # The input argument 'end' defines the end of the slice that should be
500 # truncated.
501 
502 sub _bases_to_trim {
503  my ($self, $end_to_trim, $slice) = @_;
504 
505  throw "Slice end argument must be either left_end or right_end"
506  unless ($end_to_trim eq 'right_end' || $end_to_trim eq 'left_end');
507 
508  my @overlap_coords;
509  my $slice_length = $slice->length;
510  my $right_trim = 0;
511  my $left_trim = 0;
512 
513  foreach my $exon (@{$slice->get_all_Exons}){
514  next if $exon->stable_id eq $self->_first_coding_Exon->stable_id;
515 
516  my $start = $exon->start;
517  my $end = $exon->end;
518 
519  # Choose from four possible exon arrangements
520 
521  # -----|********************|----- Slice
522  # --|=========================|--- Exon arrangement 1
523  # ----------|======|-------------- Exon arrangement 2
524  # --|=======|--------------------- Exon arrangement 3
525  # -------------------|=========|-- Exon arrangement 4
526 
527 
528  if ($start <= 0 && $end >= $slice_length) { # exon arrangement 1
529  $right_trim = $slice_length - 1;
530  $left_trim = $slice_length - 1;
531  last;
532 
533  } elsif ($start >= 0 && $end <= $slice_length) { # exon arrangement 2
534  my $this_right_trim = ($slice_length - $start) + 1;
535 
536  $right_trim = $this_right_trim
537  if $this_right_trim > $right_trim;
538 
539  $left_trim = $end
540  if $end > $left_trim;
541 
542  } elsif ($start <= 0 && $end < $slice_length) { # exon arrangement 3
543  $right_trim = $slice_length; # a bit draconian
544  $left_trim = $end
545  if $end > $left_trim;
546 
547  } elsif ($start > 0 && $end >= $slice_length) { # exon arrangement 4
548  my $this_right_trim = ($slice_length - $start) + 1;
549 
550  $right_trim = $this_right_trim
551  if $this_right_trim > $right_trim;
552 
553  $left_trim = $slice_length; # also a bit draconian
554  }
555 
556  }
557 
558  return $right_trim if $end_to_trim eq 'right_end';
559  return $left_trim if $end_to_trim eq 'left_end';
560 }
561 
562 =head2 _first_coding_Exon
563 
564  Arg : none
565  Example : $self->_first_coding_Exon;
566  Description: Finds the first exon of our transcript that
567  contains coding bases.
568  Returntype : Bio::EnsEMBL::Exon
569  Exceptions : none
570  Caller : $self->_derive_coords, $self->_bases_to_trim
571  Status : Stable
572 
573 =cut
574 
575 sub _first_coding_Exon {
576  my $self = shift;
577 
578  unless ($self->{_first_coding_exon}){
579 
580  my $exons = $self->transcript->get_all_translateable_Exons;
581 
582  $self->{_first_coding_exon} = $exons->[0]
583  if $self->transcript->strand == 1;
584  $self->{_first_coding_exon} = $exons->[-1]
585  if $self->transcript->strand == -1;
586  }
587 
588  return $self->{_first_coding_exon}
589 }
590 
591 
592 return 1;
transcript
public transcript()
Bio::EnsEMBL::DBSQL::DBAdaptor
Definition: DBAdaptor.pm:40
Bio::EnsEMBL::Feature
Definition: Feature.pm:47
Bio::EnsEMBL::Slice
Definition: Slice.pm:50
exon
public exon()
Bio::EnsEMBL::Exon
Definition: Exon.pm:42
Bio::EnsEMBL::DBSQL::SimpleFeatureAdaptor
Definition: SimpleFeatureAdaptor.pm:31
Bio::EnsEMBL::Transcript
Definition: Transcript.pm:44
Bio::EnsEMBL::Upstream
Definition: Upstream.pm:45
Bio::EnsEMBL::Transcript::new
public Bio::EnsEMBL::Transcript new()
Bio::EnsEMBL::Upstream::new
public Bio::EnsEMBL::Upstream new()
Bio::EnsEMBL::Utils::Argument
Definition: Argument.pm:34
Bio::EnsEMBL::Upstream::upstart
public Int upstart()
Bio::EnsEMBL::Exon::strand
public Int strand()
Bio::EnsEMBL::Utils::Exception
Definition: Exception.pm:68