ensembl-hive  2.7.0
PolyA.pm
Go to the documentation of this file.
1 =head1 LICENSE
2 
3 See the NOTICE file distributed with this work for additional information
4 regarding copyright ownership.
5 
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9 
10  http://www.apache.org/licenses/LICENSE-2.0
11 
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 
18 =cut
19 
20 
21 =head1 CONTACT
22 
23  Please email comments or questions to the public Ensembl
24  developers list at <http://lists.ensembl.org/mailman/listinfo/dev>.
25 
26  Questions may also be sent to the Ensembl help desk at
27  <http://www.ensembl.org/Help/Contact>.
28 
29 =cut
30 
31 =head1 NAME
32 
34 
35 =head1 SYNOPSIS
36 
37  my $seq; # a Bio::Seq object
38  my $polyA = Bio::EnsEMBL::Utils::PolyA->new();
39 
40  # returns a new Bio::Seq object with the trimmed sequence
41  my $trimmed_seq = $polyA->clip($seq);
42 
43  # cat put Ns in the place of the polyA/polyT tail
44  my $masked_seq = $polyA->mask($seq);
45 
46  # can put in lower case the polyA/polyT using any flag:
47  my $softmasked_seq = $poly->mask( $seq, 'soft' );
48 
49 =head1 DESCRIPTION
50 
51  It reads a Bio::Seq object, it first finds out whether it has a
52  polyA or a polyT and then performs one operation in the seq string:
53  clipping, masking or softmasking. It then returns a new Bio::Seq
54  object with the new sequence.
55 
56 =head1 METHODS
57 
58 =cut
59 
60 package Bio::EnsEMBL::Utils::PolyA;
61 
62 use Bio::Seq;
63 use vars qw(@ISA);
64 
65 use strict;
66 
67 
68 =head2 new
69 
70 =cut
71 
72 sub new{
73  my ($class) = @_;
74  if (ref($class)){
75  $class = ref($class);
76  }
77  my $self = {};
78  bless($self,$class);
79 
80  return $self;
81 }
82 
83 
84 ############################################################
85 
86 sub clip{
87  my ($self, $bioseq) = @_;
88 
89  # print STDERR "past a $bioseq\n";
90  my $seq = $bioseq->seq;
91  $self->_clip(1);
92  $self->_mask(0);
93  $self->_softmask(0);
94  my $new_seq = $self->_find_polyA($seq);
95  my $cdna = Bio::Seq->new();
96  if (length($new_seq) > 0){
97  $cdna->seq($new_seq);
98  }
99  else {
100  print "While clipping the the polyA tail, sequence ".$bioseq->display_id." totally disappeared.\n";
101  print "Returning undef\n";
102  return undef;
103  }
104  $cdna->display_id( $bioseq->display_id );
105  $cdna->desc( $bioseq->desc );
106 
107  return $cdna;
108 }
109 
110 ############################################################
111 
112 sub mask{
113  my ($self, $bioseq, $flag ) = @_;
114 
115  my $seq = $bioseq->seq;
116  $self->_clip(0);
117  if ( $flag ){
118  $self->_mask(0);
119  $self->_softmask(1);
120  }
121  else{
122  $self->_mask(1);
123  $self->_softmask(0);
124  }
125  my $new_seq = $self->_find_polyA($seq);
126  my $cdna = new Bio::Seq;
127  $cdna->seq($new_seq);
128  $cdna->display_id( $bioseq->display_id );
129  $cdna->desc( $bioseq->desc );
130 
131  return $cdna;
132 }
133 
134 ############################################################
135 
136 
137 
138 
139 ############################################################
140 
141 sub _find_polyA{
142  my ($self, $seq) = @_;
143  my $new_seq;
144  my $length = length($seq);
145 
146  # is it a polyA or polyT?
147  my $check_polyT = substr( $seq, 0, 6 );
148 
149  my $check_polyA = substr( $seq, -6 );
150 
151  my $t_count = $check_polyT =~ tr/Tt//;
152  my $a_count = $check_polyA =~ tr/Aa//;
153 
154  #### polyA ####
155  if ( $a_count >= 5 && $a_count > $t_count ){
156 
157  # we calculate the number of bases we want to chop
158  my $length_to_mask = 0;
159 
160  # we start with 3 bases
161  my ($piece, $count ) = (3,0);
162 
163  # count also the number of Ns, consider the Ns as potential As
164  my $n_count = 0;
165 
166  # take 3 by 3 bases from the end
167  while( $length_to_mask < $length ){
168  my $chunk = substr( $seq, ($length - ($length_to_mask + 3)), $piece);
169  $count = $chunk =~ tr/Aa//;
170  $n_count = $chunk =~ tr/Nn//;
171  if ( ($count + $n_count) >= 2*( $piece )/3 ){
172  $length_to_mask += 3;
173  }
174  else{
175  last;
176  }
177  }
178 
179  if ( $length_to_mask > 0 ){
180  # do not mask the last base if it is not an A:
181  my $last_base = substr( $seq, ( $length - $length_to_mask ), 1);
182  my $previous_to_last = substr( $seq, ( $length - $length_to_mask - 1), 1);
183  if ( !( $last_base eq 'A' || $last_base eq 'a') ){
184  $length_to_mask--;
185  }
186  elsif( $previous_to_last eq 'A' || $previous_to_last eq 'a' ){
187  $length_to_mask++;
188  }
189  my $clipped_seq = substr( $seq, 0, $length - $length_to_mask );
190  my $mask;
191  if ( $self->_clip ){
192  $mask = "";
193  }
194  elsif( $self->_mask ){
195  $mask = "N" x ($length_to_mask);
196  }
197  elsif ( $self->_softmask ){
198  $mask = lc substr( $seq, ( $length - $length_to_mask ) );
199  }
200  $new_seq = $clipped_seq . $mask;
201  }
202  else{
203  $new_seq = $seq;
204  }
205  }
206  #### polyT ####
207  elsif( $t_count >=5 && $t_count > $a_count ){
208 
209  # calculate the number of bases to chop
210  my $length_to_mask = -3;
211 
212  # we start with 3 bases:
213  my ($piece, $count) = (3,3);
214 
215  # count also the number of Ns, consider the Ns as potential As
216  my $n_count = 0;
217 
218  # take 3 by 3 bases from the beginning
219  while ( $length_to_mask < $length ){
220  my $chunk = substr( $seq, $length_to_mask + 3, $piece );
221  #print STDERR "length to mask: $length_to_mask\n";
222  #print "chunk: $chunk\n";
223  $count = $chunk =~ tr/Tt//;
224  $n_count = $chunk =~ tr/Nn//;
225  if ( ($count+$n_count) >= 2*( $piece )/3 ){
226  $length_to_mask +=3;
227  }
228  else{
229  last;
230 
231  }
232  }
233  if ( $length_to_mask >= 0 ){
234  # do not chop the last base if it is not a A:
235  #print STDERR "clipping sequence $seq\n";
236  my $last_base = substr( $seq, ( $length_to_mask + 3 - 1 ), 1 );
237  my $previous_to_last = substr( $seq, ( $length_to_mask + 3 ), 1 );
238  if ( !( $last_base eq 'T' || $last_base eq 't' ) ){
239  $length_to_mask--;
240  }
241  elsif( $previous_to_last eq 'T' || $previous_to_last eq 't' ){
242  $length_to_mask++;
243  }
244  my $clipped_seq = substr( $seq, $length_to_mask + 3);
245  my $mask;
246  if ( $self->_clip ){
247  $mask = "";
248  }
249  elsif( $self->_mask ){
250  $mask = "N" x ($length_to_mask+3);
251  }
252  elsif ($self->_softmask){
253  $mask = lc substr( $seq, 0, ($length_to_mask + 3) );
254  }
255  $new_seq = $mask.$clipped_seq;
256  }
257  else{
258  $new_seq = $seq;
259  }
260  }
261  else{
262  # we cannot be sure of what it is
263  # do not clip
264  $new_seq = $seq;
265  }
266 
267  return $new_seq;
268 }
269 
270 ############################################################
271 
272 sub _mask{
273  my ($self,$mask) = @_;
274  if (defined($mask)){
275  $self->{_mask} = $mask;
276  }
277  return $self->{_mask};
278 }
279 
280 ############################################################
281 
282 sub _clip{
283  my ($self,$clip) = @_;
284  if (defined($clip)){
285  $self->{_clip} = $clip;
286  }
287  return $self->{_clip};
288 }
289 
290 ############################################################
291 
292 sub _softmask{
293  my ($self,$softmask) = @_;
294  if (defined($softmask)){
295  $self->{_softmask} = $softmask;
296  }
297  return $self->{_softmask};
298 }
299 
300 ############################################################
301 
302 
303 sub has_polyA_track{
304  my ($self, $seq) = @_;
305  my $new_seq;
306  my $length = length($seq);
307 
308  # is it a polyA or polyT?
309  my $check_polyT = substr( $seq, 0, 10 );
310 
311  my $check_polyA = substr( $seq, -10 );
312 
313  print STDERR "polyA: $check_polyA\n";
314 
315  my $t_count = $check_polyT =~ tr/Tt//;
316  my $a_count = $check_polyA =~ tr/Aa//;
317 
318  ## testing with this short cut
319  if ( $a_count >=7 || $t_count >=7 ){
320  return 1;
321  }
322  else{
323  return 0;
324  }
325 }
326 
327 
328 ################
329 1;
330 
Bio::EnsEMBL::Utils::PolyA
Definition: PolyA.pm:29
Bio::EnsEMBL::Utils::PolyA::new
public new()