3 See the NOTICE file distributed with
this work
for additional information
4 regarding copyright ownership.
6 Licensed under the Apache License, Version 2.0 (the
"License");
7 you may not use
this file except in compliance with the License.
8 You may obtain a copy of the License at
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an
"AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License
for the specific language governing permissions and
16 limitations under the License.
23 Please email comments or questions to the
public Ensembl
24 developers list at <http:
26 Questions may also be sent to the Ensembl help desk at
37 my $seq; # a Bio::Seq
object
40 # returns a new Bio::Seq object with the trimmed sequence
41 my $trimmed_seq = $polyA->clip($seq);
43 # cat put Ns in the place of the polyA/polyT tail
44 my $masked_seq = $polyA->mask($seq);
46 # can put in lower case the polyA/polyT using any flag:
47 my $softmasked_seq = $poly->mask( $seq,
'soft' );
51 It reads a Bio::Seq object, it first finds out whether it has a
52 polyA or a polyT and then performs one operation in the seq string:
53 clipping, masking or softmasking. It then returns a
new Bio::Seq
54 object with the
new sequence.
60 package Bio::EnsEMBL::Utils::PolyA;
84 ############################################################
87 my ($self, $bioseq) = @_;
89 # print STDERR "past a $bioseq\n";
90 my $seq = $bioseq->seq;
94 my $new_seq = $self->_find_polyA($seq);
95 my $cdna = Bio::Seq->new();
96 if (length($new_seq) > 0){
100 print
"While clipping the the polyA tail, sequence ".$bioseq->display_id.
" totally disappeared.\n";
101 print
"Returning undef\n";
104 $cdna->display_id( $bioseq->display_id );
105 $cdna->desc( $bioseq->desc );
110 ############################################################
113 my ($self, $bioseq, $flag ) = @_;
115 my $seq = $bioseq->seq;
125 my $new_seq = $self->_find_polyA($seq);
126 my $cdna =
new Bio::Seq;
127 $cdna->seq($new_seq);
128 $cdna->display_id( $bioseq->display_id );
129 $cdna->desc( $bioseq->desc );
134 ############################################################
139 ############################################################
142 my ($self, $seq) = @_;
144 my $length = length($seq);
146 # is it a polyA or polyT?
147 my $check_polyT = substr( $seq, 0, 6 );
149 my $check_polyA = substr( $seq, -6 );
151 my $t_count = $check_polyT =~ tr/Tt
152 my $a_count = $check_polyA =~ tr/Aa
155 if ( $a_count >= 5 && $a_count > $t_count ){
157 # we calculate the number of bases we want to chop
158 my $length_to_mask = 0;
160 # we start with 3 bases
161 my ($piece, $count ) = (3,0);
163 # count also the number of Ns, consider the Ns as potential As
166 # take 3 by 3 bases from the end
167 while( $length_to_mask < $length ){
168 my $chunk = substr( $seq, ($length - ($length_to_mask + 3)), $piece);
169 $count = $chunk =~ tr/Aa
170 $n_count = $chunk =~ tr/Nn
171 if ( ($count + $n_count) >= 2*( $piece )/3 ){
172 $length_to_mask += 3;
179 if ( $length_to_mask > 0 ){
180 # do not mask the last base if it is not an A:
181 my $last_base = substr( $seq, ( $length - $length_to_mask ), 1);
182 my $previous_to_last = substr( $seq, ( $length - $length_to_mask - 1), 1);
183 if ( !( $last_base eq
'A' || $last_base eq
'a') ){
186 elsif( $previous_to_last eq
'A' || $previous_to_last eq
'a' ){
189 my $clipped_seq = substr( $seq, 0, $length - $length_to_mask );
194 elsif( $self->_mask ){
195 $mask =
"N" x ($length_to_mask);
197 elsif ( $self->_softmask ){
198 $mask = lc substr( $seq, ( $length - $length_to_mask ) );
200 $new_seq = $clipped_seq . $mask;
207 elsif( $t_count >=5 && $t_count > $a_count ){
209 # calculate the number of bases to chop
210 my $length_to_mask = -3;
212 # we start with 3 bases:
213 my ($piece, $count) = (3,3);
215 # count also the number of Ns, consider the Ns as potential As
218 # take 3 by 3 bases from the beginning
219 while ( $length_to_mask < $length ){
220 my $chunk = substr( $seq, $length_to_mask + 3, $piece );
221 #print STDERR "length to mask: $length_to_mask\n";
222 #print "chunk: $chunk\n";
223 $count = $chunk =~ tr/Tt
224 $n_count = $chunk =~ tr/Nn
225 if ( ($count+$n_count) >= 2*( $piece )/3 ){
233 if ( $length_to_mask >= 0 ){
234 # do not chop the last base if it is not a A:
235 #print STDERR "clipping sequence $seq\n";
236 my $last_base = substr( $seq, ( $length_to_mask + 3 - 1 ), 1 );
237 my $previous_to_last = substr( $seq, ( $length_to_mask + 3 ), 1 );
238 if ( !( $last_base eq
'T' || $last_base eq
't' ) ){
241 elsif( $previous_to_last eq
'T' || $previous_to_last eq
't' ){
244 my $clipped_seq = substr( $seq, $length_to_mask + 3);
249 elsif( $self->_mask ){
250 $mask =
"N" x ($length_to_mask+3);
252 elsif ($self->_softmask){
253 $mask = lc substr( $seq, 0, ($length_to_mask + 3) );
255 $new_seq = $mask.$clipped_seq;
262 # we cannot be sure of what it is
270 ############################################################
273 my ($self,$mask) = @_;
275 $self->{_mask} = $mask;
277 return $self->{_mask};
280 ############################################################
283 my ($self,$clip) = @_;
285 $self->{_clip} = $clip;
287 return $self->{_clip};
290 ############################################################
293 my ($self,$softmask) = @_;
294 if (defined($softmask)){
295 $self->{_softmask} = $softmask;
297 return $self->{_softmask};
300 ############################################################
304 my ($self, $seq) = @_;
306 my $length = length($seq);
308 # is it a polyA or polyT?
309 my $check_polyT = substr( $seq, 0, 10 );
311 my $check_polyA = substr( $seq, -10 );
313 print STDERR
"polyA: $check_polyA\n";
315 my $t_count = $check_polyT =~ tr/Tt
316 my $a_count = $check_polyA =~ tr/Aa
318 ## testing with this short cut
319 if ( $a_count >=7 || $t_count >=7 ){