my ($self, $seq) = @_;
my $new_seq;
my $length = length($seq);
# is it a polyA or polyT?
my $check_polyT = substr( $seq, 0, 6 );
my $check_polyA = substr( $seq, -6 );
my $t_count = $check_polyT =~ tr/Tt
my $a_count = $check_polyA =~ tr/Aa
#### polyA ####
if ( $a_count >= 5 && $a_count > $t_count ){
# we calculate the number of bases we want to chop
my $length_to_mask = 0;
# we start with 3 bases
my ($piece, $count ) = (3,0);
# count also the number of Ns, consider the Ns as potential As
my $n_count = 0;
# take 3 by 3 bases from the end
while( $length_to_mask < $length ){
my $chunk = substr( $seq, ($length - ($length_to_mask + 3)), $piece);
$count = $chunk =~ tr/Aa
$n_count = $chunk =~ tr/Nn
if ( ($count + $n_count) >= 2*( $piece )/3 ){
$length_to_mask += 3;
}
else{
last;
}
}
if ( $length_to_mask > 0 ){
# do not mask the last base if it is not an A:
my $last_base = substr( $seq, ( $length - $length_to_mask ), 1);
my $previous_to_last = substr( $seq, ( $length - $length_to_mask - 1), 1);
if ( !( $last_base eq 'A' || $last_base eq 'a') ){
$length_to_mask--;
}
elsif( $previous_to_last eq 'A' || $previous_to_last eq 'a' ){
$length_to_mask++;
}
my $clipped_seq = substr( $seq, 0, $length - $length_to_mask );
my $mask;
if ( $self->_clip ){
$mask = "";
}
elsif( $self->_mask ){
$mask = "N" x ($length_to_mask);
}
elsif ( $self->_softmask ){
$mask = lc substr( $seq, ( $length - $length_to_mask ) );
}
$new_seq = $clipped_seq . $mask;
}
else{
$new_seq = $seq;
}
}
#### polyT ####
elsif( $t_count >=5 && $t_count > $a_count ){
# calculate the number of bases to chop
my $length_to_mask = -3;
# we start with 3 bases:
my ($piece, $count) = (3,3);
# count also the number of Ns, consider the Ns as potential Ts
my $n_count = 0;
# take 3 by 3 bases from the beginning
while ( $length_to_mask < $length ){
my $chunk = substr( $seq, $length_to_mask + 3, $piece );
#print STDERR "length to mask: $length_to_mask\n";
#print "chunk: $chunk\n";
$count = $chunk =~ tr/Tt
$n_count = $chunk =~ tr/Nn
if ( ($count+$n_count) >= 2*( $piece )/3 ){
$length_to_mask +=3;
}
else{
last;
}
}
if ( $length_to_mask >= 0 ){
# do not chop the last base if it is not a T:
#print STDERR "clipping sequence $seq\n";
my $last_base = substr( $seq, ( $length_to_mask + 3 - 1 ), 1 );
my $previous_to_last = substr( $seq, ( $length_to_mask + 3 ), 1 );
if ( !( $last_base eq 'T' || $last_base eq 't' ) ){
$length_to_mask--;
}
elsif( $previous_to_last eq 'T' || $previous_to_last eq 't' ){
$length_to_mask++;
}
my $clipped_seq = substr( $seq, $length_to_mask + 3);
my $mask;
if ( $self->_clip ){
$mask = "";
}
elsif( $self->_mask ){
$mask = "N" x ($length_to_mask+3);
}
elsif ($self->_softmask){
$mask = lc substr( $seq, 0, ($length_to_mask + 3) );
}
$new_seq = $mask.$clipped_seq;
}
else{
$new_seq = $seq;
}
}
else{
# we cannot be sure of what it is
# do not clip
$new_seq = $seq;
}
return $new_seq;
}