ensembl-hive  2.7.0
RepeatMaskedSlice.pm
Go to the documentation of this file.
1 =head1 LICENSE
2 
3 See the NOTICE file distributed with this work for additional information
4 regarding copyright ownership.
5 
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9 
10  http://www.apache.org/licenses/LICENSE-2.0
11 
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 
18 =cut
19 
20 
21 =head1 CONTACT
22 
23  Please email comments or questions to the public Ensembl
24  developers list at <http://lists.ensembl.org/mailman/listinfo/dev>.
25 
26  Questions may also be sent to the Ensembl help desk at
27  <http://www.ensembl.org/Help/Contact>.
28 
29 =cut
30 
31 =head1 NAME
32 
33 Bio::EnsEMBL::RepeatMaskedSlice - Arbitrary Slice of a genome
34 
35 =head1 SYNOPSIS
36 
37  $sa = $db->get_SliceAdaptor();
38 
39  $slice =
40  $sa->fetch_by_region( 'chromosome', 'X', 1_000_000, 2_000_000 );
41 
42  $repeat_masked_slice = $slice->get_repeatmasked_seq();
43 
44  # get repeat masked sequence:
45  my $dna = $repeat_masked_slice->seq();
46  $dna = $repeat_masked_slice->subseq( 1, 1000 );
47 
48 =head1 DESCRIPTION
49 
50 This is a specialised Bio::EnsEMBL::Slice class that is used to retrieve
51 repeat masked genomic sequence rather than normal genomic sequence.
52 
53 =head1 METHODS
54 
55 =cut
56 
57 package Bio::EnsEMBL::RepeatMaskedSlice;
58 
59 use strict;
60 use warnings;
61 
63 use Bio::EnsEMBL::Utils::Argument qw(rearrange);
64 use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp);
66 
# This class inherits from Bio::EnsEMBL::Slice. A lexically scoped "our"
# declaration replaces the obsolete "use vars" pragma; the assignment still
# happens at run time, exactly as before.
our @ISA = ('Bio::EnsEMBL::Slice');

# $BLOCK_PWR is the base-2 logarithm of the chunk size (2**18 = 262144 bases)
# in which repeat features are retrieved by subseq(). Fetching on fixed block
# boundaries produces retrieval calls that are likely to be on the same slice,
# which hopefully creates cache hits and less database traffic.
my $BLOCK_PWR = 18;
75 
76 
77 
=head2 new

  Arg [-REPEAT_MASK] : Reference to a list of logic names of the repeats to
                       be used for masking. If not provided, [''] is stored
                       and all repeats in the database are used.
  Arg [-SOFT_MASK]   : Flag stored as the soft masking setting (optional).
  Arg [-NOT_DEFAULT_MASKING_CASES] :
                       Hash reference of repeat classes/names that escape
                       the default masking (optional, default is {}).
  Arg [...]          : Named superclass arguments. See B<Bio::EnsEMBL::Slice>.
  Example     : my $slice = Bio::EnsEMBL::RepeatMaskedSlice->new
                  (-START                     => $start,
                   -END                       => $end,
                   -STRAND                    => $strand,
                   -SEQ_REGION_NAME           => $seq_region,
                   -SEQ_REGION_LENGTH         => $seq_region_length,
                   -COORD_SYSTEM              => $cs,
                   -ADAPTOR                   => $adaptor,
                   -REPEAT_MASK               => ['repeat_masker'],
                   -SOFT_MASK                 => 1,
                   -NOT_DEFAULT_MASKING_CASES => {"repeat_class_SINE/MIR" => 1,
                                                  "repeat_name_AluSp"     => 0});
  Description : Creates a Slice which behaves exactly as a normal slice but
                that returns repeat masked sequence from the seq method.
  Returntype  : Bio::EnsEMBL::RepeatMaskedSlice
  Exceptions  : thrown if -REPEAT_MASK is a true value that is not an
                array reference
  Caller      : RawComputes (PredictionTranscript creation code).
  Status      : Stable

=cut

sub new {
  my $invocant = shift;
  my $class    = ref($invocant) || $invocant;

  # Pick out the arguments specific to this class. The complete argument
  # list is still handed to the superclass constructor below.
  my ($mask_names, $use_soft_mask, $special_cases) =
    rearrange([qw(REPEAT_MASK SOFT_MASK NOT_DEFAULT_MASKING_CASES)], @_);

  my $self = $class->SUPER::new(@_);

  # A single empty logic name is the default when none were supplied.
  $mask_names = [''] unless $mask_names;
  throw("Reference to list of logic names argument expected.")
    unless ref($mask_names) eq 'ARRAY';

  $self->{'repeat_mask_logic_names'}   = $mask_names;
  $self->{'soft_mask'}                 = $use_soft_mask;
  $self->{'not_default_masking_cases'} = $special_cases || {};

  return $self;
}
127 
128 
=head2 repeat_mask_logic_names

  Arg [1]     : reference to list of strings $logic_names (optional)
  Example     : $rm_slice->repeat_mask_logic_names(['repeat_masker']);
  Description : Getter/Setter for the logic_names of the repeats that are
                used to mask this slice's sequence. When an argument is
                given it is validated and then stored, replacing the
                previous list.
  Returntype  : reference to list of strings
  Exceptions  : thrown if the argument is not an array reference
  Caller      : seq() method
  Status      : Stable

=cut

sub repeat_mask_logic_names {
  my $self = shift;

  if(@_) {
    my $array = shift;
    if(ref($array) ne 'ARRAY') {
      throw('Reference to list of logic names argument expected.');
    }
    # Store the validated list; without this assignment the method would
    # check its argument and then silently discard it.
    $self->{'repeat_mask_logic_names'} = $array;
  }

  return $self->{'repeat_mask_logic_names'};
}
154 
155 
=head2 soft_mask

  Arg [1]     : boolean $soft_mask (optional)
  Example     : $rm_slice->soft_mask(0);
  Description : Getter/Setter for the flag that turns softmasking of the
                sequence returned by seq on or off. If any argument is
                passed, the first one replaces the stored flag.
  Returntype  : boolean (the stored value when it is true, otherwise 0)
  Exceptions  : none
  Caller      : seq() method
  Status      : Stable

=cut

sub soft_mask {
  my $self = shift;

  if (@_) {
    my ($flag) = @_;
    $self->{'soft_mask'} = $flag;
  }

  # Unset or false values are reported as 0.
  my $current = $self->{'soft_mask'};
  return $current ? $current : 0;
}
174 
=head2 not_default_masking_cases

  Arg [1]     : hash reference $not_default_masking_cases (optional, default is {})
                The values are 0 or 1 for hard and soft masking respectively.
                The keys of the hash should be of 2 forms:
                  "repeat_class_" . $repeat_consensus->repeat_class,
                     e.g. "repeat_class_SINE/MIR"
                  "repeat_name_" . $repeat_consensus->name
                     e.g. "repeat_name_MIR"
                depending on whether the non-default masking is to be applied
                by repeat_class or by repeat_name. Both forms can be given in
                the same hash at the same time, but in that case the
                repeat_name setting has priority over repeat_class. For
                example, you may have hard masking as default and want soft
                masking of all repeat_class SINE/MIR except repeat_name AluSp
                (which is also of repeat_class SINE/MIR).
  Example     : $rm_slice->not_default_masking_cases({"repeat_class_SINE/MIR" => 1,
                                                      "repeat_name_AluSp"     => 0});
  Description : Getter/Setter used to exempt some repeat classes or names
                from the default masking in place. If any argument is
                passed, the first one replaces the stored value as is.
  Returntype  : hash reference
  Exceptions  : none
  Caller      : seq() and subseq() methods
  Status      : Stable

=cut

sub not_default_masking_cases {
  my ($self, @new_value) = @_;

  if (scalar @new_value) {
    $self->{'not_default_masking_cases'} = $new_value[0];
  }

  return $self->{'not_default_masking_cases'};
}
206 
=head2 seq

  Arg [...]   : none required; any arguments given are passed through
                unchanged to the superclass seq() method
  Example     : print $rmslice->seq(), "\n";
  Description : Retrieves the entire repeat masked sequence for this slice.
                The repeat features are fetched for this slice, the sequence
                is obtained from the superclass and then masked using the
                current soft_mask and not_default_masking_cases settings.
                See also the B<Bio::EnsEMBL::Slice> implementation of this
                method.
  Returntype  : string
  Exceptions  : none
  Caller      : general
  Status      : Stable

=cut

sub seq {
  my $self = shift;

  # Repeat features are collected for the whole of this slice.
  my $repeat_features = $self->_get_repeat_features($self);

  # Sequence as delivered by the superclass, before masking.
  my $sequence = $self->SUPER::seq(@_);

  # _mask_features receives a reference to the string being masked.
  $self->_mask_features(
    \$sequence,
    $repeat_features,
    $self->soft_mask(),
    $self->not_default_masking_cases(),
  );

  return $sequence;
}
242 
=head2 subseq

  Arg [1]     : $start  - passed to sub_Slice() as the start of the subregion
  Arg [2]     : $end    - passed to sub_Slice() as the end of the subregion
  Arg [3]     : $strand - passed to sub_Slice() as the strand (optional)
  Example     : print $rmslice->subseq(1, 1000);
  Description : Retrieves a repeat masked sequence from a specified subregion
                of this slice. Unless the database adaptor reports no_cache,
                the repeat features are fetched for a region whose boundaries
                are aligned to blocks of 2**$BLOCK_PWR bases of the sequence
                region, so that repeated calls are likely to ask for the same
                slices. See also the B<Bio::EnsEMBL::Slice> implementation of
                this method.
  Returntype  : string
  Exceptions  : none
  Caller      : general
  Status      : Stable

=cut

sub subseq {
  my ($self, $start, $end, $strand) = @_;

  # The slice whose sequence is actually returned.
  my $target_slice = $self->sub_Slice($start, $end, $strand);

  # Frequent subseq calls on repeat masked sequence would otherwise cause a
  # lot of feature retrieval from the database. When caching is enabled the
  # features are therefore fetched from a slice with fixed block boundaries,
  # which makes cache hits more likely.
  my $feature_slice;
  if ($self->adaptor()->db()->no_cache()) {
    # No caching: fetch features for exactly the requested region.
    $feature_slice = $target_slice;
  }
  else {
    my $region_slice = $self->seq_region_Slice();

    # Index of the first and last block touched by the requested region
    # (the block size is defined at the top of this module).
    my $first_block = ($target_slice->start() - 1) >> $BLOCK_PWR;
    my $last_block  = ($target_slice->end()   - 1) >> $BLOCK_PWR;

    # Convert the block indices back into 1-based coordinates.
    my $block_start = ($first_block << $BLOCK_PWR) + 1;
    my $block_end   = ($last_block + 1) << $BLOCK_PWR;

    # Do not run past the end of the sequence region slice.
    $block_end = $region_slice->length
      if $block_end > $region_slice->length;

    $feature_slice = $region_slice->sub_Slice($block_start, $block_end);
  }

  my $repeat_features = $self->_get_repeat_features($feature_slice);
  my $is_soft         = $self->soft_mask();
  my $special_cases   = $self->not_default_masking_cases();

  # Unmasked sequence of the requested region, masked in place below.
  my $sequence = $target_slice->SUPER::seq();
  $target_slice->_mask_features(\$sequence, $repeat_features, $is_soft, $special_cases);

  return $sequence;
}
297 
=head2 _get_repeat_features

  Arg [1]     : Bio::EnsEMBL::Slice to fetch features for
  Description : Gets the repeat features of the given slice, calling
                get_all_RepeatFeatures() once for each logic name returned
                by repeat_mask_logic_names() and concatenating the results
                in that order.
  Returntype  : ArrayRef[Bio::EnsEMBL::RepeatFeature] array of repeats

=cut

sub _get_repeat_features {
  my ($self, $slice) = @_;

  my @collected;
  for my $logic_name (@{ $self->repeat_mask_logic_names() }) {
    my $features = $slice->get_all_RepeatFeatures($logic_name);
    push @collected, @$features;
  }

  return \@collected;
}
317 
318 1;
Bio::EnsEMBL::Slice::get_all_RepeatFeatures
public Listref get_all_RepeatFeatures()
Bio::EnsEMBL::Utils::Sequence
Definition: Sequence.pm:22
Bio::EnsEMBL::Slice
Definition: Slice.pm:50
Bio::EnsEMBL::RepeatMaskedSlice::new
public Bio::EnsEMBL::RepeatMaskedSlice new()
Bio::EnsEMBL::RepeatMaskedSlice::seq
public String seq()
Bio::EnsEMBL::RepeatFeature
Definition: RepeatFeature.pm:45
Bio::EnsEMBL::RepeatMaskedSlice
Definition: RepeatMaskedSlice.pm:28
Bio::EnsEMBL::PredictionTranscript
Definition: PredictionTranscript.pm:39
Bio::EnsEMBL::Slice::get_repeatmasked_seq
public Bio::EnsEMBL::RepeatMaskedSlice get_repeatmasked_seq()
Bio::EnsEMBL::Utils::Argument
Definition: Argument.pm:34
Bio::EnsEMBL::Utils::Exception
Definition: Exception.pm:68