ensembl-hive  2.7.0
SequenceAdaptor.pm
Go to the documentation of this file.
1 =head1 LICENSE
2 
3 See the NOTICE file distributed with this work for additional information
4 regarding copyright ownership.
5 
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9 
10  http://www.apache.org/licenses/LICENSE-2.0
11 
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 
18 =head1 CONTACT
19 
20  Please email comments or questions to the public Ensembl
21  developers list at <dev@ensembl.org>.
22 
23  Questions may also be sent to the Ensembl help desk at
24  <helpdesk@ensembl.org>.
25 
26 =cut
27 
28 =head1 NAME
29 
30 Bio::EnsEMBL::DBSQL::SequenceAdaptor - produce sequence strings from locations
31 
32 =head1 SYNOPSIS
33 
34  my $sa = $registry->get_adaptor( 'Human', 'Core', 'Sequence' );
35 
36  my $dna =
37  ${ $sa->fetch_by_Slice_start_end_strand( $slice, 1, 1000, -1 ) };
38 
39 =head1 DESCRIPTION
40 
41 An adaptor for the retrieval of DNA sequence from the EnsEMBL database
42 
43 =head1 METHODS
44 
45 =cut
46 
47 package Bio::EnsEMBL::DBSQL::SequenceAdaptor;
48 
49 use strict;
50 use warnings;
51 
52 use Bio::EnsEMBL::Utils::Exception qw(throw);
53 use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp);
54 use Bio::EnsEMBL::Utils::Scalar qw( assert_ref );
55 use DBI qw/:sql_types/;
57 our @EXPORT = (@{$DBI::EXPORT_TAGS{'sql_types'}});
58 
59 =head2 new
60 
61  Arg [1] : none
62  Example : my $sa = $db_adaptor->get_SequenceAdaptor();
63  Description: Constructor. Calls superclass constructor and initialises
64  internal cache structure.
66  Exceptions : none
67  Caller : DBAdaptor::get_SequenceAdaptor
68  Status : Stable
69 
70 =cut
71 
72 sub new {
73  my ($caller, $db, $chunk_power, $cache_size) = @_;
74  my $class = ref($caller) || $caller;
75  my $self = $class->SUPER::new($db);
76  $self->_init_seq_instance($chunk_power, $cache_size);
77  $self->_populate_seq_region_edits();
78  return $self;
79 }
80 
81 =head2 fetch_by_Slice_start_end_strand
82 
83  Arg [1] : Bio::EnsEMBL::Slice slice
84  The slice from which you want the sequence
85  Arg [2] : (optional) int startBasePair
86  The start base pair relative to the start of the slice. Negative
87  values or values greater than the length of the slice are fine.
88  default = 1
89  Arg [3] : (optional) int endBasePair
90  The end base pair relative to the start of the slice. Negative
91  values or values greater than the length of the slice are fine,
92  but the end must be greater than or equal to the start
93  count from 1
94  default = the length of the slice
95  Arg [4] : (optional) int strand
96  1, -1
97  default = 1
98  Example : $dna = $seq_adptr->fetch_by_Slice_start_end_strand($slice, 1,
99  1000, -1);
100  Description: Retrieves from db the sequence for this slice
101  uses AssemblyMapper to find the assembly
102  Returntype : string
103  Exceptions : endBasePair should be less or equal to length of slice
104  Caller : Bio::EnsEMBL::Slice::seq(), Slice::subseq()
105  Status : Stable
106 
107 =cut
108 
109 sub fetch_by_Slice_start_end_strand {
110  my ( $self, $slice, $start, $end, $strand ) = @_;
111 
112  if(!ref($slice) || !($slice->isa("Bio::EnsEMBL::Slice") or $slice->isa('Bio::EnsEMBL::LRGSlice')) ) {
113  throw("Slice argument is required.");
114  }
115 
116  $start = 1 if(!defined($start));
117 
118 
119  if ( ( !defined($end) || $start > $end || $start < 0 || $end < 0 || $slice->start> $slice->end ) && $slice->is_circular ) {
120 
121  if ( !defined($end) || ($start > $end ) ) {
122  return $self->_fetch_by_Slice_start_end_strand_circular( $slice, $start, $end, $strand );
123  }
124 
125  if ( defined($end) && ($end < 0) ) {
126  $end += $slice->seq_region_length;
127  }
128 
129  if ($start < 0) {
130  $start += $slice->seq_region_length;
131  }
132 
133  if($slice->start> $slice->end) {
134  return $self->_fetch_by_Slice_start_end_strand_circular( $slice, $slice->start, $slice->end, $strand );
135  }
136  }
137 
138  if ( ( !defined($end) ) && (not $slice->is_circular) ) {
139  $end = $slice->end() - $slice->start() + 1;
140  }
141 
142  if ( $start > $end ) {
143  throw("Start must be less than or equal to end.");
144  }
145 
146  $strand ||= 1;
147  #get a new slice that spans the exact region to retrieve dna from
148  my $right_expand = $end - $slice->length(); #negative is fine
149  my $left_expand = 1 - $start; #negative is fine
150 
151  if($right_expand || $left_expand) {
152  $slice = $slice->expand($left_expand, $right_expand);
153  }
154 
155  #retrieve normalized 'non-symlinked' slices
156  #this allows us to support haplotypes and PARs
157  my $slice_adaptor = $slice->adaptor();
158  my @symproj=@{$slice_adaptor->fetch_normalized_slice_projection($slice)};
159 
160  if(@symproj == 0) {
161  throw('Could not retrieve normalized Slices. Database contains ' .
162  'incorrect assembly_exception information.');
163  }
164 
165  #call this method again with any slices that were 'symlinked' to by this
166  #slice
167  if(@symproj != 1 || $symproj[0]->[2] != $slice) {
168  my $seq;
169  foreach my $segment (@symproj) {
170  my $symlink_slice = $segment->[2];
171  #get sequence from each symlinked area
172  $seq .= ${$self->fetch_by_Slice_start_end_strand($symlink_slice,
173  1,undef,1)};
174  }
175  if($strand == -1) {
176  reverse_comp(\$seq);
177  }
178  return \$seq;
179  }
180 
181  # we need to project this slice onto the sequence coordinate system
182  # even if the slice is in the same coord system, we want to trim out
183  # flanking gaps (if the slice is past the edges of the seqregion)
184  my $csa = $self->db->get_CoordSystemAdaptor();
185  my $seqlevel = $csa->fetch_sequence_level();
186 
187  my @projection=@{$slice->project($seqlevel->name(), $seqlevel->version())};
188 
189  my $seq = '';
190  my $total = 0;
191  my $tmp_seq;
192 
193  #fetch sequence from each of the sequence regions projected onto
194  foreach my $segment (@projection) {
195  my ($start, $end, $seq_slice) = @$segment;
196 
197  #check for gaps between segments and pad them with Ns
198  my $gap = $start - $total - 1;
199  if($gap) {
200  $seq .= 'N' x $gap;
201  }
202 
203  my $seq_region_id = $slice_adaptor->get_seq_region_id($seq_slice);
204 
205  $tmp_seq = ${$self->_fetch_seq($seq_region_id,
206  $seq_slice->start, $seq_slice->length())};
207 
208  if(!defined $tmp_seq) {
209  throw('No sequence found for seq_region '.$seq_region_id.':'.$seq_slice->start);
210  }
211 
212  #reverse compliment on negatively oriented slices
213  if($seq_slice->strand == -1) {
214  reverse_comp(\$tmp_seq);
215  }
216 
217  $seq .= $tmp_seq;
218 
219  $total = $end;
220  }
221 
222  #check for any remaining gaps at the end
223  my $gap = $slice->length - $total;
224  if($gap) {
225  $seq .= 'N' x $gap;
226  }
227 
228  #if the sequence is too short it is because we came in with a seqlevel
229  #slice that was partially off of the seq_region. Pad the end with Ns
230  #to make long enough
231  if(length($seq) != $slice->length()) {
232  $seq .= 'N' x ($slice->length() - length($seq));
233  }
234 
235  if(defined($self->{_rna_edits_cache}) and defined($self->{_rna_edits_cache}->{$slice->get_seq_region_id})){
236  $self->_rna_edit($slice,\$seq);
237  }
238 
239  #if they asked for the negative slice strand revcomp the whole thing
240  reverse_comp(\$seq) if($strand == -1);
241 
242  return \$seq;
243 }
244 
245 =head2 can_access_Slice
246 
247  Description : Returns 1 since we can access any Slice's data
248 
249 =cut
250 
251 sub can_access_Slice {
252  return 1;
253 }
254 
255 sub _fetch_by_Slice_start_end_strand_circular {
256  my ( $self, $slice, $start, $end, $strand ) = @_;
257 
258  assert_ref( $slice, 'Bio::EnsEMBL::Slice' );
259 
260  $strand ||= 1;
261  if ( !defined($start) ) {
262  $start ||= 1;
263  }
264 
265  if ( !defined($end) ) {
266  $end = $slice->end() - $slice->start() + 1;
267  }
268 
269  if ( $start > $end && $slice->is_circular() ) {
270  my ($seq, $seq1, $seq2);
271 
272  my $midpoint = $slice->seq_region_length - $slice->start + 1;
273  $seq1 = ${ $self->_fetch_by_Slice_start_end_strand_circular( $slice, 1, $midpoint, 1 )};
274  $seq2 = ${ $self->_fetch_by_Slice_start_end_strand_circular( $slice, $midpoint + 1, $slice->length(), 1 )};
275 
276  $seq = $slice->strand > 0 ? "$seq1$seq2" : "$seq2$seq1";
277 
278  reverse_comp( \$seq ) if ( $strand == -1 );
279 
280  return \$seq;
281  }
282 
283  # Get a new slice that spans the exact region to retrieve dna from
284  my $right_expand = $end - $slice->length(); #negative is fine
285  my $left_expand = 1 - $start; #negative is fine
286 
287  if ( $right_expand || $left_expand ) {
288  $slice =
289  $slice->strand > 0
290  ? $slice->expand( $left_expand, $right_expand )
291  : $slice->expand( $right_expand, $left_expand );
292  }
293 
294  # Retrieve normalized 'non-symlinked' slices. This allows us to
295  # support haplotypes and PARs.
296  my $slice_adaptor = $slice->adaptor();
297  my @symproj =
298  @{ $slice_adaptor->fetch_normalized_slice_projection($slice) };
299 
300  if ( @symproj == 0 ) {
301  throw( 'Could not retrieve normalized Slices. Database contains '
302  . 'incorrect assembly_exception information.' );
303  }
304 
305  # Call this method again with any slices that were 'symlinked' to by
306  # this slice.
307  if ( @symproj != 1 || $symproj[0]->[2] != $slice ) {
308  my $seq;
309  foreach my $segment (@symproj) {
310  my $symlink_slice = $segment->[2];
311 
312  # Get sequence from each symlinked area.
313  $seq .= ${
314  $self->fetch_by_Slice_start_end_strand( $symlink_slice, 1,
315  undef, 1 ) };
316  }
317  if ( $strand == -1 ) {
318  reverse_comp( \$seq );
319  }
320 
321  return \$seq;
322  }
323 
324  # We need to project this slice onto the sequence coordinate system
325  # even if the slice is in the same coord system, we want to trim out
326  # flanking gaps (if the slice is past the edges of the seqregion).
327  my $csa = $self->db->get_CoordSystemAdaptor();
328  my $seqlevel = $csa->fetch_sequence_level();
329 
330  my @projection =
331  @{ $slice->project( $seqlevel->name(), $seqlevel->version() ) };
332 
333  my $seq = '';
334  my $total = 0;
335  my $tmp_seq;
336 
337  # Fetch sequence from each of the sequence regions projected onto.
338  foreach my $segment (@projection) {
339  my ( $start, $end, $seq_slice ) = @{$segment};
340 
341  # Check for gaps between segments and pad them with Ns
342  my $gap = $start - $total - 1;
343  if ($gap) {
344  $seq .= 'N' x $gap;
345  }
346 
347  my $seq_region_id = $slice_adaptor->get_seq_region_id($seq_slice);
348 
349  $tmp_seq = ${
350  $self->_fetch_seq( $seq_region_id, $seq_slice->start(),
351  $seq_slice->length() ) };
352 
353  # Reverse compliment on negatively oriented slices.
354  if ( $seq_slice->strand == -1 ) {
355  reverse_comp( \$tmp_seq );
356  }
357 
358  $seq .= $tmp_seq;
359 
360  $total = $end;
361  }
362 
363  # Check for any remaining gaps at the end.
364  my $gap = $slice->length() - $total;
365 
366  if ($gap) {
367  $seq .= 'N' x $gap;
368  }
369 
370  # If the sequence is too short it is because we came in with a
371  # seqlevel slice that was partially off of the seq_region. Pad the
372  # end with Ns to make long enough
373  if ( length($seq) != $slice->length() ) {
374  $seq .= 'N' x ( $slice->length() - length($seq) );
375  }
376 
377  if ( defined( $self->{_rna_edits_cache} )
378  && defined(
379  $self->{_rna_edits_cache}->{ $slice->get_seq_region_id } ) )
380  {
381  $self->_rna_edit( $slice, \$seq );
382  }
383 
384  return \$seq;
385 } ## end sub _fetch_by_Slice_start_end_strand_circular
386 
387 
388 
389 =head2 _rna_edit
390 
391  Description : Performs within sequence region editting when
392  the underlying sequence is incorrect. Used by LRGs.
393 
394 =cut
395 
396 sub _rna_edit {
397  my $self = shift;
398  my $slice = shift;
399  my $seq = shift; #reference to string
400 
401  my $s_start = $slice->start; # substr start at 0 , but seq starts at 1 (so no -1 here)
402  my $s_end = $s_start+length($$seq) - 1; # But we do need -1 here to keep the coords closed
403 
404  foreach my $edit (@{$self->{_rna_edits_cache}->{$slice->get_seq_region_id}}){
405  # seq_region_attrib for an exit looks like
406  # <start> <end> <edit>
407  # 2568 2569 GT
408  my ($start, $end, $txt) = split (/\s+/, $edit);
409 
410  # check that RNA edit is not outside the requested region : happens quite often with LRG regions
411  next if ($end < $s_start);
412  next if ($s_end < $start);
413  # Length of the edit ($txt)
414  my $edit_length = length($txt);
415 
416  # If the edit isn't fully encompassed by the slice, we need to extract the
417  # edit's sub-sequence that we're patching on to the sequence
418  if($start < $s_start || $end > $s_end) {
419  my $edit_offset;
420  # Find the offset for the start of the edit, eg.
421  # seq slice: TC
422  # LRG edit : CG
423  # Offset should be 1 to extract starting at the G
424  # Formula: offset = max(s_start - start, 0)
425  $edit_offset = ($s_start - $start) < 0 ? 0 : $s_start - $start;
426 
427  # Find the length of the overlapping piece of the edit.
428  # This needs to take in to account the offset
429  # seq slice: TC
430  # LRG edit : GA
431  # Where we're want the length to be 1 from an existing offset of 1 to
432  # extract just G
433  # Forumla: length = length(edit) - offset - max(end - s_end, 0)
434  $edit_length = length($txt) - $edit_offset - ($end - $s_end < 0 ? 0 : $end - $s_end);
435 
436  # Extract the overlapping edit
437  $txt = substr($txt, $edit_offset, $edit_length);
438  }
439 
440  # Apply the patch, we don't want negative offsets as that's totally wrong.
441  # Do a max(start - s_start, 0) to prevent this
442  substr($$seq,($start-$s_start < 0 ? 0 : $start-$s_start),$edit_length, $txt);
443  }
444  return;
445 }
446 
447 =head2 _fetch_raw_seq
448 
449  Description : Communicates with the database to fetch back sequence
450 
451 =cut
452 
453 sub _fetch_raw_seq {
454  my ($self, $id, $start, $length) = @_;
455  my $sql = <<'SQL';
456 SELECT UPPER(SUBSTR(d.sequence, ?, ?))
457 FROM dna d
458 WHERE d.seq_region_id =?
459 SQL
460  my $seq = $self->dbc()->sql_helper()->execute_single_result(
461  -SQL => $sql,
462  -PARAMS => [[$start, SQL_INTEGER], [$length, SQL_INTEGER], [$id, SQL_INTEGER]],
463  -NO_ERROR => 1
464  );
465  return \$seq;
466 }
467 
468 =head2 store
469 
470  Arg [1] : int $seq_region_id the id of the sequence region this dna
471  will be associated with.
472  Arg [2] : string $sequence the dna sequence to be stored
473  in the database. Note that the sequence passed in will be
474  converted to uppercase.
475  Example : $seq_adaptor->store(11, 'ACTGGGTACCAAACAAACACAACA');
476  Description: stores a dna sequence in the databases dna table and returns the
477  database identifier for the new record.
478  Returntype : none
479  Exceptions : throw if the database insert fails
480  Caller : sequence loading scripts
481  Status : Stable
482 
483 =cut
484 
485 sub store {
486  my ($self, $seq_region_id, $sequence) = @_;
487 
488  if(!$seq_region_id) {
489  throw('seq_region_id is required');
490  }
491 
492  $sequence = uc($sequence);
493 
494  my $statement =
495  $self->prepare("INSERT INTO dna(seq_region_id, sequence) VALUES(?,?)");
496 
497  $statement->bind_param(1,$seq_region_id,SQL_INTEGER);
498  $statement->bind_param(2,$sequence,SQL_LONGVARCHAR);
499  $statement->execute();
500 
501  $statement->finish();
502 
503  return;
504 }
505 
506 =head2 remove
507 
508  Arg [1] : int $seq_region_id the id of the sequence region this dna
509  is associated with.
510  Example : $seq_adaptor->remove(11);
511  Description: removes a dna sequence for a given seq_region_id
512  Returntype : none
513  Exceptions : throw if the database delete fails
514  Caller : Internal
515  Status : Stable
516 
517 =cut
518 
519 sub remove {
520  my ($self, $seq_region_id) = @_;
521 
522  if(!$seq_region_id) {
523  throw('seq_region_id is required');
524  }
525 
526  my $statement =
527  $self->prepare("DELETE FROM dna WHERE seq_region_id = ?");
528 
529  $statement->bind_param(1,$seq_region_id,SQL_INTEGER);
530  $statement->execute();
531 
532  $statement->finish();
533 
534  return;
535 }
536 
537 =head2 _populate_seq_region_edits
538 
539  Description: Query the database for any _rna_edit attributes attached to a seq region
540 
541 =cut
542 
543 sub _populate_seq_region_edits {
544  my ($self) = @_;
545  my $sql;
546  my @params = ('_rna_edit');
547  if($self->db()->is_multispecies()) {
548  $sql = <<'SQL';
549 select sra.seq_region_id, sra.value
550 from seq_region_attrib sra
551 join attrib_type using (attrib_type_id)
552 join seq_region s using (seq_region_id)
553 join coord_system cs using (coord_system_id)
554 where code =?
555 and species_id =?
556 SQL
557  push(@params, $self->db()->species_id());
558  }
559  else {
560  $sql = <<'SQL';
561 select sra.seq_region_id, sra.value
562 from seq_region_attrib sra join attrib_type using (attrib_type_id)
563 where code = ?
564 SQL
565  }
566 
567  my $mapper = sub {
568  my ($row, $array) = @_;
569  my ($seq_region_id, $value) = @{$row};
570  my ($start, $end, $substring) = split (/\s+/, $value);
571  my $edit_length = ($end - $start) + 1;
572  my $substring_length = length($substring);
573  if($edit_length != $substring_length) {
574  throw "seq_region_id $seq_region_id has an attrib of type '_rna_edit' (value '$value'). Edit length ${edit_length} is not the same as the replacement's length ${substring_length}. Please fix. We only support substitutions via this mechanism";
575  }
576  if(defined $array) {
577  push(@{$array}, $value);
578  return;
579  }
580  return [$value];
581  };
582 
583  my $edits = $self->dbc()->sql_helper->execute_into_hash(-SQL => $sql, -PARAMS => ['_rna_edit'], -CALLBACK => $mapper);
584  $self->{_rna_edits_cache} = $edits if %{$edits};
585  return;
586 }
587 
588 1;
EnsEMBL
Definition: Filter.pm:1
AssemblyMapper
Definition: BlastzAligner.pm:7
Bio::EnsEMBL::Utils::Sequence
Definition: Sequence.pm:22
Bio::EnsEMBL::Slice
Definition: Slice.pm:50
Bio::EnsEMBL::DBSQL::SequenceAdaptor
Definition: SequenceAdaptor.pm:22
Bio::EnsEMBL::Slice::seq
public String seq()
Bio::EnsEMBL::DBSQL::BaseSequenceAdaptor
Definition: BaseSequenceAdaptor.pm:35
Bio::EnsEMBL::DBSQL::BaseAdaptor
Definition: BaseAdaptor.pm:71
Bio::EnsEMBL::Utils::Scalar
Definition: Scalar.pm:66
Bio::EnsEMBL::DBSQL::BaseSequenceAdaptor::_init_seq_instance
protected _init_seq_instance()
Bio::EnsEMBL::DBSQL::SequenceAdaptor::fetch_by_Slice_start_end_strand
public String fetch_by_Slice_start_end_strand()
Bio::EnsEMBL::Utils::Exception
Definition: Exception.pm:68