3 See the NOTICE file distributed with
this work
for additional information
4 regarding copyright ownership.
6 Licensed under the Apache License, Version 2.0 (the
"License");
7 you may not use
this file except in compliance with the License.
8 You may obtain a copy of the License at
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an
"AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License
for the specific language governing permissions and
16 limitations under the License.
20 Please email comments or questions to the
public Ensembl
21 developers list at <dev@ensembl.org>.
23 Questions may also be sent to the Ensembl help desk at
24 <helpdesk@ensembl.org>.
34 my $sa = $registry->get_adaptor(
'Human',
'Core',
'Sequence' );
41 An adaptor
for the retrieval of DNA sequence from the
EnsEMBL database
47 package Bio::EnsEMBL::DBSQL::SequenceAdaptor;
55 use DBI qw/:sql_types/;
57 our @EXPORT = (@{$DBI::EXPORT_TAGS{
'sql_types'}});
62 Example : my $sa = $db_adaptor->get_SequenceAdaptor();
63 Description: Constructor. Calls superclass constructor and initialises
64 internal cache structure.
67 Caller : DBAdaptor::get_SequenceAdaptor
73 my ($caller, $db, $chunk_power, $cache_size) = @_;
74 my $class = ref($caller) || $caller;
75 my $self = $class->SUPER::new($db);
77 $self->_populate_seq_region_edits();
81 =head2 fetch_by_Slice_start_end_strand
84 The slice from which you want the sequence
85 Arg [2] : (optional)
int startBasePair
86 The start base pair relative to the start of the slice. Negative
87 values or values greater than the length of the slice are fine.
89 Arg [3] : (optional)
int endBasePair
90 The end base pair relative to the start of the slice. Negative
91 values or values greater than the length of the slice are fine,
92 but the end must be greater than or equal to the start
94 default = the length of the slice
95 Arg [4] : (optional)
int strand
98 Example : $dna = $seq_adptr->fetch_by_Slice_start_end_strand($slice, 1,
100 Description: Retrieves from db the sequence
for this slice
103 Exceptions : endBasePair should be less or equal to length of slice
109 sub fetch_by_Slice_start_end_strand {
110 my ( $self, $slice, $start, $end, $strand ) = @_;
112 if(!ref($slice) || !($slice->isa(
"Bio::EnsEMBL::Slice") or $slice->isa(
'Bio::EnsEMBL::LRGSlice')) ) {
113 throw(
"Slice argument is required.");
116 $start = 1
if(!defined($start));
119 if ( ( !defined($end) || $start > $end || $start < 0 || $end < 0 || $slice->start> $slice->end ) && $slice->is_circular ) {
121 if ( !defined($end) || ($start > $end ) ) {
122 return $self->_fetch_by_Slice_start_end_strand_circular( $slice, $start, $end, $strand );
125 if ( defined($end) && ($end < 0) ) {
126 $end += $slice->seq_region_length;
130 $start += $slice->seq_region_length;
133 if($slice->start> $slice->end) {
134 return $self->_fetch_by_Slice_start_end_strand_circular( $slice, $slice->start, $slice->end, $strand );
138 if ( ( !defined($end) ) && (not $slice->is_circular) ) {
139 $end = $slice->end() - $slice->start() + 1;
142 if ( $start > $end ) {
143 throw(
"Start must be less than or equal to end.");
147 #get a new slice that spans the exact region to retrieve dna from
148 my $right_expand = $end - $slice->length(); #negative is fine
149 my $left_expand = 1 - $start; #negative is fine
151 if($right_expand || $left_expand) {
152 $slice = $slice->expand($left_expand, $right_expand);
155 #retrieve normalized 'non-symlinked' slices
156 #this allows us to support haplotypes and PARs
157 my $slice_adaptor = $slice->adaptor();
158 my @symproj=@{$slice_adaptor->fetch_normalized_slice_projection($slice)};
161 throw(
'Could not retrieve normalized Slices. Database contains ' .
162 'incorrect assembly_exception information.');
165 #call this method again with any slices that were 'symlinked' to by this
167 if(@symproj != 1 || $symproj[0]->[2] != $slice) {
169 foreach my $segment (@symproj) {
170 my $symlink_slice = $segment->[2];
171 #get sequence from each symlinked area
172 $seq .= ${$self->fetch_by_Slice_start_end_strand($symlink_slice,
181 # we need to project this slice onto the sequence coordinate system
182 # even if the slice is in the same coord system, we want to trim out
183 # flanking gaps (if the slice is past the edges of the seqregion)
184 my $csa = $self->db->get_CoordSystemAdaptor();
185 my $seqlevel = $csa->fetch_sequence_level();
187 my @projection=@{$slice->project($seqlevel->name(), $seqlevel->version())};
193 #fetch sequence from each of the sequence regions projected onto
194 foreach my $segment (@projection) {
195 my ($start, $end, $seq_slice) = @$segment;
197 #check for gaps between segments and pad them with Ns
198 my $gap = $start - $total - 1;
203 my $seq_region_id = $slice_adaptor->get_seq_region_id($seq_slice);
205 $tmp_seq = ${$self->_fetch_seq($seq_region_id,
206 $seq_slice->start, $seq_slice->length())};
208 if(!defined $tmp_seq) {
209 throw(
'No sequence found for seq_region '.$seq_region_id.
':'.$seq_slice->start);
212 #reverse compliment on negatively oriented slices
213 if($seq_slice->strand == -1) {
214 reverse_comp(\$tmp_seq);
222 #check for any remaining gaps at the end
223 my $gap = $slice->length - $total;
228 #if the sequence is too short it is because we came in with a seqlevel
229 #slice that was partially off of the seq_region. Pad the end with Ns
231 if(length($seq) != $slice->length()) {
232 $seq .=
'N' x ($slice->length() - length($seq));
235 if(defined($self->{_rna_edits_cache}) and defined($self->{_rna_edits_cache}->{$slice->get_seq_region_id})){
236 $self->_rna_edit($slice,\$seq);
239 #if they asked for the negative slice strand revcomp the whole thing
240 reverse_comp(\$seq)
if($strand == -1);
245 =head2 can_access_Slice
247 Description : Returns 1 since we can access any Slice
's data
251 sub can_access_Slice {
255 sub _fetch_by_Slice_start_end_strand_circular {
256 my ( $self, $slice, $start, $end, $strand ) = @_;
261 if ( !defined($start) ) {
265 if ( !defined($end) ) {
266 $end = $slice->end() - $slice->start() + 1;
269 if ( $start > $end && $slice->is_circular() ) {
270 my ($seq, $seq1, $seq2);
272 my $midpoint = $slice->seq_region_length - $slice->start + 1;
273 $seq1 = ${ $self->_fetch_by_Slice_start_end_strand_circular( $slice, 1, $midpoint, 1 )};
274 $seq2 = ${ $self->_fetch_by_Slice_start_end_strand_circular( $slice, $midpoint + 1, $slice->length(), 1 )};
276 $seq = $slice->strand > 0 ? "$seq1$seq2" : "$seq2$seq1";
278 reverse_comp( \$seq ) if ( $strand == -1 );
283 # Get a new slice that spans the exact region to retrieve dna from
284 my $right_expand = $end - $slice->length(); #negative is fine
285 my $left_expand = 1 - $start; #negative is fine
287 if ( $right_expand || $left_expand ) {
290 ? $slice->expand( $left_expand, $right_expand )
291 : $slice->expand( $right_expand, $left_expand );
294 # Retrieve normalized 'non-symlinked
' slices. This allows us to
295 # support haplotypes and PARs.
296 my $slice_adaptor = $slice->adaptor();
298 @{ $slice_adaptor->fetch_normalized_slice_projection($slice) };
300 if ( @symproj == 0 ) {
301 throw( 'Could not retrieve normalized Slices. Database contains
'
302 . 'incorrect assembly_exception information.
' );
305 # Call this method again with any slices that were 'symlinked
' to by
307 if ( @symproj != 1 || $symproj[0]->[2] != $slice ) {
309 foreach my $segment (@symproj) {
310 my $symlink_slice = $segment->[2];
312 # Get sequence from each symlinked area.
314 $self->fetch_by_Slice_start_end_strand( $symlink_slice, 1,
317 if ( $strand == -1 ) {
318 reverse_comp( \$seq );
324 # We need to project this slice onto the sequence coordinate system
325 # even if the slice is in the same coord system, we want to trim out
326 # flanking gaps (if the slice is past the edges of the seqregion).
327 my $csa = $self->db->get_CoordSystemAdaptor();
328 my $seqlevel = $csa->fetch_sequence_level();
331 @{ $slice->project( $seqlevel->name(), $seqlevel->version() ) };
337 # Fetch sequence from each of the sequence regions projected onto.
338 foreach my $segment (@projection) {
339 my ( $start, $end, $seq_slice ) = @{$segment};
341 # Check for gaps between segments and pad them with Ns
342 my $gap = $start - $total - 1;
347 my $seq_region_id = $slice_adaptor->get_seq_region_id($seq_slice);
350 $self->_fetch_seq( $seq_region_id, $seq_slice->start(),
351 $seq_slice->length() ) };
353 # Reverse compliment on negatively oriented slices.
354 if ( $seq_slice->strand == -1 ) {
355 reverse_comp( \$tmp_seq );
363 # Check for any remaining gaps at the end.
364 my $gap = $slice->length() - $total;
370 # If the sequence is too short it is because we came in with a
371 # seqlevel slice that was partially off of the seq_region. Pad the
372 # end with Ns to make long enough
373 if ( length($seq) != $slice->length() ) {
374 $seq .= 'N
' x ( $slice->length() - length($seq) );
377 if ( defined( $self->{_rna_edits_cache} )
379 $self->{_rna_edits_cache}->{ $slice->get_seq_region_id } ) )
381 $self->_rna_edit( $slice, \$seq );
385 } ## end sub _fetch_by_Slice_start_end_strand_circular
391 Description : Performs within sequence region editting when
392 the underlying sequence is incorrect. Used by LRGs.
399 my $seq = shift; #reference to string
401 my $s_start = $slice->start; # substr start at 0 , but seq starts at 1 (so no -1 here)
402 my $s_end = $s_start+length($$seq) - 1; # But we do need -1 here to keep the coords closed
404 foreach my $edit (@{$self->{_rna_edits_cache}->{$slice->get_seq_region_id}}){
405 # seq_region_attrib for an exit looks like
406 # <start> <end> <edit>
408 my ($start, $end, $txt) = split (/\s+/, $edit);
410 # check that RNA edit is not outside the requested region : happens quite often with LRG regions
411 next if ($end < $s_start);
412 next if ($s_end < $start);
413 # Length of the edit ($txt)
414 my $edit_length = length($txt);
416 # If the edit isn't fully encompassed by the slice, we need to extract the
417 # edit's sub-sequence that we're patching on to the sequence
418 if($start < $s_start || $end > $s_end) {
420 # Find the offset for the start of the edit, eg.
423 # Offset should be 1 to extract starting at the G
424 # Formula: offset = max(s_start - start, 0)
425 $edit_offset = ($s_start - $start) < 0 ? 0 : $s_start - $start;
427 # Find the length of the overlapping piece of the edit.
428 # This needs to take in to account the offset
431 # Where we're want the length to be 1 from an existing offset of 1 to
433 # Forumla: length = length(edit) - offset - max(end - s_end, 0)
434 $edit_length = length($txt) - $edit_offset - ($end - $s_end < 0 ? 0 : $end - $s_end);
436 # Extract the overlapping edit
437 $txt = substr($txt, $edit_offset, $edit_length);
440 # Apply the patch, we don't want negative offsets as that's totally wrong.
441 # Do a max(start - s_start, 0) to prevent this
442 substr($$seq,($start-$s_start < 0 ? 0 : $start-$s_start),$edit_length, $txt);
447 =head2 _fetch_raw_seq
449 Description : Communicates with the database to fetch back sequence
454 my ($self, $id, $start, $length) = @_;
456 SELECT UPPER(SUBSTR(d.sequence, ?, ?))
458 WHERE d.seq_region_id =?
460 my $seq = $self->dbc()->sql_helper()->execute_single_result(
462 -PARAMS => [[$start, SQL_INTEGER], [$length, SQL_INTEGER], [$id, SQL_INTEGER]],
470 Arg [1] :
int $seq_region_id the
id of the sequence region this dna
471 will be associated with.
472 Arg [2] :
string $sequence the dna sequence to be stored
473 in the database. Note that the sequence passed in will be
474 converted to uppercase.
475 Example : $seq_adaptor->store(11, 'ACTGGGTACCAAACAAACACAACA');
476 Description: stores a dna sequence in the databases dna table and returns the
477 database identifier for the new record.
479 Exceptions : throw if the database insert fails
480 Caller : sequence loading scripts
486 my ($self, $seq_region_id, $sequence) = @_;
488 if(!$seq_region_id) {
489 throw(
'seq_region_id is required');
492 $sequence = uc($sequence);
495 $self->prepare(
"INSERT INTO dna(seq_region_id, sequence) VALUES(?,?)");
497 $statement->bind_param(1,$seq_region_id,SQL_INTEGER);
498 $statement->bind_param(2,$sequence,SQL_LONGVARCHAR);
499 $statement->execute();
501 $statement->finish();
508 Arg [1] :
int $seq_region_id the
id of the sequence region
this dna
510 Example : $seq_adaptor->remove(11);
511 Description: removes a dna sequence
for a given seq_region_id
513 Exceptions :
throw if the database
delete fails
520 my ($self, $seq_region_id) = @_;
522 if(!$seq_region_id) {
523 throw(
'seq_region_id is required');
527 $self->prepare(
"DELETE FROM dna WHERE seq_region_id = ?");
529 $statement->bind_param(1,$seq_region_id,SQL_INTEGER);
530 $statement->execute();
532 $statement->finish();
537 =head2 _populate_seq_region_edits
539 Description: Query the database
for any _rna_edit attributes attached to a seq region
543 sub _populate_seq_region_edits {
546 my @params = (
'_rna_edit');
547 if($self->db()->is_multispecies()) {
549 select sra.seq_region_id, sra.value
550 from seq_region_attrib sra
551 join attrib_type
using (attrib_type_id)
552 join seq_region s
using (seq_region_id)
553 join coord_system cs
using (coord_system_id)
557 push(@params, $self->db()->species_id());
561 select sra.seq_region_id, sra.value
562 from seq_region_attrib sra join attrib_type
using (attrib_type_id)
568 my ($row, $array) = @_;
569 my ($seq_region_id, $value) = @{$row};
570 my ($start, $end, $substring) = split (/\s+/, $value);
571 my $edit_length = ($end - $start) + 1;
572 my $substring_length = length($substring);
573 if($edit_length != $substring_length) {
574 throw "seq_region_id $seq_region_id has an attrib of type '_rna_edit' (value '$value'). Edit length ${edit_length} is not the same as the replacement's length ${substring_length}. Please fix. We only support substitutions via this mechanism";
577 push(@{$array}, $value);
583 my $edits = $self->dbc()->sql_helper->execute_into_hash(-SQL => $sql, -PARAMS => [
'_rna_edit'], -CALLBACK => $mapper);
584 $self->{_rna_edits_cache} = $edits
if %{$edits};