3 See the NOTICE file distributed with
this work
for additional information
4 regarding copyright ownership.
6 Licensed under the Apache License, Version 2.0 (the
"License");
7 you may not use
this file except in compliance with the License.
8 You may obtain a copy of the License at
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an
"AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License
for the specific language governing permissions and
16 limitations under the License.
23 Please email comments or questions to the
public Ensembl
24 developers list at <http:
26 Questions may also be sent to the Ensembl help desk at
31 Juguang Xiao <juguang@tll.org.sg>
42 Sequence alignment hits were previously stored within the core database
43 as ungapped alignments. This imposed 2 major constraints on alignments:
45 a) alignments
for a single hit record would require multiple rows in the
47 b) it was not possible to accurately retrieve the exact original
50 Therefore, in the
new branch sequence alignments are now stored as
51 ungapped alignments in the cigar line format (where CIGAR stands
for
52 Concise Idiosyncratic Gapped Alignment Report).
54 In the cigar line format alignments are stored as follows:
60 An example of an alignment
for a hypthetical protein match is shown
64 Query: 42 PGPAGLP----GSVGLQGPRGLRGPLP-GPLGPPL...
66 Sbjct: 1672 PGTP*TPLVPLGPWVPLGPSSPR--LPSGPLGPTD...
68 This would be stored in the protein_align_feature table as the following
75 package Bio::EnsEMBL::Utils::CigarString;
82 Name : split_hsp (
this name is derived from the original sub in BlastWorn)
83 Usage : my $hsp; # a ready Bio::Search::HSP::GenericHSP
object.
85 my $cigar_string = $factory->
split_hsp($hsp);
87 Function: generate cigar
string.
89 Returns : a text
string.
94 my ($self, $hsp) = @_;
96 $self->throw(
"a defined object needed") unless($hsp && defined($hsp));
97 unless(ref($hsp) && $hsp->isa(
'Bio::Search::HSP::GenericHSP')){
98 $self->throw(
"a HSP object needed");
101 my ($qtype, $htype) = $self->_findTypes($hsp);
102 my ($qstrand, $hstrand) = $self->_findStrands($hsp);
103 my ($qinc, $hinc) = $self->_findIncrements($qstrand,$hstrand,$qtype,$htype);
110 $qstart = $hsp->query->start;
111 }elsif($qstart == -1){
112 $qstart = $hsp->query->end;
114 $self->warn(
"[$qstart], invalid strand value on query");
115 $qstart = $hsp->query->start;
116 # Is this a SearchIO's bug???
121 $hstart = $hsp->subject->start;
122 }elsif($hstrand != -1){
123 $hstart = $hsp->subject->end;
125 $self->throw(
"[$hstart], invalid strand value on subject");
133 my @align_coordinates = ();
134 while($count <= $#qchars){
135 if($qchars[$count] ne
'-' && $hchars[$count] ne
'-') {
139 }
else{ # gapped region
140 push(@align_coordinates, [$qstart, $hstart])
if($found == 1);
143 $qstart += $qinc
if($qchars[$count] ne
'-');
146 $hstart += $hinc
if($hchars[$count] ne
'-');
156 push(@align_coordinates, [$qstart, $hstart]);
159 my $cigar_string =
"";
160 my $last = $#align_coordinates;
162 for(my $i=0; $i<$last; $i++){
163 my $q_this_start = $align_coordinates[$i]->[0];
164 my $q_next_start = $align_coordinates[$i+1]->[0];
165 my $q_length = ($q_next_start-$q_this_start-1)*$qinc;
166 $q_length = abs($q_length);
167 my $h_this_start = $align_coordinates[$i]->[1];
168 my $h_next_start = $align_coordinates[$i+1]->[1];
169 my $h_length = ($h_next_start-$h_this_start-1)*$hinc;
170 $h_length = abs($h_length);
172 my $diff = $q_length - $h_length;
173 if($diff > 0){ # Insertion
174 $cigar_string .= $diff unless($diff == 1);
175 $cigar_string .=
'I';
176 }elsif($diff < 0){ # Deletion
177 $cigar_string .= -$diff unless($diff == -1);
178 $cigar_string .=
'D';
179 }
else{ # e.g $diff == 0, Match
180 $cigar_string .= $q_length unless($q_length == 1);
181 $cigar_string .=
'M';
187 return $cigar_string;
192 my ($self,$hsp) = @_;
194 my $qstrand = $hsp->query->strand;
195 unless($qstrand == 1 || $qstrand == -1){
196 $self->warn(
"query's strand value is neither 1 or -1");
200 my $hstrand = $hsp->subject->strand;
201 unless($hstrand == 1 || $hstrand == -1){
202 $self->warn(
"subject's strand value is neither 1 or -1");
206 return ( $qstrand, $hstrand);
210 my ($self,$hsp) = @_;
214 my $len1 = $hsp->query->length();
215 my $len2 = $hsp->subject->length();
217 if ($len1/$len2 > 2) {
220 } elsif ($len2/$len1 > 2) {
228 return ($type1,$type2);
231 sub _findIncrements {
232 my ($self,$qstrand,$hstrand,$qtype,$htype) = @_;
234 my $qinc = 1 * $qstrand;
235 my $hinc = 1 * $hstrand;
237 if ($qtype eq
'dna' && $htype eq
'pep') {
240 if ($qtype eq
'pep' && $htype eq
'dna') {
244 return ($qinc,$hinc);
247 # This is a core logic of cigar string. The finite state machine theory is
248 # apply. See the below table, x-axis represents the input, with 3 options:
249 # (+/+) -- Both current query and subject bases are non-gap. Match
250 # (-/+) -- The current query base is gap, but subject not. Deletion
251 # (+/-) -- The current subject base is gap, but query not. Insertion
252 # While the y-axis means the current state with letter 'M', 'D', 'I'
254 # The content of this table is the action taken in response of the input and
256 # R remain the state, counter increment.
257 # G;X generate the cigar line based on the current state and counter;
258 # clear the counter to zero and change to the state X
260 # || +/+ | -/+ | +/- |
261 # -------+----------------------+
262 # M || R | G;D | G;I |
263 # ------------------------------+
264 # D || G;M | R | G;I |
265 # ------------------------------+
266 # I || G;M | G;D | R |
269 =head2 generate_cigar_string
271 Name : generate_cigar_string
272 Usage: $cigar_string = $self->generate_cigar_string(\@qchars, \@hchars);
273 Function: generate the cigar
string for a piece of alignment.
274 Args: 2 array references. The lengths of 2 arrays are the same
275 Return: a cigar
string
279 # Developer's Note: The method is originally abstracted from the concept of
280 # cigar string. It only asks the essential information of 2 sequence characters
281 # of the alignment, while the BlastWorn::split_HSP asks more unused information
282 # for cigar string, which is useful to form align_coordinates. - Juguang
284 my ($count, $state); # strictly only used in the following 2 subs
286 sub generate_cigar_string {
288 # my ($self, $qstart, $hstart, $qinc, $hinc, $qchars_ref, $hchars_ref) = @_;
290 my ($self, $qchars_ref, $hchars_ref) = @_;
292 my @qchars = @{$qchars_ref};
293 my @hchars = @{$hchars_ref};
295 unless(scalar(@qchars) == scalar(@hchars)){
296 $self->throw(
"two sequences are not equal in lengths");
300 $state =
'M'; # the current state of gaps, (M, D, I)
302 my $cigar_string =
'';
303 for(my $i=0; $i <= $#qchars; $i++){
304 my $qchar = $qchars[$i];
305 my $hchar = $hchars[$i];
306 if($qchar ne
'-' && $hchar ne
'-'){ # Match
307 $cigar_string .= $self->_sub_cigar_string(
'M');
308 }elsif($qchar eq
'-'){ # Deletion
309 $cigar_string .= $self->_sub_cigar_string(
'D');
310 }elsif($hchar eq
'-'){ # Insertion
311 $cigar_string .= $self->_sub_cigar_string(
'I');
313 $self->throw(
"Impossible state that 2 gaps on each seq aligned");
316 $cigar_string .= $self->_sub_cigar_string(
'X'); # not forget the tail.
317 return $cigar_string;
320 sub _sub_cigar_string {
321 my ($self, $new_state) = @_;
322 my $sub_cigar_string =
'';
323 if($state eq $new_state){
324 $count++; # Remain the state and increase the counter
326 $sub_cigar_string .= $count unless $count == 1;
327 $sub_cigar_string .= $state;
331 return $sub_cigar_string;
334 =head2 generate_cigar_string_by_hsp
336 Name : generate_cigar_string_by_hsp
337 Usage : my $hsp; # a ready GenericHSP
object
338 my $cigar_string = $self->generate_cigar_string_by_hsp($hsp);
339 Function: generate a cigar
string by given HSP
object.
340 Args : a GenericHSP
object
341 Returns: a text
string of cigar
string
345 sub generate_cigar_string_by_hsp {
346 my ($self, $hsp) = @_;
348 unless(ref($hsp) && $hsp->isa(
'Bio::Search::HSP::GenericHSP')){
349 $self->throw(
"A GenericHSP object needed");
355 return $self->generate_cigar_string(\@qchars, \@hchars);