ensembl-hive  2.8.1
CigarString.pm
Go to the documentation of this file.
1 =head1 LICENSE
2 
3 See the NOTICE file distributed with this work for additional information
4 regarding copyright ownership.
5 
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9 
10  http://www.apache.org/licenses/LICENSE-2.0
11 
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 
18 =cut
19 
20 
21 =head1 CONTACT
22 
23  Please email comments or questions to the public Ensembl
24  developers list at <http://lists.ensembl.org/mailman/listinfo/dev>.
25 
26  Questions may also be sent to the Ensembl help desk at
27  <http://www.ensembl.org/Help/Contact>.
28 
29 =head1 AUTHOR
30 
31 Juguang Xiao <juguang@tll.org.sg>
32 
33 =cut
34 
35 =head1 NAME
36 
37 Bio::EnsEMBL::Utils::CigarString, a utilites module to generate cigar
38 strings
39 
40 =head1 DESCRIPTION
41 
42 Sequence alignment hits were previously stored within the core database
43 as ungapped alignments. This imposed 2 major constraints on alignments:
44 
45 a) alignments for a single hit record would require multiple rows in the
46  database, and
47 b) it was not possible to accurately retrieve the exact original
48  alignment.
49 
50 Therefore, in the new branch sequence alignments are now stored as
51 ungapped alignments in the cigar line format (where CIGAR stands for
52 Concise Idiosyncratic Gapped Alignment Report).
53 
54 In the cigar line format alignments are stored as follows:
55 
56  M: Match
57  D: Deletion
58  I: Insertion
59 
60 An example of an alignment for a hypthetical protein match is shown
61 below:
62 
63 
64  Query: 42 PGPAGLP----GSVGLQGPRGLRGPLP-GPLGPPL...
65  PG P G GP R PLGP
66  Sbjct: 1672 PGTP*TPLVPLGPWVPLGPSSPR--LPSGPLGPTD...
67 
68 This would be stored in the protein_align_feature table as the following
69 cigar line:
70 
71  7M4D12M2I2MD7M
72 
73 =cut
74 
75 package Bio::EnsEMBL::Utils::CigarString;
76 
77 use strict;
78 use vars qw(@ISA);
79 
80 =head2 split_hsp
81 
82  Name : split_hsp (this name is derived from the original sub in BlastWorn)
83  Usage : my $hsp; # a ready Bio::Search::HSP::GenericHSP object.
84 my $factory = new Bio::EnsEMBL::Utils::CigarString;
85 my $cigar_string = $factory->split_hsp($hsp);
86 
87  Function: generate cigar string.
88  Argument: a HSP object.
89  Returns : a text string.
90 
91 =cut
92 
93 sub split_hsp {
94  my ($self, $hsp) = @_;
95 
96  $self->throw("a defined object needed") unless($hsp && defined($hsp));
97  unless(ref($hsp) && $hsp->isa('Bio::Search::HSP::GenericHSP')){
98  $self->throw("a HSP object needed");
99  }
100 
101  my ($qtype, $htype) = $self->_findTypes($hsp);
102  my ($qstrand, $hstrand) = $self->_findStrands($hsp);
103  my ($qinc, $hinc) = $self->_findIncrements($qstrand,$hstrand,$qtype,$htype);
104 
105  my @gaps = ();
106  my @qchars = split(//, $hsp->query_string);
107  my @hchars = split(//, $hsp->hit_string);
108  my $qstart;
109  if($qstrand == 1){
110  $qstart = $hsp->query->start;
111  }elsif($qstart == -1){
112  $qstart = $hsp->query->end;
113  }else{
114  $self->warn("[$qstart], invalid strand value on query");
115  $qstart = $hsp->query->start;
116  # Is this a SearchIO's bug???
117  }
118 
119  my $hstart;
120  if($hstrand == 1){
121  $hstart = $hsp->subject->start;
122  }elsif($hstrand != -1){
123  $hstart = $hsp->subject->end;
124  }else{
125  $self->throw("[$hstart], invalid strand value on subject");
126  }
127 
128  my $qend = $qstart;
129  my $hend = $hstart;
130  my $count = 0;
131  my $found = 0;
132 
133  my @align_coordinates = ();
134  while($count <= $#qchars){
135  if($qchars[$count] ne '-' && $hchars[$count] ne '-') {
136  $qend += $qinc;
137  $hend += $hinc;
138  $found = 1;
139  }else{ # gapped region
140  push(@align_coordinates, [$qstart, $hstart]) if($found == 1);
141 
142  $qstart = $qend;
143  $qstart += $qinc if($qchars[$count] ne '-');
144 
145  $hstart = $hend;
146  $hstart += $hinc if($hchars[$count] ne '-');
147 
148  $qend = $qstart;
149  $hend = $hstart;
150  $found = 0;
151  }
152  $count++;
153  }
154 
155  if($found){
156  push(@align_coordinates, [$qstart, $hstart]);
157  }
158 
159  my $cigar_string = "";
160  my $last = $#align_coordinates;
161  if($last >= 0){
162  for(my $i=0; $i<$last; $i++){
163  my $q_this_start = $align_coordinates[$i]->[0];
164  my $q_next_start = $align_coordinates[$i+1]->[0];
165  my $q_length = ($q_next_start-$q_this_start-1)*$qinc;
166  $q_length = abs($q_length);
167  my $h_this_start = $align_coordinates[$i]->[1];
168  my $h_next_start = $align_coordinates[$i+1]->[1];
169  my $h_length = ($h_next_start-$h_this_start-1)*$hinc;
170  $h_length = abs($h_length);
171 
172  my $diff = $q_length - $h_length;
173  if($diff > 0){ # Insertion
174  $cigar_string .= $diff unless($diff == 1);
175  $cigar_string .= 'I';
176  }elsif($diff < 0){ # Deletion
177  $cigar_string .= -$diff unless($diff == -1);
178  $cigar_string .= 'D';
179  }else{ # e.g $diff == 0, Match
180  $cigar_string .= $q_length unless($q_length == 1);
181  $cigar_string .= 'M';
182  }
183 
184  } # for
185  } # if
186 
187  return $cigar_string;
188 }
189 
190 
191 sub _findStrands {
192  my ($self,$hsp) = @_;
193 
194  my $qstrand = $hsp->query->strand;
195  unless($qstrand == 1 || $qstrand == -1){
196  $self->warn("query's strand value is neither 1 or -1");
197  $qstrand = 1;
198  }
199 
200  my $hstrand = $hsp->subject->strand;
201  unless($hstrand == 1 || $hstrand == -1){
202  $self->warn("subject's strand value is neither 1 or -1");
203  $hstrand = 1;
204  }
205 
206  return ( $qstrand, $hstrand);
207 }
208 
209 sub _findTypes {
210  my ($self,$hsp) = @_;
211 
212  my $type1;
213  my $type2;
214  my $len1 = $hsp->query->length();
215  my $len2 = $hsp->subject->length();
216 
217  if ($len1/$len2 > 2) {
218  $type1 = 'dna';
219  $type2 = 'pep';
220  } elsif ($len2/$len1 > 2) {
221  $type1 = 'pep';
222  $type2 = 'dna';
223  } else {
224  $type1 = 'dna';
225  $type2 = 'dna';
226  }
227 
228  return ($type1,$type2);
229 }
230 
231 sub _findIncrements {
232  my ($self,$qstrand,$hstrand,$qtype,$htype) = @_;
233 
234  my $qinc = 1 * $qstrand;
235  my $hinc = 1 * $hstrand;
236 
237  if ($qtype eq 'dna' && $htype eq 'pep') {
238  $qinc = 3 * $qinc;
239  }
240  if ($qtype eq 'pep' && $htype eq 'dna') {
241  $hinc = 3 * $hinc;
242  }
243 
244  return ($qinc,$hinc);
245 }
246 
247 # This is a core logic of cigar string. The finite state machine theory is
248 # apply. See the below table, x-axis represents the input, with 3 options:
249 # (+/+) -- Both current query and subject bases are non-gap. Match
250 # (-/+) -- The current query base is gap, but subject not. Deletion
251 # (+/-) -- The current subject base is gap, but query not. Insertion
252 # While the y-axis means the current state with letter 'M', 'D', 'I'
253 #
254 # The content of this table is the action taken in response of the input and
255 # the current state.
256 # R remain the state, counter increment.
257 # G;X generate the cigar line based on the current state and counter;
258 # clear the counter to zero and change to the state X
259 #
260 # || +/+ | -/+ | +/- |
261 # -------+----------------------+
262 # M || R | G;D | G;I |
263 # ------------------------------+
264 # D || G;M | R | G;I |
265 # ------------------------------+
266 # I || G;M | G;D | R |
267 #
268 
269 =head2 generate_cigar_string
270 
271  Name : generate_cigar_string
272  Usage: $cigar_string = $self->generate_cigar_string(\@qchars, \@hchars);
273  Function: generate the cigar string for a piece of alignment.
274  Args: 2 array references. The lengths of 2 arrays are the same
275  Return: a cigar string
276 
277 =cut
278 
279 # Developer's Note: The method is originally abstracted from the concept of
280 # cigar string. It only asks the essential information of 2 sequence characters
281 # of the alignment, while the BlastWorn::split_HSP asks more unused information
282 # for cigar string, which is useful to form align_coordinates. - Juguang
283 
284 my ($count, $state); # strictly only used in the following 2 subs
285 
286 sub generate_cigar_string {
287 
288 # my ($self, $qstart, $hstart, $qinc, $hinc, $qchars_ref, $hchars_ref) = @_;
289 
290  my ($self, $qchars_ref, $hchars_ref) = @_;
291 
292  my @qchars = @{$qchars_ref};
293  my @hchars = @{$hchars_ref};
294 
295  unless(scalar(@qchars) == scalar(@hchars)){
296  $self->throw("two sequences are not equal in lengths");
297  }
298 
299  $count = 0;
300  $state = 'M'; # the current state of gaps, (M, D, I)
301 
302  my $cigar_string = '';
303  for(my $i=0; $i <= $#qchars; $i++){
304  my $qchar = $qchars[$i];
305  my $hchar = $hchars[$i];
306  if($qchar ne '-' && $hchar ne '-'){ # Match
307  $cigar_string .= $self->_sub_cigar_string('M');
308  }elsif($qchar eq '-'){ # Deletion
309  $cigar_string .= $self->_sub_cigar_string('D');
310  }elsif($hchar eq '-'){ # Insertion
311  $cigar_string .= $self->_sub_cigar_string('I');
312  }else{
313  $self->throw("Impossible state that 2 gaps on each seq aligned");
314  }
315  }
316  $cigar_string .= $self->_sub_cigar_string('X'); # not forget the tail.
317  return $cigar_string;
318 }
319 
320 sub _sub_cigar_string {
321  my ($self, $new_state) = @_;
322  my $sub_cigar_string = '';
323  if($state eq $new_state){
324  $count++; # Remain the state and increase the counter
325  }else{
326  $sub_cigar_string .= $count unless $count == 1;
327  $sub_cigar_string .= $state;
328  $count = 1;
329  $state = $new_state;
330  }
331  return $sub_cigar_string;
332 }
333 
334 =head2 generate_cigar_string_by_hsp
335 
336  Name : generate_cigar_string_by_hsp
337  Usage : my $hsp; # a ready GenericHSP object
338 my $cigar_string = $self->generate_cigar_string_by_hsp($hsp);
339  Function: generate a cigar string by given HSP object.
340  Args : a GenericHSP object
341  Returns: a text string of cigar string
342 
343 =cut
344 
345 sub generate_cigar_string_by_hsp {
346  my ($self, $hsp) = @_;
347 
348  unless(ref($hsp) && $hsp->isa('Bio::Search::HSP::GenericHSP')){
349  $self->throw("A GenericHSP object needed");
350  }
351 
352  my @qchars = split(//, $hsp->query_string);
353  my @hchars = split(//, $hsp->hit_string);
354 
355  return $self->generate_cigar_string(\@qchars, \@hchars);
356 }
357 
358 1;
Bio::EnsEMBL::Utils::CigarString
Definition: CigarString.pm:38
Bio::EnsEMBL::Utils::CigarString::split_hsp
public A split_hsp()
Bio::EnsEMBL::Utils::Sequence
Definition: Sequence.pm:22
Bio::EnsEMBL::Utils::Argument
Definition: Argument.pm:34