3 See the NOTICE file distributed with
this work
for additional information
4 regarding copyright ownership.
6 Licensed under the Apache License, Version 2.0 (the
"License");
7 you may not use
this file except in compliance with the License.
8 You may obtain a copy of the License at
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an
"AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License
for the specific language governing permissions and
16 limitations under the License.
23 Please email comments or questions to the
public Ensembl
24 developers list at <http:
26 Questions may also be sent to the Ensembl help desk at
34 implementation
for alignment features
43 -hseqname =>
'SP:RF1231',
46 -analysis => $analysis,
47 -cigar_string =>
'10M3D5M2I',
48 -align_type =>
'ensembl'
53 Alternatively
if you have an array of ungapped features:
60 There is a method to (re)create ungapped features from the cigar_string:
62 my @ungapped_features = $feat->ungapped_features();
66 Bio::EnsEMBL::BaseAlignFeature inherits from:
67 Bio::EnsEMBL::FeaturePair, which in turn inherits from:
68 Bio::EnsEMBL::Feature,
69 thus methods from both parent classes are available.
72 The cigar_string is a condensed representation of the matches and gaps
73 which make up the gapped alignment (where CIGAR stands for
74 Concise Idiosyncratic Gapped Alignment Report).
76 CIGAR format is: n <matches> [ x <deletes or inserts> m <matches> ]*
77 where M = match, D = delete, I = insert; n, m are match lengths;
78 x is delete or insert length.
80 Spaces are omitted, thus: "23M4I12M2D1M"
81 as are counts for any lengths of 1, thus 1M becomes M: "23M4I12M2DM"
84 To make things clearer this is how a blast HSP would be parsed:
91 Score = 76 (26.8 bits), Expect = 1.4, P = 0.74
92 Identities = 20/71 (28%), Positives = 29/71 (40%), Frame = -1
94 Query: 479 GLQAPPPTPQGCRLIPPPPLGLQAPLPTLRAVGSSHHHP*GRQGSSLSSFRSSLASKASA 300
95 G APPP PQG R P P G + P L + + ++ R +A +
96 Sbjct: 7 GALAPPPAPQG-RWAFPRPTG-KRPATPLHGTARQDRQVRRSEAAKVTGCRGRVAPHVAP 64
98 Query: 299 SSPHNPSPLPS 267
100 Sbjct: 65 PLTHTPTPTPT 75
102 The alignment goes from 479 down to 267 in the query sequence on the reverse
103 strand, and from 7 to 75 in the subject sequence.
105 The alignment is made up of the following ungapped pieces:
107 query_seq start 447 , sbjct_seq hstart 7 , match length 33 , strand -1
108 query_seq start 417 , sbjct_seq hstart 18 , match length 27 , strand -1
109 query_seq start 267 , sbjct_seq hstart 27 , match length 147 , strand -1
111 When assembled into a DnaPepAlignFeature where:
112 (seqname, start, end, strand) refer to the query sequence,
113 (hseqname, hstart, hend, hstrand) refer to the subject sequence,
114 these ungapped pieces are represented by the cigar string:
116 with start 267, end 479, strand -1, and hstart 7, hend 75, hstrand 1.
120 AlignFeature cigar strings have the opposite 'sense
'
121 ('D
' and 'I
' swapped) compared with Exonerate cigar strings.
123 Exonerate modules in Bio::EnsEMBL::Analysis use this convention:
125 The longer genomic sequence specified by:
127 AlignFeature: (sequence, start, end, strand)
129 A shorter sequence (such as EST or protein) specified by:
131 AlignFeature: (hsequence, hstart, hend, hstrand)
133 The resulting AlignFeature cigar strings have 'D
' and 'I
'
134 swapped compared with the Exonerate output, i.e.:
136 exonerate: M 123 D 1 M 11 I 1 M 39
137 AlignFeature: 123MI11MD39M
144 package Bio::EnsEMBL::BaseAlignFeature;
146 use Bio::EnsEMBL::FeaturePair;
147 use Bio::EnsEMBL::Utils::Argument qw(rearrange);
148 use Bio::EnsEMBL::Utils::Exception qw(throw warning);
153 @ISA = qw(Bio::EnsEMBL::FeaturePair);
158 Arg [..] : List of named arguments. (-cigar_string , -features, -align_type) defined
159 in this constructor, others defined in FeaturePair and
160 SeqFeature superclasses. Either cigar_string or a list
161 of ungapped features should be provided - not both.
162 Example : $baf = new BaseAlignFeatureSubclass(-cigar_string => '3M3I12M
', -align_type => 'ensembl
');
163 Description: Creates a new BaseAlignFeature using either a cigar string or
164 a list of ungapped features. BaseAlignFeature is an abstract
165 baseclass and should not actually be instantiated - rather its
166 subclasses should be.
167 Returntype : Bio::EnsEMBL::BaseAlignFeature
168 Exceptions : thrown if both feature and cigar string args are provided
169 thrown if neither feature nor cigar string args are provided
170 warn if cigar string is provided without cigar type
180 my $class = ref($caller) || $caller;
182 my $self = $class->SUPER::new(@_);
184 my ($cigar_string,$align_type,$features) = rearrange([qw(CIGAR_STRING ALIGN_TYPE FEATURES)], @_);
186 if (defined($align_type)) {
187 $self->{'align_type
'} = $align_type;
189 warning("No align_type provided, using ensembl as default");
190 $self->{'align_type
'} = 'ensembl
';
193 if (defined($cigar_string) && defined($features)) {
194 throw("CIGAR_STRING or FEATURES argument is required - not both.");
195 } elsif (defined($features)) {
196 $self->_parse_features($features);
198 } elsif (defined($cigar_string)) {
199 $self->{'cigar_string
'} = $cigar_string;
201 throw("CIGAR_STRING or FEATURES argument is required");
210 Arg [1] : string $cigar_string
211 Example : $feature->cigar_string( "12MI3M" );
212 Description: get/set for attribute cigar_string.
213 cigar_string describes the alignment:
214 "xM" stands for x matches (or mismatches),
215 "xI" for x inserts into the query sequence,
216 "xD" for x deletions from the query sequence
217 where the query sequence is specified by (seqname, start, ...)
218 and the subject sequence by (hseqname, hstart, ...).
219 An "x" that is 1 can be omitted.
220 See the SYNOPSIS for an example.
230 $self->{'cigar_string
'} = shift if(@_);
231 return $self->{'cigar_string
'};
237 Arg [1] : type $align_type
238 Example : $feature->align_type( "ensembl" );
239 Description: get/set for attribute align_type.
240 align_type specifies which cigar string
241 is used to describe the alignment:
242 The default is 'ensembl
'
252 $self->{'align_type
'} = shift if(@_);
253 return $self->{'align_type
'};
257 =head2 alignment_length
260 Description: return the alignment length (including indels) based on the alignment_type ('ensembl
', 'mdtag
')
268 sub alignment_length {
271 # ensembl: Internal CIGAR format
272 if ($self->{'align_type
'} eq 'ensembl
') {
273 return $self->_ensembl_cigar_alignment_length();
274 # mdtag: MD Z String for mismatching positions. Regex : [0-9]+(([A-Z]|\^[A-Z]+)[0-9]+)* (Refer: SAM/BAM specification)
275 } elsif ($self->{'align_type
'} eq 'mdtag
') {
276 return $self->_mdtag_alignment_length();
278 throw("No alignment_length method available for " . $self->{'align_type
'});
284 =head2 _ensembl_cigar_alignment_length
287 Description: return the alignment length (including indels) based on the cigar_string
295 sub _ensembl_cigar_alignment_length {
298 if (! defined $self->{'_alignment_length
'} && defined $self->cigar_string) {
300 my @pieces = ( $self->cigar_string =~ /(\d*[MDI])/g );
302 print STDERR "Error parsing cigar_string\n";
304 my $alignment_length = 0;
305 foreach my $piece (@pieces) {
306 my ($length) = ( $piece =~ /^(\d*)/ );
307 if (! defined $length || $length eq "") {
310 $alignment_length += $length;
312 $self->{'_alignment_length
'} = $alignment_length;
314 return $self->{'_alignment_length
'};
317 =head2 ungapped_features
320 Example : @ungapped_features = $align_feature->get_feature
321 Description: converts the internal cigar_string into an array of
322 ungapped feature pairs
323 Returntype : list of Bio::EnsEMBL::FeaturePair
324 Exceptions : cigar_string not set internally
330 sub ungapped_features {
333 if (!defined($self->{'cigar_string
'})) {
334 throw("No cigar_string defined. Can't
return ungapped features
");
337 return @{$self->_parse_cigar()};
340 =head2 strands_reversed
342 Arg [1] : int $strands_reversed
343 Description: get/set for attribute strands_reversed
344 0 means that strand and hstrand are the original strands obtained
345 from the alignment program used
346 1 means that strand and hstrand have been flipped as compared to
347 the original result provided by the alignment program used.
348 You may want to use the reverse_complement method to restore the
357 sub strands_reversed {
358 my ($self, $arg) = @_;
360 if ( defined $arg ) {
361 $self->{'strands_reversed'} = $arg ;
364 $self->{'strands_reversed'} = 0 unless (defined $self->{'strands_reversed'});
366 return $self->{'strands_reversed'};
370 =head2 reverse_complement
373 Description: reverse complement the FeaturePair based on the cigar type
374 modifing strand, hstrand and cigar_string in consequence
383 sub reverse_complement {
386 if ($self->{'align_type'} eq 'ensembl') {
387 return $self->_ensembl_reverse_complement();
389 throw("no reverse_complement method implemented
for " . $self->{'align_type'});
393 =head2 _ensembl_reverse_complement
396 Description: reverse complement the FeaturePair for ensembl cigar string,
397 modifing strand, hstrand and cigar_string in consequence
405 sub _ensembl_reverse_complement {
408 # reverse strand in both sequences
409 $self->strand($self->strand * -1);
410 $self->hstrand($self->hstrand * -1);
412 # reverse cigar_string as consequence
413 my $cigar_string = $self->cigar_string;
414 $cigar_string =~ s/(D|I|M)/$1 /g;
415 my @cigar_pieces = split / /,$cigar_string;
417 while (my $piece = pop @cigar_pieces) {
418 $cigar_string .= $piece;
421 $self->{'strands_reversed'} = 0 unless (defined $self->{'strands_reversed'});
423 if ($self->strands_reversed) {
424 $self->strands_reversed(0)
426 $self->strands_reversed(1);
429 $self->cigar_string($cigar_string);
436 Arg 1 : String $coordinate_system_name
437 Arg [2] : String $coordinate_system_version
438 Example : $feature = $feature->transform('contig');
439 $feature = $feature->transform('chromosome', 'NCBI33');
440 Description: Moves this AlignFeature to the given coordinate system.
441 If the feature cannot be transformed to the destination
442 coordinate system undef is returned instead.
443 Returntype : Bio::EnsEMBL::BaseAlignFeature;
444 Exceptions : wrong parameters
453 my $new_feature = $self->SUPER::transform(@_);
454 if ( !defined($new_feature)
455 || $new_feature->length() != $self->length() )
457 my @segments = @{ $self->project(@_) };
464 foreach my $f ( $self->ungapped_features() ) {
465 $f = $f->transform(@_);
467 push( @ungapped, $f );
469 warning( "Failed to transform alignment feature;
"
470 . "ungapped component could not be transformed
" );
475 eval { $new_feature = $self->new( -features => \@ungapped ); };
481 } ## end if ( !defined($new_feature...))
487 =head2 _parse_ensembl_cigar
490 Description: PRIVATE (internal) method - creates ungapped features from
491 internally stored cigar line in ensembl format
492 Returntype : list of Bio::EnsEMBL::FeaturePair
494 Caller : ungapped_features
499 sub _parse_ensembl_cigar {
502 my $query_unit = $self->_query_unit();
503 my $hit_unit = $self->_hit_unit();
505 my $string = $self->{'cigar_string'};
507 throw("No cigar
string defined in
object") if(!defined($string));
509 my @pieces = ( $string =~ /(\d*[MDI])/g );
512 my $strand1 = $self->{'strand'} || 1;
513 my $strand2 = $self->{'hstrand'}|| 1;
515 my ( $start1, $start2 );
517 if( $strand1 == 1 ) {
518 $start1 = $self->{'start'};
520 $start1 = $self->{'end'};
523 if( $strand2 == 1 ) {
524 $start2 = $self->{'hstart'};
526 $start2 = $self->{'hend'};
530 # Construct ungapped blocks as FeaturePairs objects for each MATCH
532 foreach my $piece (@pieces) {
534 my ($length) = ( $piece =~ /^(\d*)/ );
535 if( $length eq "" ) { $length = 1 }
538 # explicit if statements to avoid rounding problems
539 # and make sure we have sane coordinate systems
540 if( $query_unit == 1 && $hit_unit == 3 ) {
541 $mapped_length = $length*3;
542 } elsif( $query_unit == 3 && $hit_unit == 1 ) {
543 $mapped_length = $length / 3;
544 } elsif ( $query_unit == 1 && $hit_unit == 1 ) {
545 $mapped_length = $length;
547 throw("Internal error $query_unit $hit_unit, currently only
" .
551 if( int($mapped_length) != $mapped_length and
552 ($piece =~ /M$/ or $piece =~ /D$/)) {
553 throw("Internal error with mismapped length of hit, query
" .
554 "$query_unit, hit $hit_unit, length $length
");
557 if( $piece =~ /M$/ ) {
561 my ( $qstart, $qend);
562 if( $strand1 == 1 ) {
564 $qend = $start1 + $length - 1;
568 $qstart = $start1 - $length + 1;
569 $start1 = $qstart - 1;
573 if( $strand2 == 1 ) {
575 $hend = $start2 + $mapped_length - 1;
579 $hstart = $start2 - $mapped_length + 1;
580 $start2 = $hstart - 1;
584 push @features, Bio::EnsEMBL::FeaturePair->new
585 (-SLICE => $self->{'slice'},
586 -SEQNAME => $self->{'seqname'},
590 -HSLICE => $self->{'hslice'},
591 -HSEQNAME => $self->{'hseqname'},
594 -HSTRAND => $strand2,
595 -SCORE => $self->{'score'},
596 -PERCENT_ID => $self->{'percent_id'},
597 -ANALYSIS => $self->{'analysis'},
598 -P_VALUE => $self->{'p_value'},
599 -EXTERNAL_DB_ID => $self->{'external_db_id'},
600 -HCOVERAGE => $self->{'hcoverage'},
601 -GROUP_ID => $self->{'group_id'},
602 -LEVEL_ID => $self->{'level_id'});
606 } elsif( $piece =~ /I$/ ) {
610 if( $strand1 == 1 ) {
615 } elsif( $piece =~ /D$/ ) {
619 if( $strand2 == 1 ) {
620 $start2 += $mapped_length;
622 $start2 -= $mapped_length;
625 throw( "Illegal cigar line $string!
" );
636 if ($self->{'align_type'} eq 'ensembl') {
637 return $self->_parse_ensembl_cigar();
640 throw("No parsing method implemented
for " . $self->{'align_type'});
645 =head2 _parse_features
647 Arg [1] : listref Bio::EnsEMBL::FeaturePair $ungapped_features
648 Description: creates internal cigar_string and start,end hstart,hend
650 Returntype : none, fills in values of self
651 Exceptions : argument list undergoes many sanity checks - throws under many
658 sub _parse_features {
659 my ($self, $features) = @_;
660 if ($self->{'align_type'} eq 'ensembl') {
661 $self->_parse_ensembl_features($features);
663 throw("No _parse_features method implemented
for " . $self->{'align_type'});
667 my $message_only_once = 1;
669 sub _parse_ensembl_features {
670 my ($self,$features ) = @_;
672 if (ref($features) ne "ARRAY
") {
673 throw("features must be an array reference not a [
".ref($features)."]
");
674 } elsif (scalar(@$features) == 0) {
675 throw("features array must not be empty
");
678 my $query_unit = $self->_query_unit();
679 my $hit_unit = $self->_hit_unit();
681 my $strand = $features->[0]->strand;
683 throw ('FeaturePair needs to have strand == 1 or strand == -1') if(!$strand);
688 # Sort the features on their start position
689 # Ascending order on positive strand, descending on negative strand
692 @f = sort {$a->start <=> $b->start} @$features;
694 @f = sort { $b->start <=> $a->start} @$features;
697 my $hstrand = $f[0]->hstrand;
698 my $slice = $f[0]->slice();
699 my $hslice = $f[0]->hslice();
700 my $name = $slice ? $slice->name() : undef;
701 my $hname = $f[0]->hseqname;
702 my $score = $f[0]->score;
703 my $percent = $f[0]->percent_id;
704 my $analysis = $f[0]->analysis;
705 my $pvalue = $f[0]->p_value();
706 my $external_db_id = $f[0]->external_db_id;
707 my $hcoverage = $f[0]->hcoverage;
708 my $group_id = $f[0]->group_id;
709 my $level_id = $f[0]->level_id;
711 my $seqname = $f[0]->seqname;
712 # implicit strand 1 for peptide sequences
715 my $ori = $strand * $hstrand;
717 throw("No features in the array to parse
") if(scalar(@f) == 0);
719 my $prev1; # where last feature q part ended
720 my $prev2; # where last feature s part ended
724 # Use strandedness info of query and hit to make sure both sets of
725 # start and end coordinates are oriented the right way around.
732 $f1start = $f[0]->start;
733 $f1end = $f[-1]->end;
735 $f1start = $f[-1]->start;
740 $f2start = $f[0]->hstart;
741 $f2end = $f[-1]->hend;
743 $f2start = $f[-1]->hstart;
744 $f2end = $f[0]->hend;
748 # Loop through each portion of alignment and construct cigar string
759 if( defined($f->hstrand()) && $f->hstrand() != $hstrand ) {
760 throw("Inconsistent hstrands in feature array
");
762 if( defined($f->strand()) && ($f->strand != $strand)) {
763 throw("Inconsistent strands in feature array
");
765 if ( defined($name) && $name ne $f->slice->name()) {
766 throw("Inconsistent names in feature array [$name -
".
767 $f->slice->name()."]
");
769 if ( defined($hname) && $hname ne $f->hseqname) {
770 throw("Inconsistent hit names in feature array [$hname -
".
773 if ( defined($score) && $score ne $f->score) {
774 throw("Inconsisent scores in feature array [$score -
" .
777 if (defined($f->percent_id) && $percent ne $f->percent_id) {
778 throw("Inconsistent pids in feature array [$percent -
" .
779 $f->percent_id . "]
");
781 if(defined($pvalue) && $pvalue != $f->p_value()) {
782 throw("Inconsistant p_values in feature arraw [$pvalue
" .
783 $f->p_value() . "]
");
785 if($seqname && $seqname ne $f->seqname){
786 throw("Inconsistent seqname in feature array [$seqname -
".
789 my $start1 = $f->start; #source sequence alignment start
790 my $start2 = $f->hstart(); #hit sequence alignment start
793 # More sanity checking
795 if (defined($prev1)) {
796 if ( $strand == 1 ) {
797 if ($f->start < $prev1) {
798 throw("Inconsistent coords in feature array (forward strand).\n
" .
799 "Start [
".$f->start()."] in current feature should be greater
" .
800 "than previous feature end [$prev1].
");
803 if ($f->end > $prev1) {
804 throw("Inconsistent coords in feature array (reverse strand).\n
" .
805 "End [
".$f->end() ."] should be less than previous feature
" .
811 my $length = ($f->end - $f->start + 1); #length of source seq alignment
812 my $hlength = ($f->hend - $f->hstart + 1); #length of hit seq alignment
814 # using multiplication to avoid rounding errors, hence the
815 # switch from query to hit for the ratios
818 # Yet more sanity checking
820 if($query_unit > $hit_unit){
821 # I am going to make the assumption here that this situation will
822 # only occur with DnaPepAlignFeatures, this may not be true
823 my $query_p_length = sprintf "%.0f
", ($length/$query_unit);
824 my $hit_p_length = sprintf "%.0f
", ($hlength * $hit_unit);
825 if( $query_p_length != $hit_p_length) {
826 throw( "Feature lengths not comparable Lengths:
" .$length .
827 " " . $hlength . " Ratios:
" . $query_unit . " " .
831 my $query_d_length = sprintf "%.0f
", ($length*$hit_unit);
832 my $hit_d_length = sprintf "%.0f
", ($hlength * $query_unit);
833 if( $length * $hit_unit != $hlength * $query_unit ) {
834 throw( "Feature lengths not comparable Lengths:
" . $length .
835 " " . $hlength . " Ratios:
" . $query_unit . " " .
840 my $hlengthfactor = ($query_unit/$hit_unit);
843 # Check to see if there is an I type (insertion) gap:
844 # If there is a space between the end of the last source sequence
845 # alignment and the start of this one, then this is an insertion
848 my $insertion_flag = 0;
850 if( ( defined $prev1 ) && ( $f->start > $prev1 + 1 )) {
852 #there is an insertion
854 my $gap = $f->start - $prev1 - 1;
856 $gap = ""; # no need for a number if gap length is 1
858 $string .= "$gap
"."I
";
862 #shift our position in the source seq alignment
866 if(( defined $prev1 ) && ($f->end + 1 < $prev1 )) {
868 #there is an insertion
870 my $gap = $prev1 - $f->end() - 1;
872 $gap = ""; # no need for a number if gap length is 1
874 $string .= "$gap
"."I
";
877 #shift our position in the source seq alignment
878 $prev1 = $f->start();
882 # Check to see if there is a D type (deletion) gap
883 # There is a deletion gap if there is a space between the end of the
884 # last portion of the hit sequence alignment and this one
886 if( $hstrand == 1 ) {
887 if(( defined $prev2 ) && ( $f->hstart() > $prev2 + 1 )) {
890 my $gap = $f->hstart - $prev2 - 1;
891 my $gap2 = int( $gap * $hlengthfactor + 0.5 );
894 $gap2 = ""; # no need for a number if gap length is 1
896 $string .= "$gap2
"."D
";
898 #sanity check, Should not be an insertion and deletion
899 if($insertion_flag) {
900 if ($message_only_once) {
901 warning("Should not be an deletion and insertion on the
" .
902 "same alignment region. cigar_line=$string\n
");
903 $message_only_once = 0;
907 #shift our position in the hit seq alignment
911 if( ( defined $prev2 ) && ( $f->hend() + 1 < $prev2 )) {
914 my $gap = $prev2 - $f->hend - 1;
915 my $gap2 = int( $gap * $hlengthfactor + 0.5 );
918 $gap2 = ""; # no need for a number if gap length is 1
920 $string .= "$gap2
"."D
";
922 #sanity check, Should not be an insertion and deletion
923 if($insertion_flag) {
924 if ($message_only_once) {
925 warning("Should not be an deletion and insertion on the
" .
926 "same alignment region. prev2 = $prev2; f->hend() =
" .
927 $f->hend() . "; cigar_line = $string;\n
");
928 $message_only_once = 0;
932 #shift our position in the hit seq alignment
933 $prev2 = $f->hstart();
936 my $matchlength = $f->end() - $f->start() + 1;
937 if( $matchlength == 1 ) {
940 $string .= $matchlength."M
";
943 $self->{'start'} = $f1start;
944 $self->{'end'} = $f1end;
945 $self->{'seqname'} = $seqname;
946 $self->{'strand'} = $strand;
947 $self->{'score'} = $score;
948 $self->{'percent_id'} = $percent;
949 $self->{'analysis'} = $analysis;
950 $self->{'slice'} = $slice;
951 $self->{'hslice'} = $hslice;
952 $self->{'hstart'} = $f2start;
953 $self->{'hend'} = $f2end;
954 $self->{'hstrand'} = $hstrand;
955 $self->{'hseqname'} = $hname;
956 $self->{'cigar_string'} = $string;
957 $self->{'p_value'} = $pvalue;
958 $self->{'external_db_id'} = $external_db_id;
959 $self->{'hcoverage'} = $hcoverage;
960 $self->{'group_id'} = $group_id;
961 $self->{'level_id'} = $level_id;
972 Description: abstract method, overwrite with something that returns
983 throw( "Abstract method call!
" );
991 Description: abstract method, overwrite with something that returns
1002 throw( "Abstract method call!
" );
1005 =head2 _mdtag_alignment_length
1008 Description: return the alignment length (including indels) based on the mdtag (mdz) string
1016 sub _mdtag_alignment_length {
1019 if (! defined $self->{'_alignment_length'} && defined $self->cigar_string) {
1020 my $mdz_string = $self->cigar_string;
1021 my $chunks = $self->_get_mdz_chunks($mdz_string);
1022 my $alignment_length = 0;
1023 $alignment_length = $self->_get_mdz_alignment_length($chunks);
1024 $self->{'_alignment_length'} = $alignment_length;
1026 return $self->{'_alignment_length'};
1029 =head2 _get_mdz_chunks
1031 Arg [1] : mdtag string - MD Z String for mismatching positions. Regex : [0-9]+(([A-Z]|\^[A-Z]+)[0-9]+)* (Refer: SAM/BAM specification)
1032 Description: parses the mdtag string and group it according the type
1033 eg: MD:Z:35^VIVALE31^GRPLIQPRRKKAYQLEHTFQGLLGKRSLFTE10 returns ['35', '^', 'VIVALE', '31', '^', 'GRPLIQPRRKKAYQLEHTFQGLLGKRSLFTE', '10']
1034 Returntype : array of strings
1040 sub _get_mdz_chunks {
1043 my $mdz_string = shift;
1044 my $alignment_length = 0;
1046 if ( $mdz_string =~ /^MD:Z:(.+)/ ){
1049 #if start to end is a number then all match return as it is
1050 if($mdtag =~/^\d+$/){
1053 my @char_arr = split('',$mdtag);
1054 # Set up for the first loop, this also will handle
1055 # if the array is size one, as we'll have a character
1056 # in the current_chunk to push on the pile at the end.
1057 my $prev_type = $self->_get_mdz_chunk_type( $char_arr[0]);
1058 my $current_chunk = $char_arr[0];
1059 for(my $i = 1; $i <= $#char_arr; $i++) {
1060 my $cur_type = $self->_get_mdz_chunk_type( $char_arr[$i] );
1061 if($cur_type ne $prev_type) {
1062 # We've found a new character class, push the
1063 # current chunk on to the list
1064 push @chunks, $current_chunk;
1065 # Restart the current pile
1066 $current_chunk = $char_arr[$i];
1068 # Same character class, put it on the current pile
1069 $current_chunk .= $char_arr[$i];
1071 # Shift current type in to the prior slot
1072 $prev_type = $cur_type;
1075 # Post loop cleanup of the last piece
1076 # This also handles an array of size one where
1077 # we never entered the loop, that one element
1078 # now gets pushed on the pile.
1079 push @chunks, $current_chunk;
1085 =head2 _get_mdz_alignment_length
1087 Arg [1] : array of strings
1088 Description: calculate the alignment length from the given chunks
1089 Returntype : array of strings
1096 sub _get_mdz_alignment_length{
1101 for(my $i=0; $i< scalar(@$chunks); $i++){
1102 my $chunk = $chunks->[$i];
1103 my $type = $self->_get_mdz_chunk_type($chunk);
1106 $length = $length + $chunk;
1107 }elsif($type eq 'alpha'){
1108 $length = $length + length($chunk);
1109 }elsif($type eq 'del'){
1118 =head2 _get_mdz_chunk_type
1121 Description: get the chunk type
1129 sub _get_mdz_chunk_type{
1136 } elsif($char =~ /\d+/){
1138 } elsif($char =~ /\w+/){
1145 =head2 _mdz_alignment_string
1147 Arg [1] : input sequence
1148 Arg [2] : MD Z String for mismatching positions. Regex : [0-9]+(([A-Z]|\^[A-Z]+)[0-9]+)* (Refer: SAM/BAM specification)
1149 eg: MD:Z:96^RHKTDSFVGLMGKRALNS0V14
1150 Example : $pf->alignment_strings
1151 Description: Allows to rebuild the alignment string of both the seq and hseq sequence
1152 Returntype : array reference containing 2 strings
1153 the first corresponds to seq
1154 the second corresponds to hseq
1161 # Try to get the sequence from translation object
1162 # mdz_string from current object
1163 sub _mdz_alignment_string {
1164 my ($self, $input_seq, $mdz_string) = @_;
1166 my $chunks = $self->_get_mdz_chunks($mdz_string);
1168 my $target_seq = "";
1171 #Handle first chunk, which is always a num
1172 my $first_chunk = $chunks->[0];
1173 my $first_chunk_type = $self->_get_mdz_chunk_type($first_chunk);
1176 if($first_chunk_type eq 'num'){
1177 if($first_chunk > 0){
1178 $target_seq = substr($input_seq, $offset, $first_chunk);
1180 #query and target are same at this point
1181 $query_seq = $target_seq;
1182 $offset = length($target_seq);
1185 die "First chunk should always be a num...something wrong\n
";
1188 for(my $i=1; $i < scalar(@$chunks); $i++){
1190 my $chunk = $chunks->[$i];
1191 my $chunk_type = $self->_get_mdz_chunk_type($chunk);
1193 if($chunk_type eq 'num'){
1194 #if 0, what follows next is a INSERT
1196 my $insert_chunk = $chunks->[++$i];
1197 my $insert_chunk_type = $self->_get_mdz_chunk_type($insert_chunk);
1199 #insert_chunk_type is always alpha
1200 die if $insert_chunk_type eq "num
";
1201 if($insert_chunk_type eq 'del'){
1205 $target_seq = $target_seq . substr($input_seq, $offset, length($insert_chunk));
1206 $query_seq = $query_seq . $insert_chunk;
1208 $offset = $offset + length($insert_chunk);
1211 #if non-0, then it is a match
1212 my $match_chunk = $chunks->[$i];
1213 my $match_chunk_type = $self->_get_mdz_chunk_type($match_chunk);
1215 #$match_chunk_type is always num
1216 die if $match_chunk_type ne "num
";
1218 my $match_target_string = substr($input_seq, $offset, $match_chunk);
1220 $target_seq = $target_seq . $match_target_string;
1221 $query_seq = $query_seq . $match_target_string;
1223 $offset = $offset + $match_chunk;
1226 }elsif($chunk_type eq 'alpha'){
1227 my $insert_chunk = $chunk;
1228 my $insert_chunk_type = $self->_get_mdz_chunk_type($insert_chunk);
1229 die if $insert_chunk_type ne "alpha
";
1231 $target_seq = $target_seq . substr($input_seq, $offset, length($insert_chunk));
1232 $query_seq = $query_seq . $insert_chunk;
1234 $offset = $offset + length($insert_chunk);
1236 }elsif($chunk_type eq 'del'){
1237 #get next chunk which contains the deleted sequence
1238 my $del_chunk = $chunks->[++$i];
1239 my $del_chunk_type = $self->_get_mdz_chunk_type($del_chunk);
1241 #del_chunk_type is always alpha
1242 die if $del_chunk_type ne 'alpha';
1244 $target_seq = $target_seq . "-
" x length($del_chunk);
1245 $query_seq = $query_seq . $del_chunk;
1247 #no change in offset
1255 return [$target_seq, $query_seq];