my ($self,$features ) = @_;
if (ref($features) ne "ARRAY") {
throw("features must be an array reference not a [".ref($features)."]");
} elsif (scalar(@$features) == 0) {
throw("features array must not be empty");
}
my $query_unit = $self->_query_unit();
my $hit_unit = $self->_hit_unit();
my $strand = $features->[0]->strand;
throw ('FeaturePair needs to have strand == 1 or strand == -1') if(!$strand);
my @f;
#
# Sort the features on their start position
# Ascending order on positive strand, descending on negative strand
#
if( $strand == 1 ) {
@f = sort {$a->start <=> $b->start} @$features;
} else {
@f = sort { $b->start <=> $a->start} @$features;
}
my $hstrand = $f[0]->hstrand;
my $slice = $f[0]->slice();
my $hslice = $f[0]->hslice();
my $name = $slice ? $slice->name() : undef;
my $hname = $f[0]->hseqname;
my $score = $f[0]->score;
my $percent = $f[0]->percent_id;
my $analysis = $f[0]->analysis;
my $pvalue = $f[0]->p_value();
my $external_db_id = $f[0]->external_db_id;
my $hcoverage = $f[0]->hcoverage;
my $group_id = $f[0]->group_id;
my $level_id = $f[0]->level_id;
my $seqname = $f[0]->seqname;
# implicit strand 1 for peptide sequences
$strand ||= 1;
$hstrand ||= 1;
my $ori = $strand * $hstrand;
throw("No features in the array to parse") if(scalar(@f) == 0);
my $prev1; # where last feature q part ended
my $prev2; # where last feature s part ended
my $string;
# Use strandedness info of query and hit to make sure both sets of
# start and end coordinates are oriented the right way around.
my $f1start;
my $f1end;
my $f2end;
my $f2start;
if ($strand == 1) {
$f1start = $f[0]->start;
$f1end = $f[-1]->end;
} else {
$f1start = $f[-1]->start;
$f1end = $f[0]->end;
}
if ($hstrand == 1) {
$f2start = $f[0]->hstart;
$f2end = $f[-1]->hend;
} else {
$f2start = $f[-1]->hstart;
$f2end = $f[0]->hend;
}
#
# Loop through each portion of alignment and construct cigar string
#
foreach my $f (@f) {
#
# Sanity checks
#
if (!$f->isa("Bio::EnsEMBL::FeaturePair")) {
throw("Array element [$f] is not a Bio::EnsEMBL::FeaturePair");
}
if( defined($f->hstrand()) && $f->hstrand() != $hstrand ) {
throw("Inconsistent hstrands in feature array");
}
if( defined($f->strand()) && ($f->strand != $strand)) {
throw("Inconsistent strands in feature array");
}
if ( defined($name) && $name ne $f->slice->name()) {
throw("Inconsistent names in feature array [$name - ".
$f->slice->name()."]");
}
if ( defined($hname) && $hname ne $f->hseqname) {
throw("Inconsistent hit names in feature array [$hname - ".
$f->hseqname . "]");
}
if ( defined($score) && $score ne $f->score) {
throw("Inconsisent scores in feature array [$score - " .
$f->score . "]");
}
if (defined($f->percent_id) && $percent ne $f->percent_id) {
throw("Inconsistent pids in feature array [$percent - " .
$f->percent_id . "]");
}
if(defined($pvalue) && $pvalue != $f->p_value()) {
throw("Inconsistant p_values in feature arraw [$pvalue " .
$f->p_value() . "]");
}
if($seqname && $seqname ne $f->seqname){
throw("Inconsistent seqname in feature array [$seqname - ".
$f->seqname . "]");
}
my $start1 = $f->start; #source sequence alignment
start
my $start2 = $f->hstart(); #hit sequence alignment
start
#
# More sanity checking
#
if (defined($prev1)) {
if ( $strand == 1 ) {
if ($f->start < $prev1) {
throw("Inconsistent coords in feature array (forward strand).\n" .
"Start [".$f->start()."] in current feature should be greater " .
"than previous feature end [$prev1].");
}
} else {
if ($f->end > $prev1) {
throw("Inconsistent coords in feature array (reverse strand).\n" .
"End [".$f->end() ."] should be less than previous feature " .
"start [$prev1].");
}
}
}
my $length = ($f->end - $f->start + 1); #
length of source
seq alignment
my $hlength = ($f->hend - $f->hstart + 1); #
length of hit
seq alignment
# using multiplication to avoid rounding errors, hence the
# switch from query to hit for the ratios
#
# Yet more sanity checking
#
if($query_unit > $hit_unit){
# I am going to make the assumption here that this situation will
# only occur with DnaPepAlignFeatures, this may not be true
my $query_p_length = sprintf "%.0f", ($length/$query_unit);
my $hit_p_length = sprintf "%.0f", ($hlength * $hit_unit);
if( $query_p_length != $hit_p_length) {
throw( "Feature lengths not comparable Lengths:" .$length .
" " . $hlength . " Ratios:" . $query_unit . " " .
$hit_unit );
}
} else{
my $query_d_length = sprintf "%.0f", ($length*$hit_unit);
my $hit_d_length = sprintf "%.0f", ($hlength * $query_unit);
if( $length * $hit_unit != $hlength * $query_unit ) {
throw( "Feature lengths not comparable Lengths:" . $length .
" " . $hlength . " Ratios:" . $query_unit . " " .
$hit_unit );
}
}
my $hlengthfactor = ($query_unit/$hit_unit);
#
# Check to see if there is an I type (insertion) gap:
# If there is a space between the end of the last source sequence
# alignment and the start of this one, then this is an insertion
#
my $insertion_flag = 0;
if( $strand == 1 ) {
if( ( defined $prev1 ) && ( $f->start > $prev1 + 1 )) {
#there is an insertion
$insertion_flag = 1;
my $gap = $f->start - $prev1 - 1;
if( $gap == 1 ) {
$gap =
""; # no need
for a number
if gap
length is 1
}
$string .= "$gap"."I";
}
#shift our position in the source seq alignment
$prev1 = $f->end();
} else {
if(( defined $prev1 ) && ($f->end + 1 < $prev1 )) {
#there is an insertion
$insertion_flag = 1;
my $gap = $prev1 - $f->end() - 1;
if( $gap == 1 ) {
$gap =
""; # no need
for a number
if gap
length is 1
}
$string .= "$gap"."I";
}
#shift our position in the source seq alignment
$prev1 = $f->start();
}
#
# Check to see if there is a D type (deletion) gap
# There is a deletion gap if there is a space between the end of the
# last portion of the hit sequence alignment and this one
#
if( $hstrand == 1 ) {
if(( defined $prev2 ) && ( $f->hstart() > $prev2 + 1 )) {
#there is a deletion
my $gap = $f->hstart - $prev2 - 1;
my $gap2 = int( $gap * $hlengthfactor + 0.5 );
if( $gap2 == 1 ) {
$gap2 =
""; # no need
for a number
if gap
length is 1
}
$string .= "$gap2"."D";
#sanity check, Should not be an insertion and deletion
if($insertion_flag) {
if ($message_only_once) {
warning("Should not be an deletion and insertion on the " .
"same alignment region. cigar_line=$string\n");
$message_only_once = 0;
}
}
}
#shift our position in the hit seq alignment
$prev2 = $f->hend();
} else {
if( ( defined $prev2 ) && ( $f->hend() + 1 < $prev2 )) {
#there is a deletion
my $gap = $prev2 - $f->hend - 1;
my $gap2 = int( $gap * $hlengthfactor + 0.5 );
if( $gap2 == 1 ) {
$gap2 =
""; # no need
for a number
if gap
length is 1
}
$string .= "$gap2"."D";
#sanity check, Should not be an insertion and deletion
if($insertion_flag) {
if ($message_only_once) {
warning("Should not be an deletion and insertion on the " .
"same alignment region. prev2 = $prev2; f->hend() = " .
$f->hend() . "; cigar_line = $string;\n");
$message_only_once = 0;
}
}
}
#shift our position in the hit seq alignment
$prev2 = $f->hstart();
}
my $matchlength = $f->end() - $f->start() + 1;
if( $matchlength == 1 ) {
$matchlength = "";
}
$string .= $matchlength."M";
}
$self->{'start'} = $f1start;
$self->{'end'} = $f1end;
$self->{'seqname'} = $seqname;
$self->{'strand'} = $strand;
$self->{'score'} = $score;
$self->{'percent_id'} = $percent;
$self->{'analysis'} = $analysis;
$self->{'slice'} = $slice;
$self->{'hslice'} = $hslice;
$self->{'hstart'} = $f2start;
$self->{'hend'} = $f2end;
$self->{'hstrand'} = $hstrand;
$self->{'hseqname'} = $hname;
$self->{'cigar_string'} = $string;
$self->{'p_value'} = $pvalue;
$self->{'external_db_id'} = $external_db_id;
$self->{'hcoverage'} = $hcoverage;
$self->{'group_id'} = $group_id;
$self->{'level_id'} = $level_id;
}