ensembl-hive  2.8.1
BlastzAligner.pm
Go to the documentation of this file.
1 =head1 LICENSE
2 
3 See the NOTICE file distributed with this work for additional information
4 regarding copyright ownership.
5 
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9 
10  http://www.apache.org/licenses/LICENSE-2.0
11 
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 
18 =cut
19 
20 package AssemblyMapper::BlastzAligner;
21 
22 =head1 NAME
23 
24 BlastzAligner.pm - module to do a whole genome alignment between two closely
25 related assemblies and create assembly entries from it.
26 
27 =head1 SYNOPSIS
28 
29 my $support = new Bio::EnsEMBL::Utils::ConversionSupport($SERVERROOT);
30 my $aligner = BlastzAligner->new(-SUPPORT => $support);
31 
32 # create a tempdir for storing input and output
33 $aligner->create_tempdir;
34 
35 # write sequences to fasta and nib files
36 $aligner->write_sequence(
37  $alt_slice,
38  $support->param('altassembly'),
39  "alt_sequence.1"
40 );
41 $aligner->write_sequence(
42  $ref_slice,
43  $support->param('assembly'),
44  "ref_sequence.1"
45 );
46 
47 # run blastz
48 $aligner->run_blastz("alt_sequence.1", "ref_sequence.1");
49 
50 =head1 DESCRIPTION
51 
52 This modules contains helper functions to generate a whole genome alignment
53 between two closely related assemblies using blastz from scratch. Alignments
54 are then stored in an Ensembl assembly table.
55 
56 The module is part of a series of scripts to create a mapping between two
57 assemblies. See "Related scripts" below for an overview of the whole process.
58 
59 =head1 RELATED SCRIPTS
60 
61 The whole process of creating a whole genome alignment is done by these
62 scripts:
63 
64  ensembl/misc-scripts/assembly/load_alternative_assembly.pl
65  ensembl/misc-scripts/assembly/align_by_clone_identity.pl
66  ensembl/misc-scripts/assembly/align_nonident_regions.pl
67 
68 See documention in the respective script for more information.
69 
70 
71 =head1 AUTHOR
72 
73 Patrick Meidl <meidl@ebi.ac.uk>, Ensembl core API team
74 
75 =head1 CONTACT
76 
77 Please post comments/questions to the Ensembl development list
78 <http://lists.ensembl.org/mailman/listinfo/dev>
79 
80 =cut
81 
82 use strict;
83 use warnings;
84 no warnings 'uninitialized';
85 
86 use Bio::EnsEMBL::Utils::Argument qw(rearrange);
87 use File::Path;
88 
89 use constant FMT1 => "%-30s%10.0f (%3.2f%%)\n";
90 use constant FMT2 => "%-30s%10.0f\n";
91 use constant FMT3 => "%-8s%-12s%-5s%-10s%-10s%-10s%-10s\n";
92 use constant FMT4 => "%-8s%-12s%-5s%8.0f %8.0f %8.0f %8.0f\n";
93 
94 =head2 new
95 
96  Arg[-SUPPORT] : a Bio::EnsEMBL::Utils::ConversionSupport object
97  Example : my $support = new Bio::EnsEMBL::Utils::ConversionSupport(
98  $SERVERROOT);
99  my $aligner = BlastzAligner->new(-SUPPORT => $support);
100  Description : object constructor method
101  Return type : a BlastzAligner object
102  Exceptions : none
103  Caller : general
104 
105 =cut
106 
107 sub new {
108  my ($class, @args) = @_;
109 
110  my $self = {};
111  bless $self, $class;
112 
113  my ($support) = rearrange([qw(SUPPORT)], @args);
114  $self->support($support);
115 
116  # set bindir
117  $self->bindir($self->support->param('bindir') || '/usr/local/ensembl/bin');
118 
119  return $self;
120 }
121 
122 =head2 create_tempdir
123 
124  Arg[1] : String $tmpdir - temporary directory name
125  Example : $aligner->create_tempdir;
126  Description : Creates a temporary directory in /tmp with a semi-random name
127  (username.timestamp.randomnumber). Alternatively, you can pass
128  it the name of a directory to use.
129  Return type : String - name of the tempdir created
130  Exceptions : Thrown if tempdir can't be created
131  Caller : general
132 
133 =cut
134 
135 sub create_tempdir {
136  my $self = shift;
137  my $tempdir = shift;
138 
139  if ($tempdir) {
140  unless (-d $tempdir) {
141  $self->support->log_error("Can't find tempdir $tempdir: $!");
142  }
143 
144  } else {
145  # create tmpdir to store input and output
146  my $user = `whoami`;
147  chomp $user;
148  $tempdir = "/tmp/$user.".time.".".int(rand(100000));
149  $self->support->log("Creating tmpdir $tempdir...\n");
150  system("mkdir $tempdir") == 0 or
151  $self->support->log_error("Can't create tmp dir $tempdir: $!\n");
152  $self->support->log("Done.\n");
153  }
154 
155  $self->tempdir($tempdir);
156  return $tempdir;
157 }
158 
159 =head2 remove_tempdir
160 
161  Example : $aligner->remove_tempdir;
162  Description : Removes a temporary directory created by $self->create_tempdir.
163  Return type : none
164  Exceptions : none
165  Caller : general
166 
167 =cut
168 
169 sub remove_tempdir {
170  my $self = shift;
171  rmtree($self->tempdir) or $self->support->log_warning("Could not delete ".$self->tempdir.": $!\n");
172 }
173 
174 =head2 write_sequence
175 
176  Arg[1] : Bio::EnsEMBL::Slice $slice - slice for which to write sequence
177  Arg[2] : String $assembly - assembly name
178  Arg[3] : String $basename1 - basename of single sequence file
179  Arg[4] : (optional) String $basename2 - basename of multiple sequence
180  file
181  Example : $aligner->write_sequence($slice, 'NCBI35', 'e_seq.1');
182  Description : Writes a slice's sequence to a fasta file and converts it to nib
183  format. Optionally appends the sequence to another
184  multi-sequence fasta file.
185  Return type : none
186  Exceptions : thrown if faToNib or file appending fails
187  Caller : general
188 
189 =cut
190 
191 sub write_sequence {
192  my ($self, $slice, $assembly, $basename1, $basename2) = @_;
193 
194  my $tmpdir = $self->tempdir;
195 
196  unless (-e "$tmpdir/$basename1.fa") {
197  my $fh = $self->support->filehandle('>', "$tmpdir/$basename1.fa");
198  my $cs_name = $slice->coord_system_name;
199  print $fh join(':', ">$basename1 dna:chromfrag $cs_name",
200  $assembly,
201  $slice->start,
202  $slice->end,
203  $slice->strand
204  ), "\n";
205  print $fh $slice->get_repeatmasked_seq(undef, 1)->seq, "\n";
206  close($fh);
207  }
208 
209  # convert fasta to nib (needed for lavToAxt)
210  unless (-e "$tmpdir/$basename1.nib") {
211  system($self->bindir."/faToNib $tmpdir/$basename1.fa $tmpdir/$basename1.nib") == 0
212  or $self->support->log_error("Can't run faToNib: $!\n");
213  }
214 
215  if ($basename2) {
216  system("cat $tmpdir/$basename1.fa >> $tmpdir/$basename2.fa") == 0 or
217  $self->support->log_error("Can't concat fasta files: $!\n");
218  }
219 }
220 
221 =head2 run_blastz
222 
223  Arg[1] : String $A_basename - basename of alternative fasta file
224  Arg[2] : String $R_basename - basename of reference fasta file
225  Example : $aligner->run_blastz('alt_seq.1', 'ref_seq.1');
226  Description : Runs blastz between an alternative and multiple reference
227  sequences.
228  Return type : none
229  Exceptions : thrown if blastz fails
230  Caller : general
231 
232 =cut
233 
234 sub run_blastz {
235  my ($self, $A_basename, $R_basename) = @_;
236 
237  my $tmpdir = $self->tempdir;
238  my $bindir = $self->bindir;
239  my $id = $self->id;
240 
241  my $blastz_cmd = qq($bindir/blastz $tmpdir/$A_basename.fa $tmpdir/$R_basename.fa Q=blastz_matrix.txt T=0 L=10000 H=2200 Y=3400 > $tmpdir/blastz.$id.lav);
242 
243  unless (-e "$tmpdir/blastz.$id.lav") {
244  system($blastz_cmd) == 0 or
245  $self->support->log_error("Can't run blastz: $!\n");
246  }
247 }
248 
249 =head2 lav_to_axt
250 
251  Example : $aligner->lav_to_axt;
252  Description : Converts blastz output from lav to axt format. Target and query
253  sequences must be present in nib format in the temporary
254  directory for this to work.
255  Return type : none
256  Exceptions : thrown if lavToAxt fails
257  Caller : general
258 
259 =cut
260 
261 sub lav_to_axt {
262  my $self = shift;
263 
264  my $tmpdir = $self->tempdir;
265  my $id = $self->id;
266 
267  unless (-e "$tmpdir/blastz.$id.axt") {
268  system($self->bindir."/lavToAxt $tmpdir/blastz.$id.lav $tmpdir $tmpdir $tmpdir/blastz.$id.axt") == 0 or $self->support->log_error("Can't run lavToAxt: $!\n");
269  }
270 }
271 
272 =head2 find_best_alignment
273 
274  Example : $aligner->find_best_alignment;
275  Description : Finds the best set of non-overlapping alignments by running
276  axtBest.
277  Return type : none
278  Exceptions : thrown if axtBest fails
279  Caller : general
280 
281 =cut
282 
283 sub find_best_alignment {
284  my $self = shift;
285 
286  my $tmpdir = $self->tempdir;
287  my $id = $self->id;
288 
289  unless (-e "$tmpdir/blastz.$id.best.axt") {
290  system($self->bindir."/axtBest $tmpdir/blastz.$id.axt all $tmpdir/blastz.$id.best.axt") == 0 or $self->support->log_error("Can't run axtBest: $!\n");
291  }
292 }
293 
294 =head2 parse_blastz_output
295 
296  Example : $aligner->parse_blastz_output;
297  Description : Reads a blastz alignment result from an axt file and creates
298  a datastructure containing ungapped alignments from it. Note
299  that mismatches are allowed, but separate stats will be
300  collected for them.
301  Return type : none
302  Exceptions : none
303  Caller : general
304 
305 =cut
306 
307 sub parse_blastz_output {
308  my $self = shift;
309 
310  # read file
311  my $tmpdir = $self->tempdir;
312  my $id = $self->id;
313  my $fh = $self->support->filehandle('<', "$tmpdir/blastz.$id.best.axt");
314 
315  # initialize stats
316  $self->init_stats(qw(match mismatch gap alignments bp));
317 
318  my $i = 1;
319  my ($header, $A_seq, $R_seq);
320 
321  while (my $line = <$fh>) {
322  # there are blocks of 4 lines, where line 1 is the header, line 2 is
323  # A_seq, line3 is R_seq
324  $header = $line unless (($i-1) % 4);
325  $A_seq = $line unless (($i-2) % 4);
326  chomp $A_seq;
327  my @A_arr = split(//, $A_seq);
328  $R_seq = $line unless (($i-3) % 4);
329  chomp $R_seq;
330  my @R_arr = split(//, $R_seq);
331 
332  # compare sequences letter by letter
333  if ($i % 4 == 0) {
334  my $match_flag = 0;
335  $self->init_stats(qw(A_gap R_gap));
336  my %coords;
337  @coords{'R_id', 'A_start', 'A_end', 'R_start', 'R_end', 'strand'} =
338  (split(/ /, $header))[4, 2, 3, 5, 6, 7];
339  $coords{'R_id'} =~ s/ref_seq\.(.*)/$1/;
340  ($coords{'strand'} eq '+') ? ($coords{'strand'} = 1) :
341  ($coords{'strand'} = -1);
342  for (my $j = 0; $j < scalar(@A_arr); $j++) {
343  # gap
344  if ($A_arr[$j] eq '-' or $R_arr[$j] eq '-') {
345  $self->stats_incr('gap', 1);
346  $self->stats_incr('A_gap', 1) if ($A_arr[$j] eq '-');
347  $self->stats_incr('R_gap', 1) if ($R_arr[$j] eq '-');
348  $match_flag = 0;
349  } else {
350  $self->found_match($match_flag, $j, \%coords);
351  $match_flag = 1;
352 
353  # match
354  if ($A_arr[$j] eq $R_arr[$j]) {
355  $self->stats_incr('match', 1);
356  # mismatch
357  } else {
358  $self->stats_incr('mismatch', 1);
359  }
360  }
361  }
362  $self->stats_incr('bp', scalar(@A_arr));
363  $self->stats_incr('alignments', 1);
364  }
365 
366  $i++;
367  }
368 }
369 
370 =head2 cleanup_tmpfiles
371 
372  Arg[1-N] : (optional) list @files - additional tmp files to delete
373  Example : $self->cleanup_tmpfiles('e_seq.fa', 'v_seq.fa');
374  Description : deletes temporary files
375  Return type : none
376  Exceptions : Warning if file cannot be deleted
377  Caller : general
378  Status : stable
379 
380 =cut
381 
382 sub cleanup_tmpfiles {
383  my $self = shift;
384  my @files = @_;
385 
386  my $tmpdir = $self->tempdir;
387  my $id = $self->id;
388 
389  push @files,
390  "blastz.$id.lav",
391  "blastz.$id.axt",
392  "blastz.$id.best.axt";
393 
394  foreach my $file (@files) {
395  unlink("$tmpdir/$file") or $self->support->log_warning("Couldn't delete file $file: $!");
396  }
397 }
398 
399 =head2 found_match
400 
401  Arg[1] : Boolean $match_flag - flag indicating if last bp was a match
402  Arg[2] : Int $j - current bp position in the alignment
403  Arg[3] : Hashref $coords - alignment coordinates and strand from blastz
404  output
405  Description : Populates a datastructure describing blocks of alignment.
406  Return type : none
407  Exceptions : none
408  Caller : internal
409 
410 =cut
411 
412 sub found_match {
413  my ($self, $match_flag, $j, $coords) = @_;
414 
415  my $id = $self->id;
416  my $align = $self->get_stats('alignments');
417  my $R_chr = $self->seq_region_name;
418 
419  # last position was a match
420  if ($match_flag) {
421 
422  # adjust align block end
423  if ($self->{'_match'}->{$R_chr}->{$id}->[$align]) {
424  my $c = scalar(@{ $self->{'_match'}->{$R_chr}->{$id}->[$align] }) - 1;
425  $self->{'_match'}->{$R_chr}->{$id}->[$align]->[$c]->[1] =
426  $coords->{'A_start'} + $j - $self->get_stats('A_gap');
427  $self->{'_match'}->{$R_chr}->{$id}->[$align]->[$c]->[3] =
428  $coords->{'R_start'} + $j - $self->get_stats('R_gap');
429  }
430 
431  # last position was a non-match
432  } else {
433 
434  # start a new align block
435  push @{ $self->{'_match'}->{$R_chr}->{$id}->[$align] }, [
436  $coords->{'A_start'} + $j - $self->get_stats('A_gap'),
437  $coords->{'A_start'} + $j - $self->get_stats('A_gap'),
438  $coords->{'R_start'} + $j - $self->get_stats('R_gap'),
439  $coords->{'R_start'} + $j - $self->get_stats('R_gap'),
440  $coords->{'strand'},
441  $coords->{'R_id'},
442  ];
443  }
444 }
445 
446 
447 =head2 adjust_coords
448 
449  Arg[1] : Int $A_start - start of alternatvie block in chromosomal coords
450  Arg[2] : Int $A_end - end of alternative block in chromosomal coords
451  Arg[3] : Arrayref $R_coords - list of start/end pairs of reference
452  blocks in chromosomal coords
453  Example : my $R_coords = [ [1, 1000], [3000, 5000] ];
454  $aligner->adjust_coords(1, 30000000, $R_coords);
455  Description : Adjusts coordinates of blastz alignments to chromosomal coords.
456  Return type : none
457  Exceptions : none
458  Caller : general
459 
460 =cut
461 
462 sub adjust_coords {
463  my ($self, $A_start, $A_end, $R_coords) = @_;
464  my $R_chr = $self->seq_region_name;
465  my $id = $self->id;
466 
467  for (my $align = 0; $align < scalar(@{ $self->{'_match'}->{$R_chr}->{$id} || [] }); $align++) {
468  for (my $c = 0; $c < scalar(@{ $self->{'_match'}->{$R_chr}->{$id}->[$align] || []}); $c++) {
469  $self->{'_match'}->{$R_chr}->{$id}->[$align]->[$c]->[0] += $A_start - 1;
470  $self->{'_match'}->{$R_chr}->{$id}->[$align]->[$c]->[1] += $A_start - 1;
471 
472  # forward strand match
473  if ($self->{'_match'}->{$R_chr}->{$id}->[$align]->[$c]->[4] == 1) {
474  $self->{'_match'}->{$R_chr}->{$id}->[$align]->[$c]->[2] += $R_coords->{$self->{'_match'}->{$R_chr}->{$id}->[$align]->[$c]->[5]}->[0] - 1;
475  $self->{'_match'}->{$R_chr}->{$id}->[$align]->[$c]->[3] += $R_coords->{$self->{'_match'}->{$R_chr}->{$id}->[$align]->[$c]->[5]}->[0] - 1;
476 
477  # reverse strand match
478  } else {
479  my $tmp_start = $R_coords->{$self->{'_match'}->{$R_chr}->{$id}->[$align]->[$c]->[5]}->[1] - $self->{'_match'}->{$R_chr}->{$id}->[$align]->[$c]->[3] + 1;
480 
481  $self->{'_match'}->{$R_chr}->{$id}->[$align]->[$c]->[3] = $R_coords->{$self->{'_match'}->{$R_chr}->{$id}->[$align]->[$c]->[5]}->[1] - $self->{'_match'}->{$R_chr}->{$id}->[$align]->[$c]->[2] + 1;
482 
483  $self->{'_match'}->{$R_chr}->{$id}->[$align]->[$c]->[2] = $tmp_start;
484  }
485 
486  # sanity check: aligned region pairs must have same length
487  my $A_len = $self->{'_match'}->{$R_chr}->{$id}->[$align]->[$c]->[1] - $self->{'_match'}->{$R_chr}->{$id}->[$align]->[$c]->[0];
488  my $R_len = $self->{'_match'}->{$R_chr}->{$id}->[$align]->[$c]->[3] - $self->{'_match'}->{$R_chr}->{$id}->[$align]->[$c]->[2];
489  $self->support->log_warning("Length mismatch: $A_len <> $R_len in block $id, alignment $align, stretch $c\n", 2) unless ($A_len == $R_len);
490  }
491  }
492 }
493 
494 =head2 filter_overlaps
495 
496  Description : DEPRECATED. This filtering algorithm didn't work well. Please
497  run the separate script fix_overlaps.pl.
498 
499 =cut
500 
501 sub filter_overlaps {
502  my $self = shift;
503  my $id = $self->id;
504 
505  foreach my $R_chr (sort keys %{ $self->{'_match'} }) {
506  my $coord_check = [];
507  # rearrange the datastructure so that we can find overlaps
508  foreach my $id (keys %{ $self->{'_match'}->{$R_chr} }) {
509  for (my $align = 0; $align < scalar(@{ $self->{'_match'}->{$R_chr}->{$id} }); $align++) {
510  for (my $c = 0; $c < scalar(@{ $self->{'_match'}->{$R_chr}->{$id}->[$align] || []}); $c++) {
511  push @{ $coord_check }, [
512  $self->{'_match'}->{$R_chr}->{$id}->[$align]->[$c]->[0],
513  $self->{'_match'}->{$R_chr}->{$id}->[$align]->[$c]->[1],
514  $self->{'_match'}->{$R_chr}->{$id}->[$align]->[$c]->[2],
515  $self->{'_match'}->{$R_chr}->{$id}->[$align]->[$c]->[3],
516  $id,
517  $align,
518  $c,
519  ];
520  }
521  }
522  }
523 
524  my @A_sort = sort { $a->[0] <=> $b->[0] } @{ $coord_check };
525  my @R_sort = sort { $a->[2] <=> $b->[2] } @{ $coord_check };
526 
527  # sanity check: alternative alignments must not overlap (axtBest should
528  # guarantee that)
529  my $last;
530  foreach my $c (@A_sort) {
531  $self->support->log_warning("Overlapping alternative alignment at ".join(':', $R_chr, $c->[0], $c->[1])." (last_end ".$last->[1].")\n", 1) if ($last and $c->[0] <= $last->[1]);
532  $last = $c;
533  }
534 
535  # now filter reference overlaps
536  my @seen;
537  $last = undef;
538  foreach my $c (@R_sort) {
539  if ($last and $c->[2] <= $last->[3]) {
540  $self->support->log_verbose("Overlapping reference alignment at ".join(':', $R_chr, $c->[2], $c->[3])." (last_end ".$last->[3].")\n", 1);
541 
542  # if last alignment was longer, delete this one
543  if ($last->[3]-$last->[2] > $c->[3]-$c->[2]) {
544  undef $self->{'_match'}->{$R_chr}->{$c->[4]}->[$c->[5]]->[$c->[6]];
545 
546  # if last alignment was shorter, trace back and delete all
547  # overlapping shorter alignments
548  } else {
549  foreach my $s (@seen) {
550  # earlier alignment still overlapping
551  if ($c->[2] <= $s->[3]) {
552  # earlier alignment shorter -> delete it
553  if ($s->[3]-$s->[2] < $c->[3]-$c->[2]) {
554  undef $self->{'_match'}->{$R_chr}->{$s->[4]}->[$s->[5]]->[$s->[6]];
555 
556  # this alignment shorter -> delete it
557  } else {
558  undef $self->{'_match'}->{$R_chr}->{$c->[4]}->[$c->[5]]->[$c->[6]];
559  $last = $s;
560  last;
561  }
562  } else {
563  $last = $s;
564  last;
565  }
566  }
567 
568  $last = $c;
569  }
570  }
571  unshift @seen, $c;
572  $last = $c unless ($last);
573  }
574  }
575 }
576 
577 =head2 write_assembly
578 
579  Arg[1] : Bio::*::DBSQL::DBAdaptor $R_dba - reference database adaptor
580  Example : my $R_dba = $support->get_database('ensembl');
581  $aligner->write_assembly($R_dba);
582  Description : Writes assembly entries for blastz alignments to the database.
583  Return type : none
584  Exceptions : none
585  Caller : general
586 
587 =cut
588 
589 sub write_assembly {
590  my ($self, $R_dba) = @_;
591 
592  my $R_dbh = $R_dba->dbc->db_handle;
593  my $R_sa = $R_dba->get_SliceAdaptor;
594 
595  my $sth = $R_dbh->prepare(qq(
596  INSERT IGNORE INTO assembly (asm_seq_region_id, cmp_seq_region_id,
597  asm_start,asm_end, cmp_start, cmp_end, ori)
598  VALUES (?, ?, ?, ?, ?, ?, ?)
599  ));
600 
601  $self->support->log("Adding assembly entries for alignments...\n");
602 
603  my $i;
604  foreach my $R_chr (sort _by_chr_num keys %{ $self->{'_match'} }) {
605  # get seq_region_id for alternative and reference chromosome
606  my $A_chr = $R_chr;
607  my $R_slice = $R_sa->fetch_by_region('toplevel', $R_chr, undef, undef, undef, $self->support->param('assembly'));
608  my $R_sid = $R_sa->get_seq_region_id($R_slice);
609  # we need to fetch the alternative slice from the reference db
610  # explicitely by coord_system, since toplevel attribute is not set
611  # there
612  my $cs_name = $R_slice->coord_system_name;
613  my $A_sid = $R_sa->get_seq_region_id($R_sa->fetch_by_region($cs_name, $A_chr, undef, undef, undef, $self->support->param('altassembly')));
614 
615  foreach my $id (sort { $a <=> $b } keys %{ $self->{'_match'}->{$R_chr} }) {
616  for (my $align = 0; $align < scalar(@{ $self->{'_match'}->{$R_chr}->{$id} }); $align++) {
617  for (my $c = 0; $c < scalar(@{ $self->{'_match'}->{$R_chr}->{$id}->[$align] }); $c++) {
618  if ($self->{'_match'}->{$R_chr}->{$id}->[$align]->[$c]) {
619  $sth->execute(
620  $R_sid,
621  $A_sid,
622  $self->{'_match'}->{$R_chr}->{$id}->[$align]->[$c]->[2],
623  $self->{'_match'}->{$R_chr}->{$id}->[$align]->[$c]->[3],
624  $self->{'_match'}->{$R_chr}->{$id}->[$align]->[$c]->[0],
625  $self->{'_match'}->{$R_chr}->{$id}->[$align]->[$c]->[1],
626  $self->{'_match'}->{$R_chr}->{$id}->[$align]->[$c]->[4],
627  );
628  $i++;
629  }
630  }
631  }
632  }
633  }
634  $self->support->log("Done inserting $i entries.\n");
635 }
636 
637 =head2 stats_incr
638 
639  Arg[1] : String $code - stats code
640  Arg[2] : Int $incr - number by which stats should be incremented
641  Example : $aligner->stats_incr('total_alignments', 1);
642  Description : Increments stats.
643  Return type : Int - stat value after increment
644  Exceptions : none
645  Caller : general
646 
647 =cut
648 
649 sub stats_incr {
650  my ($self, $code, $incr) = @_;
651  $self->{'_stats'}->{$code} += $incr;
652  return $self->{'_stats'}->{$code};
653 }
654 
655 =head2 init_stats
656 
657  Arg[1] : Array @codes - stats codes to initialise
658  Example : $aligner->init_stats('match', 'mismatch', 'alignments');
659  Description : Initialises stats (i.e. sets to 0).
660  Return type : none
661  Exceptions : none
662  Caller : general
663 
664 =cut
665 
666 sub init_stats {
667  my ($self, @codes) = @_;
668  foreach my $code (@codes) {
669  $self->{'_stats'}->{$code} = 0;
670  }
671 }
672 
673 =head2 get_stats
674 
675  Arg[1] : String $code - stats code
676  Example : my $num_mismatches = $aligner->get_stats('mismatch');
677  Description : Stats getter.
678  Return type : Int - stats value
679  Exceptions : none
680  Caller : general
681 
682 =cut
683 
684 sub get_stats {
685  my ($self, $code) = @_;
686  return $self->{'_stats'}->{$code};
687 }
688 
689 =head2 log_block_stats
690 
691  Arg[1] : Int $indent - indentation level
692  Example : $aligner->log_block_stats(3);
693  Description : Logs stats for an alignment block.
694  Return type : none
695  Exceptions : none
696  Caller : general
697 
698 =cut
699 
700 sub log_block_stats {
701  my ($self, $indent) = @_;
702 
703  $self->support->log("Blastz alignment stats:\n", $indent);
704  $self->support->log(sprintf(FMT2, "Alignments:", $self->get_stats('alignments')), $indent+1);
705  if ($self->get_stats('alignments')) {
706  $self->support->log(sprintf(FMT1,
707  "Matches:",
708  $self->get_stats('match'),
709  $self->get_stats('match')/$self->get_stats('bp')*100),
710  $indent+1);
711  $self->support->log(sprintf(FMT1,
712  "Mismatches:",
713  $self->get_stats('mismatch'),
714  $self->get_stats('mismatch')/$self->get_stats('bp')*100),
715  $indent+1);
716  $self->support->log(sprintf(FMT1,
717  "Gaps:",
718  $self->get_stats('gap'),
719  $self->get_stats('gap')/$self->get_stats('bp')*100),
720  $indent+1);
721  }
722  map { $self->stats_incr($_.'_total', $self->get_stats($_)) }
723  qw(match mismatch gap bp);
724 }
725 
726 =head2 log_overall_stats
727 
728  Example : $aligner->log_overall_stats;
729  Description : Logs overall alignment stats.
730  Return type : none
731  Exceptions : none
732  Caller : general
733 
734 =cut
735 
736 sub log_overall_stats {
737  my $self = shift;
738 
739  # blastz
740  $self->support->log("\nOverall blastz alignment stats:\n");
741 
742  unless ($self->get_stats('alignments')) {
743  $self->support->log("No alignments found.\n", 1);
744  return;
745  }
746 
747  $self->support->log(sprintf(FMT1,
748  "Matches:",
749  $self->get_stats('match_total'),
750  $self->get_stats('match_total')/$self->get_stats('bp_total')*100),
751  1);
752  $self->support->log(sprintf(FMT1,
753  "Mismatches:",
754  $self->get_stats('mismatch_total'),
755  $self->get_stats('mismatch_total')/$self->get_stats('bp_total')*100),
756  1);
757  $self->support->log(sprintf(FMT1,
758  "Gaps:",
759  $self->get_stats('gap_total'),
760  $self->get_stats('gap_total')/$self->get_stats('bp_total')*100),
761  1);
762 
763  # alignments to be written to assembly table
764  $self->support->log_verbose("\nAlignments that will be written to assembly table:\n");
765  $self->support->log_verbose(sprintf(FMT3,
766  qw(CHR BLOCK ALIGNMENT ALT_START ALT_END REF_START REF_END)),
767  1);
768  $self->support->log_verbose(('-'x63)."\n", 1);
769  foreach my $R_chr (sort _by_chr_num keys %{ $self->{'_match'} }) {
770  foreach my $id (sort { $a <=> $b } keys %{ $self->{'_match'}->{$R_chr} }) {
771  for (my $align = 0; $align < scalar(@{ $self->{'_match'}->{$R_chr}->{$id} }); $align++) {
772  for (my $c = 0; $c < scalar(@{ $self->{'_match'}->{$R_chr}->{$id}->[$align] }); $c++) {
773  if ($self->{'_match'}->{$R_chr}->{$id}->[$align]->[$c]) {
774  $self->support->log_verbose(sprintf(FMT4,
775  $R_chr,
776  $id,
777  $align+1,
778  @{ $self->{'_match'}->{$R_chr}->{$id}->[$align]->[$c] }),
779  1);
780  my $l = $self->{'_match'}->{$R_chr}->{$id}->[$align]->[$c]->[1] - $self->{'_match'}->{$R_chr}->{$id}->[$align]->[$c]->[0];
781  $self->stats_incr('alignments_total', 1);
782  $self->stats_incr('short1_10_total', 1) if ($l < 11);
783  $self->stats_incr('short11_100_total', 1) if ($l > 10 and $l < 101);
784  }
785  }
786  }
787  }
788  }
789  $self->support->log("\nAssembly entry stats:\n");
790  $self->support->log(sprintf(FMT2,
791  "Total alignment blocks:",
792  $self->get_stats('alignments_total')),
793  1);
794  $self->support->log(sprintf(FMT2,
795  "Alignments 1-10 bp:",
796  $self->get_stats('short1_10_total')),
797  1);
798  $self->support->log(sprintf(FMT2,
799  "Alignments 11-100 bp:",
800  $self->get_stats('short11_100_total')),
801  1);
802 }
803 
804 =head2 _by_chr_num
805 
806  Example : my @sorted = sort _by_chr_num qw(X, 6-COX, 14, 7);
807  Description : Subroutine to use in sort for sorting chromosomes. Sorts
808  numerically, then alphabetically
809  Return type : values to be used by sort
810  Exceptions : none
811  Caller : internal
812 
813 =cut
814 
815 sub _by_chr_num {
816  my @awords = split /-/, $a;
817  my @bwords = split /-/, $b;
818 
819  my $anum = $awords[0];
820  my $bnum = $bwords[0];
821 
822  if ($anum !~ /^[0-9]*$/) {
823  if ($bnum !~ /^[0-9]*$/) {
824  return $anum cmp $bnum;
825  } else {
826  return 1;
827  }
828  }
829  if ($bnum !~ /^[0-9]*$/) {
830  return -1;
831  }
832 
833  if ($anum <=> $bnum) {
834  return $anum <=> $bnum;
835  } else {
836  if ($#awords == 0) {
837  return -1;
838  } elsif ($#bwords == 0) {
839  return 1;
840  } else {
841  return $awords[1] cmp $bwords[1];
842  }
843  }
844 }
845 
846 =head2 AUTOLOAD
847 
848  Arg[1] : (optional) String/Object - attribute to set
849  Example : # setting a attribute
850  $self->attr($val);
851  # getting the attribute
852  $self->attr;
853  # undefining an attribute
854  $self->attr(undef);
855  Description : lazy function generator for getters/setters
856  Return type : String/Object
857  Exceptions : none
858  Caller : general
859 
860 =cut
861 
862 sub AUTOLOAD {
863  my $self = shift;
864  my $attr = our $AUTOLOAD;
865  $attr =~ s/.*:://;
866  return unless $attr =~ /[^A-Z]/;
867  no strict 'refs';
868  *{$AUTOLOAD} = sub {
869  $_[0]->{'_data'}->{$attr} = $_[1] if (@_ > 1);
870  return $_[0]->{'_data'}->{$attr};
871  };
872  $self->{'_data'}->{$attr} = shift if (@_);
873  return $self->{'_data'}->{$attr};
874 }
875 
876 1;
877 
Bio::EnsEMBL::Utils::ConversionSupport
Definition: ConversionSupport.pm:39
AssemblyMapper::BlastzAligner
Definition: BlastzAligner.pm:43
found_match
public void found_match()
compare
public compare()
Bio::EnsEMBL::Utils::Argument
Definition: Argument.pm:34
AssemblyMapper::BlastzAligner::new
public A new()