ensembl-hive  2.7.0
align_by_clone_identity.pl
Go to the documentation of this file.
1 #!/usr/bin/env perl
2 # See the NOTICE file distributed with this work for additional information
3 # regarding copyright ownership.
4 #
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
8 #
9 # http://www.apache.org/licenses/LICENSE-2.0
10 #
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
16 
17 
18 =head1 NAME
19 
20 align_by_clone_identity.pl - create a whole genome alignment between two closely
21 related assemblies, step 1
22 
23 =head1 SYNOPSIS
24 
25 align_by_clone_identity.pl [arguments]
26 
27 Required arguments:
28 
29  --dbname, db_name=NAME database name NAME
30  --host, --dbhost, --db_host=HOST database host HOST
31  --port, --dbport, --db_port=PORT database port PORT
32  --user, --dbuser, --db_user=USER database username USER
33  --pass, --dbpass, --db_pass=PASS database passwort PASS
34  --assembly=ASSEMBLY assembly version ASSEMBLY
35 
36  --altdbname=NAME alternative database NAME
37  --altassembly=ASSEMBLY alternative assembly version ASSEMBLY
38 
39 Optional arguments:
40 
41  --chromosomes, --chr=LIST only process LIST toplevel seq_regions
42  --skipclones=FILE read list of clones to skip from FILE
43 
44  --conffile, --conf=FILE read parameters from FILE
45  (default: conf/Conversion.ini)
46 
47  --logfile, --log=FILE log to FILE (default: *STDOUT)
48  --logpath=PATH write logfile to PATH (default: .)
49  --logappend, --log_append append to logfile (default: truncate)
50 
51  -v, --verbose=0|1 verbose logging (default: false)
52  -i, --interactive=0|1 run script interactively (default: true)
53  -n, --dry_run, --dry=0|1 don't write results to database
54  -h, --help, -? print help (this message)
55 
56 =head1 DESCRIPTION
57 
58 This script is part of a series of scripts to create a mapping between two
59 assemblies. It assembles the toplevel coordinate systems of two different
60 assemblies of a genome by creating a whole genome alignment between the two.
61 
62 The process assumes that the two assemblies are reasonably similar, i.e. there
63 are no major rearrangements or clones moved from one toplevel seq_region to
64 another.
65 
66 See "Related scripts" below for an overview of the whole process.
67 
68 This particular script creates a whole genome alignment between two closely
69 related assemblies. You will need a database containing the reference assembly
70 and the alternative toplevel seq_regions which can be created using
71 load_alternative_assembly.pl.
72 
73 The alignment is created in two steps:
74 
75  1. Match clones with same name and version directly and create alignment
76  blocks for these regions. Clones can be tagged manually to be excluded
77  from these direct matches by listing them in a file of clones to skip
78  (--skipclones argument). This can be useful to get better results in
79  regions with major assembly differences.
80 
81  The result is stored in the assembly table as an assembly between the
82  toplevel seq_regions of both genome assemblies.
83 
84  2. Store non-aligned blocks in a temporary table (tmp_align). They can
85  later be aligned using blastz by align_nonident_regions.pl.
86 
87 =head1 RELATED FILES
88 
89 The whole process of creating a whole genome alignment between two assemblies
90 is done by a series of scripts. Please see
91 
92  ensembl/misc-scripts/assembly/README
93 
94 for a high-level description of this process, and POD in the individual scripts
95 for the details.
96 
97 
98 =head1 AUTHOR
99 
100 Patrick Meidl <meidl@ebi.ac.uk>, Ensembl core API team
101 
102 =head1 CONTACT
103 
104 Please post comments/questions to the Ensembl development list
105 <http://lists.ensembl.org/mailman/listinfo/dev>
106 
107 =cut
108 
109 use strict;
110 use warnings;
111 no warnings 'uninitialized';
112 
113 use FindBin qw($Bin);
114 use vars qw($SERVERROOT);
115 
116 BEGIN {
117  $SERVERROOT = "$Bin/../../..";
118  unshift(@INC, "$SERVERROOT/ensembl/modules");
119  unshift(@INC, "$SERVERROOT/bioperl-live");
120 }
121 
122 use Getopt::Long;
123 use Pod::Usage;
124 use Bio::EnsEMBL::Utils::ConversionSupport;
125 use Bio::EnsEMBL::Attribute;
126 
127 $| = 1;
128 
129 my $support = new Bio::EnsEMBL::Utils::ConversionSupport($SERVERROOT);
130 
131 # parse options
132 $support->parse_common_options(@_);
133 $support->parse_extra_options(
134  'assembly=s',
135  'althost=s',
136  'altport=i',
137  'altuser=s',
138  'altpass=s',
139  'altdbname=s',
140  'altassembly=s',
141  'chromosomes|chr=s@',
142  'skipclones|skip_clones=s',
143 );
144 $support->allowed_params(
145  $support->get_common_params,
146  'assembly',
147  'althost',
148  'altport',
149  'altuser',
150  'altpass',
151  'altdbname',
152  'altassembly',
153  'chromosomes',
154  'skipclones',
155 );
156 
157 if ($support->param('help') or $support->error) {
158  warn $support->error if $support->error;
159  pod2usage(1);
160 }
161 
162 $support->comma_to_list('chromosomes');
163 
164 # ask user to confirm parameters to proceed
165 $support->confirm_params;
166 
167 # get log filehandle and print heading and parameters to logfile
168 $support->init_log;
169 
170 $support->check_required_params(
171  'assembly',
172  'altdbname',
173  'altassembly'
174 );
175 
176 # suggest to run non-verbosely
177 my $txt = qq(Running this script with the --verbose option will create a lot of output.
178  It is recommended to do this only for debug purposes.
179  Shall I switch to non-verbose logging for you?);
180 
181 if ($support->param('verbose') and $support->user_proceed($txt)) {
182  $support->param('verbose', 0);
183 }
184 
185 #####
186 # connect to database and get adaptors
187 #
188 my ($dba, $dbh, $sql, $sth);
189 
190 # first set connection parameters for alternative db if not different from
191 # reference db
192 map { $support->param("alt$_", $support->param($_)) unless ($support->param("alt$_")) } qw(host port user);
193 
194 # reference database
195 my $R_dba = $support->get_database('ensembl');
196 my $R_dbh = $R_dba->dbc->db_handle;
197 my $R_sa = $R_dba->get_SliceAdaptor;
198 
199 # database containing the alternative assembly
200 my $A_dba = $support->get_database('core', 'alt');
201 my $A_sa = $A_dba->get_SliceAdaptor;
202 
203 #####
204 # create temporary table for storing non-aligned blocks
205 #
206 unless ($support->param('dry_run')) {
207  $R_dbh->do(qq(
208  CREATE TABLE IF NOT EXISTS tmp_align (
209  tmp_align_id int(10) unsigned NOT NULL auto_increment,
210  alt_seq_region_name varchar(20) NOT NULL,
211  alt_start int(10) UNSIGNED NOT NULL,
212  alt_end int(10) UNSIGNED NOT NULL,
213  ref_seq_region_name varchar(20) NOT NULL,
214  ref_start int(10) UNSIGNED NOT NULL,
215  ref_end int(10) UNSIGNED NOT NULL,
216 
217  PRIMARY KEY (tmp_align_id)
218  )
219  ));
220 
221  # clear tmp_align table of entries from previous runs
222  $R_dbh->do(qq(DELETE FROM tmp_align));
223 }
224 
225 #####
226 # get reference and alternative toplevel seq_regions
227 #
228 my $R_chrlength = $support->get_chrlength($R_dba, $support->param('assembly'), 'toplevel');
229 my $A_chrlength = $support->get_chrlength($R_dba, $support->param('altassembly'), 'toplevel');
230 
231 #####
232 # read list of clones to skip from file
233 #
234 $support->log("Reading list of clones to skip from file...\n");
235 my %skip = ();
236 if ($support->param('skipclones')) {
237  my $infh = $support->filehandle('<', $support->param('skipclones'));
238  while (<$infh>) {
239  chomp;
240  $skip{$_} = 1;
241  }
242 }
243 $support->log("Done.\n\n");
244 
245 #####
246 # loop over toplevel seq_regions
247 #
248 $support->log_stamped("Looping over toplevel seq_regions...\n");
249 
250 my $match = {};
251 my $nomatch = {};
252 my %stats_total;
253 my @block_length;
254 
255 my $fmt1 = "%-40s%10.0f\n";
256 my $fmt2 = "%-40s%9.2f%%\n";
257 my $fmt3 = "%-12s%-12s%-12s%-12s%-12s%-9s\n";
258 my $fmt4 = "%10.0f %10.0f %7.0f %10.0f %10.0f %7.0f\n";
259 my $fmt5 = "%-40s%10s\n";
260 my $fmt6 = "%-10s%-12s%-10s%-12s\n";
261 
262 my $sth1 = $R_dbh->prepare(qq(
263  INSERT IGNORE INTO assembly (asm_seq_region_id, cmp_seq_region_id,
264  asm_start, asm_end, cmp_start, cmp_end, ori)
265  VALUES (?, ?, ?, ?, ?, ?, 1)
266 ));
267 my $sth2 = $R_dbh->prepare(qq(
268  INSERT INTO tmp_align values(NULL, ?, ?, ?, ?, ?, ?
269 )));
270 
271 foreach my $R_chr ($support->sort_chromosomes($R_chrlength)) {
272 
273  $support->log_stamped("Toplevel seq_region $R_chr...\n", 1);
274 
275  my $A_chr = $R_chr;
276 
277  # fetch toplevel seq_region slices
278  my $R_slice = $R_sa->fetch_by_region('toplevel', $R_chr, undef, undef, undef, $support->param('assembly'));
279  my $A_slice = $A_sa->fetch_by_region('toplevel', $A_chr, undef, undef, undef, $support->param('altassembly'));
280 
281  unless ($R_slice and $A_slice) {
282  $support->log("Seq_region not found in either ref or alt assembly. Skipping.\n", 2);
283  next;
284  }
285 
286  # we need to fetch the alternative slice from the reference db explicitely by
287  # coord_system, since toplevel attribute is not set there
288  my $cs_name = $A_slice->coord_system_name;
289  my $A_slice_ref = $R_sa->fetch_by_region($cs_name, $A_chr, undef, undef, undef, $support->param('altassembly'));
290 
291  # project to contigs (not to clones because for WGS assembly regions there
292  # are no clones)
293  my @R_contigs = @{ $R_slice->project('contig') };
294  my @A_contigs = @{ $A_slice->project('contig') };
295 
296  # loop over alternative clontigs
297  my $last = 0;
298  my $j = 0;
299  my $match_flag = 0;
300  my $last_A_seg;
301  my %stats_chr;
302 
303  foreach my $A_seg (@A_contigs) {
304 
305  my $A_contig = $A_seg->to_Slice;
306 
307  # project contig to clone for clone name comparison
308  my @A_clones = @{ $A_contig->project('clone') };
309 
310  my $A_clone;
311  $A_clone = $A_clones[0]->to_Slice if (@A_clones);
312 
313  $support->log_verbose("Alternative contig ($j) ".$A_contig->seq_region_name.":".$A_contig->start."-".$A_contig->end.":".$A_contig->strand." $A_chr:".$A_seg->from_start."-".$A_seg->from_end."\n", 2);
314 
315  # walk reference contigs
316  REFCLONES:
317  for (my $i = $last; $i < scalar(@R_contigs); $i++) {
318 
319  my $R_contig = $R_contigs[$i]->to_Slice;
320 
321  # project contig to clone for clone name comparison
322  my @R_clones = @{ $R_contig->project('clone') };
323 
324  my $R_clone;
325  $R_clone = $R_clones[0]->to_Slice if (@R_clones);
326 
327  # same clone name.version and contig and clone strand found
328  if ($A_clone and $R_clone and
329  $A_clone->seq_region_name eq $R_clone->seq_region_name and
330  $A_clone->strand == $R_clone->strand and
331  $A_contig->strand == $R_contig->strand) {
332 
333  # same clone start/end -> identical assembly
334  if ($A_clone->start == $R_clone->start and $A_clone->end == $R_clone->end) {
335  # check if clone is tagged to be skipped
336  # this can be used to resolve some odd assembly differences
337  if ($skip{$A_clone->seq_region_name}) {
338 
339  $support->log_verbose("Skipping matching reference clone ($i)".
340  $R_clone->seq_region_name.":".$R_clone->start."-".
341  $R_clone->end.":".$R_clone->strand."$R_chr:".
342  $R_contigs[$i]->from_start."-".$R_contigs[$i]->from_end.
343  "\n", 2);
344 
345  &found_nomatch($R_chr, $A_chr, $match, $nomatch, $A_seg,
346  $last_A_seg, $R_contigs[$i], $R_contigs[$i-1],
347  $match_flag, $i, $j
348  );
349 
350  $stats_chr{'skipped'}++;
351  $match_flag = 0;
352 
353  } else {
354 
355  $support->log_verbose("Found matching reference clone ($i)".
356  $R_clone->seq_region_name.":".$R_clone->start."-".
357  $R_clone->end.":".$R_clone->strand."$R_chr:".
358  $R_contigs[$i]->from_start."-".$R_contigs[$i]->from_end.
359  "\n", 2);
360 
361  &found_match($R_chr, $A_chr, $match, $nomatch, $A_seg,
362  $last_A_seg, $R_contigs[$i], $R_contigs[$i-1],
363  $match_flag, $i, $j
364  );
365 
366  $stats_chr{'identical'}++;
367  $match_flag = 1;
368  }
369 
370  # start/end mismatch
371  } else {
372 
373  $support->log_verbose("Start/end mismatch for clone ($i) ".$R_contig->seq_region_name.":".$R_contig->start."-".$R_contig->end.":".$R_contig->strand." $R_chr:".$R_contigs[$i]->from_start."-".$R_contigs[$i]->from_end."\n", 2);
374 
375  &found_nomatch(
376  $R_chr, $A_chr, $match, $nomatch, $A_seg, $last_A_seg,
377  $R_contigs[$i], $R_contigs[$i-1], $match_flag, $i, $j
378  );
379 
380  $stats_chr{'mismatch'}++;
381  $match_flag = 0;
382  }
383  $i++;
384  $last = $i;
385  last REFCLONES;
386 
387  # different clones or no clones found
388  } else {
389 
390  $support->log_verbose("Skipping clone ($i)".$R_contig->seq_region_name.":".$R_contig->start."-".$R_contig->end.":".$R_contig->strand." $R_chr:".$R_contigs[$i]->from_start."-".$R_contigs[$i]->from_end."\n", 2);
391 
392  &found_nomatch($R_chr, $A_chr, $match, $nomatch, $A_seg, $last_A_seg, $R_contigs[$i], $R_contigs[$i-1], $match_flag, $i, $j);
393 
394  $match_flag = 0;
395 
396  }
397  }
398 
399  $last_A_seg = $A_seg;
400  $j++;
401  }
402 
403  # adjust the final clone count
404  if ($match_flag) {
405 
406  # last clone was a match, adjust matching clone count
407  if ($match->{$R_chr}) {
408 
409  my $c = scalar(@{ $match->{$R_chr} }) - 1;
410  $match->{$R_chr}->[$c]->[2] = scalar(@A_contigs) - $match->{$R_chr}->[$c]->[2];
411  $match->{$R_chr}->[$c]->[5] = scalar(@R_contigs) - $match->{$R_chr}->[$c]->[5];
412 
413  }
414 
415  } else {
416 
417  # last clone was a non-match, adjust non-matching clone count
418  if ($nomatch->{$R_chr}) {
419 
420  my $c = scalar(@{ $nomatch->{$R_chr} }) - 1;
421  $nomatch->{$R_chr}->[$c]->[2] = scalar(@A_contigs) - $nomatch->{$R_chr}->[$c]->[2];
422  $nomatch->{$R_chr}->[$c]->[5] = scalar(@R_contigs) - $nomatch->{$R_chr}->[$c]->[5];
423 
424  }
425 
426  }
427 
428  # filter single assembly inserts from non-aligned blocks (i.e. cases where
429  # a block has clones only in one assembly, not in the other) - there is
430  # nothing to be done with them
431  @{ $nomatch->{$R_chr} } = grep { $_->[2] > 0 and $_->[5] > 0 }
432  @{ $nomatch->{$R_chr} } if ($nomatch->{$R_chr});
433 
434  # store directly aligned blocks in assembly table
435  unless ($support->param('dry_run')) {
436 
437  $support->log("Adding assembly entries for directly aligned blocks...\n", 1);
438  my $c;
439 
440  for ($c = 0; $c < scalar(@{ $match->{$R_chr} || [] }); $c++) {
441  $sth1->execute(
442  $R_sa->get_seq_region_id($R_slice),
443  $R_sa->get_seq_region_id($A_slice_ref),
444  $match->{$R_chr}->[$c]->[3],
445  $match->{$R_chr}->[$c]->[4],
446  $match->{$R_chr}->[$c]->[0],
447  $match->{$R_chr}->[$c]->[1]
448  );
449  }
450 
451  $support->log("Done inserting $c entries.\n", 1);
452  }
453 
454  # store non-aligned blocks in tmp_align table
455  unless ($support->param('dry_run')) {
456 
457  if ($nomatch->{$R_chr}) {
458 
459  $support->log("Storing non-aligned blocks in tmp_align table...\n", 1);
460  my $c;
461 
462  for ($c = 0; $c < scalar(@{ $nomatch->{$R_chr} }); $c++) {
463  $sth2->execute(
464  $nomatch->{$R_chr}->[$c]->[6],
465  $nomatch->{$R_chr}->[$c]->[0],
466  $nomatch->{$R_chr}->[$c]->[1],
467  $R_chr,
468  $nomatch->{$R_chr}->[$c]->[3],
469  $nomatch->{$R_chr}->[$c]->[4],
470  );
471  }
472 
473  $support->log("Done inserting $c entries.\n", 1);
474  }
475  }
476 
477  # stats for this toplevel seq_region
478  $stats_chr{'A_only'} = scalar(@A_contigs) - $stats_chr{'identical'} - $stats_chr{'mismatch'};
479  $stats_chr{'R_only'} = scalar(@R_contigs) - $stats_chr{'identical'} - $stats_chr{'mismatch'};
480  for (my $c = 0; $c < scalar(@{ $match->{$R_chr} || [] }); $c++) {
481  $stats_chr{'A_matchlength'} += $match->{$R_chr}->[$c]->[1] - $match->{$R_chr}->[$c]->[0];
482  $stats_chr{'R_matchlength'} += $match->{$R_chr}->[$c]->[4] - $match->{$R_chr}->[$c]->[3];
483  }
484  $stats_chr{'A_coverage'} = 100 * $stats_chr{'A_matchlength'} / $A_slice->length;
485  $stats_chr{'R_coverage'} = 100 * $stats_chr{'R_matchlength'} / $R_slice->length;
486  map { $stats_total{$_} += $stats_chr{$_} } keys %stats_chr;
487 
488  $support->log("\nStats for toplevel seq_region $R_chr:\n\n", 1);
489  $support->log(sprintf($fmt5, "Alternative toplevel seq_region name:", $A_chr), 2);
490  $support->log(sprintf($fmt1, "Length (alternative):", $A_slice->length), 2);
491  $support->log(sprintf($fmt1, "Length (reference):", $R_slice->length), 2);
492  $support->log(sprintf($fmt1, "Identical clones:", $stats_chr{'identical'}), 2);
493  $support->log(sprintf($fmt1, "Identical clones that were skipped:", $stats_chr{'skipped'}), 2);
494  $support->log(sprintf($fmt1, "Clones with start/end mismatch:", $stats_chr{'mismatch'}), 2);
495  $support->log(sprintf($fmt1, "Clones only in alternative assembly:", $stats_chr{'A_only'}), 2);
496  $support->log(sprintf($fmt1, "Clones only in refernce assembly:", $stats_chr{'R_only'}), 2);
497  $support->log(sprintf($fmt2, "Direct match coverage (alternative):", $stats_chr{'A_coverage'}), 2);
498  $support->log(sprintf($fmt2, "Direct match coverage (reference):", $stats_chr{'R_coverage'}), 2);
499 
500  # Aligned blocks
501  if ($match->{$R_chr}) {
502 
503  $support->log("\nDirectly aligned blocks:\n\n", 1);
504  $support->log(sprintf($fmt3, qw(ALT_START ALT_END ALT_CLONES REF_START REF_END REF_CLONES)), 2);
505  $support->log(('-'x71)."\n", 2);
506 
507  for (my $c = 0; $c < scalar(@{ $match->{$R_chr} }); $c++) {
508 
509  $support->log(sprintf($fmt4, @{ $match->{$R_chr}->[$c] }), 2);
510 
511  # sanity check: aligned region pairs must have same length
512  my $e_len = $match->{$R_chr}->[$c]->[1] - $match->{$R_chr}->[$c]->[0] + 1;
513  my $v_len = $match->{$R_chr}->[$c]->[4] - $match->{$R_chr}->[$c]->[3] + 1;
514 
515  $support->log_warning("Length mismatch: $e_len <> $v_len\n", 2) unless ($e_len == $v_len);
516 
517  }
518  }
519 
520  # Non-aligned blocks
521  if ($nomatch->{$R_chr}) {
522 
523  $support->log("\nNon-aligned blocks:\n\n", 1);
524  $support->log(sprintf($fmt3, qw(ALT_START ALT_END ALT_CLONES REF_START REF_END REF_CLONES)), 2);
525  $support->log(('-'x71)."\n", 2);
526 
527  for (my $c = 0; $c < scalar(@{ $nomatch->{$R_chr} }); $c++) {
528 
529  $support->log(sprintf($fmt4, @{ $nomatch->{$R_chr}->[$c] }), 2);
530 
531  # find longest non-aligned block
532  my $A_length = $nomatch->{$R_chr}->[$c]->[1] - $nomatch->{$R_chr}->[$c]->[0] + 1;
533  my $R_length = $nomatch->{$R_chr}->[$c]->[4] - $nomatch->{$R_chr}->[$c]->[3] + 1;
534  push @block_length, [$A_chr, $A_length, $R_chr, $R_length];
535 
536  }
537  }
538 
539  $support->log_stamped("\nDone with toplevel seq_region $R_chr.\n", 1);
540 }
541 
542 # overall stats
543 $support->log("\nOverall stats:\n");
544 $support->log(sprintf($fmt1, "Identical clones:", $stats_total{'identical'}), 1);
545 $support->log(sprintf($fmt1, "Identical clones that were skipped:", $stats_total{'skipped'}), 1);
546 $support->log(sprintf($fmt1, "Clones with start/end mismatch:", $stats_total{'mismatch'}), 1);
547 $support->log(sprintf($fmt1, "Clones only in alternative assembly:", $stats_total{'A_only'}), 1);
548 $support->log(sprintf($fmt1, "Clones only in reference assembly:", $stats_total{'R_only'}), 1);
549 
550 $support->log("\nNon-match block lengths:\n");
551 $support->log(sprintf($fmt6, qw(ALT_CHR ALT_LENGTH REF_CHR REF_LENGTH)), 1);
552 $support->log(('-'x42)."\n", 1);
553 foreach my $block (sort { $a->[1] <=> $b->[1] } @block_length) {
554  $support->log(sprintf("%-10s%10.0f %-10s%10.0f\n", @{ $block }), 1);
555 }
556 
557 $support->log_stamped("\nDone.\n");
558 
559 # finish logfile
560 $support->finish_log;
561 
562 
563 ### end main
564 
565 
566 =head2 found_match
567 
568  Arg[1] : String $R_chr - reference toplevel seq_region name
569  Arg[2] : String $A_chr - alternative toplevel seq_region name
570  Arg[3] : Hashref $match - datastructure to store aligned blocks
571  Arg[4] : Hashref $nomatch - datastructure to store non-aligned blocks
572  Arg[5] : Bio::EnsEMBL::ProjectionSegment $A_seg - current alternative
573  segment
574  Arg[6] : Bio::EnsEMBL::ProjectionSegment $last_A_seg - last alternative
575  segment
576  Arg[7] : Bio::EnsEMBL::ProjectionSegment $R_seg - current reference
577  segment
578  Arg[8] : Bio::EnsEMBL::ProjectionSegment $last_R_seg - last reference
579  segment
580  Arg[9] : Boolean $match_flag - flag indicating if last clone was a match
581  Arg[10] : Int $i - reference clone count
582  Arg[11] : Int $j - alternative clone count
583  Description : This function is called when two clones match (i.e. have the
584  same name.version in both assemblies). Depending on the state
585  of the last clone (match or nomatch), it extends aligned blocks
586  or finishes the non-aligned block and creates a new aligned
587  block.
588  Return type : none
589  Exceptions : none
590  Caller : internal
591 
592 =cut
593 
594 sub found_match {
595  my ($R_chr, $A_chr, $match, $nomatch, $A_seg, $last_A_seg, $R_seg, $last_R_seg, $match_flag, $i, $j) = @_;
596 
597  # last clone was a match
598  if ($match_flag) {
599 
600  # adjust align block end
601  if ($match->{$R_chr}) {
602 
603  my $c = scalar(@{ $match->{$R_chr} }) - 1;
604 
605  # if the gaps between this clone and the last are different, start
606  # a new block
607  if (($A_seg->from_start - $match->{$R_chr}->[$c]->[1]) !=
608  ($R_seg->from_start - $match->{$R_chr}->[$c]->[4])) {
609 
610  $support->log("Gap size mismatch at A:$A_chr:".$match->{$R_chr}->[$c]->[1].'-'.$A_seg->from_start.", R:$R_chr:".$match->{$R_chr}->[$c]->[4].'-'.$R_seg->from_start."\n", 2);
611 
612  # finish the last align block
613  $match->{$R_chr}->[$c]->[1] = $last_A_seg->from_end;
614  $match->{$R_chr}->[$c]->[2] = $j - $match->{$R_chr}->[$c]->[2];
615  $match->{$R_chr}->[$c]->[4] = $last_R_seg->from_end;
616  $match->{$R_chr}->[$c]->[5] = $i - $match->{$R_chr}->[$c]->[5];
617 
618  # start a new align block
619  push @{ $match->{$R_chr} }, [
620  $A_seg->from_start,
621  $A_seg->from_end,
622  $j,
623  $R_seg->from_start,
624  $R_seg->from_end,
625  $i,
626  $A_chr,
627  ];
628 
629  # adjust align block end
630  } else {
631  $match->{$R_chr}->[$c]->[1] = $A_seg->from_end;
632  $match->{$R_chr}->[$c]->[4] = $R_seg->from_end;
633  }
634  }
635 
636  # last clone was a non-match
637  } else {
638 
639  # start a new align block
640  push @{ $match->{$R_chr} }, [
641  $A_seg->from_start,
642  $A_seg->from_end,
643  $j,
644  $R_seg->from_start,
645  $R_seg->from_end,
646  $i,
647  $A_chr,
648  ];
649 
650  # finish the last non-align block
651  if ($nomatch->{$R_chr} and $last_A_seg) {
652  my $c = scalar(@{ $nomatch->{$R_chr} }) - 1;
653  $nomatch->{$R_chr}->[$c]->[1] = $last_A_seg->from_end;
654  $nomatch->{$R_chr}->[$c]->[2] = $j - $nomatch->{$R_chr}->[$c]->[2];
655  $nomatch->{$R_chr}->[$c]->[4] = $last_R_seg->from_end;
656  $nomatch->{$R_chr}->[$c]->[5] = $i - $nomatch->{$R_chr}->[$c]->[5];
657  }
658  }
659 }
660 
661 =head2 found_nomatch
662 
663  Arg[1] : String $R_chr - reference toplevel seq_region name
664  Arg[2] : String $A_chr - alternative toplevel seq_region name
665  Arg[3] : Hashref $match - datastructure to store aligned blocks
666  Arg[4] : Hashref $nomatch - datastructure to store non-aligned blocks
667  Arg[5] : Bio::EnsEMBL::ProjectionSegment $A_seg - current alternative
668  segment
669  Arg[6] : Bio::EnsEMBL::ProjectionSegment $last_A_seg - last alternative
670  segment
671  Arg[7] : Bio::EnsEMBL::ProjectionSegment $R_seg - current reference
672  segment
673  Arg[8] : Bio::EnsEMBL::ProjectionSegment $last_R_seg - last reference
674  segment
675  Arg[9] : Boolean $match_flag - flag indicating if last clone was a match
676  Arg[10] : Int $i - reference clone count
677  Arg[11] : Int $j - alternative clone count
678  Description : This function is called when two clones don't match (either
679  different name.version or length mismatch in the two
680  assemblies). Depending on the state of the last clone (nomatch
681  or match), it extends non-aligned blocks or finishes the
682  aligned block and creates a new non-aligned block.
683  Return type : none
684  Exceptions : none
685  Caller : internal
686 
687 =cut
688 
689 sub found_nomatch {
690  my ($R_chr, $A_chr, $match, $nomatch, $A_seg, $last_A_seg, $R_seg, $last_R_seg, $match_flag, $i, $j) = @_;
691 
692  # last clone was a match
693  if ($match_flag) {
694 
695  # start a new non-align block
696  push @{ $nomatch->{$R_chr} }, [
697  $A_seg->from_start,
698  $A_seg->from_end,
699  $j,
700  $R_seg->from_start,
701  $R_seg->from_end,
702  $i,
703  $A_chr,
704  ];
705 
706  # finish the last align block
707  if ($match->{$R_chr}) {
708  my $c = scalar(@{ $match->{$R_chr} }) - 1;
709  $match->{$R_chr}->[$c]->[1] = $last_A_seg->from_end;
710  $match->{$R_chr}->[$c]->[2] = $j - $match->{$R_chr}->[$c]->[2];
711  $match->{$R_chr}->[$c]->[4] = $last_R_seg->from_end;
712  $match->{$R_chr}->[$c]->[5] = $i - $match->{$R_chr}->[$c]->[5];
713  }
714 
715  # last clone was a non-match
716  } else {
717 
718  # adjust non-align block end
719  if ($nomatch->{$R_chr}) {
720  my $c = scalar(@{ $nomatch->{$R_chr} }) - 1;
721  $nomatch->{$R_chr}->[$c]->[1] = $A_seg->from_end;
722  $nomatch->{$R_chr}->[$c]->[4] = $R_seg->from_end;
723 
724  # we're at the beginning of a seq_region, so start a new non-align block
725  } else {
726  push @{ $nomatch->{$R_chr} }, [
727  $A_seg->from_start,
728  $A_seg->from_end,
729  $j,
730  $R_seg->from_start,
731  $R_seg->from_end,
732  $i,
733  $A_chr,
734  ];
735  }
736  }
737 }
738 
found_nomatch
public void found_nomatch()
run
public run()