2 # See the NOTICE file distributed with this work for additional information
3 # regarding copyright ownership.
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
9 # http://www.apache.org/licenses/LICENSE-2.0
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
20 check_mapping.pl - script to check whole genome alignment between two
25 check_mapping.pl [arguments]
29 --dbname, db_name=NAME database name NAME
30 --host, --dbhost, --db_host=HOST database host HOST
31 --port, --dbport, --db_port=PORT database port PORT
32 --user, --dbuser, --db_user=USER database username USER
33 --pass, --dbpass, --db_pass=PASS database passwort PASS
34 --assembly=ASSEMBLY assembly version ASSEMBLY
36 --altdbname=NAME alternative database NAME
37 --altassembly=ASSEMBLY alternative assembly version ASSEMBLY
39 --nolog outside of a webserver environment,
40 logging needs to be handled manually.
41 Use in conjunction with shell redirects.
45 --althost=HOST alternative databases host HOST
46 --altport=PORT alternative database port PORT
47 --altuser=USER alternative database username USER
48 --altpass=PASS alternative database password PASS
50 --chromosomes, --chr=LIST only process LIST toplevel seq_regions
52 --conffile, --conf=FILE read parameters from FILE
53 (
default: conf/Conversion.ini)
55 --logfile, --log=FILE log to FILE (
default: *STDOUT)
56 --logpath=PATH write logfile to PATH (
default: .)
57 --logappend, --log_append append to logfile (
default: truncate)
59 -v, --verbose=0|1 verbose logging (
default:
false)
60 -i, --interactive=0|1
run script interactively (
default:
true)
61 -n, --dry_run, --dry=0|1 don
't write results to database
62 -h, --help, -? print help (this message)
66 This script checks if the whole genome alignment between two assemblies is
67 correct. It does so by comparing the sequence in the reference database with
68 the sequence of the projected fragments in the alternative database.
72 The whole process of creating a whole genome alignment between two assemblies
73 is done by a series of scripts. Please see
75 ensembl/misc-scripts/assembly/README
77 for a high-level description of this process, and POD in the individual scripts
83 Please post comments/questions to the Ensembl development list
84 <http://lists.ensembl.org/mailman/listinfo/dev>
90 no warnings 'uninitialized
';
93 use vars qw($SERVERROOT);
96 $SERVERROOT = "$Bin/../../..";
97 unshift(@INC, "$SERVERROOT/ensembl/modules");
98 unshift(@INC, "$SERVERROOT/bioperl-live");
103 use Bio::EnsEMBL::Utils::ConversionSupport;
105 use Algorithm::Diff qw(diff);
109 my $support = new Bio::EnsEMBL::Utils::ConversionSupport($SERVERROOT);
112 $support->parse_common_options(@_);
113 $support->parse_extra_options(
120 'chromosomes|chr=s@
',
122 $support->allowed_params(
123 $support->get_common_params,
133 if ($support->param('help
') or $support->error) {
134 warn $support->error if $support->error;
138 $support->comma_to_list('chromosomes
');
140 # ask user to confirm parameters to proceed
141 #$support->confirm_params;
143 # get log filehandle and print heading and parameters to logfile
146 $support->check_required_params(
152 # first set connection parameters for alternative db if not different from
154 map { $support->param("alt$_", $support->param($_)) unless ($support->param("alt$_")) } qw(host port user);
157 my $R_dba = $support->get_database('ensembl
');
158 my $R_sa = $R_dba->get_SliceAdaptor;
160 # database containing the alternative assembly
161 my $A_dba = $support->get_database('core
', 'alt
');
162 my $A_sa = $A_dba->get_SliceAdaptor;
164 $support->log("Looping over toplevel seq_regions...\n\n");
166 my @global_diff_bins;
168 foreach my $chr ($support->sort_chromosomes) {
169 $support->log_stamped("Toplevel seq_region $chr...\n", 1);
171 my $R_slice = $R_sa->fetch_by_region('toplevel
', $chr);
172 my $A_slice = $A_sa->fetch_by_region('toplevel
', $chr);
175 $support->log("Not found in alternative db. Skipping.\n", 2);
179 my $cs_name = $A_slice->coord_system_name;
181 # compare reference and alternative sequence
182 my @segments = @{ $R_slice->project($cs_name, $support->param('altassembly
')) };
187 foreach my $seg (@segments) {
189 my $R_sub_slice = $R_slice->sub_Slice($seg->from_start, $seg->from_end);
190 my $R_seq = $R_sub_slice->seq;
192 # alternative sequence
193 my $A_proj_slice = $seg->to_Slice;
195 # ignore PAR region (i.e. we project to the symlinked seq_region)
196 next if ($A_proj_slice->seq_region_name ne $chr);
198 my $A_sub_slice = $A_slice->sub_Slice($A_proj_slice->start, $A_proj_slice->end, $A_proj_slice->strand);
199 my $A_seq = $A_sub_slice->seq;
202 if ($R_seq eq $A_seq) {
203 # sequences are identical -> ok
204 $support->log_verbose("Sequence match at ".$R_sub_slice->name."\n", 2);
207 # not identical -> something is wrong
208 $support->log("Sequence mismatch at ".$R_sub_slice->name."\n", 2);
213 if ($R_sub_slice->length > 20) {
214 $R_sub_seq = substr($R_seq, 0, 10)."...".substr($R_seq, -10, 10);
215 $A_sub_seq = substr($A_seq, 0, 10)."...".substr($A_seq, -10, 10);
217 $R_sub_seq = substr($R_seq, 0, 20);
218 $A_sub_seq = substr($A_seq, 0, 20);
221 $support->log("Ref: $R_sub_seq\n", 3);
222 $support->log("Alt: $A_sub_seq\n\n", 3);
227 if (length($R_seq) == length($A_seq)){
228 #my $diffs = ($R_seq ^ $A_seq) =~ tr/\0//c; # A concatenation of differences
229 # this approach is x10 faster than relying on Algorithm::Diff, as long as there
230 # are no InDels, and the lengths are comparable.
231 my $mask = ($R_seq ^ $A_seq);
232 my @diffs = split (//,$mask);
233 my ($in_change,$change_start,$change_end);
234 for (my $x=0; $x<scalar(@diffs); $x++) {
236 if ($diffs[$x] eq "\0") {
240 my $length = $change_end - $change_start + 1;
241 $global_diff_bins[$length]++;
246 } elsif ($diffs[$x] ne "\0") {
253 my @Ref = split(//,$R_seq);
254 my @Alt = split(//,$A_seq);
255 my @diffs = diff( \@Ref, \@Alt );
258 foreach my $desc (@{$_}) {
259 if ($desc->[0] eq '+
') {$length++;}
260 if ($desc->[0] eq '-
') {$length--;}
262 $global_diff_bins[$length]++;
272 $support->log("Total: $i (of $k) alignments contain sequence mismatches.\n", 2);
274 $support->log("All $k alignments ok.\n", 2);
277 $support->log_stamped("Done.\n\n", 1);
280 $support->log("Summary of changes across all chromosomes\n\n",1);
281 $support->log("|Bin |Frequency\n",2);
282 for (my $bin = 0; $bin < scalar(@global_diff_bins); $bin++) {
283 if (defined ($global_diff_bins[$bin])) {
284 $support->log("|$bin |$global_diff_bins[$bin]\n");
289 $support->finish_log;