2 # See the NOTICE file distributed with this work for additional information
3 # regarding copyright ownership.
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
9 # http://www.apache.org/licenses/LICENSE-2.0
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
20 mapping_stats.pl - print some statistics
about a whole genome alignment between
25 mapping_stats.pl [arguments]
29 --dbname, db_name=NAME database name NAME
30 --host, --dbhost, --db_host=HOST database host HOST
31 --port, --dbport, --db_port=PORT database port PORT
32 --user, --dbuser, --db_user=USER database username USER
33 --pass, --dbpass, --db_pass=PASS database passwort PASS
34 --assembly=ASSEMBLY assembly version ASSEMBLY
36 --altdbname=NAME alternative database NAME
37 --altassembly=ASSEMBLY alternative assembly version ASSEMBLY
41 --althost=HOST alternative databases host HOST
42 --altport=PORT alternative database port PORT
43 --altuser=USER alternative database username USER
44 --altpass=PASS alternative database password PASS
46 --chromosomes, --chr=LIST only process LIST toplevel seq_regions
48 --conffile, --conf=FILE read parameters from FILE
49 (
default: conf/Conversion.ini)
51 --logfile, --log=FILE log to FILE (
default: *STDOUT)
52 --logpath=PATH write logfile to PATH (
default: .)
53 --logappend, --log_append append to logfile (
default: truncate)
55 -v, --verbose=0|1 verbose logging (
default:
false)
56 -i, --interactive=0|1
run script interactively (
default:
true)
57 -n, --dry_run, --dry=0|1 don
't write results to database
58 -h, --help, -? print help (this message)
62 This script prints some statistics about a whole genome alignment between two
63 assemblies, like the alignment coverage and length of alignment blocks.
69 no warnings 'uninitialized
';
72 use vars qw($SERVERROOT);
75 $SERVERROOT = "$Bin/../../..";
76 unshift(@INC, "$SERVERROOT/ensembl/modules");
77 unshift(@INC, "$SERVERROOT/bioperl-live");
82 use Bio::EnsEMBL::Utils::ConversionSupport;
86 my $support = new Bio::EnsEMBL::Utils::ConversionSupport($SERVERROOT);
89 $support->parse_common_options(@_);
90 $support->parse_extra_options(
99 $support->allowed_params(
100 $support->get_common_params,
110 if ($support->param('help
') or $support->error) {
111 warn $support->error if $support->error;
115 $support->comma_to_list('chromosomes
');
117 # ask user to confirm parameters to proceed
118 # $support->confirm_params;
120 # get log filehandle and print heading and parameters to logfile
123 $support->check_required_params(
129 # first set connection parameters for alternative db if not different from
131 map { $support->param("alt$_", $support->param($_)) unless ($support->param("alt$_")) } qw(host port user);
134 my $R_dba = $support->get_database('ensembl
');
135 my $R_sa = $R_dba->get_SliceAdaptor;
137 # database containing the alternative assembly
138 my $A_dba = $support->get_database('core
', 'alt
');
139 my $A_sa = $A_dba->get_SliceAdaptor;
141 my $fmt1 = "%-40s%12s\n";
142 my $fmt2 = "%-40s%11.1f%%\n";
143 my $fmt3 = "%-44s%8.0f (%2.0f%%)\n";
144 my $fmt4 = "%-40s%12s%10s\n";
146 $support->log("Looping over toplevel seq_regions...\n\n");
148 foreach my $chr ($support->sort_chromosomes(undef, $support->param('assembly
'), 1)) {
149 $support->log_stamped("Toplevel seq_region $chr...\n", 1);
151 # determine non-N sequence length of alternative toplevel seq_region
152 my $A_slice = $A_sa->fetch_by_region('toplevel
', $chr);
155 $support->log("Not found in alternative db. Skipping.\n", 2);
159 my $cs_name = $A_slice->coord_system_name;
162 my $A_chr_length = $A_slice->length;
166 while ($start < $A_chr_length) {
167 $end = $start + 10000000;
168 $end = $A_chr_length if ($end > $A_chr_length);
169 my $sub_slice = $A_slice->sub_Slice($start, $end);
170 my $seq = $sub_slice->seq;
171 $n += $seq =~ tr/N/N/;
175 my $A_length = $A_chr_length - $n;
177 # determine total length of mapping to alternative assembly
178 my $mapping_length = 0;
182 my %cont_mapping_blocks;
183 my %cont_mapping_length;
185 # toplevel seq_region length order of magnitude
186 my $oom = length($A_length);
188 #seq region on the alternative assembly
189 my $R_slice = $R_sa->fetch_by_region($cs_name, $chr,undef, undef, undef,$support->param('altassembly
'));
191 #seq region on the latest assembly
192 my $R_new_assembly_slice = $R_sa->fetch_by_region($cs_name, $chr);
194 #map alternative assembly to latest
195 my @segments = @{ $R_slice->project($cs_name, $support->param('assembly
')) };
198 my $alignment_runs = 0;
201 my $cont_mapping_length = 0;
202 foreach my $seg (@segments) {
203 my $sl = $seg->to_Slice;
205 # ignore PAR region (i.e. we project to the symlinked seq_region)
206 #next if ($sl->seq_region_name ne $chr);
209 $mapping_length += $l;
212 #if current slice is on the same chromosome and within 10bps of the previous slice
213 #add it to the continuous mapping length
214 if ($previous_sl->seq_region_name eq $sl->seq_region_name && abs ($previous_sl->end - $sl->start) <= 10) {
215 $cont_mapping_length += $l;
221 if ($cont_mapping_length > 10**($c_oom-1) and $cont_mapping_length <= 10**$c_oom) {
222 $cont_mapping_blocks{10**$c_oom}++;
223 $cont_mapping_length{10**$c_oom} += $cont_mapping_length;
227 if ($cont_mapping_length == 1) {
228 $cont_mapping_blocks{10}++;
229 $cont_mapping_length{10}++
232 $cont_mapping_length = $l;
237 $cont_mapping_length = $l;
243 if ($l > 10**($c_oom-1) and $l <= 10**$c_oom) {
244 $blocks{10**$c_oom}++;
245 $blocklength{10**$c_oom} += $l;
261 $support->log(sprintf($fmt1, "Reference toplevel seq_region length:",
262 $support->commify($R_new_assembly_slice->length)), 2);
263 $support->log(sprintf($fmt1, "Alternative toplevel seq_region length:",
264 $support->commify($A_chr_length)), 2);
265 $support->log(sprintf($fmt1, "Non-N sequence length (alternative):",
266 $support->commify($A_length)), 2);
267 $support->log(sprintf($fmt1, "Alternative mapping length:",
268 $support->commify($mapping_length)), 2);
269 $support->log(sprintf($fmt2, "Mapping percentage:",
270 $mapping_length/$A_length*100), 2);
274 $support->log(sprintf($fmt4, "Total alignments:", $alignments, "Mapping %"), 2);
277 for (my $i = 0; $i < $oom; $i++) {
280 $support->log(sprintf($fmt3, " ".$support->commify($from)." - ".$support->commify($to)." bp:", $blocks{$to}, $blocklength{$to}/$A_length*100), 2);
287 $support->log(sprintf($fmt4, "Continuous alignment runs:", $alignment_runs, "Mapping %"), 2);
288 $support->log("(gaps up to 10bp)\n",2);
291 for (my $i = 0; $i < $oom; $i++) {
294 $support->log(sprintf($fmt3, " ".$support->commify($from)." - ".$support->commify($to)." bp:", $cont_mapping_blocks{$to}, $cont_mapping_length{$to}/$A_length*100), 2);
303 $support->finish_log;