ensembl-hive  2.7.0
mapping_stats.pl
Go to the documentation of this file.
1 #!/usr/bin/env perl
2 # See the NOTICE file distributed with this work for additional information
3 # regarding copyright ownership.
4 #
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
8 #
9 # http://www.apache.org/licenses/LICENSE-2.0
10 #
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
16 
17 
18 =head1 NAME
19 
20 mapping_stats.pl - print some statistics about a whole genome alignment between
21 two assemblies.
22 
23 =head1 SYNOPSIS
24 
25 mapping_stats.pl [arguments]
26 
27 Required arguments:
28 
29  --dbname, db_name=NAME database name NAME
30  --host, --dbhost, --db_host=HOST database host HOST
31  --port, --dbport, --db_port=PORT database port PORT
32  --user, --dbuser, --db_user=USER database username USER
33  --pass, --dbpass, --db_pass=PASS database passwort PASS
34  --assembly=ASSEMBLY assembly version ASSEMBLY
35 
36  --altdbname=NAME alternative database NAME
37  --altassembly=ASSEMBLY alternative assembly version ASSEMBLY
38 
39 Optional arguments:
40 
41  --althost=HOST alternative databases host HOST
42  --altport=PORT alternative database port PORT
43  --altuser=USER alternative database username USER
44  --altpass=PASS alternative database password PASS
45 
46  --chromosomes, --chr=LIST only process LIST toplevel seq_regions
47 
48  --conffile, --conf=FILE read parameters from FILE
49  (default: conf/Conversion.ini)
50 
51  --logfile, --log=FILE log to FILE (default: *STDOUT)
52  --logpath=PATH write logfile to PATH (default: .)
53  --logappend, --log_append append to logfile (default: truncate)
54 
55  -v, --verbose=0|1 verbose logging (default: false)
56  -i, --interactive=0|1 run script interactively (default: true)
57  -n, --dry_run, --dry=0|1 don't write results to database
58  -h, --help, -? print help (this message)
59 
60 =head1 DESCRIPTION
61 
62 This script prints some statistics about a whole genome alignment between two
63 assemblies, like the alignment coverage and length of alignment blocks.
64 
65 =cut
66 
67 use strict;
68 use warnings;
69 no warnings 'uninitialized';
70 
71 use FindBin qw($Bin);
72 use vars qw($SERVERROOT);
73 
74 BEGIN {
75  $SERVERROOT = "$Bin/../../..";
76  unshift(@INC, "$SERVERROOT/ensembl/modules");
77  unshift(@INC, "$SERVERROOT/bioperl-live");
78 }
79 
80 use Getopt::Long;
81 use Pod::Usage;
82 use Bio::EnsEMBL::Utils::ConversionSupport;
83 
84 $| = 1;
85 
86 my $support = new Bio::EnsEMBL::Utils::ConversionSupport($SERVERROOT);
87 
88 # parse options
89 $support->parse_common_options(@_);
90 $support->parse_extra_options(
91  'assembly=s',
92  'altdbname=s',
93  'altassembly=s',
94  'althost=s',
95  'altport=n',
96  'altpass=s',
97  'chromosomes|chr=s@',
98 );
99 $support->allowed_params(
100  $support->get_common_params,
101  'assembly',
102  'altdbname',
103  'altassembly',
104  'althost',
105  'altport',
106  'chromosomes',
107  'altpass',
108 );
109 
110 if ($support->param('help') or $support->error) {
111  warn $support->error if $support->error;
112  pod2usage(1);
113 }
114 
115 $support->comma_to_list('chromosomes');
116 
117 # ask user to confirm parameters to proceed
118 # $support->confirm_params;
119 
120 # get log filehandle and print heading and parameters to logfile
121 $support->init_log;
122 
123 $support->check_required_params(
124  'assembly',
125  'altdbname',
126  'altassembly'
127 );
128 
129 # first set connection parameters for alternative db if not different from
130 # reference db
131 map { $support->param("alt$_", $support->param($_)) unless ($support->param("alt$_")) } qw(host port user);
132 
133 # reference database
134 my $R_dba = $support->get_database('ensembl');
135 my $R_sa = $R_dba->get_SliceAdaptor;
136 
137 # database containing the alternative assembly
138 my $A_dba = $support->get_database('core', 'alt');
139 my $A_sa = $A_dba->get_SliceAdaptor;
140 
141 my $fmt1 = "%-40s%12s\n";
142 my $fmt2 = "%-40s%11.1f%%\n";
143 my $fmt3 = "%-44s%8.0f (%2.0f%%)\n";
144 my $fmt4 = "%-40s%12s%10s\n";
145 
146 $support->log("Looping over toplevel seq_regions...\n\n");
147 
148 foreach my $chr ($support->sort_chromosomes(undef, $support->param('assembly'), 1)) {
149  $support->log_stamped("Toplevel seq_region $chr...\n", 1);
150 
151  # determine non-N sequence length of alternative toplevel seq_region
152  my $A_slice = $A_sa->fetch_by_region('toplevel', $chr);
153 
154  unless ($A_slice) {
155  $support->log("Not found in alternative db. Skipping.\n", 2);
156  next;
157  }
158 
159  my $cs_name = $A_slice->coord_system_name;
160 
161  my $start = 1;
162  my $A_chr_length = $A_slice->length;
163  my $n;
164  my $end;
165 
166  while ($start < $A_chr_length) {
167  $end = $start + 10000000;
168  $end = $A_chr_length if ($end > $A_chr_length);
169  my $sub_slice = $A_slice->sub_Slice($start, $end);
170  my $seq = $sub_slice->seq;
171  $n += $seq =~ tr/N/N/;
172  $start = $end + 1;
173  }
174 
175  my $A_length = $A_chr_length - $n;
176 
177  # determine total length of mapping to alternative assembly
178  my $mapping_length = 0;
179  my %blocks;
180  my %blocklength;
181 
182  my %cont_mapping_blocks;
183  my %cont_mapping_length;
184 
185  # toplevel seq_region length order of magnitude
186  my $oom = length($A_length);
187 
188  #seq region on the alternative assembly
189  my $R_slice = $R_sa->fetch_by_region($cs_name, $chr,undef, undef, undef,$support->param('altassembly'));
190 
191  #seq region on the latest assembly
192  my $R_new_assembly_slice = $R_sa->fetch_by_region($cs_name, $chr);
193 
194  #map alternative assembly to latest
195  my @segments = @{ $R_slice->project($cs_name, $support->param('assembly')) };
196 
197  my $alignments = 0;
198  my $alignment_runs = 0;
199 
200  my $previous_sl;
201  my $cont_mapping_length = 0;
202  foreach my $seg (@segments) {
203  my $sl = $seg->to_Slice;
204 
205  # ignore PAR region (i.e. we project to the symlinked seq_region)
206  #next if ($sl->seq_region_name ne $chr);
207 
208  my $l = $sl->length;
209  $mapping_length += $l;
210 
211  if ($previous_sl) {
212  #if current slice is on the same chromosome and within 10bps of the previous slice
213  #add it to the continuous mapping length
214  if ($previous_sl->seq_region_name eq $sl->seq_region_name && abs ($previous_sl->end - $sl->start) <= 10) {
215  $cont_mapping_length += $l;
216  } else {
217 
218  my $c_oom = $oom;
219 
220  while ($c_oom) {
221  if ($cont_mapping_length > 10**($c_oom-1) and $cont_mapping_length <= 10**$c_oom) {
222  $cont_mapping_blocks{10**$c_oom}++;
223  $cont_mapping_length{10**$c_oom} += $cont_mapping_length;
224  }
225  $c_oom--;
226  }
227  if ($cont_mapping_length == 1) {
228  $cont_mapping_blocks{10}++;
229  $cont_mapping_length{10}++
230  }
231 
232  $cont_mapping_length = $l;
233 
234  $alignment_runs ++;
235  }
236  } else {
237  $cont_mapping_length = $l;
238  }
239 
240  my $c_oom = $oom;
241 
242  while ($c_oom) {
243  if ($l > 10**($c_oom-1) and $l <= 10**$c_oom) {
244  $blocks{10**$c_oom}++;
245  $blocklength{10**$c_oom} += $l;
246  }
247  $c_oom--;
248  }
249  if ($l == 1) {
250  $blocks{10}++ ;
251  $blocklength{10}++;
252  }
253 
254  $previous_sl = $sl;
255  $alignments++;
256  }
257 
258  # print stats
259  $support->log("\n");
260 
261  $support->log(sprintf($fmt1, "Reference toplevel seq_region length:",
262  $support->commify($R_new_assembly_slice->length)), 2);
263  $support->log(sprintf($fmt1, "Alternative toplevel seq_region length:",
264  $support->commify($A_chr_length)), 2);
265  $support->log(sprintf($fmt1, "Non-N sequence length (alternative):",
266  $support->commify($A_length)), 2);
267  $support->log(sprintf($fmt1, "Alternative mapping length:",
268  $support->commify($mapping_length)), 2);
269  $support->log(sprintf($fmt2, "Mapping percentage:",
270  $mapping_length/$A_length*100), 2);
271 
272  $support->log("\n");
273 
274  $support->log(sprintf($fmt4, "Total alignments:", $alignments, "Mapping %"), 2);
275 
276  if ($alignments) {
277  for (my $i = 0; $i < $oom; $i++) {
278  my $from = 10**$i;
279  my $to = 10**($i+1);
280  $support->log(sprintf($fmt3, " ".$support->commify($from)." - ".$support->commify($to)." bp:", $blocks{$to}, $blocklength{$to}/$A_length*100), 2);
281  }
282  }
283 
284  $support->log("\n");
285 
286 
287  $support->log(sprintf($fmt4, "Continuous alignment runs:", $alignment_runs, "Mapping %"), 2);
288  $support->log("(gaps up to 10bp)\n",2);
289 
290  if ($alignments) {
291  for (my $i = 0; $i < $oom; $i++) {
292  my $from = 10**$i;
293  my $to = 10**($i+1);
294  $support->log(sprintf($fmt3, " ".$support->commify($from)." - ".$support->commify($to)." bp:", $cont_mapping_blocks{$to}, $cont_mapping_length{$to}/$A_length*100), 2);
295  }
296  }
297 
298  $support->log("\n");
299 
300 }
301 
302 # finish logfile
303 $support->finish_log;
304 
about
public about()
run
public run()