ensembl-hive  2.7.0
check_mapping.pl
Go to the documentation of this file.
1 #!/usr/bin/env perl
2 # See the NOTICE file distributed with this work for additional information
3 # regarding copyright ownership.
4 #
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
8 #
9 # http://www.apache.org/licenses/LICENSE-2.0
10 #
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
16 
17 
18 =head1 NAME
19 
20 check_mapping.pl - script to check whole genome alignment between two
21 assemblies.
22 
23 =head1 SYNOPSIS
24 
25 check_mapping.pl [arguments]
26 
27 Required arguments:
28 
29  --dbname, db_name=NAME database name NAME
30  --host, --dbhost, --db_host=HOST database host HOST
31  --port, --dbport, --db_port=PORT database port PORT
32  --user, --dbuser, --db_user=USER database username USER
33  --pass, --dbpass, --db_pass=PASS database passwort PASS
34  --assembly=ASSEMBLY assembly version ASSEMBLY
35 
36  --altdbname=NAME alternative database NAME
37  --altassembly=ASSEMBLY alternative assembly version ASSEMBLY
38 
39  --nolog outside of a webserver environment,
40  logging needs to be handled manually.
41  Use in conjunction with shell redirects.
42 
43 Optional arguments:
44 
45  --althost=HOST alternative databases host HOST
46  --altport=PORT alternative database port PORT
47  --altuser=USER alternative database username USER
48  --altpass=PASS alternative database password PASS
49 
50  --chromosomes, --chr=LIST only process LIST toplevel seq_regions
51 
52  --conffile, --conf=FILE read parameters from FILE
53  (default: conf/Conversion.ini)
54 
55  --logfile, --log=FILE log to FILE (default: *STDOUT)
56  --logpath=PATH write logfile to PATH (default: .)
57  --logappend, --log_append append to logfile (default: truncate)
58 
59  -v, --verbose=0|1 verbose logging (default: false)
60  -i, --interactive=0|1 run script interactively (default: true)
61  -n, --dry_run, --dry=0|1 don't write results to database
62  -h, --help, -? print help (this message)
63 
64 =head1 DESCRIPTION
65 
66 This script checks if the whole genome alignment between two assemblies is
67 correct. It does so by comparing the sequence in the reference database with
68 the sequence of the projected fragments in the alternative database.
69 
70 =head1 RELATED FILES
71 
72 The whole process of creating a whole genome alignment between two assemblies
73 is done by a series of scripts. Please see
74 
75  ensembl/misc-scripts/assembly/README
76 
77 for a high-level description of this process, and POD in the individual scripts
78 for the details.
79 
80 
81 =head1 CONTACT
82 
83 Please post comments/questions to the Ensembl development list
84 <http://lists.ensembl.org/mailman/listinfo/dev>
85 
86 =cut
87 
88 use strict;
89 use warnings;
90 no warnings 'uninitialized';
91 
92 use FindBin qw($Bin);
93 use vars qw($SERVERROOT);
94 
95 BEGIN {
96  $SERVERROOT = "$Bin/../../..";
97  unshift(@INC, "$SERVERROOT/ensembl/modules");
98  unshift(@INC, "$SERVERROOT/bioperl-live");
99 }
100 
101 use Getopt::Long;
102 use Pod::Usage;
103 use Bio::EnsEMBL::Utils::ConversionSupport;
104 
105 use Algorithm::Diff qw(diff);
106 
107 $| = 1;
108 
109 my $support = new Bio::EnsEMBL::Utils::ConversionSupport($SERVERROOT);
110 
111 # parse options
112 $support->parse_common_options(@_);
113 $support->parse_extra_options(
114  'assembly=s',
115  'altdbname=s',
116  'altassembly=s',
117  'althost=s',
118  'altport=n',
119  'altpass=s',
120  'chromosomes|chr=s@',
121 );
122 $support->allowed_params(
123  $support->get_common_params,
124  'assembly',
125  'altdbname',
126  'altassembly',
127  'althost',
128  'altport',
129  'altpass',
130  'chromosomes',
131 );
132 
133 if ($support->param('help') or $support->error) {
134  warn $support->error if $support->error;
135  pod2usage(1);
136 }
137 
138 $support->comma_to_list('chromosomes');
139 
140 # ask user to confirm parameters to proceed
141 #$support->confirm_params;
142 
143 # get log filehandle and print heading and parameters to logfile
144 $support->init_log;
145 
146 $support->check_required_params(
147  'assembly',
148  'altdbname',
149  'altassembly'
150 );
151 
152 # first set connection parameters for alternative db if not different from
153 # reference db
154 map { $support->param("alt$_", $support->param($_)) unless ($support->param("alt$_")) } qw(host port user);
155 
156 # reference database
157 my $R_dba = $support->get_database('ensembl');
158 my $R_sa = $R_dba->get_SliceAdaptor;
159 
160 # database containing the alternative assembly
161 my $A_dba = $support->get_database('core', 'alt');
162 my $A_sa = $A_dba->get_SliceAdaptor;
163 
164 $support->log("Looping over toplevel seq_regions...\n\n");
165 
166 my @global_diff_bins;
167 
168 foreach my $chr ($support->sort_chromosomes) {
169  $support->log_stamped("Toplevel seq_region $chr...\n", 1);
170 
171  my $R_slice = $R_sa->fetch_by_region('toplevel', $chr);
172  my $A_slice = $A_sa->fetch_by_region('toplevel', $chr);
173 
174  unless ($A_slice) {
175  $support->log("Not found in alternative db. Skipping.\n", 2);
176  next;
177  }
178 
179  my $cs_name = $A_slice->coord_system_name;
180 
181  # compare reference and alternative sequence
182  my @segments = @{ $R_slice->project($cs_name, $support->param('altassembly')) };
183 
184  my $i;
185  my $k;
186 
187  foreach my $seg (@segments) {
188  # reference sequence
189  my $R_sub_slice = $R_slice->sub_Slice($seg->from_start, $seg->from_end);
190  my $R_seq = $R_sub_slice->seq;
191 
192  # alternative sequence
193  my $A_proj_slice = $seg->to_Slice;
194 
195  # ignore PAR region (i.e. we project to the symlinked seq_region)
196  next if ($A_proj_slice->seq_region_name ne $chr);
197 
198  my $A_sub_slice = $A_slice->sub_Slice($A_proj_slice->start, $A_proj_slice->end, $A_proj_slice->strand);
199  my $A_seq = $A_sub_slice->seq;
200 
201  # compare
202  if ($R_seq eq $A_seq) {
203  # sequences are identical -> ok
204  $support->log_verbose("Sequence match at ".$R_sub_slice->name."\n", 2);
205 
206  } else {
207  # not identical -> something is wrong
208  $support->log("Sequence mismatch at ".$R_sub_slice->name."\n", 2);
209 
210  my $R_sub_seq;
211  my $A_sub_seq;
212 
213  if ($R_sub_slice->length > 20) {
214  $R_sub_seq = substr($R_seq, 0, 10)."...".substr($R_seq, -10, 10);
215  $A_sub_seq = substr($A_seq, 0, 10)."...".substr($A_seq, -10, 10);
216  } else {
217  $R_sub_seq = substr($R_seq, 0, 20);
218  $A_sub_seq = substr($A_seq, 0, 20);
219  }
220 
221  $support->log("Ref: $R_sub_seq\n", 3);
222  $support->log("Alt: $A_sub_seq\n\n", 3);
223 
224  $i++;
225 
226 
227  if (length($R_seq) == length($A_seq)){
228  #my $diffs = ($R_seq ^ $A_seq) =~ tr/\0//c; # A concatenation of differences
229  # this approach is x10 faster than relying on Algorithm::Diff, as long as there
230  # are no InDels, and the lengths are comparable.
231  my $mask = ($R_seq ^ $A_seq);
232  my @diffs = split (//,$mask);
233  my ($in_change,$change_start,$change_end);
234  for (my $x=0; $x<scalar(@diffs); $x++) {
235  if ($in_change) {
236  if ($diffs[$x] eq "\0") {
237  $in_change = 0;
238  $change_end = $x;
239 
240  my $length = $change_end - $change_start + 1;
241  $global_diff_bins[$length]++;
242  }
243  else {
244  next;
245  }
246  } elsif ($diffs[$x] ne "\0") {
247  $in_change = 1;
248  $change_start = $x;
249  }
250 
251  }
252  } else {
253  my @Ref = split(//,$R_seq);
254  my @Alt = split(//,$A_seq);
255  my @diffs = diff( \@Ref, \@Alt );
256  foreach (@diffs) {
257  my $length = 0;
258  foreach my $desc (@{$_}) {
259  if ($desc->[0] eq '+') {$length++;}
260  if ($desc->[0] eq '-') {$length--;}
261  };
262  $global_diff_bins[$length]++;
263  }
264  }
265 
266  }
267 
268  $k++;
269  }
270 
271  if ($i) {
272  $support->log("Total: $i (of $k) alignments contain sequence mismatches.\n", 2);
273  } else {
274  $support->log("All $k alignments ok.\n", 2);
275  }
276 
277  $support->log_stamped("Done.\n\n", 1);
278 }
279 
280 $support->log("Summary of changes across all chromosomes\n\n",1);
281 $support->log("|Bin |Frequency\n",2);
282 for (my $bin = 0; $bin < scalar(@global_diff_bins); $bin++) {
283  if (defined ($global_diff_bins[$bin])) {
284  $support->log("|$bin |$global_diff_bins[$bin]\n");
285  }
286 }
287 
288 # finish logfile
289 $support->finish_log;
290 
run
public run()