2 # See the NOTICE file distributed with this work for additional information
3 # regarding copyright ownership.
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
9 # http://www.apache.org/licenses/LICENSE-2.0
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
20 compare_assemblies.pl -
compare two assemblies
24 compare_assemblies.pl [arguments]
28 --dbname, db_name=NAME database name NAME
29 --host, --dbhost, --db_host=HOST database host HOST
30 --port, --dbport, --db_port=PORT database port PORT
31 --user, --dbuser, --db_user=USER database username USER
32 --pass, --dbpass, --db_pass=PASS database passwort PASS
33 --assembly=ASSEMBLY assembly version ASSEMBLY
35 --altdbname=NAME alternative database NAME
36 --altassembly=ASSEMBLY alternative assembly version ASSEMBLY
40 --althost=HOST alternative databases host HOST
41 --altport=PORT alternative database port PORT
42 --altuser=USER alternative database username USER
43 --altpass=PASS alternative database password PASS
45 --chromosomes, --chr=LIST only process LIST toplevel seq_regions
47 --conffile, --conf=FILE read parameters from FILE
48 (
default: conf/Conversion.ini)
50 --logfile, --log=FILE log to FILE (
default: *STDOUT)
51 --logpath=PATH write logfile to PATH (
default: .)
52 --logappend, --log_append append to logfile (
default: truncate)
54 -v, --verbose=0|1 verbose logging (
default:
false)
55 -i, --interactive=0|1
run script interactively (
default:
true)
56 -n, --dry_run, --dry=0|1 don
't write results to database
57 -h, --help, -? print help (this message)
61 This script compares two assemblies. At the moment all it does is to list
62 clones that are on different toplevel seq_regions in the two assemblies.
66 The whole process of creating a whole genome alignment between two assemblies
67 is done by a series of scripts. Please see
69 ensembl/misc-scripts/assembly/README
71 for a high-level description of this process, and POD in the individual scripts
77 Patrick Meidl <meidl@ebi.ac.uk>, Ensembl core API team
81 Please post comments/questions to the Ensembl development list
82 <http://lists.ensembl.org/mailman/listinfo/dev>
88 no warnings 'uninitialized
';
91 use vars qw($SERVERROOT);
94 $SERVERROOT = "$Bin/../../..";
95 unshift(@INC, "$SERVERROOT/ensembl/modules");
96 unshift(@INC, "$SERVERROOT/bioperl-live");
101 use Bio::EnsEMBL::Utils::ConversionSupport;
105 my $support = new Bio::EnsEMBL::Utils::ConversionSupport($SERVERROOT);
108 $support->parse_common_options(@_);
109 $support->parse_extra_options(
115 'chromosomes|chr=s@
',
117 $support->allowed_params(
118 $support->get_common_params,
127 if ($support->param('help
') or $support->error) {
128 warn $support->error if $support->error;
132 $support->comma_to_list('chromosomes
');
134 # ask user to confirm parameters to proceed
135 $support->confirm_params;
137 # get log filehandle and print heading and parameters to logfile
140 $support->check_required_params(
146 # first set connection parameters for alternative db if not different from
148 map { $support->param("alt$_", $support->param($_)) unless ($support->param("alt$_")) } qw(host port user pass);
151 my $R_dba = $support->get_database('ensembl
');
152 my $R_sa = $R_dba->get_SliceAdaptor;
154 # database containing the alternative assembly
155 my $A_dba = $support->get_database('core
', 'alt
');
156 my $A_sa = $A_dba->get_SliceAdaptor;
158 my $fmt1 = "%-20s%-12s%-12s\n";
159 my $fmt2 = "%-35s%10.0f\n";
163 $support->log_stamped("Looping over toplevel seq_regions...\n");
165 foreach my $R_chr ($support->sort_chromosomes($support->get_chrlength)) {
166 $support->log_stamped("\nChromosome $R_chr...\n", 1);
168 # fetch toplevel seq_region slice and project to clones
169 my $R_slice = $R_sa->fetch_by_region('toplevel
', $R_chr);
170 my @R_clones = @{ $R_slice->project('clone
') };
172 # loop over reference clones
175 foreach my $R_seg (@R_clones) {
176 $stats{'num_clones
'}++;
177 my $R_clone = $R_seg->to_Slice;
178 my $R_clone_name = $R_clone->seq_region_name;
180 # fetch clone from alternative db and project to toplevel seq_region
181 my $A_clone = $A_sa->fetch_by_region('clone
', $R_clone_name);
183 my ($A_seg) = @{ $A_clone->project('toplevel
') };
185 my $A_slice = $A_seg->to_Slice;
187 # compare toplevel seq_region names
188 my $A_chr = $A_slice->seq_region_name;
189 unless ($R_chr eq $A_chr) {
190 push @diff, [$R_clone_name, $R_chr, $A_chr];
193 $stats{'does_not_project
'}++;
194 $support->log_verbose("Clone $R_clone_name doesn't project to toplevel seq_region.\n
", 2);
197 $stats{'not_in_alt'}++;
198 $support->log_verbose("Clone $R_clone_name not in alternative db.\n
", 2);
201 push @diff_total, @diff;
202 map { $stats_total{$_} += $stats{$_} }
203 qw(num_clones does_not_project not_in_alt);
205 # print results for this toplevel seq_region
206 $support->log("\nStats
for toplevel seq_region $R_chr:\n\n
", 2);
207 $support->log(sprintf($fmt2, "Total number of clones:
", $stats{'num_clones'}), 3);
208 $support->log(sprintf($fmt2, "Clones not in alternative db:
", $stats{'not_in_alt'}), 3);
209 $support->log(sprintf($fmt2, "Clones not in alternative assembly:
", $stats{'does_not_project'}), 3);
211 $support->log("\nClones on different toplevel seq_regions in the 2 assemblies:\n\n
", 3);
212 $support->log(sprintf($fmt1, qw(CLONE R_CHR A_CHR)), 4);
213 $support->log(('-'x44)."\n
", 4);
214 foreach my $d (@diff) {
215 $support->log(sprintf($fmt1, @{ $d }), 4);
218 $support->log("\nAll matching clones on same toplevel seq_region in the 2 assemblies.\n
", 3);
222 $support->log_stamped("Done.\n\n
");
224 # print overall results
225 $support->log("\nOverall stats:\n
");
226 $support->log(sprintf($fmt2, "Total number of clones:
", $stats_total{'num_clones'}), 1);
227 $support->log(sprintf($fmt2, "Clones not in alternative db:
", $stats_total{'not_in_alt'}), 1);
228 $support->log(sprintf($fmt2, "Clones not in alternative assembly:
", $stats_total{'does_not_project'}), 1);
230 $support->log("\nClones on different toplevel seq_regions in the 2 assemblies:\n\n
", 1);
231 $support->log(sprintf($fmt1, qw(CLONE R_CHR A_CHR)), 2);
232 $support->log(('-'x44)."\n
", 2);
233 foreach my $d (@diff_total) {
234 $support->log(sprintf($fmt1, @{ $d }), 2);
237 $support->log("\nAll clones on same toplevel seq_region in the 2 assemblies.\n
", 2);
242 $support->finish_log;