ensembl-hive  2.7.0
compare_assemblies.pl
Go to the documentation of this file.
1 #!/usr/bin/env perl
2 # See the NOTICE file distributed with this work for additional information
3 # regarding copyright ownership.
4 #
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
8 #
9 # http://www.apache.org/licenses/LICENSE-2.0
10 #
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
16 
17 
18 =head1 NAME
19 
20 compare_assemblies.pl - compare two assemblies
21 
22 =head1 SYNOPSIS
23 
24 compare_assemblies.pl [arguments]
25 
26 Required arguments:
27 
28  --dbname, db_name=NAME database name NAME
29  --host, --dbhost, --db_host=HOST database host HOST
30  --port, --dbport, --db_port=PORT database port PORT
31  --user, --dbuser, --db_user=USER database username USER
32  --pass, --dbpass, --db_pass=PASS database passwort PASS
33  --assembly=ASSEMBLY assembly version ASSEMBLY
34 
35  --altdbname=NAME alternative database NAME
36  --altassembly=ASSEMBLY alternative assembly version ASSEMBLY
37 
38 Optional arguments:
39 
40  --althost=HOST alternative databases host HOST
41  --altport=PORT alternative database port PORT
42  --altuser=USER alternative database username USER
43  --altpass=PASS alternative database password PASS
44 
45  --chromosomes, --chr=LIST only process LIST toplevel seq_regions
46 
47  --conffile, --conf=FILE read parameters from FILE
48  (default: conf/Conversion.ini)
49 
50  --logfile, --log=FILE log to FILE (default: *STDOUT)
51  --logpath=PATH write logfile to PATH (default: .)
52  --logappend, --log_append append to logfile (default: truncate)
53 
54  -v, --verbose=0|1 verbose logging (default: false)
55  -i, --interactive=0|1 run script interactively (default: true)
56  -n, --dry_run, --dry=0|1 don't write results to database
57  -h, --help, -? print help (this message)
58 
59 =head1 DESCRIPTION
60 
61 This script compares two assemblies. At the moment all it does is to list
62 clones that are on different toplevel seq_regions in the two assemblies.
63 
64 =head1 RELATED FILES
65 
66 The whole process of creating a whole genome alignment between two assemblies
67 is done by a series of scripts. Please see
68 
69  ensembl/misc-scripts/assembly/README
70 
71 for a high-level description of this process, and POD in the individual scripts
72 for the details.
73 
74 
75 =head1 AUTHOR
76 
77 Patrick Meidl <meidl@ebi.ac.uk>, Ensembl core API team
78 
79 =head1 CONTACT
80 
81 Please post comments/questions to the Ensembl development list
82 <http://lists.ensembl.org/mailman/listinfo/dev>
83 
84 =cut
85 
86 use strict;
87 use warnings;
88 no warnings 'uninitialized';
89 
90 use FindBin qw($Bin);
91 use vars qw($SERVERROOT);
92 
93 BEGIN {
94  $SERVERROOT = "$Bin/../../..";
95  unshift(@INC, "$SERVERROOT/ensembl/modules");
96  unshift(@INC, "$SERVERROOT/bioperl-live");
97 }
98 
99 use Getopt::Long;
100 use Pod::Usage;
101 use Bio::EnsEMBL::Utils::ConversionSupport;
102 
103 $| = 1;
104 
105 my $support = new Bio::EnsEMBL::Utils::ConversionSupport($SERVERROOT);
106 
107 # parse options
108 $support->parse_common_options(@_);
109 $support->parse_extra_options(
110  'assembly=s',
111  'altdbname=s',
112  'altassembly=s',
113  'althost=s',
114  'altport=n',
115  'chromosomes|chr=s@',
116 );
117 $support->allowed_params(
118  $support->get_common_params,
119  'assembly',
120  'altdbname',
121  'altassembly',
122  'althost',
123  'altport',
124  'chromosomes',
125 );
126 
127 if ($support->param('help') or $support->error) {
128  warn $support->error if $support->error;
129  pod2usage(1);
130 }
131 
132 $support->comma_to_list('chromosomes');
133 
134 # ask user to confirm parameters to proceed
135 $support->confirm_params;
136 
137 # get log filehandle and print heading and parameters to logfile
138 $support->init_log;
139 
140 $support->check_required_params(
141  'assembly',
142  'altdbname',
143  'altassembly'
144 );
145 
146 # first set connection parameters for alternative db if not different from
147 # reference db
148 map { $support->param("alt$_", $support->param($_)) unless ($support->param("alt$_")) } qw(host port user pass);
149 
150 # reference database
151 my $R_dba = $support->get_database('ensembl');
152 my $R_sa = $R_dba->get_SliceAdaptor;
153 
154 # database containing the alternative assembly
155 my $A_dba = $support->get_database('core', 'alt');
156 my $A_sa = $A_dba->get_SliceAdaptor;
157 
158 my $fmt1 = "%-20s%-12s%-12s\n";
159 my $fmt2 = "%-35s%10.0f\n";
160 my @diff_total;
161 my %stats_total;
162 
163 $support->log_stamped("Looping over toplevel seq_regions...\n");
164 
165 foreach my $R_chr ($support->sort_chromosomes($support->get_chrlength)) {
166  $support->log_stamped("\nChromosome $R_chr...\n", 1);
167 
168  # fetch toplevel seq_region slice and project to clones
169  my $R_slice = $R_sa->fetch_by_region('toplevel', $R_chr);
170  my @R_clones = @{ $R_slice->project('clone') };
171 
172  # loop over reference clones
173  my @diff;
174  my %stats;
175  foreach my $R_seg (@R_clones) {
176  $stats{'num_clones'}++;
177  my $R_clone = $R_seg->to_Slice;
178  my $R_clone_name = $R_clone->seq_region_name;
179 
180  # fetch clone from alternative db and project to toplevel seq_region
181  my $A_clone = $A_sa->fetch_by_region('clone', $R_clone_name);
182  if ($A_clone) {
183  my ($A_seg) = @{ $A_clone->project('toplevel') };
184  if ($A_seg) {
185  my $A_slice = $A_seg->to_Slice;
186 
187  # compare toplevel seq_region names
188  my $A_chr = $A_slice->seq_region_name;
189  unless ($R_chr eq $A_chr) {
190  push @diff, [$R_clone_name, $R_chr, $A_chr];
191  }
192  } else {
193  $stats{'does_not_project'}++;
194  $support->log_verbose("Clone $R_clone_name doesn't project to toplevel seq_region.\n", 2);
195  }
196  } else {
197  $stats{'not_in_alt'}++;
198  $support->log_verbose("Clone $R_clone_name not in alternative db.\n", 2);
199  }
200  }
201  push @diff_total, @diff;
202  map { $stats_total{$_} += $stats{$_} }
203  qw(num_clones does_not_project not_in_alt);
204 
205  # print results for this toplevel seq_region
206  $support->log("\nStats for toplevel seq_region $R_chr:\n\n", 2);
207  $support->log(sprintf($fmt2, "Total number of clones:", $stats{'num_clones'}), 3);
208  $support->log(sprintf($fmt2, "Clones not in alternative db:", $stats{'not_in_alt'}), 3);
209  $support->log(sprintf($fmt2, "Clones not in alternative assembly:", $stats{'does_not_project'}), 3);
210  if (@diff) {
211  $support->log("\nClones on different toplevel seq_regions in the 2 assemblies:\n\n", 3);
212  $support->log(sprintf($fmt1, qw(CLONE R_CHR A_CHR)), 4);
213  $support->log(('-'x44)."\n", 4);
214  foreach my $d (@diff) {
215  $support->log(sprintf($fmt1, @{ $d }), 4);
216  }
217  } else {
218  $support->log("\nAll matching clones on same toplevel seq_region in the 2 assemblies.\n", 3);
219  }
220 }
221 
222 $support->log_stamped("Done.\n\n");
223 
224 # print overall results
225 $support->log("\nOverall stats:\n");
226 $support->log(sprintf($fmt2, "Total number of clones:", $stats_total{'num_clones'}), 1);
227 $support->log(sprintf($fmt2, "Clones not in alternative db:", $stats_total{'not_in_alt'}), 1);
228 $support->log(sprintf($fmt2, "Clones not in alternative assembly:", $stats_total{'does_not_project'}), 1);
229 if (@diff_total) {
230  $support->log("\nClones on different toplevel seq_regions in the 2 assemblies:\n\n", 1);
231  $support->log(sprintf($fmt1, qw(CLONE R_CHR A_CHR)), 2);
232  $support->log(('-'x44)."\n", 2);
233  foreach my $d (@diff_total) {
234  $support->log(sprintf($fmt1, @{ $d }), 2);
235  }
236 } else {
237  $support->log("\nAll clones on same toplevel seq_region in the 2 assemblies.\n", 2);
238 }
239 $support->log("\n");
240 
241 # finish logfile
242 $support->finish_log;
243 
run
public run()
compare
public compare()