ensembl-hive  2.8.1
compare_results.pl
Go to the documentation of this file.
1 #!/usr/bin/env perl
2 # See the NOTICE file distributed with this work for additional information
3 # regarding copyright ownership.
4 #
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
8 #
9 # http://www.apache.org/licenses/LICENSE-2.0
10 #
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
16 
17 
18 =head1 NAME
19 
20 
21 
22 =head1 SYNOPSIS
23 
24 run_all.pl [arguments]
25 
26 Required arguments:
27 
28  --dbname, db_name=NAME database name NAME
29  --host, --dbhost, --db_host=HOST database host HOST
30  --port, --dbport, --db_port=PORT database port PORT
31  --user, --dbuser, --db_user=USER database username USER
32  --pass, --dbpass, --db_pass=PASS database passwort PASS
33 
34 Optional arguments:
35 
36  --conffile, --conf=FILE read parameters from FILE
37  (default: conf/Conversion.ini)
38 
39  --logfile, --log=FILE log to FILE (default: *STDOUT)
40  --logpath=PATH write logfile to PATH (default: .)
41  --logappend, --log_append append to logfile (default: truncate)
42  --loglevel=LEVEL define log level (default: INFO)
43 
44  -i, --interactive=0|1 run script interactively (default: true)
45  -n, --dry_run, --dry=0|1 don't write results to database
46  -h, --help, -? print help (this message)
47 
48 =head1 DESCRIPTION
49 
50 
51 
52 =head1 AUTHOR
53 
54 Patrick Meidl <meidl@ebi.ac.uk>, Ensembl core API team
55 
56 =head1 CONTACT
57 
58 Please post comments/questions to the Ensembl development list
59 <http://lists.ensembl.org/mailman/listinfo/dev>
60 
61 =cut
62 
63 use strict;
64 use warnings;
65 no warnings 'uninitialized';
66 
67 use FindBin qw($Bin);
68 use Bio::EnsEMBL::Utils::ConfParser;
69 use Bio::EnsEMBL::Utils::Logger;
70 use Bio::EnsEMBL::Utils::ScriptUtils qw(path_append);
71 use Bio::EnsEMBL::DBSQL::DBAdaptor;
72 
73 # parse configuration and commandline arguments
74 my $conf = new Bio::EnsEMBL::Utils::ConfParser(
75  -SERVERROOT => "$Bin/../../../..",
76  -DEFAULT_CONF => "$Bin/../default.conf"
77 );
78 
79 $conf->parse_options(
80  'sourcehost|source_host=s' => 1,
81  'sourceport|source_port=n' => 1,
82  'sourceuser|source_user=s' => 1,
83  'sourcepass|source_pass=s' => 0,
84  'sourcedbname|source_dbname=s' => 1,
85  'targethost|target_host=s' => 1,
86  'targetport|target_port=n' => 1,
87  'targetuser|target_user=s' => 1,
88  'targetpass|target_pass=s' => 0,
89  'targetdbname|target_dbname=s' => 1,
90  'althost|alt_host=s' => 0,
91  'altport|alt_port=n' => 0,
92  'altuser|alt_user=s' => 0,
93  'altpass|alt_pass=s' => 0,
94  'altdbname|alt_dbname=s' => 1,
95  'basedir|basedir=s' => 1,
96  'lsf!' => 0,
97  'lsf_opt|lsfopt=s' => 0,
98  'suffix|sfx=s' => 0,
99  'debug1|d1=s' => 1,
100  'debug2|d2=s' => 1,
101  'type|t=s' => 0,
102 );
103 
104 # set default logpath
105 unless ($conf->param('logpath')) {
106  $conf->param('logpath', path_append($conf->param('basedir'), 'log'));
107 }
108 
109 # assume both dbs are on same host unless specified otherwise
110 foreach my $p (qw(host port user pass)) {
111  unless ($conf->param("alt$p")) {
112  $conf->param("alt$p", $conf->param("target$p"));
113  }
114 }
115 
116 # get log filehandle and print header and parameters to logfile
117 my $logger = new Bio::EnsEMBL::Utils::Logger(
118  -LOGFILE => $conf->param('logfile'),
119  -LOGAUTO => $conf->param('logauto'),
120  -LOGAUTOBASE => 'compare_results',
121  -LOGPATH => $conf->param('logpath'),
122  -LOGAPPEND => $conf->param('logappend'),
123  -LOGLEVEL => $conf->param('loglevel'),
124 );
125 
126 # if user wants to run via lsf, submit script with bsub (this will exit this
127 # instance of the script)
128 &bsubmit if ($conf->param('lsf'));
129 
130 # initialise log
131 $logger->init_log($conf->list_param_values);
132 
133 # connect to dbs
134 my $dba_s = new Bio::EnsEMBL::DBSQL::DBAdaptor(
135  -host => $conf->param("sourcehost"),
136  -port => $conf->param("sourceport"),
137  -user => $conf->param("sourceuser"),
138  -pass => $conf->param("sourcepass"),
139  -dbname => $conf->param("sourcedbname"),
140  -group => 'source',
141 );
142 $dba_s->dnadb($dba_s);
143 
144 my $dba1 = new Bio::EnsEMBL::DBSQL::DBAdaptor(
145  -host => $conf->param("targethost"),
146  -port => $conf->param("targetport"),
147  -user => $conf->param("targetuser"),
148  -pass => $conf->param("targetpass"),
149  -dbname => $conf->param("targetdbname"),
150  -group => 'target',
151 );
152 $dba1->dnadb($dba1);
153 my $dbh1 = $dba1->dbc->db_handle;
154 
155 my $dba2 = new Bio::EnsEMBL::DBSQL::DBAdaptor(
156  -host => $conf->param("althost"),
157  -port => $conf->param("altport"),
158  -user => $conf->param("altuser"),
159  -pass => $conf->param("altpass"),
160  -dbname => $conf->param("altdbname"),
161  -group => 'alt',
162 );
163 $dba2->dnadb($dba2);
164 my $dbh2 = $dba2->dbc->db_handle;
165 
166 # compare mapping results
167 my $type = $conf->param('type') || 'gene';
168 my $function = "compare_${type}s";
169 no strict 'refs';
170 &$function;
171 
172 # finish logfile
173 $logger->finish_log;
174 
175 ### END main ###
176 
177 
178 sub compare_genes {
179  $logger->info("Comparing genes...\n\n", 0, 'stamped');
180  &compare_features('gene');
181  $logger->info("Done\n\n");
182 }
183 
184 
185 sub compare_transcripts {
186  $logger->info("Comparing transcripts...\n\n", 0, 'stamped');
187  &compare_features('transcript');
188  $logger->info("Done\n\n");
189 }
190 
191 
192 sub compare_translations {
193  $logger->info("Comparing translations...\n\n", 0, 'stamped');
194  &compare_features('translation');
195  $logger->info("Done\n\n");
196 }
197 
198 
199 sub compare_exons {
200  $logger->info("Comparing exons...\n\n", 0, 'stamped');
201  &compare_features('exon');
202  $logger->info("Done\n\n");
203 }
204 
205 
206 sub compare_features {
207  my $ftype = shift;
208 
209  # get a filehandle to write results for debugging
210  my $path = path_append($conf->param('basedir'), 'debug');
211  my $file = "$path/${ftype}_diff.txt";
212  open(my $fh, '>', $file) or die "Can't open $file for writing: $!\n";
213 
214  # read scores from files
215  my $scores = {};
216  unless ($ftype eq 'translation') {
217  foreach my $path1 (qw(debug1 debug2)) {
218  my $p1 = $conf->param($path1);
219  my $file1 = "$p1/${ftype}_scores.txt";
220  open(my $fh1, '<', $file1) or die "Can't open $file1 for reading: $!\n";
221 
222  while (my $line = <$fh1>) {
223  chomp $line;
224  my ($old_id, $new_id, $score) = split(/\s+/, $line);
225  $score = sprintf("%.6f", $score);
226 
227  # remember the highest score for each new_id
228  if ($score > $scores->{$path1}->{$new_id}) {
229  $scores->{$path1}->{$new_id} = $score;
230  }
231  }
232 
233  close($fh1);
234  }
235  }
236 
237  #
238  # fetch all features from both runs and create lookup hash by stable_id
239  #
240  $logger->info("Fetching ${ftype} data from dbs...\n", 0, 'stamped');
241 
242  # db 2
243  my $sql1 = qq(SELECT ${ftype}_id, stable_id FROM ${ftype}_stable_id);
244  my $sth1 = $dbh1->prepare($sql1);
245  $sth1->execute;
246 
247  my %fsi1 = ();
248  my %fii1 = ();
249 
250  while (my $r = $sth1->fetchrow_arrayref) {
251  # create lookup hashes of dbID to stable ID and vice versa
252  $fsi1{$r->[1]} = $r->[0];
253  $fii1{$r->[0]} = $r->[1];
254  }
255 
256  $sth1->finish;
257 
258  # db 2
259  my $suffix = $conf->param('suffix');
260  my $sql2 = qq(SELECT ${ftype}_id, stable_id FROM ${ftype}_stable_id${suffix});
261  my $sth2 = $dbh2->prepare($sql2);
262  $sth2->execute;
263 
264  my %fsi2 = ();
265  my %fii2 = ();
266 
267  while (my $r = $sth2->fetchrow_arrayref) {
268  # create lookup hashes of dbID to stable ID and vice versa
269  $fsi2{$r->[1]} = $r->[0];
270  $fii2{$r->[0]} = $r->[1];
271  }
272 
273  $sth2->finish;
274 
275  $logger->info("Done.\n\n", 0, 'stamped');
276 
277  #
278  # get max(gene_stable_id) from source db
279  #
280  my $dbh = $dba_s->dbc->db_handle;
281  my $sql = qq(SELECT max(stable_id) FROM ${ftype}_stable_id);
282  my $sth = $dbh->prepare($sql);
283  $sth->execute;
284  my ($max_stable_id) = $sth->fetchrow_array;
285  $sth->finish;
286 
287  #
288  # now loop over dbIDs in db 1 and compare results with db2
289  #
290  $logger->info("Comparing results...\n", 0, 'stamped');
291 
292  my @stat_keys = qw(TOT NN II NE EN EE);
293  my %stats = map { $_ => 0 } @stat_keys;
294  my $fmt = "%-3s%6d %-20s %-20s %-10s %-10s\n";
295 
296  foreach my $dbID1 (sort { $a <=> $b } keys %fii1) {
297  $stats{TOT}++;
298  my $status;
299 
300  my $sid1 = $fii1{$dbID1};
301  my $sid2 = $fii2{$dbID1};
302 
303  # db 1 has new stable ID
304  if (($max_stable_id cmp $sid1) == -1) {
305 
306  # db 2 has new stable ID too
307  if (($max_stable_id cmp $sid2) == -1) {
308  $status = 'NN';
309 
310  # db 2 reuses an existing stable ID
311  } else {
312  $status = 'NE';
313  }
314 
315  # else db 1 reused an existing stable ID
316  } else {
317 
318  # db 2 uses the same stable ID
319  if ($sid1 eq $sid2) {
320  $status = 'II';
321 
322  # db 2 has a new stable ID
323  } elsif (($max_stable_id cmp $sid2) == -1) {
324  $status = 'EN';
325 
326  # db 2 reuses an existing (but different from db 1) stable ID
327  } else {
328  $status = 'EE';
329  }
330  }
331 
332  # stats
333  $stats{$status}++;
334 
335  # print result line (status dbID sid1 sid2)
336  print $fh sprintf($fmt, $status, $dbID1, $sid1, $sid2,
337  $scores->{'debug1'}->{$dbID1}, $scores->{'debug2'}->{$dbID1});
338  }
339 
340  close($fh);
341 
342  $logger->info("Done.\n\n", 0, 'stamped');
343 
344  # print stats
345  $logger->info("Stats:\n");
346  foreach my $key (@stat_keys) {
347  $logger->info(sprintf(" %-5s%8d (%6s)\n", $key, $stats{$key}, sprintf("%3.1f%%", 100*$stats{$key}/$stats{'TOT'})));
348  }
349 
350 }
351 
352 
353 sub bsubmit {
354  #
355  # build bsub commandline
356  #
357 
358  # automatically create a filename for lsf output
359  my $cmd = 'bsub -o '.$conf->param('logpath');
360  $cmd .= "/lsf_compare_".$logger->log_auto_id.'.out';
361 
362  # add extra lsf options as configured by the user
363  $cmd .= ' '.$conf->param('lsf_opt');
364 
365  # this script's name
366  $cmd .= " $0";
367 
368  # options for this script
369  my $options = $conf->create_commandline_options(
370  logautoid => $logger->log_auto_id,
371  interactive => 0,
372  lsf => 0,
373  );
374  $cmd .= " $options";
375 
376  #
377  # execute bsub
378  #
379  print "\nRe-executing via lsf:\n";
380  print "$cmd\n\n";
381 
382  exec($cmd) or die "Could not exec $0 via lsf: $!\n";
383  #exit;
384 }
385 
transcript
public transcript()
exon
public exon()
debug
public debug()
run
public run()