ensembl-hive  2.8.1
find_overlaps.pl
Go to the documentation of this file.
1 #!/usr/bin/env perl
2 # See the NOTICE file distributed with this work for additional information
3 # regarding copyright ownership.
4 #
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
8 #
9 # http://www.apache.org/licenses/LICENSE-2.0
10 #
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
16 
17 
18 =head1 NAME
19 
20 find_overlaps.pl - find assembly entries with overlapping regions
21 
22 =head1 SYNOPSIS
23 
24 find_overlaps.pl [arguments]
25 
26 Required arguments:
27 
28  --dbname, db_name=NAME database name NAME
29  --host, --dbhost, --db_host=HOST database host HOST
30  --port, --dbport, --db_port=PORT database port PORT
31  --user, --dbuser, --db_user=USER database username USER
32  --pass, --dbpass, --db_pass=PASS database passwort PASS
33 
34 Optional arguments:
35 
36  --conffile, --conf=FILE read parameters from FILE
37  (default: conf/Conversion.ini)
38 
39  --logfile, --log=FILE log to FILE (default: *STDOUT)
40  --logpath=PATH write logfile to PATH (default: .)
41  --logappend, --log_append append to logfile (default: truncate)
42 
43  -v, --verbose=0|1 verbose logging (default: false)
44  -i, --interactive=0|1 run script interactively (default: true)
45  -n, --dry_run, --dry=0|1 don't write results to database
46  -h, --help, -? print help (this message)
47 
48 =head1 DESCRIPTION
49 
50 This script looks for overlapping assembly and component seq_regions in the assembly table of
51 an Ensembl core-style database. It prints the number of overlapping entries per
52 coordinate system and optionally (when run with the --verbose option) a list of
53 the overlapping assembly entries.
54 
55 The script is intended to detect and debug problems in the assembly. It
56 complements the AssemblyMultipleOverlap healthcheck.
57 
58 
59 =head1 AUTHOR
60 
61 Patrick Meidl <meidl@ebi.ac.uk>, Ensembl core API team
62 
63 =head1 CONTACT
64 
65 Please post comments/questions to the Ensembl development list
66 <http://lists.ensembl.org/mailman/listinfo/dev>
67 
68 =cut
69 
70 use strict;
71 use warnings;
72 no warnings 'uninitialized';
73 
74 use FindBin qw($Bin);
75 use Getopt::Long;
76 use Pod::Usage;
77 use Bio::EnsEMBL::Utils::ConversionSupport;
78 
79 $| = 1;
80 
81 my $support = new Bio::EnsEMBL::Utils::ConversionSupport("$Bin/../../..");
82 
83 # parse options
84 $support->parse_common_options(@_);
85 $support->parse_extra_options(
86 );
87 $support->allowed_params(
88  $support->get_common_params,
89 );
90 
91 if ($support->param('help') or $support->error) {
92  warn $support->error if $support->error;
93  pod2usage(1);
94 }
95 
96 # ask user to confirm parameters to proceed
97 $support->confirm_params;
98 
99 # get log filehandle and print heading and parameters to logfile
100 $support->init_log;
101 
102 # database connection
103 my $dba = $support->get_database('ensembl');
104 my $dbh = $dba->dbc->db_handle;
105 
106 my $sql = qq(
107  SELECT cs1.name AS asm_name, cs1.version AS asm_version,
108  cs2.name AS cmp_name, cs2.version AS cmp_version
109  FROM assembly a, seq_region sr1, seq_region sr2,
110  coord_system cs1, coord_system cs2
111  WHERE a.asm_seq_region_id = sr1.seq_region_id
112  AND sr1.coord_system_id = cs1.coord_system_id
113  AND a.cmp_seq_region_id = sr2.seq_region_id
114  AND sr2.coord_system_id = cs2.coord_system_id
115  GROUP BY asm_name, asm_version, cmp_name, cmp_version
116 );
117 
118 
119 my $fmt1 = "%6s %6s %10s %10s %10s %10s %3s %3s %3s\n";
120 
121 
122 foreach my $type ( qw(asm cmp)){
123 my $sth = $dbh->prepare($sql);
124 $sth->execute;
125 while (my ($asm_name, $asm_version, $cmp_name, $cmp_version) =
126  $sth->fetchrow_array) {
127 
128  my $sql1 = qq(
129  SELECT a.*, sr1.name as asm_sr_name, sr2.name as cmp_sr_name
130  FROM assembly a, seq_region sr1, seq_region sr2,
131  coord_system cs1, coord_system cs2
132  WHERE a.asm_seq_region_id = sr1.seq_region_id
133  AND a.cmp_seq_region_id = sr2.seq_region_id
134  AND sr1.coord_system_id = cs1.coord_system_id
135  AND sr2.coord_system_id = cs2.coord_system_id
136  AND cs1.name = '$asm_name'
137  AND cs2.name = '$cmp_name'
138  );
139 
140  if ($asm_version) {
141  $sql1 .= " AND cs1.version = '$asm_version'";
142  } else {
143  $sql1 .= " AND cs1.version IS NULL";
144  $asm_version = 'NULL';
145  }
146 
147  if ($cmp_version) {
148  $sql1 .= " AND cs2.version = '$cmp_version'";
149  } else {
150  $sql1 .= " AND cs2.version IS NULL";
151  $cmp_version = 'NULL';
152  }
153 
154 
155  $sql1 .= " ORDER BY a.".$type."_seq_region_id, a.".$type."_start ASC, a.".$type."_end";
156 
157  $support->log_stamped("$asm_name.$asm_version $cmp_name.$cmp_version\n");
158 
159  my $sth1 = $dbh->prepare($sql1);
160  $sth1->execute;
161 
162  # do an initial fetch
163  my $last = $sth1->fetchrow_hashref;
164 # foreach my $key (qw(asm_seq_region_id asm_sr_name cmp_seq_region_id cmp_sr_name asm_start asm_end cmp_start cmp_end ori)){
165 # print $key." = ".$last->{$key}."\t";
166 # }
167 # print "\n";
168  my $i = 0;
169 
170  $support->log_verbose($type . ' overlaps:\n');
171  while ($last and (my $r = $sth1->fetchrow_hashref)) {
172 
173 # foreach my $key (qw(asm_seq_region_id asm_sr_name cmp_seq_region_id cmp_sr_name asm_start asm_end cmp_start cmp_end ori)){
174 # print $key." = ".$r->{$key}."\t";
175 # }
176 # print "\n";
177 
178  # look for overlaps with last segment
179  if ($last->{$type.'_seq_region_id'} == $r->{$type.'_seq_region_id'} and
180  $last->{$type.'_end'} >= $r->{$type.'_start'}) {
181 
182  $i++;
183 
184  # debug warnings
185  $support->log_verbose('last:'.sprintf($fmt1, map { $last->{$_} }
186  qw(asm_seq_region_id asm_sr_name cmp_seq_region_id cmp_sr_name
187  asm_start asm_end cmp_start cmp_end ori)), 1);
188  $support->log_verbose('this:'.sprintf($fmt1, map { $r->{$_} }
189  qw(asm_seq_region_id asm_sr_name cmp_seq_region_id cmp_sr_name
190  asm_start asm_end cmp_start cmp_end ori))."\n", 1);
191 
192  }
193 
194  $last = $r;
195  }
196 
197  $sth1->finish;
198 
199  $support->log("\nFound $i overlaps.\n\n", 1);
200 }
201 $sth->finish;
202 }
203 
204 # finish logfile
205 $support->finish_log;
206 
run
public run()