2 # See the NOTICE file distributed with this work for additional information
3 # regarding copyright ownership.
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
9 # http://www.apache.org/licenses/LICENSE-2.0
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
20 fix_overlaps.pl - remove overlapping mappings in the assembly coordinates (asm_start - asm_end) between two closely related assemblies
24 fix_overlaps.pl [arguments]
28 --dbname, db_name=NAME database name NAME
29 --host, --dbhost, --db_host=HOST database host HOST
30 --port, --dbport, --db_port=PORT database port PORT
31 --user, --dbuser, --db_user=USER database username USER
32 --pass, --dbpass, --db_pass=PASS database passwort PASS
33 --assembly=ASSEMBLY assembly version ASSEMBLY
35 --altassembly=ASSEMBLY alternative assembly version ASSEMBLY
36 --chromosomes, --chr=LIST only process LIST seq_regions
40 --conffile, --conf=FILE read parameters from FILE
41 (
default: conf/Conversion.ini)
43 --logfile, --log=FILE log to FILE (
default: *STDOUT)
44 --logpath=PATH write logfile to PATH (
default: .)
45 --logappend, --log_append append to logfile (
default: truncate)
47 -v, --verbose=0|1 verbose logging (
default:
false)
48 -i, --interactive=0|1
run script interactively (
default:
true)
49 -n, --dry_run, --dry=0|1 don
't write results to database
50 -h, --help, -? print help (this message)
54 This script removes overlapping mappings from the assembly coordinates.
55 Mappings are trimmed so that such overlaps don't
break the
AssemblyMapper when
56 projecting between the reference and non-reference assemblies.
58 It also merges adjacent assembly segments which can result from alternating
59 alignments from clone identity and blastz alignment.
63 The whole process of creating a whole genome alignment between two assemblies
64 is done by a series of scripts. Please see
66 ensembl/misc-scripts/assembly/README
68 for a high-level description of
this process, and POD in the individual scripts
74 Patrick Meidl <meidl@ebi.ac.uk>, Ensembl core API team
78 Please post comments/questions to the Ensembl development list
85 no warnings
'uninitialized';
97 $support->parse_common_options(@_);
98 $support->parse_extra_options(
101 'chromosomes|chr=s@',
103 $support->allowed_params(
104 $support->get_common_params,
110 if ($support->param(
'help') or $support->error) {
111 warn $support->error
if $support->error;
115 $support->comma_to_list(
'chromosomes');
117 # ask user to confirm parameters to proceed
118 $support->confirm_params;
120 # get log filehandle and print heading and parameters to logfile
123 $support->check_required_params(
129 # database connection
130 my $dba = $support->get_database(
'ensembl');
131 my $dbh = $dba->dbc->db_handle;
133 my $assembly = $support->param(
'assembly');
134 my $altassembly = $support->param(
'altassembly');
138 FROM assembly a, seq_region sr1, seq_region sr2,
139 coord_system cs1, coord_system cs2
140 WHERE a.asm_seq_region_id = sr1.seq_region_id
141 AND a.cmp_seq_region_id = sr2.seq_region_id
142 AND sr1.coord_system_id = cs1.coord_system_id
143 AND sr2.coord_system_id = cs2.coord_system_id
144 AND cs1.version =
'$assembly'
145 AND cs2.version =
'$altassembly'
150 my $sth = $dbh->prepare($sql);
152 my $fmt1 =
"%10s %10s %10s %10s %3s\n";
155 foreach my $chr ($support->param(
'chromosomes')) {
156 $support->log_stamped(
"\nToplevel seq_region $chr...\n");
162 # do an initial fetch
163 my $last = $sth->fetchrow_hashref;
165 # skip seq_regions for which we don't have data
167 $support->log(
"No mappings found. Skipping.\n", 1);
177 while ($last and (my $r = $sth->fetchrow_hashref)) {
178 # merge adjacent assembly segments (these can result from alternating
179 # alignments from clone identity and blastz alignment)
180 if ($last->{
'asm_end'} == ($r->{
'asm_start'} - 1) and
181 $last->{
'cmp_end'} == ($r->{
'cmp_start'} - 1)) {
186 $support->log_verbose(
'merging - last: '.sprintf($fmt1,
187 map { $last->{$_} } qw(asm_start asm_end cmp_start cmp_end ori)), 1);
188 $support->log_verbose(
'this: '.sprintf($fmt1,
map { $r->{$_} }
189 qw(asm_start asm_end cmp_start cmp_end ori)), 1);
194 # merge segments and add new row
195 $last->{
'asm_end'} = $r->{
'asm_end'};
196 $last->{
'cmp_end'} = $r->{
'cmp_end'};
202 # bridge small gaps (again, these can result from alternating alignments
203 # from clone identity and blastz alignment). A maximum gap size of 10bp is
205 my $asm_gap = $r->{
'asm_start'} - $last->{
'asm_end'} - 1;
206 my $cmp_gap = $r->{
'cmp_start'} - $last->{
'cmp_end'} - 1;
208 if ($asm_gap == $cmp_gap and $asm_gap <= 10 and $asm_gap > 0) {
213 $support->log_verbose(
'bridging - last: '.sprintf($fmt1,
214 map { $last->{$_} } qw(asm_start asm_end cmp_start cmp_end ori)), 1);
215 $support->log_verbose(
'this: '.sprintf($fmt1,
map { $r->{$_} }
216 qw(asm_start asm_end cmp_start cmp_end ori)), 1);
221 # merge segments and add new row
222 $last->{
'asm_end'} = $r->{
'asm_end'};
223 $last->{
'cmp_end'} = $r->{
'cmp_end'};
229 # look for overlaps with last segment
230 if ($last->{
'asm_end'} >= $r->{
'asm_start'}) {
235 $support->log_verbose(
'last: '.sprintf($fmt1,
map { $last->{$_} }
236 qw(asm_start asm_end cmp_start cmp_end ori)), 1);
237 $support->log_verbose(
'before: '.sprintf($fmt1,
map { $r->{$_} }
238 qw(asm_start asm_end cmp_start cmp_end ori)), 1);
240 # skip if this segment ends before the last one
241 if ($r->{
'asm_end'} <= $last->{
'asm_end'}) {
242 $support->log_verbose(
"skipped\n\n", 1);
246 my $overlap = $last->{
'asm_end'} - $r->{
'asm_start'} + 1;
248 $r->{
'asm_start'} += $overlap;
250 if ($r->{
'ori'} == -1) {
251 $r->{
'cmp_end'} -= $overlap;
253 $r->{
'cmp_start'} += $overlap;
256 $support->log_verbose(
'after: '.sprintf($fmt1,
map { $r->{$_} }
257 qw(asm_start asm_end cmp_start cmp_end ori)).
"\n", 1);
264 $support->log(
"Fixed $i mappings.\n", 1);
265 $support->log(
"Merged $j mappings.\n", 1);
266 $support->log(
"Bridged $k gaps.\n", 1);
270 if (!$support->param(
'dry_run') && ($i > 0 || $j > 0 || $k > 0 )) {
272 # delete all current mappings from the db and insert the corrected ones
275 FROM assembly a, seq_region sr1, seq_region sr2,
276 coord_system cs1, coord_system cs2
277 WHERE a.asm_seq_region_id = sr1.seq_region_id
278 AND a.cmp_seq_region_id = sr2.seq_region_id
279 AND sr1.coord_system_id = cs1.coord_system_id
280 AND sr2.coord_system_id = cs2.coord_system_id
281 AND cs1.version =
'$assembly'
282 AND cs2.version =
'$altassembly'
283 AND sr1.name =
'$chr'
286 $support->log(
"\nDeleted $c entries from the assembly table.\n");
288 # now insert the fixed entries
289 $sql = qq(INSERT INTO assembly VALUES (?, ?, ?, ?, ?, ?, ?));
290 my $sth1 = $dbh->prepare($sql);
292 foreach my $r (@rows) {
293 $sth1->execute(
map { $r->{$_} } qw(asm_seq_region_id cmp_seq_region_id asm_start asm_end cmp_start cmp_end ori));
296 $support->log(
"Added ".scalar(@rows).
" fixed entries to the assembly table.\n");
305 $support->finish_log;