2 # See the NOTICE file distributed with this work for additional information
3 # regarding copyright ownership.
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
9 # http://www.apache.org/licenses/LICENSE-2.0
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
20 merge_cmp_gaps.pl - merge small gaps between mappings and bridge mappings without gaps
24 fix_overlaps.pl [arguments]
28 --dbname, db_name=NAME database name NAME
29 --host, --dbhost, --db_host=HOST database host HOST
30 --port, --dbport, --db_port=PORT database port PORT
31 --user, --dbuser, --db_user=USER database username USER
32 --pass, --dbpass, --db_pass=PASS database passwort PASS
33 --assembly=ASSEMBLY assembly version ASSEMBLY
35 --altassembly=ASSEMBLY alternative assembly version ASSEMBLY
36 --chromosomes, --chr=LIST only process LIST seq_regions
40 --conffile, --conf=FILE read parameters from FILE
41 (
default: conf/Conversion.ini)
43 --logfile, --log=FILE log to FILE (
default: *STDOUT)
44 --logpath=PATH write logfile to PATH (
default: .)
45 --logappend, --log_append append to logfile (
default: truncate)
47 -v, --verbose=0|1 verbose logging (
default:
false)
48 -i, --interactive=0|1
run script interactively (
default:
true)
49 -n, --dry_run, --dry=0|1 don
't write results to database
50 -h, --help, -? print help (this message)
54 The script merges adjacent assembly segments which can result from alternating
55 alignments from clone identity and blastz alignment.
60 Monika Komorowska <monika@ebi.ac.uk>, Ensembl core API team
64 Please post comments/questions to the Ensembl development list
65 <http://lists.ensembl.org/mailman/listinfo/dev>
71 no warnings 'uninitialized
';
76 use Bio::EnsEMBL::Utils::ConversionSupport;
80 my $support = new Bio::EnsEMBL::Utils::ConversionSupport("$Bin/../../..");
83 $support->parse_common_options(@_);
84 $support->parse_extra_options(
89 $support->allowed_params(
90 $support->get_common_params,
96 if ($support->param('help
') or $support->error) {
97 warn $support->error if $support->error;
101 $support->comma_to_list('chromosomes
');
103 # ask user to confirm parameters to proceed
104 $support->confirm_params;
106 # get log filehandle and print heading and parameters to logfile
109 $support->check_required_params(
115 # database connection
116 my $dba = $support->get_database('ensembl
');
117 my $dbh = $dba->dbc->db_handle;
119 my $assembly = $support->param('assembly
');
120 my $altassembly = $support->param('altassembly
');
124 FROM assembly a, seq_region sr1, seq_region sr2,
125 coord_system cs1, coord_system cs2
126 WHERE a.asm_seq_region_id = sr1.seq_region_id
127 AND a.cmp_seq_region_id = sr2.seq_region_id
128 AND sr1.coord_system_id = cs1.coord_system_id
129 AND sr2.coord_system_id = cs2.coord_system_id
130 AND cs1.version = '$assembly
'
131 AND cs2.version = '$altassembly
'
133 ORDER BY a.ori, a.cmp_start
136 my $sth = $dbh->prepare($sql);
138 my $fmt1 = "%10s %10s %10s %10s %3s\n";
141 foreach my $chr ($support->param('chromosomes
')) {
144 $support->log_stamped("\nToplevel seq_region $chr...\n");
150 # do an initial fetch
151 my $last = $sth->fetchrow_hashref;
153 # skip seq_regions for which we don't have data
155 $support->log(
"No mappings found. Skipping.\n", 1);
165 while ($last and (my $r = $sth->fetchrow_hashref)) {
166 # merge adjacent assembly segments (these can result from alternating
167 # alignments from clone identity and blastz alignment)
168 if ($last->{
'asm_end'} == ($r->{
'asm_start'} - 1) and
169 $last->{
'cmp_end'} == ($r->{
'cmp_start'} - 1)) {
174 $support->log_verbose(
'merging - last: '.sprintf($fmt1,
175 map { $last->{$_} } qw(asm_start asm_end cmp_start cmp_end ori)), 1);
176 $support->log_verbose(
'this: '.sprintf($fmt1,
map { $r->{$_} }
177 qw(asm_start asm_end cmp_start cmp_end ori)), 1);
182 # merge segments and add new row
183 $last->{
'asm_end'} = $r->{
'asm_end'};
184 $last->{
'cmp_end'} = $r->{
'cmp_end'};
190 # bridge small gaps (again, these can result from alternating alignments
191 # from clone identity and blastz alignment). A maximum gap size of 10bp is
193 my $asm_gap = $r->{
'asm_start'} - $last->{
'asm_end'} - 1;
194 my $cmp_gap = $r->{
'cmp_start'} - $last->{
'cmp_end'} - 1;
196 if ($asm_gap == $cmp_gap and $asm_gap <= 10 and $asm_gap > 0) {
201 $support->log_verbose(
'bridging - last: '.sprintf($fmt1,
202 map { $last->{$_} } qw(asm_start asm_end cmp_start cmp_end ori)), 1);
203 $support->log_verbose(
'this: '.sprintf($fmt1,
map { $r->{$_} }
204 qw(asm_start asm_end cmp_start cmp_end ori)), 1);
209 # merge segments and add new row
210 $last->{
'asm_end'} = $r->{
'asm_end'};
211 $last->{
'cmp_end'} = $r->{
'cmp_end'};
221 $support->log(
"Merged $j mappings.\n", 1);
222 $support->log(
"Bridged $k gaps.\n", 1);
224 if ((!$support->param(
'dry_run')) && ($j + $k > 0) ) {
226 # delete all current mappings from the db and insert the corrected ones
229 FROM assembly a, seq_region sr1, seq_region sr2,
230 coord_system cs1, coord_system cs2
231 WHERE a.asm_seq_region_id = sr1.seq_region_id
232 AND a.cmp_seq_region_id = sr2.seq_region_id
233 AND sr1.coord_system_id = cs1.coord_system_id
234 AND sr2.coord_system_id = cs2.coord_system_id
235 AND cs1.version =
'$assembly'
236 AND cs2.version =
'$altassembly'
237 AND sr2.name =
'$chr'
240 $support->log(
"\nDeleted $c entries from the assembly table.\n");
242 # now insert the fixed entries
243 $sql = qq(INSERT IGNORE INTO assembly VALUES (?, ?, ?, ?, ?, ?, ?));
244 my $sth1 = $dbh->prepare($sql);
246 foreach my $r (@rows) {
247 $sth1->execute(
map { $r->{$_} } qw(asm_seq_region_id cmp_seq_region_id asm_start asm_end cmp_start cmp_end ori));
250 $support->log(
"Added ".scalar(@rows).
" fixed entries to the assembly table.\n");
261 $support->finish_log;