ensembl-hive  2.7.0
merge_cmp_gaps.pl
Go to the documentation of this file.
1 #!/usr/bin/env perl
2 # See the NOTICE file distributed with this work for additional information
3 # regarding copyright ownership.
4 #
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
8 #
9 # http://www.apache.org/licenses/LICENSE-2.0
10 #
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
16 
17 
18 =head1 NAME
19 
20 merge_cmp_gaps.pl - merge small gaps between mappings and bridge mappings without gaps
21 
22 =head1 SYNOPSIS
23 
24 fix_overlaps.pl [arguments]
25 
26 Required arguments:
27 
28  --dbname, db_name=NAME database name NAME
29  --host, --dbhost, --db_host=HOST database host HOST
30  --port, --dbport, --db_port=PORT database port PORT
31  --user, --dbuser, --db_user=USER database username USER
32  --pass, --dbpass, --db_pass=PASS database passwort PASS
33  --assembly=ASSEMBLY assembly version ASSEMBLY
34 
35  --altassembly=ASSEMBLY alternative assembly version ASSEMBLY
36  --chromosomes, --chr=LIST only process LIST seq_regions
37 
38 Optional arguments:
39 
40  --conffile, --conf=FILE read parameters from FILE
41  (default: conf/Conversion.ini)
42 
43  --logfile, --log=FILE log to FILE (default: *STDOUT)
44  --logpath=PATH write logfile to PATH (default: .)
45  --logappend, --log_append append to logfile (default: truncate)
46 
47  -v, --verbose=0|1 verbose logging (default: false)
48  -i, --interactive=0|1 run script interactively (default: true)
49  -n, --dry_run, --dry=0|1 don't write results to database
50  -h, --help, -? print help (this message)
51 
52 =head1 DESCRIPTION
53 
54 The script merges adjacent assembly segments which can result from alternating
55 alignments from clone identity and blastz alignment.
56 
57 
58 =head1 AUTHOR
59 
60 Monika Komorowska <monika@ebi.ac.uk>, Ensembl core API team
61 
62 =head1 CONTACT
63 
64 Please post comments/questions to the Ensembl development list
65 <http://lists.ensembl.org/mailman/listinfo/dev>
66 
67 =cut
68 
69 use strict;
70 use warnings;
71 no warnings 'uninitialized';
72 
73 use FindBin qw($Bin);
74 use Getopt::Long;
75 use Pod::Usage;
76 use Bio::EnsEMBL::Utils::ConversionSupport;
77 
78 $| = 1;
79 
80 my $support = new Bio::EnsEMBL::Utils::ConversionSupport("$Bin/../../..");
81 
82 # parse options
83 $support->parse_common_options(@_);
84 $support->parse_extra_options(
85  'assembly=s',
86  'altassembly=s',
87  'chromosomes|chr=s@',
88 );
89 $support->allowed_params(
90  $support->get_common_params,
91  'assembly',
92  'altassembly',
93  'chromosomes',
94 );
95 
96 if ($support->param('help') or $support->error) {
97  warn $support->error if $support->error;
98  pod2usage(1);
99 }
100 
101 $support->comma_to_list('chromosomes');
102 
103 # ask user to confirm parameters to proceed
104 $support->confirm_params;
105 
106 # get log filehandle and print heading and parameters to logfile
107 $support->init_log;
108 
109 $support->check_required_params(
110  'assembly',
111  'altassembly',
112  'chromosomes'
113 );
114 
115 # database connection
116 my $dba = $support->get_database('ensembl');
117 my $dbh = $dba->dbc->db_handle;
118 
119 my $assembly = $support->param('assembly');
120 my $altassembly = $support->param('altassembly');
121 
122 my $sql = qq(
123  SELECT a.*
124  FROM assembly a, seq_region sr1, seq_region sr2,
125  coord_system cs1, coord_system cs2
126  WHERE a.asm_seq_region_id = sr1.seq_region_id
127  AND a.cmp_seq_region_id = sr2.seq_region_id
128  AND sr1.coord_system_id = cs1.coord_system_id
129  AND sr2.coord_system_id = cs2.coord_system_id
130  AND cs1.version = '$assembly'
131  AND cs2.version = '$altassembly'
132  AND sr2.name = ?
133  ORDER BY a.ori, a.cmp_start
134 );
135 
136 my $sth = $dbh->prepare($sql);
137 
138 my $fmt1 = "%10s %10s %10s %10s %3s\n";
139 
140 
141 foreach my $chr ($support->param('chromosomes')) {
142 
143 
144  $support->log_stamped("\nToplevel seq_region $chr...\n");
145 
146  $sth->execute($chr);
147 
148  my @rows = ();
149 
150  # do an initial fetch
151  my $last = $sth->fetchrow_hashref;
152 
153  # skip seq_regions for which we don't have data
154  unless ($last) {
155  $support->log("No mappings found. Skipping.\n", 1);
156  next;
157  }
158 
159  push @rows, $last;
160 
161  my $i = 0;
162  my $j = 0;
163  my $k = 0;
164 
165  while ($last and (my $r = $sth->fetchrow_hashref)) {
166  # merge adjacent assembly segments (these can result from alternating
167  # alignments from clone identity and blastz alignment)
168  if ($last->{'asm_end'} == ($r->{'asm_start'} - 1) and
169  $last->{'cmp_end'} == ($r->{'cmp_start'} - 1)) {
170 
171  $j++;
172 
173  # debug warnings
174  $support->log_verbose('merging - last: '.sprintf($fmt1,
175  map { $last->{$_} } qw(asm_start asm_end cmp_start cmp_end ori)), 1);
176  $support->log_verbose('this: '.sprintf($fmt1, map { $r->{$_} }
177  qw(asm_start asm_end cmp_start cmp_end ori)), 1);
178 
179  # remove last row
180  pop(@rows);
181 
182  # merge segments and add new row
183  $last->{'asm_end'} = $r->{'asm_end'};
184  $last->{'cmp_end'} = $r->{'cmp_end'};
185  push @rows, $last;
186 
187  next;
188  }
189 
190  # bridge small gaps (again, these can result from alternating alignments
191  # from clone identity and blastz alignment). A maximum gap size of 10bp is
192  # allowed
193  my $asm_gap = $r->{'asm_start'} - $last->{'asm_end'} - 1;
194  my $cmp_gap = $r->{'cmp_start'} - $last->{'cmp_end'} - 1;
195 
196  if ($asm_gap == $cmp_gap and $asm_gap <= 10 and $asm_gap > 0) {
197 
198  $k++;
199 
200  # debug warnings
201  $support->log_verbose('bridging - last: '.sprintf($fmt1,
202  map { $last->{$_} } qw(asm_start asm_end cmp_start cmp_end ori)), 1);
203  $support->log_verbose('this: '.sprintf($fmt1, map { $r->{$_} }
204  qw(asm_start asm_end cmp_start cmp_end ori)), 1);
205 
206  # remove last row
207  pop(@rows);
208 
209  # merge segments and add new row
210  $last->{'asm_end'} = $r->{'asm_end'};
211  $last->{'cmp_end'} = $r->{'cmp_end'};
212  push @rows, $last;
213 
214  next;
215  }
216 
217  push @rows, $r;
218  $last = $r;
219  }
220 
221  $support->log("Merged $j mappings.\n", 1);
222  $support->log("Bridged $k gaps.\n", 1);
223 
224  if ((!$support->param('dry_run')) && ($j + $k > 0) ) {
225 
226  # delete all current mappings from the db and insert the corrected ones
227  my $c = $dbh->do(qq(
228  DELETE a
229  FROM assembly a, seq_region sr1, seq_region sr2,
230  coord_system cs1, coord_system cs2
231  WHERE a.asm_seq_region_id = sr1.seq_region_id
232  AND a.cmp_seq_region_id = sr2.seq_region_id
233  AND sr1.coord_system_id = cs1.coord_system_id
234  AND sr2.coord_system_id = cs2.coord_system_id
235  AND cs1.version = '$assembly'
236  AND cs2.version = '$altassembly'
237  AND sr2.name = '$chr'
238  ));
239 
240  $support->log("\nDeleted $c entries from the assembly table.\n");
241 
242  # now insert the fixed entries
243  $sql = qq(INSERT IGNORE INTO assembly VALUES (?, ?, ?, ?, ?, ?, ?));
244  my $sth1 = $dbh->prepare($sql);
245 
246  foreach my $r (@rows) {
247  $sth1->execute(map { $r->{$_} } qw(asm_seq_region_id cmp_seq_region_id asm_start asm_end cmp_start cmp_end ori));
248  }
249 
250  $support->log("Added ".scalar(@rows)." fixed entries to the assembly table.\n");
251 
252  }
253 
254 
255 }
256 
257 $sth->finish;
258 
259 
260 # finish logfile
261 $support->finish_log;
262 
map
public map()
run
public run()