ensembl-hive  2.8.1
fix_overlaps.pl
Go to the documentation of this file.
1 #!/usr/bin/env perl
2 # See the NOTICE file distributed with this work for additional information
3 # regarding copyright ownership.
4 #
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
8 #
9 # http://www.apache.org/licenses/LICENSE-2.0
10 #
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
16 
17 
18 =head1 NAME
19 
20 fix_overlaps.pl - remove overlapping mappings in the assembly coordinates (asm_start - asm_end) between two closely related assemblies
21 
22 =head1 SYNOPSIS
23 
24 fix_overlaps.pl [arguments]
25 
26 Required arguments:
27 
28  --dbname, db_name=NAME database name NAME
29  --host, --dbhost, --db_host=HOST database host HOST
30  --port, --dbport, --db_port=PORT database port PORT
31  --user, --dbuser, --db_user=USER database username USER
32  --pass, --dbpass, --db_pass=PASS database passwort PASS
33  --assembly=ASSEMBLY assembly version ASSEMBLY
34 
35  --altassembly=ASSEMBLY alternative assembly version ASSEMBLY
36  --chromosomes, --chr=LIST only process LIST seq_regions
37 
38 Optional arguments:
39 
40  --conffile, --conf=FILE read parameters from FILE
41  (default: conf/Conversion.ini)
42 
43  --logfile, --log=FILE log to FILE (default: *STDOUT)
44  --logpath=PATH write logfile to PATH (default: .)
45  --logappend, --log_append append to logfile (default: truncate)
46 
47  -v, --verbose=0|1 verbose logging (default: false)
48  -i, --interactive=0|1 run script interactively (default: true)
49  -n, --dry_run, --dry=0|1 don't write results to database
50  -h, --help, -? print help (this message)
51 
52 =head1 DESCRIPTION
53 
54 This script removes overlapping mappings from the assembly coordinates.
55 Mappings are trimmed so that such overlaps don't break the AssemblyMapper when
56 projecting between the reference and non-reference assemblies.
57 
58 It also merges adjacent assembly segments which can result from alternating
59 alignments from clone identity and blastz alignment.
60 
61 =head1 RELATED FILES
62 
63 The whole process of creating a whole genome alignment between two assemblies
64 is done by a series of scripts. Please see
65 
66  ensembl/misc-scripts/assembly/README
67 
68 for a high-level description of this process, and POD in the individual scripts
69 for the details.
70 
71 
72 =head1 AUTHOR
73 
74 Patrick Meidl <meidl@ebi.ac.uk>, Ensembl core API team
75 
76 =head1 CONTACT
77 
78 Please post comments/questions to the Ensembl development list
79 <http://lists.ensembl.org/mailman/listinfo/dev>
80 
81 =cut
82 
83 use strict;
84 use warnings;
85 no warnings 'uninitialized';
86 
87 use FindBin qw($Bin);
88 use Getopt::Long;
89 use Pod::Usage;
91 
92 $| = 1;
93 
94 my $support = new Bio::EnsEMBL::Utils::ConversionSupport("$Bin/../../..");
95 
96 # parse options
97 $support->parse_common_options(@_);
98 $support->parse_extra_options(
99  'assembly=s',
100  'altassembly=s',
101  'chromosomes|chr=s@',
102 );
103 $support->allowed_params(
104  $support->get_common_params,
105  'assembly',
106  'altassembly',
107  'chromosomes',
108 );
109 
110 if ($support->param('help') or $support->error) {
111  warn $support->error if $support->error;
112  pod2usage(1);
113 }
114 
115 $support->comma_to_list('chromosomes');
116 
117 # ask user to confirm parameters to proceed
118 $support->confirm_params;
119 
120 # get log filehandle and print heading and parameters to logfile
121 $support->init_log;
122 
123 $support->check_required_params(
124  'assembly',
125  'altassembly',
126  'chromosomes',
127 );
128 
129 # database connection
130 my $dba = $support->get_database('ensembl');
131 my $dbh = $dba->dbc->db_handle;
132 
133 my $assembly = $support->param('assembly');
134 my $altassembly = $support->param('altassembly');
135 
136 my $sql = qq(
137  SELECT a.*
138  FROM assembly a, seq_region sr1, seq_region sr2,
139  coord_system cs1, coord_system cs2
140  WHERE a.asm_seq_region_id = sr1.seq_region_id
141  AND a.cmp_seq_region_id = sr2.seq_region_id
142  AND sr1.coord_system_id = cs1.coord_system_id
143  AND sr2.coord_system_id = cs2.coord_system_id
144  AND cs1.version = '$assembly'
145  AND cs2.version = '$altassembly'
146  AND sr1.name = ?
147  ORDER BY a.asm_start
148 );
149 
150 my $sth = $dbh->prepare($sql);
151 
152 my $fmt1 = "%10s %10s %10s %10s %3s\n";
153 
154 
155 foreach my $chr ($support->param('chromosomes')) {
156  $support->log_stamped("\nToplevel seq_region $chr...\n");
157 
158  $sth->execute($chr);
159 
160  my @rows = ();
161 
162  # do an initial fetch
163  my $last = $sth->fetchrow_hashref;
164 
165  # skip seq_regions for which we don't have data
166  unless ($last) {
167  $support->log("No mappings found. Skipping.\n", 1);
168  next;
169  }
170 
171  push @rows, $last;
172 
173  my $i = 0;
174  my $j = 0;
175  my $k = 0;
176 
177  while ($last and (my $r = $sth->fetchrow_hashref)) {
178  # merge adjacent assembly segments (these can result from alternating
179  # alignments from clone identity and blastz alignment)
180  if ($last->{'asm_end'} == ($r->{'asm_start'} - 1) and
181  $last->{'cmp_end'} == ($r->{'cmp_start'} - 1)) {
182 
183  $j++;
184 
185  # debug warnings
186  $support->log_verbose('merging - last: '.sprintf($fmt1,
187  map { $last->{$_} } qw(asm_start asm_end cmp_start cmp_end ori)), 1);
188  $support->log_verbose('this: '.sprintf($fmt1, map { $r->{$_} }
189  qw(asm_start asm_end cmp_start cmp_end ori)), 1);
190 
191  # remove last row
192  pop(@rows);
193 
194  # merge segments and add new row
195  $last->{'asm_end'} = $r->{'asm_end'};
196  $last->{'cmp_end'} = $r->{'cmp_end'};
197  push @rows, $last;
198 
199  next;
200  }
201 
202  # bridge small gaps (again, these can result from alternating alignments
203  # from clone identity and blastz alignment). A maximum gap size of 10bp is
204  # allowed
205  my $asm_gap = $r->{'asm_start'} - $last->{'asm_end'} - 1;
206  my $cmp_gap = $r->{'cmp_start'} - $last->{'cmp_end'} - 1;
207 
208  if ($asm_gap == $cmp_gap and $asm_gap <= 10 and $asm_gap > 0) {
209 
210  $k++;
211 
212  # debug warnings
213  $support->log_verbose('bridging - last: '.sprintf($fmt1,
214  map { $last->{$_} } qw(asm_start asm_end cmp_start cmp_end ori)), 1);
215  $support->log_verbose('this: '.sprintf($fmt1, map { $r->{$_} }
216  qw(asm_start asm_end cmp_start cmp_end ori)), 1);
217 
218  # remove last row
219  pop(@rows);
220 
221  # merge segments and add new row
222  $last->{'asm_end'} = $r->{'asm_end'};
223  $last->{'cmp_end'} = $r->{'cmp_end'};
224  push @rows, $last;
225 
226  next;
227  }
228 
229  # look for overlaps with last segment
230  if ($last->{'asm_end'} >= $r->{'asm_start'}) {
231 
232  $i++;
233 
234  # debug warnings
235  $support->log_verbose('last: '.sprintf($fmt1, map { $last->{$_} }
236  qw(asm_start asm_end cmp_start cmp_end ori)), 1);
237  $support->log_verbose('before: '.sprintf($fmt1, map { $r->{$_} }
238  qw(asm_start asm_end cmp_start cmp_end ori)), 1);
239 
240  # skip if this segment ends before the last one
241  if ($r->{'asm_end'} <= $last->{'asm_end'}) {
242  $support->log_verbose("skipped\n\n", 1);
243  next;
244  }
245 
246  my $overlap = $last->{'asm_end'} - $r->{'asm_start'} + 1;
247 
248  $r->{'asm_start'} += $overlap;
249 
250  if ($r->{'ori'} == -1) {
251  $r->{'cmp_end'} -= $overlap;
252  } else {
253  $r->{'cmp_start'} += $overlap;
254  }
255 
256  $support->log_verbose('after: '.sprintf($fmt1, map { $r->{$_} }
257  qw(asm_start asm_end cmp_start cmp_end ori))."\n", 1);
258  }
259 
260  push @rows, $r;
261  $last = $r;
262  }
263 
264  $support->log("Fixed $i mappings.\n", 1);
265  $support->log("Merged $j mappings.\n", 1);
266  $support->log("Bridged $k gaps.\n", 1);
267 
268 
269 
270  if (!$support->param('dry_run') && ($i > 0 || $j > 0 || $k > 0 )) {
271 
272  # delete all current mappings from the db and insert the corrected ones
273  my $c = $dbh->do(qq(
274  DELETE a
275  FROM assembly a, seq_region sr1, seq_region sr2,
276  coord_system cs1, coord_system cs2
277  WHERE a.asm_seq_region_id = sr1.seq_region_id
278  AND a.cmp_seq_region_id = sr2.seq_region_id
279  AND sr1.coord_system_id = cs1.coord_system_id
280  AND sr2.coord_system_id = cs2.coord_system_id
281  AND cs1.version = '$assembly'
282  AND cs2.version = '$altassembly'
283  AND sr1.name = '$chr'
284  ));
285 
286  $support->log("\nDeleted $c entries from the assembly table.\n");
287 
288  # now insert the fixed entries
289  $sql = qq(INSERT INTO assembly VALUES (?, ?, ?, ?, ?, ?, ?));
290  my $sth1 = $dbh->prepare($sql);
291 
292  foreach my $r (@rows) {
293  $sth1->execute(map { $r->{$_} } qw(asm_seq_region_id cmp_seq_region_id asm_start asm_end cmp_start cmp_end ori));
294  }
295 
296  $support->log("Added ".scalar(@rows)." fixed entries to the assembly table.\n");
297 
298  }
299 
300 }
301 
302 $sth->finish;
303 
304 # finish logfile
305 $support->finish_log;
306 
Bio::EnsEMBL::Utils::ConversionSupport
Definition: ConversionSupport.pm:39
map
public map()
AssemblyMapper
Definition: BlastzAligner.pm:7
run
public run()