ensembl-hive  2.8.1
fix_cmp_overlaps.pl
Go to the documentation of this file.
1 #!/usr/bin/env perl
2 # See the NOTICE file distributed with this work for additional information
3 # regarding copyright ownership.
4 #
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
8 #
9 # http://www.apache.org/licenses/LICENSE-2.0
10 #
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
16 
17 
18 =head1 NAME
19 
20 fix_cmp_overlaps.pl - remove overlapping mappings between assemblies from the component coordinates
21 
22 =head1 SYNOPSIS
23 
24 fix_overlaps.pl [arguments]
25 
26 Required arguments:
27 
28  --dbname, db_name=NAME database name NAME
29  --host, --dbhost, --db_host=HOST database host HOST
30  --port, --dbport, --db_port=PORT database port PORT
31  --user, --dbuser, --db_user=USER database username USER
32  --pass, --dbpass, --db_pass=PASS database passwort PASS
33  --assembly=ASSEMBLY assembly version ASSEMBLY
34 
35  --altassembly=ASSEMBLY alternative assembly version ASSEMBLY
36 
37  --chromosomes, --chr=LIST only process LIST seq_regions
38 
39 Optional arguments:
40 
41 
42 
43  --conffile, --conf=FILE read parameters from FILE
44  (default: conf/Conversion.ini)
45 
46  --logfile, --log=FILE log to FILE (default: *STDOUT)
47  --logpath=PATH write logfile to PATH (default: .)
48  --logappend, --log_append append to logfile (default: truncate)
49 
50  -v, --verbose=0|1 verbose logging (default: false)
51  -i, --interactive=0|1 run script interactively (default: true)
52  -n, --dry_run, --dry=0|1 don't write results to database
53  -h, --help, -? print help (this message)
54 
55 =head1 DESCRIPTION
56 
57 
58 This script removes overlapping mappings from the component coordinates.
59 Mappings are trimmed so they don't break the AssemblyMapper when
60 projecting between the non-reference to the reference assembly.
61 
62 
63 =cut
64 
65 use strict;
66 use warnings;
67 no warnings 'uninitialized';
68 
69 use FindBin qw($Bin);
70 use Getopt::Long;
71 use Pod::Usage;
73 
74 $| = 1;
75 
76 my $support = new Bio::EnsEMBL::Utils::ConversionSupport("$Bin/../../..");
77 
78 # parse options
79 $support->parse_common_options(@_);
80 $support->parse_extra_options(
81  'assembly=s',
82  'altassembly=s',
83  'chromosomes|chr=s@',
84 );
85 $support->allowed_params(
86  $support->get_common_params,
87  'assembly',
88  'altassembly',
89  'chromosomes',
90 );
91 
92 if ($support->param('help') or $support->error) {
93  warn $support->error if $support->error;
94  pod2usage(1);
95 }
96 
97 $support->comma_to_list('chromosomes');
98 
99 # ask user to confirm parameters to proceed
100 $support->confirm_params;
101 
102 # get log filehandle and print heading and parameters to logfile
103 $support->init_log;
104 
105 $support->check_required_params(
106  'assembly',
107  'altassembly',
108  'chromosomes'
109 );
110 
111 # database connection
112 my $dba = $support->get_database('ensembl');
113 my $dbh = $dba->dbc->db_handle;
114 
115 my $assembly = $support->param('assembly');
116 my $altassembly = $support->param('altassembly');
117 
118 my $sql = qq(
119  SELECT a.*
120  FROM assembly a, seq_region sr1, seq_region sr2,
121  coord_system cs1, coord_system cs2
122  WHERE a.asm_seq_region_id = sr1.seq_region_id
123  AND a.cmp_seq_region_id = sr2.seq_region_id
124  AND sr1.coord_system_id = cs1.coord_system_id
125  AND sr2.coord_system_id = cs2.coord_system_id
126  AND cs1.version = '$assembly'
127  AND cs2.version = '$altassembly'
128  AND sr2.name = ?
129  ORDER BY a.cmp_start
130 );
131 
132 my $sth = $dbh->prepare($sql);
133 
134 my $fmt1 = "%10s %10s %10s %10s %3s\n";
135 
136 
137 foreach my $chr ($support->param('chromosomes')) {
138  $support->log_stamped("\nToplevel seq_region $chr...\n");
139 
140  $sth->execute($chr);
141 
142  my @rows = ();
143 
144  # do an initial fetch
145  my $last = $sth->fetchrow_hashref;
146 
147  # skip seq_regions for which we don't have data
148  unless ($last) {
149  $support->log("No mappings found. Skipping.\n", 1);
150  next;
151  }
152 
153  push @rows, $last;
154 
155  my $i = 0;
156  my $j = 0;
157  my $k = 0;
158 
159  while ($last and (my $r = $sth->fetchrow_hashref)) {
160 
161  # look for overlaps with last segment
162  if ($last->{'cmp_end'} >= $r->{'cmp_start'}) {
163 
164  $i++;
165 
166  # debug warnings
167  $support->log_verbose('last: '.sprintf($fmt1, map { $last->{$_} }
168  qw(asm_start asm_end cmp_start cmp_end ori)), 1);
169  $support->log_verbose('before: '.sprintf($fmt1, map { $r->{$_} }
170  qw(asm_start asm_end cmp_start cmp_end ori)), 1);
171 
172  # skip if this segment ends before the last one
173  if ($r->{'cmp_end'} <= $last->{'cmp_end'}) {
174  $support->log_verbose("skipped\n\n", 1);
175  next;
176  }
177 
178  my $overlap = $last->{'cmp_end'} - $r->{'cmp_start'} + 1;
179 
180  $r->{'cmp_start'} += $overlap;
181 
182  if ($r->{'ori'} == -1) {
183  $r->{'asm_end'} -= $overlap;
184  } else {
185  $r->{'asm_start'} += $overlap;
186  }
187 
188  $support->log_verbose('after: '.sprintf($fmt1, map { $r->{$_} }
189  qw(asm_start asm_end cmp_start cmp_end ori))."\n", 1);
190  }
191 
192  push @rows, $r;
193  $last = $r;
194  }
195 
196  $support->log("Fixed $i mappings.\n", 1);
197 
198 
199  if (!$support->param('dry_run') && $i > 0 ) {
200 
201  # delete all current mappings from the db and insert the corrected ones
202  my $c = $dbh->do(qq(
203  DELETE a
204  FROM assembly a, seq_region sr1, seq_region sr2,
205  coord_system cs1, coord_system cs2
206  WHERE a.asm_seq_region_id = sr1.seq_region_id
207  AND a.cmp_seq_region_id = sr2.seq_region_id
208  AND sr1.coord_system_id = cs1.coord_system_id
209  AND sr2.coord_system_id = cs2.coord_system_id
210  AND cs1.version = '$assembly'
211  AND cs2.version = '$altassembly'
212  AND sr2.name = '$chr'
213  ));
214 
215  $support->log("\nDeleted $c entries from the assembly table.\n");
216 
217  # now insert the fixed entries
218  $sql = qq(INSERT INTO assembly VALUES (?, ?, ?, ?, ?, ?, ?));
219  my $sth1 = $dbh->prepare($sql);
220 
221  foreach my $r (@rows) {
222  $sth1->execute(map { $r->{$_} } qw(asm_seq_region_id cmp_seq_region_id asm_start asm_end cmp_start cmp_end ori));
223  }
224 
225  $support->log("Added ".scalar(@rows)." fixed entries to the assembly table.\n");
226 
227  }
228 
229 
230 }
231 
232 $sth->finish;
233 
234 # finish logfile
235 $support->finish_log;
236 
Bio::EnsEMBL::Utils::ConversionSupport
Definition: ConversionSupport.pm:39
map
public map()
AssemblyMapper
Definition: BlastzAligner.pm:7
run
public run()