ensembl-hive  2.8.1
fix_asm_overlaps.pl
Go to the documentation of this file.
1 #!/usr/bin/env perl
2 # See the NOTICE file distributed with this work for additional information
3 # regarding copyright ownership.
4 #
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
8 #
9 # http://www.apache.org/licenses/LICENSE-2.0
10 #
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
16 
17 
18 =head1 NAME
19 
20 fix_asm_overlaps.pl - remove overlapping mappings between assemblies from the assembly coordinates (asm_start -asm_end)
21 
22 
23 =head1 SYNOPSIS
24 
25 fix_overlaps.pl [arguments]
26 
27 Required arguments:
28 
29  --dbname, db_name=NAME database name NAME
30  --host, --dbhost, --db_host=HOST database host HOST
31  --port, --dbport, --db_port=PORT database port PORT
32  --user, --dbuser, --db_user=USER database username USER
33  --pass, --dbpass, --db_pass=PASS database passwort PASS
34  --assembly=ASSEMBLY assembly version ASSEMBLY
35 
36  --altassembly=ASSEMBLY alternative assembly version ASSEMBLY
37  --chromosomes, --chr=LIST only process LIST seq_regions
38 
39 Optional arguments:
40 
41  --conffile, --conf=FILE read parameters from FILE
42  (default: conf/Conversion.ini)
43 
44  --logfile, --log=FILE log to FILE (default: *STDOUT)
45  --logpath=PATH write logfile to PATH (default: .)
46  --logappend, --log_append append to logfile (default: truncate)
47 
48  -v, --verbose=0|1 verbose logging (default: false)
49  -i, --interactive=0|1 run script interactively (default: true)
50  -n, --dry_run, --dry=0|1 don't write results to database
51  -h, --help, -? print help (this message)
52 
53 =head1 DESCRIPTION
54 
55 This script removes overlapping mappings from the assembly coordinates.
56 Mappings are trimmed so they don't break the AssemblyMapper when
57 projecting between the reference to the non-reference assembly.
58 
59 
60 =head1 AUTHOR
61 
62 Monika Komorowska <monika@ebi.ac.uk>, Ensembl core API team
63 
64 =head1 CONTACT
65 
66 Please post comments/questions to the Ensembl development list
67 <http://lists.ensembl.org/mailman/listinfo/dev>
68 
69 =cut
70 
71 use strict;
72 use warnings;
73 no warnings 'uninitialized';
74 
75 use FindBin qw($Bin);
76 use Getopt::Long;
77 use Pod::Usage;
79 
80 $| = 1;
81 
82 my $support = new Bio::EnsEMBL::Utils::ConversionSupport("$Bin/../../..");
83 
84 # parse options
85 $support->parse_common_options(@_);
86 $support->parse_extra_options(
87  'assembly=s',
88  'altassembly=s',
89  'chromosomes|chr=s@',
90 );
91 $support->allowed_params(
92  $support->get_common_params,
93  'assembly',
94  'altassembly',
95  'chromosomes',
96 );
97 
98 if ($support->param('help') or $support->error) {
99  warn $support->error if $support->error;
100  pod2usage(1);
101 }
102 
103 $support->comma_to_list('chromosomes');
104 
105 # ask user to confirm parameters to proceed
106 $support->confirm_params;
107 
108 # get log filehandle and print heading and parameters to logfile
109 $support->init_log;
110 
111 $support->check_required_params(
112  'assembly',
113  'altassembly',
114  'chromosomes'
115 );
116 
117 # database connection
118 my $dba = $support->get_database('ensembl');
119 my $dbh = $dba->dbc->db_handle;
120 
121 my $assembly = $support->param('assembly');
122 my $altassembly = $support->param('altassembly');
123 
124 my $sql = qq(
125  SELECT a.*
126  FROM assembly a, seq_region sr1, seq_region sr2,
127  coord_system cs1, coord_system cs2
128  WHERE a.asm_seq_region_id = sr1.seq_region_id
129  AND a.cmp_seq_region_id = sr2.seq_region_id
130  AND sr1.coord_system_id = cs1.coord_system_id
131  AND sr2.coord_system_id = cs2.coord_system_id
132  AND cs1.version = '$assembly'
133  AND cs2.version = '$altassembly'
134  AND sr1.name = ?
135  ORDER BY a.asm_start
136 );
137 
138 my $sth = $dbh->prepare($sql);
139 
140 my $fmt1 = "%10s %10s %10s %10s %3s\n";
141 
142 
143 foreach my $chr ($support->param('chromosomes')) {
144  $support->log_stamped("\nToplevel seq_region $chr...\n");
145 
146  $sth->execute($chr);
147 
148  my @rows = ();
149 
150  # do an initial fetch
151  my $last = $sth->fetchrow_hashref;
152 
153  # skip seq_regions for which we don't have data
154  unless ($last) {
155  $support->log("No mappings found. Skipping.\n", 1);
156  next;
157  }
158 
159  push @rows, $last;
160 
161  my $i = 0;
162  my $j = 0;
163  my $k = 0;
164 
165  while ($last and (my $r = $sth->fetchrow_hashref)) {
166 
167  # look for overlaps with last segment
168  if ($last->{'asm_end'} >= $r->{'asm_start'}) {
169 
170  $i++;
171 
172  # debug warnings
173  $support->log_verbose('last: '.sprintf($fmt1, map { $last->{$_} }
174  qw(asm_start asm_end cmp_start cmp_end ori)), 1);
175  $support->log_verbose('before: '.sprintf($fmt1, map { $r->{$_} }
176  qw(asm_start asm_end cmp_start cmp_end ori)), 1);
177 
178  # skip if this segment ends before the last one
179  if ($r->{'asm_end'} <= $last->{'asm_end'}) {
180  $support->log_verbose("skipped\n\n", 1);
181  next;
182  }
183 
184  my $overlap = $last->{'asm_end'} - $r->{'asm_start'} + 1;
185 
186  $r->{'asm_start'} += $overlap;
187 
188  if ($r->{'ori'} == -1) {
189  $r->{'cmp_end'} -= $overlap;
190  } else {
191  $r->{'cmp_start'} += $overlap;
192  }
193 
194  $support->log_verbose('after: '.sprintf($fmt1, map { $r->{$_} }
195  qw(asm_start asm_end cmp_start cmp_end ori))."\n", 1);
196  }
197 
198  push @rows, $r;
199  $last = $r;
200  }
201 
202  $support->log("Fixed $i mappings.\n", 1);
203 
204  if (!$support->param('dry_run') && ($i > 0)) {
205 
206  # delete all current mappings from the db and insert the corrected ones
207  my $c = $dbh->do(qq(
208  DELETE a
209  FROM assembly a, seq_region sr1, seq_region sr2,
210  coord_system cs1, coord_system cs2
211  WHERE a.asm_seq_region_id = sr1.seq_region_id
212  AND a.cmp_seq_region_id = sr2.seq_region_id
213  AND sr1.coord_system_id = cs1.coord_system_id
214  AND sr2.coord_system_id = cs2.coord_system_id
215  AND cs1.version = '$assembly'
216  AND cs2.version = '$altassembly'
217  AND sr1.name = '$chr'
218  ));
219 
220  $support->log("\nDeleted $c entries from the assembly table.\n");
221 
222  # now insert the fixed entries
223  $sql = qq(INSERT INTO assembly VALUES (?, ?, ?, ?, ?, ?, ?));
224  my $sth1 = $dbh->prepare($sql);
225 
226  foreach my $r (@rows) {
227  $sth1->execute(map { $r->{$_} } qw(asm_seq_region_id cmp_seq_region_id asm_start asm_end cmp_start cmp_end ori));
228  }
229 
230  $support->log("Added ".scalar(@rows)." fixed entries to the assembly table.\n");
231 
232  }
233 
234 }
235 
236 $sth->finish;
237 
238 # finish logfile
239 $support->finish_log;
240 
Bio::EnsEMBL::Utils::ConversionSupport
Definition: ConversionSupport.pm:39
map
public map()
AssemblyMapper
Definition: BlastzAligner.pm:7
run
public run()