ensembl-hive  2.7.0
align_nonident_regions.pl
Go to the documentation of this file.
1 #!/usr/bin/env perl
2 # See the NOTICE file distributed with this work for additional information
3 # regarding copyright ownership.
4 #
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
8 #
9 # http://www.apache.org/licenses/LICENSE-2.0
10 #
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
16 
17 
18 =head1 NAME
19 
20 align_nonident_regions.pl - create whole genome alignment between two closely
21 related assemblies for non-identical regions
22 
23 =head1 SYNOPSIS
24 
25 align_nonident_regions.pl [arguments]
26 
27 Required arguments:
28 
29  --dbname, db_name=NAME database name NAME
30  --host, --dbhost, --db_host=HOST database host HOST
31  --port, --dbport, --db_port=PORT database port PORT
32  --user, --dbuser, --db_user=USER database username USER
33  --pass, --dbpass, --db_pass=PASS database passwort PASS
34  --assembly=ASSEMBLY assembly version ASSEMBLY
35 
36  --altdbname=NAME alternative database NAME
37  --altassembly=ASSEMBLY alternative assembly version ASSEMBLY
38 
39 Optional arguments:
40 
41  --chromosomes, --chr=LIST only process LIST chromosomes
42  --bindir=DIR look for program binaries in DIR
43  --tmpfir=DIR use DIR for temporary files (useful for
44  re-runs after failure)
45 
46  --conffile, --conf=FILE read parameters from FILE
47  (default: conf/Conversion.ini)
48 
49  --logfile, --log=FILE log to FILE (default: *STDOUT)
50  --logpath=PATH write logfile to PATH (default: .)
51  --logappend, --log_append append to logfile (default: truncate)
52 
53  -v, --verbose=0|1 verbose logging (default: false)
54  -i, --interactive=0|1 run script interactively (default: true)
55  -n, --dry_run, --dry=0|1 don't write results to database
56  -h, --help, -? print help (this message)
57 
58 =head1 DESCRIPTION
59 
60 This script is part of a series of scripts to create a mapping between two
61 assemblies. It assembles the chromosome coordinate systems of two different
62 assemblies of a genome by creating a whole genome alignment between the two.
63 
64 The process assumes that the two assemblies are reasonably similar, i.e. there
65 are no major rearrangements or clones moved from one chromosome to another.
66 
67 See "Related files" below for an overview of the whole process.
68 
69 This particular script creates a whole genome alignment between two closely
70 related assemblies for non-identical regions. These regions are identified by
71 another script (align_by_clone_identity.pl) and stored in a temporary database
72 table (tmp_align).
73 
74 Alignments are calculated by this algorithm:
75 
76  1. fetch region from tmp_align
77  2. write soft-masked sequences to temporary files
78  3. align using blastz
79  4. filter best hits (for query sequences, i.e. alternative regions) using
80  axtBest
81  5. parse blastz output to create blocks of exact matches only
82  7. write alignments to assembly table
83 
84 =head1 RELATED FILES
85 
86 The whole process of creating a whole genome alignment between two assemblies
87 is done by a series of scripts. Please see
88 
89  ensembl/misc-scripts/assembly/README
90 
91 for a high-level description of this process, and POD in the individual scripts
92 for the details.
93 
94 
95 =head1 AUTHOR
96 
97 Patrick Meidl <meidl@ebi.ac.uk>, Ensembl core API team
98 
99 =head1 CONTACT
100 
101 Please post comments/questions to the Ensembl development list
102 <http://lists.ensembl.org/mailman/listinfo/dev>
103 
104 =cut
105 
106 use strict;
107 use warnings;
108 no warnings 'uninitialized';
109 
110 use FindBin qw($Bin);
111 use vars qw($SERVERROOT);
112 
113 BEGIN {
114  $SERVERROOT = "$Bin/../../..";
115  unshift(@INC, ".");
116  unshift(@INC, "$SERVERROOT/ensembl/modules");
117  unshift(@INC, "$SERVERROOT/bioperl-live");
118 }
119 
120 use Getopt::Long;
121 use Pod::Usage;
122 use Bio::EnsEMBL::Utils::ConversionSupport;
123 use AssemblyMapper::BlastzAligner;
124 
125 $| = 1;
126 
127 my $support = new Bio::EnsEMBL::Utils::ConversionSupport($SERVERROOT);
128 
129 # parse options
130 $support->parse_common_options(@_);
131 $support->parse_extra_options(
132  'assembly=s',
133  'althost=s',
134  'altport=i',
135  'altuser=s',
136  'altpass=s',
137  'altdbname=s',
138  'altassembly=s',
139  'bindir=s',
140  'tmpdir=s',
141  'chromosomes|chr=s@',
142 );
143 $support->allowed_params(
144  $support->get_common_params,
145  'assembly',
146  'althost',
147  'altport',
148  'altuser',
149  'altpass',
150  'altdbname',
151  'altassembly',
152  'bindir',
153  'tmpdir',
154  'chromosomes',
155 );
156 
157 if ($support->param('help') or $support->error) {
158  warn $support->error if $support->error;
159  pod2usage(1);
160 }
161 
162 # ask user to confirm parameters to proceed
163 $support->confirm_params;
164 
165 # get log filehandle and print heading and parameters to logfile
166 $support->init_log;
167 
168 $support->check_required_params(
169  'assembly',
170  'altdbname',
171  'altassembly'
172 );
173 
174 #####
175 # connect to database and get adaptors
176 #
177 my ($dba, $dbh, $sql, $sth);
178 
179 # first set connection parameters for alternative db if not different from
180 # reference db
181 map { $support->param("alt$_", $support->param($_)) unless ($support->param("alt$_")) } qw(host port user);
182 
183 # reference database
184 my $R_dba = $support->get_database('ensembl');
185 my $R_dbh = $R_dba->dbc->db_handle;
186 my $R_sa = $R_dba->get_SliceAdaptor;
187 
188 # database containing the alternative assembly
189 my $A_dba = $support->get_database('core', 'alt');
190 my $A_sa = $A_dba->get_SliceAdaptor;
191 
192 # create BlastzAligner object
193 my $aligner = AssemblyMapper::BlastzAligner->new(-SUPPORT => $support);
194 
195 # create tmpdir to store input and output
196 $aligner->create_tempdir($support->param('tmpdir'));
197 
198 # loop over non-aligned regions in tmp_align table
199 $support->log_stamped("Looping over non-aligned blocks...\n");
200 
201 $sql = qq(SELECT * FROM tmp_align);
202 if ($support->param('chromosomes')) {
203  my $chr_string = join("', '", $support->param('chromosomes'));
204  $sql .= " WHERE ref_seq_region_name IN ('$chr_string')";
205 }
206 $sth = $R_dbh->prepare($sql);
207 $sth->execute;
208 
209 while (my $row = $sth->fetchrow_hashref) {
210 
211  my $id = $row->{'tmp_align_id'};
212  $aligner->id($id);
213  $aligner->seq_region_name($row->{'ref_seq_region_name'});
214 
215  $support->log_stamped("Block with tmp_align_id = $id\n", 1);
216 
217  my $A_slice = $A_sa->fetch_by_region(
218  'toplevel',
219  $row->{'alt_seq_region_name'},
220  $row->{'alt_start'},
221  $row->{'alt_end'},
222  1,
223  $support->param('altassembly'),
224  );
225 
226  my $R_slice = $R_sa->fetch_by_region(
227  'toplevel',
228  $row->{'ref_seq_region_name'},
229  $row->{'ref_start'},
230  $row->{'ref_end'},
231  1,
232  $support->param('assembly'),
233  );
234 
235  # write sequences to file, and convert sequence files from fasta to nib
236  # format (needed for lavToAxt)
237  my $A_basename = "alt_seq.$id";
238  my $R_basename = "ref_seq.$id";
239 
240  $support->log("Writing sequences to fasta and nib files...\n", 2);
241 
242  $aligner->write_sequence(
243  $A_slice,
244  $support->param('altassembly'),
245  $A_basename
246  );
247 
248  $aligner->write_sequence(
249  $R_slice,
250  $support->param('assembly'),
251  $R_basename
252  );
253 
254  $support->log("Done.\n", 2);
255 
256  # align using blastz
257  $support->log("Running blastz...\n", 2);
258  $aligner->run_blastz($A_basename, $R_basename);
259  $support->log("Done.\n", 2);
260 
261  # convert blastz output from lav to axt format
262  $support->log("Converting blastz output from lav to axt format...\n", 2);
263  $aligner->lav_to_axt;
264  $support->log("Done.\n", 2);
265 
266  # find best alignment with axtBest
267  $support->log("Finding best alignment with axtBest...\n", 2);
268  $aligner->find_best_alignment;
269  $support->log("Done.\n", 2);
270 
271  # parse blastz output, and convert relative alignment coordinates to
272  # chromosomal coords
273  $support->log("Parsing blastz output...\n", 2);
274 
275  $aligner->parse_blastz_output;
276 
277  $aligner->adjust_coords(
278  $row->{'alt_start'},
279  $row->{'alt_end'},
280  { $id => [ $row->{'ref_start'}, $row->{'ref_end'} ] }
281  );
282 
283  $support->log("Done.\n", 2);
284 
285  # cleanup temp files
286  $support->log("Cleaning up temp files...\n", 2);
287  $aligner->cleanup_tmpfiles(
288  "$A_basename.fa",
289  "$A_basename.nib",
290  "$R_basename.fa",
291  "$R_basename.nib",
292  );
293  $support->log("Done.\n", 2);
294 
295  # log alignment stats
296  $aligner->log_block_stats(2);
297 
298  $support->log_stamped("Done with block $id.\n", 1);
299 
300 }
301 
302 $support->log_stamped("Done.\n");
303 
304 # write alignments to assembly table
305 unless ($support->param('dry_run')) {
306  $aligner->write_assembly($R_dba);
307 }
308 
309 # cleanup
310 $support->log_stamped("\nRemoving tmpdir...\n");
311 $aligner->remove_tempdir;
312 $support->log_stamped("Done.\n\n");
313 
314 # overall stats
315 $aligner->log_overall_stats;
316 
317 # remind to drop tmp_align
318 $support->log("\nDon't forget to drop the tmp_align table when all is done!\n\n");
319 
320 # finish logfile
321 $support->finish_log;
322 
323 
run
public run()