2 # See the NOTICE file distributed with this work for additional information
3 # regarding copyright ownership.
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
9 # http://www.apache.org/licenses/LICENSE-2.0
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
20 align_nonident_regions.pl - create whole genome alignment between two closely
21 related assemblies
for non-identical regions
25 align_nonident_regions.pl [arguments]
29 --dbname, db_name=NAME database name NAME
30 --host, --dbhost, --db_host=HOST database host HOST
31 --port, --dbport, --db_port=PORT database port PORT
32 --user, --dbuser, --db_user=USER database username USER
33 --pass, --dbpass, --db_pass=PASS database passwort PASS
34 --assembly=ASSEMBLY assembly version ASSEMBLY
36 --altdbname=NAME alternative database NAME
37 --altassembly=ASSEMBLY alternative assembly version ASSEMBLY
41 --chromosomes, --chr=LIST only process LIST chromosomes
42 --bindir=DIR look
for program binaries in DIR
43 --tmpfir=DIR use DIR
for temporary files (useful
for
44 re-runs after failure)
46 --conffile, --conf=FILE read parameters from FILE
47 (
default: conf/Conversion.ini)
49 --logfile, --log=FILE log to FILE (
default: *STDOUT)
50 --logpath=PATH write logfile to PATH (
default: .)
51 --logappend, --log_append append to logfile (
default: truncate)
53 -v, --verbose=0|1 verbose logging (
default:
false)
54 -i, --interactive=0|1
run script interactively (
default:
true)
55 -n, --dry_run, --dry=0|1 don
't write results to database
56 -h, --help, -? print help (this message)
60 This script is part of a series of scripts to create a mapping between two
61 assemblies. It assembles the chromosome coordinate systems of two different
62 assemblies of a genome by creating a whole genome alignment between the two.
64 The process assumes that the two assemblies are reasonably similar, i.e. there
65 are no major rearrangements or clones moved from one chromosome to another.
67 See "Related files" below for an overview of the whole process.
69 This particular script creates a whole genome alignment between two closely
70 related assemblies for non-identical regions. These regions are identified by
71 another script (align_by_clone_identity.pl) and stored in a temporary database
74 Alignments are calculated by this algorithm:
76 1. fetch region from tmp_align
77 2. write soft-masked sequences to temporary files
79 4. filter best hits (for query sequences, i.e. alternative regions) using
81 5. parse blastz output to create blocks of exact matches only
82 7. write alignments to assembly table
86 The whole process of creating a whole genome alignment between two assemblies
87 is done by a series of scripts. Please see
89 ensembl/misc-scripts/assembly/README
91 for a high-level description of this process, and POD in the individual scripts
97 Patrick Meidl <meidl@ebi.ac.uk>, Ensembl core API team
101 Please post comments/questions to the Ensembl development list
102 <http://lists.ensembl.org/mailman/listinfo/dev>
108 no warnings 'uninitialized
';
110 use FindBin qw($Bin);
111 use vars qw($SERVERROOT);
114 $SERVERROOT = "$Bin/../../..";
116 unshift(@INC, "$SERVERROOT/ensembl/modules");
117 unshift(@INC, "$SERVERROOT/bioperl-live");
122 use Bio::EnsEMBL::Utils::ConversionSupport;
123 use AssemblyMapper::BlastzAligner;
127 my $support = new Bio::EnsEMBL::Utils::ConversionSupport($SERVERROOT);
130 $support->parse_common_options(@_);
131 $support->parse_extra_options(
141 'chromosomes|chr=s@
',
143 $support->allowed_params(
144 $support->get_common_params,
157 if ($support->param('help
') or $support->error) {
158 warn $support->error if $support->error;
162 # ask user to confirm parameters to proceed
163 $support->confirm_params;
165 # get log filehandle and print heading and parameters to logfile
168 $support->check_required_params(
175 # connect to database and get adaptors
177 my ($dba, $dbh, $sql, $sth);
179 # first set connection parameters for alternative db if not different from
181 map { $support->param("alt$_", $support->param($_)) unless ($support->param("alt$_")) } qw(host port user);
184 my $R_dba = $support->get_database('ensembl
');
185 my $R_dbh = $R_dba->dbc->db_handle;
186 my $R_sa = $R_dba->get_SliceAdaptor;
188 # database containing the alternative assembly
189 my $A_dba = $support->get_database('core
', 'alt
');
190 my $A_sa = $A_dba->get_SliceAdaptor;
192 # create BlastzAligner object
193 my $aligner = AssemblyMapper::BlastzAligner->new(-SUPPORT => $support);
195 # create tmpdir to store input and output
196 $aligner->create_tempdir($support->param('tmpdir
'));
198 # loop over non-aligned regions in tmp_align table
199 $support->log_stamped("Looping over non-aligned blocks...\n");
201 $sql = qq(SELECT * FROM tmp_align);
202 if ($support->param('chromosomes
')) {
203 my $chr_string = join("',
'", $support->param('chromosomes
'));
204 $sql .= " WHERE ref_seq_region_name IN ('$chr_string
')";
206 $sth = $R_dbh->prepare($sql);
209 while (my $row = $sth->fetchrow_hashref) {
211 my $id = $row->{'tmp_align_id
'};
213 $aligner->seq_region_name($row->{'ref_seq_region_name
'});
215 $support->log_stamped("Block with tmp_align_id = $id\n", 1);
217 my $A_slice = $A_sa->fetch_by_region(
219 $row->{'alt_seq_region_name
'},
223 $support->param('altassembly
'),
226 my $R_slice = $R_sa->fetch_by_region(
228 $row->{'ref_seq_region_name
'},
232 $support->param('assembly
'),
235 # write sequences to file, and convert sequence files from fasta to nib
236 # format (needed for lavToAxt)
237 my $A_basename = "alt_seq.$id";
238 my $R_basename = "ref_seq.$id";
240 $support->log("Writing sequences to fasta and nib files...\n", 2);
242 $aligner->write_sequence(
244 $support->param('altassembly
'),
248 $aligner->write_sequence(
250 $support->param('assembly
'),
254 $support->log("Done.\n", 2);
257 $support->log("Running blastz...\n", 2);
258 $aligner->run_blastz($A_basename, $R_basename);
259 $support->log("Done.\n", 2);
261 # convert blastz output from lav to axt format
262 $support->log("Converting blastz output from lav to axt format...\n", 2);
263 $aligner->lav_to_axt;
264 $support->log("Done.\n", 2);
266 # find best alignment with axtBest
267 $support->log("Finding best alignment with axtBest...\n", 2);
268 $aligner->find_best_alignment;
269 $support->log("Done.\n", 2);
271 # parse blastz output, and convert relative alignment coordinates to
273 $support->log("Parsing blastz output...\n", 2);
275 $aligner->parse_blastz_output;
277 $aligner->adjust_coords(
280 { $id => [ $row->{'ref_start
'}, $row->{'ref_end
'} ] }
283 $support->log("Done.\n", 2);
286 $support->log("Cleaning up temp files...\n", 2);
287 $aligner->cleanup_tmpfiles(
293 $support->log("Done.\n", 2);
295 # log alignment stats
296 $aligner->log_block_stats(2);
298 $support->log_stamped("Done with block $id.\n", 1);
302 $support->log_stamped("Done.\n");
304 # write alignments to assembly table
305 unless ($support->param('dry_run
')) {
306 $aligner->write_assembly($R_dba);
310 $support->log_stamped("\nRemoving tmpdir...\n");
311 $aligner->remove_tempdir;
312 $support->log_stamped("Done.\n\n");
315 $aligner->log_overall_stats;
317 # remind to drop tmp_align
318 $support->log("\nDon't forget to drop the tmp_align table when all is done!\n\n
");
321 $support->finish_log;