2 # See the NOTICE file distributed with this work for additional information
3 # regarding copyright ownership.
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
9 # http://www.apache.org/licenses/LICENSE-2.0
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
21 This code writes chain files from Ensembl assembly records. Chain files are defined at
25 They can be used with liftover (http:
27 chain score tName tSize tStrand tStart tEnd qName qSize qStrand qStart qEnd
id
31 You can see
this in the the example chain files
for human mappings http:
33 dt & dq are meant to be the differences between the end of
this current block and the start of the next in the target and query. The
final line is just a size to indicate no further offset.
35 tSize and qSize are the total sizes of the target and reference sequences.
45 use Scalar::Util qw/looks_like_number/;
47 use File::Path qw/mkpath/;
56 my ($db_name, $db_host, $db_user, $db_pass, $db_port, $help, @species, $group, $release, $dir, $compress, $ucsc);
61 $dir = File::Spec->curdir();
64 "db_name|dbname|database=s" => \$db_name,
65 "db_host|dbhost|host=s" => \$db_host,
66 "db_user|dbuser|user|username=s" => \$db_user,
67 "db_pass|dbpass|pass|password=s" => \$db_pass,
68 "db_port|dbport|port=s" => \$db_port,
69 "species=s@" => \@species,
70 "version|release=i" => \$release,
71 "directory|dir=s" => \$dir,
72 "compress|gzip|gz!" => \$compress,
78 -HOST => $db_host, -PORT => $db_port,
81 $args{-PASS} = $db_pass
if $db_pass;
83 # must be a DB we need
85 Bio::EnsEMBL::DBSQL::DBAdaptor->new(
93 Bio::EnsEMBL::Registry->load_registry_from_db(
95 -DB_VERSION => $release,
99 $COMPRESS = $compress;
105 say
"Working against a restricted species list";
106 foreach my $s (@species) {
107 my $dba = Bio::EnsEMBL::Registry->get_DBAdaptor($s, $group);
108 die
"Cannot find a DBAdaptor for the species ${s}" unless $dba;
113 say
"Dumping chain file for all available species";
117 foreach my $dba (@dbas) {
119 my $prod_name = $dba->get_MetaContainer()->get_production_name();
121 $final_dbas{$prod_name} = $dba
if @{$entries};
122 $dba->dbc->disconnect_if_idle();
125 # Return including sorting the species by name as it's just nicer that way
126 return ($dir,
map { $final_dbas{$_} } sort keys %final_dbas);
133 foreach my $dba (@dbas) {
135 $dba->dbc->disconnect_if_idle;
141 my ($dir, $core_dba) = @_;
142 my $prod_name = $core_dba->get_MetaContainer->get_production_name();
144 say
"Processing ${prod_name}";
146 foreach my $mappings (@{$liftovers}) {
147 my ($asm_cs, $cmp_cs) = @{$mappings};
148 say
"\tWorking with $asm_cs to $cmp_cs";
149 say
"\t\tFetching mappings";
151 write_mappings($dir, $asm_cs, $cmp_cs, $prod_name, $asm_to_cmp_mappings, $core_dba);
152 say
"\t\tFetching reverse mappings";
154 write_mappings($dir, $cmp_cs, $asm_cs, $prod_name, $cmp_to_asm_mappings, $core_dba);
160 #Parse mapping keys like chromosome:GRCh37#chromosome:NCBI36 to ['GRCh37','NCBI36']
163 my $mappings = $core_dba->get_MetaContainer()->list_value_by_key(
'liftover.mapping');
165 foreach my $mapping (@{$mappings}) {
166 die
"Can't parse mapping string for coord system version '${mapping}'!\n"
167 unless $mapping =~ /.+:(.+)#.+:(.+)/;
170 $unique_mappings{$version_1.
":". $version_2} = [$version_1, $version_2];
172 return [values %unique_mappings];
176 my ($dir, $source_cs, $target_cs, $prod_name, $mappings, $core_dba) = @_;
177 my $file =
"${source_cs}_to_${target_cs}.chain";
178 $file .=
'.gz' if $COMPRESS;
179 my $target_dir = File::Spec->catdir($dir, $prod_name);
181 my $path = File::Spec->catfile($target_dir, $file);
182 say
"\t\tBuilding chain mappings";
184 my $ucsc_chains = [];
186 say
"\t\tBuilding UCSC mappings";
189 say
"\t\tWriting mappings to $path";
198 gz_work_with_file($path,
'w', $writer);
201 work_with_file($path,
'w', $writer);
208 my ($assembly_mappings) = @_;
211 my ($t_name, $t_size, $t_strand, $t_start, $t_end);
212 my ($q_name, $q_size, $q_strand, $q_start, $q_end);
216 my $length = scalar(@{$assembly_mappings});
217 for (my $i = 0; $i < $length; $i++) {
219 my $current = $assembly_mappings->[$i];
220 my $next = ($i+1 != $length) ? $assembly_mappings->[$i+1] : undef;
222 my $ori = $current->{ori};
223 my ($asm_diff, $cmp_diff);
225 $asm_diff = ($next->{asm_start} - $current->{asm_end})-1;
226 # Rev strands means the next cmp region has a lower start than the
227 # current end (because it's running in reverse). Rember length in 1-based
228 # coords always is (high-low)-1
229 $cmp_diff = ($ori == 1) ? ($next->{cmp_start} - $current->{cmp_end})-1 : ($current->{cmp_start} - $next->{cmp_end})-1;
233 # Reset variables to current
236 ($t_name, $t_size, $t_strand, $t_start) = ($current->{asm_name}, $current->{asm_length}, 1, $current->{asm_start});
237 ($q_name, $q_size, $q_strand) = ($current->{cmp_name}, $current->{cmp_length}, $current->{ori});
238 $q_start = ($ori == 1) ? $current->{cmp_start} : $current->{cmp_end};
241 # Block that decides we need to start a new chain definition
243 # Can mean we have run out of mappings, we are into a new chromsome (both source and target)
244 # or strand has swapped.
245 # Final reason is we've had an out-of-order meaning a negative gap was produced
246 # (we're going backwards). In any situation this means the chain is finished
247 if( ! defined $next ||
248 $t_name ne $next->{asm_name} ||
249 $q_name ne $next->{cmp_name} || # we can
switch target chromosomes. e.g. cross chromsome mappings in mouse NCBI37->GRCm38
250 $ori != $next->{ori} ||
253 # Add the last gap on which is just the length of this alignment
254 push(@chain_gaps, [$current->{length}]);
255 # Set the ends of the chain since this is the last block
256 $t_end = $current->{asm_end};
257 $q_end = ($ori == 1) ? $current->{cmp_end} : $current->{cmp_start};
259 #If strand was negative we need to represent all data as reverse complemented regions
260 if($q_strand == -1) {
261 # $t_start = ($t_size - $t_start)+1;
262 # $t_end = ($t_size - $t_end)+1;
263 $q_start = ($q_size - $q_start)+1;
264 $q_end = ($q_size - $q_end)+1;
266 # Convert to UCSC formats (0-based half-open intervals and +/- strands)
269 $t_strand = ($t_strand == 1) ?
'+' :
'-';
270 $q_strand = ($q_strand == 1) ?
'+' :
'-';
274 push(@chain_mappings, {
275 header => [
'chain', $chain_score, $t_name, $t_size, $t_strand, $t_start, $t_end, $q_name, $q_size, $q_strand, $q_start, $q_end, $chain_id],
276 gaps => [@chain_gaps]
279 if(! defined $next) {
284 ($t_name, $t_size, $t_strand, $t_start) = ();
285 ($q_name, $q_size, $q_strand, $q_start) = ();
288 push(@chain_gaps, [$current->{length}, $asm_diff, $cmp_diff]);
291 return \@chain_mappings;
295 my ($dba, $asm_version, $cmp_version) = @_;
297 [
'sr1.name as asm_name',
298 'sr1.length as asm_length',
299 'sr2.name as cmp_name',
300 'sr2.length as cmp_length',
301 'asm.asm_start',
'asm.asm_end',
'asm.cmp_start',
'asm.cmp_end'],
302 'order by cs2.version, sr2.name, asm.cmp_start');
303 return $dba->dbc->sql_helper->execute( -SQL => $sql, -PARAMS => [$asm_version, $cmp_version], -USE_HASHREFS => 1);
307 my ($dba, $asm_version, $cmp_version) = @_;
308 # Reverse the columns to get the reverse mapping
310 [
'sr1.name as cmp_name',
311 'sr1.length as cmp_length',
312 'sr2.name as asm_name',
313 'sr2.length as asm_length',
314 'asm.cmp_start as asm_start',
'asm.cmp_end as asm_end',
'asm.asm_start as cmp_start',
'asm.asm_end as cmp_end'],
315 'order by cs2.version, sr2.name, asm.cmp_start');
316 return $dba->dbc->sql_helper->execute( -SQL => $sql, -PARAMS => [$asm_version, $cmp_version], -USE_HASHREFS => 1);
320 my ($columns, $order_by) = @_;
321 my $select = join(q{, }, @{$columns},
'asm.ori',
'(asm.asm_end - asm.asm_start)+1 as length');
324 from coord_system cs1
325 join seq_region sr1 on (cs1.coord_system_id = sr1.coord_system_id)
326 join assembly asm on (sr1.seq_region_id = asm.asm_seq_region_id)
327 join seq_region sr2 on (sr2.seq_region_id = asm.cmp_seq_region_id)
328 join coord_system cs2 on (sr2.coord_system_id = cs2.coord_system_id)
336 my ($fh, $chains) = @_;
337 while(my $chain_def = shift @{$chains}) {
338 my $header = $chain_def->{header};
339 my $gaps = $chain_def->{gaps};
340 print $fh join(q{ }, @{$header});
342 foreach my $gap (@{$gaps}) {
343 print $fh join(q{ }, @{$gap});
351 my ($dba, $prod_name, $chains) = @_;
353 foreach my $chain_def (@{$chains}) {
354 my $ensembl_name = $chain_def->{header}->[2];
356 next
if $ensembl_name eq $target_name;
357 my $new_chain_def = decode_json(encode_json($chain_def)); # quick clone
358 # Substitute tName which in a GRCh37 -> GRCh38 mapping tName is GRCh37's code
359 $new_chain_def->{header}[2] = $target_name;
360 push(@new_chains, $new_chain_def);
366 my ($dba, $prod_name, $ensembl_name) = @_;
368 return $ucsc_name_cache{$prod_name}{$ensembl_name}
if exists $ucsc_name_cache{$prod_name}{$ensembl_name};
370 # Default is Ensembl name
371 my $ucsc_name = $ensembl_name;
372 my $slice = $dba->get_SliceAdaptor()->fetch_by_region(undef, $ensembl_name);
373 my $synonyms = $slice->get_all_synonyms(
'UCSC');
375 $ucsc_name = $synonyms->[0]->name();
378 if($slice->is_chromosome()) {
379 #MT is a special case; it's chrM
380 if($ensembl_name eq
'MT' ) {
383 # If it was a ref region add chr onto it (only check if we have an adaptor)
384 elsif($slice->is_reference()) {
385 $ucsc_name =
'chr'.$ensembl_name;
389 return $ucsc_name_cache{$prod_name}{$ensembl_name} = $ucsc_name;