ensembl-hive  2.8.1
ensembl_assembly_to_chain.pl
Go to the documentation of this file.
1 #!/usr/bin/env perl
2 # See the NOTICE file distributed with this work for additional information
3 # regarding copyright ownership.
4 #
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
8 #
9 # http://www.apache.org/licenses/LICENSE-2.0
10 #
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
16 
17 =pod
18 
19 =head1 CHAIN FILES
20 
21 This code writes chain files from Ensembl assembly records. Chain files are defined at
22 
23  http://genome.ucsc.edu/goldenPath/help/chain.html
24 
25 They can be used with liftover (http://genome.ucsc.edu/cgi-bin/hgLiftOver) and crossmap (http://crossmap.sourceforge.net). The file format says the following:
26 
27  chain score tName tSize tStrand tStart tEnd qName qSize qStrand qStart qEnd id
28  size dt dq
29  size
30 
31 You can see this in the the example chain files for human mappings http://hgdownload.soe.ucsc.edu/goldenPath/hg19/liftOver/hg19ToHg18.over.chain.gz and http://hgdownload.soe.ucsc.edu/goldenPath/hg18/liftOver/hg18ToHg19.over.chain.gz.
32 
33 dt & dq are meant to be the differences between the end of this current block and the start of the next in the target and query. The final line is just a size to indicate no further offset.
34 
35 tSize and qSize are the total sizes of the target and reference sequences.
36 
37 =cut
38 
39 use strict;
40 use warnings;
41 use Getopt::Long;
44 use Bio::EnsEMBL::Utils::IO qw/work_with_file gz_work_with_file/;
45 use Scalar::Util qw/looks_like_number/;
46 use feature qw/say/;
47 use File::Path qw/mkpath/;
48 use File::Spec;
49 use JSON;
50 
51 my $COMPRESS = 0;
52 my $UCSC = 0;
53 my %ucsc_name_cache;
54 
55 sub get_options {
56  my ($db_name, $db_host, $db_user, $db_pass, $db_port, $help, @species, $group, $release, $dir, $compress, $ucsc);
57  $db_port = 3306;
58  $group = 'core';
59  $compress=0;
60  $ucsc=0;
61  $dir = File::Spec->curdir();
62 
63  GetOptions(
64  "db_name|dbname|database=s" => \$db_name,
65  "db_host|dbhost|host=s" => \$db_host,
66  "db_user|dbuser|user|username=s" => \$db_user,
67  "db_pass|dbpass|pass|password=s" => \$db_pass,
68  "db_port|dbport|port=s" => \$db_port,
69  "species=s@" => \@species,
70  "version|release=i" => \$release,
71  "directory|dir=s" => \$dir,
72  "compress|gzip|gz!" => \$compress,
73  "ucsc!" => \$ucsc,
74  "help|h!" => \$help,
75  );
76 
77  my %args = (
78  -HOST => $db_host, -PORT => $db_port,
79  -USER => $db_user
80  );
81  $args{-PASS} = $db_pass if $db_pass;
82 
83  # must be a DB we need
84  if($db_name) {
85  Bio::EnsEMBL::DBSQL::DBAdaptor->new(
86  %args,
87  -DBNAME => $db_name,
88  -SPECIES => $db_name,
89  -GROUP => 'core'
90  );
91  }
92  else {
93  Bio::EnsEMBL::Registry->load_registry_from_db(
94  %args,
95  -DB_VERSION => $release,
96  );
97  }
98 
99  $COMPRESS = $compress;
100  $UCSC = $ucsc;
101  my @dbas;
102  my %final_dbas;
103 
104  if(@species) {
105  say "Working against a restricted species list";
106  foreach my $s (@species) {
107  my $dba = Bio::EnsEMBL::Registry->get_DBAdaptor($s, $group);
108  die "Cannot find a DBAdaptor for the species ${s}" unless $dba;
109  push(@dbas, $dba);
110  }
111  }
112  else {
113  say "Dumping chain file for all available species";
114  @dbas = @{Bio::EnsEMBL::Registry->get_all_DBAdaptors(-GROUP => 'core')};
115  }
116 
117  foreach my $dba (@dbas) {
118  my $entries = get_liftover_mappings($dba);
119  my $prod_name = $dba->get_MetaContainer()->get_production_name();
120  $prod_name //= $dba->species();
121  $final_dbas{$prod_name} = $dba if @{$entries};
122  $dba->dbc->disconnect_if_idle();
123  }
124 
125  # Return including sorting the species by name as it's just nicer that way
126  return ($dir, map { $final_dbas{$_} } sort keys %final_dbas);
127 }
128 
129 run();
130 
131 sub run {
132  my ($dir, @dbas) = get_options();
133  foreach my $dba (@dbas) {
134  run_on_dba($dir, $dba);
135  $dba->dbc->disconnect_if_idle;
136  }
137  return;
138 }
139 
140 sub run_on_dba {
141  my ($dir, $core_dba) = @_;
142  my $prod_name = $core_dba->get_MetaContainer->get_production_name();
143  $prod_name //= $core_dba->species();
144  say "Processing ${prod_name}";
145  my $liftovers = get_liftover_mappings($core_dba);
146  foreach my $mappings (@{$liftovers}) {
147  my ($asm_cs, $cmp_cs) = @{$mappings};
148  say "\tWorking with $asm_cs to $cmp_cs";
149  say "\t\tFetching mappings";
150  my $asm_to_cmp_mappings = get_assembly_mappings($core_dba, $asm_cs, $cmp_cs);
151  write_mappings($dir, $asm_cs, $cmp_cs, $prod_name, $asm_to_cmp_mappings, $core_dba);
152  say "\t\tFetching reverse mappings";
153  my $cmp_to_asm_mappings = get_reverse_assembly_mappings($core_dba, $asm_cs, $cmp_cs);
154  write_mappings($dir, $cmp_cs, $asm_cs, $prod_name, $cmp_to_asm_mappings, $core_dba);
155  say "\t\tFinished";
156  }
157  return;
158 }
159 
160 #Parse mapping keys like chromosome:GRCh37#chromosome:NCBI36 to ['GRCh37','NCBI36']
162  my ($core_dba) = @_;
163  my $mappings = $core_dba->get_MetaContainer()->list_value_by_key('liftover.mapping');
164  my %unique_mappings;
165  foreach my $mapping (@{$mappings}) {
166  die "Can't parse mapping string for coord system version '${mapping}'!\n"
167  unless $mapping =~ /.+:(.+)#.+:(.+)/;
168  my $version_1 = $1;
169  my $version_2 = $2;
170  $unique_mappings{$version_1. ":". $version_2} = [$version_1, $version_2];
171  }
172  return [values %unique_mappings];
173 }
174 
175 sub write_mappings {
176  my ($dir, $source_cs, $target_cs, $prod_name, $mappings, $core_dba) = @_;
177  my $file = "${source_cs}_to_${target_cs}.chain";
178  $file .= '.gz' if $COMPRESS;
179  my $target_dir = File::Spec->catdir($dir, $prod_name);
180  mkpath($target_dir);
181  my $path = File::Spec->catfile($target_dir, $file);
182  say "\t\tBuilding chain mappings";
183  my $chains = build_chain_mappings($mappings);
184  my $ucsc_chains = [];
185  if($UCSC) {
186  say "\t\tBuilding UCSC mappings";
187  $ucsc_chains = create_ucsc_chains($core_dba, $prod_name, $chains);
188  }
189  say "\t\tWriting mappings to $path";
190 
191  my $writer = sub {
192  my ($fh) = @_;
193  print_chains($fh, $chains);
194  print_chains($fh, $ucsc_chains) if @{$ucsc_chains};
195  };
196 
197  if($COMPRESS) {
198  gz_work_with_file($path, 'w', $writer);
199  }
200  else {
201  work_with_file($path, 'w', $writer);
202  }
203 
204  return;
205 }
206 
208  my ($assembly_mappings) = @_;
209  my @chain_mappings;
210 
211  my ($t_name, $t_size, $t_strand, $t_start, $t_end);
212  my ($q_name, $q_size, $q_strand, $q_start, $q_end);
213  my $chain_id = 1;
214  my @chain_gaps;
215 
216  my $length = scalar(@{$assembly_mappings});
217  for (my $i = 0; $i < $length; $i++) {
218 
219  my $current = $assembly_mappings->[$i];
220  my $next = ($i+1 != $length) ? $assembly_mappings->[$i+1] : undef;
221 
222  my $ori = $current->{ori};
223  my ($asm_diff, $cmp_diff);
224  if($next) {
225  $asm_diff = ($next->{asm_start} - $current->{asm_end})-1;
226  # Rev strands means the next cmp region has a lower start than the
227  # current end (because it's running in reverse). Rember length in 1-based
228  # coords always is (high-low)-1
229  $cmp_diff = ($ori == 1) ? ($next->{cmp_start} - $current->{cmp_end})-1 : ($current->{cmp_start} - $next->{cmp_end})-1;
230  }
231 
232  if(! $t_name) {
233  # Reset variables to current
234  @chain_gaps = ();
235  $chain_id++;
236  ($t_name, $t_size, $t_strand, $t_start) = ($current->{asm_name}, $current->{asm_length}, 1, $current->{asm_start});
237  ($q_name, $q_size, $q_strand) = ($current->{cmp_name}, $current->{cmp_length}, $current->{ori});
238  $q_start = ($ori == 1) ? $current->{cmp_start} : $current->{cmp_end};
239  }
240 
241  # Block that decides we need to start a new chain definition
242  #
243  # Can mean we have run out of mappings, we are into a new chromsome (both source and target)
244  # or strand has swapped.
245  # Final reason is we've had an out-of-order meaning a negative gap was produced
246  # (we're going backwards). In any situation this means the chain is finished
247  if( ! defined $next ||
248  $t_name ne $next->{asm_name} ||
249  $q_name ne $next->{cmp_name} || # we can switch target chromosomes. e.g. cross chromsome mappings in mouse NCBI37->GRCm38
250  $ori != $next->{ori} ||
251  $asm_diff < 0 ||
252  $cmp_diff < 0) {
253  # Add the last gap on which is just the length of this alignment
254  push(@chain_gaps, [$current->{length}]);
255  # Set the ends of the chain since this is the last block
256  $t_end = $current->{asm_end};
257  $q_end = ($ori == 1) ? $current->{cmp_end} : $current->{cmp_start};
258 
259  #If strand was negative we need to represent all data as reverse complemented regions
260  if($q_strand == -1) {
261  # $t_start = ($t_size - $t_start)+1;
262  # $t_end = ($t_size - $t_end)+1;
263  $q_start = ($q_size - $q_start)+1;
264  $q_end = ($q_size - $q_end)+1;
265  }
266  # Convert to UCSC formats (0-based half-open intervals and +/- strands)
267  $t_start--;
268  $q_start--;
269  $t_strand = ($t_strand == 1) ? '+' : '-';
270  $q_strand = ($q_strand == 1) ? '+' : '-';
271 
272  #Store the chain
273  my $chain_score = 1;
274  push(@chain_mappings, {
275  header => ['chain', $chain_score, $t_name, $t_size, $t_strand, $t_start, $t_end, $q_name, $q_size, $q_strand, $q_start, $q_end, $chain_id],
276  gaps => [@chain_gaps]
277  });
278 
279  if(! defined $next) {
280  last;
281  }
282 
283  # Clear variables
284  ($t_name, $t_size, $t_strand, $t_start) = ();
285  ($q_name, $q_size, $q_strand, $q_start) = ();
286  }
287 
288  push(@chain_gaps, [$current->{length}, $asm_diff, $cmp_diff]);
289  }
290 
291  return \@chain_mappings;
292 }
293 
295  my ($dba, $asm_version, $cmp_version) = @_;
296  my $sql = get_sql(
297  ['sr1.name as asm_name',
298  'sr1.length as asm_length',
299  'sr2.name as cmp_name',
300  'sr2.length as cmp_length',
301  'asm.asm_start', 'asm.asm_end', 'asm.cmp_start', 'asm.cmp_end'],
302  'order by cs2.version, sr2.name, asm.cmp_start');
303  return $dba->dbc->sql_helper->execute( -SQL => $sql, -PARAMS => [$asm_version, $cmp_version], -USE_HASHREFS => 1);
304 }
305 
307  my ($dba, $asm_version, $cmp_version) = @_;
308  # Reverse the columns to get the reverse mapping
309  my $sql = get_sql(
310  ['sr1.name as cmp_name',
311  'sr1.length as cmp_length',
312  'sr2.name as asm_name',
313  'sr2.length as asm_length',
314  'asm.cmp_start as asm_start', 'asm.cmp_end as asm_end', 'asm.asm_start as cmp_start', 'asm.asm_end as cmp_end'],
315  'order by cs2.version, sr2.name, asm.cmp_start');
316  return $dba->dbc->sql_helper->execute( -SQL => $sql, -PARAMS => [$asm_version, $cmp_version], -USE_HASHREFS => 1);
317 }
318 
319 sub get_sql {
320  my ($columns, $order_by) = @_;
321  my $select = join(q{, }, @{$columns}, 'asm.ori', '(asm.asm_end - asm.asm_start)+1 as length');
322  return <<SQL;
323 select ${select}
324 from coord_system cs1
325 join seq_region sr1 on (cs1.coord_system_id = sr1.coord_system_id)
326 join assembly asm on (sr1.seq_region_id = asm.asm_seq_region_id)
327 join seq_region sr2 on (sr2.seq_region_id = asm.cmp_seq_region_id)
328 join coord_system cs2 on (sr2.coord_system_id = cs2.coord_system_id)
329 where cs1.version =?
330 and cs2.version =?
331 $order_by
332 SQL
333 }
334 
335 sub print_chains {
336  my ($fh, $chains) = @_;
337  while(my $chain_def = shift @{$chains}) {
338  my $header = $chain_def->{header};
339  my $gaps = $chain_def->{gaps};
340  print $fh join(q{ }, @{$header});
341  print $fh "\n";
342  foreach my $gap (@{$gaps}) {
343  print $fh join(q{ }, @{$gap});
344  print $fh "\n";
345  }
346  print $fh "\n";
347  }
348 }
349 
350 sub create_ucsc_chains {
351  my ($dba, $prod_name, $chains) = @_;
352  my @new_chains;
353  foreach my $chain_def (@{$chains}) {
354  my $ensembl_name = $chain_def->{header}->[2];
355  my $target_name = ensembl_to_ucsc_name($dba, $prod_name, $ensembl_name);
356  next if $ensembl_name eq $target_name;
357  my $new_chain_def = decode_json(encode_json($chain_def)); # quick clone
358  # Substitute tName which in a GRCh37 -> GRCh38 mapping tName is GRCh37's code
359  $new_chain_def->{header}[2] = $target_name;
360  push(@new_chains, $new_chain_def);
361  }
362  return \@new_chains;
363 }
364 
366  my ($dba, $prod_name, $ensembl_name) = @_;
367 
368  return $ucsc_name_cache{$prod_name}{$ensembl_name} if exists $ucsc_name_cache{$prod_name}{$ensembl_name};
369 
370  # Default is Ensembl name
371  my $ucsc_name = $ensembl_name;
372  my $slice = $dba->get_SliceAdaptor()->fetch_by_region(undef, $ensembl_name);
373  my $synonyms = $slice->get_all_synonyms('UCSC');
374  if(@{$synonyms}) {
375  $ucsc_name = $synonyms->[0]->name();
376  }
377  else {
378  if($slice->is_chromosome()) {
379  #MT is a special case; it's chrM
380  if($ensembl_name eq 'MT' ) {
381  $ucsc_name = 'chrM';
382  }
383  # If it was a ref region add chr onto it (only check if we have an adaptor)
384  elsif($slice->is_reference()) {
385  $ucsc_name = 'chr'.$ensembl_name;
386  }
387  }
388  }
389  return $ucsc_name_cache{$prod_name}{$ensembl_name} = $ucsc_name;
390 }
Bio::EnsEMBL::DBSQL::DBAdaptor
Definition: DBAdaptor.pm:40
ensembl_to_ucsc_name
public ensembl_to_ucsc_name()
map
public map()
build_chain_mappings
public build_chain_mappings()
get_assembly_mappings
public get_assembly_mappings()
Bio::EnsEMBL::Registry
Definition: Registry.pm:113
Bio::EnsEMBL::Utils::IO
Definition: IO.pm:80
create_ucsc_chains
public create_ucsc_chains()
get_reverse_assembly_mappings
public get_reverse_assembly_mappings()
write_mappings
public write_mappings()
run
public run()
get_options
public get_options()
run_on_dba
public run_on_dba()
Bio::EnsEMBL::Registry::get_all_DBAdaptors
public List get_all_DBAdaptors()
get_liftover_mappings
public get_liftover_mappings()
print_chains
public print_chains()
get_sql
public get_sql()