ensembl-hive  2.7.0
load_feature_mappings.pl
Go to the documentation of this file.
1 #!/usr/bin/env perl
2 # See the NOTICE file distributed with this work for additional information
3 # regarding copyright ownership.
4 #
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
8 #
9 # http://www.apache.org/licenses/LICENSE-2.0
10 #
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
16 
17 
18 =head1 NAME
19 
20 load_feature_mappings.pl - load feature (genes and SNPs) location in the old
21 database (database with alternative assembly), mapping from old to new assembly
22 and feature location in the new database (database with reference assembly)
23 to a test database
24 Run script test_assembly_mapping.pl to produce statistics on the quality of the
25 mappings.
26 Sample config file load_feature_mappings.ini.example.
27 
28 =head1 SYNOPSIS
29 
30 load_feature_mappings.pl [arguments]
31 
32 Required arguments:
33 
34  --host=HOST new core db host HOST
35  --port=PORT new core db port PORT
36  --user=USER new core db username USER
37  --pass=PASS new core db passwort PASS
38  --dbname=NAME new core db name NAME
39  --altdbname=NAME old core db name NAME
40  --chromosomes, --chr=LIST 'all' or comma separated list of chromosomes
41  --assembly=NAME new assembly NAME
42  --altassembly=NAME old assembly NAME
43  --features, --ft=LIST features to map:genes,SNPs
44 
45  --vardbname new varation db NAME (if mapping SNPs)
46  --varaltdbname old variation db NAME (if mapping SNPs)
47 
48  if not dry run:
49  --testdbname=NAME test database name NAME (test database will be dropped and recreated)
50 
51 Optional arguments:
52 
53  --conffile=filename read parameters from FILE
54  (default: conf/Conversion.ini)
55 
56  (if different from --host, --port, --user, --pass):
57  --althost=hOST old core db host HOST
58  --altport=PORT old core db port PORT
59  --altuser=USER old core db username USER
60  --altpass=PASS old core db passwort PASS
61 
62  --testhost=HOST test db host HOST
63  --testport=PORT test db port PORT
64  --testuser=USER test db username USER
65  --testpass=PASS test db passwort PASS
66 
67  --varhost=hOST new variation db host HOST
68  --varport=PORT new variation db port PORT
69  --varuser=USER new variation db username USER
70  --varpass=PASS new variation db passwort PASS
71 
72  --varalthost=hOST old variation db host HOST
73  --varaltport=PORT old variation db port PORT
74  --varaltuser=USER old variation db username USER
75  --varaltpass=PASS old variation db passwort PASS
76 
77  --logfile, --log=FILE log to FILE (default: *STDOUT)
78  --logpath=PATH write logfile to PATH (default: .)
79  --logappend, --log_append append to logfile (default: truncate)
80 
81  -v, --verbose=0|1 verbose logging (default: false)
82  -i, --interactive=0|1 run script interactively (default: true)
83  -n, --dry_run, --dry=0|1 dont write results to database
84  -h, --help, -? print help (this message)
85 
86 =head1 DESCRIPTION
87 
88 This script will map genes for a given chromosome list from the old to the new assembly and
89 find gene locations in the new database using stable_ids.
90 It will load the results into the new database.
91 
92 =cut
93 
94 use strict;
95 use warnings;
96 no warnings 'uninitialized';
97 
98 use FindBin qw($Bin);
99 use Getopt::Long;
100 use Pod::Usage;
103 use Bio::EnsEMBL::Variation::DBSQL::DBAdaptor;
104 
105 use vars qw($SERVERROOT);
106 
107 sub map_slice;
109 
110 BEGIN {
111  $SERVERROOT = "$Bin/../";
112 }
113 
114 
115 $| = 1;
116 
117 my $support = new Bio::EnsEMBL::Utils::ConversionSupport($SERVERROOT);
118 
119 # parse options
120 $support->parse_common_options(@_);
121 $support->parse_extra_options(
122  'althost=s',
123  'altport=n',
124  'altuser=s',
125  'altpass=s',
126  'altdbname=s',
127  'assembly=s',
128  'altassembly=s',
129  'testhost=s',
130  'testport=n',
131  'testuser=s',
132  'testpass=s',
133  'testdbname=s',
134  'chromosomes|chr=s@',
135  'features|ft=s@',
136  'varhost=s',
137  'varport=n',
138  'varuser=s',
139  'varpass=s',
140  'vardbname=s',
141  'varalthost=s',
142  'varaltport=n',
143  'varaltuser=s',
144  'varaltpass=s',
145  'varaltdbname=s',
146 );
147 $support->allowed_params(
148  $support->get_common_params,
149  'althost',
150  'altport',
151  'altuser',
152  'altdbname',
153  'assembly',
154  'altassembly',
155  'testhost',
156  'testport',
157  'testuser',
158  'testpass',
159  'testdbname',
160  'chromosomes',
161  'features',
162  'varhost',
163  'varport',
164  'varuser',
165  'varpass',
166  'vardbname',
167  'varalthost',
168  'varaltport',
169  'varaltuser',
170  'varaltpass',
171  'varaltdbname',
172 );
173 
174 if ($support->param('help') or $support->error) {
175  warn $support->error if $support->error;
176  pod2usage(1);
177 }
178 
179 # ask user to confirm parameters to proceed
180 $support->confirm_params;
181 
182 # get log filehandle and print heading and parameters to logfile
183 $support->init_log;
184 
185 $support->check_required_params(
186  'altdbname',
187  'assembly',
188  'altassembly',
189  'chromosomes',
190  'features',
191  'dbname',
192  'host',
193  'user'
194 );
195 
196 if ($support->param('dry_run')) {
197  $support->log_stamped("Dry run. Test database will not be updated. Writing results to logfile instead.\n\n");
198  $support->param('verbose',1);
199 } else {
200  if ( !$support->param('testdbname') ) {
201  $support->log_error("testdbname is required if not a dry run\n",1);
202  }
203 }
204 
205 my %valid_features = ('genes' => 1,
206 'SNPs' => 1 );
207 
208 $support->comma_to_list('features');
209 my @feature_types = $support->param('features');
210 my %feature_types = map { $_ => 1} @feature_types;
211 
212 foreach my $feature_type (@feature_types ) {
213  if (!exists($valid_features{$feature_type}) ) {
214  delete $feature_types{$feature_type};
215  print STDERR "invalid feature type: $feature_type\n";
216  }
217 }
218 
219 if (scalar(keys %feature_types) == 0) {
220  $support->log_error("listed feature(s) not valid: use genes or SNPs only\n",1);
221 }
222 
223 if (exists($feature_types{'SNPs'}) ){
224  if( !$support->param('vardbname') or !$support->param('varaltdbname') ) {
225  $support->log_error("vardbname and varaltdbname are required if mapping SNPs\n",1);
226  }
227 
228 }
229 
230 
231 #####
232 # connect to database and get adaptors
233 #
234 my ($dba, $dbh, $sth);
235 
236 if ( !defined($support->param('pass')) ) {
237  $support->param('pass', '');
238 }
239 
240 
241 # first set connection parameters for alternative db and test db
242 if ( !defined($support->param('althost')) ) { $support->param('althost',$support->param('host')); }
243 if ( !defined($support->param('altport')) ) { $support->param('altport',$support->param('port')); }
244 if ( !defined($support->param('altuser')) ) { $support->param('altuser',$support->param('user')); }
245 if ( !defined($support->param('altpass')) ) { $support->param('altpass',$support->param('pass')); }
246 
247 if ( !defined($support->param('testhost')) ) { $support->param('testhost',$support->param('host')); }
248 if ( !defined($support->param('testport')) ) { $support->param('testport',$support->param('port')); }
249 if ( !defined($support->param('testuser')) ) { $support->param('testuser',$support->param('user')); }
250 if ( !defined($support->param('testpass')) ) { $support->param('testpass',$support->param('pass')); }
251 
252 # reference database
253 $dba->{'ref'} = new Bio::EnsEMBL::DBSQL::DBAdaptor( -host => $support->param('host'),
254  -user => $support->param('user'),
255  -pass => $support->param('pass'),
256  -port => $support->param('port'),
257  -dbname => $support->param('dbname') );
258 
259 $dbh->{'ref'} = $dba->{'ref'}->dbc->db_handle;
260 
261 # database containing the alternative assembly
262 $dba->{'alt'} = new Bio::EnsEMBL::DBSQL::DBAdaptor( -host => $support->param('althost'),
263  -user => $support->param('altuser'),
264  -pass => $support->param('altpass'),
265  -port => $support->param('altport'),
266  -dbname => $support->param('altdbname') );
267 
268 $dbh->{'alt'} = $dba->{'alt'}->dbc->db_handle;
269 
270 #test database
271 my $test_dbname = $support->param('testdbname');
272 $support->param('testdbname','');
273 $dbh->{'test'} = $support->get_dbconnection('test');
274 
275 $support->comma_to_list('chromosomes');
276 my @chrs = $support->param('chromosomes');
277 
278 #get chrmosomes
279 
280 my %db_chrs;
281 
282 my $chr_name;
283 $sth = $dbh->{'ref'}->prepare("select s.name from seq_region s join coord_system c using(coord_system_id) where rank=1 and length(s.name < 3) order by s.name+0");
284 $sth->execute;
285 $sth->bind_columns(\$chr_name);
286 while ($sth->fetch){
287  $db_chrs{$chr_name} = 1;
288 }
289 $sth->finish;
290 
291 if ($chrs[0] eq 'all') {
292  @chrs = sort keys %db_chrs;
293 } else {
294  my %temp_chrs;
295  foreach my $chr (@chrs) {
296  if ( exists($db_chrs{$chr}) ) {
297  $temp_chrs{$chr} = 1;
298  } else {
299  print STDERR "unknown chromosome $chr\n";
300  }
301  }
302  @chrs = sort keys %temp_chrs;
303 }
304 
305 if ( scalar(@chrs) == 0) {
306  $support->log_error("listed chromosome(s) not found in the database\n",1);
307 }
308 
309 
310 if (!$support->param('dry_run')) {
311  #create the test database
312 
313  eval{
314  $dbh->{'test'}->do("drop database if exists $test_dbname");
315  $dbh->{'test'}->do("create database $test_dbname");
316  $dbh->{'test'}->do("use $test_dbname");
317  $dbh->{'test'}->do("CREATE TABLE mapping (
318  feature_id int(10) unsigned NOT NULL AUTO_INCREMENT,
319  feature_type char(30) DEFAULT NULL,
320  stable_id varchar(128) DEFAULT NULL,
321  old_assembly varchar(30) DEFAULT NULL,
322  old_chr char(3) DEFAULT NULL,
323  old_start int(20) DEFAULT NULL,
324  old_end int(20) DEFAULT NULL,
325  old_length int(20) DEFAULT NULL,
326  old_strand tinyint(2) DEFAULT NULL,
327  feature_found_in_new_db tinyint(1) unsigned DEFAULT NULL,
328  new_chr char(3) DEFAULT NULL,
329  new_start int(20) DEFAULT NULL,
330  new_end int(20) DEFAULT NULL,
331  new_length int(20) DEFAULT NULL,
332  new_strand tinyint(2) DEFAULT NULL,
333  new_assembly varchar(30) DEFAULT NULL,
334  mapping_quality tinyint(1) unsigned DEFAULT NULL,
335  mapping_start int(20) DEFAULT NULL,
336  mapping_end int(20) DEFAULT NULL,
337  mapping_length int(20) DEFAULT NULL,
338  mapping_strands varchar(5) DEFAULT NULL,
339  mapping_chrs varchar(10) DEFAULT NULL,
340  PRIMARY KEY (feature_id))");
341 
342  };
343  if ( $@ ) {
344  $support->log_error("errors encountered when creating the test database\n",1);
345  }
346 }
347 
348 
349 if ( exists($feature_types{'SNPs'}) ) {
350  #connect to the variation db
351 
352  if ( !defined($support->param('varhost')) ) { $support->param('varhost',$support->param('host')); }
353  if ( !defined($support->param('varport')) ) { $support->param('varport',$support->param('port')); }
354  if ( !defined($support->param('varuser')) ) { $support->param('varuser',$support->param('user')); }
355  if ( !defined($support->param('varpass')) ) { $support->param('varpass',$support->param('pass')); }
356 
357  if ( !defined($support->param('varalthost')) ) { $support->param('varalthost',$support->param('host')); }
358  if ( !defined($support->param('varaltport')) ) { $support->param('varaltport',$support->param('port')); }
359  if ( !defined($support->param('varaltuser')) ) { $support->param('varaltuser',$support->param('user')); }
360  if ( !defined($support->param('varaltpass')) ) { $support->param('varaltpass',$support->param('pass')); }
361 
362 
363 
364  $dba->{'var'} = new Bio::EnsEMBL::Variation::DBSQL::DBAdaptor(
365  -host => $support->param('varhost'),
366  -port => $support->param('varport'),
367  -user => $support->param('varuser'),
368  -pass => $support->param('varpass'),
369  -dbname => $support->param('vardbname'));
370 
371  $dba->{'varalt'} = new Bio::EnsEMBL::Variation::DBSQL::DBAdaptor(
372  -host => $support->param('varalthost'),
373  -port => $support->param('varaltport'),
374  -user => $support->param('varaltuser'),
375  -pass => $support->param('varaltpass'),
376  -dbname => $support->param('varaltdbname'));
377 
378 }
379 
380 
381 #connect to the test mapping db to store results
382 if ( !$support->param('dry_run') ) {
383  $sth = $dbh->{'test'}->prepare("INSERT INTO mapping(feature_type, stable_id, old_assembly,old_chr,old_start,old_end,old_length,old_strand,feature_found_in_new_db,new_chr,new_start,new_end,new_length,new_strand,new_assembly,mapping_quality,mapping_start,mapping_end,mapping_length,mapping_strands,mapping_chrs ) VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)");
384 
385 }
386 
387 my $new_sa = $dba->{'ref'}->get_adaptor("slice");
388 my $old_sa = $dba->{'alt'}->get_adaptor("slice");
389 
390 if ( exists($feature_types{'genes'}) ) {
391 
392 
393  my $new_ga = $dba->{'ref'}->get_adaptor("gene");
394  my $old_ga = $dba->{'alt'}->get_adaptor("gene");
395 
396  foreach my $chr (@chrs) {
397  my $feature_type = 'gene';
398  get_old_new_feature($old_ga,$new_ga,$feature_type,$chr, undef);
399 
400  }
401 
402 }
403 
404 if ( exists($feature_types{'SNPs'}) ) {
405 
406  my $new_vfa = $dba->{'var'}->get_adaptor('variationfeature');
407  my $new_va = $dba->{'var'}->get_adaptor('variation');
408  my $old_vfa = $dba->{'varalt'}->get_adaptor('variationfeature');
409 
410  foreach my $chr (@chrs) {
411  my $feature_type = 'SNP';
412  get_old_new_feature($old_vfa,$new_vfa,$feature_type,$chr, $new_va);
413  }
414 }
415 
416 if ( !$support->param('dry_run') ){
417  $sth->finish();
418 }
419 
420 # finish logfile
421 $support->finish_log;
422 
423 
425 
426  my $old_adaptor = shift;
427  my $new_adaptor = shift;
428  my $feature_type = shift;
429  my $chr = shift;
430  my $new_va = shift;
431 
432  $support->log_stamped("fetching $feature_type mappings for chromosome $chr\n",1);
433 
434  #get all genes on the chromosome from the old database
435  my $old_slice = $old_sa->fetch_by_region('chromosome', $chr, undef, undef,
436  undef, $support->param('altassembly'));
437 
438  # get a slice on the old assembly from the new database
439  my $newdb_oldasm_chr = $new_sa->fetch_by_region('chromosome', $chr, undef, undef,
440  undef, $support->param('altassembly'));
441 
442  my @old_features = @{$old_adaptor->fetch_all_by_Slice($old_slice)};
443  foreach my $old_feature (@old_features) {
444  my $feature_name;
445  if ($feature_type eq 'gene') {
446  $feature_name = $old_feature->stable_id;
447  } elsif ($feature_type eq 'SNP') {
448  $feature_name = $old_feature->variation_name;
449  }
450  $support->log_verbose("$feature_type $feature_name location in old db: " . $old_feature->start . ":" . $old_feature->end . ":" . $old_feature->strand . " chromosome $chr\n",1);
451 
452 
453  if ($old_feature->length == 0) {
454  next;
455  }
456 
457  my $newdb_old_asm_slice = Bio::EnsEMBL::Slice->new(-coord_system => $newdb_oldasm_chr->coord_system,
458  -start => $old_feature->start,
459  -end => $old_feature->end,
460  -strand => $old_feature->strand,
461  -seq_region_name => $chr,
462  -seq_region_length => $old_feature->length,
463  -adaptor => $new_sa);
464 
465  my $mapping_start = $newdb_oldasm_chr->end;
466  my ($mapping_end, $mapping_length, $mapping_strands, $mapping_chrs, $mapping);
467  ($mapping_start, $mapping_end, $mapping_length, $mapping_strands,$mapping_chrs, $mapping) = map_slice($newdb_old_asm_slice, $mapping_start);
468 
469  #get the feature from the new db
470  my $new_feature;
471  if ($feature_type eq 'gene') {
472  $new_feature = $new_adaptor->fetch_by_stable_id($feature_name);
473  } elsif ($feature_type eq 'SNP') {
474 
475  my $variation = $new_va->fetch_by_name($feature_name);
476  if ($variation) {
477  my @new_features = @{$new_adaptor->fetch_all_by_Variation($variation)};
478  foreach my $vf (@new_features) {
479  if ($vf->allele_string eq $old_feature->allele_string) {
480  $new_feature = $vf;
481  }
482  }
483  }
484  }
485 
486 
487  my $new_feature_found = 0;
488  my ($new_chr,$new_start,$new_end,$new_length,$new_strand);
489  if (defined($new_feature) ) {
490  $new_chr = $new_feature->slice->seq_region_name;
491  $new_start = $new_feature->start;
492  $new_end = $new_feature->end;
493  $new_strand = $new_feature->strand;
494  $new_length = $new_feature->end - $new_feature->start + 1;
495  $new_feature_found = 1;
496  $support->log_verbose("$feature_type $feature_name found in the new db; location:$new_start:$new_end:$new_strand chromosome $new_chr\n",1);
497 
498  } else {
499  $support->log_verbose("$feature_type $feature_name not found in the new db; can't compare $feature_type location to mapped location\n",1);
500  }
501 
502  #store mapping in the db
503  if ( !$support->param('dry_run') ) {
504  $sth->execute($feature_type,$feature_name,$support->param('altassembly'),$chr,$old_feature->start, $old_feature->end, $old_feature->end - $old_feature->start + 1, $old_feature->strand, $new_feature_found, $new_chr, $new_start, $new_end,$new_length, $new_strand, $support->param('assembly'), $mapping, $mapping_start, $mapping_end, $mapping_length, $mapping_strands, $mapping_chrs );
505  }
506 
507  }
508 
509 }
510 
511 
512 sub map_slice {
513 
514  my $newdb_old_asm_slice = shift;
515  my $new_asm_start = shift;
516 
517  $support->log_verbose("projection to new assembly: \n",1);
518  #project to new assembly
519  my @segments = @{ $newdb_old_asm_slice->project('chromosome', $support->param('assembly')) };
520 
521  my $new_asm_end = 0;
522  my %mapping_strands;
523  my %mapping_chrs;
524  my $first_strand;
525  my $first_chr;
526  my $mapping_length = 0;
527 
528  foreach my $segment (@segments) {
529  my $p_slice = $segment->to_Slice;
530  if (! defined($first_strand)) {
531  $first_strand = $p_slice->strand;
532  }
533  if (! defined($first_chr) ){
534  $first_chr = $p_slice->seq_region_name;
535  }
536  if ($p_slice->start < $new_asm_start && $p_slice->strand == $first_strand && $p_slice->seq_region_name eq $first_chr) {
537  $new_asm_start = $p_slice->start;
538  }
539  if ($p_slice->end > $new_asm_end && $p_slice->strand == $first_strand && $p_slice->seq_region_name eq $first_chr ) {
540  $new_asm_end = $p_slice->end;
541  }
542  if ($new_asm_end > 0 && $p_slice->strand == $first_strand && $p_slice->seq_region_name eq $first_chr ) {
543 
544  $mapping_length += $p_slice->length;
545  }
546  $mapping_strands{$p_slice->strand} = 1;
547  $mapping_chrs{$p_slice->seq_region_name} = 1;
548  $support->log_verbose($p_slice->start() . ":" . $p_slice->end() . ":" . $p_slice->strand . " chromosome ".$p_slice->seq_region_name ."\n",1);
549  }
550 
551  my $mapping; #0-no mapping, 1-gaps exist, 2-no gaps
552 
553  if (scalar(@segments) == 0) {
554  $support->log_verbose("no mapping available\n",1);
555  $mapping = 0;
556  undef $new_asm_start;
557  undef $new_asm_end;
558  undef $mapping_length;
559  } else {
560  $support->log_verbose("maping start $new_asm_start mapping end $new_asm_end\n",1);
561 
562  if (scalar(@segments) > 1) {
563  $support->log_verbose("gaps in new assembly sequence mapping\n",1);
564  $mapping = 1;
565  } else {
566  $support->log_verbose("no gaps in new assembly sequence mapping\n",1);
567  $mapping = 2;
568  }
569  }
570 
571  my $mapping_strands;
572  if (scalar(keys %mapping_strands) > 1) {
573  $mapping_strands = 'both';
574  }
575  if (scalar(keys %mapping_strands) == 1) {
576  my @strands = keys %mapping_strands;
577  $mapping_strands = $strands[0];
578  }
579  my $mapping_chrs;
580  if (scalar(keys %mapping_chrs) >= 1) {
581  my @chrs = keys %mapping_chrs;
582  $mapping_chrs = join(',',@chrs);
583  }
584 
585  return($new_asm_start, $new_asm_end, $mapping_length, $mapping_strands,$mapping_chrs, $mapping);
586 }
Bio::EnsEMBL::Utils::ConversionSupport
Definition: ConversionSupport.pm:39
Bio::EnsEMBL::DBSQL::DBAdaptor
Definition: DBAdaptor.pm:40
map
public map()
Bio::EnsEMBL::Slice
Definition: Slice.pm:50
Bio::EnsEMBL::Slice::new
public Bio::EnsEMBL::Slice new()
BEGIN
public BEGIN()
run
public run()
get_old_new_feature
public get_old_new_feature()
map_slice
public map_slice()
Bio::EnsEMBL::Slice::end
public Int end()