2 # See the NOTICE file distributed with this work for additional information
3 # regarding copyright ownership.
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
9 # http://www.apache.org/licenses/LICENSE-2.0
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
20 load_feature_mappings.pl - load feature (genes and SNPs) location in the old
21 database (database with alternative assembly), mapping from old to
new assembly
22 and feature location in the
new database (database with reference assembly)
24 Run script test_assembly_mapping.pl to produce statistics on the quality of the
26 Sample config file load_feature_mappings.ini.example.
30 load_feature_mappings.pl [arguments]
34 --host=HOST
new core db host HOST
35 --port=PORT
new core db port PORT
36 --user=USER
new core db username USER
37 --pass=PASS
new core db passwort PASS
38 --dbname=NAME
new core db name NAME
39 --altdbname=NAME old core db name NAME
40 --chromosomes, --chr=LIST
'all' or comma separated list of chromosomes
41 --assembly=NAME
new assembly NAME
42 --altassembly=NAME old assembly NAME
43 --features, --ft=LIST features to
map:genes,SNPs
45 --vardbname
new varation db NAME (
if mapping SNPs)
46 --varaltdbname old variation db NAME (
if mapping SNPs)
49 --testdbname=NAME test database name NAME (test database will be dropped and recreated)
53 --conffile=filename read parameters from FILE
54 (
default: conf/Conversion.ini)
56 (
if different from --host, --port, --user, --pass):
57 --althost=hOST old core db host HOST
58 --altport=PORT old core db port PORT
59 --altuser=USER old core db username USER
60 --altpass=PASS old core db passwort PASS
62 --testhost=HOST test db host HOST
63 --testport=PORT test db port PORT
64 --testuser=USER test db username USER
65 --testpass=PASS test db passwort PASS
67 --varhost=hOST
new variation db host HOST
68 --varport=PORT
new variation db port PORT
69 --varuser=USER
new variation db username USER
70 --varpass=PASS
new variation db passwort PASS
72 --varalthost=hOST old variation db host HOST
73 --varaltport=PORT old variation db port PORT
74 --varaltuser=USER old variation db username USER
75 --varaltpass=PASS old variation db passwort PASS
77 --logfile, --log=FILE log to FILE (
default: *STDOUT)
78 --logpath=PATH write logfile to PATH (
default: .)
79 --logappend, --log_append append to logfile (
default: truncate)
81 -v, --verbose=0|1 verbose logging (
default:
false)
82 -i, --interactive=0|1
run script interactively (
default:
true)
83 -n, --dry_run, --dry=0|1 dont write results to database
84 -h, --help, -? print help (
this message)
88 This script will
map genes
for a given chromosome list from the old to the
new assembly and
89 find gene locations in the
new database
using stable_ids.
90 It will load the results into the
new database.
96 no warnings
'uninitialized';
103 use Bio::EnsEMBL::Variation::DBSQL::DBAdaptor;
105 use vars qw($SERVERROOT);
111 $SERVERROOT =
"$Bin/../";
120 $support->parse_common_options(@_);
121 $support->parse_extra_options(
134 'chromosomes|chr=s@',
147 $support->allowed_params(
148 $support->get_common_params,
174 if ($support->param(
'help') or $support->error) {
175 warn $support->error
if $support->error;
179 # ask user to confirm parameters to proceed
180 $support->confirm_params;
182 # get log filehandle and print heading and parameters to logfile
185 $support->check_required_params(
196 if ($support->param(
'dry_run')) {
197 $support->log_stamped(
"Dry run. Test database will not be updated. Writing results to logfile instead.\n\n");
198 $support->param(
'verbose',1);
200 if ( !$support->param(
'testdbname') ) {
201 $support->log_error(
"testdbname is required if not a dry run\n",1);
205 my %valid_features = (
'genes' => 1,
208 $support->comma_to_list(
'features');
209 my @feature_types = $support->param(
'features');
210 my %feature_types =
map { $_ => 1} @feature_types;
212 foreach my $feature_type (@feature_types ) {
213 if (!exists($valid_features{$feature_type}) ) {
214 delete $feature_types{$feature_type};
215 print STDERR
"invalid feature type: $feature_type\n";
219 if (scalar(keys %feature_types) == 0) {
220 $support->log_error(
"listed feature(s) not valid: use genes or SNPs only\n",1);
223 if (exists($feature_types{
'SNPs'}) ){
224 if( !$support->param(
'vardbname') or !$support->param(
'varaltdbname') ) {
225 $support->log_error(
"vardbname and varaltdbname are required if mapping SNPs\n",1);
232 # connect to database and get adaptors
234 my ($dba, $dbh, $sth);
236 if ( !defined($support->param(
'pass')) ) {
237 $support->param(
'pass',
'');
241 # first set connection parameters for alternative db and test db
242 if ( !defined($support->param(
'althost')) ) { $support->param(
'althost',$support->param(
'host')); }
243 if ( !defined($support->param(
'altport')) ) { $support->param(
'altport',$support->param(
'port')); }
244 if ( !defined($support->param(
'altuser')) ) { $support->param(
'altuser',$support->param(
'user')); }
245 if ( !defined($support->param(
'altpass')) ) { $support->param(
'altpass',$support->param(
'pass')); }
247 if ( !defined($support->param(
'testhost')) ) { $support->param(
'testhost',$support->param(
'host')); }
248 if ( !defined($support->param(
'testport')) ) { $support->param(
'testport',$support->param(
'port')); }
249 if ( !defined($support->param(
'testuser')) ) { $support->param(
'testuser',$support->param(
'user')); }
250 if ( !defined($support->param(
'testpass')) ) { $support->param(
'testpass',$support->param(
'pass')); }
254 -user => $support->param(
'user'),
255 -pass => $support->param(
'pass'),
256 -port => $support->param(
'port'),
257 -dbname => $support->param(
'dbname') );
259 $dbh->{
'ref'} = $dba->{
'ref'}->dbc->db_handle;
261 # database containing the alternative assembly
263 -user => $support->param(
'altuser'),
264 -pass => $support->param(
'altpass'),
265 -port => $support->param(
'altport'),
266 -dbname => $support->param(
'altdbname') );
268 $dbh->{
'alt'} = $dba->{
'alt'}->dbc->db_handle;
271 my $test_dbname = $support->param(
'testdbname');
272 $support->param(
'testdbname',
'');
273 $dbh->{
'test'} = $support->get_dbconnection(
'test');
275 $support->comma_to_list(
'chromosomes');
276 my @chrs = $support->param(
'chromosomes');
283 $sth = $dbh->{
'ref'}->prepare(
"select s.name from seq_region s join coord_system c using(coord_system_id) where rank=1 and length(s.name < 3) order by s.name+0");
285 $sth->bind_columns(\$chr_name);
287 $db_chrs{$chr_name} = 1;
291 if ($chrs[0] eq
'all') {
292 @chrs = sort keys %db_chrs;
295 foreach my $chr (@chrs) {
296 if ( exists($db_chrs{$chr}) ) {
297 $temp_chrs{$chr} = 1;
299 print STDERR
"unknown chromosome $chr\n";
302 @chrs = sort keys %temp_chrs;
305 if ( scalar(@chrs) == 0) {
306 $support->log_error(
"listed chromosome(s) not found in the database\n",1);
310 if (!$support->param(
'dry_run')) {
311 #create the test database
314 $dbh->{
'test'}->do(
"drop database if exists $test_dbname");
315 $dbh->{
'test'}->do(
"create database $test_dbname");
316 $dbh->{
'test'}->do(
"use $test_dbname");
317 $dbh->{
'test'}->do(
"CREATE TABLE mapping (
318 feature_id int(10) unsigned NOT NULL AUTO_INCREMENT,
319 feature_type char(30) DEFAULT NULL,
320 stable_id varchar(128) DEFAULT NULL,
321 old_assembly varchar(30) DEFAULT NULL,
322 old_chr char(3) DEFAULT NULL,
323 old_start int(20) DEFAULT NULL,
324 old_end int(20) DEFAULT NULL,
325 old_length int(20) DEFAULT NULL,
326 old_strand tinyint(2) DEFAULT NULL,
327 feature_found_in_new_db tinyint(1) unsigned DEFAULT NULL,
328 new_chr char(3) DEFAULT NULL,
329 new_start int(20) DEFAULT NULL,
330 new_end int(20) DEFAULT NULL,
331 new_length int(20) DEFAULT NULL,
332 new_strand tinyint(2) DEFAULT NULL,
333 new_assembly varchar(30) DEFAULT NULL,
334 mapping_quality tinyint(1) unsigned DEFAULT NULL,
335 mapping_start int(20) DEFAULT NULL,
336 mapping_end int(20) DEFAULT NULL,
337 mapping_length int(20) DEFAULT NULL,
338 mapping_strands varchar(5) DEFAULT NULL,
339 mapping_chrs varchar(10) DEFAULT NULL,
340 PRIMARY KEY (feature_id))");
344 $support->log_error(
"errors encountered when creating the test database\n",1);
349 if ( exists($feature_types{
'SNPs'}) ) {
350 #connect to the variation db
352 if ( !defined($support->param(
'varhost')) ) { $support->param(
'varhost',$support->param(
'host')); }
353 if ( !defined($support->param(
'varport')) ) { $support->param(
'varport',$support->param(
'port')); }
354 if ( !defined($support->param(
'varuser')) ) { $support->param(
'varuser',$support->param(
'user')); }
355 if ( !defined($support->param(
'varpass')) ) { $support->param(
'varpass',$support->param(
'pass')); }
357 if ( !defined($support->param(
'varalthost')) ) { $support->param(
'varalthost',$support->param(
'host')); }
358 if ( !defined($support->param(
'varaltport')) ) { $support->param(
'varaltport',$support->param(
'port')); }
359 if ( !defined($support->param(
'varaltuser')) ) { $support->param(
'varaltuser',$support->param(
'user')); }
360 if ( !defined($support->param(
'varaltpass')) ) { $support->param(
'varaltpass',$support->param(
'pass')); }
364 $dba->{
'var'} =
new Bio::EnsEMBL::Variation::DBSQL::DBAdaptor(
365 -host => $support->param(
'varhost'),
366 -port => $support->param(
'varport'),
367 -user => $support->param(
'varuser'),
368 -pass => $support->param(
'varpass'),
369 -dbname => $support->param(
'vardbname'));
371 $dba->{
'varalt'} =
new Bio::EnsEMBL::Variation::DBSQL::DBAdaptor(
372 -host => $support->param(
'varalthost'),
373 -port => $support->param(
'varaltport'),
374 -user => $support->param(
'varaltuser'),
375 -pass => $support->param(
'varaltpass'),
376 -dbname => $support->param(
'varaltdbname'));
381 #connect to the test mapping db to store results
382 if ( !$support->param(
'dry_run') ) {
383 $sth = $dbh->{
'test'}->prepare(
"INSERT INTO mapping(feature_type, stable_id, old_assembly,old_chr,old_start,old_end,old_length,old_strand,feature_found_in_new_db,new_chr,new_start,new_end,new_length,new_strand,new_assembly,mapping_quality,mapping_start,mapping_end,mapping_length,mapping_strands,mapping_chrs ) VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)");
387 my $new_sa = $dba->{
'ref'}->get_adaptor(
"slice");
388 my $old_sa = $dba->{
'alt'}->get_adaptor(
"slice");
390 if ( exists($feature_types{
'genes'}) ) {
393 my $new_ga = $dba->{
'ref'}->get_adaptor(
"gene");
394 my $old_ga = $dba->{
'alt'}->get_adaptor(
"gene");
396 foreach my $chr (@chrs) {
397 my $feature_type =
'gene';
404 if ( exists($feature_types{
'SNPs'}) ) {
406 my $new_vfa = $dba->{
'var'}->get_adaptor(
'variationfeature');
407 my $new_va = $dba->{
'var'}->get_adaptor(
'variation');
408 my $old_vfa = $dba->{
'varalt'}->get_adaptor(
'variationfeature');
410 foreach my $chr (@chrs) {
411 my $feature_type =
'SNP';
416 if ( !$support->param(
'dry_run') ){
421 $support->finish_log;
426 my $old_adaptor = shift;
427 my $new_adaptor = shift;
428 my $feature_type = shift;
432 $support->log_stamped(
"fetching $feature_type mappings for chromosome $chr\n",1);
434 #get all genes on the chromosome from the old database
435 my $old_slice = $old_sa->fetch_by_region(
'chromosome', $chr, undef, undef,
436 undef, $support->param(
'altassembly'));
438 # get a slice on the old assembly from the new database
439 my $newdb_oldasm_chr = $new_sa->fetch_by_region(
'chromosome', $chr, undef, undef,
440 undef, $support->param(
'altassembly'));
442 my @old_features = @{$old_adaptor->fetch_all_by_Slice($old_slice)};
443 foreach my $old_feature (@old_features) {
445 if ($feature_type eq
'gene') {
446 $feature_name = $old_feature->stable_id;
447 } elsif ($feature_type eq
'SNP') {
448 $feature_name = $old_feature->variation_name;
450 $support->log_verbose(
"$feature_type $feature_name location in old db: " . $old_feature->start .
":" . $old_feature->end .
":" . $old_feature->strand .
" chromosome $chr\n",1);
453 if ($old_feature->length == 0) {
458 -start => $old_feature->start,
459 -end => $old_feature->end,
460 -strand => $old_feature->strand,
461 -seq_region_name => $chr,
462 -seq_region_length => $old_feature->length,
463 -adaptor => $new_sa);
465 my $mapping_start = $newdb_oldasm_chr->
end;
466 my ($mapping_end, $mapping_length, $mapping_strands, $mapping_chrs, $mapping);
467 ($mapping_start, $mapping_end, $mapping_length, $mapping_strands,$mapping_chrs, $mapping) =
map_slice($newdb_old_asm_slice, $mapping_start);
469 #get the feature from the new db
471 if ($feature_type eq
'gene') {
472 $new_feature = $new_adaptor->fetch_by_stable_id($feature_name);
473 } elsif ($feature_type eq
'SNP') {
475 my $variation = $new_va->fetch_by_name($feature_name);
477 my @new_features = @{$new_adaptor->fetch_all_by_Variation($variation)};
478 foreach my $vf (@new_features) {
479 if ($vf->allele_string eq $old_feature->allele_string) {
487 my $new_feature_found = 0;
488 my ($new_chr,$new_start,$new_end,$new_length,$new_strand);
489 if (defined($new_feature) ) {
490 $new_chr = $new_feature->slice->seq_region_name;
491 $new_start = $new_feature->start;
492 $new_end = $new_feature->end;
493 $new_strand = $new_feature->strand;
494 $new_length = $new_feature->end - $new_feature->start + 1;
495 $new_feature_found = 1;
496 $support->log_verbose(
"$feature_type $feature_name found in the new db; location:$new_start:$new_end:$new_strand chromosome $new_chr\n",1);
499 $support->log_verbose(
"$feature_type $feature_name not found in the new db; can't compare $feature_type location to mapped location\n",1);
502 #store mapping in the db
503 if ( !$support->param(
'dry_run') ) {
504 $sth->execute($feature_type,$feature_name,$support->param(
'altassembly'),$chr,$old_feature->start, $old_feature->end, $old_feature->end - $old_feature->start + 1, $old_feature->strand, $new_feature_found, $new_chr, $new_start, $new_end,$new_length, $new_strand, $support->param(
'assembly'), $mapping, $mapping_start, $mapping_end, $mapping_length, $mapping_strands, $mapping_chrs );
514 my $newdb_old_asm_slice = shift;
515 my $new_asm_start = shift;
517 $support->log_verbose(
"projection to new assembly: \n",1);
518 #project to new assembly
519 my @segments = @{ $newdb_old_asm_slice->project(
'chromosome', $support->param(
'assembly')) };
526 my $mapping_length = 0;
528 foreach my $segment (@segments) {
529 my $p_slice = $segment->to_Slice;
530 if (! defined($first_strand)) {
531 $first_strand = $p_slice->strand;
533 if (! defined($first_chr) ){
534 $first_chr = $p_slice->seq_region_name;
536 if ($p_slice->start < $new_asm_start && $p_slice->strand == $first_strand && $p_slice->seq_region_name eq $first_chr) {
537 $new_asm_start = $p_slice->start;
539 if ($p_slice->end > $new_asm_end && $p_slice->strand == $first_strand && $p_slice->seq_region_name eq $first_chr ) {
540 $new_asm_end = $p_slice->end;
542 if ($new_asm_end > 0 && $p_slice->strand == $first_strand && $p_slice->seq_region_name eq $first_chr ) {
544 $mapping_length += $p_slice->length;
546 $mapping_strands{$p_slice->strand} = 1;
547 $mapping_chrs{$p_slice->seq_region_name} = 1;
548 $support->log_verbose($p_slice->start() .
":" . $p_slice->end() .
":" . $p_slice->strand .
" chromosome ".$p_slice->seq_region_name .
"\n",1);
551 my $mapping; #0-no mapping, 1-gaps exist, 2-no gaps
553 if (scalar(@segments) == 0) {
554 $support->log_verbose(
"no mapping available\n",1);
556 undef $new_asm_start;
558 undef $mapping_length;
560 $support->log_verbose(
"maping start $new_asm_start mapping end $new_asm_end\n",1);
562 if (scalar(@segments) > 1) {
563 $support->log_verbose(
"gaps in new assembly sequence mapping\n",1);
566 $support->log_verbose(
"no gaps in new assembly sequence mapping\n",1);
572 if (scalar(keys %mapping_strands) > 1) {
573 $mapping_strands =
'both';
575 if (scalar(keys %mapping_strands) == 1) {
576 my @strands = keys %mapping_strands;
577 $mapping_strands = $strands[0];
580 if (scalar(keys %mapping_chrs) >= 1) {
581 my @chrs = keys %mapping_chrs;
582 $mapping_chrs = join(
',',@chrs);
585 return($new_asm_start, $new_asm_end, $mapping_length, $mapping_strands,$mapping_chrs, $mapping);