2 # See the NOTICE file distributed with this work for additional information
3 # regarding copyright ownership.
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
9 # http://www.apache.org/licenses/LICENSE-2.0
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
17 # Don't change the above line.
18 # Change the PATH in the myRun.ksh script if you want to use another perl.
22 fake_stable_id_mapping.pl - fix the stable ID
archive for a database after some
27 fake_stable_id_mapping.pl [options]
31 --conffile, --conf=FILE read parameters from FILE
32 (
default: conf/Conversion.ini)
34 --dbname, db_name=NAME use
new database NAME
35 --host, --dbhost, --db_host=HOST use
new database host HOST
36 --port, --dbport, --db_port=PORT use
new database port PORT
37 --user, --dbuser, --db_user=USER use
new database username USER
38 --pass, --dbpass, --db_pass=PASS use
new database passwort PASS
39 --altdbname=NAME use old database NAME
40 --althost=HOST use old database host HOST
41 --altport=PORT use old database port PORT
42 --altuser=USER use old database username USER
43 --altpass=PASS use old database passwort PASS
45 --gene_stable_id_file|gsi_file|gsi=FILE
46 (deprectated) the path of the file
47 containing a list of gene stable Ids
49 --transcript_stable_id_file|tsi_file|tsi=FILE
50 (deprectated, optional) the path of the
52 stable Ids that were deleted
53 --skip_ncrna|skip_ncRNA|skip_nc=0|1 (optionally) skip ncRNAs
54 --skip_biotypes=LIST (optionally) skip LISTed biotypes
56 --logfile, --log=FILE log to FILE (
default: *STDOUT)
57 --logpath=PATH write logfile to PATH (
default: .)
58 --logappend, --log_append append to logfile (
default: truncate)
59 -v, --verbose verbose logging (
default:
false)
60 -i, --interactive=0|1
run script interactively (
default:
true)
61 -n, --dry_run, --dry=0|1 don
't write results to database
62 -h, --help, -? print help (this message)
66 This script fakes a stable ID archive run for a database where some genes were
69 It assumes that the new database already has its *_stable_id tables populated.
70 A new mapping session is created and all stable IDs other than the deleted ones
71 are mapped to themselves. For the deleted genes, appropriate entries in
72 gene_archive and peptide_archive are created. All this is done to the new
73 database, whereas stable Ids of deleted objects are looked up in the old
74 database. The scripts also increments the stable ID versions of genes where
75 transcripts were deleted (but the gene is still there due to other retained
78 Please note that when using two different databases as input and one is from
79 the last release, you might have to patch it to the current schema.
84 Patrick Meidl <pm2@sanger.ac.uk>
88 Post questions to the EnsEMBL development list http://lists.ensembl.org/mailman/listinfo/dev
94 no warnings 'uninitialized
';
99 use Bio::EnsEMBL::Utils::ConversionSupport;
100 use Digest::MD5 qw(md5_hex);
104 my $support = new Bio::EnsEMBL::Utils::ConversionSupport("$Bin/../../..");
107 $support->parse_common_options(@_);
108 $support->parse_extra_options(
114 'gene_stable_id_file|gsi_file|gsi=s
',
115 'transcript_stable_id_file|tsi_file|tsi=s
',
116 'skip_ncrna|skip_ncRNA|skip_nc=s
',
119 $support->allowed_params(
120 $support->get_common_params,
126 'gene_stable_id_file
',
127 'transcript_stable_id_file
',
132 if ($support->param('help
') or $support->error) {
133 warn $support->error if $support->error;
137 $support->comma_to_list('skip_biotypes
');
139 # ask user to confirm parameters to proceed
140 $support->confirm_params;
142 # get log filehandle and print heading and parameters to logfile
145 $support->check_required_params(
152 # connect to database and get adaptors
153 my $dba_new = $support->get_database('core
');
154 my $dba_old = $support->get_database('ensembl
', 'alt
');
155 my $dbh_new = $dba_new->dbc->db_handle;
157 # define some globally used variables
168 my %skip_biotypes = ();
171 # find out which genes and transcripts were deleted
173 if ($support->param('gene_stable_id_file
') or
174 $support->param('transcript_stable_id_file
')) {
176 # read list of deleted gene and transcript stable IDs from file(s)
177 # this is error-prone and therefore DEPRECATED!
178 &parse_deleted_files;
181 # infer deleted objects from dbs (more robust)
185 # create new mapping session
186 my $mapping_session_id = &create_mapping_session;
188 # create stable_id_event entries for all objects, mapping to themselves
189 &create_stable_id_events;
191 # increment gene version for all genes where transcripts were deleted
192 &increment_gene_versions;
194 # set stable_id_event.new_stable_id to NULL for deleted objects
197 # populate gene_archive and peptide_archive
201 $support->finish_log;
207 =head2 parse_deleted_files
209 Example : &parse_deleted_files;
210 Description : DEPRECATED
211 Read list of deleted gene and transcript stable IDs from file(s).
212 Note that this method of determining which objects were deleted
213 is now DEPRECATED (because it was error-prone when dealing with
214 both whole gene and individual transcript deletions).
216 Exceptions : thrown on missing files
223 sub parse_deleted_files {
225 $support->log_warning("DEPRECATED. Don't use stable ID files (
this is error-prone), rather let the script determine which objects were deleted from the dbs.\n
");
227 my $ta = $dba_old->get_TranscriptAdaptor;
228 my $ga = $dba_old->get_GeneAdaptor;
231 # read list of deleted gene_stable_ids from file
233 $support->log_stamped("Reading list of deleted gene_stable_ids from file, and fetching associated
transcript and translation stable IDs from the db...\n
");
234 my $gfh = $support->filehandle('<', $support->param('gene_stable_id_file'));
236 while (my $g = <$gfh>) {
238 my $gene = $ga->fetch_by_stable_id($g);
243 # fetch associated transcript and translation stable IDs from the old db
244 foreach my $transcript (@{ $gene->get_all_Transcripts }) {
245 $tsi_del{$transcript->stable_id} = 1;
246 my $tl = $transcript->translation;
247 $tlsi_del{$tl->stable_id} = 1 if ($tl);
252 # read list of deleted transcript_stable_ids from file
254 if ($support->param('transcript_stable_id_file')) {
256 $support->log_stamped("Reading list of deleted transcript_stable_ids from file, and fetching associated translation stable IDs from the db...\n
");
258 my $tfh = $support->filehandle('<', $support->param('transcript_stable_id_file'));
260 while (my $t = <$tfh>) {
262 my $transcript = $ta->fetch_by_stable_id($t);
264 # skip non-protein-coding genes
265 unless ($transcript->biotype eq 'protein_coding') {
266 $support->log_warning("Transcript
".$transcript->stable_id." is non-protein_coding, skipping.\n
", 1);
270 $tsi_del{$transcript->stable_id} = 1;
271 my $tl = $transcript->translation;
272 $tlsi_del{$tl->stable_id} = 1 if ($tl);
274 my $gene = $ga->fetch_by_transcript_id($transcript->dbID);
275 $genes{$gene->stable_id} = $gene;
276 $gsi_mod{$gene->stable_id} = 1;
280 $gsi_string = "'".join("',
'", keys(%gsi_del))."'";
281 $gsi_mod_string = "'".join("',
'", keys(%gsi_mod))."'";
282 $tsi_string = "'".join("',
'", keys(%tsi_del))."'";
283 $tlsi_string = "'".join("',
'", keys(%tlsi_del))."'";
285 $support->log_stamped("Done loading
".scalar(keys(%gsi_del))." gene,
".scalar(keys(%tsi_del))." transcript and
".scalar(keys(%tlsi_del))." translation stable IDs.\n\n
");
290 =head2 determine_deleted
292 Example : &determine_deleted;
293 Description : Infer deleted genes/transcripts from dbs by comparing which
294 objects are in the old and new db.
303 sub determine_deleted {
305 $support->log_stamped("Determining list of deleted gene,
transcript and translation stable IDs by comparing dbs...\n
");
307 # optionally skip ncRNAs
309 # this is the complete list of ncRNA biotype; you might need to update it (the
310 # code below will try to help you with this)
311 my @nc_biotypes = qw(
330 if ($support->param('skip_ncrna')) {
332 %skip_biotypes = map { $_ => 1 } @nc_biotypes;
334 # make sure we have a complete list of ncRNA biotypes
335 my $sql = qq(SELECT DISTINCT biotype from gene);
336 my $sth1 = $dbh_new->prepare($sql);
340 while ((my $biotype) = $sth1->fetchrow_array) {
341 push @biotypes_db, $biotype unless ($skip_biotypes{$biotype});
347 print "These are the non-ncRNA biotypes found in the db:\n
";
348 map { print " $_\n
" } @biotypes_db;
349 print "\nPlease check that the list of ncRNA biotypes is still complete, otherwise adapt the script.\n
";
350 exit unless $support->user_proceed("Continue?
");
354 # optionally skip other biotypes
355 if ($support->param('skip_biotypes')) {
356 %skip_biotypes = map { $_ => 1 } $support->param('skip_biotypes');
359 # get old and new genes and transcripts from db
360 my $ga_old = $dba_old->get_GeneAdaptor;
361 my @genes_old = @{ $ga_old->fetch_all };
363 my $ga_new = $dba_new->get_GeneAdaptor;
364 my %genes_new = map { $_->stable_id => $_ } @{ $ga_new->fetch_all };
365 my $ta_new = $dba_new->get_TranscriptAdaptor;
366 my %tsi_new = map { $_ => 1 } @{ $ta_new->list_stable_ids };
368 while (my $g_old = shift(@genes_old)) {
371 next if ($skip_biotypes{$g_old->biotype});
373 my $gsi = $g_old->stable_id;
375 # mark gene as deleted
376 unless ($genes_new{$gsi}) {
378 $genes{$gsi} = $g_old;
382 foreach my $tr (@{ $g_old->get_all_Transcripts }) {
385 next if ($skip_biotypes{$tr->biotype});
387 my $tsi = $tr->stable_id;
389 unless ($tsi_new{$tsi}) {
390 # mark transcript and translation as deleted
392 my $tl = $tr->translation;
393 $tlsi_del{$tl->stable_id} = 1 if ($tl);
395 # mark gene as modified
397 $genes_mod{$gsi} = $g_old;
398 $genes{$gsi} = $g_old;
403 # create stable ID strings for use in mysql IN statements
404 $gsi_string = "'".join("',
'", keys(%gsi_del))."'";
405 $gsi_mod_string = "'".join("',
'", keys(%gsi_mod))."'";
406 $tsi_string = "'".join("',
'", keys(%tsi_del))."'";
407 $tlsi_string = "'".join("',
'", keys(%tlsi_del))."'";
410 my $fmt = "%-15s%6d\n
";
411 $support->log("Deleted objects found:\n
", 1);
412 $support->log(sprintf($fmt, "genes
", scalar(keys(%gsi_del))), 2);
413 $support->log(sprintf($fmt, "transcripts
", scalar(keys(%tsi_del))), 2);
414 $support->log(sprintf($fmt, "translations
", scalar(keys(%tlsi_del))), 2);
416 $support->log("Modified genes found:
".scalar(keys(%gsi_mod))."\n
", 1);
418 $support->log_stamped("Done.\n\n
");
422 =head2 create_mapping_session
424 Example : my $msi = &create_mapping_session;
425 Description : Creates a new mapping_session in the db.
426 Return type : Int - mapping_session_id of the newly created entry
434 sub create_mapping_session {
436 $support->log("Creating
new mapping session...\n
");
438 my $old_db_name = $support->param('altdbname');
439 my $new_db_name = $support->param('dbname');
441 my $old_mca = $dba_old->get_MetaContainer;
442 my ($old_release) = @{ $old_mca->list_value_by_key('schema_version') };
443 my ($old_assembly) = @{ $old_mca->list_value_by_key('assembly.default') };
445 my $new_mca = $dba_new->get_MetaContainer;
446 my ($new_release) = @{ $new_mca->list_value_by_key('schema_version') };
447 my ($new_assembly) = @{ $new_mca->list_value_by_key('assembly.default') };
450 INSERT INTO mapping_session
451 VALUES (NULL, '$old_db_name', '$new_db_name', '$old_release',
452 '$new_release','$old_assembly', '$new_assembly', NOW())
454 my $c = $dbh_new->do($sql) unless ($support->param('dry_run'));
455 my $mapping_session_id = $dbh_new->{'mysql_insertid'};
457 my $fmt = "%-23s%-40s\n
";
458 $support->log(sprintf($fmt, 'mapping_session_id', $mapping_session_id), 1);
459 $support->log(sprintf($fmt, 'old_db_name', $old_db_name), 1);
460 $support->log(sprintf($fmt, 'new_db_name', $new_db_name), 1);
461 $support->log(sprintf($fmt, 'old_release', $old_release), 1);
462 $support->log(sprintf($fmt, 'new_release', $new_release), 1);
463 $support->log(sprintf($fmt, 'old_assembly', $old_assembly), 1);
464 $support->log(sprintf($fmt, 'new_assembly', $new_assembly), 1);
466 $support->log("Done.\n\n
");
468 return $mapping_session_id;
472 =head2 create_stable_id_events
474 Example : &create_stable_id_events
475 Description : Creates stable_id_event entries for all objects found in the old
476 db, mapping them to themselves. Optionally, some biotypes will
477 be skipped (this is useful if a separate script is run to deal
487 sub create_stable_id_events {
489 $support->log_stamped("Creating stable_id_event entries
for all objects, mapping to themselves...\n
");
491 my $ga = $dba_old->get_GeneAdaptor;
492 my @genes = @{ $ga->fetch_all };
495 INSERT INTO stable_id_event
496 VALUES (?, ?, ?, ?, ?, ?, ?)
498 my $sth = $dbh_new->prepare($sql);
500 my %stats = map { $_ => 0 } qw(g tr tl g_tot tr_tot);
501 my $num_genes = scalar(@genes);
504 while (my $gene = shift(@genes)) {
508 next if ($skip_biotypes{$gene->biotype});
510 unless ($support->param('dry_run')) {
525 my @transcripts = @{ $gene->get_all_Transcripts };
526 while (my $tr = shift(@transcripts)) {
530 next if ($skip_biotypes{$tr->biotype});
532 unless ($support->param('dry_run')) {
547 if (my $tl = $tr->translation) {
549 unless ($support->param('dry_run')) {
570 $support->log_stamped("Done inserting entries
for $stats{g} (of $stats{g_tot}) genes, $stats{tr} (of $stats{tr_tot}) transcripts, $stats{tl} translations.\n\n
");
575 =head2 increment_gene_versions
577 Example : &increment_gene_versions;
578 Description : Increment version of all genes where transcripts were deleted.
579 Also checks that gene_stable_id.stable_id is correct (and adjusts
589 sub increment_gene_versions {
591 $support->log_stamped("Incrementing gene versions
for genes where transcripts were deleted...\n
");
593 # update stable_id_event
595 UPDATE stable_id_event
596 SET new_version = new_version + 1
597 WHERE new_stable_id IN ($gsi_mod_string)
598 AND mapping_session_id = $mapping_session_id
601 $c = $dbh_new->do($sql) unless ($support->param('dry_run'));
602 $support->log("stable_id_event [$c]\n
", 1);
604 # update gene_stable_id
606 UPDATE gene_stable_id
611 my $sth = $dbh_new->prepare($sql);
615 foreach my $g (values(%genes_mod)) {
616 my $version = $g->version + 1;
617 $c += $sth->execute($version, $g->stable_id, $version)
618 unless ($support->param('dry_run'));
621 $support->log("gene_stable_id [$c]\n
", 1);
623 $support->log_stamped("Done.\n\n
");
629 Example : &mark_deleted;
630 Description : Sets stable_id_event.new_stable_id to NULL for deleted objects.
641 $support->log_stamped("Setting new_stable_id to NULL
for deleted objects...\n
");
643 $support->log("Genes...
", 1);
645 UPDATE stable_id_event
646 SET new_stable_id = NULL, new_version = 0
647 WHERE new_stable_id IN ($gsi_string)
648 AND mapping_session_id = $mapping_session_id
650 my $c = $dbh_new->do($sql) unless ($support->param('dry_run'));
651 $support->log("[$c]\n
");
653 $support->log("Transcripts...
", 1);
655 UPDATE stable_id_event
656 SET new_stable_id = NULL, new_version = 0
657 WHERE new_stable_id IN ($tsi_string)
658 AND mapping_session_id = $mapping_session_id
660 $c = $dbh_new->do($sql) unless ($support->param('dry_run'));
661 $support->log("[$c]\n
");
663 $support->log("Translations...
", 1);
665 UPDATE stable_id_event
666 SET new_stable_id = NULL, new_version = 0
667 WHERE new_stable_id IN ($tlsi_string)
668 AND mapping_session_id = $mapping_session_id
670 $c = $dbh_new->do($sql) unless ($support->param('dry_run'));
671 $support->log("[$c]\n
");
673 $support->log_stamped("Done.\n\n
");
678 =head2 populate_archive
680 Example : &populate_archive;
681 Description : Populates gene_archive and peptide_archive for all deleted
691 sub populated_archive {
693 $support->log_stamped("Populating gene_archive and peptide_archive...\n
");
695 my $sth_gene = $dbh_new->prepare(qq(
696 INSERT INTO gene_archive
697 VALUES (?, ?, ?, ?, ?, ?, ?, ?)
699 my $sth_pep = $dbh_new->prepare(qq(
700 INSERT INTO peptide_archive (md5_checksum, peptide_seq)
706 foreach my $gsi (keys(%genes)) {
707 my $gene = $genes{$gsi};
709 foreach my $trans (@{ $gene->get_all_Transcripts }) {
711 # skip transcripts that were not deleted (since %genes may contain genes
712 # where only some but not all transcripts were deleted)
713 next unless ($tsi_del{$trans->stable_id});
715 my $tl = $trans->translation;
717 # add peptide_archive entry
718 unless ($support->param('dry_run')) {
720 my $tl_stable_id = "";
725 $tl_stable_id = $tl->stable_id;
726 $tl_version = $tl->version;
727 my $pep_seq = $trans->translate->seq;
728 $sth_pep->execute(md5_hex($pep_seq), $pep_seq);
729 $pid = $dbh_new->{'mysql_insertid'};
732 # add gene_archive entry
749 $support->log_stamped("Done adding $c entries.\n\n
");