2 # Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
3 # Copyright [2016-2024] EMBL-European Bioinformatics Institute
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
9 # http://www.apache.org/licenses/LICENSE-2.0
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
17 # Don't change the above line.
18 # Change the PATH in the myRun.ksh script if you want to use another perl.
29 --dbname, --db_name=NAME database name NAME
30 --host, --dbhost, --db_host=HOST database host HOST
31 --port, --dbport, --db_port=PORT database port PORT
32 --user, --dbuser, --db_user=USER database username USER
33 --pass, --dbpass, --db_pass=PASS database password PASS
37 --conffile, --conf=FILE read parameters from FILE
38 (default: conf/Conversion.ini)
40 --logfile, --log=FILE log to FILE (default: *STDOUT)
41 --logpath=PATH write logfile to PATH (default: .)
42 --logappend, --log_append append to logfile (default: truncate)
43 --loglevel=LEVEL define log level (default: INFO)
45 -i, --interactive=0|1 run script interactively (default: true)
46 -n, --dry_run, --dry=0|1 don't write results to database
47 -h, --help, -? print help (this message)
55 Patrick Meidl <meidl@ebi.ac.uk>, Ensembl core API team
59 Please post comments/questions to the Ensembl development list
60 <http://lists.ensembl.org/mailman/listinfo/dev>
66 no warnings 'uninitialized
';
69 use Bio::EnsEMBL::Utils::ConfParser;
70 use Bio::EnsEMBL::Utils::Logger;
71 use Bio::EnsEMBL::Utils::ScriptUtils qw(path_append);
72 use Bio::EnsEMBL::IdMapping::Cache;
73 use Bio::EnsEMBL::IdMapping::ExonScoreBuilder;
74 use Bio::EnsEMBL::IdMapping::TranscriptScoreBuilder;
75 use Bio::EnsEMBL::IdMapping::GeneScoreBuilder;
76 use Bio::EnsEMBL::IdMapping::InternalIdMapper;
77 use Bio::EnsEMBL::IdMapping::StableIdMapper;
78 use Bio::EnsEMBL::IdMapping::Archiver;
79 use Bio::EnsEMBL::IdMapping::ResultAnalyser;
81 #use Devel::Size qw(size total_size);
83 #$Data::Dumper::Indent = 1;
85 # parse configuration and commandline arguments
# Builds the ConfParser held in $conf, giving it a server root and a default
# configuration file that are both resolved relative to $Bin.
# NOTE(review): this uses indirect object syntax (new Class(...)), while the
# Cache and StableIdMapper constructors further down use Class->new(...).
# Consider aligning on the arrow form.
86 my $conf = new Bio::EnsEMBL::Utils::ConfParser(
87 -SERVERROOT => "$Bin/../../..",
88 -DEFAULT_CONF => "$Bin/default.conf"
# Script-specific option specifications, each paired with a 0/1 flag.
# The spec strings look like Getopt::Long syntax (=s string, =i integer,
# =f float, trailing @ for repeatable options), and the flag presumably marks
# the option as required (1) or optional (0) -- TODO confirm against ConfParser.
93 'basedir|basedir=s
' => 1,
94 'chromosomes|chr=s@
' => 0,
97 'biotypes_include=s@
' => 0,
98 'biotypes_exclude=s@
' => 0,
99 'min_exon_length|minexonlength=i
' => 0,
100 'exonerate_path|exoneratepath=s
' => 1,
101 'exonerate_threshold|exoneratethreshold=f
' => 0,
102 'exonerate_jobs|exoneratejobs=i
' => 0,
103 'exonerate_bytes_per_job|exoneratebytesperjob=f
' => 0,
104 'exonerate_extra_params|exonerateextraparams=s
' => 0,
105 'plugin_internal_id_mappers_gene=s@
' => 0,
106 'plugin_internal_id_mappers_transcript=s@
' => 0,
107 'plugin_internal_id_mappers_exon=s@
' => 0,
108 'mapping_types=s@
' => 1,
109 'plugin_stable_id_generator=s
' => 0,
110 'upload_events|uploadevents=s
' => 0,
115 # set default logpath
# When no logpath parameter is set, derive one from the basedir parameter by
# appending a log component with path_append and store it back in $conf.
116 unless ($conf->param('logpath
')) {
117 $conf->param('logpath
', path_append($conf->param('basedir
'), 'log
'));
120 # get log filehandle and print heading and parameters to logfile
# Every logger setting except -LOGAUTOBASE is read from $conf; the base name
# for automatically named logs is the fixed id_mapping literal.
# NOTE(review): indirect object syntax (new Class(...)) again; the arrow form
# Class->new(...) is used by other constructors in this file.
121 my $logger = new Bio::EnsEMBL::Utils::Logger(
122 -LOGFILE => $conf->param('logfile
'),
123 -LOGAUTO => $conf->param('logauto
'),
124 -LOGAUTOBASE => 'id_mapping
',
125 -LOGAUTOID => $conf->param('logautoid
'),
126 -LOGPATH => $conf->param('logpath
'),
127 -LOGAPPEND => $conf->param('logappend
'),
128 -LOGLEVEL => $conf->param('loglevel
'),
129 -IS_COMPONENT => $conf->param('is_component
'),
# Start the log, handing it whatever $conf->list_param_values returns
# (presumably a printable listing of the parameters -- confirm).
133 $logger->init_log($conf->list_param_values);
# File-scoped state shared by the subs below: score and mapping results are
# kept in lexicals declared here so that later stages can read what earlier
# stages produced.
141 my $transcript_scores;
144 my $transcript_mappings;
146 my $translation_mappings;
148 # loading cache from file
# $cache is the IdMapping cache object used by log_cache_stats below.
149 my $cache = Bio::EnsEMBL::IdMapping::Cache->new(
156 # get a stable ID mapper
# $stable_id_mapper is used by the subs below for stable ID assignment,
# event generation, table dumps and uploads.
157 my $stable_id_mapper = Bio::EnsEMBL::IdMapping::StableIdMapper->new(
164 # find out which entities we want to map
# Turn the list returned for the mapping_types parameter into a lookup hash
# so later code can test $mapping_types{...} directly.
165 my %mapping_types = ();
166 foreach my $type ($conf->param('mapping_types
')) {
167 $mapping_types{$type} = 1;
171 # run in requested mode
# The mode parameter falls back to normal when unset or false, and the value
# mapping is treated as an alias for normal.
172 my $mode = $conf->param('mode
') || 'normal
';
173 if ( $mode eq 'mapping
' ) { $mode = 'normal
' }
# $run holds the string run_<mode>; presumably the name of the sub that is
# dispatched to -- confirm where $run is used.
174 my $run = "run_$mode";
# Sequence of pipeline stages invoked by the run modes.
# NOTE(review): the calls written as &name; (ampersand, no parentheses) hand
# the caller's current @_ to the callee, as described in perlsub. If the
# callees take no arguments, name() would be the safer form -- confirm before
# changing.
193 # assign stable IDs and make creation and deletion events
196 # generate similarity events
197 &generate_similarity_events;
199 # dump existing stable_id_event table to file
200 &dump_existing_events;
202 # create gene and peptide archive
# The archive step receives the mapping session id from the stable ID mapper.
203 &archive($stable_id_mapper->mapping_session_id);
205 # upload table data files into db
206 &upload_mapping_session_and_events;
210 # final stats and mapping summary
216 # upload table data files into db
217 &upload_mapping_session_and_events;
225 # get new ScoreBuilders for exons, transcripts and genes
226 $esb = Bio::EnsEMBL::IdMapping::ExonScoreBuilder->new(
231 $tsb = Bio::EnsEMBL::IdMapping::TranscriptScoreBuilder->new(
236 $gsb = Bio::EnsEMBL::IdMapping::GeneScoreBuilder->new(
# Scoring is chained: the exon scores are passed into transcript scoring, and
# the transcript scores are passed into gene scoring.
243 $exon_scores = $esb->score_exons;
246 $transcript_scores = $tsb->score_transcripts($exon_scores);
249 $gene_scores = $gsb->score_genes($transcript_scores);
255 # get an internal ID mapper
256 my $internal_id_mapper = Bio::EnsEMBL::IdMapping::InternalIdMapper->new(
# Mapping order matters because each level consumes the previous result:
# gene mappings feed transcript mapping, and transcript mappings feed both
# exon mapping and translation mapping.
263 $gene_mappings = $internal_id_mapper->map_genes($gene_scores,
264 $transcript_scores, $gsb);
268 $mapping_types{'translation
'}) {
269 $transcript_mappings = $internal_id_mapper->map_transcripts(
270 $transcript_scores, $gene_mappings, $tsb);
274 if ($mapping_types{'exon'}) {
275 $exon_mappings = $internal_id_mapper->map_exons($exon_scores,
276 $transcript_mappings, $esb);
280 if ($mapping_types{'translation
'}) {
281 $translation_mappings = $internal_id_mapper->map_translations(
282 $transcript_mappings);
# Assigns stable IDs through $stable_id_mapper for the requested mapping
# types, in the order exon, transcript, translation, gene, then dumps the
# debug mappings and writes the stable_id_events under the name new.
# Takes no arguments in the visible code; it reads the file-scoped mapping
# results and %mapping_types.
287 sub assign_stable_ids {
294 if ($mapping_types{'exon'}) {
295 $stable_id_mapper->map_stable_ids($exon_mappings, 'exon');
300 $stable_id_mapper->map_stable_ids($transcript_mappings, 'transcript');
304 if ($mapping_types{'translation
'}) {
305 $stable_id_mapper->map_stable_ids($translation_mappings, 'translation
');
309 if ($mapping_types{'gene
'}) {
310 $stable_id_mapper->map_stable_ids($gene_mappings, 'gene
');
314 # dump mappings to file for debug purposes
315 $stable_id_mapper->dump_debug_mappings;
317 # write stable_id_events to file
318 $stable_id_mapper->write_stable_id_events('new');
# Generates similarity events through $stable_id_mapper for genes,
# transcripts and translations (depending on %mapping_types) and writes them
# out under the name similarity. Progress is reported via $logger.
323 sub generate_similarity_events {
325 $logger->info("Generating similarity events...\n", 0, 'stamped
');
328 if ($mapping_types{'gene
'}) {
329 $logger->debug("genes\n", 1);
330 $stable_id_mapper->generate_similarity_events($gene_mappings, $gene_scores,
# The filtered transcript scores are declared outside the condition because
# the translation step further down reads them as well.
335 my $filtered_transcript_scores;
336 if ($mapping_types{'transcript'} or $mapping_types{'translation
'}) {
337 $filtered_transcript_scores =
338 $stable_id_mapper->filter_same_gene_transcript_similarities(
343 $logger->debug("transcripts\n", 1);
344 $stable_id_mapper->generate_similarity_events($transcript_mappings,
349 if ($mapping_types{'translation
'}) {
350 $logger->debug("translations\n", 1);
351 $stable_id_mapper->generate_translation_similarity_events(
352 $translation_mappings, $filtered_transcript_scores);
355 # write stable_id_events to file
356 $stable_id_mapper->write_stable_id_events('similarity
');
358 # write_retrofit_stable_id_events?? [todo]
360 $logger->info("Done.\n\n", 0, 'stamped
');
# Dumps the stable_id_event table of the source database to the file
# stable_id_event_existing.txt via $stable_id_mapper and logs the value
# returned by dump_table_to_file as the number of entries written.
# NOTE(review): the meaning of the trailing argument 1 is defined by
# dump_table_to_file and is not visible here -- confirm.
364 sub dump_existing_events {
365 $logger->info("Dumping existing stable_id_events...\n", 0, 'stamped
');
367 my $i = $stable_id_mapper->dump_table_to_file('source
', 'stable_id_event
',
368 'stable_id_event_existing.txt
', 1);
370 $logger->info("Done writing $i entries.\n\n", 0, 'stamped
');
# Archive step: takes the mapping session id as its first argument, creates
# the gene and peptide archive for that session through an Archiver object,
# then dumps the existing gene_archive and peptide_archive tables of the
# source database to files and logs the two returned counts.
375 my $mapping_session_id = shift;
377 $logger->info("Create gene and peptide archive...\n", 0, 'stamped
');
380 my $archiver = Bio::EnsEMBL::IdMapping::Archiver->new(
386 # create gene and peptide archive
387 $archiver->create_archive($mapping_session_id);
389 $logger->info("Done.\n\n", 0, 'stamped
');
391 # dump existing archive tables to file
392 $logger->info("Dumping existing gene and peptide archive...\n", 0, 'stamped
');
394 my $i = $archiver->dump_table_to_file('source
', 'gene_archive
',
395 'gene_archive_existing.txt
', 1);
396 my $j = $archiver->dump_table_to_file('source
', 'peptide_archive
',
397 'peptide_archive_existing.txt
', 1);
399 $logger->info("Done writing $i gene_archive and $j peptide_archive entries.\n\n", 0, 'stamped
');
# Uploads the mapping_session and stable_id_event data files into the target
# database through $stable_id_mapper. The upload only happens when the
# upload_events parameter is true and the dry_run parameter is not set; the
# final "not uploaded" message presumably belongs to the other branch.
403 sub upload_mapping_session_and_events {
404 if ($conf->is_true('upload_events
') and ! $conf->param('dry_run
')) {
406 $logger->info("Uploading mapping_session and stable_id_event tables...\n");
# $i accumulates the value returned for the mapping_session upload.
411 $logger->info("mapping_session...\n", 1);
412 $i += $stable_id_mapper->upload_file_into_table('target
', 'mapping_session
',
413 'mapping_session.txt
');
414 $logger->info("$i\n", 1);
# $j accumulates the values returned for the three stable_id_event files
# (existing, new, similarity), all loaded into the same table.
# NOTE(review): the first of the three calls omits the trailing argument 1
# that the other two pass; its meaning is defined by upload_file_into_table
# -- confirm the difference is intentional.
416 $logger->info("stable_id_event...\n", 1);
417 $j += $stable_id_mapper->upload_file_into_table('target
', 'stable_id_event
',
418 'stable_id_event_existing.txt
');
419 $j += $stable_id_mapper->upload_file_into_table('target
', 'stable_id_event
',
420 'stable_id_event_new.txt
', 1);
421 $j += $stable_id_mapper->upload_file_into_table('target
', 'stable_id_event
',
422 'stable_id_event_similarity.txt
', 1);
423 $logger->info("$j\n", 1);
425 $logger->info("Done.\n\n");
428 $logger->info("Stable ID event and mapping session tables not uploaded.\n\n");
# Uploads one stable ID file per configured mapping type into the target
# database: for each type t, the file t_stable_id.txt is loaded into the table
# t_stable_id and the value returned by upload_file_into_table is logged.
# The final "not uploaded" message presumably belongs to the branch taken
# when uploading is disabled -- the guarding condition is not visible here.
433 sub upload_stable_ids {
436 $logger->info("Uploading stable ID tables...\n");
438 foreach my $t ($conf->param('mapping_types
')) {
439 $logger->info("${t}_stable_id...\n", 1);
440 my $i = $stable_id_mapper->upload_file_into_table('target
',
441 "${t}_stable_id", "${t}_stable_id.txt");
442 $logger->info("$i\n", 1);
445 $logger->info("Done.\n\n");
448 $logger->info("Stable ID tables not uploaded.\n\n");
# Archive upload step: when the upload_archive parameter is true and dry_run
# is not set, loads the existing and new data files for the gene and peptide
# archives into the matching target tables and logs the accumulated value
# returned by upload_file_into_table for each of the two tables. The final
# "not uploaded" message presumably belongs to the other branch.
454 if ($conf->is_true('upload_archive') and ! $conf->param('dry_run
')) {
456 $logger->info("Uploading gene and peptide tables...\n");
458 foreach my $t (qw(gene peptide)) {
459 $logger->info("${t}_archive...\n", 1);
461 $i += $stable_id_mapper->upload_file_into_table('target
', "${t}_archive",
462 "${t}_archive_existing.txt", 1);
463 $i += $stable_id_mapper->upload_file_into_table('target
', "${t}_archive",
464 "${t}_archive_new.txt", 1);
465 $logger->info("$i\n", 1);
468 $logger->info("Done.\n\n");
471 $logger->info("Gene and peptide archive tables not uploaded.\n\n");
# Runs the ResultAnalyser over the gene mappings and the similarity events
# held by $stable_id_mapper, writes the results to file, creates the
# clicklist, and finally creates the mapping summary. Each phase is bracketed
# by stamped log messages.
476 sub analyse_results {
478 $logger->info("Analysing results...\n", 0, 'stamped
');
480 # get a result analyser
481 my $analyser = Bio::EnsEMBL::IdMapping::ResultAnalyser->new(
488 $analyser->analyse($gene_mappings,
489 $stable_id_mapper->get_all_stable_id_events('similarity
'));
491 # write results to file
492 $analyser->write_results_to_file;
495 $analyser->create_clicklist;
497 $logger->info("Done.\n\n", 0, 'stamped
');
# The mapping summary is produced as a separate, separately logged phase.
500 $logger->info("Creating mapping summary...\n", 0, 'stamped
');
501 $analyser->create_mapping_summary;
502 $logger->info("Done.\n", 0, 'stamped
');
507 # test memory consumption of cache after merging. used for debugging.
# Logs a memory report for the cache hash inside $cache: one row per cache
# name and per type beneath it, sorted by ascending size, followed by the
# overhead, data and total figures. The same three figures are then logged
# for exon objects fetched from the target exons_by_id cache.
# NOTE(review): size() and total_size() are not defined in the visible code;
# the use Devel::Size qw(size total_size) line near the top of the file is
# commented out, so it presumably has to be re-enabled before this sub can
# run -- confirm.
509 sub log_cache_stats {
510 $logger->info("\nCache memory usage:\n\n");
515 $keys{'cache
'} = size($cache->{'cache
'});
# Sizes are recorded per name and per type. %keys is indexed by the bare type,
# so a type that occurs under several names keeps only the last size seen,
# while $s still sums every one of them.
517 foreach my $name (keys %{ $cache->{'cache
'} }) {
518 $keys{$name} = size($cache->{'cache
'}->{$name});
519 foreach my $type (keys %{ $cache->{'cache
'}->{$name} }) {
520 $keys{$type} = size($cache->{'cache
'}->{$name}->{$type});
521 $s += size($cache->{'cache
'}->{$name}->{$type});
525 my $ts = total_size($cache->{'cache
'});
# Row layout: label left-aligned in 50 columns, value right-aligned in 12
# columns with no decimals.
527 my $fmt = "%-50s%12.0f\n";
529 foreach my $k (sort { $keys{$a} <=> $keys{$b} } keys %keys) {
530 $logger->info(sprintf($fmt, $k, $keys{$k}), 1);
# The data figure is the total size minus the summed per-type sizes.
532 $logger->info(sprintf($fmt, "total overhead", $s), 1);
533 $logger->info(sprintf($fmt, "data", ($ts-$s)), 1);
534 $logger->info(sprintf($fmt, "total", $ts)."\n", 1);
538 foreach my $eid (keys %{ $cache->get_by_name('exons_by_id
', 'target
') }) {
541 $logger->info("\nData object memory usage:\n\n");
543 my $exon = $cache->get_by_key('exons_by_id
', 'target
', $eid);
544 my $s1 = size($exon);
545 my $ts1 = total_size($exon);
547 $logger->info(sprintf($fmt, "object", $s1), 1);
548 $logger->info(sprintf($fmt, "data", ($ts1-$s1)), 1);
549 $logger->info(sprintf($fmt, "total", $ts1)."\n", 1);
# Unlike the rest of the report, the stable ID goes to standard output rather
# than through $logger.
551 print $exon->stable_id."\n";
552 #warn Data::Dumper::Dumper($exon);