3 See the NOTICE file distributed with
this work
for additional information
4 regarding copyright ownership.
6 Licensed under the Apache License, Version 2.0 (the
"License");
7 you may not use
this file except in compliance with the License.
8 You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an
"AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License
for the specific language governing permissions and
16 limitations under the License.
23 Please email comments or questions to the
public Ensembl
24 developers list at <http://lists.ensembl.org/mailman/listinfo/dev>.
26 Questions may also be sent to the Ensembl help desk at
<http://www.ensembl.org/Help/Contact>.
42 package Bio::EnsEMBL::IdMapping::InternalIdMapper;
46 no warnings
'uninitialized';
58 # scores are considered the same if (2.0 * (s1-s2))/(s1 + s2) < this
59 use constant SIMILAR_SCORE_RATIO => 0.01;
64 my $gene_scores = shift;
65 my $transcript_scores = shift;
69 unless ($gene_scores and
70 $gene_scores->isa(
'Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
71 throw(
'Need a gene Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
74 unless ($transcript_scores and
75 $transcript_scores->isa(
'Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
76 throw(
'Need a transcript Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
80 $gsb->isa(
'Bio::EnsEMBL::IdMapping::GeneScoreBuilder')) {
81 throw(
'Need a Bio::EnsEMBL::IdMapping::GeneScoreBuilder.');
84 $self->
logger->
info(
"== Internal ID mapping for genes...\n\n", 0,
'stamped');
86 my $dump_path = path_append($self->conf->param(
'basedir'),
'mapping');
89 -DUMP_PATH => $dump_path,
90 -CACHE_FILE =>
'gene_mappings.ser',
95 if (-s $mapping_cache) {
98 $self->logger->info(
"Reading gene mappings from file...\n", 0,
'stamped');
99 $self->logger->debug(
"Cache file $mapping_cache.\n", 1);
100 $mappings->read_from_file;
101 $self->logger->info(
"Done.\n\n", 0,
'stamped');
105 # create gene mappings
106 $self->logger->info(
"No gene mappings found. Will calculate them now.\n");
108 # determine which plugin methods to run
109 my @default_plugins = (qw(
118 my @plugins = $self->conf->param(
'plugin_internal_id_mappers_gene');
119 @plugins = @default_plugins unless (defined($plugins[0]));
122 -DUMP_PATH => $dump_path,
123 -CACHE_FILE =>
'gene_mappings0.ser',
129 # run the scoring chain
131 foreach my $plugin (@plugins) {
132 ($gene_scores, $new_mappings) = $self->delegate_to_plugin($plugin, $i++,
133 $gsb, $new_mappings, $gene_scores, $transcript_scores);
135 push(@mappings, $new_mappings);
138 # report remaining ambiguities
139 $self->logger->info($gene_scores->get_source_count.
140 " source genes are ambiguous with ".
141 $gene_scores->get_target_count.
" target genes.\n\n");
143 $self->log_ambiguous($gene_scores,
'gene');
145 # merge mappings and write to file
146 $mappings->add_all(@mappings);
147 $mappings->write_to_file;
149 if ($self->logger->loglevel eq
'debug') {
150 $mappings->log(
'gene', $self->conf->param(
'basedir'));
153 $self->logger->info(
"Done.\n\n", 0,
'stamped');
161 sub map_transcripts {
163 my $transcript_scores = shift;
164 my $gene_mappings = shift;
168 unless ($transcript_scores and
169 $transcript_scores->isa(
'Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
170 throw(
'Need a transcript Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
173 unless ($gene_mappings and
174 $gene_mappings->isa(
'Bio::EnsEMBL::IdMapping::MappingList')) {
175 throw(
'Need a gene Bio::EnsEMBL::IdMapping::MappingList.');
179 $tsb->isa(
'Bio::EnsEMBL::IdMapping::TranscriptScoreBuilder')) {
180 throw(
'Need a Bio::EnsEMBL::IdMapping::TranscriptScoreBuilder.');
183 $self->logger->info(
"== Internal ID mapping for transcripts...\n\n", 0,
'stamped');
185 my $dump_path = path_append($self->conf->param(
'basedir'),
'mapping');
188 -DUMP_PATH => $dump_path,
189 -CACHE_FILE =>
'transcript_mappings.ser',
194 if (-s $mapping_cache) {
197 $self->logger->info(
"Reading transcript mappings from file...\n", 0,
199 $self->logger->debug(
"Cache file $mapping_cache.\n", 1);
200 $mappings->read_from_file;
201 $self->logger->info(
"Done.\n\n", 0,
'stamped');
205 # create transcript mappings
206 $self->logger->info(
"No transcript mappings found. Will calculate them now.\n");
208 # determine which plugin methods to run
209 my @default_plugins = (qw(
218 my @plugins = $self->conf->param(
'plugin_internal_id_mappers_transcript');
219 @plugins = @default_plugins unless (defined($plugins[0]));
222 -DUMP_PATH => $dump_path,
223 -CACHE_FILE =>
'transcript_mappings0.ser',
229 # run the scoring chain
231 foreach my $plugin (@plugins) {
232 ($transcript_scores, $new_mappings) = $self->delegate_to_plugin($plugin,
233 $i++, $tsb, $new_mappings, $transcript_scores, $gene_mappings);
235 push(@mappings, $new_mappings);
238 # report remaining ambiguities
239 $self->logger->info($transcript_scores->get_source_count.
240 " source transcripts are ambiguous with ".
241 $transcript_scores->get_target_count.
" target transcripts.\n\n");
243 $self->log_ambiguous($transcript_scores,
'transcript');
245 # merge mappings and write to file
246 $mappings->add_all(@mappings);
247 $mappings->write_to_file;
249 if ($self->logger->loglevel eq
'debug') {
250 $mappings->log(
'transcript', $self->conf->param(
'basedir'));
253 $self->logger->info(
"Done.\n\n", 0,
'stamped');
264 my $exon_scores = shift;
265 my $transcript_mappings = shift;
269 unless ($exon_scores and
270 $exon_scores->isa(
'Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
271 throw(
'Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix of exons.');
274 unless ($transcript_mappings and
275 $transcript_mappings->isa(
'Bio::EnsEMBL::IdMapping::MappingList')) {
276 throw(
'Need a Bio::EnsEMBL::IdMapping::MappingList of transcripts.');
280 $esb->isa(
'Bio::EnsEMBL::IdMapping::ExonScoreBuilder')) {
281 throw(
'Need a Bio::EnsEMBL::IdMapping::ExonScoreBuilder.');
284 $self->logger->info(
"== Internal ID mapping for exons...\n\n", 0,
'stamped');
286 my $dump_path = path_append($self->conf->param(
'basedir'),
'mapping');
289 -DUMP_PATH => $dump_path,
290 -CACHE_FILE =>
'exon_mappings.ser',
295 if (-s $mapping_cache) {
298 $self->logger->info(
"Reading exon mappings from file...\n", 0,
300 $self->logger->debug(
"Cache file $mapping_cache.\n", 1);
301 $mappings->read_from_file;
302 $self->logger->info(
"Done.\n\n", 0,
'stamped');
306 # create exon mappings
307 $self->logger->info(
"No exon mappings found. Will calculate them now.\n");
309 # determine which plugin methods to run
310 my @default_plugins = (qw(
317 my @plugins = $self->conf->param(
'plugin_internal_id_mappers_exon');
318 @plugins = @default_plugins unless (defined($plugins[0]));
321 -DUMP_PATH => $dump_path,
322 -CACHE_FILE =>
'exon_mappings0.ser',
328 # run the scoring chain
330 foreach my $plugin (@plugins) {
331 ($exon_scores, $new_mappings) = $self->delegate_to_plugin($plugin, $i++,
332 $esb, $new_mappings, $exon_scores, $transcript_mappings);
334 push(@mappings, $new_mappings);
337 # report remaining ambiguities
338 $self->logger->info($exon_scores->get_source_count.
339 " source exons are ambiguous with ".
340 $exon_scores->get_target_count.
" target exons.\n\n");
342 $self->log_ambiguous($exon_scores,
'exon');
344 # merge mappings and write to file
345 $mappings->add_all(@mappings);
346 $mappings->write_to_file;
348 if ($self->logger->loglevel eq
'debug') {
349 $mappings->log(
'exon', $self->conf->param(
'basedir'));
352 $self->logger->info(
"Done.\n\n", 0,
'stamped');
362 # this is not implemented as a plugin, since a) it's too simple and b) it's
363 # tied to transcripts so there are no translation scores or score builder.
365 sub map_translations {
367 my $transcript_mappings = shift;
370 unless ($transcript_mappings and
371 $transcript_mappings->isa(
'Bio::EnsEMBL::IdMapping::MappingList')) {
372 throw(
'Need a Bio::EnsEMBL::IdMapping::MappingList of transcripts.');
375 $self->logger->info(
"== Internal ID mapping for translations...\n\n", 0,
'stamped');
377 my $dump_path = path_append($self->conf->param(
'basedir'),
'mapping');
380 -DUMP_PATH => $dump_path,
381 -CACHE_FILE =>
'translation_mappings.ser',
386 if (-s $mapping_cache) {
389 $self->logger->info(
"Reading translation mappings from file...\n", 0,
391 $self->logger->debug(
"Cache file $mapping_cache.\n", 1);
392 $mappings->read_from_file;
393 $self->logger->info(
"Done.\n\n", 0,
'stamped');
397 # create translation mappings
398 $self->logger->info(
"No translation mappings found. Will calculate them now.\n");
400 $self->logger->info(
"Translation mapping...\n", 0,
'stamped');
403 # map translations for mapped transcripts
407 foreach my $entry (@{ $transcript_mappings->get_all_Entries }) {
409 my $source_tl = $self->cache->get_by_key(
'transcripts_by_id',
410 'source', $entry->source)->translation;
411 my $target_tl = $self->cache->get_by_key(
'transcripts_by_id',
412 'target', $entry->target)->translation;
414 if ($source_tl and $target_tl) {
416 # add mapping for the translations; note that the score is taken from
417 # the transcript mapping
419 $source_tl->id, $target_tl->id, $entry->score
421 $mappings->add_Entry($tl_entry);
429 $self->logger->debug(
"Skipped transcripts without translation: $i\n", 1);
430 $self->logger->info(
"New mappings: ".$mappings->get_entry_count.
"\n\n");
432 $mappings->write_to_file;
434 if ($self->logger->loglevel eq
'debug') {
435 $mappings->log(
'translation', $self->conf->param(
'basedir'));
438 $self->logger->info(
"Done.\n\n", 0,
'stamped');
# Run one step of a plugin chain: resolve $plugin (a "Some::Module::method"
# string) to a plugin object plus a method name, then call that method.
#
# The argument checks below require
#   - $score_builder: a Bio::EnsEMBL::IdMapping::ScoreBuilder
#   - $mappings:      a Bio::EnsEMBL::IdMapping::MappingList
#   - $scores:        a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix
# and throw otherwise. The final statement forwards $num, $score_builder,
# $mappings, $scores and any remaining caller arguments (@_) to the plugin
# method and returns whatever that method returns.
#
# NOTE(review): the declarations of $self, $plugin, $num, $scores, $module,
# $method and $plugin_instance are not visible in this excerpt - confirm
# them against the full file before relying on this summary.
447 sub delegate_to_plugin {
451 my $score_builder = shift;
452 my $mappings = shift;
456 unless ($score_builder and
457 $score_builder->isa(
'Bio::EnsEMBL::IdMapping::ScoreBuilder')) {
458 throw(
'Need a Bio::EnsEMBL::IdMapping::ScoreBuilder.');
461 unless ($mappings and
462 $mappings->isa(
'Bio::EnsEMBL::IdMapping::MappingList')) {
463 throw(
'Need a Bio::EnsEMBL::IdMapping::MappingList.');
467 $scores->isa(
'Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
468 throw(
'Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
471 # split plugin name into module and method
# The greedy (.*) means the LAST "::" separates the module from the method.
# NOTE(review): the match is evaluated in void context, so its success is
# not tested here. Perl leaves $1/$2 untouched when a match fails, so stale
# captures from an earlier successful match could be picked up. Presumably
# the (not visible) assignments to $module/$method read $1/$2 - verify, and
# consider capturing directly from the match in list context.
472 $plugin =~ /(.*)::(\w+)$/;
476 unless ($module and $method) {
477 throw(
"Unable to determine module and method name from $plugin.\n");
480 # instantiate the plugin unless we already have an instance
# Plugin instances are looked up by $module, so each module is constructed
# at most once per mapper object and later chain steps reuse it.
482 if ($self->has_plugin($module)) {
484 # re-use an existing plugin instance
485 $plugin_instance = $self->get_plugin($module);
489 # inject and instantiate the plugin module
# The new instance shares this object's logger, conf and cache.
491 $plugin_instance = $module->new(
492 -LOGGER => $self->logger,
493 -CONF => $self->conf,
494 -CACHE => $self->cache
496 $self->add_plugin($plugin_instance);
500 # run the method on the plugin
502 # pass in a sequence number (number of method run, used for generating
503 # checkpoint files), the scores used for determining the mapping, and all
504 # other arguments passed to this method (these will vary for different object
507 # return the scores and mappings to feed into the next plugin in the chain
# $method is a plain string here; Perl resolves it as a method name on the
# plugin instance at call time.
508 return $plugin_instance->$method($num, $score_builder, $mappings, $scores, @_);
516 defined($self->{
'_plugins'}->{$module}) ? (
return 1) : (return 0);
524 return $self->{
'_plugins'}->{$module};
530 my $plugin_instance = shift;
532 $self->{
'_plugins'}->{ref($plugin_instance)} = $plugin_instance;
542 $matrix->isa(
'Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
543 throw(
'Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
546 # create dump directory if it doesn't exist
547 my $debug_path = $self->conf->param(
'basedir').
'/debug';
# NOTE(review): $debug_path comes from configuration and is interpolated
# into a single-string system() call, so it passes through the shell. A
# basedir containing spaces or shell metacharacters would break (or be
# interpreted by) the command. File::Path::make_path($debug_path) would
# avoid the shell entirely - confirm before changing.
548 unless (-d $debug_path) {
549 system(
"mkdir -p $debug_path") == 0 or
550 throw(
"Unable to create directory $debug_path.\n");
# One report file per object type; mode '>' truncates any previous report.
553 my $logfile =
"$debug_path/ambiguous_${type}.txt";
555 open(my $fh,
'>', $logfile) or
556 throw(
"Unable to open $logfile for writing: $!");
# Buckets for the entries of the group currently being collected. They are
# passed by reference to write_ambiguous and reused for every group.
558 my @low_scoring = ();
559 my @high_scoring = ();
# Pass 1: walk the matrix entries ordered numerically by source id. A change
# of source id ends the current group, which is handed to write_ambiguous
# before the new entry is bucketed.
563 foreach my $entry (sort { $a->source <=> $b->source }
564 @{ $matrix->get_all_Entries }) {
# NOTE(review): this seeds $last_id from ->target, but the comparison on the
# next statement and the re-assignment below both use ->source. The target
# pass further down seeds from ->target and compares ->target, so this looks
# like a copy/paste slip; ->source is presumably intended. As written, the
# first entry normally triggers a write_ambiguous call with empty buckets.
# Also note that ||= re-seeds whenever $last_id is false (e.g. an id of 0).
566 $last_id ||= $entry->target;
568 if ($last_id != $entry->source) {
569 $self->write_ambiguous($type,
'source', $fh, \@low_scoring,
571 $last_id = $entry->source;
# Entries scoring below 0.5 go to @low_scoring; the second push presumably
# sits in an else branch that is not visible in this excerpt.
574 if ($entry->score < 0.5) {
575 push @low_scoring, $entry;
577 push @high_scoring, $entry;
# Flush the final source group, which the loop itself never writes.
582 $self->write_ambiguous($type,
'source', $fh, \@low_scoring, \@high_scoring);
584 # now do the same by target
# Pass 2 mirrors pass 1 with entries ordered and grouped by target id.
# NOTE(review): whether $last_id is reset between the two passes is not
# visible here - confirm, otherwise the last source id leaks into this loop.
586 foreach my $entry (sort { $a->target <=> $b->target }
587 @{ $matrix->get_all_Entries }) {
589 $last_id ||= $entry->target;
591 if ($last_id != $entry->target) {
592 $self->write_ambiguous($type,
'target', $fh, \@low_scoring,
594 $last_id = $entry->target;
597 if ($entry->score < 0.5) {
598 push @low_scoring, $entry;
600 push @high_scoring, $entry;
# Flush the final target group.
605 $self->write_ambiguous($type,
'target', $fh, \@low_scoring, \@high_scoring);
# Write one group of ambiguous entries to the filehandle $fh.
#
#   $type    - object type label, only interpolated into the heading
#   $db_type - 'source' or 'target'; also used as a method name on entries
#   $low     - array ref of low scoring entries for this group
#   $high    - array ref of high scoring entries for this group
#
# The shift loops below consume @$low and @$high, so the caller's arrays are
# emptied as a side effect of printing.
611 sub write_ambiguous {
612 my ($self, $type, $db_type, $fh, $low, $high) = @_;
614 # if only source or target are ambiguous (i.e. you have only one mapping from
615 # this perspective) then log from the other perspective
# NOTE(review): the body of this branch is not visible in this excerpt.
# Presumably it returns early; confirm it also clears @$low/@$high, because
# the caller reuses the same arrays for the next group.
616 if (scalar(@$low) + scalar(@$high) <= 1) {
# The id printed in the heading is taken from the first entry of a bucket by
# calling the method named in $db_type. Which condition picks @$low over
# @$high is not visible here.
624 $first_id = $low->[0]->$db_type;
626 $first_id = $high->[0]->$db_type;
# The listing shows the opposite side of each mapping.
630 if ($db_type eq
'source') {
631 $other_db_type =
'target';
633 $other_db_type =
'source';
636 print $fh
"$db_type $type $first_id scores ambiguously:\n";
# High scoring entries: one per line, other-side id followed by the score.
640 print $fh
" high scoring ${other_db_type}s\n";
642 while (my $e = shift(@$high)) {
643 print $fh
" ", $e->$other_db_type,
" ", $e->score,
"\n";
# Low scoring entries: ids only, comma separated.
649 print $fh
" low scoring ${other_db_type}s\n ";
653 while (my $e = shift(@$low)) {
# A line break is emitted whenever the running counter $i is a multiple of
# 10, i.e. before the 1st, 11th, 21st ... id (assuming $i starts at 0; its
# declaration is not visible here).
654 print $fh
"\n " unless (($i++)%10);
655 print $fh $e->$other_db_type,
", ";