ensembl-hive  2.8.1
InternalIdMapper.pm
Go to the documentation of this file.
1 =head1 LICENSE
2 
3 See the NOTICE file distributed with this work for additional information
4 regarding copyright ownership.
5 
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9 
10  http://www.apache.org/licenses/LICENSE-2.0
11 
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 
18 =cut
19 
20 
21 =head1 CONTACT
22 
23  Please email comments or questions to the public Ensembl
24  developers list at <http://lists.ensembl.org/mailman/listinfo/dev>.
25 
26  Questions may also be sent to the Ensembl help desk at
27  <http://www.ensembl.org/Help/Contact>.
28 
29 =cut
30 
31 =head1 NAME
32 
33 =head1 SYNOPSIS
34 
35 =head1 DESCRIPTION
36 
37 =head1 METHODS
38 
39 =cut
40 
41 
42 package Bio::EnsEMBL::IdMapping::InternalIdMapper;
43 
44 use strict;
45 use warnings;
46 no warnings 'uninitialized';
47 
50 
51 use Bio::EnsEMBL::Utils::Exception qw(throw warning);
52 use Bio::EnsEMBL::Utils::ScriptUtils qw(inject path_append);
56 
57 
58 # scores are considered the same if (2.0 * (s1-s2))/(s1 + s2) < this
59 use constant SIMILAR_SCORE_RATIO => 0.01;
60 
61 
62 sub map_genes {
63  my $self = shift;
64  my $gene_scores = shift;
65  my $transcript_scores = shift;
66  my $gsb = shift;
67 
68  # argument checks
69  unless ($gene_scores and
70  $gene_scores->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
71  throw('Need a gene Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
72  }
73 
74  unless ($transcript_scores and
75  $transcript_scores->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
76  throw('Need a transcript Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
77  }
78 
79  unless ($gsb and
80  $gsb->isa('Bio::EnsEMBL::IdMapping::GeneScoreBuilder')) {
81  throw('Need a Bio::EnsEMBL::IdMapping::GeneScoreBuilder.');
82  }
83 
84  $self->logger->info("== Internal ID mapping for genes...\n\n", 0, 'stamped');
85 
86  my $dump_path = path_append($self->conf->param('basedir'), 'mapping');
87 
89  -DUMP_PATH => $dump_path,
90  -CACHE_FILE => 'gene_mappings.ser',
91  );
92 
93  my $mapping_cache = $mappings->cache_file;
94 
95  if (-s $mapping_cache) {
96 
97  # read from file
98  $self->logger->info("Reading gene mappings from file...\n", 0, 'stamped');
99  $self->logger->debug("Cache file $mapping_cache.\n", 1);
100  $mappings->read_from_file;
101  $self->logger->info("Done.\n\n", 0, 'stamped');
102 
103  } else {
104 
105  # create gene mappings
106  $self->logger->info("No gene mappings found. Will calculate them now.\n");
107 
108  # determine which plugin methods to run
109  my @default_plugins = (qw(
116  ));
117 
118  my @plugins = $self->conf->param('plugin_internal_id_mappers_gene');
119  @plugins = @default_plugins unless (defined($plugins[0]));
120 
121  my $new_mappings = Bio::EnsEMBL::IdMapping::MappingList->new(
122  -DUMP_PATH => $dump_path,
123  -CACHE_FILE => 'gene_mappings0.ser',
124  );
125  my @mappings = ();
126  my $i = 0;
127 
128  #
129  # run the scoring chain
130  #
131  foreach my $plugin (@plugins) {
132  ($gene_scores, $new_mappings) = $self->delegate_to_plugin($plugin, $i++,
133  $gsb, $new_mappings, $gene_scores, $transcript_scores);
134 
135  push(@mappings, $new_mappings);
136  }
137 
138  # report remaining ambiguities
139  $self->logger->info($gene_scores->get_source_count.
140  " source genes are ambiguous with ".
141  $gene_scores->get_target_count." target genes.\n\n");
142 
143  $self->log_ambiguous($gene_scores, 'gene');
144 
145  # merge mappings and write to file
146  $mappings->add_all(@mappings);
147  $mappings->write_to_file;
148 
149  if ($self->logger->loglevel eq 'debug') {
150  $mappings->log('gene', $self->conf->param('basedir'));
151  }
152 
153  $self->logger->info("Done.\n\n", 0, 'stamped');
154 
155  }
156 
157  return $mappings;
158 }
159 
160 
161 sub map_transcripts {
162  my $self = shift;
163  my $transcript_scores = shift;
164  my $gene_mappings = shift;
165  my $tsb = shift;
166 
167  # argument checks
168  unless ($transcript_scores and
169  $transcript_scores->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
170  throw('Need a transcript Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
171  }
172 
173  unless ($gene_mappings and
174  $gene_mappings->isa('Bio::EnsEMBL::IdMapping::MappingList')) {
175  throw('Need a gene Bio::EnsEMBL::IdMapping::MappingList.');
176  }
177 
178  unless ($tsb and
179  $tsb->isa('Bio::EnsEMBL::IdMapping::TranscriptScoreBuilder')) {
180  throw('Need a Bio::EnsEMBL::IdMapping::TranscriptScoreBuilder.');
181  }
182 
183  $self->logger->info("== Internal ID mapping for transcripts...\n\n", 0, 'stamped');
184 
185  my $dump_path = path_append($self->conf->param('basedir'), 'mapping');
186 
188  -DUMP_PATH => $dump_path,
189  -CACHE_FILE => 'transcript_mappings.ser',
190  );
191 
192  my $mapping_cache = $mappings->cache_file;
193 
194  if (-s $mapping_cache) {
195 
196  # read from file
197  $self->logger->info("Reading transcript mappings from file...\n", 0,
198  'stamped');
199  $self->logger->debug("Cache file $mapping_cache.\n", 1);
200  $mappings->read_from_file;
201  $self->logger->info("Done.\n\n", 0, 'stamped');
202 
203  } else {
204 
205  # create transcript mappings
206  $self->logger->info("No transcript mappings found. Will calculate them now.\n");
207 
208  # determine which plugin methods to run
209  my @default_plugins = (qw(
216  ));
217 
218  my @plugins = $self->conf->param('plugin_internal_id_mappers_transcript');
219  @plugins = @default_plugins unless (defined($plugins[0]));
220 
221  my $new_mappings = Bio::EnsEMBL::IdMapping::MappingList->new(
222  -DUMP_PATH => $dump_path,
223  -CACHE_FILE => 'transcript_mappings0.ser',
224  );
225  my @mappings = ();
226  my $i = 0;
227 
228  #
229  # run the scoring chain
230  #
231  foreach my $plugin (@plugins) {
232  ($transcript_scores, $new_mappings) = $self->delegate_to_plugin($plugin,
233  $i++, $tsb, $new_mappings, $transcript_scores, $gene_mappings);
234 
235  push(@mappings, $new_mappings);
236  }
237 
238  # report remaining ambiguities
239  $self->logger->info($transcript_scores->get_source_count.
240  " source transcripts are ambiguous with ".
241  $transcript_scores->get_target_count." target transcripts.\n\n");
242 
243  $self->log_ambiguous($transcript_scores, 'transcript');
244 
245  # merge mappings and write to file
246  $mappings->add_all(@mappings);
247  $mappings->write_to_file;
248 
249  if ($self->logger->loglevel eq 'debug') {
250  $mappings->log('transcript', $self->conf->param('basedir'));
251  }
252 
253  $self->logger->info("Done.\n\n", 0, 'stamped');
254 
255  }
256 
257  return $mappings;
258 
259 }
260 
261 
262 sub map_exons {
263  my $self = shift;
264  my $exon_scores = shift;
265  my $transcript_mappings = shift;
266  my $esb = shift;
267 
268  # argument checks
269  unless ($exon_scores and
270  $exon_scores->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
271  throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix of exons.');
272  }
273 
274  unless ($transcript_mappings and
275  $transcript_mappings->isa('Bio::EnsEMBL::IdMapping::MappingList')) {
276  throw('Need a Bio::EnsEMBL::IdMapping::MappingList of transcripts.');
277  }
278 
279  unless ($esb and
280  $esb->isa('Bio::EnsEMBL::IdMapping::ExonScoreBuilder')) {
281  throw('Need a Bio::EnsEMBL::IdMapping::ExonScoreBuilder.');
282  }
283 
284  $self->logger->info("== Internal ID mapping for exons...\n\n", 0, 'stamped');
285 
286  my $dump_path = path_append($self->conf->param('basedir'), 'mapping');
287 
289  -DUMP_PATH => $dump_path,
290  -CACHE_FILE => 'exon_mappings.ser',
291  );
292 
293  my $mapping_cache = $mappings->cache_file;
294 
295  if (-s $mapping_cache) {
296 
297  # read from file
298  $self->logger->info("Reading exon mappings from file...\n", 0,
299  'stamped');
300  $self->logger->debug("Cache file $mapping_cache.\n", 1);
301  $mappings->read_from_file;
302  $self->logger->info("Done.\n\n", 0, 'stamped');
303 
304  } else {
305 
306  # create exon mappings
307  $self->logger->info("No exon mappings found. Will calculate them now.\n");
308 
309  # determine which plugin methods to run
310  my @default_plugins = (qw(
315  ));
316 
317  my @plugins = $self->conf->param('plugin_internal_id_mappers_exon');
318  @plugins = @default_plugins unless (defined($plugins[0]));
319 
320  my $new_mappings = Bio::EnsEMBL::IdMapping::MappingList->new(
321  -DUMP_PATH => $dump_path,
322  -CACHE_FILE => 'exon_mappings0.ser',
323  );
324  my @mappings = ();
325  my $i = 0;
326 
327  #
328  # run the scoring chain
329  #
330  foreach my $plugin (@plugins) {
331  ($exon_scores, $new_mappings) = $self->delegate_to_plugin($plugin, $i++,
332  $esb, $new_mappings, $exon_scores, $transcript_mappings);
333 
334  push(@mappings, $new_mappings);
335  }
336 
337  # report remaining ambiguities
338  $self->logger->info($exon_scores->get_source_count.
339  " source exons are ambiguous with ".
340  $exon_scores->get_target_count." target exons.\n\n");
341 
342  $self->log_ambiguous($exon_scores, 'exon');
343 
344  # merge mappings and write to file
345  $mappings->add_all(@mappings);
346  $mappings->write_to_file;
347 
348  if ($self->logger->loglevel eq 'debug') {
349  $mappings->log('exon', $self->conf->param('basedir'));
350  }
351 
352  $self->logger->info("Done.\n\n", 0, 'stamped');
353 
354  }
355 
356  return $mappings;
357 
358 }
359 
360 
#
# this is not implemented as a plugin, since a) it's too simple and b) it's
# tied to transcripts so there are no translation scores or score builder.
#
# Map translations by following the transcript mappings: every mapped
# transcript pair whose source and target transcripts both have a translation
# yields one translation mapping carrying the transcript mapping's score.
#
# Arguments: $transcript_mappings - Bio::EnsEMBL::IdMapping::MappingList of
#                                   transcript mappings
# Returns:   the MappingList of translation mappings (read from the
#            'translation_mappings.ser' cache file if that file is non-empty,
#            otherwise freshly computed and written to it)
# Throws:    if $transcript_mappings is missing or not a MappingList
#
sub map_translations {
  my $self = shift;
  my $transcript_mappings = shift;

  # argument checks
  unless ($transcript_mappings and
          $transcript_mappings->isa('Bio::EnsEMBL::IdMapping::MappingList')) {
    throw('Need a Bio::EnsEMBL::IdMapping::MappingList of transcripts.');
  }

  $self->logger->info("== Internal ID mapping for translations...\n\n", 0, 'stamped');

  my $dump_path = path_append($self->conf->param('basedir'), 'mapping');

  # container for the result; also knows the location of its cache file
  my $mappings = Bio::EnsEMBL::IdMapping::MappingList->new(
    -DUMP_PATH   => $dump_path,
    -CACHE_FILE  => 'translation_mappings.ser',
  );

  my $mapping_cache = $mappings->cache_file;

  if (-s $mapping_cache) {

    # read from file
    $self->logger->info("Reading translation mappings from file...\n", 0,
                        'stamped');
    $self->logger->debug("Cache file $mapping_cache.\n", 1);
    $mappings->read_from_file;
    $self->logger->info("Done.\n\n", 0, 'stamped');

  } else {

    # create translation mappings
    $self->logger->info("No translation mappings found. Will calculate them now.\n");

    $self->logger->info("Translation mapping...\n", 0, 'stamped');

    #
    # map translations for mapped transcripts
    #
    # number of transcript mappings skipped because at least one side has no
    # translation
    my $skipped = 0;

    foreach my $entry (@{ $transcript_mappings->get_all_Entries }) {

      my $source_tl = $self->cache->get_by_key('transcripts_by_id',
        'source', $entry->source)->translation;
      my $target_tl = $self->cache->get_by_key('transcripts_by_id',
        'target', $entry->target)->translation;

      if ($source_tl and $target_tl) {

        # add mapping for the translations; note that the score is taken from
        # the transcript mapping
        my $tl_entry = Bio::EnsEMBL::IdMapping::Entry->new_fast([
          $source_tl->id, $target_tl->id, $entry->score
        ]);
        $mappings->add_Entry($tl_entry);

      } else {
        $skipped++;
      }

    }

    $self->logger->debug("Skipped transcripts without translation: $skipped\n", 1);
    $self->logger->info("New mappings: ".$mappings->get_entry_count."\n\n");

    $mappings->write_to_file;

    if ($self->logger->loglevel eq 'debug') {
      $mappings->log('translation', $self->conf->param('basedir'));
    }

    $self->logger->info("Done.\n\n", 0, 'stamped');

  }

  return $mappings;

}
445 
446 
#
# Run a single plugin method of a mapping chain.
#
# Arguments: $plugin        - fully qualified method name of the form
#                             'Some::Module::method'
#            $num           - sequence number of this step in the chain
#            $score_builder - a Bio::EnsEMBL::IdMapping::ScoreBuilder
#            $mappings      - a Bio::EnsEMBL::IdMapping::MappingList
#            $scores        - a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix
#            @_             - any further arguments are handed to the plugin
#                             method unchanged
# Returns:   whatever the plugin method returns (callers in this file expect
#            a (scores, mappings) pair to feed into the next plugin)
# Throws:    on wrong argument types, or if $plugin cannot be split into a
#            module and a method name
#
sub delegate_to_plugin {
  my $self = shift;
  my $plugin = shift;
  my $num = shift;
  my $score_builder = shift;
  my $mappings = shift;
  my $scores = shift;

  # argument checks
  unless ($score_builder and
          $score_builder->isa('Bio::EnsEMBL::IdMapping::ScoreBuilder')) {
    throw('Need a Bio::EnsEMBL::IdMapping::ScoreBuilder.');
  }

  unless ($mappings and
          $mappings->isa('Bio::EnsEMBL::IdMapping::MappingList')) {
    throw('Need a Bio::EnsEMBL::IdMapping::MappingList.');
  }

  unless ($scores and
          $scores->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
    throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
  }

  # split plugin name into module and method
  #
  # the captures are taken from the match in list context rather than from
  # $1/$2: after a failed match $1/$2 still hold the values of an earlier
  # successful match, which would let a malformed plugin name pass the check
  # below
  my ($module, $method) = $plugin =~ /(.*)::(\w+)$/;

  unless ($module and $method) {
    throw("Unable to determine module and method name from $plugin.\n");
  }

  # instantiate the plugin unless we already have an instance
  my $plugin_instance;
  if ($self->has_plugin($module)) {

    # re-use an existing plugin instance
    $plugin_instance = $self->get_plugin($module);

  } else {

    # inject and instantiate the plugin module
    inject($module);
    $plugin_instance = $module->new(
      -LOGGER       => $self->logger,
      -CONF         => $self->conf,
      -CACHE        => $self->cache
    );
    $self->add_plugin($plugin_instance);

  }

  # run the method on the plugin
  #
  # pass in a sequence number (number of method run, used for generating
  # checkpoint files), the scores used for determining the mapping, and all
  # other arguments passed to this method (these will vary for different object
  # types)
  #
  # return the scores and mappings to feed into the next plugin in the chain
  return $plugin_instance->$method($num, $score_builder, $mappings, $scores, @_);
}
510 
511 
#
# Tell whether a plugin instance is registered under the given module name.
#
# Arguments: $module - module (class) name to look up
# Returns:   1 if a defined entry exists for $module, 0 otherwise
#
sub has_plugin {
  my ($self, $module) = @_;

  my $registered = $self->{'_plugins'}->{$module};

  return defined($registered) ? 1 : 0;
}
518 
519 
#
# Fetch the plugin instance registered under the given module name.
#
# Arguments: $module - module (class) name to look up
# Returns:   the stored instance, or undef if nothing is stored for $module
#
sub get_plugin {
  my ($self, $module) = @_;

  my $registry = $self->{'_plugins'};

  return $registry->{$module};
}
526 
527 
#
# Register a plugin instance, keyed by the class it is blessed into.
#
# Arguments: $plugin_instance - the plugin object to store
# Returns:   the stored instance (value of the assignment)
#
sub add_plugin {
  my ($self, $plugin_instance) = @_;

  my $class = ref($plugin_instance);

  return $self->{'_plugins'}->{$class} = $plugin_instance;
}
534 
535 
#
# Write the entries remaining in a scoring matrix to
# <basedir>/debug/ambiguous_<type>.txt, grouped first by source id and then by
# target id. Within each group, entries scoring below 0.5 are listed as low
# scoring, all others as high scoring.
#
# Arguments: $matrix - a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix
#            $type   - object type label, used in the file name and the log
#                      text (callers in this file pass 'gene', 'transcript'
#                      or 'exon')
# Throws:    if $matrix is missing or of the wrong type, if the debug
#            directory cannot be created, or if the log file cannot be opened
#
sub log_ambiguous {
  my $self = shift;
  my $matrix = shift;
  my $type = shift;

  unless ($matrix and
          $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
    throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
  }

  # create dump directory if it doesn't exist
  #
  # list form of system() so the path is passed to mkdir as a single argument
  # and never interpreted by a shell
  my $debug_path = $self->conf->param('basedir').'/debug';
  unless (-d $debug_path) {
    system('mkdir', '-p', $debug_path) == 0 or
      throw("Unable to create directory $debug_path.\n");
  }

  my $logfile = "$debug_path/ambiguous_${type}.txt";

  open(my $fh, '>', $logfile) or
    throw("Unable to open $logfile for writing: $!");

  # buffers for the group currently being collected; write_ambiguous() empties
  # them every time it is called
  my @low_scoring = ();
  my @high_scoring = ();

  # log by source first, then do the same by target
  foreach my $db_type (qw(source target)) {

    my $last_id;

    foreach my $entry (sort { $a->$db_type <=> $b->$db_type }
                        @{ $matrix->get_all_Entries }) {

      my $id = $entry->$db_type;

      # seed with the first id of this pass (defined-check so that an id of 0
      # is handled like any other id)
      $last_id = $id unless (defined($last_id));

      # the id changed, so the previous group is complete
      if ($last_id != $id) {
        $self->write_ambiguous($type, $db_type, $fh, \@low_scoring,
          \@high_scoring);
        $last_id = $id;
      }

      if ($entry->score < 0.5) {
        push @low_scoring, $entry;
      } else {
        push @high_scoring, $entry;
      }
    }

    # write last group of this pass
    $self->write_ambiguous($type, $db_type, $fh, \@low_scoring,
      \@high_scoring);
  }

  # buffered write errors only surface on close; report them without aborting
  # the mapping run, since this file is purely diagnostic
  close($fh) or
    warning("Unable to close $logfile: $!");
}
609 
610 
#
# Print one group of ambiguous mappings to a filehandle and empty both
# entry buffers.
#
# Arguments: $type    - object type label used in the heading
#            $db_type - perspective of the group, 'source' or 'target'
#            $fh      - filehandle to print to
#            $low     - arrayref of low scoring entries (emptied by this call)
#            $high    - arrayref of high scoring entries (emptied by this call)
#
# Nothing is printed when the group holds at most one entry.
#
sub write_ambiguous {
  my $self = shift;
  my $type = shift;
  my $db_type = shift;
  my $fh = shift;
  my $low = shift;
  my $high = shift;

  # a group with a single mapping is not ambiguous from this perspective; it
  # is left to be logged from the other perspective
  unless (@$low + @$high > 1) {
    @$low = ();
    @$high = ();
    return;
  }

  # the id shared by the whole group, taken from its first entry
  my $first_entry = @$low ? $low->[0] : $high->[0];
  my $first_id = $first_entry->$db_type;

  # the entries are listed by their id on the opposite side
  my $other_db_type = ($db_type eq 'source') ? 'target' : 'source';

  print {$fh} "$db_type $type $first_id scores ambiguously:\n";

  # high scorers: one per line, together with the score
  if (@$high) {
    print {$fh} " high scoring ${other_db_type}s\n";

    while (my $entry = shift(@$high)) {
      print {$fh} " ", $entry->$other_db_type, " ", $entry->score, "\n";
    }
  }

  # low scorers: ids only, comma separated, with a line break inserted before
  # every tenth entry
  if (@$low) {
    print {$fh} " low scoring ${other_db_type}s\n ";

    my $position = 1;

    while (my $entry = shift(@$low)) {
      print {$fh} "\n " if ($position % 10 == 0);
      $position++;
      print {$fh} $entry->$other_db_type, ", ";
    }
    print {$fh} "\n";
  }

  print {$fh} "\n";
}
662 
663 
664 1;
665 
Bio::EnsEMBL::IdMapping::Entry
Definition: Entry.pm:16
Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblGeneGeneric::best_transcript
public best_transcript()
Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblTranscriptGeneric::biotype
public biotype()
Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblExonGeneric::mapped_transcript
public mapped_transcript()
Bio::EnsEMBL::IdMapping::BaseObject
Definition: BaseObject.pm:25
Bio::EnsEMBL::Utils::ScriptUtils
Definition: ScriptUtils.pm:11
Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblGeneGeneric::init_basic
public init_basic()
Bio::EnsEMBL::IdMapping::MappingList::new
public Bio::EnsEMBL::IdMapping::MappingList new()
Bio::EnsEMBL::IdMapping::SyntenyFramework
Definition: SyntenyFramework.pm:41
Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblGeneGeneric::synteny
public synteny()
Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblGeneGeneric::location
public location()
Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblTranscriptGeneric::internal_id
public internal_id()
Bio::EnsEMBL::IdMapping::Serialisable::cache_file
public String cache_file()
Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblTranscriptGeneric::non_exact_translation
public non_exact_translation()
Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblGeneGeneric::internal_id
public internal_id()
Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblExonGeneric::init_basic
public init_basic()
Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblTranscriptGeneric::init_basic
public init_basic()
Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblExonGeneric::single_transcript
public single_transcript()
Bio::EnsEMBL::Utils::Logger::info
public info()
Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblTranscriptGeneric::single_gene
public single_gene()
Bio::EnsEMBL::IdMapping::MappingList
Definition: MappingList.pm:38
Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblTranscriptGeneric::mapped_gene
public mapped_gene()
Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblExonGeneric::internal_id
public internal_id()
Bio::EnsEMBL::IdMapping::Entry::new_fast
public A new_fast()
Bio::EnsEMBL::Utils::Exception
Definition: Exception.pm:68
Bio::EnsEMBL::IdMapping::SyntenyFramework::logger
public Bio::EnsEMBL::Utils::Logger logger()
Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblGeneGeneric::biotype
public biotype()