ensembl-hive  2.8.1
StableIdMapper.pm
Go to the documentation of this file.
1 =head1 LICENSE
2 
3 See the NOTICE file distributed with this work for additional information
4 regarding copyright ownership.
5 
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9 
10  http://www.apache.org/licenses/LICENSE-2.0
11 
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 
18 =cut
19 
20 
21 =head1 CONTACT
22 
23  Please email comments or questions to the public Ensembl
24  developers list at <http://lists.ensembl.org/mailman/listinfo/dev>.
25 
26  Questions may also be sent to the Ensembl help desk at
27  <http://www.ensembl.org/Help/Contact>.
28 
29 =cut
30 
31 =head1 NAME
32 
33 =head1 SYNOPSIS
34 
35 =head1 DESCRIPTION
36 
37 =head1 METHODS
38 
39 =cut
40 
41 package Bio::EnsEMBL::IdMapping::StableIdMapper;
42 
43 use strict;
44 use warnings;
45 no warnings 'uninitialized';
46 
49 
50 use Bio::EnsEMBL::Utils::Exception qw(throw warning);
51 use Bio::EnsEMBL::Utils::ScriptUtils qw(inject path_append);
53 use POSIX qw(strftime);
54 
55 
56 # instance variables
57 my %debug_mappings;
58 
59 
60 sub new {
61  my $caller = shift;
62  my $class = ref($caller) || $caller;
63  my $self = $class->SUPER::new(@_);
64 
65  # inject a StableIdGenerator
66  #
67  # If you write your own generators, make sure they extend
68  # Bio::EnsEMBL::Idmapping::BaseObject and additionally implement these three
69  # methods: initial_stable_id(), increment_stable_id() and calculate_version().
70  my $stable_id_generator = $self->conf->param('plugin_stable_id_generator') ||
71  'Bio::EnsEMBL::IdMapping::StableIdGenerator::EnsemblGeneric';
72  $self->logger->debug("Using $stable_id_generator to generate stable Ids.\n");
73  inject($stable_id_generator);
74 
75  # create a new StableIdGenerator object
76  my $generator_instance = $stable_id_generator->new(
77  -LOGGER => $self->logger,
78  -CONF => $self->conf,
79  -CACHE => $self->cache
80  );
81  $self->stable_id_generator($generator_instance);
82 
83  return $self;
84 }
85 
86 
87 sub generate_mapping_session {
88  my $self = shift;
89 
90  # only run this method once
91  return if ($self->mapping_session_date);
92 
93  $self->logger->info("Generating new mapping_session...\n");
94 
95  $self->mapping_session_date(time);
96  $self->mapping_session_date_fmt(strftime("%Y-%m-%d %T",
97  localtime($self->mapping_session_date)));
98 
99  my $s_dba = $self->cache->get_DBAdaptor('source');
100  my $s_dbh = $s_dba->dbc->db_handle;
101  my $t_dba = $self->cache->get_DBAdaptor('target');
102  my $t_dbh = $t_dba->dbc->db_handle;
103 
104  # check if mapping_session_id was manually set by the configuration
105  my $mapping_session_id = $self->conf->param('mapping_session_id');
106 
107  if ($mapping_session_id) {
108 
109  $self->logger->debug("Using manually configured mapping_session_id $mapping_session_id\n", 1);
110 
111  } else {
112 
113  # calculate mapping_session_id from db
114  my $sql = qq(SELECT MAX(mapping_session_id) FROM mapping_session);
115  $mapping_session_id = $self->fetch_value_from_db($s_dbh, $sql);
116 
117  unless ($mapping_session_id) {
118  $self->logger->debug("No previous mapping_session found.\n", 1);
119  }
120 
121  # increment last mapping_session_id
122  $mapping_session_id++;
123 
124  $self->logger->debug("Using mapping_session_id $mapping_session_id\n", 1);
125  }
126 
127  $self->mapping_session_id($mapping_session_id);
128 
129  # write old mapping_session table to a file
130  my $i;
131  my $fh = $self->get_filehandle('mapping_session.txt', 'tables');
132 
133  my $sth1 = $s_dbh->prepare("SELECT * FROM mapping_session");
134  $sth1->execute;
135 
136  while (my @row = $sth1->fetchrow_array) {
137  $i++;
138  print $fh join("\t", @row);
139  print $fh "\n";
140  }
141 
142  $sth1->finish;
143 
144  # append the new mapping_session to the file
145  my $release_sql = qq(
146  SELECT meta_value FROM meta WHERE meta_key = 'schema_version'
147  );
148  my $old_release = $self->fetch_value_from_db($s_dbh, $release_sql);
149  my $new_release = $self->fetch_value_from_db($t_dbh, $release_sql);
150 
151  my $assembly_sql = qq(
152  SELECT meta_value FROM meta WHERE meta_key = 'assembly.default'
153  );
154  my $old_assembly = $self->fetch_value_from_db($s_dbh, $assembly_sql);
155  my $new_assembly = $self->fetch_value_from_db($t_dbh, $assembly_sql);
156 
157  unless ($old_release and $new_release and $old_assembly and $new_assembly) {
158  $self->logger->warning("Not all data for new mapping_session found:\n", 1);
159  $self->logger->info("old_release: $old_release, new_release: $new_release");
160  $self->logger->info("old_assembly: $old_assembly, new_assembly $new_assembly\n", 2);
161  }
162 
163  print $fh join("\t",
164  $mapping_session_id,
165  $self->conf->param('sourcedbname'),
166  $self->conf->param('targetdbname'),
167  $old_release,
168  $new_release,
169  $old_assembly,
170  $new_assembly,
171  $self->mapping_session_date_fmt);
172 
173  print $fh "\n";
174  close($fh);
175 
176  $self->logger->info("Done writing ".++$i." mapping_session entries.\n\n");
177 }
178 
179 
180 sub map_stable_ids {
181  my $self = shift;
182  my $mappings = shift;
183  my $type = shift;
184 
185  unless ($mappings and
186  $mappings->isa('Bio::EnsEMBL::IdMapping::MappingList')) {
187  throw("Need a Bio::EnsEMBL::IdMapping::MappingList of ${type}s.");
188  }
189 
190  # generate a new mapping_session and write all mapping_session data to a file
191  $self->generate_mapping_session;
192 
193  $self->logger->info("== Stable ID mapping for $type...\n\n", 0, 'stamped');
194 
195  # check if there are any objects of this type at all
196  my %all_sources = %{ $self->cache->get_by_name("${type}s_by_id", 'source') };
197  my %all_targets = %{ $self->cache->get_by_name("${type}s_by_id", 'target') };
198  unless (scalar(keys %all_sources)) {
199  $self->logger->info("No cached ${type}s found.\n\n");
200  return;
201  }
202 
203  my %stats = map { $_ => 0 }
204  qw(mapped new lost);
205 
206  # create some lookup hashes from the mappings
207  my %sources_mapped = ();
208  my %targets_mapped = ();
209  my %scores_by_target = ();
210 
211  foreach my $e (@{ $mappings->get_all_Entries }) {
212  $sources_mapped{$e->source} = $e->target;
213  $targets_mapped{$e->target} = $e->source;
214  $scores_by_target{$e->target} = $e->score;
215  }
216 
217  # determine starting stable ID for new assignments
218  my $new_stable_id = $self->stable_id_generator->initial_stable_id($type);
219 
220  #
221  # assign mapped and new stable IDs
222  #
223  foreach my $tid (keys %all_targets) {
224 
225  my $t_obj = $all_targets{$tid};
226 
227  # a mapping exists, assign stable ID accordingly
228  if (my $sid = $targets_mapped{$tid}) {
229 
230  my $s_obj = $all_sources{$sid};
231 
232  # set target's stable ID and created_date
233  $t_obj->stable_id($s_obj->stable_id);
234  $t_obj->created_date($s_obj->created_date);
235 
236  # calculate and set version
237  my $old_version = $s_obj->version();
238  my $new_version = $self->stable_id_generator->calculate_version($s_obj, $t_obj) ;
239  $t_obj->version($new_version);
240 
241  # change modified_date if version changed
242  if ($old_version == $new_version) {
243  $t_obj->modified_date($s_obj->modified_date);
244  } else {
245  $t_obj->modified_date($self->mapping_session_date);
246  # If version changed, score cannot be 1
247  if ($scores_by_target{$tid} == 1) {
248  $scores_by_target{$tid} = 0.99;
249  }
250  }
251 
252  # create a stable_id_event entry (not for exons)
253  unless ( $type eq 'exon' ) {
254  # Only add events when something changed.
255  if ( !( $s_obj->stable_id eq $t_obj->stable_id &&
256  $s_obj->version == $t_obj->version &&
257  $scores_by_target{$tid} > 0.9999 ) )
258  {
259  my $key = join( "\t",
260  $s_obj->stable_id, $s_obj->version,
261  $t_obj->stable_id, $t_obj->version,
262  $self->mapping_session_id, $type,
263  $scores_by_target{$tid} );
264  $self->add_stable_id_event( 'new', $key );
265  }
266  }
267 
268  # add to debug hash
269  push @{ $debug_mappings{$type} }, [ $sid, $tid, $t_obj->stable_id ];
270 
271  # stats
272  $stats{'mapped'}++;
273 
274  # no mapping was found, assign a new stable ID
275  } else {
276 
277  $t_obj->stable_id($new_stable_id);
278  $t_obj->version(1);
279  $t_obj->created_date($self->mapping_session_date);
280  $t_obj->modified_date($self->mapping_session_date);
281 
282  # create a stable_id_event entry (not for exons)
283  unless ($type eq 'exon') {
284  my $key = join("\t",
285  '\N',
286  0,
287  $t_obj->stable_id,
288  $t_obj->version,
289  $self->mapping_session_id,
290  $type,
291  0
292  );
293  $self->add_stable_id_event('new', $key);
294  }
295 
296  # increment the stable Id (to be assigned to the next unmapped object)
297  $new_stable_id = $self->stable_id_generator->increment_stable_id(
298  $new_stable_id);
299 
300  # stats
301  $stats{'new'}++;
302 
303  }
304 
305  }
306 
307  #
308  # deletion events for lost sources
309  #
310  my $fh;
311  if ($type eq 'gene' or $type eq 'transcript') {
312  $fh = $self->get_filehandle("${type}s_lost.txt", 'debug');
313  }
314 
315  foreach my $sid (keys %all_sources) {
316 
317  my $s_obj = $all_sources{$sid};
318 
319  # no mapping exists, add deletion event
320  unless ($sources_mapped{$sid}) {
321  unless ($type eq 'exon') {
322  my $key = join("\t",
323  $s_obj->stable_id,
324  $s_obj->version,
325  '\N',
326  0,
327  $self->mapping_session_id,
328  $type,
329  0
330  );
331  $self->add_stable_id_event('new', $key);
332  }
333 
334  # stats
335  my $status;
336  $stats{'lost'}++;
337 
338  # log lost genes and transcripts (for debug purposes)
339  #
340  # The Java app did this with a separate method
341  # (StableIdMapper.dumpLostGeneAndTranscripts()) which also claims to log
342  # losses due to merge. Since at that point this data isn't available yet
343  # the logging can be done much more efficient here
344  if ($type eq 'gene' or $type eq 'transcript') {
345  print $fh $s_obj->stable_id, "\t$status\n";
346  }
347  }
348  }
349 
350  close($fh) if (defined($fh));
351 
352  #
353  # write stable IDs to file
354  #
355  $self->write_stable_ids_to_file($type, \%all_targets);
356 
357  # also generate and write stats to file
358  $self->generate_mapping_stats($type, \%stats);
359 
360  $self->logger->info("Done.\n\n");
361 }
362 
363 
365  my ( $self, $mappings, $scores, $type ) = @_;
366 
367  # argument checks
368  unless ( $mappings and
369  $mappings->isa('Bio::EnsEMBL::IdMapping::MappingList') )
370  {
371  throw('Need a gene Bio::EnsEMBL::IdMapping::MappingList.');
372  }
373 
374  unless ( $scores and
375  $scores->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix') )
376  {
377  throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
378  }
379 
380  throw("Need a type (gene|transcript|translation).") unless ($type);
381 
382  my $mapped;
383 
384  foreach my $e ( @{ $mappings->get_all_Entries } ) {
385 
386  # create lookup hash for mapped sources and targets; we'll need this
387  # later
388  $mapped->{'source'}->{ $e->source } = 1;
389  $mapped->{'target'}->{ $e->target } = 1;
390 
391  } ## end foreach my $e ( @{ $mappings...})
392 
393  #
394  # similarities for other entries
395  #
396  foreach my $dbtype ( keys %$mapped ) {
397 
398  # note: $dbtype will be either 'source' or 'target'
399  my $m1 = "get_all_${dbtype}s";
400  my $m2 = "get_Entries_for_${dbtype}";
401 
402  foreach my $id ( @{ $scores->$m1 } ) {
403 
404  # skip if this is a mapped source/target
405  if ( $mapped->{$dbtype}->{$id} ) { next }
406 
407  my @entries =
408  sort { $b->score <=> $a->score } @{ $scores->$m2($id) };
409 
410  unless (@entries) { next }
411 
412  # skip if top score < 0.75
413  my $top_score = $entries[0]->score;
414  if ( $top_score < 0.75 ) { next }
415 
416  # add similarities for all entries within 5% of top scorer
417  while ( my $e = shift(@entries) ) {
418 
419  if ( $mapped->{'source'}->{$e->source} ) { next ; }
420  if ( $mapped->{'target'}->{$e->target} ) { next ; }
421 
422  if ( $e->score > ( $top_score*0.95 ) ) {
423 
424  my $s_obj =
425  $self->cache->get_by_key( "${type}s_by_id", 'source',
426  $e->source );
427  my $t_obj =
428  $self->cache->get_by_key( "${type}s_by_id", 'target',
429  $e->target );
430 
431  my $key = join( "\t",
432  $s_obj->stable_id, $s_obj->version,
433  $t_obj->stable_id, $t_obj->version,
434  $self->mapping_session_id, $type,
435  $e->score );
436  $self->add_stable_id_event( 'similarity', $key );
437 
438  }
439  }
440 
441  } ## end foreach my $id ( @{ $scores...})
442  } ## end foreach my $dbtype ( keys %$mapped)
443 
444 } ## end sub generate_similarity_events
445 
446 
447 sub filter_same_gene_transcript_similarities {
448  my $self = shift;
449  my $transcript_scores = shift;
450 
451  # argument checks
452  unless ($transcript_scores and
453  $transcript_scores->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
454  throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix of transcripts.');
455  }
456 
457  # create a new matrix for the filtered entries
458  my $filtered_scores = Bio::EnsEMBL::IdMapping::ScoredMappingMatrix->new(
459  -DUMP_PATH => path_append($self->conf->param('basedir'), 'matrix'),
460  -CACHE_FILE => 'filtered_transcript_scores.ser',
461  );
462 
463  # lookup hash for all target transcripts
464  my %all_targets = map { $_->stable_id => 1 }
465  values %{ $self->cache->get_by_name("transcripts_by_id", 'target') };
466 
467  my $i = 0;
468 
469  foreach my $e (@{ $transcript_scores->get_all_Entries }) {
470 
471  my $s_tr = $self->cache->get_by_key('transcripts_by_id', 'source',
472  $e->source);
473  my $s_gene = $self->cache->get_by_key('genes_by_transcript_id', 'source',
474  $e->source);
475  my $t_gene = $self->cache->get_by_key('genes_by_transcript_id', 'target',
476  $e->target);
477  # workaround for caching issue: only gene objects in 'genes_by_id' cache
478  # have a stable ID assigned
479  #$t_gene = $self->cache->get_by_key('genes_by_id', 'target', $t_gene->id);
480 
481  #$self->logger->debug("xxx ".join(":", $s_tr->stable_id, $s_gene->stable_id,
482  # $t_gene->stable_id)."\n");
483 
484  # skip if source and target transcript are in same gene, BUT keep events for
485  # deleted transcripts
486  if (($s_gene->stable_id eq $t_gene->stable_id) and
487  $all_targets{$s_tr->stable_id}) {
488  $i++;
489  next;
490  }
491 
492  $filtered_scores->add_Entry($e);
493  }
494 
495  $self->logger->debug("Skipped $i same gene transcript mappings.\n");
496 
497  return $filtered_scores;
498 }
499 
500 
501 sub generate_translation_similarity_events {
502  my $self = shift;
503  my $mappings = shift;
504  my $transcript_scores = shift;
505 
506  # argument checks
507  unless ($mappings and
508  $mappings->isa('Bio::EnsEMBL::IdMapping::MappingList')) {
509  throw('Need a gene Bio::EnsEMBL::IdMapping::MappingList.');
510  }
511 
512  unless ($transcript_scores and
513  $transcript_scores->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
514  throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
515  }
516 
517  # create a fake translation scoring matrix
518  my $translation_scores = Bio::EnsEMBL::IdMapping::ScoredMappingMatrix->new(
519  -DUMP_PATH => path_append($self->conf->param('basedir'), 'matrix'),
520  -CACHE_FILE => 'translation_scores.ser',
521  );
522 
523  foreach my $e (@{ $transcript_scores->get_all_Entries }) {
524 
525  my $s_tl = $self->cache->get_by_key('transcripts_by_id', 'source',
526  $e->source)->translation;
527  my $t_tl = $self->cache->get_by_key('transcripts_by_id', 'target',
528  $e->target)->translation;
529 
530  # add an entry to the translation scoring matrix using the score of the
531  # corresponding transcripts
532  if ($s_tl and $t_tl) {
533  $translation_scores->add_score($s_tl->id, $t_tl->id, $e->score);
534  }
535  }
536 
537  # now generate similarity events using this fake scoring matrix
538  $self->generate_similarity_events($mappings, $translation_scores,
539  'translation');
540 }
541 
542 
543 sub write_stable_ids_to_file {
544  my $self = shift;
545  my $type = shift;
546  my $all_targets = shift;
547 
548  $self->logger->info("Writing ${type} stable IDs to file...\n");
549 
550  my $fh = $self->get_filehandle("${type}_stable_id.txt", 'tables');
551 
552  my @sorted_targets = map { $all_targets->{$_} } sort { $a <=> $b }
553  keys %$all_targets;
554 
555  foreach my $obj (@sorted_targets) {
556 
557  # check for missing created and modified dates
558  my $created_date = $obj->created_date;
559  unless ($created_date) {
560  #$self->logger->debug("Missing created_date for target ".
561  # $obj->to_string."\n", 1);
562  $created_date = $self->mapping_session_date;
563  }
564 
565  my $modified_date = $obj->modified_date;
566  unless ($modified_date) {
567  #$self->logger->debug("Missing modified_date for target ".
568  # $obj->to_string."\n", 1);
569  $modified_date = $self->mapping_session_date;
570  }
571 
572  my $row = join("\t",
573  $obj->id,
574  $obj->stable_id,
575  $obj->version,
576  strftime("%Y-%m-%d %T", localtime($created_date)),
577  strftime("%Y-%m-%d %T", localtime($modified_date)),
578  );
579 
580  print $fh "$row\n";
581  }
582 
583  close($fh);
584 
585  $self->logger->info("Done writing ".scalar(@sorted_targets)." entries.\n\n");
586 }
587 
588 
589 sub generate_mapping_stats {
590  my $self = shift;
591  my $type = shift;
592  my $stats = shift;
593 
594  my $result = ucfirst($type)." mapping results:\n\n";
595 
596  my $fmt1 = "%-10s%-10s%-10s%-10s\n";
597  my $fmt2 = "%-10s%6.0f %6.0f %4.2f%%\n";
598 
599  $result .= sprintf($fmt1, qw(TYPE MAPPED LOST PERCENTAGE));
600  $result .= ('-'x40)."\n";
601 
602  my $mapped_total = $stats->{'mapped'};
603  my $lost_total = $stats->{'lost'};
604 
605  $result .= sprintf($fmt2, 'total', $mapped_total, $lost_total,
606  $mapped_total/($mapped_total+$lost_total)*100);
607 
608  # log result
609  $self->logger->info($result."\n");
610 
611  # write result to file
612  my $fh = $self->get_filehandle("${type}_mapping_stats.txt", 'stats');
613  print $fh $result;
614  close($fh);
615 }
616 
617 
618 sub dump_debug_mappings {
619  my $self = shift;
620 
621  foreach my $type (keys %debug_mappings) {
622 
623  $self->logger->debug("Writing $type mappings to debug/${type}_mappings.txt...\n");
624 
625  my $fh = $self->get_filehandle("${type}_mappings.txt", 'debug');
626 
627  foreach my $row (@{ $debug_mappings{$type} }) {
628  print $fh join("\t", @$row);
629  print $fh "\n";
630  }
631 
632  close($fh);
633 
634  $self->logger->debug("Done.\n");
635  }
636 }
637 
638 
639 sub write_stable_id_events {
640  my $self = shift;
641  my $event_type = shift;
642 
643  throw("Need an event type (new|similarity).") unless ($event_type);
644 
645  $self->logger->debug("Writing $event_type stable_id_events to file...\n");
646 
647  my $fh = $self->get_filehandle("stable_id_event_${event_type}.txt", 'tables');
648  my $i = 0;
649 
650  foreach my $event (@{ $self->get_all_stable_id_events($event_type) }) {
651  print $fh "$event\n";
652  $i++;
653  }
654 
655  close($fh);
656 
657  $self->logger->debug("Done writing $i entries.\n");
658 }
659 
660 
661 sub add_stable_id_event {
662  my ($self, $type, $event) = @_;
663 
664  # argument check
665  throw("Need an event type (new|similarity).") unless ($type);
666 
667  $self->{'stable_id_events'}->{$type}->{$event} = 1;
668 }
669 
670 
671 sub get_all_stable_id_events {
672  my ($self, $type) = @_;
673 
674  # argument check
675  throw("Need an event type (new|similarity).") unless ($type);
676 
677  return [ keys %{ $self->{'stable_id_events'}->{$type} } ];
678 }
679 
680 
681 sub mapping_session_id {
682  my $self = shift;
683  $self->{'_mapping_session_id'} = shift if (@_);
684  return $self->{'_mapping_session_id'};
685 }
686 
687 
688 sub mapping_session_date {
689  my $self = shift;
690  $self->{'_mapping_session_date'} = shift if (@_);
691  return $self->{'_mapping_session_date'};
692 }
693 
694 
695 sub mapping_session_date_fmt {
696  my $self = shift;
697  $self->{'_mapping_session_date_fmt'} = shift if (@_);
698  return $self->{'_mapping_session_date_fmt'};
699 }
700 
701 
702 sub stable_id_generator {
703  my $self = shift;
704  $self->{'_stable_id_generator'} = shift if (@_);
705  return $self->{'_stable_id_generator'};
706 }
707 
708 
709 1;
710 
Bio::EnsEMBL::IdMapping::ScoredMappingMatrix::add_score
public Float add_score()
map
public map()
Bio::EnsEMBL::IdMapping::BaseObject
Definition: BaseObject.pm:25
Bio::EnsEMBL::Utils::ScriptUtils
Definition: ScriptUtils.pm:11
Bio::EnsEMBL::IdMapping::ScoredMappingMatrix
Definition: ScoredMappingMatrix.pm:44
Bio::EnsEMBL::IdMapping::ScoredMappingMatrix::new
public Bio::EnsEMBL::IdMapping::ScoredMappingMatrix new()
generate_similarity_events
public generate_similarity_events()
Bio::EnsEMBL::Utils::Exception
Definition: Exception.pm:68