3 See the NOTICE file distributed with
this work
for additional information
4 regarding copyright ownership.
6 Licensed under the Apache License, Version 2.0 (the
"License");
7 you may not use
this file except in compliance with the License.
8 You may obtain a copy of the License at
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an
"AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License
for the specific language governing permissions and
16 limitations under the License.
23 Please email comments or questions to the
public Ensembl
24 developers list at <http:
26 Questions may also be sent to the Ensembl help desk at
41 package Bio::EnsEMBL::IdMapping::StableIdMapper;
45 no warnings
'uninitialized';
53 use POSIX qw(strftime);
62 my $class = ref($caller) || $caller;
63 my $self = $class->SUPER::new(@_);
65 # inject a StableIdGenerator
67 # If you write your own generators, make sure they extend
68 # Bio::EnsEMBL::Idmapping::BaseObject and additionally implement these three
69 # methods: initial_stable_id(), increment_stable_id() and calculate_version().
70 my $stable_id_generator = $self->conf->param(
'plugin_stable_id_generator') ||
71 'Bio::EnsEMBL::IdMapping::StableIdGenerator::EnsemblGeneric';
72 $self->logger->debug(
"Using $stable_id_generator to generate stable Ids.\n");
73 inject($stable_id_generator);
75 # create a new StableIdGenerator object
76 my $generator_instance = $stable_id_generator->new(
77 -LOGGER => $self->logger,
79 -CACHE => $self->cache
81 $self->stable_id_generator($generator_instance);
87 sub generate_mapping_session {
90 # only run this method once
91 return if ($self->mapping_session_date);
93 $self->logger->info(
"Generating new mapping_session...\n");
95 $self->mapping_session_date(time);
96 $self->mapping_session_date_fmt(strftime(
"%Y-%m-%d %T",
97 localtime($self->mapping_session_date)));
99 my $s_dba = $self->cache->get_DBAdaptor(
'source');
100 my $s_dbh = $s_dba->dbc->db_handle;
101 my $t_dba = $self->cache->get_DBAdaptor(
'target');
102 my $t_dbh = $t_dba->dbc->db_handle;
104 # check if mapping_session_id was manually set by the configuration
105 my $mapping_session_id = $self->conf->param(
'mapping_session_id');
107 if ($mapping_session_id) {
109 $self->logger->debug(
"Using manually configured mapping_session_id $mapping_session_id\n", 1);
113 # calculate mapping_session_id from db
114 my $sql = qq(SELECT MAX(mapping_session_id) FROM mapping_session);
115 $mapping_session_id = $self->fetch_value_from_db($s_dbh, $sql);
117 unless ($mapping_session_id) {
118 $self->logger->debug(
"No previous mapping_session found.\n", 1);
121 # increment last mapping_session_id
122 $mapping_session_id++;
124 $self->logger->debug(
"Using mapping_session_id $mapping_session_id\n", 1);
127 $self->mapping_session_id($mapping_session_id);
129 # write old mapping_session table to a file
131 my $fh = $self->get_filehandle(
'mapping_session.txt',
'tables');
133 my $sth1 = $s_dbh->prepare(
"SELECT * FROM mapping_session");
136 while (my @row = $sth1->fetchrow_array) {
138 print $fh join(
"\t", @row);
144 # append the new mapping_session to the file
145 my $release_sql = qq(
146 SELECT meta_value FROM meta WHERE meta_key =
'schema_version'
148 my $old_release = $self->fetch_value_from_db($s_dbh, $release_sql);
149 my $new_release = $self->fetch_value_from_db($t_dbh, $release_sql);
151 my $assembly_sql = qq(
152 SELECT meta_value FROM meta WHERE meta_key =
'assembly.default'
154 my $old_assembly = $self->fetch_value_from_db($s_dbh, $assembly_sql);
155 my $new_assembly = $self->fetch_value_from_db($t_dbh, $assembly_sql);
157 unless ($old_release and $new_release and $old_assembly and $new_assembly) {
158 $self->logger->warning(
"Not all data for new mapping_session found:\n", 1);
159 $self->logger->info(
"old_release: $old_release, new_release: $new_release");
160 $self->logger->info(
"old_assembly: $old_assembly, new_assembly $new_assembly\n", 2);
165 $self->conf->param(
'sourcedbname'),
166 $self->conf->param(
'targetdbname'),
171 $self->mapping_session_date_fmt);
176 $self->logger->info(
"Done writing ".++$i.
" mapping_session entries.\n\n");
182 my $mappings = shift;
185 unless ($mappings and
186 $mappings->isa(
'Bio::EnsEMBL::IdMapping::MappingList')) {
187 throw(
"Need a Bio::EnsEMBL::IdMapping::MappingList of ${type}s.");
190 # generate a new mapping_session and write all mapping_session data to a file
191 $self->generate_mapping_session;
193 $self->logger->info(
"== Stable ID mapping for $type...\n\n", 0,
'stamped');
195 # check if there are any objects of this type at all
196 my %all_sources = %{ $self->cache->get_by_name(
"${type}s_by_id",
'source') };
197 my %all_targets = %{ $self->cache->get_by_name(
"${type}s_by_id",
'target') };
198 unless (scalar(keys %all_sources)) {
199 $self->logger->info(
"No cached ${type}s found.\n\n");
203 my %stats =
map { $_ => 0 }
206 # create some lookup hashes from the mappings
207 my %sources_mapped = ();
208 my %targets_mapped = ();
209 my %scores_by_target = ();
211 foreach my $e (@{ $mappings->get_all_Entries }) {
212 $sources_mapped{$e->source} = $e->target;
213 $targets_mapped{$e->target} = $e->source;
214 $scores_by_target{$e->target} = $e->score;
217 # determine starting stable ID for new assignments
218 my $new_stable_id = $self->stable_id_generator->initial_stable_id($type);
221 # assign mapped and new stable IDs
223 foreach my $tid (keys %all_targets) {
225 my $t_obj = $all_targets{$tid};
227 # a mapping exists, assign stable ID accordingly
228 if (my $sid = $targets_mapped{$tid}) {
230 my $s_obj = $all_sources{$sid};
232 # set target's stable ID and created_date
233 $t_obj->stable_id($s_obj->stable_id);
234 $t_obj->created_date($s_obj->created_date);
236 # calculate and set version
237 my $old_version = $s_obj->version();
238 my $new_version = $self->stable_id_generator->calculate_version($s_obj, $t_obj) ;
239 $t_obj->version($new_version);
241 # change modified_date if version changed
242 if ($old_version == $new_version) {
243 $t_obj->modified_date($s_obj->modified_date);
245 $t_obj->modified_date($self->mapping_session_date);
246 # If version changed, score cannot be 1
247 if ($scores_by_target{$tid} == 1) {
248 $scores_by_target{$tid} = 0.99;
252 # create a stable_id_event entry (not for exons)
253 unless ( $type eq
'exon' ) {
254 # Only add events when something changed.
255 if ( !( $s_obj->stable_id eq $t_obj->stable_id &&
256 $s_obj->version == $t_obj->version &&
257 $scores_by_target{$tid} > 0.9999 ) )
259 my $key = join(
"\t",
260 $s_obj->stable_id, $s_obj->version,
261 $t_obj->stable_id, $t_obj->version,
262 $self->mapping_session_id, $type,
263 $scores_by_target{$tid} );
264 $self->add_stable_id_event(
'new', $key );
269 push @{ $debug_mappings{$type} }, [ $sid, $tid, $t_obj->stable_id ];
274 # no mapping was found, assign a new stable ID
277 $t_obj->stable_id($new_stable_id);
279 $t_obj->created_date($self->mapping_session_date);
280 $t_obj->modified_date($self->mapping_session_date);
282 # create a stable_id_event entry (not for exons)
283 unless ($type eq
'exon') {
289 $self->mapping_session_id,
293 $self->add_stable_id_event(
'new', $key);
296 # increment the stable Id (to be assigned to the next unmapped object)
297 $new_stable_id = $self->stable_id_generator->increment_stable_id(
308 # deletion events for lost sources
311 if ($type eq
'gene' or $type eq
'transcript') {
312 $fh = $self->get_filehandle(
"${type}s_lost.txt",
'debug');
315 foreach my $sid (keys %all_sources) {
317 my $s_obj = $all_sources{$sid};
319 # no mapping exists, add deletion event
320 unless ($sources_mapped{$sid}) {
321 unless ($type eq
'exon') {
327 $self->mapping_session_id,
331 $self->add_stable_id_event(
'new', $key);
338 # log lost genes and transcripts (for debug purposes)
340 # The Java app did this with a separate method
341 # (StableIdMapper.dumpLostGeneAndTranscripts()) which also claims to log
342 # losses due to merge. Since at that point this data isn't available yet
343 # the logging can be done much more efficient here
344 if ($type eq
'gene' or $type eq
'transcript') {
345 print $fh $s_obj->stable_id,
"\t$status\n";
350 close($fh)
if (defined($fh));
353 # write stable IDs to file
355 $self->write_stable_ids_to_file($type, \%all_targets);
357 # also generate and write stats to file
358 $self->generate_mapping_stats($type, \%stats);
360 $self->logger->info(
"Done.\n\n");
365 my ( $self, $mappings, $scores, $type ) = @_;
368 unless ( $mappings and
369 $mappings->isa(
'Bio::EnsEMBL::IdMapping::MappingList') )
371 throw(
'Need a gene Bio::EnsEMBL::IdMapping::MappingList.');
375 $scores->isa(
'Bio::EnsEMBL::IdMapping::ScoredMappingMatrix') )
377 throw(
'Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
380 throw(
"Need a type (gene|transcript|translation).") unless ($type);
384 foreach my $e ( @{ $mappings->get_all_Entries } ) {
386 # create lookup hash for mapped sources and targets; we'll need this
388 $mapped->{
'source'}->{ $e->source } = 1;
389 $mapped->{
'target'}->{ $e->target } = 1;
391 } ## end
foreach my $e ( @{ $mappings...})
394 # similarities
for other entries
396 foreach my $dbtype ( keys %$mapped ) {
398 # note: $dbtype will be either 'source' or 'target'
399 my $m1 =
"get_all_${dbtype}s";
400 my $m2 =
"get_Entries_for_${dbtype}";
402 foreach my $id ( @{ $scores->$m1 } ) {
404 # skip if this is a mapped source/target
405 if ( $mapped->{$dbtype}->{$id} ) { next }
408 sort { $b->score <=> $a->score } @{ $scores->$m2($id) };
410 unless (@entries) { next }
412 # skip if top score < 0.75
413 my $top_score = $entries[0]->score;
414 if ( $top_score < 0.75 ) { next }
416 # add similarities for all entries within 5% of top scorer
417 while ( my $e = shift(@entries) ) {
419 if ( $mapped->{
'source'}->{$e->source} ) { next ; }
420 if ( $mapped->{
'target'}->{$e->target} ) { next ; }
422 if ( $e->score > ( $top_score*0.95 ) ) {
425 $self->cache->get_by_key(
"${type}s_by_id",
'source',
428 $self->cache->get_by_key(
"${type}s_by_id",
'target',
431 my $key = join(
"\t",
432 $s_obj->stable_id, $s_obj->version,
433 $t_obj->stable_id, $t_obj->version,
434 $self->mapping_session_id, $type,
436 $self->add_stable_id_event(
'similarity', $key );
441 } ## end
foreach my $id ( @{ $scores...})
442 } ## end
foreach my $dbtype ( keys %$mapped)
447 sub filter_same_gene_transcript_similarities {
449 my $transcript_scores = shift;
452 unless ($transcript_scores and
453 $transcript_scores->isa(
'Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
454 throw(
'Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix of transcripts.');
457 # create a new matrix for the filtered entries
459 -DUMP_PATH => path_append($self->conf->param(
'basedir'),
'matrix'),
460 -CACHE_FILE =>
'filtered_transcript_scores.ser',
463 # lookup hash for all target transcripts
464 my %all_targets =
map { $_->stable_id => 1 }
465 values %{ $self->cache->get_by_name(
"transcripts_by_id",
'target') };
469 foreach my $e (@{ $transcript_scores->get_all_Entries }) {
471 my $s_tr = $self->cache->get_by_key(
'transcripts_by_id',
'source',
473 my $s_gene = $self->cache->get_by_key(
'genes_by_transcript_id',
'source',
475 my $t_gene = $self->cache->get_by_key(
'genes_by_transcript_id',
'target',
477 # workaround for caching issue: only gene objects in 'genes_by_id' cache
478 # have a stable ID assigned
479 #$t_gene = $self->cache->get_by_key('genes_by_id', 'target', $t_gene->id);
481 #$self->logger->debug("xxx ".join(":", $s_tr->stable_id, $s_gene->stable_id,
482 # $t_gene->stable_id)."\n");
484 # skip if source and target transcript are in same gene, BUT keep events for
485 # deleted transcripts
486 if (($s_gene->stable_id eq $t_gene->stable_id) and
487 $all_targets{$s_tr->stable_id}) {
492 $filtered_scores->add_Entry($e);
495 $self->logger->debug(
"Skipped $i same gene transcript mappings.\n");
497 return $filtered_scores;
501 sub generate_translation_similarity_events {
503 my $mappings = shift;
504 my $transcript_scores = shift;
507 unless ($mappings and
508 $mappings->isa(
'Bio::EnsEMBL::IdMapping::MappingList')) {
509 throw(
'Need a gene Bio::EnsEMBL::IdMapping::MappingList.');
512 unless ($transcript_scores and
513 $transcript_scores->isa(
'Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
514 throw(
'Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
517 # create a fake translation scoring matrix
519 -DUMP_PATH => path_append($self->conf->param(
'basedir'),
'matrix'),
520 -CACHE_FILE =>
'translation_scores.ser',
523 foreach my $e (@{ $transcript_scores->get_all_Entries }) {
525 my $s_tl = $self->cache->get_by_key(
'transcripts_by_id',
'source',
526 $e->source)->translation;
527 my $t_tl = $self->cache->get_by_key(
'transcripts_by_id',
'target',
528 $e->target)->translation;
530 # add an entry to the translation scoring matrix using the score of the
531 # corresponding transcripts
532 if ($s_tl and $t_tl) {
533 $translation_scores->
add_score($s_tl->id, $t_tl->id, $e->score);
537 # now generate similarity events using this fake scoring matrix
538 $self->generate_similarity_events($mappings, $translation_scores,
543 sub write_stable_ids_to_file {
546 my $all_targets = shift;
548 $self->logger->info(
"Writing ${type} stable IDs to file...\n");
550 my $fh = $self->get_filehandle(
"${type}_stable_id.txt",
'tables');
552 my @sorted_targets =
map { $all_targets->{$_} } sort { $a <=> $b }
555 foreach my $obj (@sorted_targets) {
557 # check for missing created and modified dates
558 my $created_date = $obj->created_date;
559 unless ($created_date) {
560 #$self->logger->debug("Missing created_date for target ".
561 # $obj->to_string."\n", 1);
562 $created_date = $self->mapping_session_date;
565 my $modified_date = $obj->modified_date;
566 unless ($modified_date) {
567 #$self->logger->debug("Missing modified_date for target ".
568 # $obj->to_string."\n", 1);
569 $modified_date = $self->mapping_session_date;
576 strftime(
"%Y-%m-%d %T", localtime($created_date)),
577 strftime(
"%Y-%m-%d %T", localtime($modified_date)),
585 $self->logger->info(
"Done writing ".scalar(@sorted_targets).
" entries.\n\n");
589 sub generate_mapping_stats {
594 my $result = ucfirst($type).
" mapping results:\n\n";
596 my $fmt1 =
"%-10s%-10s%-10s%-10s\n";
597 my $fmt2 =
"%-10s%6.0f %6.0f %4.2f%%\n";
599 $result .= sprintf($fmt1, qw(TYPE MAPPED LOST PERCENTAGE));
600 $result .= (
'-'x40).
"\n";
602 my $mapped_total = $stats->{
'mapped'};
603 my $lost_total = $stats->{
'lost'};
605 $result .= sprintf($fmt2,
'total', $mapped_total, $lost_total,
606 $mapped_total/($mapped_total+$lost_total)*100);
609 $self->logger->info($result.
"\n");
611 # write result to file
612 my $fh = $self->get_filehandle(
"${type}_mapping_stats.txt",
'stats');
618 sub dump_debug_mappings {
621 foreach my $type (keys %debug_mappings) {
623 $self->logger->debug(
"Writing $type mappings to debug/${type}_mappings.txt...\n");
625 my $fh = $self->get_filehandle(
"${type}_mappings.txt",
'debug');
627 foreach my $row (@{ $debug_mappings{$type} }) {
628 print $fh join(
"\t", @$row);
634 $self->logger->debug(
"Done.\n");
639 sub write_stable_id_events {
641 my $event_type = shift;
643 throw(
"Need an event type (new|similarity).") unless ($event_type);
645 $self->logger->debug(
"Writing $event_type stable_id_events to file...\n");
647 my $fh = $self->get_filehandle(
"stable_id_event_${event_type}.txt",
'tables');
650 foreach my $event (@{ $self->get_all_stable_id_events($event_type) }) {
651 print $fh
"$event\n";
657 $self->logger->debug(
"Done writing $i entries.\n");
661 sub add_stable_id_event {
662 my ($self, $type, $event) = @_;
665 throw(
"Need an event type (new|similarity).") unless ($type);
667 $self->{
'stable_id_events'}->{$type}->{$event} = 1;
671 sub get_all_stable_id_events {
672 my ($self, $type) = @_;
675 throw(
"Need an event type (new|similarity).") unless ($type);
677 return [ keys %{ $self->{
'stable_id_events'}->{$type} } ];
681 sub mapping_session_id {
683 $self->{
'_mapping_session_id'} = shift
if (@_);
684 return $self->{
'_mapping_session_id'};
688 sub mapping_session_date {
690 $self->{
'_mapping_session_date'} = shift
if (@_);
691 return $self->{
'_mapping_session_date'};
695 sub mapping_session_date_fmt {
697 $self->{
'_mapping_session_date_fmt'} = shift
if (@_);
698 return $self->{
'_mapping_session_date_fmt'};
702 sub stable_id_generator {
704 $self->{
'_stable_id_generator'} = shift
if (@_);
705 return $self->{
'_stable_id_generator'};