3 See the NOTICE file distributed with this work for additional information
4 regarding copyright ownership.
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
23 Please email comments or questions to the public Ensembl
24 developers list at <http:
26 Questions may also be sent to the Ensembl help desk at
37 This class is not instantiated. Please see subclasses for usage examples
42 This is the base class for the score builders used in the stable Id
43 mapping application. It contains methods which are used by more than one
48 create_shrinked_matrix
54 package Bio::EnsEMBL::IdMapping::ScoreBuilder;
58 no warnings
'uninitialized';
68 =head2 create_shrinked_matrix
70 Arg[1] : Bio::EnsEMBL::IdMapping::ScoredMappingMatrix $matrix - a scoring
72 Arg[2] : Bio::EnsEMBL::IdMapping::MappingList $mappings - mappings
73 Arg[3] : String $cache_file - base name of a cache file (extension '.ser'
74 will be added automatically) for the returned matrix
75 Example : my $new_scores = $score_builder->create_shrinked_matrix(
76 $gene_scores, $mappings, "gene_matrix1");
77 Description : Create a shrunken scoring matrix which doesn't contain entries
78 which were already mapped. It also logs how many new mappings
79 were added in this process.
80 Return type : Bio::EnsEMBL::IdMapping::ScoredMappingMatrix
81 Exceptions : thrown on wrong or missing arguments
82 Caller : InternalIdMapper plugin
88 # TODO: shrinked = shrunken?
# create_shrinked_matrix: build a new ScoredMappingMatrix holding only those
# entries of $matrix whose source AND target are both absent from $mappings.
#
# Visible steps:
#   - $cache_file is taken from the argument list; the sub throws when it is
#     false. $mappings is checked to be a Bio::EnsEMBL::IdMapping::MappingList
#     (the failure branch of that check is not visible in this listing;
#     presumably it throws - confirm).
#   - the dump path is the configured basedir joined with matrix via
#     path_append, and the .ser extension is appended to $cache_file.
#   - the new matrix is constructed with -DUMP_PATH and -CACHE_FILE; when it
#     reports ->loaded, a message is logged (the in-line comment says the
#     cached matrix is then returned; that return is not visible here).
#   - lookup hashes keyed by source and by target are filled from the
#     MappingList, and every matrix entry hitting neither hash is added.
#   - source, target and entry counts (before --> after) and the number of
#     mappings are logged at info level.
# Returns $shrinked_matrix.
#
# NOTE(review): the numeric prefixes on each line jump (89 -> 93 -> 101 ...),
# so this listing has gaps: the unpacking of $self, $matrix and $mappings, the
# declarations of %sources and %targets, the end of the constructor call and
# all closing braces are not shown. Several quoted strings are also split
# across lines. Restore the block from the original file before changing it.
89 sub create_shrinked_matrix {
93 my $cache_file = shift; # base name, extension '.ser
' will be added
101 unless ($mappings and
102 $mappings->isa('Bio::EnsEMBL::IdMapping::MappingList
')) {
106 throw('Need a cache file name.
') unless ($cache_file);
108 my $dump_path = path_append($self->conf->param('basedir
'), 'matrix
');
109 $cache_file .= '.ser
';
111 my $shrinked_matrix = Bio::EnsEMBL::IdMapping::ScoredMappingMatrix->new(
112 -DUMP_PATH => $dump_path,
113 -CACHE_FILE => $cache_file,
117 # if we already found a saved matrix, just return it
118 if ($shrinked_matrix->loaded) {
120 $self->logger->info("Read existing scoring matrix from $cache_file.\n");
124 # create lookup hashes for sources and targets in the MappingList
128 foreach my $entry (@{ $mappings->get_all_Entries }) {
129 $sources{$entry->source} = 1;
130 $targets{$entry->target} = 1;
133 # add all entries to shrinked matrix which are not in the MappingList
134 foreach my $entry (@{ $matrix->get_all_Entries }) {
135 unless ($sources{$entry->source} or $targets{$entry->target}) {
136 $shrinked_matrix->add_Entry($entry);
142 # log shrinking stats
143 $self->logger->info('Sources
'.$matrix->get_source_count.' -->
'.
144 $shrinked_matrix->get_source_count."\n");
145 $self->logger->info('Targets
'.$matrix->get_target_count.' -->
'.
146 $shrinked_matrix->get_target_count."\n");
147 $self->logger->info('Entries
'.$matrix->get_entry_count.' -->
'.
148 $shrinked_matrix->get_entry_count."\n");
149 $self->logger->info('New mappings:
'.$mappings->get_entry_count."\n\n");
151 return $shrinked_matrix;
155 =head2 internal_id_rescore
157 Arg[1] : Bio::EnsEMBL::IdMapping::ScoredMappingMatrix $matrix - a scoring
159 Example : $score_builder->internal_id_rescore($gene_scores);
160 Description : Rescore ambiguous mappings based on internal Ids. This is the
161 last disambiguation step and is only useful if objects with the
162 same internal Id were used in source and target dbs (e.g. in
163 patch builds or if objects were copied from source to target).
165 If a source and target gene have the same internal Id and there
166 are mappings to other target genes then these *other* mappings
169 Exceptions : thrown on wrong or missing argument
170 Caller : InternalIdMapper plugins
# internal_id_rescore: for every source in $matrix, look at its entries and,
# when the top two scores tie AND one of the tied entries has a target equal
# to the source id, multiply the score of every other tied entry (target not
# equal to source) by 0.9 via $matrix->set_score. A debug line reporting the
# counter $i is logged at the end.
#
# NOTE(review): the numeric prefixes jump (176 -> 188 -> 190 ...), so this
# listing has gaps: argument unpacking, the declarations of @entries,
# $ambiguous and $i, the bodies of the two inner if-blocks and the last
# argument of the debug call are not shown.
176 sub internal_id_rescore {
188 foreach my $source ( @{ $matrix->get_all_sources } ) {
# NOTE(review): the sort block compares the entry objects themselves with the
# numeric operator, while every later comparison goes through ->score. Unless
# the entry class overloads numeric comparison, this orders entries by
# reference address rather than by score, and $entries[0] is then not
# guaranteed to be the best-scoring entry. Likely intended:
#   sort { $b->score <=> $a->score } ...
# Confirm against the entry class before changing.
190 sort { $b <=> $a } @{ $matrix->get_Entries_for_source($source) };
192 # nothing to do if we only have one mapping
193 if ( scalar(@entries) == 1 ) { next }
195 # only penalise if mappings are ambiguous
# (ambiguous here means the first two entries after sorting have equal scores)
196 if ( $entries[0]->score != $entries[1]->score ) { next }
198 # only penalise if one source id == target id where score == best
202 foreach my $e (@entries) {
203 if ( $e->target == $source and $e->score == $entries[0]->score() )
210 if ( !$ambiguous ) { next }
212 # now penalise those where source id != target id and score == best
214 foreach my $e (@entries) {
215 if ( $e->target != $source and $e->score == $entries[0]->score() )
217 # PENALTY: Reduce score for ambiguous mappings.
# The penalised score is written back into the matrix, not into the entry $e.
218 $matrix->set_score( $source, $e->target(), 0.9*$e->score() );
223 } ## end foreach my $source ( @{ $matrix...})
225 $self->logger->debug("Scored entries with internal ID mismatch: $i\n",
227 } ## end sub internal_id_rescore
230 =head2 log_matrix_stats
232 Arg[1] : Bio::EnsEMBL::IdMapping::ScoredMappingMatrix $matrix - a scoring
234 Example : $score_builder->log_matrix_stats($gene_scores);
235 Description : Logs scoring matrix statistics (number of entries, min/max/avg
238 Exceptions : thrown on wrong or missing argument
# log_matrix_stats: write summary statistics of a scoring matrix to the
# logger at info level: entry count, source count, target count, average
# score, and minimum and maximum score.
# $matrix is checked to be a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix;
# the failure branch of that check is not visible here (presumably it
# throws - confirm).
#
# NOTE(review): the numeric prefixes jump (245 -> 250 -> 254 ...), so this
# listing has gaps: argument unpacking and the closing brace of the sub are
# not shown.
245 sub log_matrix_stats {
250 $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix
')) {
# Both formats print a left-justified 40-character label followed by a
# width-10 number: fmt1 with no decimals (counts), fmt2 with five decimals
# (scores).
254 my $fmt1 = "%-40s%10.0f\n";
255 my $fmt2 = "%-40s%10.5f\n";
# Every info call passes a second argument of 1; presumably an indentation
# level understood by the logger - confirm against the logger class.
257 $self->logger->info(sprintf($fmt1, "Scoring matrix entries:",
258 $matrix->get_entry_count), 1);
260 $self->logger->info(sprintf($fmt1, "Scoring matrix sources:",
261 $matrix->get_source_count), 1);
263 $self->logger->info(sprintf($fmt1, "Scoring matrix targets:",
264 $matrix->get_target_count), 1);
266 $self->logger->info(sprintf($fmt2, "Average score:",
267 $matrix->get_average_score), 1);
# get_min_max_scores is dereferenced as an array and unpacked as (min, max).
269 my ($min, $max) = @{ $matrix->get_min_max_scores };
270 $self->logger->info(sprintf($fmt2, "Min. score:", $min), 1);
271 $self->logger->info(sprintf($fmt2, "Max. score:", $max), 1);