ensembl-hive  2.7.0
ScoreBuilder.pm
Go to the documentation of this file.
1 =head1 LICENSE
2 
3 See the NOTICE file distributed with this work for additional information
4 regarding copyright ownership.
5 
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9 
10  http://www.apache.org/licenses/LICENSE-2.0
11 
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 
18 =cut
19 
20 
21 =head1 CONTACT
22 
23  Please email comments or questions to the public Ensembl
24  developers list at <http://lists.ensembl.org/mailman/listinfo/dev>.
25 
26  Questions may also be sent to the Ensembl help desk at
27  <http://www.ensembl.org/Help/Contact>.
28 
29 =cut
30 
31 =head1 NAME
32 
33 Bio::EnsEMBL::IdMapping::ScoreBuilder - score builder base class
34 
35 =head1 SYNOPSIS
36 
37 This class is not instantiated. Please see subclasses for usage examples
38 (e.g. GeneScoreBuilder).
39 
40 =head1 DESCRIPTION
41 
42 This is the base class for the score builders used in the stable Id
43 mapping application. It contains methods which are used by more than one
44 ScoreBuilder subclass.
45 
46 =head1 METHODS
47 
48  create_shrinked_matrix
49  internal_id_rescore
50  log_matrix_stats
51 
52 =cut
53 
54 package Bio::EnsEMBL::IdMapping::ScoreBuilder;
55 
56 use strict;
57 use warnings;
58 no warnings 'uninitialized';
59 
62 
63 use Bio::EnsEMBL::Utils::Exception qw(throw warning);
64 use Bio::EnsEMBL::Utils::ScriptUtils qw(path_append);
66 
67 
=head2 create_shrinked_matrix

  Arg[1]      : Bio::EnsEMBL::IdMapping::ScoredMappingMatrix $matrix - a scoring
                matrix
  Arg[2]      : Bio::EnsEMBL::IdMapping::MappingList $mappings - mappings
  Arg[3]      : String $cache_file - base name of a cache file (extension '.ser'
                will be added automatically) for the returned matrix
  Example     : my $new_scores = $score_builder->create_shrinked_matrix(
                  $gene_scores, $mappings, "gene_matrix1");
  Description : Create a shrinked scoring matrix which doesn't contain entries
                whose source or target was already mapped. If a previously
                saved matrix could be loaded from the cache file it is used
                as is; otherwise the shrinked matrix is built from $matrix.
                It also logs how the source/target/entry counts changed and
                how many new mappings were added in this process.
  Return type : Bio::EnsEMBL::IdMapping::ScoredMappingMatrix
  Exceptions  : thrown on wrong or missing arguments
  Caller      : InternalIdMapper plugin
  Status      : At Risk
              : under development

=cut

# TODO: shrinked = shrunken?
sub create_shrinked_matrix {
  my $self = shift;
  my $matrix = shift;
  my $mappings = shift;
  my $cache_file = shift; # base name, extension '.ser' will be added

  # argument checks
  unless ($matrix and
          $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
    throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
  }

  unless ($mappings and
          $mappings->isa('Bio::EnsEMBL::IdMapping::MappingList')) {
    throw('Need a gene Bio::EnsEMBL::IdMapping::MappingList.');
  }

  throw('Need a cache file name.') unless ($cache_file);

  my $dump_path = path_append($self->conf->param('basedir'), 'matrix');
  $cache_file .= '.ser';

  # AUTO_LOAD asks the matrix to read itself from the cache file if possible;
  # whether that happened is reported by ->loaded below
  my $shrinked_matrix = Bio::EnsEMBL::IdMapping::ScoredMappingMatrix->new(
    -DUMP_PATH   => $dump_path,
    -CACHE_FILE  => $cache_file,
    -AUTO_LOAD   => 1,
  );

  if ($shrinked_matrix->loaded) {

    # a saved matrix was found, so there is nothing to rebuild
    $self->logger->info("Read existing scoring matrix from $cache_file.\n");

  } else {

    # create lookup hashes for sources and targets in the MappingList
    my %sources = ();
    my %targets = ();

    foreach my $entry (@{ $mappings->get_all_Entries }) {
      $sources{$entry->source} = 1;
      $targets{$entry->target} = 1;
    }

    # add all entries to shrinked matrix whose source AND target are both
    # absent from the MappingList
    foreach my $entry (@{ $matrix->get_all_Entries }) {
      unless ($sources{$entry->source} or $targets{$entry->target}) {
        $shrinked_matrix->add_Entry($entry);
      }
    }

  }

  # log shrinking stats
  $self->logger->info('Sources '.$matrix->get_source_count.' --> '.
    $shrinked_matrix->get_source_count."\n");
  $self->logger->info('Targets '.$matrix->get_target_count.' --> '.
    $shrinked_matrix->get_target_count."\n");
  $self->logger->info('Entries '.$matrix->get_entry_count.' --> '.
    $shrinked_matrix->get_entry_count."\n");
  $self->logger->info('New mappings: '.$mappings->get_entry_count."\n\n");

  return $shrinked_matrix;
}
153 
154 
=head2 internal_id_rescore

  Arg[1]      : Bio::EnsEMBL::IdMapping::ScoredMappingMatrix $matrix - a scoring
                matrix
  Example     : $score_builder->internal_id_rescore($gene_scores);
  Description : Rescore ambiguous mappings based on internal Ids. This is the
                last disambiguation step and is only useful if objects with the
                same internal Id were used in source and target dbs (e.g. in
                patch builds or if objects were copied from source to target).

                If a source and target gene have the same internal Id and there
                are mappings to other target genes with the same (best) score
                then these *other* mappings are penalised by multiplying their
                score by 0.9.
  Return type : none
  Exceptions  : thrown on wrong or missing argument
  Caller      : InternalIdMapper plugins
  Status      : At Risk
              : under development

=cut

sub internal_id_rescore {
  my $self   = shift;
  my $matrix = shift;

  unless ($matrix
      and $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix') )
  {
    throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
  }

  # number of entries that received a penalty
  my $i = 0;

  foreach my $source ( @{ $matrix->get_all_sources } ) {

    # Order the entries by score, best first. The comparison must be on the
    # scores: comparing the entry objects themselves does not rank them by
    # score, so $entries[0] would not reliably be the best mapping.
    my @entries =
      sort { $b->score <=> $a->score }
      @{ $matrix->get_Entries_for_source($source) };

    # nothing to disambiguate with fewer than two mappings
    next if ( scalar(@entries) < 2 );

    # Capture the best score once. It must not be re-read from $entries[0]
    # while penalising, because set_score() may update that very entry.
    my $best_score = $entries[0]->score;

    # only penalise if mappings are ambiguous
    next if ( $entries[1]->score != $best_score );

    # only penalise if one source id == target id where score == best
    # score
    my $has_self_match =
      grep { $_->target == $source and $_->score == $best_score } @entries;

    next unless ($has_self_match);

    # now penalise those where source id != target id and score == best
    # score
    foreach my $e (@entries) {
      if ( $e->target != $source and $e->score == $best_score ) {
        # PENALTY: Reduce score for ambiguous mappings.
        $matrix->set_score( $source, $e->target(), 0.9*$e->score() );
        $i++;
      }
    }

  } ## end foreach my $source ( @{ $matrix...})

  $self->logger->debug("Scored entries with internal ID mismatch: $i\n",
                       1 );
} ## end sub internal_id_rescore
228 
229 
=head2 log_matrix_stats

  Arg[1]      : Bio::EnsEMBL::IdMapping::ScoredMappingMatrix $matrix - a scoring
                matrix
  Example     : $score_builder->log_matrix_stats($gene_scores);
  Description : Writes a summary of a scoring matrix to the logger at info
                level: the number of entries, sources and targets, followed
                by the average, minimum and maximum score.
  Return type : none
  Exceptions  : thrown on wrong or missing argument
  Caller      : general
  Status      : At Risk
              : under development

=cut

sub log_matrix_stats {
  my ($self, $matrix) = @_;

  if ( !$matrix
    or !$matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix') )
  {
    throw('You must provide a ScoredMappingMatrix.');
  }

  # counts are printed as whole numbers, scores with five decimals
  my $count_format = "%-40s%10.0f\n";
  my $score_format = "%-40s%10.5f\n";

  # one row per statistic that is read straight from a matrix accessor:
  # [ format, label, accessor method name ]
  my @accessor_rows = (
    [ $count_format, "Scoring matrix entries:", 'get_entry_count' ],
    [ $count_format, "Scoring matrix sources:", 'get_source_count' ],
    [ $count_format, "Scoring matrix targets:", 'get_target_count' ],
    [ $score_format, "Average score:",          'get_average_score' ],
  );

  foreach my $row (@accessor_rows) {
    my ($format, $label, $accessor) = @{$row};
    $self->logger->info( sprintf($format, $label, $matrix->$accessor), 1 );
  }

  # minimum and maximum come back together as a two-element array reference
  my $extremes = $matrix->get_min_max_scores;
  my ($lowest, $highest) = @{$extremes};

  $self->logger->info( sprintf($score_format, "Min. score:", $lowest),  1 );
  $self->logger->info( sprintf($score_format, "Max. score:", $highest), 1 );
}
273 
274 
275 1;
276 
usage
public usage()
Bio::EnsEMBL::IdMapping::BaseObject
Definition: BaseObject.pm:25
Bio::EnsEMBL::Utils::ScriptUtils
Definition: ScriptUtils.pm:11
Bio::EnsEMBL::IdMapping::ScoredMappingMatrix
Definition: ScoredMappingMatrix.pm:44
Bio::EnsEMBL::IdMapping::ScoreBuilder
Definition: ScoreBuilder.pm:22
Bio::EnsEMBL::IdMapping::MappingList
Definition: MappingList.pm:38
Bio::EnsEMBL::IdMapping::GeneScoreBuilder
Definition: GeneScoreBuilder.pm:18
Bio::EnsEMBL::Utils::Exception
Definition: Exception.pm:68