ensembl-hive  2.6
BaseMapper.pm
Go to the documentation of this file.
1 =head1 LICENSE
2 
3 Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
4 Copyright [2016-2024] EMBL-European Bioinformatics Institute
5 
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9 
10  http://www.apache.org/licenses/LICENSE-2.0
11 
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 
18 =cut
19 
20 
21 
22 =head1 CONTACT
23 
24  Please email comments or questions to the public Ensembl
25  developers list at <http://lists.ensembl.org/mailman/listinfo/dev>.
26 
27  Questions may also be sent to the Ensembl help desk at
28  <http://www.ensembl.org/Help/Contact>.
29 
30 =cut
31 
32 =head1 NAME
33 
34 =head1 SYNOPSIS
35 
36 =head1 DESCRIPTION
37 
38 =head1 METHODS
39 
40 =cut
41 
42 package Bio::EnsEMBL::IdMapping::InternalIdMapper::BaseMapper;
43 
44 use strict;
45 use warnings;
46 no warnings 'uninitialized';
47 
50 
51 use Bio::EnsEMBL::Utils::Exception qw(throw warning);
52 use Bio::EnsEMBL::Utils::ScriptUtils qw(path_append);
54 
55 # scores are considered the same if 2.0 * abs(s1 - s2) / (s1 + s2) < this
56 use constant SIMILAR_SCORE_RATIO => 0.01;
57 
#
# find the highest unambiguous score for all sources and targets in a scoring
# matrix
#
# Arguments:
#   $matrix       - a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix
#   $mapping_name - name used to build the checkpoint file "<name>.ser"
#
# Returns the MappingList holding the accepted entries. If a serialised
# MappingList for $mapping_name was auto-loaded, it is returned as is and no
# mapping is computed.
#
# Throws if $matrix is missing or of the wrong class, or if $mapping_name is
# empty.
#
sub basic_mapping {
  my $self         = shift;
  my $matrix       = shift;
  my $mapping_name = shift;

  # argument checks
  unless ( $matrix
    and $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix') )
  {
    throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
  }

  throw('Need a name for serialising the mapping.')
    unless ($mapping_name);

  # Create a new MappingList object. Specify AUTO_LOAD to load
  # serialised existing mappings if found
  my $dump_path =
    path_append( $self->conf->param('basedir'), 'mapping' );

  # Loaded here so this sub does not rely on a file-level 'use' line being
  # present; require is a no-op once the module is in %INC.
  require Bio::EnsEMBL::IdMapping::MappingList;

  my $mappings =
    Bio::EnsEMBL::IdMapping::MappingList->new(
                                 -DUMP_PATH  => $dump_path,
                                 -CACHE_FILE => "${mapping_name}.ser",
                                 -AUTO_LOAD  => 1, );

  # checkpoint test: return a previously stored MappingList
  if ( $mappings->loaded ) {
    $self->logger->info(
                 "Read existing mappings from ${mapping_name}.ser.\n");
    return $mappings;
  }

  # sources/targets that already have an accepted mapping
  my $sources_done = {};
  my $targets_done = {};

  # sort scoring matrix entries by descending score, so that the best
  # candidates are considered (and claim their source/target) first
  my @sorted_entries =
    sort { $b->score <=> $a->score } @{ $matrix->get_all_Entries };

  while ( my $entry = shift(@sorted_entries) ) {

    # we already found a mapping for either source or target
    next
      if (    $sources_done->{ $entry->source }
           or $targets_done->{ $entry->target } );

    # there's a better mapping for either source or target
    next
      if ( $self->higher_score_exists(
                          $entry, $matrix, $sources_done, $targets_done
           ) );

    # check for ambiguous mappings; they are dealt with later
    my $other_sources = [];
    my $other_targets = [];

    if ( $self->ambiguous_mapping( $entry,         $matrix,
                                   $other_sources, $other_targets ) )
    {
      # competitors that are already mapped elsewhere no longer make this
      # entry ambiguous
      $other_sources =
        $self->filter_sources( $other_sources, $sources_done );
      $other_targets =
        $self->filter_targets( $other_targets, $targets_done );

      next if ( scalar(@$other_sources) or scalar(@$other_targets) );
    }

    # this is the best mapping, add it
    $mappings->add_Entry($entry);

    $sources_done->{ $entry->source } = 1;
    $targets_done->{ $entry->target } = 1;
  } ## end while ( my $entry = shift...)

  # create checkpoint
  $mappings->write_to_file;

  return $mappings;
} ## end sub basic_mapping
155 
#
# Tell whether the source or the target of $entry has a strictly better
# scoring partner in $matrix that has not been mapped yet.
#
# Partners listed in $sources_done / $targets_done are ignored. IDs are
# compared numerically. Returns 1 if a higher score exists, 0 otherwise.
#
sub higher_score_exists {
  my ( $self, $entry, $matrix, $sources_done, $targets_done ) = @_;

  my ( $source, $target, $score ) =
    ( $entry->source, $entry->target, $entry->score );

  # competing sources for the same target
  for my $rival ( @{ $matrix->get_sources_for_target($target) } ) {
    next if ( $rival == $source );
    next if ( $sources_done->{$rival} );

    return 1 if ( $score < $matrix->get_score( $rival, $target ) );
  }

  # competing targets for the same source
  for my $rival ( @{ $matrix->get_targets_for_source($source) } ) {
    next if ( $rival == $target );
    next if ( $targets_done->{$rival} );

    return 1 if ( $score < $matrix->get_score( $source, $rival ) );
  }

  return 0;
} ## end sub higher_score_exists
187 
#
# find ambiguous mappings (see scores_similar() for definition)
#
# For the source and target of $entry, collect every other source (for the
# same target) and every other target (for the same source) whose score is
# similar to, or higher than, the entry's score. The competitors are appended
# to the caller-supplied arrayrefs $other_sources and $other_targets.
#
# Returns 1 if at least one competitor was found, 0 otherwise.
#
sub ambiguous_mapping {
  my ( $self, $entry, $matrix, $other_sources, $other_targets ) = @_;

  my ( $source, $target, $score ) =
    ( $entry->source, $entry->target, $entry->score );

  my $is_ambiguous = 0;

  # other sources competing for this entry's target
  for my $candidate ( @{ $matrix->get_sources_for_target($target) } ) {
    my $candidate_score = $matrix->get_score( $candidate, $target );

    next if ( $candidate == $source );
    next
      unless (    $self->scores_similar( $score, $candidate_score )
               or $score < $candidate_score );

    push @{$other_sources}, $candidate;
    $is_ambiguous = 1;
  }

  # other targets competing for this entry's source
  for my $candidate ( @{ $matrix->get_targets_for_source($source) } ) {
    my $candidate_score = $matrix->get_score( $source, $candidate );

    next if ( $candidate == $target );
    next
      unless (    $self->scores_similar( $score, $candidate_score )
               or $score < $candidate_score );

    push @{$other_targets}, $candidate;
    $is_ambiguous = 1;
  }

  return $is_ambiguous;
} ## end sub ambiguous_mapping
230 
#
# rule for similarity taken from java code...
#
# Two scores are similar when their absolute difference, relative to their
# mean, is below SIMILAR_SCORE_RATIO:
#
#   2 * abs(s1 - s2) / (s1 + s2) < SIMILAR_SCORE_RATIO
#
# Returns a true value if $s1 and $s2 are similar, a false value otherwise.
#
sub scores_similar {
  my ( $self, $s1, $s2 ) = @_;

  # always give priority to exact matches over very similar ones
  return 0 if ( $s1 == 1 and $s2 < 1 );

  my $diff = abs( $s1 - $s2 );
  my $sum  = $s1 + $s2;

  # The ratio is undefined for a zero sum; dividing would die with
  # "Illegal division by zero". Two identical (zero) scores count as
  # similar, anything else summing to zero does not.
  if ( $sum == 0 ) {
    return ( $diff == 0 ) ? 1 : 0;
  }

  my $pc = 2*$diff/$sum;

  return ( $pc < SIMILAR_SCORE_RATIO );
}
247 
#
# Remove sources that already have a mapping from a list of candidates.
#
# $other_sources is an arrayref of source IDs, $sources_done a hashref keyed
# by the IDs that are already mapped. When either is empty, the caller's
# arrayref is returned unchanged; otherwise a new arrayref with the remaining
# IDs (original order kept) is returned.
#
sub filter_sources {
  my ( $self, $other_sources, $sources_done ) = @_;

  # nothing to filter, or nothing to filter against
  return $other_sources
    if ( !@{$other_sources} or !keys %{$sources_done} );

  my @remaining = grep { !$sources_done->{$_} } @{$other_sources};

  return \@remaining;
}
265 
#
# Remove targets that already have a mapping from a list of candidates.
#
# $other_targets is an arrayref of target IDs, $targets_done a hashref keyed
# by the IDs that are already mapped. When either is empty, the caller's
# arrayref is returned unchanged; otherwise a new arrayref with the remaining
# IDs (original order kept) is returned.
#
sub filter_targets {
  my ( $self, $other_targets, $targets_done ) = @_;

  # nothing to filter, or nothing to filter against
  return $other_targets
    if ( !@{$other_targets} or !keys %{$targets_done} );

  my @remaining = grep { !$targets_done->{$_} } @{$other_targets};

  return \@remaining;
}
283 
284 1;
Bio::EnsEMBL::IdMapping::BaseObject
Definition: BaseObject.pm:25
Bio::EnsEMBL::Utils::ScriptUtils
Definition: ScriptUtils.pm:11
Bio::EnsEMBL::IdMapping::MappingList::new
public Bio::EnsEMBL::IdMapping::MappingList new()
Bio::EnsEMBL::IdMapping::MappingList::get_all_Entries
public Arrayref get_all_Entries()
Bio::EnsEMBL::IdMapping::MappingList
Definition: MappingList.pm:38
Bio::EnsEMBL::Utils::Exception
Definition: Exception.pm:68