ensembl-hive  2.6
GeneScoreBuilder.pm
Go to the documentation of this file.
1 =head1 LICENSE
2 
3 Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
4 Copyright [2016-2024] EMBL-European Bioinformatics Institute
5 
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9 
10  http://www.apache.org/licenses/LICENSE-2.0
11 
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 
18 =cut
19 
20 
21 =head1 CONTACT
22 
23  Please email comments or questions to the public Ensembl
24  developers list at <http://lists.ensembl.org/mailman/listinfo/dev>.
25 
26  Questions may also be sent to the Ensembl help desk at
27  <http://www.ensembl.org/Help/Contact>.
28 
29 =cut
30 
31 =head1 NAME
32 
33 =head1 SYNOPSIS
34 
35 =head1 DESCRIPTION
36 
37 Combines ExonScoreBuilder, ExonDirectMapper and ExonerateRunner from
38 Java application.
39 
40 =head1 METHODS
41 
42 =cut
43 
44 package Bio::EnsEMBL::IdMapping::GeneScoreBuilder;
45 
46 use strict;
47 use warnings;
48 no warnings 'uninitialized';
49 
52 
53 use Bio::EnsEMBL::Utils::Exception qw(throw warning);
54 use Bio::EnsEMBL::Utils::ScriptUtils qw(path_append);
56 
57 
58 sub score_genes {
59  my $self = shift;
60  my $transcript_matrix = shift;
61 
62  unless ($transcript_matrix and
63  $transcript_matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
64  throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
65  }
66 
67  $self->logger->info("-- Scoring genes...\n\n", 0, 'stamped');
68 
69  # build scores based on transcript scores
70  my $matrix = $self->scores_from_transcript_scores($transcript_matrix);
71 
72  # debug logging
73  if ($self->logger->loglevel eq 'debug') {
74  $matrix->log('gene', $self->conf->param('basedir'));
75  }
76 
77  # log stats of combined matrix
78  my $fmt = "%-40s%10.0f\n";
79 
80  $self->logger->info("Scoring matrix:\n");
81 
82  $self->logger->info(sprintf($fmt, "Total source genes:",
83  $self->cache->get_count_by_name('genes_by_id', 'source')), 1);
84 
85  $self->logger->info(sprintf($fmt, "Total target genes:",
86  $self->cache->get_count_by_name('genes_by_id', 'target')), 1);
87 
88  $self->log_matrix_stats($matrix);
89 
90  $self->logger->info("\nDone with gene scoring.\n\n");
91 
92  return $matrix;
93 }
94 
95 
96 sub scores_from_transcript_scores {
97  my $self = shift;
98  my $transcript_matrix = shift;
99 
100  unless ($transcript_matrix and
101  $transcript_matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
102  throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
103  }
104 
105  my $dump_path = path_append($self->conf->param('basedir'), 'matrix');
106 
108  -DUMP_PATH => $dump_path,
109  -CACHE_FILE => 'gene_matrix.ser',
110  );
111 
112  my $gene_cache = $matrix->cache_file;
113 
114  if (-s $gene_cache) {
115 
116  # read from file
117  $self->logger->info("Reading gene scoring matrix from file...\n", 0, 'stamped');
118  $self->logger->debug("Cache file $gene_cache.\n", 1);
119  $matrix->read_from_file;
120  $self->logger->info("Done.\n\n", 0, 'stamped');
121 
122  } else {
123 
124  # build scoring matrix
125  $self->logger->info("No gene scoring matrix found. Will build new one.\n");
126 
127  $self->logger->info("Gene scoring...\n", 0, 'stamped');
128  $matrix = $self->build_scores($matrix, $transcript_matrix);
129  $self->logger->info("Done.\n\n", 0, 'stamped');
130 
131  # write scoring matrix to file
132  $matrix->write_to_file;
133 
134  }
135 
136  return $matrix;
137 }
138 
139 
140 sub build_scores {
141  my $self = shift;
142  my $matrix = shift;
143  my $transcript_matrix = shift;
144 
145  unless ($matrix and
146  $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
147  throw('Need a gene Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
148  }
149 
150  unless ($transcript_matrix and
151  $transcript_matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
152  throw('Need a transcript Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
153  }
154 
155  # first find out which source and target genes have scoring transcripts and
156  # build a "flag" matrix for these genes (all scores are 1)
157  $self->flag_matrix_from_transcript_scores($matrix, $transcript_matrix);
158 
159  # now calculate the actual scores for the genes in the flag matrix
160  my $final_matrix = $self->score_matrix_from_flag_matrix($matrix,
161  $transcript_matrix);
162 
163  return $final_matrix;
164 }
165 
166 
167 sub flag_matrix_from_transcript_scores {
168  my $self = shift;
169  my $matrix = shift;
170  my $transcript_matrix = shift;
171 
172  unless ($matrix and
173  $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
174  throw('Need a gene Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
175  }
176 
177  unless ($transcript_matrix and
178  $transcript_matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
179  throw('Need a transcript Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
180  }
181 
182  # initialise progress logger
183  my $i;
184  my $num_entries = $transcript_matrix->get_entry_count;
185  my $progress_id = $self->logger->init_progress($num_entries, 100);
186 
187  $self->logger->info("Creating flag matrix...\n", 1);
188 
189  # for every transcript scoring matrix entry, make an entry in the gene flag
190  # matrix.
191  foreach my $entry (@{ $transcript_matrix->get_all_Entries }) {
192 
193  $self->logger->log_progress($progress_id, ++$i, 1);
194 
195  my $source_gene = $self->cache->get_by_key('genes_by_transcript_id',
196  'source', $entry->source);
197 
198  my $target_gene = $self->cache->get_by_key('genes_by_transcript_id',
199  'target', $entry->target);
200 
201  $matrix->add_score($source_gene->id, $target_gene->id, 1);
202  }
203 
204  $self->logger->info("\n");
205 
206  return $matrix;
207 }
208 
209 
210 sub score_matrix_from_flag_matrix {
211  my $self = shift;
212  my $flag_matrix = shift;
213  my $transcript_matrix = shift;
214  my $simple_scoring = shift;
215 
216  unless ( $flag_matrix and
217  $flag_matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix') )
218  {
219  throw('Need a gene Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
220  }
221 
222  unless ( $transcript_matrix
223  and
224  $transcript_matrix->isa(
225  'Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')
226  )
227  {
228  throw(
229  'Need a transcript Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.'
230  );
231  }
232 
233  # create a new scoring matrix which will replace the flag matrix
234  my $matrix =
236  -DUMP_PATH => $flag_matrix->dump_path,
237  -CACHE_FILE => $flag_matrix->cache_file_name,
238  );
239 
240  # initialise progress logger
241  my $i;
242  my $num_entries = $flag_matrix->get_entry_count;
243  my $progress_id = $self->logger->init_progress( $num_entries, 100 );
244 
245  $self->logger->info( "Creating score matrix from flag matrix...\n",
246  1 );
247 
248  my $gene_score_threshold =
249  $self->conf()->param('gene_score_threshold') || 0;
250 
251  # loop over flag matrix and do proper scoring for each entry
252  foreach my $entry ( @{ $flag_matrix->get_all_Entries } ) {
253 
254  $self->logger->log_progress( $progress_id, ++$i, 1 );
255 
256  my $score = 0;
257  my $source_gene =
258  $self->cache->get_by_key( 'genes_by_id', 'source',
259  $entry->source );
260  my $target_gene =
261  $self->cache->get_by_key( 'genes_by_id', 'target',
262  $entry->target );
263 
264  if ($simple_scoring) {
265 
266  # simple scoring (used for rescoring purposes)
267  $score =
268  $self->simple_gene_gene_score( $source_gene, $target_gene,
269  $transcript_matrix );
270 
271  }
272  else {
273 
274  # full scoring
275  $score =
276  $self->complex_gene_gene_score( $source_gene, $target_gene,
277  $transcript_matrix );
278  }
279 
280  if ( $score > $gene_score_threshold ) {
281  $matrix->add_score( $entry->source, $entry->target, $score );
282  }
283  } ## end foreach my $entry ( @{ $flag_matrix...})
284 
285  $self->logger->info("\n");
286 
287  return $matrix;
288 } ## end sub score_matrix_from_flag_matrix
289 
290 
291 sub complex_gene_gene_score {
292  my $self = shift;
293  my $source_gene = shift;
294  my $target_gene = shift;
295  my $transcript_matrix = shift;
296 
297  my $source_gene_score = 0;
298  my $target_gene_score = 0;
299  my $source_gene_accum_length = 0; # sum of all transcript lengths
300  my $target_gene_accum_length = 0; # sum of all transcript lengths
301 
302  # We are only interested in scoring with transcripts that are in the target
303  # gene. The scored mapping matrix may contain scores for transcripts that
304  # aren't in this gene so create a hash of the target genes's transcripts
305  my %target_transcripts = map { $_->id => 1 }
306  @{ $target_gene->get_all_Transcripts };
307 
308  # loop over source transcripts
309  foreach my $source_transcript (@{ $source_gene->get_all_Transcripts }) {
310 
311  # now loop over target transcripts and find the highest scoring target
312  # transcript belonging to the target gene
313  my $max_source_score = -1;
314 
315  foreach my $target_transcript_id (@{ $transcript_matrix->get_targets_for_source($source_transcript->id) }) {
316 
317  next unless ($target_transcripts{$target_transcript_id});
318 
319  my $score = $transcript_matrix->get_score(
320  $source_transcript->id, $target_transcript_id);
321  $max_source_score = $score if ($score > $max_source_score);
322  }
323 
324  if ($max_source_score > 0) {
325  $source_gene_score += ($max_source_score * $source_transcript->length);
326  }
327 
328  $source_gene_accum_length += $source_transcript->length;
329  }
330 
331  # now do the same for target genes
332  my %source_transcripts = map { $_->id => 1 }
333  @{ $source_gene->get_all_Transcripts };
334 
335  # loop over target transcripts
336  foreach my $target_transcript (@{ $target_gene->get_all_Transcripts }) {
337 
338  # now loop over source transcripts and find the highest scoring source
339  # transcript belonging to the source gene
340  my $max_target_score = -1;
341 
342  foreach my $source_transcript_id (@{ $transcript_matrix->get_sources_for_target($target_transcript->id) }) {
343 
344  next unless ($source_transcripts{$source_transcript_id});
345 
346  my $score = $transcript_matrix->get_score(
347  $source_transcript_id, $target_transcript->id);
348  $max_target_score = $score if ($score > $max_target_score);
349  }
350 
351  if ($max_target_score > 0) {
352  $target_gene_score += ($max_target_score * $target_transcript->length);
353  }
354 
355  $target_gene_accum_length += $target_transcript->length;
356  }
357 
358  # calculate overall score for this gene
359  my $gene_score = 0;
360 
361  if (($source_gene_accum_length + $target_gene_accum_length) > 0) {
362 
363  $gene_score = ($source_gene_score + $target_gene_score) /
364  ($source_gene_accum_length + $target_gene_accum_length);
365 
366  } else {
367 
368  $self->logger->warning("Combined transcript length of source (".$source_gene->id.") and target (".$target_gene->id.") gene is zero!\n", 1);
369 
370  }
371 
372  if ($gene_score > 1) {
373  $self->logger->warning("Illegal gene score: $gene_score (".
374  join("|", $source_gene_score, $target_gene_score,
375  $source_gene_accum_length, $target_gene_accum_length).")\n", 1);
376  }
377 
378  return $gene_score;
379 }
380 
381 
382 #
383 # Simplified scoring for genes. Score is best scoring transcript pair.
384 # This is used when the more elaborate gene representing score does not
385 # distinguish very well.
386 #
387 sub simple_gene_gene_score {
388  my $self = shift;
389  my $source_gene = shift;
390  my $target_gene = shift;
391  my $transcript_matrix = shift;
392 
393  my $gene_score = 0;
394 
395  foreach my $source_transcript (@{ $source_gene->get_all_Transcripts }) {
396  foreach my $target_transcript (@{ $target_gene->get_all_Transcripts }) {
397 
398  my $score = $transcript_matrix->get_score($source_transcript->id,
399  $target_transcript->id);
400 
401  $gene_score = $score if ($score > $gene_score);
402  }
403  }
404 
405  return $gene_score;
406 }
407 
408 
409 sub simple_gene_rescore {
410  my $self = shift;
411  my $gene_matrix = shift;
412  my $transcript_matrix = shift;
413 
414  $gene_matrix = $self->score_matrix_from_flag_matrix($gene_matrix,
415  $transcript_matrix, 1);
416 }
417 
418 #
419 # penalise scores between genes with different biotypes.
420 # entries are modified in place
421 #
422 sub biotype_gene_rescore {
423  my $self = shift;
424  my $matrix = shift;
425 
426  unless ($matrix
427  and $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix') )
428  {
429  throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
430  }
431 
432  my $i = 0;
433 
434  foreach my $entry ( @{ $matrix->get_all_Entries } ) {
435 
436  my $source_gene =
437  $self->cache->get_by_key( 'genes_by_id', 'source',
438  $entry->source );
439 
440  my $target_gene =
441  $self->cache->get_by_key( 'genes_by_id', 'target',
442  $entry->target );
443 
444  if ( $source_gene->biotype() ne $target_gene->biotype() ) {
445  # PENALTY: Lower the score for mappings that differ in biotype.
446  $matrix->set_score( $entry->source(), $entry->target(),
447  0.9*$entry->score() );
448  $i++;
449  }
450  }
451 
452  $self->logger->debug( "Scored genes with biotype mismatch: $i\n", 1 );
453 } ## end sub biotype_gene_rescore
454 
455 
456 sub location_gene_rescore {
457  my $self = shift;
458  my $matrix = shift;
459 
460  unless ($matrix
461  and $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix') )
462  {
463  throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
464  }
465 
466  my $i = 0;
467 
468  foreach my $entry ( @{ $matrix->get_all_Entries } ) {
469 
470  my $source_gene =
471  $self->cache->get_by_key( 'genes_by_id', 'source',
472  $entry->source );
473 
474  my $target_gene =
475  $self->cache->get_by_key( 'genes_by_id', 'target',
476  $entry->target );
477 
478  if ( $source_gene->seq_region_name() ne $target_gene->seq_region_name() ) {
479  # PENALTY: Lower the score for mappings that are not on the same slice.
480  $matrix->set_score( $entry->source(), $entry->target(),
481  0.9*$entry->score() );
482  $i++;
483  }
484  }
485 
486 }
487 
488 
489 1;
490 
transcript
public transcript()
map
public map()
Bio::EnsEMBL::Utils::ScriptUtils
Definition: ScriptUtils.pm:11
Bio::EnsEMBL::IdMapping::ScoredMappingMatrix
Definition: ScoredMappingMatrix.pm:44
Bio::EnsEMBL::IdMapping::ScoreBuilder
Definition: ScoreBuilder.pm:22
Bio::EnsEMBL::IdMapping::Serialisable::cache_file
public String cache_file()
Bio::EnsEMBL::IdMapping::ScoredMappingMatrix::new
public Bio::EnsEMBL::IdMapping::ScoredMappingMatrix new()
Bio::EnsEMBL::IdMapping::ScoredMappingMatrix::log
public void log()
build_scores
public build_scores()
Bio::EnsEMBL::Utils::Exception
Definition: Exception.pm:68
Bio::EnsEMBL::IdMapping::ScoredMappingMatrix::get_entry_count
public Int get_entry_count()
Bio::EnsEMBL::IdMapping::ExonScoreBuilder
Definition: ExonScoreBuilder.pm:19