3 Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
4 Copyright [2016-2024] EMBL-European Bioinformatics Institute
6 Licensed under the Apache License, Version 2.0 (the
"License");
7 you may not use
this file except in compliance with the License.
8 You may obtain a copy of the License at
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an
"AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License
for the specific language governing permissions and
16 limitations under the License.
23 Please email comments or questions to the
public Ensembl
24 developers list at <http:
26 Questions may also be sent to the Ensembl help desk at
44 package Bio::EnsEMBL::IdMapping::GeneScoreBuilder;
48 no warnings
'uninitialized';
60 my $transcript_matrix = shift;
62 unless ($transcript_matrix and
63 $transcript_matrix->isa(
'Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
64 throw(
'Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
67 $self->logger->info(
"-- Scoring genes...\n\n", 0,
'stamped');
69 # build scores based on transcript scores
70 my $matrix = $self->scores_from_transcript_scores($transcript_matrix);
73 if ($self->logger->loglevel eq
'debug') {
74 $matrix->
log(
'gene', $self->conf->param(
'basedir'));
77 # log stats of combined matrix
78 my $fmt =
"%-40s%10.0f\n";
80 $self->logger->info(
"Scoring matrix:\n");
82 $self->logger->info(sprintf($fmt,
"Total source genes:",
83 $self->cache->get_count_by_name(
'genes_by_id',
'source')), 1);
85 $self->logger->info(sprintf($fmt,
"Total target genes:",
86 $self->cache->get_count_by_name(
'genes_by_id',
'target')), 1);
88 $self->log_matrix_stats($matrix);
90 $self->logger->info(
"\nDone with gene scoring.\n\n");
96 sub scores_from_transcript_scores {
98 my $transcript_matrix = shift;
100 unless ($transcript_matrix and
101 $transcript_matrix->isa(
'Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
102 throw(
'Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
105 my $dump_path = path_append($self->conf->param(
'basedir'),
'matrix');
108 -DUMP_PATH => $dump_path,
109 -CACHE_FILE =>
'gene_matrix.ser',
114 if (-s $gene_cache) {
117 $self->logger->info(
"Reading gene scoring matrix from file...\n", 0,
'stamped');
118 $self->logger->debug(
"Cache file $gene_cache.\n", 1);
119 $matrix->read_from_file;
120 $self->logger->info(
"Done.\n\n", 0,
'stamped');
124 # build scoring matrix
125 $self->logger->info(
"No gene scoring matrix found. Will build new one.\n");
127 $self->logger->info(
"Gene scoring...\n", 0,
'stamped');
128 $matrix = $self->build_scores($matrix, $transcript_matrix);
129 $self->logger->info(
"Done.\n\n", 0,
'stamped');
131 # write scoring matrix to file
132 $matrix->write_to_file;
143 my $transcript_matrix = shift;
146 $matrix->isa(
'Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
147 throw(
'Need a gene Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
150 unless ($transcript_matrix and
151 $transcript_matrix->isa(
'Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
152 throw(
'Need a transcript Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
155 # first find out which source and target genes have scoring transcripts and
156 # build a "flag" matrix for these genes (all scores are 1)
157 $self->flag_matrix_from_transcript_scores($matrix, $transcript_matrix);
159 # now calculate the actual scores for the genes in the flag matrix
160 my $final_matrix = $self->score_matrix_from_flag_matrix($matrix,
163 return $final_matrix;
167 sub flag_matrix_from_transcript_scores {
170 my $transcript_matrix = shift;
173 $matrix->isa(
'Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
174 throw(
'Need a gene Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
177 unless ($transcript_matrix and
178 $transcript_matrix->isa(
'Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
179 throw(
'Need a transcript Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
182 # initialise progress logger
184 my $num_entries = $transcript_matrix->get_entry_count;
185 my $progress_id = $self->logger->init_progress($num_entries, 100);
187 $self->logger->info(
"Creating flag matrix...\n", 1);
189 # for every transcript scoring matrix entry, make an entry in the gene flag
191 foreach my $entry (@{ $transcript_matrix->get_all_Entries }) {
193 $self->logger->log_progress($progress_id, ++$i, 1);
195 my $source_gene = $self->cache->get_by_key(
'genes_by_transcript_id',
196 'source', $entry->source);
198 my $target_gene = $self->cache->get_by_key(
'genes_by_transcript_id',
199 'target', $entry->target);
201 $matrix->add_score($source_gene->id, $target_gene->id, 1);
204 $self->logger->info(
"\n");
210 sub score_matrix_from_flag_matrix {
212 my $flag_matrix = shift;
213 my $transcript_matrix = shift;
214 my $simple_scoring = shift;
216 unless ( $flag_matrix and
217 $flag_matrix->isa(
'Bio::EnsEMBL::IdMapping::ScoredMappingMatrix') )
219 throw(
'Need a gene Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
222 unless ( $transcript_matrix
224 $transcript_matrix->isa(
225 'Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')
229 'Need a transcript Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.'
233 # create a new scoring matrix which will replace the flag matrix
236 -DUMP_PATH => $flag_matrix->dump_path,
237 -CACHE_FILE => $flag_matrix->cache_file_name,
240 # initialise progress logger
243 my $progress_id = $self->logger->init_progress( $num_entries, 100 );
245 $self->logger->info(
"Creating score matrix from flag matrix...\n",
248 my $gene_score_threshold =
249 $self->conf()->param(
'gene_score_threshold') || 0;
251 # loop over flag matrix and do proper scoring for each entry
252 foreach my $entry ( @{ $flag_matrix->get_all_Entries } ) {
254 $self->logger->log_progress( $progress_id, ++$i, 1 );
258 $self->cache->get_by_key(
'genes_by_id',
'source',
261 $self->cache->get_by_key(
'genes_by_id',
'target',
264 if ($simple_scoring) {
266 # simple scoring (used for rescoring purposes)
268 $self->simple_gene_gene_score( $source_gene, $target_gene,
269 $transcript_matrix );
276 $self->complex_gene_gene_score( $source_gene, $target_gene,
277 $transcript_matrix );
280 if ( $score > $gene_score_threshold ) {
281 $matrix->add_score( $entry->source, $entry->target, $score );
283 } ## end
foreach my $entry ( @{ $flag_matrix...})
285 $self->logger->info(
"\n");
288 } ## end sub score_matrix_from_flag_matrix
291 sub complex_gene_gene_score {
293 my $source_gene = shift;
294 my $target_gene = shift;
295 my $transcript_matrix = shift;
297 my $source_gene_score = 0;
298 my $target_gene_score = 0;
299 my $source_gene_accum_length = 0; # sum of all
transcript lengths
300 my $target_gene_accum_length = 0; # sum of all
transcript lengths
302 # We are only interested in scoring with transcripts that are in the target
303 # gene. The scored mapping matrix may contain scores for transcripts that
304 # aren't in this gene so create a hash of the target genes's transcripts
305 my %target_transcripts =
map { $_->id => 1 }
306 @{ $target_gene->get_all_Transcripts };
308 # loop over source transcripts
309 foreach my $source_transcript (@{ $source_gene->get_all_Transcripts }) {
311 # now loop over target transcripts and find the highest scoring target
312 # transcript belonging to the target gene
313 my $max_source_score = -1;
315 foreach my $target_transcript_id (@{ $transcript_matrix->get_targets_for_source($source_transcript->id) }) {
317 next unless ($target_transcripts{$target_transcript_id});
319 my $score = $transcript_matrix->get_score(
320 $source_transcript->id, $target_transcript_id);
321 $max_source_score = $score
if ($score > $max_source_score);
324 if ($max_source_score > 0) {
325 $source_gene_score += ($max_source_score * $source_transcript->length);
328 $source_gene_accum_length += $source_transcript->length;
331 # now do the same for target genes
332 my %source_transcripts =
map { $_->id => 1 }
333 @{ $source_gene->get_all_Transcripts };
335 # loop over target transcripts
336 foreach my $target_transcript (@{ $target_gene->get_all_Transcripts }) {
338 # now loop over source transcripts and find the highest scoring source
339 # transcript belonging to the source gene
340 my $max_target_score = -1;
342 foreach my $source_transcript_id (@{ $transcript_matrix->get_sources_for_target($target_transcript->id) }) {
344 next unless ($source_transcripts{$source_transcript_id});
346 my $score = $transcript_matrix->get_score(
347 $source_transcript_id, $target_transcript->id);
348 $max_target_score = $score
if ($score > $max_target_score);
351 if ($max_target_score > 0) {
352 $target_gene_score += ($max_target_score * $target_transcript->length);
355 $target_gene_accum_length += $target_transcript->length;
358 # calculate overall score for this gene
361 if (($source_gene_accum_length + $target_gene_accum_length) > 0) {
363 $gene_score = ($source_gene_score + $target_gene_score) /
364 ($source_gene_accum_length + $target_gene_accum_length);
368 $self->logger->warning(
"Combined transcript length of source (".$source_gene->id.
") and target (".$target_gene->id.
") gene is zero!\n", 1);
372 if ($gene_score > 1) {
373 $self->logger->warning(
"Illegal gene score: $gene_score (".
374 join(
"|", $source_gene_score, $target_gene_score,
375 $source_gene_accum_length, $target_gene_accum_length).
")\n", 1);
383 # Simplified scoring for genes. Score is best scoring transcript pair.
384 # This is used when the more elaborate gene representing score does not
385 # distinguish very well.
387 sub simple_gene_gene_score {
389 my $source_gene = shift;
390 my $target_gene = shift;
391 my $transcript_matrix = shift;
395 foreach my $source_transcript (@{ $source_gene->get_all_Transcripts }) {
396 foreach my $target_transcript (@{ $target_gene->get_all_Transcripts }) {
398 my $score = $transcript_matrix->get_score($source_transcript->id,
399 $target_transcript->id);
401 $gene_score = $score
if ($score > $gene_score);
409 sub simple_gene_rescore {
411 my $gene_matrix = shift;
412 my $transcript_matrix = shift;
414 $gene_matrix = $self->score_matrix_from_flag_matrix($gene_matrix,
415 $transcript_matrix, 1);
419 # penalise scores between genes with different biotypes.
420 # entries are modified in place
422 sub biotype_gene_rescore {
427 and $matrix->isa(
'Bio::EnsEMBL::IdMapping::ScoredMappingMatrix') )
429 throw(
'Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
434 foreach my $entry ( @{ $matrix->get_all_Entries } ) {
437 $self->cache->get_by_key(
'genes_by_id',
'source',
441 $self->cache->get_by_key(
'genes_by_id',
'target',
444 if ( $source_gene->biotype() ne $target_gene->biotype() ) {
445 # PENALTY: Lower the score for mappings that differ in biotype.
446 $matrix->set_score( $entry->source(), $entry->target(),
447 0.9*$entry->score() );
452 $self->logger->debug(
"Scored genes with biotype mismatch: $i\n", 1 );
453 } ## end sub biotype_gene_rescore
456 sub location_gene_rescore {
461 and $matrix->isa(
'Bio::EnsEMBL::IdMapping::ScoredMappingMatrix') )
463 throw(
'Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
468 foreach my $entry ( @{ $matrix->get_all_Entries } ) {
471 $self->cache->get_by_key(
'genes_by_id',
'source',
475 $self->cache->get_by_key(
'genes_by_id',
'target',
478 if ( $source_gene->seq_region_name() ne $target_gene->seq_region_name() ) {
479 # PENALTY: Lower the score for mappings that are not on the same slice.
480 $matrix->set_score( $entry->source(), $entry->target(),
481 0.9*$entry->score() );