3 Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
4 Copyright [2016-2024] EMBL-European Bioinformatics Institute
6 Licensed under the Apache License, Version 2.0 (the
"License");
7 you may not use
this file except in compliance with the License.
8 You may obtain a copy of the License at
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an
"AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License
for the specific language governing permissions and
16 limitations under the License.
23 Please email comments or questions to the
public Ensembl
24 developers list at <http:
26 Questions may also be sent to the Ensembl help desk at
46 package Bio::EnsEMBL::IdMapping::TranscriptScoreBuilder;
50 no warnings
'uninitialized';
60 sub score_transcripts {
62 my $exon_matrix = shift;
64 unless ($exon_matrix and
65 $exon_matrix->isa(
'Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
66 throw(
'Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
69 $self->logger->info(
"-- Scoring transcripts...\n\n", 0,
'stamped');
71 # build scores based on exon scores
72 my $matrix = $self->scores_from_exon_scores($exon_matrix);
75 if ($self->logger->loglevel eq
'debug') {
76 $matrix->
log(
'transcript', $self->conf->param(
'basedir'));
79 # log stats of combined matrix
80 my $fmt =
"%-40s%10.0f\n";
82 $self->logger->info(
"Scoring matrix:\n");
84 $self->logger->info(sprintf($fmt,
"Total source transcripts:",
85 $self->cache->get_count_by_name(
'transcripts_by_id',
'source')), 1);
87 $self->logger->info(sprintf($fmt,
"Total target transcripts:",
88 $self->cache->get_count_by_name(
'transcripts_by_id',
'target')), 1);
90 $self->log_matrix_stats($matrix);
92 $self->logger->info(
"\nDone with transcript scoring.\n\n");
98 sub scores_from_exon_scores {
100 my $exon_matrix = shift;
102 unless ($exon_matrix and
103 $exon_matrix->isa(
'Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
104 throw(
'Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
107 my $dump_path = path_append($self->conf->param(
'basedir'),
'matrix');
110 -DUMP_PATH => $dump_path,
111 -CACHE_FILE =>
'transcript_matrix.ser',
116 if (-s $transcript_cache) {
119 $self->logger->info(
"Reading transcript scoring matrix from file...\n", 0,
'stamped');
120 $self->logger->debug(
"Cache file $transcript_cache.\n", 1);
121 $matrix->read_from_file;
122 $self->logger->info(
"Done.\n\n", 0,
'stamped');
126 # build scoring matrix
127 $self->logger->info(
"No transcript scoring matrix found. Will build new one.\n");
129 $self->logger->info(
"Transcript scoring...\n", 0,
'stamped');
130 $matrix = $self->build_scores($matrix, $exon_matrix);
131 $self->logger->info(
"Done.\n\n", 0,
'stamped');
133 # write scoring matrix to file
134 $matrix->write_to_file;
145 my $exon_matrix = shift;
148 $matrix->isa(
'Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
149 throw(
'Need a transcript Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
152 unless ($exon_matrix and
153 $exon_matrix->isa(
'Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
154 throw(
'Need a exon Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
157 # first find out which source and target transcripts have scoring exons and
158 # build a "flag" matrix for these transcripts (all scores are 1)
159 $self->flag_matrix_from_exon_scores($matrix, $exon_matrix);
161 # now calculate the actual scores for the transcripts in the flag matrix
163 $self->score_matrix_from_flag_matrix($matrix, $exon_matrix);
165 return $final_matrix;
169 sub flag_matrix_from_exon_scores {
172 my $exon_matrix = shift;
174 # initialise progress logger
176 my $num_transcripts =
177 scalar(keys %{ $self->cache->get_by_name(
'transcripts_by_id',
'source') });
178 my $progress_id = $self->logger->init_progress($num_transcripts, 100);
180 $self->logger->info(
"Creating flag matrix...\n", 1);
182 # loop over source transcripts
183 foreach my $source_transcript (values %{ $self->cache->get_by_name(
'transcripts_by_id',
'source') }) {
186 $self->logger->log_progress($progress_id, ++$i, 1);
188 # get all exons for the source transcript
189 foreach my $source_exon (@{ $source_transcript->get_all_Exons }) {
191 # get target exons for this source exon from scoring matrix
192 foreach my $target_exon_id (@{ $exon_matrix->get_targets_for_source($source_exon->id) }) {
194 # get target transcripts that contain this exon
195 foreach my $target_transcript (@{ $self->cache->get_by_key(
'transcripts_by_exon_id',
'target', $target_exon_id) }) {
197 # add scoring flag for these two transcripts
198 $matrix->add_score($source_transcript->id, $target_transcript->id, 1);
205 $self->logger->info(
"\n");
211 sub score_matrix_from_flag_matrix {
213 my $flag_matrix = shift;
214 my $exon_matrix = shift;
216 unless ($flag_matrix and
217 $flag_matrix->isa(
'Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
218 throw(
'Need a transcript Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
221 unless ($exon_matrix and
222 $exon_matrix->isa(
'Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
223 throw(
'Need an exon Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
226 my $transcript_score_threshold =
227 $self->conf->param(
'transcript_score_threshold') || 0;
229 # create a new scoring matrix which will replace the flag matrix
231 -DUMP_PATH => $flag_matrix->dump_path,
232 -CACHE_FILE => $flag_matrix->cache_file_name,
235 # initialise progress logger
237 my $num_transcripts =
238 scalar(keys %{ $self->cache->get_by_name(
'transcripts_by_id',
'source') });
239 my $progress_id = $self->logger->init_progress($num_transcripts, 100);
241 $self->logger->info(
"Creating score matrix from flag matrix...\n", 1);
244 my $fmt_d1 =
"%-14s%-15s%-14s%-14s%-14s\n";
247 # loop over source transcripts
248 foreach my $source_transcript (values %{ $self->cache->get_by_name(
'transcripts_by_id',
'source') }) {
251 $self->logger->log_progress($progress_id, ++$i, 1);
253 # We are only interested in scoring with exons that are in the target
254 # transcript. The ScoredMappingMatrix may contain scores for exons that
255 # aren't in this transcript so create a hash of the target transcript's
257 my %source_exons =
map { $_->id => 1 }
258 @{ $source_transcript->get_all_Exons };
260 my $source_transcript_length = $source_transcript->length;
262 # get all corresponding target transcripts from the flag matrix
263 foreach my $target_transcript_id (@{ $flag_matrix->get_targets_for_source($source_transcript->id) }) {
265 my $target_transcript = $self->cache->get_by_key(
'transcripts_by_id',
'target', $target_transcript_id);
267 my $source_transcript_score = 0;
268 my $target_transcript_score = 0;
269 my $target_transcript_length = $target_transcript->length;
271 my %target_exons =
map { $_->id => 1 }
272 @{ $target_transcript->get_all_Exons };
274 # now loop over source exons and find the highest scoring target exon
275 # belonging to the target transcript
277 foreach my $source_exon (@{ $source_transcript->get_all_Exons }) {
279 my $max_source_score = -1;
281 foreach my $target_exon_id (@{ $exon_matrix->get_targets_for_source($source_exon->id) }) {
283 next unless ($target_exons{$target_exon_id});
285 my $score = $exon_matrix->get_score($source_exon->id,
287 $max_source_score = $score
if ($score > $max_source_score);
290 if ($max_source_score > 0) {
291 $source_transcript_score += ($max_source_score * $source_exon->length);
296 # now do the same for target exons
298 foreach my $target_exon (@{ $target_transcript->get_all_Exons }) {
300 my $max_target_score = -1;
302 foreach my $source_exon_id (@{ $exon_matrix->get_sources_for_target($target_exon->id) }) {
304 next unless ($source_exons{$source_exon_id});
306 my $score = $exon_matrix->get_score(
307 $source_exon_id, $target_exon->id);
308 $max_target_score = $score
if ($score > $max_target_score);
311 if ($max_target_score > 0) {
312 $target_transcript_score += ($max_target_score * $target_exon->length);
318 # calculate transcript score and add to scoring matrix
320 if (($source_transcript_length + $target_transcript_length) > 0) {
323 if (($source_transcript_score > $source_transcript_length) or
324 ($target_transcript_score > $target_transcript_length)) {
326 $self->logger->warning(
"Score > length for source ($source_transcript_score <> $source_transcript_length) or target ($target_transcript_score <> $target_transcript_length).\n", 1);
330 my $source_transcript_biotype_group = $self->get_biotype_group($source_transcript->biotype());
331 my $target_transcript_biotype_group = $self->get_biotype_group($target_transcript->biotype());
335 $self->logger->info($source_transcript->id.
":".$target_transcript->id.
336 " source score: $source_transcript_score".
337 " source length: $source_transcript_length".
338 " source biotype:" . $source_transcript->biotype() .
339 " source group: $source_transcript_biotype_group".
340 " target score: $target_transcript_score".
341 " target biotype:" . $target_transcript->biotype() .
342 " target group: $target_transcript_biotype_group".
343 " target length: $target_transcript_length\n");
346 my $transcript_score =
347 ($source_transcript_score + $target_transcript_score) /
348 ($source_transcript_length + $target_transcript_length);
350 ## Add penalty if biotypes are different
351 if ($source_transcript->biotype() ne $target_transcript->biotype()) {
352 $transcript_score = $transcript_score * 0.9;
355 ## Add penalty if biotype groups are different
356 if ($source_transcript_biotype_group ne $target_transcript_biotype_group) {
357 $transcript_score = $transcript_score * 0.8;
360 # everything is fine, add score to matrix
361 if ($transcript_score > $transcript_score_threshold) {
362 $matrix->add_score($source_transcript->id, $target_transcript->id,
370 $self->logger->warning(
"Combined length of source (".$source_transcript->id.
") and target (".$target_transcript->id.
") transcript is zero!\n", 1);
377 $self->logger->info(
"\n");
385 # penalise scores between genes with different biotypes.
386 # entries are modified in place
388 sub biotype_transcript_rescore {
389 my ( $self, $matrix ) = @_;
391 if ( defined($matrix) &&
392 !$matrix->isa(
'Bio::EnsEMBL::IdMapping::ScoredMappingMatrix') )
394 throw(
'Exprected Bio::EnsEMBL::IdMapping::ScoredMappingMatrix');
399 foreach my $entry ( @{ $matrix->get_all_Entries } ) {
401 my $source_transcript =
402 $self->cache->get_by_key(
'transcripts_by_id',
'source',
405 my $target_transcript =
406 $self->cache->get_by_key(
'transcripts_by_id',
'target',
409 if ($source_transcript->biotype() ne $target_transcript->biotype() )
411 # PENALTY: Lower the score for mappings to transcripts of
413 $matrix->set_score( $entry->source(), $entry->target(),
414 0.9*$entry->score() );
419 $self->logger->debug(
"Scored transcripts with biotype mismatch: $i\n",
421 } ## end sub biotype_transcript_rescore
424 sub different_translation_rescore {
429 and $matrix->isa(
'Bio::EnsEMBL::IdMapping::ScoredMappingMatrix') )
431 throw(
'Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
436 foreach my $entry ( sort { $b->score <=> $a->score }
437 @{ $matrix->get_all_Entries } )
440 # we only do this for perfect matches, i.e. transcript score == 1
441 last
if ( $entry->score < 1 );
444 $self->cache->get_by_key(
'transcripts_by_id',
'source',
445 $entry->source )->translation;
447 $self->cache->get_by_key(
'transcripts_by_id',
'target',
448 $entry->target )->translation;
450 # no penalty if both transcripts have no translation
451 next
if ( !$source_tl and !$target_tl );
455 or ( $source_tl->seq ne $target_tl->seq ) )
457 # PENALTY: The transcript stable ID is now on a transcript with a
458 # different translation.
459 $matrix->set_score( $entry->source(), $entry->target(),
460 0.9*$entry->score() );
464 } ## end
foreach my $entry ( sort { ...})
466 $self->logger->debug(
467 "Non-perfect translations on perfect transcripts: $i\n",
469 } ## end sub different_translation_rescore
472 sub non_mapped_gene_rescore {
475 my $gene_mappings = shift;
479 and $matrix->isa(
'Bio::EnsEMBL::IdMapping::ScoredMappingMatrix') )
482 'Need a transcript Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.'
486 unless ( $gene_mappings
487 and $gene_mappings->isa(
'Bio::EnsEMBL::IdMapping::MappingList') )
489 throw(
'Need a gene Bio::EnsEMBL::IdMapping::MappingList.');
492 # create of lookup hash of mapped source genes to target genes
494 map { $_->source => $_->target }
495 @{ $gene_mappings->get_all_Entries };
499 foreach my $entry ( @{ $matrix->get_all_Entries } ) {
502 $self->cache->get_by_key(
'genes_by_transcript_id',
'source',
505 $self->cache->get_by_key(
'genes_by_transcript_id',
'target',
508 my $mapped_target = $gene_lookup{ $source_gene->id };
510 if ( !$mapped_target or ( $mapped_target != $target_gene->id ) ) {
511 # PENALTY: The transcript stable ID has been mapped to an
513 $matrix->set_score( $entry->source(), $entry->target(),
514 0.9*$entry->score() );
519 $self->logger->debug(
"Scored transcripts in non-mapped genes: $i\n",
521 } ## end sub non_mapped_gene_rescore
524 sub get_biotype_group {
525 my ($self, $biotype) = @_;
526 my $dba = $self->cache->get_DBAdaptor(
'target');
527 my $biotype_adaptor = $dba->get_BiotypeAdaptor();
528 my $biotype_object = $biotype_adaptor->fetch_by_name_object_type($biotype,
'transcript');
529 return $biotype_object->biotype_group();