3 See the NOTICE file distributed with
this work
for additional information
4 regarding copyright ownership.
6 Licensed under the Apache License, Version 2.0 (the
"License");
7 you may not use
this file except in compliance with the License.
8 You may obtain a copy of the License at
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an
"AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License
for the specific language governing permissions and
16 limitations under the License.
23 Please email comments or questions to the
public Ensembl
24 developers list at <http:
26 Questions may also be sent to the Ensembl help desk at
46 package Bio::EnsEMBL::IdMapping::TranscriptScoreBuilder;
50 no warnings
'uninitialized';
60 sub score_transcripts {
62 my $exon_matrix = shift;
64 unless ($exon_matrix and
65 $exon_matrix->isa(
'Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
66 throw(
'Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
69 $self->logger->info(
"-- Scoring transcripts...\n\n", 0,
'stamped');
71 # build scores based on exon scores
72 my $matrix = $self->scores_from_exon_scores($exon_matrix);
75 if ($self->logger->loglevel eq
'debug') {
76 $matrix->
log(
'transcript', $self->conf->param(
'basedir'));
79 # log stats of combined matrix
80 my $fmt =
"%-40s%10.0f\n";
82 $self->logger->info(
"Scoring matrix:\n");
84 $self->logger->info(sprintf($fmt,
"Total source transcripts:",
85 $self->cache->get_count_by_name(
'transcripts_by_id',
'source')), 1);
87 $self->logger->info(sprintf($fmt,
"Total target transcripts:",
88 $self->cache->get_count_by_name(
'transcripts_by_id',
'target')), 1);
90 $self->log_matrix_stats($matrix);
92 $self->logger->info(
"\nDone with transcript scoring.\n\n");
98 sub scores_from_exon_scores {
100 my $exon_matrix = shift;
102 unless ($exon_matrix and
103 $exon_matrix->isa(
'Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
104 throw(
'Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
107 my $dump_path = path_append($self->conf->param(
'basedir'),
'matrix');
110 -DUMP_PATH => $dump_path,
111 -CACHE_FILE =>
'transcript_matrix.ser',
116 if (-s $transcript_cache) {
119 $self->logger->info(
"Reading transcript scoring matrix from file...\n", 0,
'stamped');
120 $self->logger->debug(
"Cache file $transcript_cache.\n", 1);
121 $matrix->read_from_file;
122 $self->logger->info(
"Done.\n\n", 0,
'stamped');
126 # build scoring matrix
127 $self->logger->info(
"No transcript scoring matrix found. Will build new one.\n");
129 $self->logger->info(
"Transcript scoring...\n", 0,
'stamped');
130 $matrix = $self->build_scores($matrix, $exon_matrix);
131 $self->logger->info(
"Done.\n\n", 0,
'stamped');
133 # write scoring matrix to file
134 $matrix->write_to_file;
145 my $exon_matrix = shift;
148 $matrix->isa(
'Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
149 throw(
'Need a transcript Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
152 unless ($exon_matrix and
153 $exon_matrix->isa(
'Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
154 throw(
'Need a exon Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
157 # first find out which source and target transcripts have scoring exons and
158 # build a "flag" matrix for these transcripts (all scores are 1)
159 $self->flag_matrix_from_exon_scores($matrix, $exon_matrix);
161 # now calculate the actual scores for the transcripts in the flag matrix
163 $self->score_matrix_from_flag_matrix($matrix, $exon_matrix);
165 return $final_matrix;
169 sub flag_matrix_from_exon_scores {
172 my $exon_matrix = shift;
174 # initialise progress logger
176 my $num_transcripts =
177 scalar(keys %{ $self->cache->get_by_name(
'transcripts_by_id',
'source') });
178 my $progress_id = $self->logger->init_progress($num_transcripts, 100);
180 $self->logger->info(
"Creating flag matrix...\n", 1);
182 # loop over source transcripts
183 foreach my $source_transcript (values %{ $self->cache->get_by_name(
'transcripts_by_id',
'source') }) {
186 $self->logger->log_progress($progress_id, ++$i, 1);
188 # get all exons for the source transcript
189 foreach my $source_exon (@{ $source_transcript->get_all_Exons }) {
191 # get target exons for this source exon from scoring matrix
192 foreach my $target_exon_id (@{ $exon_matrix->get_targets_for_source($source_exon->id) }) {
194 # get target transcripts that contain this exon
195 foreach my $target_transcript (@{ $self->cache->get_by_key(
'transcripts_by_exon_id',
'target', $target_exon_id) }) {
197 # add scoring flag for these two transcripts
198 $matrix->add_score($source_transcript->id, $target_transcript->id, 1);
205 $self->logger->info(
"\n");
211 sub score_matrix_from_flag_matrix {
213 my $flag_matrix = shift;
214 my $exon_matrix = shift;
216 unless ($flag_matrix and
217 $flag_matrix->isa(
'Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
218 throw(
'Need a transcript Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
221 unless ($exon_matrix and
222 $exon_matrix->isa(
'Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
223 throw(
'Need an exon Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
226 my $transcript_score_threshold =
227 $self->conf->param(
'transcript_score_threshold') || 0;
229 # create a new scoring matrix which will replace the flag matrix
231 -DUMP_PATH => $flag_matrix->dump_path,
232 -CACHE_FILE => $flag_matrix->cache_file_name,
235 # initialise progress logger
237 my $num_transcripts =
238 scalar(keys %{ $self->cache->get_by_name(
'transcripts_by_id',
'source') });
239 my $progress_id = $self->logger->init_progress($num_transcripts, 100);
241 $self->logger->info(
"Creating score matrix from flag matrix...\n", 1);
244 my $fmt_d1 =
"%-14s%-15s%-14s%-14s%-14s\n";
247 # loop over source transcripts
248 foreach my $source_transcript (values %{ $self->cache->get_by_name(
'transcripts_by_id',
'source') }) {
251 $self->logger->log_progress($progress_id, ++$i, 1);
253 # We are only interested in scoring with exons that are in the target
254 # transcript. The ScoredMappingMatrix may contain scores for exons that
255 # aren't in this transcript so create a hash of the target transcript's
257 my %source_exons =
map { $_->id => 1 }
258 @{ $source_transcript->get_all_Exons };
260 my $source_transcript_length = $source_transcript->length;
262 # get all corresponding target transcripts from the flag matrix
263 foreach my $target_transcript_id (@{ $flag_matrix->get_targets_for_source($source_transcript->id) }) {
265 my $target_transcript = $self->cache->get_by_key(
'transcripts_by_id',
'target', $target_transcript_id);
267 my $source_transcript_score = 0;
268 my $target_transcript_score = 0;
269 my $target_transcript_length = $target_transcript->length;
271 my %target_exons =
map { $_->id => 1 }
272 @{ $target_transcript->get_all_Exons };
274 # now loop over source exons and find the highest scoring target exon
275 # belonging to the target transcript
277 foreach my $source_exon (@{ $source_transcript->get_all_Exons }) {
279 my $max_source_score = -1;
281 foreach my $target_exon_id (@{ $exon_matrix->get_targets_for_source($source_exon->id) }) {
283 next unless ($target_exons{$target_exon_id});
285 my $score = $exon_matrix->get_score($source_exon->id,
287 $max_source_score = $score
if ($score > $max_source_score);
290 if ($max_source_score > 0) {
291 $source_transcript_score += ($max_source_score * $source_exon->length);
296 # now do the same for target exons
298 foreach my $target_exon (@{ $target_transcript->get_all_Exons }) {
300 my $max_target_score = -1;
302 foreach my $source_exon_id (@{ $exon_matrix->get_sources_for_target($target_exon->id) }) {
304 next unless ($source_exons{$source_exon_id});
306 my $score = $exon_matrix->get_score(
307 $source_exon_id, $target_exon->id);
308 $max_target_score = $score
if ($score > $max_target_score);
311 if ($max_target_score > 0) {
312 $target_transcript_score += ($max_target_score * $target_exon->length);
318 # calculate transcript score and add to scoring matrix
320 if (($source_transcript_length + $target_transcript_length) > 0) {
323 if (($source_transcript_score > $source_transcript_length) or
324 ($target_transcript_score > $target_transcript_length)) {
326 $self->logger->warning(
"Score > length for source ($source_transcript_score <> $source_transcript_length) or target ($target_transcript_score <> $target_transcript_length).\n", 1);
330 my $source_transcript_biotype_group = $self->get_biotype_group($source_transcript->biotype());
331 my $target_transcript_biotype_group = $self->get_biotype_group($target_transcript->biotype());
334 $self->logger->info($source_transcript->id.
":".$target_transcript->id.
335 " source score: $source_transcript_score".
336 " source length: $source_transcript_length".
337 " source biotype:" . $source_transcript->biotype() .
338 " source group: $source_transcript_biotype_group".
339 " target score: $target_transcript_score".
340 " target biotype:" . $target_transcript->biotype() .
341 " target group: $target_transcript_biotype_group".
342 " target length: $target_transcript_length\n");
344 my $transcript_score =
345 ($source_transcript_score + $target_transcript_score) /
346 ($source_transcript_length + $target_transcript_length);
348 ## Add penalty if biotypes are different
349 if ($source_transcript->biotype() ne $target_transcript->biotype()) {
350 $transcript_score = $transcript_score * 0.9;
353 ## Add penalty if biotype groups are different
354 if ($source_transcript_biotype_group ne $target_transcript_biotype_group) {
355 $transcript_score = $transcript_score * 0.8;
358 # everything is fine, add score to matrix
359 if ($transcript_score > $transcript_score_threshold) {
360 $matrix->add_score($source_transcript->id, $target_transcript->id,
368 $self->logger->warning(
"Combined length of source (".$source_transcript->id.
") and target (".$target_transcript->id.
") transcript is zero!\n", 1);
375 $self->logger->info(
"\n");
383 # penalise scores between genes with different biotypes.
384 # entries are modified in place
386 sub biotype_transcript_rescore {
387 my ( $self, $matrix ) = @_;
389 if ( defined($matrix) &&
390 !$matrix->isa(
'Bio::EnsEMBL::IdMapping::ScoredMappingMatrix') )
392 throw(
'Exprected Bio::EnsEMBL::IdMapping::ScoredMappingMatrix');
397 foreach my $entry ( @{ $matrix->get_all_Entries } ) {
399 my $source_transcript =
400 $self->cache->get_by_key(
'transcripts_by_id',
'source',
403 my $target_transcript =
404 $self->cache->get_by_key(
'transcripts_by_id',
'target',
407 if ($source_transcript->biotype() ne $target_transcript->biotype() )
409 # PENALTY: Lower the score for mappings to transcripts of
411 $matrix->set_score( $entry->source(), $entry->target(),
412 0.9*$entry->score() );
417 $self->logger->debug(
"Scored transcripts with biotype mismatch: $i\n",
419 } ## end sub biotype_transcript_rescore
422 sub different_translation_rescore {
427 and $matrix->isa(
'Bio::EnsEMBL::IdMapping::ScoredMappingMatrix') )
429 throw(
'Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
434 foreach my $entry ( sort { $b->score <=> $a->score }
435 @{ $matrix->get_all_Entries } )
438 # we only do this for perfect matches, i.e. transcript score == 1
439 last
if ( $entry->score < 1 );
442 $self->cache->get_by_key(
'transcripts_by_id',
'source',
443 $entry->source )->translation;
445 $self->cache->get_by_key(
'transcripts_by_id',
'target',
446 $entry->target )->translation;
448 # no penalty if both transcripts have no translation
449 next
if ( !$source_tl and !$target_tl );
453 or ( $source_tl->seq ne $target_tl->seq ) )
455 # PENALTY: The transcript stable ID is now on a transcript with a
456 # different translation.
457 $matrix->set_score( $entry->source(), $entry->target(),
458 0.9*$entry->score() );
462 } ## end
foreach my $entry ( sort { ...})
464 $self->logger->debug(
465 "Non-perfect translations on perfect transcripts: $i\n",
467 } ## end sub different_translation_rescore
470 sub non_mapped_gene_rescore {
473 my $gene_mappings = shift;
477 and $matrix->isa(
'Bio::EnsEMBL::IdMapping::ScoredMappingMatrix') )
480 'Need a transcript Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.'
484 unless ( $gene_mappings
485 and $gene_mappings->isa(
'Bio::EnsEMBL::IdMapping::MappingList') )
487 throw(
'Need a gene Bio::EnsEMBL::IdMapping::MappingList.');
490 # create of lookup hash of mapped source genes to target genes
492 map { $_->source => $_->target }
493 @{ $gene_mappings->get_all_Entries };
497 foreach my $entry ( @{ $matrix->get_all_Entries } ) {
500 $self->cache->get_by_key(
'genes_by_transcript_id',
'source',
503 $self->cache->get_by_key(
'genes_by_transcript_id',
'target',
506 my $mapped_target = $gene_lookup{ $source_gene->id };
508 if ( !$mapped_target or ( $mapped_target != $target_gene->id ) ) {
509 # PENALTY: The transcript stable ID has been mapped to an
511 $matrix->set_score( $entry->source(), $entry->target(),
512 0.9*$entry->score() );
517 $self->logger->debug(
"Scored transcripts in non-mapped genes: $i\n",
519 } ## end sub non_mapped_gene_rescore
522 sub get_biotype_group {
523 my ($self, $biotype) = @_;
524 my $dba = $self->cache->get_DBAdaptor(
'target');
525 my $biotype_adaptor = $dba->get_BiotypeAdaptor();
526 my $biotype_object = $biotype_adaptor->fetch_by_name_object_type($biotype,
'transcript');
527 return $biotype_object->biotype_group();