ensembl-hive  2.6
TranscriptScoreBuilder.pm
Go to the documentation of this file.
1 =head1 LICENSE
2 
3 Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
4 Copyright [2016-2024] EMBL-European Bioinformatics Institute
5 
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9 
10  http://www.apache.org/licenses/LICENSE-2.0
11 
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 
18 =cut
19 
20 
21 =head1 CONTACT
22 
23  Please email comments or questions to the public Ensembl
24  developers list at <http://lists.ensembl.org/mailman/listinfo/dev>.
25 
26  Questions may also be sent to the Ensembl help desk at
27  <http://www.ensembl.org/Help/Contact>.
28 
29 =cut
30 
31 =head1 NAME
32 
33 
34 =head1 SYNOPSIS
35 
36 
37 =head1 DESCRIPTION
38 
39 Combines ExonScoreBuilder, ExonDirectMapper and ExonerateRunner from
40 Java application.
41 
42 =head1 METHODS
43 
44 =cut
45 
46 package Bio::EnsEMBL::IdMapping::TranscriptScoreBuilder;
47 
48 use strict;
49 use warnings;
50 no warnings 'uninitialized';
51 
54 
55 use Bio::EnsEMBL::Utils::Exception qw(throw warning);
56 use Bio::EnsEMBL::Utils::ScriptUtils qw(path_append);
58 
59 
60 sub score_transcripts {
61  my $self = shift;
62  my $exon_matrix = shift;
63 
64  unless ($exon_matrix and
65  $exon_matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
66  throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
67  }
68 
69  $self->logger->info("-- Scoring transcripts...\n\n", 0, 'stamped');
70 
71  # build scores based on exon scores
72  my $matrix = $self->scores_from_exon_scores($exon_matrix);
73 
74  # debug logging
75  if ($self->logger->loglevel eq 'debug') {
76  $matrix->log('transcript', $self->conf->param('basedir'));
77  }
78 
79  # log stats of combined matrix
80  my $fmt = "%-40s%10.0f\n";
81 
82  $self->logger->info("Scoring matrix:\n");
83 
84  $self->logger->info(sprintf($fmt, "Total source transcripts:",
85  $self->cache->get_count_by_name('transcripts_by_id', 'source')), 1);
86 
87  $self->logger->info(sprintf($fmt, "Total target transcripts:",
88  $self->cache->get_count_by_name('transcripts_by_id', 'target')), 1);
89 
90  $self->log_matrix_stats($matrix);
91 
92  $self->logger->info("\nDone with transcript scoring.\n\n");
93 
94  return $matrix;
95 }
96 
97 
98 sub scores_from_exon_scores {
99  my $self = shift;
100  my $exon_matrix = shift;
101 
102  unless ($exon_matrix and
103  $exon_matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
104  throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
105  }
106 
107  my $dump_path = path_append($self->conf->param('basedir'), 'matrix');
108 
110  -DUMP_PATH => $dump_path,
111  -CACHE_FILE => 'transcript_matrix.ser',
112  );
113 
114  my $transcript_cache = $matrix->cache_file;
115 
116  if (-s $transcript_cache) {
117 
118  # read from file
119  $self->logger->info("Reading transcript scoring matrix from file...\n", 0, 'stamped');
120  $self->logger->debug("Cache file $transcript_cache.\n", 1);
121  $matrix->read_from_file;
122  $self->logger->info("Done.\n\n", 0, 'stamped');
123 
124  } else {
125 
126  # build scoring matrix
127  $self->logger->info("No transcript scoring matrix found. Will build new one.\n");
128 
129  $self->logger->info("Transcript scoring...\n", 0, 'stamped');
130  $matrix = $self->build_scores($matrix, $exon_matrix);
131  $self->logger->info("Done.\n\n", 0, 'stamped');
132 
133  # write scoring matrix to file
134  $matrix->write_to_file;
135 
136  }
137 
138  return $matrix;
139 }
140 
141 
142 sub build_scores {
143  my $self = shift;
144  my $matrix = shift;
145  my $exon_matrix = shift;
146 
147  unless ($matrix and
148  $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
149  throw('Need a transcript Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
150  }
151 
152  unless ($exon_matrix and
153  $exon_matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
154  throw('Need a exon Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
155  }
156 
157  # first find out which source and target transcripts have scoring exons and
158  # build a "flag" matrix for these transcripts (all scores are 1)
159  $self->flag_matrix_from_exon_scores($matrix, $exon_matrix);
160 
161  # now calculate the actual scores for the transcripts in the flag matrix
162  my $final_matrix =
163  $self->score_matrix_from_flag_matrix($matrix, $exon_matrix);
164 
165  return $final_matrix;
166 }
167 
168 
169 sub flag_matrix_from_exon_scores {
170  my $self = shift;
171  my $matrix = shift;
172  my $exon_matrix = shift;
173 
174  # initialise progress logger
175  my $i;
176  my $num_transcripts =
177  scalar(keys %{ $self->cache->get_by_name('transcripts_by_id', 'source') });
178  my $progress_id = $self->logger->init_progress($num_transcripts, 100);
179 
180  $self->logger->info("Creating flag matrix...\n", 1);
181 
182  # loop over source transcripts
183  foreach my $source_transcript (values %{ $self->cache->get_by_name('transcripts_by_id', 'source') }) {
184 
185  # log progress
186  $self->logger->log_progress($progress_id, ++$i, 1);
187 
188  # get all exons for the source transcript
189  foreach my $source_exon (@{ $source_transcript->get_all_Exons }) {
190 
191  # get target exons for this source exon from scoring matrix
192  foreach my $target_exon_id (@{ $exon_matrix->get_targets_for_source($source_exon->id) }) {
193 
194  # get target transcripts that contain this exon
195  foreach my $target_transcript (@{ $self->cache->get_by_key('transcripts_by_exon_id', 'target', $target_exon_id) }) {
196 
197  # add scoring flag for these two transcripts
198  $matrix->add_score($source_transcript->id, $target_transcript->id, 1);
199 
200  }
201  }
202  }
203  }
204 
205  $self->logger->info("\n");
206 
207  return $matrix;
208 }
209 
210 
211 sub score_matrix_from_flag_matrix {
212  my $self = shift;
213  my $flag_matrix = shift;
214  my $exon_matrix = shift;
215 
216  unless ($flag_matrix and
217  $flag_matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
218  throw('Need a transcript Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
219  }
220 
221  unless ($exon_matrix and
222  $exon_matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
223  throw('Need an exon Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
224  }
225 
226  my $transcript_score_threshold =
227  $self->conf->param('transcript_score_threshold') || 0;
228 
229  # create a new scoring matrix which will replace the flag matrix
231  -DUMP_PATH => $flag_matrix->dump_path,
232  -CACHE_FILE => $flag_matrix->cache_file_name,
233  );
234 
235  # initialise progress logger
236  my $i;
237  my $num_transcripts =
238  scalar(keys %{ $self->cache->get_by_name('transcripts_by_id', 'source') });
239  my $progress_id = $self->logger->init_progress($num_transcripts, 100);
240 
241  $self->logger->info("Creating score matrix from flag matrix...\n", 1);
242 
243  # debug
244  my $fmt_d1 = "%-14s%-15s%-14s%-14s%-14s\n";
245  my $fmt_d2 = "%.6f";
246 
247  # loop over source transcripts
248  foreach my $source_transcript (values %{ $self->cache->get_by_name('transcripts_by_id', 'source') }) {
249 
250  # log progress
251  $self->logger->log_progress($progress_id, ++$i, 1);
252 
253  # We are only interested in scoring with exons that are in the target
254  # transcript. The ScoredMappingMatrix may contain scores for exons that
255  # aren't in this transcript so create a hash of the target transcript's
256  # exons
257  my %source_exons = map { $_->id => 1 }
258  @{ $source_transcript->get_all_Exons };
259 
260  my $source_transcript_length = $source_transcript->length;
261 
262  # get all corresponding target transcripts from the flag matrix
263  foreach my $target_transcript_id (@{ $flag_matrix->get_targets_for_source($source_transcript->id) }) {
264 
265  my $target_transcript = $self->cache->get_by_key('transcripts_by_id', 'target', $target_transcript_id);
266 
267  my $source_transcript_score = 0;
268  my $target_transcript_score = 0;
269  my $target_transcript_length = $target_transcript->length;
270 
271  my %target_exons = map { $_->id => 1 }
272  @{ $target_transcript->get_all_Exons };
273 
274  # now loop over source exons and find the highest scoring target exon
275  # belonging to the target transcript
276 
277  foreach my $source_exon (@{ $source_transcript->get_all_Exons }) {
278 
279  my $max_source_score = -1;
280 
281  foreach my $target_exon_id (@{ $exon_matrix->get_targets_for_source($source_exon->id) }) {
282 
283  next unless ($target_exons{$target_exon_id});
284 
285  my $score = $exon_matrix->get_score($source_exon->id,
286  $target_exon_id);
287  $max_source_score = $score if ($score > $max_source_score);
288  }
289 
290  if ($max_source_score > 0) {
291  $source_transcript_score += ($max_source_score * $source_exon->length);
292 
293  }
294  }
295 
296  # now do the same for target exons
297 
298  foreach my $target_exon (@{ $target_transcript->get_all_Exons }) {
299 
300  my $max_target_score = -1;
301 
302  foreach my $source_exon_id (@{ $exon_matrix->get_sources_for_target($target_exon->id) }) {
303 
304  next unless ($source_exons{$source_exon_id});
305 
306  my $score = $exon_matrix->get_score(
307  $source_exon_id, $target_exon->id);
308  $max_target_score = $score if ($score > $max_target_score);
309  }
310 
311  if ($max_target_score > 0) {
312  $target_transcript_score += ($max_target_score * $target_exon->length);
313 
314  }
315  }
316 
317  #
318  # calculate transcript score and add to scoring matrix
319  #
320  if (($source_transcript_length + $target_transcript_length) > 0) {
321 
322  # sanity check
323  if (($source_transcript_score > $source_transcript_length) or
324  ($target_transcript_score > $target_transcript_length)) {
325 
326  $self->logger->warning("Score > length for source ($source_transcript_score <> $source_transcript_length) or target ($target_transcript_score <> $target_transcript_length).\n", 1);
327 
328  } else {
329 
330  my $source_transcript_biotype_group = $self->get_biotype_group($source_transcript->biotype());
331  my $target_transcript_biotype_group = $self->get_biotype_group($target_transcript->biotype());
332 
333 =cut
334  # debug
335  $self->logger->info($source_transcript->id.":".$target_transcript->id.
336  " source score: $source_transcript_score".
337  " source length: $source_transcript_length".
338  " source biotype:" . $source_transcript->biotype() .
339  " source group: $source_transcript_biotype_group".
340  " target score: $target_transcript_score".
341  " target biotype:" . $target_transcript->biotype() .
342  " target group: $target_transcript_biotype_group".
343  " target length: $target_transcript_length\n");
344 =cut
345 
346  my $transcript_score =
347  ($source_transcript_score + $target_transcript_score) /
348  ($source_transcript_length + $target_transcript_length);
349 
350 ## Add penalty if biotypes are different
351  if ($source_transcript->biotype() ne $target_transcript->biotype()) {
352  $transcript_score = $transcript_score * 0.9;
353  }
354 
355 ## Add penalty if biotype groups are different
356  if ($source_transcript_biotype_group ne $target_transcript_biotype_group) {
357  $transcript_score = $transcript_score * 0.8;
358  }
359 
360  # everything is fine, add score to matrix
361  if ($transcript_score > $transcript_score_threshold) {
362  $matrix->add_score($source_transcript->id, $target_transcript->id,
363  $transcript_score);
364  }
365 
366  }
367 
368  } else {
369 
370  $self->logger->warning("Combined length of source (".$source_transcript->id.") and target (".$target_transcript->id.") transcript is zero!\n", 1);
371 
372  }
373 
374  }
375  }
376 
377  $self->logger->info("\n");
378 
379  return $matrix;
380 
381 }
382 
383 
384 #
385 # penalise scores between genes with different biotypes.
386 # entries are modified in place
387 #
388 sub biotype_transcript_rescore {
389  my ( $self, $matrix ) = @_;
390 
391  if ( defined($matrix) &&
392  !$matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix') )
393  {
394  throw('Exprected Bio::EnsEMBL::IdMapping::ScoredMappingMatrix');
395  }
396 
397  my $i = 0;
398 
399  foreach my $entry ( @{ $matrix->get_all_Entries } ) {
400 
401  my $source_transcript =
402  $self->cache->get_by_key( 'transcripts_by_id', 'source',
403  $entry->source() );
404 
405  my $target_transcript =
406  $self->cache->get_by_key( 'transcripts_by_id', 'target',
407  $entry->target() );
408 
409  if ($source_transcript->biotype() ne $target_transcript->biotype() )
410  {
411  # PENALTY: Lower the score for mappings to transcripts of
412  # different biotype.
413  $matrix->set_score( $entry->source(), $entry->target(),
414  0.9*$entry->score() );
415  $i++;
416  }
417  }
418 
419  $self->logger->debug("Scored transcripts with biotype mismatch: $i\n",
420  1 );
421 } ## end sub biotype_transcript_rescore
422 
423 
424 sub different_translation_rescore {
425  my $self = shift;
426  my $matrix = shift;
427 
428  unless ($matrix
429  and $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix') )
430  {
431  throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
432  }
433 
434  my $i = 0;
435 
436  foreach my $entry ( sort { $b->score <=> $a->score }
437  @{ $matrix->get_all_Entries } )
438  {
439 
440  # we only do this for perfect matches, i.e. transcript score == 1
441  last if ( $entry->score < 1 );
442 
443  my $source_tl =
444  $self->cache->get_by_key( 'transcripts_by_id', 'source',
445  $entry->source )->translation;
446  my $target_tl =
447  $self->cache->get_by_key( 'transcripts_by_id', 'target',
448  $entry->target )->translation;
449 
450  # no penalty if both transcripts have no translation
451  next if ( !$source_tl and !$target_tl );
452 
453  if ( !$source_tl
454  or !$target_tl
455  or ( $source_tl->seq ne $target_tl->seq ) )
456  {
457  # PENALTY: The transcript stable ID is now on a transcript with a
458  # different translation.
459  $matrix->set_score( $entry->source(), $entry->target(),
460  0.9*$entry->score() );
461  $i++;
462  }
463 
464  } ## end foreach my $entry ( sort { ...})
465 
466  $self->logger->debug(
467  "Non-perfect translations on perfect transcripts: $i\n",
468  1 );
469 } ## end sub different_translation_rescore
470 
471 
472 sub non_mapped_gene_rescore {
473  my $self = shift;
474  my $matrix = shift;
475  my $gene_mappings = shift;
476 
477  # argument checks
478  unless ($matrix
479  and $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix') )
480  {
481  throw(
482  'Need a transcript Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.'
483  );
484  }
485 
486  unless ( $gene_mappings
487  and $gene_mappings->isa('Bio::EnsEMBL::IdMapping::MappingList') )
488  {
489  throw('Need a gene Bio::EnsEMBL::IdMapping::MappingList.');
490  }
491 
492  # create of lookup hash of mapped source genes to target genes
493  my %gene_lookup =
494  map { $_->source => $_->target }
495  @{ $gene_mappings->get_all_Entries };
496 
497  my $i = 0;
498 
499  foreach my $entry ( @{ $matrix->get_all_Entries } ) {
500 
501  my $source_gene =
502  $self->cache->get_by_key( 'genes_by_transcript_id', 'source',
503  $entry->source );
504  my $target_gene =
505  $self->cache->get_by_key( 'genes_by_transcript_id', 'target',
506  $entry->target );
507 
508  my $mapped_target = $gene_lookup{ $source_gene->id };
509 
510  if ( !$mapped_target or ( $mapped_target != $target_gene->id ) ) {
511  # PENALTY: The transcript stable ID has been mapped to an
512  # un-mapped gene.
513  $matrix->set_score( $entry->source(), $entry->target(),
514  0.9*$entry->score() );
515  $i++;
516  }
517  }
518 
519  $self->logger->debug( "Scored transcripts in non-mapped genes: $i\n",
520  1 );
521 } ## end sub non_mapped_gene_rescore
522 
523 
524 sub get_biotype_group {
525  my ($self, $biotype) = @_;
526  my $dba = $self->cache->get_DBAdaptor('target');
527  my $biotype_adaptor = $dba->get_BiotypeAdaptor();
528  my $biotype_object = $biotype_adaptor->fetch_by_name_object_type($biotype, 'transcript');
529  return $biotype_object->biotype_group();
530 
531 }
532 
533 
534 1;
535 
map
public map()
Bio::EnsEMBL::Utils::ScriptUtils
Definition: ScriptUtils.pm:11
Bio::EnsEMBL::IdMapping::ScoredMappingMatrix
Definition: ScoredMappingMatrix.pm:44
Bio::EnsEMBL::IdMapping::ScoreBuilder
Definition: ScoreBuilder.pm:22
Bio::EnsEMBL::IdMapping::Serialisable::cache_file
public String cache_file()
Bio::EnsEMBL::IdMapping::ScoredMappingMatrix::new
public Bio::EnsEMBL::IdMapping::ScoredMappingMatrix new()
Bio::EnsEMBL::IdMapping::ScoredMappingMatrix::log
public void log()
build_scores
public build_scores()
Bio::EnsEMBL::Utils::Exception
Definition: Exception.pm:68
Bio::EnsEMBL::IdMapping::ExonScoreBuilder
Definition: ExonScoreBuilder.pm:19