ensembl-hive  2.8.1
TranscriptScoreBuilder.pm
Go to the documentation of this file.
1 =head1 LICENSE
2 
3 See the NOTICE file distributed with this work for additional information
4 regarding copyright ownership.
5 
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9 
10  http://www.apache.org/licenses/LICENSE-2.0
11 
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 
18 =cut
19 
20 
21 =head1 CONTACT
22 
23  Please email comments or questions to the public Ensembl
24  developers list at <http://lists.ensembl.org/mailman/listinfo/dev>.
25 
26  Questions may also be sent to the Ensembl help desk at
27  <http://www.ensembl.org/Help/Contact>.
28 
29 =cut
30 
31 =head1 NAME
32 
33 
34 =head1 SYNOPSIS
35 
36 
37 =head1 DESCRIPTION
38 
39 Combines ExonScoreBuilder, ExonDirectMapper and ExonerateRunner from
40 Java application.
41 
42 =head1 METHODS
43 
44 =cut
45 
46 package Bio::EnsEMBL::IdMapping::TranscriptScoreBuilder;
47 
48 use strict;
49 use warnings;
50 no warnings 'uninitialized';
51 
54 
55 use Bio::EnsEMBL::Utils::Exception qw(throw warning);
56 use Bio::EnsEMBL::Utils::ScriptUtils qw(path_append);
58 
59 
60 sub score_transcripts {
61  my $self = shift;
62  my $exon_matrix = shift;
63 
64  unless ($exon_matrix and
65  $exon_matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
66  throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
67  }
68 
69  $self->logger->info("-- Scoring transcripts...\n\n", 0, 'stamped');
70 
71  # build scores based on exon scores
72  my $matrix = $self->scores_from_exon_scores($exon_matrix);
73 
74  # debug logging
75  if ($self->logger->loglevel eq 'debug') {
76  $matrix->log('transcript', $self->conf->param('basedir'));
77  }
78 
79  # log stats of combined matrix
80  my $fmt = "%-40s%10.0f\n";
81 
82  $self->logger->info("Scoring matrix:\n");
83 
84  $self->logger->info(sprintf($fmt, "Total source transcripts:",
85  $self->cache->get_count_by_name('transcripts_by_id', 'source')), 1);
86 
87  $self->logger->info(sprintf($fmt, "Total target transcripts:",
88  $self->cache->get_count_by_name('transcripts_by_id', 'target')), 1);
89 
90  $self->log_matrix_stats($matrix);
91 
92  $self->logger->info("\nDone with transcript scoring.\n\n");
93 
94  return $matrix;
95 }
96 
97 
98 sub scores_from_exon_scores {
99  my $self = shift;
100  my $exon_matrix = shift;
101 
102  unless ($exon_matrix and
103  $exon_matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
104  throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
105  }
106 
107  my $dump_path = path_append($self->conf->param('basedir'), 'matrix');
108 
110  -DUMP_PATH => $dump_path,
111  -CACHE_FILE => 'transcript_matrix.ser',
112  );
113 
114  my $transcript_cache = $matrix->cache_file;
115 
116  if (-s $transcript_cache) {
117 
118  # read from file
119  $self->logger->info("Reading transcript scoring matrix from file...\n", 0, 'stamped');
120  $self->logger->debug("Cache file $transcript_cache.\n", 1);
121  $matrix->read_from_file;
122  $self->logger->info("Done.\n\n", 0, 'stamped');
123 
124  } else {
125 
126  # build scoring matrix
127  $self->logger->info("No transcript scoring matrix found. Will build new one.\n");
128 
129  $self->logger->info("Transcript scoring...\n", 0, 'stamped');
130  $matrix = $self->build_scores($matrix, $exon_matrix);
131  $self->logger->info("Done.\n\n", 0, 'stamped');
132 
133  # write scoring matrix to file
134  $matrix->write_to_file;
135 
136  }
137 
138  return $matrix;
139 }
140 
141 
142 sub build_scores {
143  my $self = shift;
144  my $matrix = shift;
145  my $exon_matrix = shift;
146 
147  unless ($matrix and
148  $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
149  throw('Need a transcript Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
150  }
151 
152  unless ($exon_matrix and
153  $exon_matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
154  throw('Need a exon Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
155  }
156 
157  # first find out which source and target transcripts have scoring exons and
158  # build a "flag" matrix for these transcripts (all scores are 1)
159  $self->flag_matrix_from_exon_scores($matrix, $exon_matrix);
160 
161  # now calculate the actual scores for the transcripts in the flag matrix
162  my $final_matrix =
163  $self->score_matrix_from_flag_matrix($matrix, $exon_matrix);
164 
165  return $final_matrix;
166 }
167 
168 
169 sub flag_matrix_from_exon_scores {
170  my $self = shift;
171  my $matrix = shift;
172  my $exon_matrix = shift;
173 
174  # initialise progress logger
175  my $i;
176  my $num_transcripts =
177  scalar(keys %{ $self->cache->get_by_name('transcripts_by_id', 'source') });
178  my $progress_id = $self->logger->init_progress($num_transcripts, 100);
179 
180  $self->logger->info("Creating flag matrix...\n", 1);
181 
182  # loop over source transcripts
183  foreach my $source_transcript (values %{ $self->cache->get_by_name('transcripts_by_id', 'source') }) {
184 
185  # log progress
186  $self->logger->log_progress($progress_id, ++$i, 1);
187 
188  # get all exons for the source transcript
189  foreach my $source_exon (@{ $source_transcript->get_all_Exons }) {
190 
191  # get target exons for this source exon from scoring matrix
192  foreach my $target_exon_id (@{ $exon_matrix->get_targets_for_source($source_exon->id) }) {
193 
194  # get target transcripts that contain this exon
195  foreach my $target_transcript (@{ $self->cache->get_by_key('transcripts_by_exon_id', 'target', $target_exon_id) }) {
196 
197  # add scoring flag for these two transcripts
198  $matrix->add_score($source_transcript->id, $target_transcript->id, 1);
199 
200  }
201  }
202  }
203  }
204 
205  $self->logger->info("\n");
206 
207  return $matrix;
208 }
209 
210 
211 sub score_matrix_from_flag_matrix {
212  my $self = shift;
213  my $flag_matrix = shift;
214  my $exon_matrix = shift;
215 
216  unless ($flag_matrix and
217  $flag_matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
218  throw('Need a transcript Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
219  }
220 
221  unless ($exon_matrix and
222  $exon_matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
223  throw('Need an exon Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
224  }
225 
226  my $transcript_score_threshold =
227  $self->conf->param('transcript_score_threshold') || 0;
228 
229  # create a new scoring matrix which will replace the flag matrix
231  -DUMP_PATH => $flag_matrix->dump_path,
232  -CACHE_FILE => $flag_matrix->cache_file_name,
233  );
234 
235  # initialise progress logger
236  my $i;
237  my $num_transcripts =
238  scalar(keys %{ $self->cache->get_by_name('transcripts_by_id', 'source') });
239  my $progress_id = $self->logger->init_progress($num_transcripts, 100);
240 
241  $self->logger->info("Creating score matrix from flag matrix...\n", 1);
242 
243  # debug
244  my $fmt_d1 = "%-14s%-15s%-14s%-14s%-14s\n";
245  my $fmt_d2 = "%.6f";
246 
247  # loop over source transcripts
248  foreach my $source_transcript (values %{ $self->cache->get_by_name('transcripts_by_id', 'source') }) {
249 
250  # log progress
251  $self->logger->log_progress($progress_id, ++$i, 1);
252 
253  # We are only interested in scoring with exons that are in the target
254  # transcript. The ScoredMappingMatrix may contain scores for exons that
255  # aren't in this transcript so create a hash of the target transcript's
256  # exons
257  my %source_exons = map { $_->id => 1 }
258  @{ $source_transcript->get_all_Exons };
259 
260  my $source_transcript_length = $source_transcript->length;
261 
262  # get all corresponding target transcripts from the flag matrix
263  foreach my $target_transcript_id (@{ $flag_matrix->get_targets_for_source($source_transcript->id) }) {
264 
265  my $target_transcript = $self->cache->get_by_key('transcripts_by_id', 'target', $target_transcript_id);
266 
267  my $source_transcript_score = 0;
268  my $target_transcript_score = 0;
269  my $target_transcript_length = $target_transcript->length;
270 
271  my %target_exons = map { $_->id => 1 }
272  @{ $target_transcript->get_all_Exons };
273 
274  # now loop over source exons and find the highest scoring target exon
275  # belonging to the target transcript
276 
277  foreach my $source_exon (@{ $source_transcript->get_all_Exons }) {
278 
279  my $max_source_score = -1;
280 
281  foreach my $target_exon_id (@{ $exon_matrix->get_targets_for_source($source_exon->id) }) {
282 
283  next unless ($target_exons{$target_exon_id});
284 
285  my $score = $exon_matrix->get_score($source_exon->id,
286  $target_exon_id);
287  $max_source_score = $score if ($score > $max_source_score);
288  }
289 
290  if ($max_source_score > 0) {
291  $source_transcript_score += ($max_source_score * $source_exon->length);
292 
293  }
294  }
295 
296  # now do the same for target exons
297 
298  foreach my $target_exon (@{ $target_transcript->get_all_Exons }) {
299 
300  my $max_target_score = -1;
301 
302  foreach my $source_exon_id (@{ $exon_matrix->get_sources_for_target($target_exon->id) }) {
303 
304  next unless ($source_exons{$source_exon_id});
305 
306  my $score = $exon_matrix->get_score(
307  $source_exon_id, $target_exon->id);
308  $max_target_score = $score if ($score > $max_target_score);
309  }
310 
311  if ($max_target_score > 0) {
312  $target_transcript_score += ($max_target_score * $target_exon->length);
313 
314  }
315  }
316 
317  #
318  # calculate transcript score and add to scoring matrix
319  #
320  if (($source_transcript_length + $target_transcript_length) > 0) {
321 
322  # sanity check
323  if (($source_transcript_score > $source_transcript_length) or
324  ($target_transcript_score > $target_transcript_length)) {
325 
326  $self->logger->warning("Score > length for source ($source_transcript_score <> $source_transcript_length) or target ($target_transcript_score <> $target_transcript_length).\n", 1);
327 
328  } else {
329 
330  my $source_transcript_biotype_group = $self->get_biotype_group($source_transcript->biotype());
331  my $target_transcript_biotype_group = $self->get_biotype_group($target_transcript->biotype());
332 
333  # debug
334  $self->logger->info($source_transcript->id.":".$target_transcript->id.
335  " source score: $source_transcript_score".
336  " source length: $source_transcript_length".
337  " source biotype:" . $source_transcript->biotype() .
338  " source group: $source_transcript_biotype_group".
339  " target score: $target_transcript_score".
340  " target biotype:" . $target_transcript->biotype() .
341  " target group: $target_transcript_biotype_group".
342  " target length: $target_transcript_length\n");
343 
344  my $transcript_score =
345  ($source_transcript_score + $target_transcript_score) /
346  ($source_transcript_length + $target_transcript_length);
347 
348 ## Add penalty if biotypes are different
349  if ($source_transcript->biotype() ne $target_transcript->biotype()) {
350  $transcript_score = $transcript_score * 0.9;
351  }
352 
353 ## Add penalty if biotype groups are different
354  if ($source_transcript_biotype_group ne $target_transcript_biotype_group) {
355  $transcript_score = $transcript_score * 0.8;
356  }
357 
358  # everything is fine, add score to matrix
359  if ($transcript_score > $transcript_score_threshold) {
360  $matrix->add_score($source_transcript->id, $target_transcript->id,
361  $transcript_score);
362  }
363 
364  }
365 
366  } else {
367 
368  $self->logger->warning("Combined length of source (".$source_transcript->id.") and target (".$target_transcript->id.") transcript is zero!\n", 1);
369 
370  }
371 
372  }
373  }
374 
375  $self->logger->info("\n");
376 
377  return $matrix;
378 
379 }
380 
381 
382 #
383 # penalise scores between genes with different biotypes.
384 # entries are modified in place
385 #
386 sub biotype_transcript_rescore {
387  my ( $self, $matrix ) = @_;
388 
389  if ( defined($matrix) &&
390  !$matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix') )
391  {
392  throw('Exprected Bio::EnsEMBL::IdMapping::ScoredMappingMatrix');
393  }
394 
395  my $i = 0;
396 
397  foreach my $entry ( @{ $matrix->get_all_Entries } ) {
398 
399  my $source_transcript =
400  $self->cache->get_by_key( 'transcripts_by_id', 'source',
401  $entry->source() );
402 
403  my $target_transcript =
404  $self->cache->get_by_key( 'transcripts_by_id', 'target',
405  $entry->target() );
406 
407  if ($source_transcript->biotype() ne $target_transcript->biotype() )
408  {
409  # PENALTY: Lower the score for mappings to transcripts of
410  # different biotype.
411  $matrix->set_score( $entry->source(), $entry->target(),
412  0.9*$entry->score() );
413  $i++;
414  }
415  }
416 
417  $self->logger->debug("Scored transcripts with biotype mismatch: $i\n",
418  1 );
419 } ## end sub biotype_transcript_rescore
420 
421 
422 sub different_translation_rescore {
423  my $self = shift;
424  my $matrix = shift;
425 
426  unless ($matrix
427  and $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix') )
428  {
429  throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
430  }
431 
432  my $i = 0;
433 
434  foreach my $entry ( sort { $b->score <=> $a->score }
435  @{ $matrix->get_all_Entries } )
436  {
437 
438  # we only do this for perfect matches, i.e. transcript score == 1
439  last if ( $entry->score < 1 );
440 
441  my $source_tl =
442  $self->cache->get_by_key( 'transcripts_by_id', 'source',
443  $entry->source )->translation;
444  my $target_tl =
445  $self->cache->get_by_key( 'transcripts_by_id', 'target',
446  $entry->target )->translation;
447 
448  # no penalty if both transcripts have no translation
449  next if ( !$source_tl and !$target_tl );
450 
451  if ( !$source_tl
452  or !$target_tl
453  or ( $source_tl->seq ne $target_tl->seq ) )
454  {
455  # PENALTY: The transcript stable ID is now on a transcript with a
456  # different translation.
457  $matrix->set_score( $entry->source(), $entry->target(),
458  0.9*$entry->score() );
459  $i++;
460  }
461 
462  } ## end foreach my $entry ( sort { ...})
463 
464  $self->logger->debug(
465  "Non-perfect translations on perfect transcripts: $i\n",
466  1 );
467 } ## end sub different_translation_rescore
468 
469 
470 sub non_mapped_gene_rescore {
471  my $self = shift;
472  my $matrix = shift;
473  my $gene_mappings = shift;
474 
475  # argument checks
476  unless ($matrix
477  and $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix') )
478  {
479  throw(
480  'Need a transcript Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.'
481  );
482  }
483 
484  unless ( $gene_mappings
485  and $gene_mappings->isa('Bio::EnsEMBL::IdMapping::MappingList') )
486  {
487  throw('Need a gene Bio::EnsEMBL::IdMapping::MappingList.');
488  }
489 
490  # create of lookup hash of mapped source genes to target genes
491  my %gene_lookup =
492  map { $_->source => $_->target }
493  @{ $gene_mappings->get_all_Entries };
494 
495  my $i = 0;
496 
497  foreach my $entry ( @{ $matrix->get_all_Entries } ) {
498 
499  my $source_gene =
500  $self->cache->get_by_key( 'genes_by_transcript_id', 'source',
501  $entry->source );
502  my $target_gene =
503  $self->cache->get_by_key( 'genes_by_transcript_id', 'target',
504  $entry->target );
505 
506  my $mapped_target = $gene_lookup{ $source_gene->id };
507 
508  if ( !$mapped_target or ( $mapped_target != $target_gene->id ) ) {
509  # PENALTY: The transcript stable ID has been mapped to an
510  # un-mapped gene.
511  $matrix->set_score( $entry->source(), $entry->target(),
512  0.9*$entry->score() );
513  $i++;
514  }
515  }
516 
517  $self->logger->debug( "Scored transcripts in non-mapped genes: $i\n",
518  1 );
519 } ## end sub non_mapped_gene_rescore
520 
521 
522 sub get_biotype_group {
523  my ($self, $biotype) = @_;
524  my $dba = $self->cache->get_DBAdaptor('target');
525  my $biotype_adaptor = $dba->get_BiotypeAdaptor();
526  my $biotype_object = $biotype_adaptor->fetch_by_name_object_type($biotype, 'transcript');
527  return $biotype_object->biotype_group();
528 
529 }
530 
531 
532 1;
533 
map
public map()
Bio::EnsEMBL::Utils::ScriptUtils
Definition: ScriptUtils.pm:11
Bio::EnsEMBL::IdMapping::ScoredMappingMatrix
Definition: ScoredMappingMatrix.pm:44
Bio::EnsEMBL::IdMapping::ScoreBuilder
Definition: ScoreBuilder.pm:22
Bio::EnsEMBL::IdMapping::Serialisable::cache_file
public String cache_file()
Bio::EnsEMBL::IdMapping::ScoredMappingMatrix::new
public Bio::EnsEMBL::IdMapping::ScoredMappingMatrix new()
Bio::EnsEMBL::IdMapping::ScoredMappingMatrix::log
public void log()
build_scores
public build_scores()
Bio::EnsEMBL::Utils::Exception
Definition: Exception.pm:68
Bio::EnsEMBL::IdMapping::ExonScoreBuilder
Definition: ExonScoreBuilder.pm:19