ensembl-hive  2.6
ExonScoreBuilder.pm
Go to the documentation of this file.
1 =head1 LICENSE
2 
3 Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
4 Copyright [2016-2024] EMBL-European Bioinformatics Institute
5 
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9 
10  http://www.apache.org/licenses/LICENSE-2.0
11 
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 
18 =cut
19 
20 
21 =head1 CONTACT
22 
23  Please email comments or questions to the public Ensembl
24  developers list at <http://lists.ensembl.org/mailman/listinfo/dev>.
25 
26  Questions may also be sent to the Ensembl help desk at
27  <http://www.ensembl.org/Help/Contact>.
28 
29 =cut
30 
31 =head1 NAME
32 
33 =head1 SYNOPSIS
34 
35 =head1 DESCRIPTION
36 
37 Combines ExonScoreBuilder, ExonDirectMapper and ExonerateRunner from
38 Java application.
39 
40 =head1 METHODS
41 
42 =cut
43 
44 package Bio::EnsEMBL::IdMapping::ExonScoreBuilder;
45 
46 use strict;
47 use warnings;
48 no warnings 'uninitialized';
49 
52 
53 use Bio::EnsEMBL::Utils::Argument qw(rearrange);
54 use Bio::EnsEMBL::Utils::Exception qw(throw warning);
55 use Bio::EnsEMBL::Utils::ScriptUtils qw(parse_bytes path_append);
57 
58 
59 #
60 # exon scoring is done in two steps:
61 # 1. map exons by overlap (if a common coord_system exists)
62 # 2. map remaining and poorly scoring exons using exonerate
63 #
64 sub score_exons {
65  my $self = shift;
66 
67  $self->logger->info( "-- Scoring exons...\n\n", 0, 'stamped' );
68 
69  # score using overlaps, then exonerate
70  my $matrix = $self->overlap_score;
71 
72  my $exonerate_matrix = $self->exonerate_score($matrix);
73 
74  # log stats before matrix merging
75  $self->logger->info("\nOverlap scoring matrix:\n");
76  $self->log_matrix_stats($matrix);
77  $self->logger->info("\nExonerate scoring matrix:\n");
78  $self->log_matrix_stats($exonerate_matrix);
79 
80  # merge matrices
81  $self->logger->info( "\nMerging scoring matrices...\n", 0,
82  'stamped' );
83  $matrix->merge($exonerate_matrix);
84 
85  $self->logger->info( "Done.\n\n", 0, 'stamped' );
86 
87  # debug logging
88  if ( $self->logger->loglevel eq 'debug' ) {
89  $matrix->log( 'exon', $self->conf->param('basedir') );
90  }
91 
92  # log stats of combined matrix
93  $self->logger->info("Combined scoring matrix:\n");
94  $self->log_matrix_stats($matrix);
95 
96  $self->logger->info( "\nDone with exon scoring.\n\n", 0, 'stamped' );
97 
98  return $matrix;
99 } ## end sub score_exons
100 
101 
102 #
103 # direct mapping by overlap (if common coord_system exists)
104 #
105 sub overlap_score {
106  my $self = shift;
107 
108  my $dump_path =
109  path_append( $self->conf->param('basedir'), 'matrix' );
110 
111  my $matrix =
113  -DUMP_PATH => $dump_path,
114  -CACHE_FILE => 'exon_overlap_matrix.ser',
115  );
116 
117  my $overlap_cache = $matrix->cache_file;
118 
119  if ( -s $overlap_cache ) {
120 
121  # read from file
122  $self->logger->info(
123  "Reading exon overlap scoring matrix from file...\n",
124  0, 'stamped' );
125  $self->logger->debug( "Cache file $overlap_cache.\n", 1 );
126  $matrix->read_from_file;
127  $self->logger->info( "Done.\n", 0, 'stamped' );
128 
129  }
130  else {
131 
132  # build scoring matrix
133  $self->logger->info(
134  "No exon overlap scoring matrix found. Will build new one.\n");
135 
136  if ( $self->cache->highest_common_cs ) {
137  $self->logger->info( "Overlap scoring...\n", 0, 'stamped' );
138  $matrix = $self->build_overlap_scores($matrix);
139  $self->logger->info( "Done.\n", 0, 'stamped' );
140  }
141 
142  # write scoring matrix to file
143  $matrix->write_to_file;
144 
145  }
146 
147  return $matrix;
148 } ## end sub overlap_score
149 
150 
151 #
152 # map the remaining exons using exonerate
153 #
154 sub exonerate_score {
155  my $self = shift;
156  my $matrix = shift;
157 
158  unless ( $matrix and
159  $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix') )
160  {
161  throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
162  }
163 
164  my $dump_path =
165  path_append( $self->conf->param('basedir'), 'matrix' );
166 
167  my $exonerate_matrix =
169  -DUMP_PATH => $dump_path,
170  -CACHE_FILE => 'exon_exonerate_matrix.ser',
171  );
172 
173  my $exonerate_cache = $exonerate_matrix->cache_file;
174 
175  if ( -s $exonerate_cache ) {
176 
177  # read from file
178  $self->logger->info( "Reading exonerate matrix from file...\n",
179  0, 'stamped' );
180  $self->logger->debug( "Cache file $exonerate_cache.\n", 1 );
181  $exonerate_matrix->read_from_file;
182  $self->logger->info( "Done.\n", 0, 'stamped' );
183 
184  }
185  else {
186 
187  # build scoring matrix
188  $self->logger->info(
189  "No exonerate matrix found. Will build new one.\n");
190 
191  # dump exons to fasta files
192  my $dump_count = $self->dump_filtered_exons($matrix);
193 
194  if ($dump_count) {
195  # run exonerate
196  $self->run_exonerate;
197 
198  # parse results
199  $self->parse_exonerate_results($exonerate_matrix);
200 
201  }
202  else {
203 
204  $self->logger->info( "No source and/or target exons dumped, " .
205  "so don't need to run exonerate.\n" );
206 
207  }
208 
209  # write scoring matrix to file
210  $exonerate_matrix->write_to_file;
211 
212  } ## end else [ if ( -s $exonerate_cache)]
213 
214  return $exonerate_matrix;
215 } ## end sub exonerate_score
216 
217 #
218 # Algorithm:
219 # Get a lists of exon containers for source and target. Walk along both
220 # lists, set a flag when you first encounter an exon (i.e. it starts).
221 # Record all alternative exons until you encounter the exon again (exon
222 # ends), then score against all alternative exons you've recorded.
223 #
224 sub build_overlap_scores {
225  my ( $self, $matrix ) = @_;
226 
227  unless ($matrix
228  and $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix') )
229  {
230  throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
231  }
232 
233  # get sorted list of exon containers
234  $self->logger->info( "Reading sorted exons from cache...\n",
235  1, 'stamped' );
236 
237  my @source_exons =
238  $self->sort_exons( [
239  values %{ $self->cache->get_by_name( 'exons_by_id', 'source' ) }
240  ] );
241 
242  my @target_exons =
243  $self->sort_exons( [
244  values %{ $self->cache->get_by_name( 'exons_by_id', 'target' ) }
245  ] );
246 
247  $self->logger->info( "Done.\n", 1, 'stamped' );
248 
249  # get first source and target exon container
250  my $source_ec = shift(@source_exons);
251  my $target_ec = shift(@target_exons);
252 
253  my %source_overlap = ();
254  my %target_overlap = ();
255 
256  $self->logger->info( "Scoring...\n", 1, 'stamped' );
257 
258  while ( defined($source_ec) || defined($target_ec) ) {
259  my $add_source = 0;
260  my $add_target = 0;
261 
262  # compare exon containers
263  if ( defined($source_ec) && defined($target_ec) ) {
264  my $cmp =
265  $self->compare_exon_containers( $source_ec, $target_ec );
266  if ( $cmp <= 0 ) { $add_source = 1 }
267  if ( $cmp >= 0 ) { $add_target = 1 }
268  }
269  elsif ( defined($source_ec) ) {
270  $add_source = 1;
271  }
272  elsif ( defined($target_ec) ) {
273  $add_target = 1;
274  }
275  else {
276  die('The world is a strange place');
277  }
278 
279  if ($add_source) {
280  if ( exists( $source_overlap{ $source_ec->[0] } ) ) {
281  # remove exon from list of overlapping source exons to score
282  # target against
283  delete $source_overlap{ $source_ec->[0] };
284  }
285  else {
286  # add exon to list of overlapping source exons to score target
287  # against
288  $source_overlap{ $source_ec->[0] } = $source_ec->[0];
289 
290  # score source exon against all target exons in current overlap
291  # list
292  foreach my $target_exon ( values %target_overlap ) {
293  if ( defined( $matrix->get_score(
294  $source_ec->[0]->id, $target_exon->id
295  ) ) )
296  {
297  next;
298  }
299 
300  $self->calc_overlap_score( $source_ec->[0], $target_exon,
301  $matrix );
302  }
303  }
304 
305  # get next source exon container
306  $source_ec = shift(@source_exons);
307  } ## end if ($add_source)
308 
309  if ($add_target) {
310  if ( exists( $target_overlap{ $target_ec->[0] } ) ) {
311  # remove exon from list of overlapping target exons to score
312  # source against
313  delete $target_overlap{ $target_ec->[0] };
314  }
315  else {
316  # add exon to list of overlapping target exons to score source
317  # against
318  $target_overlap{ $target_ec->[0] } = $target_ec->[0];
319 
320  # score target exon against all source exons in current overlap
321  # list
322  foreach my $source_exon ( values %source_overlap ) {
323  if ( defined( $matrix->get_score(
324  $source_exon->id, $target_ec->[0]->id
325  ) ) )
326  {
327  next;
328  }
329 
330  $self->calc_overlap_score( $source_exon, $target_ec->[0],
331  $matrix );
332  }
333  }
334 
335  # get next target exon container
336  $target_ec = shift(@target_exons);
337  } ## end if ($add_target)
338  } ## end while ( defined($source_ec...))
339 
340  $self->logger->info( "Done.\n", 1, 'stamped' );
341 
342  return $matrix;
343 } ## end sub build_overlap_scores
344 
345 
346 #
347 # Return a list of exon containers, sorted by seq_region_name, then location
348 # (where location is either start-1 or end, so each exon is in the list twice).
349 # An exon container is a listrefs of a TinyExon object and its location. This
350 # implements the ExonSortContainer in the java application.
351 #
352 sub sort_exons {
353  my $self = shift;
354  my $exons = shift;
355 
356  ## no critic
357  return sort {
358  ( $a->[0]->common_sr_name cmp $b->[0]->common_sr_name ) ||
359  ( $a->[1] <=> $b->[1] )
360  } ( map { [ $_, $_->common_start - 1 ] } @$exons ),
361  ( map { [ $_, $_->common_end ] } @$exons );
362 }
363 
364 sub compare_exon_containers {
365  my $self = shift;
366  my $e1 = shift;
367  my $e2 = shift;
368 
369  return ( ( $e1->[0]->common_sr_name cmp $e2->[0]->common_sr_name ) ||
370  ( $e1->[1] <=> $e2->[1] ) );
371 }
372 
373 #
374 # Calculates overlap score between two exons. Its done by dividing
375 # overlap region by exons sizes. 1.0 is full overlap on both exons.
376 # Score of at least 0.5 are added to the exon scoring matrix.
377 #
378 sub calc_overlap_score {
379  my $self = shift;
380  my $source_exon = shift;
381  my $target_exon = shift;
382  my $matrix = shift;
383 
384  my ( $start, $end );
385 
386  # don't score if exons on different strand
387  return unless ( $source_exon->strand == $target_exon->strand );
388 
389  # determine overlap start
390  if ( $source_exon->start > $target_exon->start ) {
391  $start = $source_exon->start;
392  }
393  else {
394  $start = $target_exon->start;
395  }
396 
397  # determine overlap end
398  if ( $source_exon->end < $target_exon->end ) {
399  $end = $source_exon->end;
400  }
401  else {
402  $end = $target_exon->end;
403  }
404 
405  #
406  # Calculate score, which is defined as average overlap / exon length
407  # ratio.
408  #
409 
410  my $overlap = $end - $start + 1;
411  my $source_length = $source_exon->end - $source_exon->start + 1;
412  my $target_length = $target_exon->end - $target_exon->start + 1;
413 
414  my $score = ( $overlap/$source_length + $overlap/$target_length )/2;
415 
416  # The following penalty was removed because it meant that an exon
417  # whose sequence and position had not changed would become a
418  # candidate for similarity mapping when its phase changed. This
419  # caused lots of criss-crossing stable ID history entries between
420  # genes/transcripts/translations in some gene families.
421  #
422  ## PENALTY:
423  ## penalise by 10% if phase if different
424  if ( $source_exon->phase != $target_exon->phase ) {
425  $score *= 0.9;
426  }
427 
428  # add score to scoring matrix if it's at least 0.5
429  if ( $score >= 0.5 ) {
430  $matrix->add_score( $source_exon->id, $target_exon->id, $score );
431  }
432 
433 } ## end sub calc_overlap_score
434 
435 
436 sub run_exonerate {
437  my $self = shift;
438 
439  my $source_file = $self->exon_fasta_file('source');
440  my $target_file = $self->exon_fasta_file('target');
441  my $source_size = -s $source_file;
442  my $target_size = -s $target_file;
443 
444  # check if fasta files exist and are not empty
445  unless ($source_size and $target_size) {
446  throw("Can't find exon fasta files.");
447  }
448 
449  # create an empty lsf log directory
450  my $logpath = path_append($self->logger->logpath, 'exonerate');
451  system("rm -rf $logpath") == 0 or
452  $self->logger->error("Unable to delete lsf log dir $logpath: $!\n");
453  system("mkdir -p $logpath") == 0 or
454  $self->logger->error("Can't create lsf log dir $logpath: $!\n");
455 
456  # delete exonerate output from previous runs
457  my $dump_path = $self->cache->dump_path;
458 
459  opendir(my $dumpdir, $dump_path) or
460  $self->logger->error("Can't open $dump_path for reading: $!");
461 
462  while (defined(my $file = readdir($dumpdir))) {
463  next unless /exonerate_map\.\d+/;
464 
465  unlink("$dump_path/$file") or
466  $self->logger->error("Can't delete $dump_path/$file: $!");
467  }
468 
469  closedir($dumpdir);
470 
471  # determine number of jobs to split task into
472  my $bytes_per_job = $self->conf->param('exonerate_bytes_per_job')
473  || 250000;
474  my $num_jobs = $self->conf->param('exonerate_jobs');
475  $num_jobs ||= int( $source_size/$bytes_per_job + 1 );
476 
477  my $percent =
478  int( ( $self->conf->param('exonerate_threshold') || 0.5 )*100 );
479  my $lsf_name = 'idmapping_exonerate_' . time;
480  my $exonerate_path = $self->conf->param('exonerate_path');
481  my $exonerate_extra_params =
482  $self->conf->param('exonerate_extra_params');
483 
484  #
485  # run exonerate jobs using lsf
486  #
487  my $exonerate_job =
488  qq{$exonerate_path } .
489  qq{--query $source_file --target $target_file } .
490  q{--querychunkid $LSB_JOBINDEX } .
491  qq{--querychunktotal $num_jobs } .
492  q{--model ungapped -M 1000 -D 100 } .
493  q{--showalignment FALSE --subopt no } . qq{--percent $percent } .
494  $self->conf->param('exonerate_extra_params') . " " .
495  q{--ryo 'myinfo: %qi %ti %et %ql %tl\n' } .
496  qq{| grep '^myinfo:' > $dump_path/exonerate_map.\$LSB_JOBINDEX} .
497  "\n";
498 
499  $self->logger->info("Submitting $num_jobs exonerate jobs to lsf.\n");
500  $self->logger->debug("$exonerate_job\n\n");
501 
502  my $bsub_cmd = sprintf(
503  "|bsub -J '%s[1-%d]%%%d' -o %s/exonerate.%%I.out %s",
504  $lsf_name,
505  $num_jobs,
506  $self->conf()->param('exonerate_concurrent_jobs') || 200,
507  $logpath,
508  $self->conf()->param('lsf_opt_exonerate') );
509 
510  local *BSUB;
511  open( BSUB, $bsub_cmd ) ## no critic
512  or $self->logger->error("Could not open open pipe to bsub: $!\n");
513 
514  print BSUB $exonerate_job;
515  $self->logger->error("Error submitting exonerate jobs: $!\n")
516  unless ($? == 0);
517  close BSUB;
518 
519  # submit dependent job to monitor finishing of exonerate jobs
520  $self->logger->info("Waiting for exonerate jobs to finish...\n", 0, 'stamped');
521 
522  my $dependent_job =
523  qq{bsub -K -w "ended($lsf_name)" -q production } .
524  qq{-M 1000 -R 'select[mem>1000]' -R 'rusage[mem=1000]' } .
525  qq{-o $logpath/exonerate_depend.out /bin/true};
526 
527  system($dependent_job) == 0 or
528  $self->logger->error("Error submitting dependent job: $!\n");
529 
530  $self->logger->info("All exonerate jobs finished.\n", 0, 'stamped');
531 
532  #
533  # check results
534  #
535  my @missing;
536  my @error;
537 
538  for (my $i = 1; $i <= $num_jobs; $i++) {
539 
540  # check that output file exists
541  my $outfile = "$dump_path/exonerate_map.$i";
542  push @missing, $outfile unless (-f "$outfile");
543 
544  # check no errors occurred
545  my $errfile = "$logpath/exonerate.$i.err";
546  push @error, $errfile if (-s "$errfile");
547  }
548 
549  if (@missing) {
550  $self->logger->info("Couldn't find all exonerate output files. These are missing:\n");
551  foreach (@missing) {
552  $self->logger->info("$_\n", 1);
553  }
554 
555  exit(1);
556  }
557 
558  if (@error) {
559  $self->logger->info("One or more exonerate jobs failed. Check these error files:\n");
560  foreach (@error) {
561  $self->logger->info("$_\n", 1);
562  }
563 
564  exit(1);
565  }
566 
567 }
568 
569 
570 sub exon_fasta_file {
571  my $self = shift;
572  my $type = shift;
573 
574  throw("You must provide a type.") unless $type;
575 
576  return $self->cache->dump_path."/$type.exons.fasta";
577 }
578 
579 
580 sub dump_filtered_exons {
581  my $self = shift;
582  my $matrix = shift;
583 
584  unless ($matrix and
585  $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
586  throw('You must provide a ScoredMappingMatrix.');
587  }
588 
589  # write exons to fasta files
590  my $source_count = $self->write_filtered_exons('source', $matrix);
591  my $target_count = $self->write_filtered_exons('target', $matrix);
592 
593  # return true if both source and target exons were written; otherwise we
594  # don't need to run exonerate
595  return (($source_count > 0) and ($target_count > 0));
596 }
597 
598 
599 sub write_filtered_exons {
600  my $self = shift;
601  my $type = shift;
602  my $matrix = shift;
603 
604  throw("You must provide a type.") unless $type;
605  unless ($matrix and
606  $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
607  throw('You must provide a ScoredMappingMatrix.');
608  }
609 
610  $self->logger->info("\nDumping $type exons to fasta file...\n", 0, 'stamped');
611 
612  # don't dump exons shorter than this
613  my $min_exon_length = $self->conf->param('min_exon_length') || 15;
614 
615  # counters
616  my $total_exons = 0;
617  my $dumped_exons = 0;
618 
619  # filehandle for fasta files
620  my $fh;
621  my $file = $self->exon_fasta_file($type);
622  open($fh, '>', $file) or throw("Unable to open $file for writing: $!");
623 
624  # loop over exons, dump sequence to fasta file if longer than threshold and
625  # score < 1
626  EXON:
627  foreach my $eid (sort { $b <=> $a }
628  keys %{ $self->cache->get_by_name('exons_by_id', $type) }) {
629 
630  my $exon = $self->cache->get_by_key('exons_by_id', $type, $eid);
631 
632  $total_exons++;
633 
634  # skip if exon shorter than threshold
635  next EXON if ($exon->length < $min_exon_length);
636 
637  # skip if overlap score with any other exon is 1
638  if ( $type eq 'source' ) {
639  foreach my $target ( @{ $matrix->get_targets_for_source($eid) } )
640  {
641  if ( $matrix->get_score( $eid, $target ) > 0.9999 ) {
642  next EXON;
643  }
644  }
645  } else {
646  foreach my $source ( @{ $matrix->get_sources_for_target($eid) } )
647  {
648  if ( $matrix->get_score( $source, $eid ) > 0.9999 ) {
649  next EXON;
650  }
651  }
652  }
653 
654  # write exon to fasta file
655  print $fh '>', $eid, "\n", $exon->seq, "\n";
656 
657  $dumped_exons++;
658 
659  }
660 
661  close($fh);
662 
663  # log
664  my $fmt = "%-30s%10s\n";
665  my $size = -s $file;
666  $self->logger->info(sprintf($fmt, 'Total exons:', $total_exons), 1);
667  $self->logger->info(sprintf($fmt, 'Dumped exons:', $dumped_exons), 1);
668  $self->logger->info(sprintf($fmt, 'Dump file size:', parse_bytes($size)), 1);
669  $self->logger->info("Done.\n\n", 0, 'stamped');
670 
671  return $dumped_exons;
672 }
673 
674 sub parse_exonerate_results {
675  my ( $self, $exonerate_matrix ) = @_;
676 
677  unless ( defined($exonerate_matrix)
678  &&
679  $exonerate_matrix->isa(
680  'Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')
681  )
682  {
683  throw('You must provide a ScoredMappingMatrix.');
684  }
685 
686  $self->logger->info( "Parsing exonerate results...\n", 0, 'stamped' );
687 
688  # loop over all result files
689  my $dump_path = $self->cache->dump_path;
690  my $num_files = 0;
691  my $num_lines = 0;
692 
693  opendir( my $dumpdir, $dump_path ) or
694  $self->logger->error("Can't open $dump_path for reading: $!");
695 
696  my $penalised = 0;
697  my $killed = 0;
698 
699  while ( defined( my $file = readdir($dumpdir) ) ) {
700  unless ( $file =~ /exonerate_map\.\d+/ ) { next }
701 
702  $num_files++;
703 
704  open( my $fh, '<', "$dump_path/$file" );
705 
706  my $threshold = $self->conf->param('exonerate_threshold') || 0.5;
707 
708  while (<$fh>) {
709  $num_lines++;
710  chomp;
711 
712  # line format:
713  # myinfo: source_id target_id match_length source_length target_length
714  my ( undef, $source_id, $target_id, $match_length, $source_length,
715  $target_length )
716  = split;
717 
718  my $score = 0;
719 
720  if ( $source_length == 0 or $target_length == 0 ) {
721  $self->logger->warning(
722  "Alignment length is 0 for $source_id or $target_id.\n");
723  }
724  else {
725  $score = 2*$match_length/( $source_length + $target_length );
726 
727  }
728 
729  if ( $score > $threshold ) {
730  my $source_sr =
731  $self->cache()
732  ->get_by_key( 'exons_by_id', 'source', $source_id )
733  ->seq_region_name();
734  my $target_sr =
735  $self->cache()
736  ->get_by_key( 'exons_by_id', 'target', $target_id )
737  ->seq_region_name();
738 
739  if ( $source_sr ne $target_sr ) {
740  # PENALTY: The target and source are not on the same
741  # seq_region.
742  $score *= 0.70;
743 
744  # With a penalty of 0.7, exonerate scores need to be above
745  # approximately 0.714 to make the 0.5 threshold.
746 
747  ++$penalised;
748  }
749 
750  if ( $score > $threshold ) {
751  $exonerate_matrix->add_score( $source_id, $target_id,
752  $score );
753  }
754  else {
755  ++$killed;
756  }
757  } ## end if ( $score > $threshold)
758 
759  } ## end while (<$fh>)
760 
761  close($fh);
762  } ## end while ( defined( my $file...))
763 
764  closedir($dumpdir);
765 
766  $self->logger->info(
767  "Done parsing $num_lines lines from $num_files result files.\n",
768  0, 'stamped' );
769  $self->logger->info( "Penalised $penalised exon alignments " .
770  "for not being on the same seq_region " .
771  "($killed killed).\n",
772  0,
773  'stamped' );
774 
775  return $exonerate_matrix;
776 } ## end sub parse_exonerate_results
777 
778 
779 sub non_mapped_transcript_rescore {
780  my $self = shift;
781  my $matrix = shift;
782  my $transcript_mappings = shift;
783 
784  # argument checks
785  unless ($matrix
786  and $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix') )
787  {
788  throw(
789  'Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix of exons.');
790  }
791 
792  unless ( $transcript_mappings
793  and
794  $transcript_mappings->isa('Bio::EnsEMBL::IdMapping::MappingList') )
795  {
796  throw(
797  'Need a Bio::EnsEMBL::IdMapping::MappingList of transcripts.');
798  }
799 
800  # create a lookup hash of mapped source transcripts to target
801  # transcripts
802  my %transcript_lookup =
803  map { $_->source => $_->target }
804  @{ $transcript_mappings->get_all_Entries };
805 
806  my $i = 0;
807 
808  foreach my $entry ( @{ $matrix->get_all_Entries } ) {
809 
810  my @source_transcripts = @{
811  $self->cache->get_by_key( 'transcripts_by_exon_id', 'source',
812  $entry->source ) };
813  my @target_transcripts = @{
814  $self->cache->get_by_key( 'transcripts_by_exon_id', 'target',
815  $entry->target ) };
816 
817  my $found_mapped = 0;
818 
819  TR:
820  foreach my $source_tr (@source_transcripts) {
821  foreach my $target_tr (@target_transcripts) {
822  my $mapped_target = $transcript_lookup{ $source_tr->id };
823 
824  if ( $mapped_target && $mapped_target == $target_tr->id ) {
825  $found_mapped = 1;
826  last TR;
827  }
828  }
829  }
830 
831  unless ($found_mapped) {
832  # PENALTY: The exon appears to belong to a transcript that has not
833  # been mapped.
834  $matrix->set_score( $entry->source(), $entry->target(),
835  0.9*$entry->score() );
836  $i++;
837  }
838  } ## end foreach my $entry ( @{ $matrix...})
839 
840  $self->logger->debug( "Scored exons in non-mapped transcripts: $i\n",
841  1 );
842 } ## end sub non_mapped_transcript_rescore
843 
844 
845 
846 1;
map
public map()
Bio::EnsEMBL::Utils::ScriptUtils
Definition: ScriptUtils.pm:11
Bio::EnsEMBL::IdMapping::ScoredMappingMatrix
Definition: ScoredMappingMatrix.pm:44
Bio::EnsEMBL::IdMapping::ScoredMappingMatrix::merge
public Int merge()
Bio::EnsEMBL::IdMapping::ScoreBuilder
Definition: ScoreBuilder.pm:22
Bio::EnsEMBL::IdMapping::Serialisable::cache_file
public String cache_file()
Bio::EnsEMBL::IdMapping::ScoredMappingMatrix::new
public Bio::EnsEMBL::IdMapping::ScoredMappingMatrix new()
Bio::EnsEMBL::Utils::Argument
Definition: Argument.pm:34
Bio::EnsEMBL::Utils::Exception
Definition: Exception.pm:68
Bio::EnsEMBL::IdMapping::ExonScoreBuilder
Definition: ExonScoreBuilder.pm:19