3 Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
4 Copyright [2016-2024] EMBL-European Bioinformatics Institute
6 Licensed under the Apache License, Version 2.0 (the
"License");
7 you may not use
this file except in compliance with the License.
8 You may obtain a copy of the License at
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an
"AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License
for the specific language governing permissions and
16 limitations under the License.
23 Please email comments or questions to the
public Ensembl
24 developers list at <http:
26 Questions may also be sent to the Ensembl help desk at
44 package Bio::EnsEMBL::IdMapping::ExonScoreBuilder;
48 no warnings
'uninitialized';
60 # exon scoring is done in two steps:
61 # 1. map exons by overlap (if a common coord_system exists)
62 # 2. map remaining and poorly scoring exons using exonerate
67 $self->logger->info(
"-- Scoring exons...\n\n", 0,
'stamped' );
69 # score using overlaps, then exonerate
70 my $matrix = $self->overlap_score;
72 my $exonerate_matrix = $self->exonerate_score($matrix);
74 # log stats before matrix merging
75 $self->logger->info(
"\nOverlap scoring matrix:\n");
76 $self->log_matrix_stats($matrix);
77 $self->logger->info(
"\nExonerate scoring matrix:\n");
78 $self->log_matrix_stats($exonerate_matrix);
81 $self->logger->info(
"\nMerging scoring matrices...\n", 0,
83 $matrix->
merge($exonerate_matrix);
85 $self->logger->info(
"Done.\n\n", 0,
'stamped' );
88 if ( $self->logger->loglevel eq
'debug' ) {
89 $matrix->log(
'exon', $self->conf->param(
'basedir') );
92 # log stats of combined matrix
93 $self->logger->info(
"Combined scoring matrix:\n");
94 $self->log_matrix_stats($matrix);
96 $self->logger->info(
"\nDone with exon scoring.\n\n", 0,
'stamped' );
99 } ## end sub score_exons
103 # direct mapping by overlap (if common coord_system exists)
109 path_append( $self->conf->param(
'basedir'),
'matrix' );
113 -DUMP_PATH => $dump_path,
114 -CACHE_FILE =>
'exon_overlap_matrix.ser',
119 if ( -s $overlap_cache ) {
123 "Reading exon overlap scoring matrix from file...\n",
125 $self->logger->debug(
"Cache file $overlap_cache.\n", 1 );
126 $matrix->read_from_file;
127 $self->logger->info(
"Done.\n", 0,
'stamped' );
132 # build scoring matrix
134 "No exon overlap scoring matrix found. Will build new one.\n");
136 if ( $self->cache->highest_common_cs ) {
137 $self->logger->info(
"Overlap scoring...\n", 0,
'stamped' );
138 $matrix = $self->build_overlap_scores($matrix);
139 $self->logger->info(
"Done.\n", 0,
'stamped' );
142 # write scoring matrix to file
143 $matrix->write_to_file;
148 } ## end sub overlap_score
152 # map the remaining exons using exonerate
154 sub exonerate_score {
159 $matrix->isa(
'Bio::EnsEMBL::IdMapping::ScoredMappingMatrix') )
161 throw(
'Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
165 path_append( $self->conf->param(
'basedir'),
'matrix' );
167 my $exonerate_matrix =
169 -DUMP_PATH => $dump_path,
170 -CACHE_FILE =>
'exon_exonerate_matrix.ser',
173 my $exonerate_cache = $exonerate_matrix->
cache_file;
175 if ( -s $exonerate_cache ) {
178 $self->logger->info(
"Reading exonerate matrix from file...\n",
180 $self->logger->debug(
"Cache file $exonerate_cache.\n", 1 );
181 $exonerate_matrix->read_from_file;
182 $self->logger->info(
"Done.\n", 0,
'stamped' );
187 # build scoring matrix
189 "No exonerate matrix found. Will build new one.\n");
191 # dump exons to fasta files
192 my $dump_count = $self->dump_filtered_exons($matrix);
196 $self->run_exonerate;
199 $self->parse_exonerate_results($exonerate_matrix);
204 $self->logger->info(
"No source and/or target exons dumped, " .
205 "so don't need to run exonerate.\n" );
209 # write scoring matrix to file
210 $exonerate_matrix->write_to_file;
212 } ## end
else [
if ( -s $exonerate_cache)]
214 return $exonerate_matrix;
215 } ## end sub exonerate_score
219 # Get a lists of exon containers for source and target. Walk along both
220 # lists, set a flag when you first encounter an exon (i.e. it starts).
221 # Record all alternative exons until you encounter the exon again (exon
222 # ends), then score against all alternative exons you've recorded.
224 sub build_overlap_scores {
225 my ( $self, $matrix ) = @_;
228 and $matrix->isa(
'Bio::EnsEMBL::IdMapping::ScoredMappingMatrix') )
230 throw(
'Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
233 # get sorted list of exon containers
234 $self->logger->info(
"Reading sorted exons from cache...\n",
239 values %{ $self->cache->get_by_name(
'exons_by_id',
'source' ) }
244 values %{ $self->cache->get_by_name(
'exons_by_id',
'target' ) }
247 $self->logger->info(
"Done.\n", 1,
'stamped' );
249 # get first source and target exon container
250 my $source_ec = shift(@source_exons);
251 my $target_ec = shift(@target_exons);
253 my %source_overlap = ();
254 my %target_overlap = ();
256 $self->logger->info(
"Scoring...\n", 1,
'stamped' );
258 while ( defined($source_ec) || defined($target_ec) ) {
262 # compare exon containers
263 if ( defined($source_ec) && defined($target_ec) ) {
265 $self->compare_exon_containers( $source_ec, $target_ec );
266 if ( $cmp <= 0 ) { $add_source = 1 }
267 if ( $cmp >= 0 ) { $add_target = 1 }
269 elsif ( defined($source_ec) ) {
272 elsif ( defined($target_ec) ) {
276 die(
'The world is a strange place');
280 if ( exists( $source_overlap{ $source_ec->[0] } ) ) {
281 # remove exon from list of overlapping source exons to score
283 delete $source_overlap{ $source_ec->[0] };
286 # add exon to list of overlapping source exons to score target
288 $source_overlap{ $source_ec->[0] } = $source_ec->[0];
290 # score source exon against all target exons in current overlap
292 foreach my $target_exon ( values %target_overlap ) {
293 if ( defined( $matrix->get_score(
294 $source_ec->[0]->id, $target_exon->id
300 $self->calc_overlap_score( $source_ec->[0], $target_exon,
305 # get next source exon container
306 $source_ec = shift(@source_exons);
307 } ## end
if ($add_source)
310 if ( exists( $target_overlap{ $target_ec->[0] } ) ) {
311 # remove exon from list of overlapping target exons to score
313 delete $target_overlap{ $target_ec->[0] };
316 # add exon to list of overlapping target exons to score source
318 $target_overlap{ $target_ec->[0] } = $target_ec->[0];
320 # score target exon against all source exons in current overlap
322 foreach my $source_exon ( values %source_overlap ) {
323 if ( defined( $matrix->get_score(
324 $source_exon->id, $target_ec->[0]->id
330 $self->calc_overlap_score( $source_exon, $target_ec->[0],
335 # get next target exon container
336 $target_ec = shift(@target_exons);
337 } ## end
if ($add_target)
338 } ## end
while ( defined($source_ec...))
340 $self->logger->info(
"Done.\n", 1,
'stamped' );
343 } ## end sub build_overlap_scores
347 # Return a list of exon containers, sorted by seq_region_name, then location
348 # (where location is either start-1 or end, so each exon is in the list twice).
349 # An exon container is a listrefs of a TinyExon object and its location. This
350 # implements the ExonSortContainer in the java application.
358 ( $a->[0]->common_sr_name cmp $b->[0]->common_sr_name ) ||
359 ( $a->[1] <=> $b->[1] )
360 } (
map { [ $_, $_->common_start - 1 ] } @$exons ),
361 (
map { [ $_, $_->common_end ] } @$exons );
364 sub compare_exon_containers {
369 return ( ( $e1->[0]->common_sr_name cmp $e2->[0]->common_sr_name ) ||
370 ( $e1->[1] <=> $e2->[1] ) );
374 # Calculates overlap score between two exons. Its done by dividing
375 # overlap region by exons sizes. 1.0 is full overlap on both exons.
376 # Score of at least 0.5 are added to the exon scoring matrix.
378 sub calc_overlap_score {
380 my $source_exon = shift;
381 my $target_exon = shift;
386 # don't score if exons on different strand
387 return unless ( $source_exon->strand == $target_exon->strand );
389 # determine overlap start
390 if ( $source_exon->start > $target_exon->start ) {
391 $start = $source_exon->start;
394 $start = $target_exon->start;
397 # determine overlap end
398 if ( $source_exon->end < $target_exon->end ) {
399 $end = $source_exon->end;
402 $end = $target_exon->end;
406 # Calculate score, which is defined as average overlap / exon length
410 my $overlap = $end - $start + 1;
411 my $source_length = $source_exon->end - $source_exon->start + 1;
412 my $target_length = $target_exon->end - $target_exon->start + 1;
414 my $score = ( $overlap/$source_length + $overlap/$target_length )/2;
416 # The following penalty was removed because it meant that an exon
417 # whose sequence and position had not changed would become a
418 # candidate for similarity mapping when its phase changed. This
419 # caused lots of criss-crossing stable ID history entries between
420 # genes/transcripts/translations in some gene families.
423 ## penalise by 10% if phase if different
424 if ( $source_exon->phase != $target_exon->phase ) {
428 # add score to scoring matrix if it's at least 0.5
429 if ( $score >= 0.5 ) {
430 $matrix->add_score( $source_exon->id, $target_exon->id, $score );
433 } ## end sub calc_overlap_score
439 my $source_file = $self->exon_fasta_file(
'source');
440 my $target_file = $self->exon_fasta_file(
'target');
441 my $source_size = -s $source_file;
442 my $target_size = -s $target_file;
444 # check if fasta files exist and are not empty
445 unless ($source_size and $target_size) {
446 throw(
"Can't find exon fasta files.");
449 # create an empty lsf log directory
450 my $logpath = path_append($self->logger->logpath,
'exonerate');
451 system(
"rm -rf $logpath") == 0 or
452 $self->logger->error(
"Unable to delete lsf log dir $logpath: $!\n");
453 system(
"mkdir -p $logpath") == 0 or
454 $self->logger->error(
"Can't create lsf log dir $logpath: $!\n");
456 # delete exonerate output from previous runs
457 my $dump_path = $self->cache->dump_path;
459 opendir(my $dumpdir, $dump_path) or
460 $self->logger->error(
"Can't open $dump_path for reading: $!");
462 while (defined(my $file = readdir($dumpdir))) {
463 next unless /exonerate_map\.\d+/;
465 unlink(
"$dump_path/$file") or
466 $self->logger->error(
"Can't delete $dump_path/$file: $!");
471 # determine number of jobs to split task into
472 my $bytes_per_job = $self->conf->param(
'exonerate_bytes_per_job')
474 my $num_jobs = $self->conf->param(
'exonerate_jobs');
475 $num_jobs ||= int( $source_size/$bytes_per_job + 1 );
478 int( ( $self->conf->param(
'exonerate_threshold') || 0.5 )*100 );
479 my $lsf_name =
'idmapping_exonerate_' . time;
480 my $exonerate_path = $self->conf->param(
'exonerate_path');
481 my $exonerate_extra_params =
482 $self->conf->param(
'exonerate_extra_params');
485 # run exonerate jobs using lsf
488 qq{$exonerate_path } .
489 qq{--query $source_file --target $target_file } .
490 q{--querychunkid $LSB_JOBINDEX } .
491 qq{--querychunktotal $num_jobs } .
492 q{--model ungapped -M 1000 -D 100 } .
493 q{--showalignment FALSE --subopt no } . qq{--percent $percent } .
494 $self->conf->param(
'exonerate_extra_params') .
" " .
495 q{--ryo
'myinfo: %qi %ti %et %ql %tl\n' } .
496 qq{| grep
'^myinfo:' > $dump_path/exonerate_map.\$LSB_JOBINDEX} .
499 $self->logger->info(
"Submitting $num_jobs exonerate jobs to lsf.\n");
500 $self->logger->debug(
"$exonerate_job\n\n");
502 my $bsub_cmd = sprintf(
503 "|bsub -J '%s[1-%d]%%%d' -o %s/exonerate.%%I.out %s",
506 $self->conf()->param(
'exonerate_concurrent_jobs') || 200,
508 $self->conf()->param(
'lsf_opt_exonerate') );
511 open( BSUB, $bsub_cmd )
## no critic
512 or $self->logger->error(
"Could not open open pipe to bsub: $!\n");
514 print BSUB $exonerate_job;
515 $self->logger->error(
"Error submitting exonerate jobs: $!\n")
519 # submit dependent job to monitor finishing of exonerate jobs
520 $self->logger->info(
"Waiting for exonerate jobs to finish...\n", 0,
'stamped');
523 qq{bsub -K -w
"ended($lsf_name)" -q production } .
524 qq{-M 1000 -R
'select[mem>1000]' -R
'rusage[mem=1000]' } .
525 qq{-o $logpath/exonerate_depend.out /bin/
true};
527 system($dependent_job) == 0 or
528 $self->logger->error(
"Error submitting dependent job: $!\n");
530 $self->logger->info(
"All exonerate jobs finished.\n", 0,
'stamped');
538 for (my $i = 1; $i <= $num_jobs; $i++) {
540 # check that output file exists
541 my $outfile =
"$dump_path/exonerate_map.$i";
542 push @missing, $outfile unless (-f
"$outfile");
544 # check no errors occurred
545 my $errfile =
"$logpath/exonerate.$i.err";
546 push @error, $errfile
if (-s
"$errfile");
550 $self->logger->info(
"Couldn't find all exonerate output files. These are missing:\n");
552 $self->logger->info(
"$_\n", 1);
559 $self->logger->info(
"One or more exonerate jobs failed. Check these error files:\n");
561 $self->logger->info(
"$_\n", 1);
570 sub exon_fasta_file {
574 throw(
"You must provide a type.") unless $type;
576 return $self->cache->dump_path.
"/$type.exons.fasta";
580 sub dump_filtered_exons {
585 $matrix->isa(
'Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
586 throw(
'You must provide a ScoredMappingMatrix.');
589 # write exons to fasta files
590 my $source_count = $self->write_filtered_exons(
'source', $matrix);
591 my $target_count = $self->write_filtered_exons(
'target', $matrix);
593 # return true if both source and target exons were written; otherwise we
594 # don't need to run exonerate
595 return (($source_count > 0) and ($target_count > 0));
599 sub write_filtered_exons {
604 throw(
"You must provide a type.") unless $type;
606 $matrix->isa(
'Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
607 throw(
'You must provide a ScoredMappingMatrix.');
610 $self->logger->info(
"\nDumping $type exons to fasta file...\n", 0,
'stamped');
612 # don't dump exons shorter than this
613 my $min_exon_length = $self->conf->param(
'min_exon_length') || 15;
617 my $dumped_exons = 0;
619 # filehandle for fasta files
621 my $file = $self->exon_fasta_file($type);
622 open($fh,
'>', $file) or
throw(
"Unable to open $file for writing: $!");
624 # loop over exons, dump sequence to fasta file if longer than threshold and
627 foreach my $eid (sort { $b <=> $a }
628 keys %{ $self->cache->get_by_name(
'exons_by_id', $type) }) {
630 my $exon = $self->cache->get_by_key(
'exons_by_id', $type, $eid);
634 # skip if exon shorter than threshold
635 next EXON
if ($exon->length < $min_exon_length);
637 # skip if overlap score with any other exon is 1
638 if ( $type eq
'source' ) {
639 foreach my $target ( @{ $matrix->get_targets_for_source($eid) } )
641 if ( $matrix->get_score( $eid, $target ) > 0.9999 ) {
646 foreach my $source ( @{ $matrix->get_sources_for_target($eid) } )
648 if ( $matrix->get_score( $source, $eid ) > 0.9999 ) {
654 # write exon to fasta file
655 print $fh
'>', $eid,
"\n", $exon->seq,
"\n";
664 my $fmt =
"%-30s%10s\n";
666 $self->logger->info(sprintf($fmt,
'Total exons:', $total_exons), 1);
667 $self->logger->info(sprintf($fmt,
'Dumped exons:', $dumped_exons), 1);
668 $self->logger->info(sprintf($fmt,
'Dump file size:', parse_bytes($size)), 1);
669 $self->logger->info(
"Done.\n\n", 0,
'stamped');
671 return $dumped_exons;
674 sub parse_exonerate_results {
675 my ( $self, $exonerate_matrix ) = @_;
677 unless ( defined($exonerate_matrix)
679 $exonerate_matrix->isa(
680 'Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')
683 throw(
'You must provide a ScoredMappingMatrix.');
686 $self->logger->info(
"Parsing exonerate results...\n", 0,
'stamped' );
688 # loop over all result files
689 my $dump_path = $self->cache->dump_path;
693 opendir( my $dumpdir, $dump_path ) or
694 $self->logger->error(
"Can't open $dump_path for reading: $!");
699 while ( defined( my $file = readdir($dumpdir) ) ) {
700 unless ( $file =~ /exonerate_map\.\d+/ ) { next }
704 open( my $fh,
'<',
"$dump_path/$file" );
706 my $threshold = $self->conf->param(
'exonerate_threshold') || 0.5;
713 # myinfo: source_id target_id match_length source_length target_length
714 my ( undef, $source_id, $target_id, $match_length, $source_length,
720 if ( $source_length == 0 or $target_length == 0 ) {
721 $self->logger->warning(
722 "Alignment length is 0 for $source_id or $target_id.\n");
725 $score = 2*$match_length/( $source_length + $target_length );
729 if ( $score > $threshold ) {
732 ->get_by_key(
'exons_by_id',
'source', $source_id )
736 ->get_by_key(
'exons_by_id',
'target', $target_id )
739 if ( $source_sr ne $target_sr ) {
740 # PENALTY: The target and source are not on the same
744 # With a penalty of 0.7, exonerate scores need to be above
745 # approximately 0.714 to make the 0.5 threshold.
750 if ( $score > $threshold ) {
751 $exonerate_matrix->add_score( $source_id, $target_id,
757 } ## end
if ( $score > $threshold)
759 } ## end
while (<$fh>)
762 } ## end
while ( defined( my $file...))
767 "Done parsing $num_lines lines from $num_files result files.\n",
769 $self->logger->info(
"Penalised $penalised exon alignments " .
770 "for not being on the same seq_region " .
771 "($killed killed).\n",
775 return $exonerate_matrix;
776 } ## end sub parse_exonerate_results
779 sub non_mapped_transcript_rescore {
782 my $transcript_mappings = shift;
786 and $matrix->isa(
'Bio::EnsEMBL::IdMapping::ScoredMappingMatrix') )
789 'Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix of exons.');
792 unless ( $transcript_mappings
794 $transcript_mappings->isa(
'Bio::EnsEMBL::IdMapping::MappingList') )
797 'Need a Bio::EnsEMBL::IdMapping::MappingList of transcripts.');
800 # create a lookup hash of mapped source transcripts to target
802 my %transcript_lookup =
803 map { $_->source => $_->target }
804 @{ $transcript_mappings->get_all_Entries };
808 foreach my $entry ( @{ $matrix->get_all_Entries } ) {
810 my @source_transcripts = @{
811 $self->cache->get_by_key(
'transcripts_by_exon_id',
'source',
813 my @target_transcripts = @{
814 $self->cache->get_by_key(
'transcripts_by_exon_id',
'target',
817 my $found_mapped = 0;
820 foreach my $source_tr (@source_transcripts) {
821 foreach my $target_tr (@target_transcripts) {
822 my $mapped_target = $transcript_lookup{ $source_tr->id };
824 if ( $mapped_target && $mapped_target == $target_tr->id ) {
831 unless ($found_mapped) {
832 # PENALTY: The exon appears to belong to a transcript that has not
834 $matrix->set_score( $entry->source(), $entry->target(),
835 0.9*$entry->score() );
838 } ## end
foreach my $entry ( @{ $matrix...})
840 $self->logger->debug(
"Scored exons in non-mapped transcripts: $i\n",
842 } ## end sub non_mapped_transcript_rescore