3 See the NOTICE file distributed with
this work
for additional information
4 regarding copyright ownership.
6 Licensed under the Apache License, Version 2.0 (the
"License");
7 you may not use
this file except in compliance with the License.
8 You may obtain a copy of the License at
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an
"AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License
for the specific language governing permissions and
16 limitations under the License.
23 Please email comments or questions to the
public Ensembl
24 developers list at <http:
26 Questions may also be sent to the Ensembl help desk at
34 regions across the genome
38 # build the SyntenyFramework from unambiguous gene mappings
40 -DUMP_PATH => $dump_path,
41 -CACHE_FILE =>
'synteny_framework.ser',
42 -LOGGER => $self->logger,
44 -CACHE => $self->cache,
48 # use it to rescore the genes
49 $gene_scores = $sf->rescore_gene_matrix_lsf($gene_scores);
54 locations very analoguous to the information in the assembly table (the
55 locations dont have to be the same length though). They are built from
56 genes that
map uniquely between source and target.
59 pairs to determine whether they are similar. This process is slow (it
60 involves testing all gene pairs against all SyntenyRegions),
this module
61 therefor has built-in support to
run the process in parallel via LSF.
69 get_all_SyntenyRegions
70 rescore_gene_matrix_lsf
78 package Bio::EnsEMBL::IdMapping::SyntenyFramework;
82 no warnings
'uninitialized';
102 Arg [DUMP_PATH] : String - path
for object serialisation
103 Arg [CACHE_FILE] : String - filename of serialised
object
105 -DUMP_PATH => $dump_path,
106 -CACHE_FILE =>
'synteny_framework.ser',
107 -LOGGER => $self->logger,
108 -CONF => $self->conf,
109 -CACHE => $self->cache,
111 Description : Constructor.
113 Exceptions : thrown on wrong or missing arguments
122 my $class = ref($caller) || $caller;
123 my $self = $class->SUPER::new(@_);
125 my ($logger, $conf, $cache) = rearrange([
'LOGGER',
'CONF',
'CACHE'], @_);
127 unless ($logger and ref($logger) and
128 $logger->isa(
'Bio::EnsEMBL::Utils::Logger')) {
129 throw(
"You must provide a Bio::EnsEMBL::Utils::Logger for logging.");
132 unless ($conf and ref($conf) and
133 $conf->isa(
'Bio::EnsEMBL::Utils::ConfParser')) {
134 throw(
"You must provide configuration as a Bio::EnsEMBL::Utils::ConfParser object.");
137 unless ($cache and ref($cache) and
138 $cache->isa(
'Bio::EnsEMBL::IdMapping::Cache')) {
139 throw(
"You must provide configuration as a Bio::EnsEMBL::IdMapping::Cache object.");
143 $self->logger($logger);
145 $self->cache($cache);
146 $self->{
'cache'} = [];
158 SyntenyRegions are allowed to overlap. At most two overlapping
159 SyntenyRegions are merged (otherwise we
'd get too large
160 SyntenyRegions with little information content).
162 Exceptions : thrown on wrong or missing argument
163 Caller : InternalIdMapper plugins
171 my $mappings = shift;
173 unless ($mappings and
178 # create a synteny region for each mapping
179 my @synteny_regions = ();
181 foreach my $entry (@{ $mappings->get_all_Entries }) {
183 my $source_gene = $self->cache->get_by_key('genes_by_id
', 'source
',
185 my $target_gene = $self->cache->get_by_key('genes_by_id
', 'target
',
188 my $sr = Bio::EnsEMBL::IdMapping::SyntenyRegion->new_fast([
191 $source_gene->strand,
192 $source_gene->seq_region_name,
195 $target_gene->strand,
196 $target_gene->seq_region_name,
200 push @synteny_regions, $sr;
203 unless (@synteny_regions) {
204 $self->logger->warning("No synteny regions could be identified.\n");
208 # sort synteny regions
209 #my @sorted = sort _by_overlap @synteny_regions;
210 my @sorted = reverse sort {
211 $a->source_seq_region_name cmp $b->source_seq_region_name ||
212 $a->source_start <=> $b->source_start ||
213 $a->source_end <=> $b->source_end } @synteny_regions;
215 $self->logger->info("SyntenyRegions before merging: ".scalar(@sorted)."\n");
217 # now create merged regions from overlapping syntenies, but only merge a
218 # maximum of 2 regions (otherwise you end up with large synteny blocks which
219 # won't contain much information in
this context)
221 my $last_sr = shift(@sorted);
223 while (my $sr = shift(@sorted)) {
224 #$self->logger->debug("this ".$sr->to_string."\n");
226 my $merged_sr = $last_sr->merge($sr);
229 unless ($last_merged) {
230 $self->add_SyntenyRegion($last_sr->stretch(2));
231 #$self->logger->debug("nnn ".$last_sr->to_string."\n");
235 $self->add_SyntenyRegion($merged_sr->stretch(2));
236 #$self->logger->debug("mmm ".$merged_sr->to_string."\n");
243 # deal with last synteny region in @sorted
244 unless ($last_merged) {
245 $self->add_SyntenyRegion($last_sr->stretch(2));
249 #foreach my $sr (@{ $self->get_all_SyntenyRegions }) {
250 # $self->logger->debug("SRs ".$sr->to_string."\n");
253 $self->logger->info(
"SyntenyRegions after merging: ".scalar(@{ $self->get_all_SyntenyRegions }).
"\n");
259 # sort SyntenyRegions by overlap
262 # first sort by seq_region
263 my $retval = ($b->source_seq_region_name cmp $a->source_seq_region_name);
264 return $retval
if ($retval);
266 # then sort by overlap:
267 # return -1 if $a is downstream, 1 if it's upstream, 0 if they overlap
268 if ($a->source_end < $b->source_start) {
return 1; }
269 if ($a->source_start < $b->source_end) {
return -1; }
274 =head2 add_SyntenyRegion
276 Arg[1] : Bio::EnsEMBL::IdMaping::SyntenyRegion - SyntenyRegion to add
277 Example : $synteny_framework->add_SyntenyRegion($synteny_region);
278 Description : Adds a SyntenyRegion to the framework. For speed reasons (and
279 since
this is an
internal method), no argument check is done.
288 sub add_SyntenyRegion {
289 push @{ $_[0]->{
'cache'} }, $_[1];
293 =head2 get_all_SyntenyRegions
295 Example :
foreach my $sr (@{ $sf->get_all_SyntenyRegions }) {
296 # do something with the SyntenyRegion
298 Description : Get a list of all SyntenyRegions in the framework.
307 sub get_all_SyntenyRegions {
308 return $_[0]->{
'cache'};
312 =head2 rescore_gene_matrix_lsf
314 Arg[1] : Bio::EnsEMBL::IdMapping::ScoredmappingMatrix $matrix - gene
316 Example : my $new_scores = $sf->rescore_gene_matrix_lsf($gene_scores);
317 Description : This method runs rescore_gene_matrix() (via the
318 synteny_resocre.pl script) in parallel with lsf, then combines
319 the results to
return a single rescored scoring matrix.
320 Parallelisation is done by chunking the scoring matrix into
321 several pieces (determined by the --synteny_rescore_jobs
322 configuration option).
324 Exceptions : thrown on wrong or missing argument
325 thrown on filesystem I/O error
326 thrown on failure of one or mor lsf jobs
327 Caller : InternalIdMapper plugins
333 sub rescore_gene_matrix_lsf {
338 $matrix->isa(
'Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
339 throw(
'Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
342 # serialise SyntenyFramework to disk
343 $self->logger->debug(
"Serialising SyntenyFramework...\n", 0,
'stamped');
345 $self->logger->debug(
"Done.\n", 0,
'stamped');
347 # split the ScoredMappingMatrix into chunks and write to disk
348 my $matrix_size = $matrix->size;
349 $self->logger->debug(
"Scores before rescoring: $matrix_size.\n");
351 my $num_jobs = $self->conf->param(
'synteny_rescore_jobs') || 20;
354 my $dump_path = path_append($self->conf->param(
'basedir'),
355 'matrix/synteny_rescore');
357 $self->logger->debug(
"Creating sub-matrices...\n", 0,
'stamped');
358 foreach my $i (1..$num_jobs) {
359 my $start = (int($matrix_size/($num_jobs-1)) * ($i - 1)) + 1;
360 my $end = int($matrix_size/($num_jobs-1)) * $i;
361 $self->logger->debug(
"$start-$end\n", 1);
362 my $sub_matrix = $matrix->sub_matrix($start, $end);
364 $sub_matrix->cache_file_name(
"gene_matrix_synteny$i.ser");
365 $sub_matrix->dump_path($dump_path);
367 $sub_matrix->write_to_file;
369 $self->logger->debug(
"Done.\n", 0,
'stamped');
371 # create an empty lsf log directory
372 my $logpath = path_append($self->logger->logpath,
'synteny_rescore');
373 system(
"rm -rf $logpath") == 0 or
374 $self->logger->error(
"Unable to delete lsf log dir $logpath: $!\n");
375 system(
"mkdir -p $logpath") == 0 or
376 $self->logger->error(
"Can't create lsf log dir $logpath: $!\n");
379 my $lsf_name =
'idmapping_synteny_rescore_'.time;
381 my $options = $self->conf->create_commandline_options(
383 logautobase =>
"synteny_rescore",
389 my $cmd = qq{$Bin/synteny_rescore.pl $options --index \$LSB_JOBINDEX};
392 sprintf(
"|bsub -J '%s[1-%d]' "
393 .
"-o %s/synteny_rescore.%%I.out "
394 .
"-e %s/synteny_rescore.%%I.err %s",
395 $lsf_name, $num_jobs, $logpath, $logpath,
396 $self->conf()->param(
'lsf_opt_synteny_rescore') );
399 $self->logger->info(
"Submitting $num_jobs jobs to lsf.\n");
400 $self->logger->debug(
"$cmd\n\n");
403 open( BSUB, $bsub_cmd )
## no critic
404 or $self->logger->error(
"Could not open open pipe to bsub: $!\n");
407 $self->logger->error(
"Error submitting synteny rescoring jobs: $!\n")
411 # submit dependent job to monitor finishing of jobs
412 $self->logger->info(
"Waiting for jobs to finish...\n", 0,
'stamped');
415 qq{bsub -K -w
"ended($lsf_name)" -q production } .
416 qq{-M 1000 -R
'select[mem>1000]' -R
'rusage[mem=1000]' } .
417 qq{-o $logpath/synteny_rescore_depend.out /bin/
true};
419 system($dependent_job) == 0 or
420 $self->logger->error(
"Error submitting dependent job: $!\n");
422 $self->logger->info(
"All jobs finished.\n", 0,
'stamped');
424 # check for lsf errors
427 foreach my $i (1..$num_jobs) {
428 $err++ unless (-e
"$logpath/synteny_rescore.$i.success");
432 $self->logger->error(
"At least one of your jobs failed.\nPlease check the logfiles at $logpath for errors.\n");
435 # merge and return matrix
436 $self->logger->debug(
"Merging rescored matrices...\n");
439 foreach my $i (1..$num_jobs) {
440 # read partial matrix created by lsf job from file
442 -DUMP_PATH => $dump_path,
443 -CACHE_FILE =>
"gene_matrix_synteny$i.ser",
447 # merge with main matrix
448 $matrix->merge($sub_matrix);
451 $self->logger->debug(
"Done.\n");
452 $self->logger->debug(
"Scores after rescoring: ".$matrix->size.
".\n");
460 =head2 rescore_gene_matrix
462 Arg[1] : Bio::EnsEMBL::IdMapping::ScoredmappingMatrix $matrix - gene
464 Example : my $new_scores = $sf->rescore_gene_matrix($gene_scores);
465 Description : Rescores a gene matrix. Retains 70% of old score and builds
466 other 30% from the synteny match.
468 Exceptions : thrown on wrong or missing argument
469 Caller : InternalIdMapper plugins
475 sub rescore_gene_matrix {
480 $matrix->isa(
'Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
481 throw(
'Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
484 my $retain_factor = 0.7;
486 foreach my $entry (@{ $matrix->get_all_Entries }) {
487 my $source_gene = $self->cache->get_by_key(
'genes_by_id',
'source',
490 my $target_gene = $self->cache->get_by_key(
'genes_by_id',
'target',
493 my $highest_score = 0;
495 foreach my $sr (@{ $self->get_all_SyntenyRegions }) {
496 my $score = $sr->score_location_relationship($source_gene, $target_gene);
497 $highest_score = $score
if ($score > $highest_score);
500 #$self->logger->debug("highscore ".$entry->to_string." ".
501 # sprintf("%.6f\n", $highest_score));
503 $matrix->
set_score($entry->source, $entry->target,
504 ($entry->score * 0.7 + $highest_score * 0.3));
514 Example : $object->logger->
info(
"Starting ID mapping.\n");
515 Description : Getter/setter
for logger
object
526 $self->{
'_logger'} = shift
if (@_);
527 return $self->{
'_logger'};
535 Example : my $basedir = $object->conf->
param(
'basedir');
536 Description : Getter/setter
for configuration
object
547 $self->{
'_conf'} = shift
if (@_);
548 return $self->{
'_conf'};
556 Description : Getter/setter
for cache
object
567 $self->{
'_cache'} = shift
if (@_);
568 return $self->{
'_cache'};