ensembl-hive  2.7.0
SyntenyFramework.pm
Go to the documentation of this file.
1 =head1 LICENSE
2 
3 See the NOTICE file distributed with this work for additional information
4 regarding copyright ownership.
5 
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9 
10  http://www.apache.org/licenses/LICENSE-2.0
11 
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 
18 =cut
19 
20 
21 =head1 CONTACT
22 
23  Please email comments or questions to the public Ensembl
24  developers list at <http://lists.ensembl.org/mailman/listinfo/dev>.
25 
26  Questions may also be sent to the Ensembl help desk at
27  <http://www.ensembl.org/Help/Contact>.
28 
29 =cut
30 
31 =head1 NAME
32 
33 Bio::EnsEMBL::IdMapping::SyntenyFramework - framework representing syntenic
34 regions across the genome
35 
36 =head1 SYNOPSIS
37 
38  # build the SyntenyFramework from unambiguous gene mappings
40  -DUMP_PATH => $dump_path,
41  -CACHE_FILE => 'synteny_framework.ser',
42  -LOGGER => $self->logger,
43  -CONF => $self->conf,
44  -CACHE => $self->cache,
45  );
46  $sf->build_synteny($gene_mappings);
47 
48  # use it to rescore the genes
49  $gene_scores = $sf->rescore_gene_matrix_lsf($gene_scores);
50 
51 =head1 DESCRIPTION
52 
53 The SyntenyFramework is a set of SyntenyRegions. These are pairs of
54 locations very analoguous to the information in the assembly table (the
55 locations dont have to be the same length though). They are built from
56 genes that map uniquely between source and target.
57 
58 Once built, the SyntenyFramework is used to score source and target gene
59 pairs to determine whether they are similar. This process is slow (it
60 involves testing all gene pairs against all SyntenyRegions), this module
61 therefor has built-in support to run the process in parallel via LSF.
62 
63 =head1 METHODS
64 
65  new
66  build_synteny
67  _by_overlap
68  add_SyntenyRegion
69  get_all_SyntenyRegions
70  rescore_gene_matrix_lsf
71  rescore_gene_matrix
72  logger
73  conf
74  cache
75 
76 =cut
77 
78 package Bio::EnsEMBL::IdMapping::SyntenyFramework;
79 
80 use strict;
81 use warnings;
82 no warnings 'uninitialized';
83 
86 
87 use Bio::EnsEMBL::Utils::Argument qw(rearrange);
88 use Bio::EnsEMBL::Utils::Exception qw(throw warning);
89 use Bio::EnsEMBL::Utils::ScriptUtils qw(path_append);
92 
93 use FindBin qw($Bin);
94 FindBin->again;
95 
96 
97 =head2 new
98 
99  Arg [LOGGER]: Bio::EnsEMBL::Utils::Logger $logger - a logger object
100  Arg [CONF] : Bio::EnsEMBL::Utils::ConfParser $conf - a configuration object
101  Arg [CACHE] : Bio::EnsEMBL::IdMapping::Cache $cache - a cache object
102  Arg [DUMP_PATH] : String - path for object serialisation
103  Arg [CACHE_FILE] : String - filename of serialised object
105  -DUMP_PATH => $dump_path,
106  -CACHE_FILE => 'synteny_framework.ser',
107  -LOGGER => $self->logger,
108  -CONF => $self->conf,
109  -CACHE => $self->cache,
110  );
111  Description : Constructor.
113  Exceptions : thrown on wrong or missing arguments
114  Caller : InternalIdMapper plugins
115  Status : At Risk
116  : under development
117 
118 =cut
119 
120 sub new {
121  my $caller = shift;
122  my $class = ref($caller) || $caller;
123  my $self = $class->SUPER::new(@_);
124 
125  my ($logger, $conf, $cache) = rearrange(['LOGGER', 'CONF', 'CACHE'], @_);
126 
127  unless ($logger and ref($logger) and
128  $logger->isa('Bio::EnsEMBL::Utils::Logger')) {
129  throw("You must provide a Bio::EnsEMBL::Utils::Logger for logging.");
130  }
131 
132  unless ($conf and ref($conf) and
133  $conf->isa('Bio::EnsEMBL::Utils::ConfParser')) {
134  throw("You must provide configuration as a Bio::EnsEMBL::Utils::ConfParser object.");
135  }
136 
137  unless ($cache and ref($cache) and
138  $cache->isa('Bio::EnsEMBL::IdMapping::Cache')) {
139  throw("You must provide configuration as a Bio::EnsEMBL::IdMapping::Cache object.");
140  }
141 
142  # initialise
143  $self->logger($logger);
144  $self->conf($conf);
145  $self->cache($cache);
146  $self->{'cache'} = [];
147 
148  return $self;
149 }
150 
151 
152 =head2 build_synteny
153 
154  Arg[1] : Bio::EnsEMBL::IdMapping::MappingList $mappings - gene mappings
155  to build the SyntenyFramework from
156  Example : $synteny_framework->build_synteny($gene_mappings);
157  Description : Builds the SyntenyFramework from unambiguous gene mappings.
158  SyntenyRegions are allowed to overlap. At most two overlapping
159  SyntenyRegions are merged (otherwise we'd get too large
160  SyntenyRegions with little information content).
161  Return type : none
162  Exceptions : thrown on wrong or missing argument
163  Caller : InternalIdMapper plugins
164  Status : At Risk
165  : under development
166 
167 =cut
168 
169 sub build_synteny {
170  my $self = shift;
171  my $mappings = shift;
172 
173  unless ($mappings and
174  $mappings->isa('Bio::EnsEMBL::IdMapping::MappingList')) {
175  throw('Need a gene Bio::EnsEMBL::IdMapping::MappingList.');
176  }
177 
178  # create a synteny region for each mapping
179  my @synteny_regions = ();
180 
181  foreach my $entry (@{ $mappings->get_all_Entries }) {
182 
183  my $source_gene = $self->cache->get_by_key('genes_by_id', 'source',
184  $entry->source);
185  my $target_gene = $self->cache->get_by_key('genes_by_id', 'target',
186  $entry->target);
187 
188  my $sr = Bio::EnsEMBL::IdMapping::SyntenyRegion->new_fast([
189  $source_gene->start,
190  $source_gene->end,
191  $source_gene->strand,
192  $source_gene->seq_region_name,
193  $target_gene->start,
194  $target_gene->end,
195  $target_gene->strand,
196  $target_gene->seq_region_name,
197  $entry->score,
198  ]);
199 
200  push @synteny_regions, $sr;
201  }
202 
203  unless (@synteny_regions) {
204  $self->logger->warning("No synteny regions could be identified.\n");
205  return;
206  }
207 
208  # sort synteny regions
209  #my @sorted = sort _by_overlap @synteny_regions;
210  my @sorted = reverse sort {
211  $a->source_seq_region_name cmp $b->source_seq_region_name ||
212  $a->source_start <=> $b->source_start ||
213  $a->source_end <=> $b->source_end } @synteny_regions;
214 
215  $self->logger->info("SyntenyRegions before merging: ".scalar(@sorted)."\n");
216 
217  # now create merged regions from overlapping syntenies, but only merge a
218  # maximum of 2 regions (otherwise you end up with large synteny blocks which
219  # won't contain much information in this context)
220  my $last_merged = 0;
221  my $last_sr = shift(@sorted);
222 
223  while (my $sr = shift(@sorted)) {
224  #$self->logger->debug("this ".$sr->to_string."\n");
225 
226  my $merged_sr = $last_sr->merge($sr);
227 
228  if (! $merged_sr) {
229  unless ($last_merged) {
230  $self->add_SyntenyRegion($last_sr->stretch(2));
231  #$self->logger->debug("nnn ".$last_sr->to_string."\n");
232  }
233  $last_merged = 0;
234  } else {
235  $self->add_SyntenyRegion($merged_sr->stretch(2));
236  #$self->logger->debug("mmm ".$merged_sr->to_string."\n");
237  $last_merged = 1;
238  }
239 
240  $last_sr = $sr;
241  }
242 
243  # deal with last synteny region in @sorted
244  unless ($last_merged) {
245  $self->add_SyntenyRegion($last_sr->stretch(2));
246  $last_merged = 0;
247  }
248 
249  #foreach my $sr (@{ $self->get_all_SyntenyRegions }) {
250  # $self->logger->debug("SRs ".$sr->to_string."\n");
251  #}
252 
253  $self->logger->info("SyntenyRegions after merging: ".scalar(@{ $self->get_all_SyntenyRegions })."\n");
254 
255 }
256 
257 
258 #
259 # sort SyntenyRegions by overlap
260 #
261 sub _by_overlap {
262  # first sort by seq_region
263  my $retval = ($b->source_seq_region_name cmp $a->source_seq_region_name);
264  return $retval if ($retval);
265 
266  # then sort by overlap:
267  # return -1 if $a is downstream, 1 if it's upstream, 0 if they overlap
268  if ($a->source_end < $b->source_start) { return 1; }
269  if ($a->source_start < $b->source_end) { return -1; }
270  return 0;
271 }
272 
273 
274 =head2 add_SyntenyRegion
275 
276  Arg[1] : Bio::EnsEMBL::IdMaping::SyntenyRegion - SyntenyRegion to add
277  Example : $synteny_framework->add_SyntenyRegion($synteny_region);
278  Description : Adds a SyntenyRegion to the framework. For speed reasons (and
279  since this is an internal method), no argument check is done.
280  Return type : none
281  Exceptions : none
282  Caller : internal
283  Status : At Risk
284  : under development
285 
286 =cut
287 
288 sub add_SyntenyRegion {
289  push @{ $_[0]->{'cache'} }, $_[1];
290 }
291 
292 
293 =head2 get_all_SyntenyRegions
294 
295  Example : foreach my $sr (@{ $sf->get_all_SyntenyRegions }) {
296  # do something with the SyntenyRegion
297  }
298  Description : Get a list of all SyntenyRegions in the framework.
299  Return type : Arrayref of Bio::EnsEMBL::IdMapping::SyntenyRegion
300  Exceptions : none
301  Caller : general
302  Status : At Risk
303  : under development
304 
305 =cut
306 
307 sub get_all_SyntenyRegions {
308  return $_[0]->{'cache'};
309 }
310 
311 
312 =head2 rescore_gene_matrix_lsf
313 
314  Arg[1] : Bio::EnsEMBL::IdMapping::ScoredmappingMatrix $matrix - gene
315  scores to rescore
316  Example : my $new_scores = $sf->rescore_gene_matrix_lsf($gene_scores);
317  Description : This method runs rescore_gene_matrix() (via the
318  synteny_resocre.pl script) in parallel with lsf, then combines
319  the results to return a single rescored scoring matrix.
320  Parallelisation is done by chunking the scoring matrix into
321  several pieces (determined by the --synteny_rescore_jobs
322  configuration option).
324  Exceptions : thrown on wrong or missing argument
325  thrown on filesystem I/O error
326  thrown on failure of one or mor lsf jobs
327  Caller : InternalIdMapper plugins
328  Status : At Risk
329  : under development
330 
331 =cut
332 
333 sub rescore_gene_matrix_lsf {
334  my $self = shift;
335  my $matrix = shift;
336 
337  unless ($matrix and
338  $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
339  throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
340  }
341 
342  # serialise SyntenyFramework to disk
343  $self->logger->debug("Serialising SyntenyFramework...\n", 0, 'stamped');
344  $self->write_to_file;
345  $self->logger->debug("Done.\n", 0, 'stamped');
346 
347  # split the ScoredMappingMatrix into chunks and write to disk
348  my $matrix_size = $matrix->size;
349  $self->logger->debug("Scores before rescoring: $matrix_size.\n");
350 
351  my $num_jobs = $self->conf->param('synteny_rescore_jobs') || 20;
352  $num_jobs++;
353 
354  my $dump_path = path_append($self->conf->param('basedir'),
355  'matrix/synteny_rescore');
356 
357  $self->logger->debug("Creating sub-matrices...\n", 0, 'stamped');
358  foreach my $i (1..$num_jobs) {
359  my $start = (int($matrix_size/($num_jobs-1)) * ($i - 1)) + 1;
360  my $end = int($matrix_size/($num_jobs-1)) * $i;
361  $self->logger->debug("$start-$end\n", 1);
362  my $sub_matrix = $matrix->sub_matrix($start, $end);
363 
364  $sub_matrix->cache_file_name("gene_matrix_synteny$i.ser");
365  $sub_matrix->dump_path($dump_path);
366 
367  $sub_matrix->write_to_file;
368  }
369  $self->logger->debug("Done.\n", 0, 'stamped');
370 
371  # create an empty lsf log directory
372  my $logpath = path_append($self->logger->logpath, 'synteny_rescore');
373  system("rm -rf $logpath") == 0 or
374  $self->logger->error("Unable to delete lsf log dir $logpath: $!\n");
375  system("mkdir -p $logpath") == 0 or
376  $self->logger->error("Can't create lsf log dir $logpath: $!\n");
377 
378  # build lsf command
379  my $lsf_name = 'idmapping_synteny_rescore_'.time;
380 
381  my $options = $self->conf->create_commandline_options(
382  logauto => 1,
383  logautobase => "synteny_rescore",
384  logpath => $logpath,
385  interactive => 0,
386  is_component => 1,
387  );
388 
389  my $cmd = qq{$Bin/synteny_rescore.pl $options --index \$LSB_JOBINDEX};
390 
391  my $bsub_cmd =
392  sprintf( "|bsub -J '%s[1-%d]' "
393  . "-o %s/synteny_rescore.%%I.out "
394  . "-e %s/synteny_rescore.%%I.err %s",
395  $lsf_name, $num_jobs, $logpath, $logpath,
396  $self->conf()->param('lsf_opt_synteny_rescore') );
397 
398  # run lsf job array
399  $self->logger->info("Submitting $num_jobs jobs to lsf.\n");
400  $self->logger->debug("$cmd\n\n");
401 
402  local *BSUB;
403  open( BSUB, $bsub_cmd ) ## no critic
404  or $self->logger->error("Could not open open pipe to bsub: $!\n");
405 
406  print BSUB $cmd;
407  $self->logger->error("Error submitting synteny rescoring jobs: $!\n")
408  unless ($? == 0);
409  close BSUB;
410 
411  # submit dependent job to monitor finishing of jobs
412  $self->logger->info("Waiting for jobs to finish...\n", 0, 'stamped');
413 
414  my $dependent_job =
415  qq{bsub -K -w "ended($lsf_name)" -q production } .
416  qq{-M 1000 -R 'select[mem>1000]' -R 'rusage[mem=1000]' } .
417  qq{-o $logpath/synteny_rescore_depend.out /bin/true};
418 
419  system($dependent_job) == 0 or
420  $self->logger->error("Error submitting dependent job: $!\n");
421 
422  $self->logger->info("All jobs finished.\n", 0, 'stamped');
423 
424  # check for lsf errors
425  sleep(5);
426  my $err;
427  foreach my $i (1..$num_jobs) {
428  $err++ unless (-e "$logpath/synteny_rescore.$i.success");
429  }
430 
431  if ($err) {
432  $self->logger->error("At least one of your jobs failed.\nPlease check the logfiles at $logpath for errors.\n");
433  }
434 
435  # merge and return matrix
436  $self->logger->debug("Merging rescored matrices...\n");
437  $matrix->flush;
438 
439  foreach my $i (1..$num_jobs) {
440  # read partial matrix created by lsf job from file
442  -DUMP_PATH => $dump_path,
443  -CACHE_FILE => "gene_matrix_synteny$i.ser",
444  );
445  $sub_matrix->read_from_file;
446 
447  # merge with main matrix
448  $matrix->merge($sub_matrix);
449  }
450 
451  $self->logger->debug("Done.\n");
452  $self->logger->debug("Scores after rescoring: ".$matrix->size.".\n");
453 
454  return $matrix;
455 }
456 
457 
458 #
459 #
460 =head2 rescore_gene_matrix
461 
462  Arg[1] : Bio::EnsEMBL::IdMapping::ScoredmappingMatrix $matrix - gene
463  scores to rescore
464  Example : my $new_scores = $sf->rescore_gene_matrix($gene_scores);
465  Description : Rescores a gene matrix. Retains 70% of old score and builds
466  other 30% from the synteny match.
468  Exceptions : thrown on wrong or missing argument
469  Caller : InternalIdMapper plugins
470  Status : At Risk
471  : under development
472 
473 =cut
474 
475 sub rescore_gene_matrix {
476  my $self = shift;
477  my $matrix = shift;
478 
479  unless ($matrix and
480  $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
481  throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
482  }
483 
484  my $retain_factor = 0.7;
485 
486  foreach my $entry (@{ $matrix->get_all_Entries }) {
487  my $source_gene = $self->cache->get_by_key('genes_by_id', 'source',
488  $entry->source);
489 
490  my $target_gene = $self->cache->get_by_key('genes_by_id', 'target',
491  $entry->target);
492 
493  my $highest_score = 0;
494 
495  foreach my $sr (@{ $self->get_all_SyntenyRegions }) {
496  my $score = $sr->score_location_relationship($source_gene, $target_gene);
497  $highest_score = $score if ($score > $highest_score);
498  }
499 
500  #$self->logger->debug("highscore ".$entry->to_string." ".
501  # sprintf("%.6f\n", $highest_score));
502 
503  $matrix->set_score($entry->source, $entry->target,
504  ($entry->score * 0.7 + $highest_score * 0.3));
505  }
506 
507  return $matrix;
508 }
509 
510 
511 =head2 logger
512 
513  Arg[1] : (optional) Bio::EnsEMBL::Utils::Logger - the logger to set
514  Example : $object->logger->info("Starting ID mapping.\n");
515  Description : Getter/setter for logger object
516  Return type : Bio::EnsEMBL::Utils::Logger
517  Exceptions : none
518  Caller : constructor
519  Status : At Risk
520  : under development
521 
522 =cut
523 
524 sub logger {
525  my $self = shift;
526  $self->{'_logger'} = shift if (@_);
527  return $self->{'_logger'};
528 }
529 
530 
531 =head2 conf
532 
533  Arg[1] : (optional) Bio::EnsEMBL::Utils::ConfParser - the configuration
534  to set
535  Example : my $basedir = $object->conf->param('basedir');
536  Description : Getter/setter for configuration object
537  Return type : Bio::EnsEMBL::Utils::ConfParser
538  Exceptions : none
539  Caller : constructor
540  Status : At Risk
541  : under development
542 
543 =cut
544 
545 sub conf {
546  my $self = shift;
547  $self->{'_conf'} = shift if (@_);
548  return $self->{'_conf'};
549 }
550 
551 
552 =head2 cache
553 
554  Arg[1] : (optional) Bio::EnsEMBL::IdMapping::Cache - the cache to set
555  Example : $object->cache->read_from_file('source');
556  Description : Getter/setter for cache object
557  Return type : Bio::EnsEMBL::IdMapping::Cache
558  Exceptions : none
559  Caller : constructor
560  Status : At Risk
561  : under development
562 
563 =cut
564 
565 sub cache {
566  my $self = shift;
567  $self->{'_cache'} = shift if (@_);
568  return $self->{'_cache'};
569 }
570 
571 
572 1;
573 
map
public map()
Bio::EnsEMBL::Utils::ScriptUtils
Definition: ScriptUtils.pm:11
Bio::EnsEMBL::IdMapping::Cache::read_from_file
public read_from_file()
Bio::EnsEMBL::IdMapping::ScoredMappingMatrix
Definition: ScoredMappingMatrix.pm:44
Bio::EnsEMBL::Utils::ConfParser::param
public Scalar param()
Bio::EnsEMBL::IdMapping::SyntenyFramework
Definition: SyntenyFramework.pm:41
Bio::EnsEMBL::IdMapping::ScoredMappingMatrix::set_score
public Float set_score()
Bio::EnsEMBL::Utils::ConfParser
Definition: ConfParser.pm:41
Bio::EnsEMBL::IdMapping::ScoredMappingMatrix::new
public Bio::EnsEMBL::IdMapping::ScoredMappingMatrix new()
Bio::EnsEMBL::Utils::Logger
Definition: Logger.pm:36
Bio::EnsEMBL::IdMapping::SyntenyFramework::build_synteny
public void build_synteny()
Bio::EnsEMBL::Utils::Logger::info
public info()
Bio::EnsEMBL::IdMapping::SyntenyFramework::new
public Bio::EnsEMBL::IdMapping::SyntenyFramework new()
Bio::EnsEMBL::IdMapping::Serialisable::read_from_file
public Bio::EnsEMBL::IdMapping::Serialisable read_from_file()
Bio::EnsEMBL::IdMapping::Serialisable
Definition: Serialisable.pm:45
Bio::EnsEMBL::IdMapping::Serialisable::write_to_file
public String write_to_file()
Bio::EnsEMBL::IdMapping::Cache
Definition: Cache.pm:18
run
public run()
Bio::EnsEMBL::IdMapping::MappingList
Definition: MappingList.pm:38
Bio::EnsEMBL::IdMapping::SyntenyRegion
Definition: SyntenyRegion.pm:33
Bio::EnsEMBL::IdMapping::InternalIdMapper
Definition: InternalIdMapper.pm:18
Bio::EnsEMBL::Utils::Argument
Definition: Argument.pm:34
Bio::EnsEMBL::Utils::Exception
Definition: Exception.pm:68