ensembl-hive  2.8.1
SyntenyRegion.pm
Go to the documentation of this file.
1 =head1 LICENSE
2 
3 See the NOTICE file distributed with this work for additional information
4 regarding copyright ownership.
5 
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9 
10  http://www.apache.org/licenses/LICENSE-2.0
11 
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 
18 =cut
19 
20 
21 =head1 CONTACT
22 
23  Please email comments or questions to the public Ensembl
24  developers list at <http://lists.ensembl.org/mailman/listinfo/dev>.
25 
26  Questions may also be sent to the Ensembl help desk at
27  <http://www.ensembl.org/Help/Contact>.
28 
29 =cut
30 
31 =head1 NAME
32 
33 Bio::EnsEMBL::IdMapping::SyntenyRegion - object representing syntenic regions
34 
35 =head1 SYNOPSIS
36 
37  # create a new SyntenyRegion from a source and a target gene
39  $source_gene->start, $source_gene->end,
40  $source_gene->strand, $source_gene->seq_region_name,
41  $target_gene->start, $target_gene->end,
42  $target_gene->strand, $target_gene->seq_region_name,
43  $entry->score,
44  ] );
45 
46  # merge with another SyntenyRegion
47  my $merged_sr = $sr->merge($sr1);
48 
49  # score a gene pair against this SyntenyRegion
50  my $score =
51  $sr->score_location_relationship( $source_gene1, $target_gene1 );
52 
53 =head1 DESCRIPTION
54 
55 This object represents a synteny between a source and a target location.
56 SyntenyRegions are built from mapped genes, and the their score is
57 defined as the score of the gene mapping. For merged SyntenyRegions,
58 scores are combined.
59 
60 =head1 METHODS
61 
62  new_fast
63  source_start
64  source_end
65  source_strand
66  source_seq_region_name
67  target_start
68  target_end
69  target_strand
70  target_seq_region_name
71  score
72  merge
73  stretch
74  score_location_relationship
75  to_string
76 
77 =cut
78 
79 package Bio::EnsEMBL::IdMapping::SyntenyRegion;
80 
81 use strict;
82 use warnings;
83 no warnings 'uninitialized';
84 
85 use Bio::EnsEMBL::Utils::Exception qw(throw warning);
86 
87 
88 =head2 new_fast
89 
90  Arg[1] : Arrayref $array_ref - the arrayref to bless into the
91  SyntenyRegion object
93  ]);
94  Description : Constructor. On instantiation, source and target regions are
95  reverse complemented so that source is always on forward strand.
96  Return type : a Bio::EnsEMBL::IdMapping::SyntenyRegion object
97  Exceptions : none
99  Status : At Risk
100  : under development
101 
102 =cut
103 
104 sub new_fast {
105  my $class = shift;
106  my $array_ref = shift;
107 
108  # reverse complement source and target so that source is always on forward
109  # strand; this will make merging and other comparison operations easier
110  # at later stages
111  if ($array_ref->[2] == -1) {
112  $array_ref->[2] = 1;
113  $array_ref->[6] = -1 * $array_ref->[6];
114  }
115 
116  return bless $array_ref, $class;
117 }
118 
119 
120 =head2 source_start
121 
122  Arg[1] : (optional) Int - source location start coordinate
123  Description : Getter/setter for source location start coordinate.
124  Return type : Int
125  Exceptions : none
127  Status : At Risk
128  : under development
129 
130 =cut
131 
132 sub source_start {
133  my $self = shift;
134  $self->[0] = shift if (@_);
135  return $self->[0];
136 }
137 
138 
139 =head2 source_end
140 
141  Arg[1] : (optional) Int - source location end coordinate
142  Description : Getter/setter for source location end coordinate.
143  Return type : Int
144  Exceptions : none
146  Status : At Risk
147  : under development
148 
149 =cut
150 
151 sub source_end {
152  my $self = shift;
153  $self->[1] = shift if (@_);
154  return $self->[1];
155 }
156 
157 
158 =head2 source_strand
159 
160  Arg[1] : (optional) Int - source location strand
161  Description : Getter/setter for source location strand.
162  Return type : Int
163  Exceptions : none
165  Status : At Risk
166  : under development
167 
168 =cut
169 
170 sub source_strand {
171  my $self = shift;
172  $self->[2] = shift if (@_);
173  return $self->[2];
174 }
175 
176 
177 =head2 source_seq_region_name
178 
179  Arg[1] : (optional) String - source location seq_region name
180  Description : Getter/setter for source location seq_region name.
181  Return type : String
182  Exceptions : none
184  Status : At Risk
185  : under development
186 
187 =cut
188 
189 sub source_seq_region_name {
190  my $self = shift;
191  $self->[3] = shift if (@_);
192  return $self->[3];
193 }
194 
195 
196 =head2 target_start
197 
198  Arg[1] : (optional) Int - target location start coordinate
199  Description : Getter/setter for target location start coordinate.
200  Return type : Int
201  Exceptions : none
203  Status : At Risk
204  : under development
205 
206 =cut
207 
208 sub target_start {
209  my $self = shift;
210  $self->[4] = shift if (@_);
211  return $self->[4];
212 }
213 
214 
215 =head2 target_end
216 
217  Arg[1] : (optional) Int - target location end coordinate
218  Description : Getter/setter for target location end coordinate.
219  Return type : Int
220  Exceptions : none
222  Status : At Risk
223  : under development
224 
225 =cut
226 
227 sub target_end {
228  my $self = shift;
229  $self->[5] = shift if (@_);
230  return $self->[5];
231 }
232 
233 
234 =head2 target_strand
235 
236  Arg[1] : (optional) Int - target location strand
237  Description : Getter/setter for target location strand.
238  Return type : Int
239  Exceptions : none
241  Status : At Risk
242  : under development
243 
244 =cut
245 
246 sub target_strand {
247  my $self = shift;
248  $self->[6] = shift if (@_);
249  return $self->[6];
250 }
251 
252 
253 =head2 target_seq_region_name
254 
255  Arg[1] : (optional) String - target location seq_region name
256  Description : Getter/setter for target location seq_region name.
257  Return type : String
258  Exceptions : none
260  Status : At Risk
261  : under development
262 
263 =cut
264 
265 sub target_seq_region_name {
266  my $self = shift;
267  $self->[7] = shift if (@_);
268  return $self->[7];
269 }
270 
271 
272 =head2 score
273 
274  Arg[1] : (optional) Float - score
275  Description : Getter/setter for the score between source and target location.
276  Return type : Int
277  Exceptions : none
279  Status : At Risk
280  : under development
281 
282 =cut
283 
284 sub score {
285  my $self = shift;
286  $self->[8] = shift if (@_);
287  return $self->[8];
288 }
289 
290 
291 =head2 merge
292 
293  Arg[1] : Bio::EnsEMBL::IdMapping::SyntenyRegion $sr - another
294  SyntenyRegion
295  Example : $merged_sr = $sr->merge($other_sr);
296  Description : Merges two overlapping SyntenyRegions if they meet certain
297  criteria (see documentation in the code for details). Score is
298  calculated as a combined distance score. If the two
299  SyntenyRegions aren't mergeable, this method returns undef.
300  Return type : Bio::EnsEMBL::IdMapping::SyntenyRegion or undef
301  Exceptions : warns on bad scores
302  Caller : Bio::EnsEMBL::IdMapping::SyntenyFramework
303  Status : At Risk
304  : under development
305 
306 =cut
307 
308 sub merge {
309  my ($self, $sr) = @_;
310 
311  # must be on same seq_region
312  if ($self->source_seq_region_name ne $sr->source_seq_region_name or
313  $self->target_seq_region_name ne $sr->target_seq_region_name) {
314  return 0;
315  }
316 
317  # target must be on same strand
318  return 0 unless ($self->target_strand == $sr->target_strand);
319 
320  # find the distance of source and target pair and compare
321  my $source_dist = $sr->source_start - $self->source_start;
322  my $target_dist;
323  if ($self->target_strand == 1) {
324  $target_dist = $sr->target_start - $self->target_start;
325  } else {
326  $target_dist = $self->target_end - $sr->target_end;
327  }
328 
329  # prevent division by zero error
330  if ($source_dist == 0 or $target_dist == 0) {
331  warn("WARNING: source_dist ($source_dist) and/or target_dist ($target_dist) is zero.\n");
332  return 0;
333  }
334 
335  # calculate a distance score
336  my $dist = $source_dist - $target_dist;
337  $dist = -$dist if ($dist < 0);
338  my $d1 = $dist/$source_dist;
339  $d1 = -$d1 if ($d1 < 0);
340  my $d2 = $dist/$target_dist;
341  $d2 = -$d2 if ($d2 < 0);
342  my $dist_score = 1 - $d1 - $d2;
343 
344  # distance score must be more than 50%
345  return 0 if ($dist_score < 0.5);
346 
347  my $new_score = $dist_score * ($sr->score + $self->score)/2;
348 
349  if ($new_score > 1) {
350  warn("WARNING: Bad merge score: $new_score\n");
351  }
352 
353  # extend SyntenyRegion to cover both sources and targets, set merged score
354  # and return
355  if ($sr->source_start < $self->source_start) {
356  $self->source_start($sr->source_start);
357  }
358  if ($sr->source_end > $self->source_end) {
359  $self->source_end($sr->source_end);
360  }
361 
362  if ($sr->target_start < $self->target_start) {
363  $self->target_start($sr->target_start);
364  }
365  if ($sr->target_end > $self->target_end) {
366  $self->target_end($sr->target_end);
367  }
368 
369  $self->score($new_score);
370 
371  return $self;
372 }
373 
374 
375 =head2 stretch
376 
377  Arg[1] : Float $factor - stretching factor
378  Example : $stretched_sr = $sr->stretch(2);
379  Description : Extends this SyntenyRegion to span a $factor * $score more area.
380  Return type : Bio::EnsEMBL::IdMapping::SyntenyRegion
381  Exceptions : none
382  Caller : Bio::EnsEMBL::IdMapping::SyntenyFramework
383  Status : At Risk
384  : under development
385 
386 =cut
387 
388 sub stretch {
389  my ($self, $factor) = @_;
390 
391  my $source_adjust = int(($self->source_end - $self->source_start + 1) *
392  $factor * $self->score);
393  $self->source_start($self->source_start - $source_adjust);
394  $self->source_end($self->source_end + $source_adjust);
395  #warn sprintf(" sss %d %d %d\n", $source_adjust, $self->source_start,
396  # $self->source_end);
397 
398  my $target_adjust = int(($self->target_end - $self->target_start + 1) *
399  $factor * $self->score);
400  $self->target_start($self->target_start - $target_adjust);
401  $self->target_end($self->target_end + $target_adjust);
402 
403  return $self;
404 }
405 
406 
407 =head2 score_location_relationship
408 
409  Arg[1] : Bio::EnsEMBL::IdMapping::TinyGene $source_gene - source gene
410  Arg[2] : Bio::EnsEMBL::IdMapping::TinyGene $target_gene - target gene
411  Example : my $score = $sr->score_location_relationship($source_gene,
412  $target_gene);
413  Description : This function calculates how well the given source location
414  interpolates on given target location inside this SyntenyRegion.
415 
416  Scoring is done the following way: Source and target location
417  are normalized with respect to this Regions source and target.
418  Source range will then be somewhere close to 0.0-1.0 and target
419  range anything around that.
420 
421  The extend of the covered area between source and target range
422  is a measurement of how well they agree (smaller extend is
423  better). The extend (actually 2*extend) is reduced by the size
424  of the regions. This will result in 0.0 if they overlap
425  perfectly and bigger values if they dont.
426 
427  This is substracted from 1.0 to give the score. The score is
428  likely to be below zero, but is cut off at 0.0f.
429 
430  Finally, the score is multiplied with the score of the synteny
431  itself.
432  Return type : Float
433  Exceptions : warns if score out of range
434  Caller : Bio::EnsEMBL::IdMapping::SyntenyFramework
435  Status : At Risk
436  : under development
437 
438 =cut
439 
440 
441 
442 sub score_location_relationship {
443  my ($self, $source_gene, $target_gene) = @_;
444 
445  # must be on same seq_region
446  if (($self->source_seq_region_name ne $source_gene->seq_region_name) or
447  ($self->target_seq_region_name ne $target_gene->seq_region_name)) {
448  return 0;
449  }
450 
451  # strand relationship must be the same (use logical XOR to find out)
452  if (($self->source_strand == $source_gene->strand) xor
453  ($self->target_strand == $target_gene->strand)) {
454  return 0;
455  }
456 
457  # normalise source location
458  my $source_rel_start = ($source_gene->start - $self->source_start) /
459  ($self->source_end - $self->source_start + 1);
460 
461  my $source_rel_end = ($source_gene->end - $self->source_start + 1) /
462  ($self->source_end - $self->source_start + 1);
463 
464  #warn " aaa ".$self->to_string."\n";
465  #warn sprintf(" bbb %.6f %.6f\n", $source_rel_start, $source_rel_end);
466 
467  # cut off if the source location is completely outside
468  return 0 if ($source_rel_start > 1.1 or $source_rel_end < -0.1);
469 
470  # normalise target location
471  my ($target_rel_start, $target_rel_end);
472  my $t_length = $self->target_end - $self->target_start + 1;
473 
474  if ($self->target_strand == 1) {
475 
476  $target_rel_start = ($target_gene->start - $self->target_start) / $t_length;
477 
478  $target_rel_end = ($target_gene->end - $self->target_start + 1) / $t_length;
479 
480  } else {
481  $target_rel_start = ($self->target_end - $target_gene->end) / $t_length;
482  $target_rel_end = ($self->target_end - $target_gene->start + 1) / $t_length;
483  }
484 
485  my $added_range = (($target_rel_end > $source_rel_end) ? $target_rel_end :
486  $source_rel_end) -
487  (($target_rel_start < $source_rel_start) ? $target_rel_start :
488  $source_rel_start);
489 
490  my $score = $self->score * (1 - (2 * $added_range - $target_rel_end -
491  $source_rel_end + $target_rel_start + $source_rel_start));
492 
493  #warn " ccc ".sprintf("%.6f:%.6f:%.6f:%.6f:%.6f\n", $added_range,
494  # $source_rel_start, $source_rel_end, $target_rel_start, $target_rel_end);
495 
496  $score = 0 if ($score < 0);
497 
498  # sanity check
499  if ($score > 1) {
500  warn "Out of range score ($score) for ".$source_gene->id.":".
501  $target_gene->id."\n";
502  }
503 
504  return $score;
505 }
506 
507 
508 =head2 to_string
509 
510  Example : print LOG $sr->to_string, "\n";
511  Description : Returns a string representation of the SyntenyRegion object.
512  Useful for debugging and logging.
513  Return type : String
514  Exceptions : none
515  Caller : Bio::EnsEMBL::IdMapping::SyntenyFramework
516  Status : At Risk
517  : under development
518 
519 =cut
520 
521 sub to_string {
522  my $self = shift;
523  return sprintf("%s:%s-%s:%s %s:%s-%s:%s %.6f",
524  $self->source_seq_region_name,
525  $self->source_start,
526  $self->source_end,
527  $self->source_strand,
528  $self->target_seq_region_name,
529  $self->target_start,
530  $self->target_end,
531  $self->target_strand,
532  $self->score
533  );
534 }
535 
536 
537 1;
538 
Bio::EnsEMBL::IdMapping::SyntenyFramework
Definition: SyntenyFramework.pm:41
Bio::EnsEMBL::IdMapping::SyntenyRegion::new_fast
public A new_fast()
Bio::EnsEMBL::IdMapping::SyntenyRegion
Definition: SyntenyRegion.pm:33
Bio::EnsEMBL::Utils::Exception
Definition: Exception.pm:68
Bio::EnsEMBL::IdMapping::SyntenyRegion::merge
public Bio::EnsEMBL::IdMapping::SyntenyRegion merge()