ensembl-hive  2.8.1
AssemblyProjector.pm
Go to the documentation of this file.
1 =head1 LICENSE
2 
3 See the NOTICE file distributed with this work for additional information
4 regarding copyright ownership.
5 
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9 
10  http://www.apache.org/licenses/LICENSE-2.0
11 
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 
18 =cut
19 
20 
21 =head1 CONTACT
22 
23  Please email comments or questions to the public Ensembl
24  developers list at <http://lists.ensembl.org/mailman/listinfo/dev>.
25 
26  Questions may also be sent to the Ensembl help desk at
27  <http://www.ensembl.org/Help/Contact>.
28 
29 =cut
30 
31 =head1 NAME
32 
34 utility class to post-process projections from one assembly to another
35 
36 =head1 SYNOPSIS
37 
38  # connect to an old database
39  my $dba_old = new Bio::EnsEMBL::DBSQL::DBAdaptor(
40  -host => 'ensembldb.ensembl.org',
41  -port => 3306,
42  -user => ensro,
43  -dbname => 'mus_musculus_core_46_36g',
44  -group => 'core_old',
45  );
46 
47  # connect to the new database containing the mapping between old and
48  # new assembly
49  my $dba_new = new Bio::EnsEMBL::DBSQL::DBAdaptor(
50  -host => 'ensembldb.ensembl.org',
51  -port => 3306,
52  -user => ensro,
53  -dbname => 'mus_musculus_core_47_37',
54  -group => 'core_new',
55  );
56 
57  my $assembly_projector = Bio::EnsEMBL::Utils::AssemblyProjector->new(
58  -OLD_ASSEMBLY => 'NCBIM36',
59  -NEW_ASSEMBLY => 'NCBIM37',
60  -ADAPTOR => $dba_new,
61  -EXTERNAL_SOURCE => 1,
62  -MERGE_FRAGMENTS => 1,
63  -CHECK_LENGTH => 0,
64  );
65 
66  # fetch a slice on the old assembly
67  my $slice_adaptor = $dba_old->get_SliceAdaptor;
68  my $slice =
69  $slice_adaptor->fetch_by_region( 'chromosome', 1, undef, undef,
70  undef, 'NCBIM36' );
71 
72  my $new_slice = $assembly_projector->old_to_new($slice);
73 
74  print $new_slice->name, " (", $assembly_projector->last_status, ")\n";
75 
76 =head1 DESCRIPTION
77 
78 This class implements some utility functions for converting coordinates
79 between assemblies. A mapping between the two assemblies has to present
80 the database for this to work, see the 'Related Modules' section below
81 on how to generate the mapping.
82 
83 In addition to the "raw" projecting of features and slices, the methods
84 in this module also apply some sensible rules to the results of the
85 projection (like discarding unwanted results or merging fragmented
86 projections). These are the rules (depending on configuration):
87 
88 Discard the projected feature/slice if:
89 
90  1. it doesn't project at all (no segments returned)
91  2. [unless MERGE_FRAGMENTS is set] the projection is fragmented (more
92  than one segment)
93  3. [if CHECK_LENGTH is set] the projection doesn't have the same
94  length as the original feature/slice
95  4. all segments are on same chromosome and strand
96 
97 If a projection fails any of these rules, undef is returned instead of
98 a projected feature/slice. You can use the last_status() method to find
99 out about the results of the rules tests.
100 
101 Also note that when projecting features, only a shallow projection is
102 performed, i.e. other features attached to your features (e.g. the
103 transcripts of a gene) are not projected automatically, so it will be
104 the responsability of the user code project all levels of features
105 involved.
106 
107 =head1 METHODS
108 
109  new
110  project
111  old_to_new
112  new_to_old
113  adaptor
114  external_source
115  old_assembly
116  new_assembly
117  merge_fragments
118  check_length
119 
120 =head1 RELATED MODULES
121 
122 The process of creating a whole genome alignment between two assemblies
123 (which is the basis for the use of the methods in this class) is done by
124 a series of scripts. Please see
125 
126  ensembl/misc-scripts/assembly/README
127 
128 for a high-level description of this process, and POD in the individual
129 scripts for the details.
130 
131 =cut
132 
134 
135 use strict;
136 use warnings;
137 no warnings qw(uninitialized);
138 
139 use Bio::EnsEMBL::Utils::Exception qw(throw warning);
140 use Bio::EnsEMBL::Utils::Argument qw(rearrange);
142 use Scalar::Util qw(weaken);
143 
144 =head2 new
145 
146  Arg [ADAPTOR] : Bio::EnsEMBL::DBSQL::DBAdaptor $adaptor - a db adaptor
147  for a database containing the assembly mapping
148  Arg [EXTERNAL_SOURCE] : (optional) Boolean $external_source - indicates if
149  source is from a different database
150  Arg [OLD_ASSEMBLY] : name of the old assembly
151  Arg [OLD_ASSEMBLY] : name of the new assembly
152  Arg [OBJECT_TYPE] : (optional) object type ('slice' or 'feature')
153  Arg [MERGE_FRAGMENTS] : (optional) Boolean - determines if segments are merged
154  to return a single object spanning all segments
155  (default: true)
156  Arg [CHECK_LENGTH] : (optional) Boolean - determines if projected objects
157  have to have same length as original (default: false)
158  Example : my $ap = Bio::EnsEMBL::Utils::AssemblyProjector->new(
159  -DBADAPTOR => $dba,
160  -OLD_ASSEMBLY => NCBIM36,
161  -NEW_ASSEMBLY => NCBIM37,
162  );
163  Description : Constructor.
164  Return type : a Bio::EnsEMBL::Utils::AssemblyProjector object
165  Exceptions : thrown on missing arguments
166  thrown on invalid OBJECT_TYPE
167  Caller : general
168  Status : At Risk
169  : under development
170 
171 =cut
172 
173 sub new {
174  my $caller = shift;
175  my $class = ref($caller) || $caller;
176 
177  my ($adaptor, $external_source, $old_assembly, $new_assembly,
178  $merge_fragments, $check_length) = rearrange([qw(ADAPTOR EXTERNAL_SOURCE
179  OLD_ASSEMBLY NEW_ASSEMBLY MERGE_FRAGMENTS CHECK_LENGTH)], @_);
180 
181  unless ($adaptor and ref($adaptor) and
182  $adaptor->isa('Bio::EnsEMBL::DBSQL::DBAdaptor')) {
183  throw("You must provide a DBAdaptor to a database containing the assembly mapping.");
184  }
185 
186  unless ($old_assembly and $new_assembly) {
187  throw("You must provide an old and new assembly name.");
188  }
189 
190  my $self = {};
191  bless ($self, $class);
192 
193  # initialise
194  $self->adaptor($adaptor);
195  $self->{'old_assembly'} = $old_assembly;
196  $self->{'new_assembly'} = $new_assembly;
197 
198  # by default, merge fragments
199  $self->{'merge_fragments'} = $merge_fragments || 1;
200 
201  # by default, do not check length
202  $self->{'check_length'} = $check_length || 0;
203 
204  # by default, features and slices are expected in same database as the
205  # assembly mapping
206  $self->{'external_source'} = $external_source || 0;
207 
208  return $self;
209 }
210 
211 
212 =head2 project
213 
214  Arg[1] : Bio::EnsEMBL::Slice or Bio::EnsEMBL::Feature $object -
215  the object to project
216  Arg[2] : String $to_assembly - assembly to project to
217  Example : my $new_slice = $assembly_projector->project($old_slice,
218  'NCBIM37');
219  Description : Projects a Slice or Feature to the specified assembly.
220 
221  Several tests are performed on the result to discard unwanted
222  results. All projection segments have to be on the same
223  seq_region and strand. If -MERGE_FRAGMENTS is set, gaps will be
224  bridged by creating a single object from first_segment_start to
225  last_segment_end. If -CHECK_LENGTH is set, the projected object
226  will have to have the same length as the original. You can use
227  the last_status() method to find out what the result of some of
228  these rule tests were. Please see the comments in the code for
229  more details about these rules.
230 
231  The return value of this method will always be a single object,
232  or undef if the projection fails any of the rules.
233 
234  Note that when projecting features, only a "shallow" projection
235  is performed, i.e. attached features aren't projected
236  automatically! (e.g. if you project a gene, its transcripts will
237  have to be projected manually before storing the new gene)
238  Return type : same a Arg 1, or undef if projection fails any of the rules
239  Exceptions : thrown on invalid arguments
240  Caller : general, $self->old_to_new, $self->new_to_old
241  Status : At Risk
242  : under development
243 
244 =cut
245 
246 sub project {
247  my ($self, $object, $to_assembly) = @_;
248 
249  throw("Need an assembly version to project to.") unless ($to_assembly);
250  throw("Need an object to project.") unless ($object and ref($object));
251 
252  my ($slice, $object_type);
253 
254  if ($object->isa('Bio::EnsEMBL::Feature')) {
255  $object_type = 'feature';
256  } elsif ($object->isa('Bio::EnsEMBL::Slice')) {
257  $object_type = 'slice';
258  } else {
259  throw("Need a Feature or Slice to project.");
260  }
261 
262  # if the feature or slice is sourced from another db, we have to "transfer"
263  # it to the db that contains the assembly mapping. the transfer is very
264  # shallow but that should do for our purposes
265  if ($self->external_source) {
266  my $slice_adaptor = $self->adaptor->get_SliceAdaptor;
267 
268  if ($object_type eq 'feature') {
269 
270  # createa a new slice from the target db
271  my $f_slice = $object->slice;
272  my $target_slice = $slice_adaptor->fetch_by_name($f_slice->name);
273 
274  # now change the feature so that it appears it's from the target db
275  $object->slice($target_slice);
276 
277  } else {
278 
279  # createa a new slice from the target db
280  $object = $slice_adaptor->fetch_by_name($object->name);
281 
282  }
283  }
284 
285  if ($object_type eq 'feature') {
286  $slice = $object->feature_Slice;
287  } else {
288  $slice = $object;
289  }
290 
291  # warn if trying to project to assembly version the object already is on
292  if ($slice->coord_system->version eq $to_assembly) {
293  warning("Assembly version to project to ($to_assembly) is the same as your object's assembly (".$slice->coord_system->version.").");
294  }
295 
296  # now project the slice
297  my $cs_name = $slice->coord_system_name;
298  my @segments = @{ $slice->project($cs_name, $to_assembly) };
299 
300  # we need to reverse the projection segment list if the orignial
301  if ($slice->strand == -1) {
302  @segments = reverse(@segments);
303  }
304 
305  # apply rules to projection results
306  #
307  # discard the projected feature/slice if
308  # 1. it doesn't project at all (no segments returned)
309  # 2. [unless MERGE_FRAGMENTS is set] the projection is fragmented (more
310  # than one segment)
311  # 3. [if CHECK_LENGTH is set] the projection doesn't have the same length
312  # as the original feature/slice
313  # 4. all segments are on same chromosome and strand
314 
315  # keep track of the status of applied rules
316  my @status = ();
317 
318  # test (1)
319  return undef unless (@segments);
320  #warn "DEBUG: passed test 1\n";
321 
322  # test (2)
323  return undef if (!($self->merge_fragments) and scalar(@segments) > 1);
324  push @status, 'fragmented' if (scalar(@segments) > 1);
325  #warn "DEBUG: passed test 2\n";
326 
327  # test (3)
328  my $first_slice = $segments[0]->to_Slice;
329  my $last_slice = $segments[-1]->to_Slice;
330  my $length_mismatch = (($last_slice->end - $first_slice->start + 1) !=
331  $object->length);
332  return undef if ($self->check_length and $length_mismatch);
333  push @status, 'length_mismatch' if ($length_mismatch);
334  #warn "DEBUG: passed test 3\n";
335 
336  # test (4)
337  my %sr_names = ();
338  my %strands = ();
339  foreach my $seg (@segments) {
340  my $sl = $seg->to_Slice;
341  $sr_names{$sl->seq_region_name}++;
342  $strands{$sl->strand}++;
343  }
344  return undef if (scalar(keys %sr_names) > 1 or scalar(keys %strands) > 1);
345  #warn "DEBUG: passed test 4\n";
346 
347  # remember rule status
348  $self->last_status(join('|', @status));
349 
350  # everything looks fine, so adjust the coords of your feature/slice
351  my $new_slice = $first_slice;
352  $new_slice->{'end'} = $last_slice->end;
353 
354  if ($object_type eq 'slice') {
355  return $new_slice;
356  } else {
357 
358  $object->start($new_slice->start);
359  $object->end($new_slice->end);
360  $object->strand($new_slice->strand);
361  $object->slice($new_slice->seq_region_Slice);
362 
363  # undef dbID and adaptor so you can store the feature in the target db
364  $object->dbID(undef);
365  $object->adaptor(undef);
366 
367  return $object;
368  }
369 
370 }
371 
372 
373 =head2 old_to_new
374 
375  Arg[1] : Bio::EnsEMBL::Slice or Bio::EnsEMBL::Feature $object -
376  the object to project
377  Example : my $new_slice = $assembly_projector->old_to_new($old_slice);
378  Description : Projects a Slice or Feature from old to new assembly.
379  This method is just a convenience wrapper for $self->project.
380  Return type : same a Arg 1, or undef
381  Exceptions : none
382  Caller : general
383  Status : At Risk
384  : under development
385 
386 =cut
387 
388 sub old_to_new {
389  my ($self, $object) = @_;
390  return $self->project($object, $self->new_assembly);
391 }
392 
393 
394 =head2 new_to_old
395 
396  Arg[1] : Bio::EnsEMBL::Slice or Bio::EnsEMBL::Feature $object -
397  the object to project
398  Example : my $old_slice = $assembly_projector->new_to_old($new_slice, 1);
399  Description : Projects a Slice or Feature from new to old assembly.
400  This method is just a convenience wrapper for $self->project.
401  Return type : same a Arg 1, or undef
402  Exceptions : none
403  Caller : general
404  Status : At Risk
405  : under development
406 
407 =cut
408 
409 sub new_to_old {
410  my ($self, $object) = @_;
411  return $self->project($object, $self->old_assembly);
412 }
413 
414 
415 #
416 # accessors
417 #
418 sub adaptor {
419  my $self = shift;
420  weaken($self->{'adaptor'} = shift) if (@_);
421  return $self->{'adaptor'};
422 }
423 
424 
425 sub external_source {
426  my $self = shift;
427  $self->{'external_source'} = shift if (@_);
428  return $self->{'external_source'};
429 }
430 
431 
432 sub old_assembly {
433  my $self = shift;
434  $self->{'old_assembly'} = shift if (@_);
435  return $self->{'old_assembly'};
436 }
437 
438 
439 sub new_assembly {
440  my $self = shift;
441  $self->{'new_assembly'} = shift if (@_);
442  return $self->{'new_assembly'};
443 }
444 
445 
446 sub merge_fragments {
447  my $self = shift;
448  $self->{'merge_fragments'} = shift if (@_);
449  return $self->{'merge_fragments'};
450 }
451 
452 
453 sub check_length {
454  my $self = shift;
455  $self->{'check_length'} = shift if (@_);
456  return $self->{'check_length'};
457 }
458 
459 
460 sub last_status {
461  my $self = shift;
462  $self->{'last_status'} = shift if (@_);
463  return $self->{'last_status'};
464 }
465 
466 
467 1;
468 
469 
Bio::EnsEMBL::DBSQL::DBAdaptor
Definition: DBAdaptor.pm:40
Bio::EnsEMBL::Utils::AssemblyProjector
Definition: AssemblyProjector.pm:82
Bio::EnsEMBL::Feature
Definition: Feature.pm:47
Bio::EnsEMBL::Slice
Definition: Slice.pm:50
Bio::EnsEMBL::Utils::AssemblyProjector::new
public A new()
about
public about()
Bio::EnsEMBL::Feature::project
public Listref project()
Bio::EnsEMBL::Utils::Argument
Definition: Argument.pm:34
Bio::EnsEMBL::Utils::Exception
Definition: Exception.pm:68