ensembl-hive  2.7.0
SeqDumper.pm
Go to the documentation of this file.
1 =head1 LICENSE
2 
3 See the NOTICE file distributed with this work for additional information
4 regarding copyright ownership.
5 
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9 
10  http://www.apache.org/licenses/LICENSE-2.0
11 
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 
18 =cut
19 
20 
21 =head1 CONTACT
22 
23  Please email comments or questions to the public Ensembl
24  developers list at <http://lists.ensembl.org/mailman/listinfo/dev>.
25 
26  Questions may also be sent to the Ensembl help desk at
27  <http://www.ensembl.org/Help/Contact>.
28 
29 =cut
30 
31 =head1 NAME
32 
34 
35 =head1 SYNOPSIS
36 
37  $seq_dumper = Bio::EnsEMBL::Utils::SeqDumper->new();
38 
39  # don't dump snps or repeats
40  $seq_dumper->disable_feature_type('repeat');
41  $seq_dumper->disable_feature_type('variation');
42 
43  # dump EMBL format to STDOUT
44  $seq_dumper->dump( $slice, 'EMBL' );
45 
46  # dump GENBANK format to a file
47  $seq_dumper->dump( $slice, 'GENBANK', 'out.genbank' );
48 
49  # dump FASTA format to a file
50  $seq_dumper->dump( $slice, 'FASTA', 'out.fasta' );
51 
52 =head1 DESCRIPTION
53 
54 A relatively simple and lite-weight flat file dumper for Ensembl slices.
55 The memory efficiency could be improved and this is currently not very
56 good for dumping very large sequences such as whole chromosomes.
57 
58 =head1 METHODS
59 
60 =cut
61 
62 
63 package Bio::EnsEMBL::Utils::SeqDumper;
64 
65 use strict;
66 
67 use IO::File;
68 use Fcntl qw( SEEK_SET SEEK_END );
69 use vars qw(@ISA);
70 
71 use Bio::EnsEMBL::Utils::Exception qw(throw warning);
72 
73 #keys must be uppercase
74 my $DUMP_HANDLERS =
75  { 'FASTA' => \&dump_fasta,
76  'EMBL' => \&dump_embl,
77  'GENBANK' => \&dump_genbank };
78 
79 my @COMMENTS =
80  ('This sequence was annotated by ###SOURCE###. Please visit ' .
81  'the Ensembl or EnsemblGenomes web site, http://www.ensembl.org/ or http://www.ensemblgenomes.org/ for more information.',
82 
83  'All feature locations are relative to the first (5\') base ' .
84  'of the sequence in this file. The sequence presented is '.
85  'always the forward strand of the assembly. Features ' .
86  'that lie outside of the sequence contained in this file ' .
87  'have clonal location coordinates in the format: ' .
88  '<clone accession>.<version>:<start>..<end>',
89 
90  'The /gene indicates a unique id for a gene, /note="transcript_id=..."' .
91  ' a unique id for a transcript, /protein_id a unique id for a peptide ' .
92  'and note="exon_id=..." a unique id for an exon. These ids are ' .
93  'maintained wherever possible between versions.',
94 
95  'All the exons and transcripts in Ensembl are confirmed by ' .
96  'similarity to either protein or cDNA sequences.');
97 
98 =head2 new
99 
100  Arg [1] : none
101  Example : $seq_dumper = Bio::EnsEMBL::Utils::SeqDumper->new;
102  Description: Creates a new SeqDumper
103  Returntype : Bio::EnsEMBL::Utils::SeqDumper
104  Exceptions : none
105  Caller : general
106 
107 =cut
108 
109 sub new {
110  my ($caller, $slice, $params) = @_;
111 
112  my $class = ref($caller) || $caller;
113 
114  my $feature_types = {'gene' => 1,
115  'genscan' => 1,
116  'repeat' => 1,
117  'similarity' => 1,
118  'variation' => 1,
119  'contig' => 1,
120  'marker' => 1,
121  'estgene' => 0,
122  'vegagene' => 0};
123 
124  my $self = bless {'feature_types' => $feature_types}, $class;
125 
126  foreach my $p (sort keys %{$params || {}}) {
127  $self->{$p} = $params->{$p};
128  }
129 
130  # default 60kb buffer
131  exists $self->{'chunk_factor'} and defined $self->{'chunk_factor'}
132  or $self->{'chunk_factor'} = 1000;
133 
134  return $self;
135 }
136 
137 
138 
139 =head2 enable_feature_type
140 
141  Arg [1] : string $type
142  Example : $seq_dumper->enable_feature_type('similarity');
143  Description: Enables the dumping of a specific type of feature
144  Returntype : none
145  Exceptions : warn if invalid feature type is passed,
146  thrown if no feature type is passed
147  Caller : general
148 
149 =cut
150 
151 sub enable_feature_type {
152  my ($self, $type) = @_;
153 
154  $type || throw("type arg is required");
155 
156  if(exists($self->{'feature_types'}->{$type})) {
157  $self->{'feature_types'}->{$type} = 1;
158  } else {
159  warning("unknown feature type '$type'\n" .
160  "valid types are: " . join(',', keys %{$self->{'feature_types'}}));
161  }
162 }
163 
164 
165 
166 =head2 attach_database
167 
168  Arg [1] : string name
170  Example : $seq_dumper->attach_database('estgene', $estgene_db);
171  Description: Attaches a database to the seqdumper that can be used to
172  dump data which is external to the ensembl core database.
173  Currently this is necessary to dump est genes and vega genes
174  Returntype : none
175  Exceptions : thrown if incorrect argument is supplied
176  Caller : general
177 
178 =cut
179 
180 sub attach_database {
181  my ($self, $name, $db) = @_;
182 
183  $name || throw("name arg is required");
184  unless($db && ref($db) && $db->isa('Bio::EnsEMBL::DBSQL::DBAdaptor')) {
185  throw("db arg must be a Bio::EnsEMBL::DBSQL::DBAdaptor not a [$db]");
186  }
187 
188  $self->{'attached_dbs'}->{$name} = $db;
189 }
190 
191 
192 
193 =head2 get_database
194 
195  Arg [1] : string $name
196  Example : $db = $seq_dumper->get_database('vega');
197  Description: Retrieves a database that has been attached to the
198  seqdumper via the attach database call.
199  Returntype : Bio::EnsEMBL::DBSQL::DBAdaptor
200  Exceptions : thrown if incorrect argument is supplied
201  Caller : dump_feature_table
202 
203 =cut
204 
205 sub get_database {
206  my ($self, $name) = @_;
207 
208  $name || throw("name arg is required");
209 
210  return $self->{'attached_dbs'}->{$name};
211 }
212 
213 
214 
215 =head2 remove_database
216 
217  Arg [1] : string $name
218  Example : $db = $seq_dumper->remove_database('estgene');
219  Description: Removes a database that has been attached to the seqdumper
220  via the attach database call. The database that is removed
221  is returned (or undef if it did not exist).
222  Returntype : Bio::EnsEMBL::DBSQL::DBAdaptor
223  Exceptions : thrown if incorrect argument is supplied
224  Caller : general
225 
226 =cut
227 
228 sub remove_database {
229  my ($self, $name) = @_;
230 
231  $name || throw("name arg is required");
232 
233  if(exists $self->{'attached_dbs'}->{$name}) {
234  return delete $self->{'attached_dbs'}->{$name};
235  }
236 
237  return undef;
238 }
239 
240 
241 =head2 disable_feature_type
242 
243  Arg [1] : string $type
244  Example : $seq_dumper->disable_feature_type('genes');
245  Description: Disables the dumping of a specific type of feature
246  Returntype : none
247  Exceptions : warn if an invalid feature type is passed,
248  thrown if no feature type is passed
249  Caller : general
250 
251 =cut
252 
253 sub disable_feature_type {
254  my ($self, $type) = @_;
255 
256  $type || throw("type arg is required");
257 
258  if(exists($self->{'feature_types'}->{$type})) {
259  $self->{'feature_types'}->{$type} = 0;
260  } else {
261  warning("unknown feature type '$type'\n" .
262  "valid types are: " . join(',', keys %{$self->{'feature_types'}}));
263  }
264 }
265 
266 
267 
268 =head2 is_enabled
269 
270  Arg [1] : string $type
271  Example : do_something() if($seq_dumper->is_enabled('gene'));
272  Description: checks if a specific feature type is enabled
273  Returntype : none
274  Exceptions : warning if invalid type is passed,
275  thrown if no type is passed
276  Caller : general
277 
278 =cut
279 
280 sub is_enabled {
281  my ($self, $type) = @_;
282 
283  $type || throw("type arg is required");
284 
285  if(exists($self->{'feature_types'}->{$type})) {
286  return $self->{'feature_types'}->{$type};
287  } else {
288  warning("unknown feature type '$type'\n" .
289  "valid types are: " . join(',', keys %{$self->{'feature_types'}}));
290  }
291 }
292 
293 
294 =head2 dump
295 
296  Arg [1] : Bio::EnsEMBL::Slice slice
297  The slice to dump
298  Arg [2] : string $format
299  The name of the format to dump
300  Arg [3] : (optional) $outfile
301  The name of the file to dump to. If no file is specified STDOUT
302  is used
303  Arg [4] : (optional) $seq
304  Sequence to dump
305  Arg [4] : (optional) $no_append
306  Default action is to open the file in append mode. This will
307  turn that mode off
308  Example : $seq_dumper->dump($slice, 'EMBL');
309  Description: Dumps a region of a genome specified by the slice argument into
310  an outfile of the format $format
311  Returntype : none
312  Exceptions : thrown if slice or format args are not supplied
313  Caller : general
314 
315 =cut
316 
317 
318 sub dump {
319  my ($self, $slice, $format, $outfile, $seq, $no_append) = @_;
320 
321  $format || throw("format arg is required");
322  $slice || throw("slice arg is required");
323 
324  my $dump_handler = $DUMP_HANDLERS->{uc($format)};
325 
326  unless($dump_handler) {
327  throw("No dump handler is defined for format $format\n");
328  }
329 
330 
331  my $FH = IO::File->new;;
332  if($outfile) {
333  my $mode = ($no_append) ? '>' : '>>';
334  $FH->open("${mode}${outfile}") or throw("Could not open file $outfile: $!");
335  } else {
336  $FH = \*STDOUT;
337  #mod_perl did not like the following
338  #$FH->fdopen(fileno(STDOUT), "w")
339  # or throw("Could not open currently selected output filehandle " .
340  # "for writing");
341  }
342 
343  &$dump_handler($self, $slice, $FH, $seq);
344 
345  $FH->close if ($outfile); #close if we were writing to a file
346 }
347 
348 =head2 dump_embl
349 
350  Arg [1] : Bio::EnsEMBL::Slice
351  Arg [2] : IO::File $FH
352  Arg [3] : optional sequence string
353  Example : $seq_dumper->dump_embl($slice, $FH);
354  Description: Dumps an EMBL flat file to an open file handle
355  Returntype : none
356  Exceptions : if calls to get/set the filehandle position fail
357  Caller : dump
358 
359 =cut
360 
361 sub dump_embl {
362  my $self = shift;
363  my $slice = shift;
364  my $FH = shift;
365 
366  my $len = $slice->length;
367 
368  my $version;
369  my $acc;
370 
371  my $cs = $slice->coord_system();
372  my $name_str = $cs->name() . ' ' . $slice->seq_region_name();
373  $name_str .= ' ' . $cs->version if($cs->version);
374 
375  my $start = $slice->start;
376  my $end = $slice->end;
377 
378  #determine if this slice is the entire seq region
379  #if it is then we just use the name as the id
380  my $slice_adaptor = $slice->adaptor();
381  my $full_slice =
382  $slice->adaptor->fetch_by_region($cs->name,
383  $slice->seq_region_name,
384  undef,undef,undef,
385  $cs->version);
386 
387  my $entry_name = $slice->seq_region_name();
388 
389  if($full_slice->name eq $slice->name) {
390  $name_str .= ' full sequence';
391  $acc = $slice->seq_region_name();
392  my @acc_ver = split(/\./, $acc);
393  if(@acc_ver == 2) {
394  $acc = $acc_ver[0];
395  $version = $acc_ver[0] . '.'. $acc_ver[1];
396  } elsif(@acc_ver == 1 && $cs->version()) {
397  $version = $acc . '.'. $cs->version();
398  } else {
399  $version = $acc;
400  }
401  } else {
402  $name_str .= ' partial sequence';
403  $acc = $slice->name();
404  $version = $acc;
405  }
406 
407  $acc = $slice->name();
408 
409  #line breaks are allowed near the end of the line on ' ', "\t", "\n", ','
410  $: = (" \t\n-,");
411 
412  #############
413  # dump header
414  #############
415 
416  # move at the end of file in case the file
417  # is open more than once (i.e. human chromosome Y,
418  # two chromosome slices
419  #
420  # WARNING
421  #
422  # When the file is open not for the first time,
423  # it must be in read/write mode
424  #
425  seek($FH, 0, SEEK_END);
426 
427  my $EMBL_HEADER =
428 '@< ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<~
429 ';
430 
431  #ID and moltype
432  # HTG = High Throughput Genome division, probably most suitable
433  # and it would be hard to come up with another appropriate division
434  # that worked for all organisms (e.g. plants are in PLN but human is
435  # in HUM).
436  my $VALUE = "$entry_name standard; DNA; HTG; $len BP.";
437  $self->write($FH, $EMBL_HEADER, 'ID', $VALUE);
438  $self->print( $FH, "XX\n" );
439 
440  #Accession
441  $self->write($FH, $EMBL_HEADER, 'AC', $acc);
442  $self->print( $FH, "XX\n" );
443 
444  #Version
445  $self->write($FH, $EMBL_HEADER, 'SV', $version);
446  $self->print( $FH, "XX\n" );
447 
448  #Date
449  $self->write($FH, $EMBL_HEADER, 'DT', $self->_date_string);
450  $self->print( $FH, "XX\n" );
451 
452  my $meta_container = $slice->adaptor()->db()->get_MetaContainer();
453 
454  #Description
455  $self->write($FH, $EMBL_HEADER, 'DE', $meta_container->get_scientific_name() .
456  " $name_str $start..$end annotated by Ensembl");
457  $self->print( $FH, "XX\n" );
458 
459  #key words
460  $self->write($FH, $EMBL_HEADER, 'KW', '.');
461  $self->print( $FH, "XX\n" );
462 
463  #Species
464  my $species_name = $meta_container->get_scientific_name();
465  if(my $cn = $meta_container->get_common_name()) {
466  $species_name .= " ($cn)";
467  }
468 
469  $self->write($FH, $EMBL_HEADER, 'OS', $species_name);
470 
471  #Classification
472  my @cls = @{$meta_container->get_classification()};
473  $self->write($FH, $EMBL_HEADER, 'OC', join('; ', reverse(@cls)) . '.');
474  $self->print( $FH, "XX\n" );
475 
476  #References (we are not dumping refereneces)
477  #Database References (we are not dumping these)
478 
479  #comments
480  my $annotation_source = $self->annotation_source($meta_container);
481  foreach my $comment (@COMMENTS) {
482  $comment =~ s/\#\#\#SOURCE\#\#\#/$annotation_source/;
483  $self->write($FH, $EMBL_HEADER, 'CC', $comment);
484  $self->print( $FH, "XX\n" );
485  }
486 
487  ####################
488  # DUMP FEATURE TABLE
489  ####################
490  $self->print( $FH, "FH Key Location/Qualifiers\n" );
491 
492  my $FEATURE_TABLE =
493 'FT ^<<<<<<<<<<<<<<<^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<~
494 ';
495  $self->_dump_feature_table($slice, $FH, $FEATURE_TABLE);
496 
497  #write an XX after the feature tables
498  $self->print( $FH, "XX\n" );
499 
500  ###################
501  # DUMP SEQUENCE
502  ###################
503 
504  # record position before writing sequence header, so that
505  # after printing the sequence and having the base counts
506  # we can seek to this position and write the proper sequence
507  # header
508  my $sq_offset = tell($FH);
509  $sq_offset == -1 and throw "Unable to get offset for output fh";
510 
511  # print a sequence header template, to be replaced with a real
512  # one containing the base counts
513  $self->print($FH, "SQ Sequence ########## BP; ########## A; ########## C; ########## G; ########## T; ########## other;\n");
514 
515  # dump the sequence and get the base counts
516  my $acgt = $self->write_embl_seq($slice, $FH);
517 
518  # print the end of EMBL entry
519  $self->print( $FH, "//\n" );
520  my $end_of_entry_offset = tell($FH);
521  $end_of_entry_offset == -1 and throw "Unable to get offset for output fh";
522 
523  # seek backwards to the position of the sequence header and
524  # write it with the actual base counts
525  seek($FH, $sq_offset, SEEK_SET)
526  or throw "Cannot seek backward to sequence header position";
527  $self->print($FH, sprintf "SQ Sequence %10d BP; %10d A; %10d C; %10d G; %10d T; %10d other;",
528  $acgt->{tot}, $acgt->{a}, $acgt->{c}, $acgt->{g}, $acgt->{t}, $acgt->{n});
529 
530  # move forward to end of file to dump the next slice
531  seek($FH, $end_of_entry_offset, SEEK_SET)
532  or throw "Cannot seek forward to end of entry";
533 
534  # Set formatting back to normal
535  $: = " \n-";
536 }
537 
538 =head2 dump_genbank
539 
540  Arg [1] : Bio::EnsEMBL::Slice
541  Arg [2] : IO::File $FH
542  Example : $seq_dumper->dump_genbank($slice, $FH);
543  Description: Dumps a GENBANK flat file to an open file handle
544  Returntype : none
545  Exceptions : none
546  Caller : dump
547 
548 =cut
549 
550 sub dump_genbank {
551  my ($self, $slice, $FH) = @_;
552 
553  #line breaks are allowed near the end of the line on ' ', "\t", "\n", ','
554  $: = " \t\n-,";
555 
556  my $GENBANK_HEADER =
557 '^<<<<<<<<< ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<~~
558 ';
559 
560  my $GENBANK_SUBHEADER =
561 ' ^<<<<<<< ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<~~
562 ';
563 
564  my $GENBANK_FT =
565 ' ^<<<<<<<<<<<<<< ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<~~
566 ';
567 
568  my $version;
569  my $acc;
570 
571  my $cs = $slice->coord_system();
572 
573  my $name_str = $cs->name() . ' ' . $slice->seq_region_name();
574 
575  $name_str .= ' ' . $cs->version if($cs->version);
576 
577  #determine if this slice is the entire seq region
578  #if it is then we just use the name as the id
579  my $slice_adaptor = $slice->adaptor();
580  my $full_slice =
581  $slice->adaptor->fetch_by_region($cs->name,
582  $slice->seq_region_name,
583  undef,undef,undef,
584  $cs->version);
585 
586 
587  my $entry_name = $slice->seq_region_name();
588 
589  if($full_slice->name eq $slice->name) {
590  $name_str .= ' full sequence';
591  $acc = $slice->seq_region_name();
592  my @acc_ver = split(/\./, $acc);
593  if(@acc_ver == 2) {
594  $acc = $acc_ver[0];
595  $version = $acc_ver[0] . $acc_ver[1];
596  } elsif(@acc_ver == 1 && $cs->version()) {
597  $version = $acc . $cs->version();
598  } else {
599  $version = $acc;
600  }
601  } else {
602  $name_str .= ' partial sequence';
603  $acc = $slice->name();
604  $version = $acc;
605  }
606 
607  $acc = $slice->name(); # to keep format consistent for all
608 
609  my $length = $slice->length;
610  my $start = $slice->start();
611  my $end = $slice->end();
612 
613  my $date = $self->_date_string;
614 
615  my $meta_container = $slice->adaptor()->db()->get_MetaContainer();
616 
617  # move at the end of file in case the file
618  # is open more than once (i.e. human chromosome Y,
619  # two chromosome slices
620  #
621  # WARNING
622  #
623  # When the file is open not for the first time,
624  # it must be in read/write mode
625  #
626  seek($FH, 0, SEEK_END);
627 
628  #LOCUS
629  my $tag = 'LOCUS';
630  my $value = "$entry_name $length bp DNA HTG $date";
631  $self->write($FH, $GENBANK_HEADER, $tag, $value);
632 
633  #DEFINITION
634  $tag = "DEFINITION";
635  $value = $meta_container->get_scientific_name() .
636  " $name_str $start..$end reannotated via EnsEMBL";
637  $self->write($FH, $GENBANK_HEADER, $tag, $value);
638 
639  #ACCESSION
640  $self->write($FH, $GENBANK_HEADER, 'ACCESSION', $acc);
641 
642  #VERSION
643  $self->write($FH, $GENBANK_HEADER, 'VERSION', $version);
644 
645  # KEYWORDS
646  $self->write($FH, $GENBANK_HEADER, 'KEYWORDS', '.');
647 
648  # SOURCE
649  my $common_name = $meta_container->get_common_name();
650  $common_name = $meta_container->get_scientific_name() unless $common_name;
651  $self->write($FH, $GENBANK_HEADER, 'SOURCE', $common_name);
652 
653  #organism
654  my @cls = @{$meta_container->get_classification()};
655  shift @cls;
656  $self->write($FH, $GENBANK_SUBHEADER, 'ORGANISM', $meta_container->get_scientific_name());
657  $self->write($FH, $GENBANK_SUBHEADER, '', join('; ', reverse @cls) . ".");
658 
659  #refereneces
660 
661  #comments
662  my $annotation_source = $self->annotation_source($meta_container);
663  foreach my $comment (@COMMENTS) {
664  $comment =~ s/\#\#\#SOURCE\#\#\#/$annotation_source/;
665  $self->write($FH, $GENBANK_HEADER, 'COMMENT', $comment);
666  }
667 
668  ####################
669  # DUMP FEATURE TABLE
670  ####################
671  $self->print( $FH, "FEATURES Location/Qualifiers\n" );
672  $self->_dump_feature_table($slice, $FH, $GENBANK_FT);
673 
674  ####################
675  # DUMP SEQUENCE
676  ####################
677 
678  # record position before writing sequence header, so that
679  # after printing the sequence and having the base counts
680  # we can seek to this position and write the proper sequence
681  # header
682  my $sq_offset = tell($FH);
683  $sq_offset == -1 and throw "Unable to get offset for output fh";
684 
685  # print a sequence header template, to be replaced with a real
686  # one containing the base counts
687  $self->print($FH, "BASE COUNT ########## a ########## c ########## g ########## t ########## n\nORIGIN\n");
688 
689  # dump the sequence and get the base counts
690  my $acgt = $self->write_genbank_seq($slice, $FH);
691 
692  # print the end of genbank entry
693  $self->print( $FH, "//\n" );
694  my $end_of_entry_offset = tell($FH);
695  $end_of_entry_offset == -1 and throw "Unable to get offset for output fh";
696 
697  # seek backwards to the position of the sequence header and
698  # write it with the actual base counts
699  seek($FH, $sq_offset, SEEK_SET)
700  or throw "Cannot seek backward to sequence header position";
701  $self->print($FH, sprintf "BASE COUNT %10d a %10d c %10d g %10d t %10d n",
702  $acgt->{a}, $acgt->{c}, $acgt->{g}, $acgt->{t}, $acgt->{n});
703 
704  seek($FH, $end_of_entry_offset, SEEK_SET)
705  or throw "Cannot seek forward to end of entry";
706 
707  # Set formatting back to normal
708  $: = " \n-";
709 }
710 
711 
712 
713 =head2 _dump_feature_table
714 
715  Arg [1] : Bio::EnsEMBL::Slice slice
716  Example : none
717  Description: Helper method used to dump feature tables used in EMBL, FASTA,
718  GENBANK. Assumes formating of file handle has been setup
719  already to use $FEAT and $VALUE values.
720  Returntype : none
721  Exceptions : none
722  Caller : internal
723 
724 =cut
725 
726 sub _dump_feature_table {
727  my $self = shift;
728  my $slice = shift;
729  my $FH = shift;
730  my $FORMAT = shift;
731 
732  #use only the core database to dump features (except for bloody snps)
733  my $lite = $slice->adaptor->db->remove_db_adaptor('lite');
734 
735  my $meta = $slice->adaptor->db->get_MetaContainer;
736 
737  #lump file handle and format string together for simpler method calls
738  my @ff = ($FH, $FORMAT);
739  my $value;
740 
741  #source
742  my $classification = join(', ', @{$meta->get_classification()});
743  $self->write(@ff,'source', "1.." . $slice->length());
744  $self->write(@ff,'' , '/organism="'.$meta->get_scientific_name(). '"');
745  $self->write(@ff,'' , '/db_xref="taxon:'.$meta->get_taxonomy_id().'"');
746 
747  #
748  # Transcripts & Genes
749  #
750  my @gene_slices;
751  if($self->is_enabled('gene')) {
752  push @gene_slices, $slice;
753  }
754 
755  # Retrieve slices of other database where we need to pull genes from
756 
757  my $gene_dbs = {'vegagene' => 'vega',
758  'estgene' => 'estgene'};
759 
760  foreach my $gene_type (keys %$gene_dbs) {
761  if($self->is_enabled($gene_type)) {
762  my $db = $self->get_database($gene_dbs->{$gene_type});
763  if($db) {
764  my $sa = $db->get_SliceAdaptor();
765  push @gene_slices, $sa->fetch_by_name($slice->name());
766  } else {
767  warning("A [". $gene_dbs->{$gene_type} ."] database must be " .
768  "attached to this SeqDumper\n(via a call to " .
769  "attach_database) to retrieve genes of type [$gene_type]");
770  }
771  }
772  }
773 
774  foreach my $gene_slice (@gene_slices) {
775  my @genes = @{$gene_slice->get_all_Genes(undef,undef, 1)};
776  while(my $gene = shift @genes) {
777  $value = $self->features2location( [$gene] );
778  $self->write( @ff, 'gene', $value );
779  $self->write( @ff, "", '/gene='.$gene->stable_id_version() );
780 
781 
782  if(defined($gene->display_xref)){
783  $self->write( @ff, "",'/locus_tag="'.$gene->display_xref->display_id.'"');
784  }
785  my $desc = $gene->description;
786  if(defined($desc) and $desc ne ""){
787  $desc =~ s/\"//;
788  $self->write( @ff, "", '/note="'.$gene->description.'"');
789  }
790 
791 
792 
793  foreach my $transcript (@{$gene->get_all_Transcripts}) {
794  my $translation = $transcript->translation;
795 
796  # normal transcripts get dumped differently than pseudogenes
797  if($translation) {
798  #normal transcript
799  $value = $self->features2location($transcript->get_all_Exons);
800  $self->write(@ff, 'mRNA', $value);
801  $self->write(@ff,'' , '/gene="'.$gene->stable_id_version().'"');
802  $self->write(@ff,''
803  ,'/standard_name="'.$transcript->stable_id_version().'"');
804 
805  # ...and a CDS section
806  $value =
807  $self->features2location($transcript->get_all_translateable_Exons);
808  $self->write(@ff,'CDS', $value);
809  my $codon_start = $self->transcript_to_codon_start($transcript);
810  $self->write(@ff,'' , qq{/codon_start="${codon_start}"}) if $codon_start > 1;
811 
812  my $codon_table_id = $self->_get_codon_table_id($slice);
813  if($codon_table_id > 1){
814  $self->write(@ff,'' , '/transl_table='.$codon_table_id);
815  }
816  $self->write(@ff,'' , '/gene="'.$gene->stable_id_version().'"');
817  $self->write(@ff,'' , '/protein_id="'.$translation->stable_id_version().'"');
818  $self->write(@ff,'' ,'/note="transcript_id='.$transcript->stable_id_version().'"');
819 
820  foreach my $dbl (@{$transcript->get_all_DBLinks}) {
821  my $db_xref = '/db_xref="'.$dbl->dbname().':'.$dbl->primary_id().'"';
822  my $go_db_xref = '/db_xref="'.$dbl->primary_id().'"';
823  $value = ($dbl->dbname()=~/GO/) ? $go_db_xref : $db_xref;
824  $self->write(@ff, '', $value);
825  }
826 
827  my $pep = $transcript->translate();
828  if ($pep) {
829  $value = '/translation="'.$pep->seq().'"';
830  $self->write(@ff, '', $value);
831  }
832  } else {
833  #pseudogene
834  $value = $self->features2location($transcript->get_all_Exons);
835  $self->write(@ff, 'misc_RNA', $value);
836  $self->write(@ff,'' , '/gene="'.$gene->stable_id_version().'"');
837  foreach my $dbl (@{$transcript->get_all_DBLinks}) {
838  $value = '/db_xref="'.$dbl->dbname().':'.$dbl->primary_id().'"';
839  $self->write(@ff, '', $value);
840  }
841  $self->write(@ff,'' , '/note="'.$transcript->biotype().'"');
842  $self->write(@ff,''
843  ,'/standard_name="'.$transcript->stable_id_version().'"');
844  }
845  }
846  }
847 
848  # exons
849  foreach my $gene (@{$gene_slice->get_all_Genes(undef,undef,1)}) {
850  foreach my $exon (@{$gene->get_all_Exons}) {
851  $self->write(@ff,'exon', $self->features2location([$exon]));
852  $self->write(@ff,'' , '/note="exon_id='.$exon->stable_id_version().'"');
853  }
854  }
855  }
856 
857  #
858  # genscans
859  #
860  if($self->is_enabled('genscan')) {
861  my @genscan_exons;
862  my @transcripts = @{$slice->get_all_PredictionTranscripts(undef,1)};
863  while(my $transcript = shift @transcripts) {
864  my $exons = $transcript->get_all_Exons();
865  push @genscan_exons, @$exons;
866  $self->write(@ff, 'mRNA', $self->features2location($exons));
867  $self->write(@ff, '', '/product="'.$transcript->translate()->seq().'"');
868  $self->write(@ff, '', '/note="identifier='.$transcript->stable_id_version.'"');
869  $self->write(@ff, '', '/note="Derived by automated computational' .
870  ' analysis using gene prediction method:' .
871  $transcript->analysis->logic_name . '"');
872  }
873  }
874 
875  #
876  # snps
877  #
878  if($self->is_enabled('variation') && $slice->can('get_all_VariationFeatures')) {
879 # $slice->adaptor->db->add_db_adaptor('lite', $lite) if $lite;
880 
881  my @variations = @{$slice->get_all_VariationFeatures()};
882  while(my $snp = shift @variations) {
883  my $ss = $snp->start;
884  my $se = $snp->end;
885  #skip snps that hang off edge of slice
886  next if($ss < 1 || $se > $slice->length);
887 
888  $self->write(@ff, 'variation', "$ss..$se");
889  $self->write(@ff, '' , '/replace="'.$snp->allele_string.'"');
890  #$self->write(@ff, '' , '/evidence="'.$snp->status.'"');
891  my $rs_id = $snp->variation_name();
892  my $db = $snp->source_name();
893 # foreach my $link ($snp->each_DBLink) {
894 # my $id = $link->primary_id;
895 # my $db = $link->database;
896  $self->write(@ff, '', "/db_xref=\"$db:$rs_id\"");
897 # }
898  }
899 
900 # $slice->adaptor->db->remove_db_adaptor('lite') if $lite;
901  }
902 
903  #
904  # similarity features
905  #
906  if($self->is_enabled('similarity')) {
907  foreach my $sim (@{$slice->get_all_SimilarityFeatures}) {
908  $self->write(@ff, 'misc_feature', $self->features2location([$sim]));
909  $self->write(@ff, '' , '/note="match: '.$sim->hseqname.
910  ' : '.$sim->hstart.'..'.$sim->hend.'('.$sim->hstrand.')"');
911  }
912  }
913 
914  #
915  # repeats
916  #
917  if($self->is_enabled('repeat')) {
918  my @rfs = @{$slice->get_all_RepeatFeatures()};
919 
920  while(my $repeat = shift @rfs) {
921  $self->write(@ff, 'repeat_region', $self->features2location([$repeat]));
922  $self->write(@ff, '' , '/note="' . $repeat->repeat_consensus->name.
923  ' repeat: matches ' . $repeat->hstart.'..'.$repeat->hend .
924  '('.$repeat->hstrand.') of consensus"');
925  }
926 
927  }
928 
929  #
930  # markers
931  #
932  if($self->is_enabled('marker') && $slice->can('get_all_MarkerFeatures')) {
933  my @markers = @{$slice->get_all_MarkerFeatures()};
934  while(my $mf = shift @markers) {
935  $self->write(@ff, 'STS', $self->features2location([$mf]));
936  if($mf->marker->display_MarkerSynonym) {
937  $self->write(@ff, '' , '/standard_name="' .
938  $mf->marker->display_MarkerSynonym->name . '"');
939  }
940 
941 
942  #grep out synonyms without a source
943  my @synonyms = @{$mf->marker->get_all_MarkerSynonyms};
944  @synonyms = grep {$_->source } @synonyms;
945  foreach my $synonym (@synonyms) {
946  $self->write(@ff, '', '/db_xref="'.$synonym->source.
947  ':'.$synonym->name.'"');
948  }
949  $self->write(@ff, '', '/note="map_weight='.$mf->map_weight.'"');
950  }
951  }
952 
953  #
954  # contigs
955  #
956  if($self->is_enabled('contig')) {
957  foreach my $segment (@{$slice->project('seqlevel')}) {
958  my ($start, $end, $slice) = @$segment;
959  $self->write(@ff, 'misc_feature',
960  $start .'..'. $end);
961  $self->write(@ff, '', '/note="contig '.$slice->seq_region_name .
962  ' ' . $slice->start . '..' . $slice->end .
963  '(' . $slice->strand . ')"');
964  }
965  }
966 
967  $slice->adaptor->db->add_db_adaptor('lite', $lite) if $lite;
968 
969 }
970 
971 # /codon_start= is the first base to start translating from. This maps
972 # Ensembl start exon phase to this concept. Here we present the usage
973 # of phase in this concept where each row shows the sequence the
974 # spliced_seq() method will return
975 
976 # 123456789
977 # ATTATGACG
978 # Phase == 0 ...+++### codon_start=0 // start from 1st A
979 # Phase == 1 ..+++### codon_start=3 // start from 2nd A (base 3 in the given spliced sequence)
980 # Phase == 2 .+++### codon_start=2 // start from 2nd A (base 2 in the spliced sequence)
981 #
982 # In the case of the final 2 phases we will generate a X codon
983 #
984 
985 sub transcript_to_codon_start {
986  my ($self, $transcript) = @_;
987  my $start_phase = $transcript->start_Exon()->phase();
988  return ( $start_phase == 1 ) ? 3 :
989  ( $start_phase == 2 ) ? 2 :
990  ( $start_phase == 0 ) ? 1 :
991  -1;
992 }
993 
994 
995 =head2 _get_codon_table_id
996 
997  Arg [1] : Bio::EnsEMBL::Slice slice
998  Example : none
999  Description: Helper method to get codon_table seq region attribute
1000  codon_table defines the genetic code table used if other than the universal genetic code table (1)
1001  By default it is 1 and is not shown in flat files.
1002  If it is not equal to 1, then it is shown as a transl_table qualifier on the CDS feature.
1003  Returntype : int
1004  Caller : internal
1005 
1006 =cut
1007 
1008 sub _get_codon_table_id{
1009  my ($self, $slice) = @_;
1010  my $codon_table_id = 1;
1011  my $codon_table_attributes = $slice->get_all_Attributes("codon_table");
1012  if (@{$codon_table_attributes}) {
1013  $codon_table_id = $codon_table_attributes->[0]->value;
1014  }
1015  return $codon_table_id;
1016 }
1017 
1018 =head2 dump_fasta
1019 
1020  Arg [1] : Bio::EnsEMBL::Slice
1021  Arg [2] : IO::File $FH
1022  Example : $seq_dumper->dump_fasta($slice, $FH);
1023  Description: Dumps an FASTA flat file to an open file handle
1024  Returntype : none
1025  Exceptions : none
1026  Caller : dump
1027 
1028 =cut
1029 
1030 sub dump_fasta {
1031  my $self = shift;
1032  my $slice = shift;
1033  my $FH = shift;
1034 
1035  my $id = $slice->seq_region_name;
1036  my $seqtype = 'dna';
1037  my $idtype = $slice->coord_system->name;
1038  my $location = $slice->name;
1039  my $start = 1;
1040  my $end = $slice->length();
1041 
1042  my $header = ">$id $seqtype:$idtype $location\n";
1043  $self->print( $FH, $header );
1044 
1045  #set the formatting to FASTA
1046  my $FORMAT = '^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
1047 ';
1048 
1049  #chunk the sequence in 60kb chunks to use less memory
1050  my $cur = $start;
1051  while($cur <= $end) {
1052  my $to = $cur + 59_999;
1053  $to = $end if($to > $end);
1054  my $seq = $slice->subseq($cur, $to);
1055  $cur = $to + 1;
1056  $self->write($FH, $FORMAT, $seq);
1057  }
1058 }
1059 
1060 
1061 
1062 =head2 features2location
1063 
1064  Arg [1] : listref of Bio::EnsEMBL::SeqFeatures
1065  Example : $location = $self->features2location(\@features);
1066  Description: Constructs an EMBL location string from a list of features
1067  Returntype : string
1068  Exceptions : none
1069  Caller : internal
1070 
1071 =cut
1072 
1073 sub features2location {
1074  my $self = shift;
1075  my $features = shift;
1076 
1077  my @join = ();
1078 
1079  foreach my $f (@$features) {
1080  my $slice = $f->slice;
1081  my $start = $f->start();
1082  my $end = $f->end();
1083  my $strand = $f->strand();
1084 
1085  if($start >= 1 && $end <= $slice->length) {
1086  #this feature in on a slice and doesn't lie outside the boundary
1087 
1088  if($strand == 1) {
1089  push @join, "$start..$end";
1090  } else {
1091  push @join, "complement($start..$end)";
1092  }
1093  } else {
1094  my @fs = ();
1095  #this feature is outside the boundary of the dump,
1096  # yet implemented and 'seqlevel' is guaranteed to be 1step
1097  my $projection = $f->project('seqlevel');
1098  foreach my $segment (@$projection) {
1099  my $slice = $segment->[2];
1100  my $slc_start = $slice->start();
1101  my $slc_end = $slice->end();
1102  my $seq_reg = $slice->seq_region_name();
1103  if($slice->strand == 1) {
1104  push @join, "$seq_reg:$slc_start..$slc_end";
1105  } else {
1106  push @join, "complement($seq_reg:$slc_start..$slc_end)";
1107  }
1108  }
1109  }
1110  }
1111 
1112  my $out = join ',', @join;
1113 
1114  if(scalar @join > 1) {
1115  $out = "join($out)";
1116  }
1117 
1118  return $out;
1119 }
1120 
1121 =head2 annotation_source
1122 
1124  Example : $seq_dumper->annotation_source($meta_container);
1125  Description: Constructs a string with gene annotation sources
1126  Returntype : string
1127  Exceptions : none
1128  Caller : dump_embl, dump_genbank
1129 
1130 =cut
1131 
1132 sub annotation_source {
1133  my ($self, $meta_container) = @_;
1134 
1135  my @providers = ();
1136  my @provider_urls = ();
1137 
1138  foreach ( @{$meta_container->list_value_by_key('annotation.provider_name')} ) {
1139  push @providers, $_ if $_ ne '';
1140  }
1141  foreach ( @{$meta_container->list_value_by_key('annotation.provider_url')} ) {
1142  push @provider_urls, $_ if $_ ne '';
1143  }
1144  if ( ! scalar(@providers) ) {
1145  foreach ( @{$meta_container->list_value_by_key('assembly.provider_name')} ) {
1146  push @providers, $_ if $_ ne '';
1147  }
1148  foreach ( @{$meta_container->list_value_by_key('assembly.provider_url')} ) {
1149  push @provider_urls, $_ if $_ ne '';
1150  }
1151  }
1152  if ( ! scalar(@providers) ) {
1153  push @providers, 'Ensembl';
1154  }
1155 
1156  my $annotation_source = join(' and ', @providers);
1157  if (@provider_urls) {
1158  $annotation_source .= ' ' . sprintf(q{(%s)}, join(', ', @provider_urls));
1159  }
1160 
1161  return $annotation_source;
1162 }
1163 
1164 sub _date_string {
1165  my $self = shift;
1166 
1167  my ($sec, $min, $hour, $mday,$mon, $year) = localtime(time());
1168 
1169  my $month = ('JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JUL',
1170  'AUG', 'SEP', 'OCT', 'NOV', 'DEC')[$mon];
1171  $year += 1900;
1172 
1173  return "$mday-$month-$year";
1174 }
1175 
1176 sub write {
1177  my ($self, $FH, $FORMAT, @values) = @_;
1178 
1179  #while the last value still contains something
1180  while(defined($values[-1]) and $values[-1] ne '') {
1181  formline($FORMAT, @values);
1182  $self->print( $FH, $^A );
1183  $^A = '';
1184  }
1185 }
1186 
1187 sub write_genbank_seq {
1188  my $self = shift;
1189  my $FH = shift;
1190  my $seq = shift;
1191  my $base_total = shift;
1192 
1193  $base_total ||= 0;
1194 
1195  my $GENBANK_SEQ =
1196 '@>>>>>>>> ^<<<<<<<<< ^<<<<<<<<< ^<<<<<<<<< ^<<<<<<<<< ^<<<<<<<<< ^<<<<<<<<<~
1197 ';
1198 
1199  my $total = -59 + $base_total;
1200  #keep track of total and print lines of 60 bases with spaces every 10bp
1201  while($$seq) {
1202  $total += 60;
1203  formline($GENBANK_SEQ,$total, $$seq, $$seq, $$seq, $$seq, $$seq, $$seq);
1204  $self->print( $FH, $^A );
1205  $^A = '';
1206  }
1207 }
1208 
1209 sub write_genbank_seq {
1210  my $self = shift;
1211  my $slice = shift;
1212  my $FH = shift;
1213 
1214  my $width = 60;
1215 
1216  # set buffer size
1217  my $chunk_size = $self->{'chunk_factor'} * $width;
1218  $chunk_size > 0 or throw "Invalid chunk size: check chunk_factor parameter";
1219 
1220  my $start = 1;
1221  my $end = $slice->length;
1222  my $total = -59;
1223 
1224  # chunk the sequence to conserve memory, and print
1225  my $here = $start;
1226  my $GENBANK_SEQ =
1227 '@>>>>>>>> ^<<<<<<<<< ^<<<<<<<<< ^<<<<<<<<< ^<<<<<<<<< ^<<<<<<<<< ^<<<<<<<<<~
1228 ';
1229  my $acgt;
1230 
1231  while($here <= $end) {
1232  my $there = $here + $chunk_size - 1;
1233  $there = $end if($there > $end);
1234  my $sseq = $slice->subseq($here, $there);
1235 
1236  $acgt->{a} += $sseq =~ tr/Aa//;
1237  $acgt->{c} += $sseq =~ tr/Cc//;
1238  $acgt->{g} += $sseq =~ tr/Gg//;
1239  $acgt->{t} += $sseq =~ tr/Tt//;
1240 
1241  while($sseq) {
1242  $total += 60;
1243  formline($GENBANK_SEQ, $total, $sseq, $sseq, $sseq, $sseq, $sseq, $sseq);
1244  $self->print($FH, $^A);
1245  $^A = '';
1246  }
1247 
1248  $here = $there + 1;
1249  }
1250 
1251  $acgt->{n} = $end - ($acgt->{a} + $acgt->{c} + $acgt->{g} + $acgt->{t});
1252  $acgt->{tot} = $end;
1253 
1254  return $acgt;
1255 }
1256 
1257 
1258 sub write_embl_seq {
1259  my $self = shift;
1260  my $slice = shift;
1261  my $FH = shift;
1262 
1263  my $width = 60;
1264 
1265  # set buffer size
1266  my $chunk_size = $self->{'chunk_factor'} * $width;
1267  $chunk_size > 0 or throw "Invalid chunk size: check chunk_factor parameter";
1268 
1269  my $start = 1;
1270  my $end = $slice->length;
1271  my $total = $end;
1272 
1273  # chunk the sequence to conserve memory, and print
1274  my $here = $start;
1275  my $EMBL_SEQ =
1276  ' ^<<<<<<<<< ^<<<<<<<<< ^<<<<<<<<< ^<<<<<<<<< ^<<<<<<<<< ^<<<<<<<<< @>>>>>>>>>~
1277 ';
1278  my $acgt;
1279 
1280  while($here <= $end) {
1281  my $there = $here + $chunk_size - 1;
1282  $there = $end if($there > $end);
1283  my $sseq = $slice->subseq($here, $there);
1284 
1285  $acgt->{a} += $sseq =~ tr/Aa//;
1286  $acgt->{c} += $sseq =~ tr/Cc//;
1287  $acgt->{g} += $sseq =~ tr/Gg//;
1288  $acgt->{t} += $sseq =~ tr/Tt//;
1289 
1290  while($sseq) {
1291  $total -= 60;
1292  $total = 0 if $total < 0;
1293  formline($EMBL_SEQ,
1294  $sseq, $sseq, $sseq, $sseq, $sseq, $sseq,
1295  $end - $total);
1296  $self->print($FH, $^A);
1297  $^A = '';
1298  }
1299 
1300  $here = $there + 1;
1301  }
1302 
1303  $acgt->{n} = $end - ($acgt->{a} + $acgt->{c} + $acgt->{g} + $acgt->{t});
1304  $acgt->{tot} = $end;
1305 
1306  return $acgt;
1307 }
1308 
1309 sub print {
1310  my( $self, $FH, $string ) = @_;
1311  if(!print $FH $string){
1312  print STDERR "Problem writing to disk\n";
1313  print STDERR "the string is $string\n";
1314  die "Could not write to file handle";
1315  }
1316 }
1317 
1318 
1319 1;
Bio::EnsEMBL::Utils::SeqDumper::disable_feature_type
public void disable_feature_type()
Bio::EnsEMBL::Slice::get_all_Attributes
public Listref get_all_Attributes()
Bio::EnsEMBL::DBSQL::DBAdaptor
Definition: DBAdaptor.pm:40
Bio::EnsEMBL::CoordSystem::name
public String name()
Bio::EnsEMBL::Utils::SeqDumper
Definition: SeqDumper.pm:33
Bio::EnsEMBL::Slice::adaptor
public Bio::EnsEMBL::DBSQL::SliceAdaptor adaptor()
Bio::EnsEMBL::Slice
Definition: Slice.pm:50
Bio::EnsEMBL::Slice::new
public Bio::EnsEMBL::Slice new()
Bio::EnsEMBL::Slice::seq_region_name
public String seq_region_name()
Bio::EnsEMBL::DBSQL::BaseAdaptor::db
public Bio::EnsEMBL::DBSQL::DBAdaptor db()
Bio::EnsEMBL::Utils::SeqDumper::enable_feature_type
public void enable_feature_type()
Bio::EnsEMBL::DBSQL::MetaContainer
Definition: MetaContainer.pm:22
Bio::EnsEMBL::Slice::coord_system
public Bio::EnsEMBL::CoordSystem coord_system()
Bio::EnsEMBL::Utils::SeqDumper::new
public Bio::EnsEMBL::Utils::SeqDumper new()
Bio::EnsEMBL::DBSQL::DBAdaptor::remove_db_adaptor
public void remove_db_adaptor()
Bio::EnsEMBL::Slice::length
public Int length()
Bio::EnsEMBL::Utils::Exception
Definition: Exception.pm:68