3 See the NOTICE file distributed with
this work
for additional information
4 regarding copyright ownership.
6 Licensed under the Apache License, Version 2.0 (the
"License");
7 you may not use
this file except in compliance with the License.
8 You may obtain a copy of the License at
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an
"AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License
for the specific language governing permissions and
16 limitations under the License.
23 Please email comments or questions to the
public Ensembl
24 developers list at <http:
26 Questions may also be sent to the Ensembl help desk at
39 # don't dump snps or repeats
41 $seq_dumper->disable_feature_type(
'variation');
43 # dump EMBL format to STDOUT
44 $seq_dumper->dump( $slice,
'EMBL' );
46 # dump GENBANK format to a file
47 $seq_dumper->dump( $slice,
'GENBANK',
'out.genbank' );
49 # dump FASTA format to a file
50 $seq_dumper->dump( $slice,
'FASTA',
'out.fasta' );
54 A relatively simple and lite-weight flat file dumper
for Ensembl slices.
55 The memory efficiency could be improved and
this is currently not very
56 good
for dumping very large sequences such as whole chromosomes.
63 package Bio::EnsEMBL::Utils::SeqDumper;
68 use Fcntl qw( SEEK_SET SEEK_END );
73 #keys must be uppercase
75 {
'FASTA' => \&dump_fasta,
76 'EMBL' => \&dump_embl,
77 'GENBANK' => \&dump_genbank };
80 (
'This sequence was annotated by ###SOURCE###. Please visit ' .
81 'the Ensembl or EnsemblGenomes web site, http://www.ensembl.org/ or http://www.ensemblgenomes.org/ for more information.',
83 'All feature locations are relative to the first (5\') base ' .
84 'of the sequence in this file. The sequence presented is '.
85 'always the forward strand of the assembly. Features ' .
86 'that lie outside of the sequence contained in this file ' .
87 'have clonal location coordinates in the format: ' .
88 '<clone accession>.<version>:<start>..<end>',
90 'The /gene indicates a unique id for a gene, /note="transcript_id=..."' .
91 ' a unique id for a transcript, /protein_id a unique id for a peptide ' .
92 'and note="exon_id=..." a unique id for an exon. These ids are ' .
93 'maintained wherever possible between versions.',
95 'All the exons and transcripts in Ensembl are confirmed by ' .
96 'similarity to either protein or cDNA sequences.');
110 my ($caller, $slice, $params) = @_;
112 my $class = ref($caller) || $caller;
114 my $feature_types = {
'gene' => 1,
124 my $self = bless {
'feature_types' => $feature_types}, $class;
126 foreach my $p (sort keys %{$params || {}}) {
127 $self->{$p} = $params->{$p};
130 # default 60kb buffer
131 exists $self->{
'chunk_factor'} and defined $self->{
'chunk_factor'}
132 or $self->{
'chunk_factor'} = 1000;
139 =head2 enable_feature_type
141 Arg [1] :
string $type
143 Description: Enables the dumping of a specific type of feature
145 Exceptions : warn
if invalid feature type is passed,
146 thrown
if no feature type is passed
151 sub enable_feature_type {
152 my ($self, $type) = @_;
154 $type ||
throw(
"type arg is required");
156 if(exists($self->{
'feature_types'}->{$type})) {
157 $self->{
'feature_types'}->{$type} = 1;
159 warning(
"unknown feature type '$type'\n" .
160 "valid types are: " . join(
',', keys %{$self->{
'feature_types'}}));
166 =head2 attach_database
168 Arg [1] :
string name
170 Example : $seq_dumper->attach_database(
'estgene', $estgene_db);
171 Description: Attaches a database to the seqdumper that can be used to
172 dump data which is external to the ensembl core database.
173 Currently
this is necessary to dump est genes and vega genes
175 Exceptions : thrown
if incorrect argument is supplied
180 sub attach_database {
181 my ($self, $name, $db) = @_;
183 $name ||
throw(
"name arg is required");
184 unless($db && ref($db) && $db->isa(
'Bio::EnsEMBL::DBSQL::DBAdaptor')) {
185 throw(
"db arg must be a Bio::EnsEMBL::DBSQL::DBAdaptor not a [$db]");
188 $self->{
'attached_dbs'}->{$name} = $db;
195 Arg [1] :
string $name
196 Example : $db = $seq_dumper->get_database(
'vega');
197 Description: Retrieves a database that has been attached to the
198 seqdumper via the attach database call.
200 Exceptions : thrown
if incorrect argument is supplied
201 Caller : dump_feature_table
206 my ($self, $name) = @_;
208 $name ||
throw(
"name arg is required");
210 return $self->{
'attached_dbs'}->{$name};
215 =head2 remove_database
217 Arg [1] :
string $name
218 Example : $db = $seq_dumper->remove_database(
'estgene');
219 Description: Removes a database that has been attached to the seqdumper
220 via the attach database call. The database that is removed
221 is returned (or undef
if it did not exist).
223 Exceptions : thrown
if incorrect argument is supplied
228 sub remove_database {
229 my ($self, $name) = @_;
231 $name ||
throw(
"name arg is required");
233 if(exists $self->{
'attached_dbs'}->{$name}) {
234 return delete $self->{
'attached_dbs'}->{$name};
241 =head2 disable_feature_type
243 Arg [1] :
string $type
244 Example : $seq_dumper->disable_feature_type(
'genes');
245 Description: Disables the dumping of a specific type of feature
247 Exceptions : warn
if an invalid feature type is passed,
248 thrown
if no feature type is passed
253 sub disable_feature_type {
254 my ($self, $type) = @_;
256 $type ||
throw(
"type arg is required");
258 if(exists($self->{
'feature_types'}->{$type})) {
259 $self->{
'feature_types'}->{$type} = 0;
261 warning(
"unknown feature type '$type'\n" .
262 "valid types are: " . join(
',', keys %{$self->{
'feature_types'}}));
270 Arg [1] :
string $type
271 Example : do_something() if($seq_dumper->is_enabled('gene'));
272 Description: checks if a specific feature type is enabled
274 Exceptions : warning if invalid type is passed,
275 thrown if no type is passed
281 my ($self, $type) = @_;
283 $type ||
throw(
"type arg is required");
285 if(exists($self->{
'feature_types'}->{$type})) {
286 return $self->{
'feature_types'}->{$type};
288 warning(
"unknown feature type '$type'\n" .
289 "valid types are: " . join(
',', keys %{$self->{
'feature_types'}}));
298 Arg [2] :
string $format
299 The name of the format to dump
300 Arg [3] : (optional) $outfile
301 The name of the file to dump to. If no file is specified STDOUT
303 Arg [4] : (optional) $seq
305 Arg [4] : (optional) $no_append
306 Default action is to open the file in append mode. This will
308 Example : $seq_dumper->dump($slice,
'EMBL');
309 Description: Dumps a region of a genome specified by the slice argument into
310 an outfile of the format $format
312 Exceptions : thrown
if slice or format args are not supplied
319 my ($self, $slice, $format, $outfile, $seq, $no_append) = @_;
321 $format ||
throw(
"format arg is required");
322 $slice ||
throw(
"slice arg is required");
324 my $dump_handler = $DUMP_HANDLERS->{uc($format)};
326 unless($dump_handler) {
327 throw(
"No dump handler is defined for format $format\n");
331 my $FH = IO::File->
new;;
333 my $mode = ($no_append) ?
'>' :
'>>';
334 $FH->open(
"${mode}${outfile}") or
throw(
"Could not open file $outfile: $!");
337 #mod_perl did not like the following
338 #$FH->fdopen(fileno(STDOUT), "w")
339 # or throw("Could not open currently selected output filehandle " .
343 &$dump_handler($self, $slice, $FH, $seq);
345 $FH->close
if ($outfile); #close
if we were writing to a file
351 Arg [2] : IO::File $FH
352 Arg [3] : optional sequence
string
353 Example : $seq_dumper->dump_embl($slice, $FH);
354 Description: Dumps an EMBL flat file to an open file handle
356 Exceptions :
if calls to get/set the filehandle position fail
371 my $cs = $slice->coord_system();
372 my $name_str = $cs->name() .
' ' . $slice->seq_region_name();
373 $name_str .=
' ' . $cs->version
if($cs->version);
375 my $start = $slice->start;
376 my $end = $slice->end;
378 #determine if this slice is the entire seq region
379 #if it is then we just use the name as the id
380 my $slice_adaptor = $slice->adaptor();
382 $slice->adaptor->fetch_by_region($cs->name,
383 $slice->seq_region_name,
387 my $entry_name = $slice->seq_region_name();
389 if($full_slice->name eq $slice->name) {
390 $name_str .=
' full sequence';
391 $acc = $slice->seq_region_name();
392 my @acc_ver = split(/\./, $acc);
395 $version = $acc_ver[0] .
'.'. $acc_ver[1];
396 } elsif(@acc_ver == 1 && $cs->version()) {
397 $version = $acc .
'.'. $cs->version();
402 $name_str .=
' partial sequence';
403 $acc = $slice->name();
407 $acc = $slice->name();
409 #line breaks are allowed near the end of the line on ' ', "\t", "\n", ','
416 # move at the end of file in case the file
417 # is open more than once (i.e. human chromosome Y,
418 # two chromosome slices
422 # When the file is open not for the first time,
423 # it must be in read/write mode
425 seek($FH, 0, SEEK_END);
428 '@< ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<~
432 # HTG = High Throughput Genome division, probably most suitable
433 # and it would be hard to come up with another appropriate division
434 # that worked for all organisms (e.g. plants are in PLN but human is
436 my $VALUE =
"$entry_name standard; DNA; HTG; $len BP.";
437 $self->write($FH, $EMBL_HEADER,
'ID', $VALUE);
438 $self->print( $FH,
"XX\n" );
441 $self->write($FH, $EMBL_HEADER,
'AC', $acc);
442 $self->print( $FH,
"XX\n" );
445 $self->write($FH, $EMBL_HEADER,
'SV', $version);
446 $self->print( $FH,
"XX\n" );
449 $self->write($FH, $EMBL_HEADER,
'DT', $self->_date_string);
450 $self->print( $FH,
"XX\n" );
452 my $meta_container = $slice->adaptor()->db()->get_MetaContainer();
455 $self->write($FH, $EMBL_HEADER,
'DE', $meta_container->get_scientific_name() .
456 " $name_str $start..$end annotated by Ensembl");
457 $self->print( $FH,
"XX\n" );
460 $self->write($FH, $EMBL_HEADER,
'KW',
'.');
461 $self->print( $FH,
"XX\n" );
464 my $species_name = $meta_container->get_scientific_name();
465 if(my $cn = $meta_container->get_common_name()) {
466 $species_name .=
" ($cn)";
469 $self->write($FH, $EMBL_HEADER,
'OS', $species_name);
472 my @cls = @{$meta_container->get_classification()};
473 $self->write($FH, $EMBL_HEADER,
'OC', join(
'; ', reverse(@cls)) .
'.');
474 $self->print( $FH,
"XX\n" );
476 #References (we are not dumping refereneces)
477 #Database References (we are not dumping these)
480 my $annotation_source = $self->annotation_source($meta_container);
481 foreach my $comment (@COMMENTS) {
482 $comment =~ s/\#\#\#SOURCE\#\#\#/$annotation_source/;
483 $self->write($FH, $EMBL_HEADER,
'CC', $comment);
484 $self->print( $FH,
"XX\n" );
490 $self->print( $FH,
"FH Key Location/Qualifiers\n" );
493 'FT ^<<<<<<<<<<<<<<<^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<~
495 $self->_dump_feature_table($slice, $FH, $FEATURE_TABLE);
497 #write an XX after the feature tables
498 $self->print( $FH,
"XX\n" );
504 # record position before writing sequence header, so that
505 # after printing the sequence and having the base counts
506 # we can seek to this position and write the proper sequence
508 my $sq_offset = tell($FH);
509 $sq_offset == -1 and
throw "Unable to get offset for output fh";
511 # print a sequence header template, to be replaced with a real
512 # one containing the base counts
513 $self->print($FH,
"SQ Sequence ########## BP; ########## A; ########## C; ########## G; ########## T; ########## other;\n");
515 # dump the sequence and get the base counts
516 my $acgt = $self->write_embl_seq($slice, $FH);
518 # print the end of EMBL entry
519 $self->print( $FH,
"//\n" );
520 my $end_of_entry_offset = tell($FH);
521 $end_of_entry_offset == -1 and
throw "Unable to get offset for output fh";
523 # seek backwards to the position of the sequence header and
524 # write it with the actual base counts
525 seek($FH, $sq_offset, SEEK_SET)
526 or
throw "Cannot seek backward to sequence header position";
527 $self->print($FH, sprintf
"SQ Sequence %10d BP; %10d A; %10d C; %10d G; %10d T; %10d other;",
528 $acgt->{tot}, $acgt->{a}, $acgt->{c}, $acgt->{g}, $acgt->{t}, $acgt->{n});
530 # move forward to end of file to dump the next slice
531 seek($FH, $end_of_entry_offset, SEEK_SET)
532 or
throw "Cannot seek forward to end of entry";
534 # Set formatting back to normal
541 Arg [2] : IO::File $FH
542 Example : $seq_dumper->dump_genbank($slice, $FH);
543 Description: Dumps a GENBANK flat file to an open file handle
551 my ($self, $slice, $FH) = @_;
553 #line breaks are allowed near the end of the line on ' ', "\t", "\n", ','
557 '^<<<<<<<<< ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<~~
560 my $GENBANK_SUBHEADER =
561 ' ^<<<<<<< ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<~~
565 ' ^<<<<<<<<<<<<<< ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<~~
573 my $name_str = $cs->
name() .
' ' . $slice->seq_region_name();
575 $name_str .=
' ' . $cs->version
if($cs->version);
577 #determine if this slice is the entire seq region
578 #if it is then we just use the name as the id
579 my $slice_adaptor = $slice->adaptor();
581 $slice->adaptor->fetch_by_region($cs->name,
582 $slice->seq_region_name,
587 my $entry_name = $slice->seq_region_name();
589 if($full_slice->name eq $slice->name) {
590 $name_str .=
' full sequence';
591 $acc = $slice->seq_region_name();
592 my @acc_ver = split(/\./, $acc);
595 $version = $acc_ver[0] . $acc_ver[1];
596 } elsif(@acc_ver == 1 && $cs->version()) {
597 $version = $acc . $cs->version();
602 $name_str .=
' partial sequence';
603 $acc = $slice->name();
607 $acc = $slice->name(); # to keep format consistent
for all
609 my $length = $slice->length;
610 my $start = $slice->start();
611 my $end = $slice->end();
613 my $date = $self->_date_string;
615 my $meta_container = $slice->adaptor()->db()->get_MetaContainer();
617 # move at the end of file in case the file
618 # is open more than once (i.e. human chromosome Y,
619 # two chromosome slices
623 # When the file is open not for the first time,
624 # it must be in read/write mode
626 seek($FH, 0, SEEK_END);
630 my $value =
"$entry_name $length bp DNA HTG $date";
631 $self->write($FH, $GENBANK_HEADER, $tag, $value);
635 $value = $meta_container->get_scientific_name() .
636 " $name_str $start..$end reannotated via EnsEMBL";
637 $self->write($FH, $GENBANK_HEADER, $tag, $value);
640 $self->write($FH, $GENBANK_HEADER,
'ACCESSION', $acc);
643 $self->write($FH, $GENBANK_HEADER,
'VERSION', $version);
646 $self->write($FH, $GENBANK_HEADER,
'KEYWORDS',
'.');
649 my $common_name = $meta_container->get_common_name();
650 $common_name = $meta_container->get_scientific_name() unless $common_name;
651 $self->write($FH, $GENBANK_HEADER,
'SOURCE', $common_name);
654 my @cls = @{$meta_container->get_classification()};
656 $self->write($FH, $GENBANK_SUBHEADER,
'ORGANISM', $meta_container->get_scientific_name());
657 $self->write($FH, $GENBANK_SUBHEADER,
'', join(
'; ', reverse @cls) .
".");
662 my $annotation_source = $self->annotation_source($meta_container);
663 foreach my $comment (@COMMENTS) {
664 $comment =~ s/\#\#\#SOURCE\#\#\#/$annotation_source/;
665 $self->write($FH, $GENBANK_HEADER,
'COMMENT', $comment);
671 $self->print( $FH,
"FEATURES Location/Qualifiers\n" );
672 $self->_dump_feature_table($slice, $FH, $GENBANK_FT);
678 # record position before writing sequence header, so that
679 # after printing the sequence and having the base counts
680 # we can seek to this position and write the proper sequence
682 my $sq_offset = tell($FH);
683 $sq_offset == -1 and
throw "Unable to get offset for output fh";
685 # print a sequence header template, to be replaced with a real
686 # one containing the base counts
687 $self->print($FH,
"BASE COUNT ########## a ########## c ########## g ########## t ########## n\nORIGIN\n");
689 # dump the sequence and get the base counts
690 my $acgt = $self->write_genbank_seq($slice, $FH);
692 # print the end of genbank entry
693 $self->print( $FH,
"//\n" );
694 my $end_of_entry_offset = tell($FH);
695 $end_of_entry_offset == -1 and
throw "Unable to get offset for output fh";
697 # seek backwards to the position of the sequence header and
698 # write it with the actual base counts
699 seek($FH, $sq_offset, SEEK_SET)
700 or
throw "Cannot seek backward to sequence header position";
701 $self->print($FH, sprintf
"BASE COUNT %10d a %10d c %10d g %10d t %10d n",
702 $acgt->{a}, $acgt->{c}, $acgt->{g}, $acgt->{t}, $acgt->{n});
704 seek($FH, $end_of_entry_offset, SEEK_SET)
705 or
throw "Cannot seek forward to end of entry";
707 # Set formatting back to normal
713 =head2 _dump_feature_table
717 Description: Helper method used to dump feature tables used in EMBL, FASTA,
718 GENBANK. Assumes formating of file handle has been setup
719 already to use $FEAT and $VALUE values.
726 sub _dump_feature_table {
732 #use only the core database to dump features (except for bloody snps)
735 my $meta = $slice->adaptor->db->get_MetaContainer;
737 #lump file handle and format string together for simpler method calls
738 my @ff = ($FH, $FORMAT);
742 my $classification = join(
', ', @{$meta->get_classification()});
743 $self->write(@ff,
'source',
"1.." . $slice->length());
744 $self->write(@ff,
'' ,
'/organism="'.$meta->get_scientific_name().
'"');
745 $self->write(@ff,
'' ,
'/db_xref="taxon:'.$meta->get_taxonomy_id().
'"');
748 # Transcripts & Genes
751 if($self->is_enabled(
'gene')) {
752 push @gene_slices, $slice;
755 # Retrieve slices of other database where we need to pull genes from
757 my $gene_dbs = {
'vegagene' =>
'vega',
758 'estgene' =>
'estgene'};
760 foreach my $gene_type (keys %$gene_dbs) {
761 if($self->is_enabled($gene_type)) {
762 my $db = $self->get_database($gene_dbs->{$gene_type});
764 my $sa = $db->get_SliceAdaptor();
765 push @gene_slices, $sa->fetch_by_name($slice->name());
767 warning(
"A [". $gene_dbs->{$gene_type} .
"] database must be " .
768 "attached to this SeqDumper\n(via a call to " .
769 "attach_database) to retrieve genes of type [$gene_type]");
774 foreach my $gene_slice (@gene_slices) {
775 my @genes = @{$gene_slice->get_all_Genes(undef,undef, 1)};
776 while(my $gene = shift @genes) {
777 $value = $self->features2location( [$gene] );
778 $self->write( @ff,
'gene', $value );
779 $self->write( @ff,
"",
'/gene='.$gene->stable_id_version() );
782 if(defined($gene->display_xref)){
783 $self->write( @ff,
"",
'/locus_tag="'.$gene->display_xref->display_id.
'"');
785 my $desc = $gene->description;
786 if(defined($desc) and $desc ne
""){
788 $self->write( @ff, "", '/note="'.$gene->description.'"');
793 foreach my $transcript (@{$gene->get_all_Transcripts}) {
794 my $translation = $transcript->translation;
796 # normal transcripts get dumped differently than pseudogenes
799 $value = $self->features2location($transcript->get_all_Exons);
800 $self->write(@ff, 'mRNA', $value);
801 $self->write(@ff,'' , '/gene="'.$gene->stable_id_version().'"');
803 ,'/standard_name="'.$transcript->stable_id_version().'"');
805 # ...and a CDS section
807 $self->features2location($transcript->get_all_translateable_Exons);
808 $self->write(@ff,'CDS', $value);
809 my $codon_start = $self->transcript_to_codon_start($transcript);
810 $self->write(@ff,'' , qq{/codon_start="${codon_start}
"}) if $codon_start > 1;
812 my $codon_table_id = $self->_get_codon_table_id($slice);
813 if($codon_table_id > 1){
814 $self->write(@ff,'' , '/transl_table='.$codon_table_id);
816 $self->write(@ff,'' , '/gene="'.$gene->stable_id_version().'"');
817 $self->write(@ff,'' , '/protein_id="'.$translation->stable_id_version().'"');
818 $self->write(@ff,'' ,'/note="transcript_id=
'.$transcript->stable_id_version().'"');
820 foreach my $dbl (@{$transcript->get_all_DBLinks}) {
821 my $db_xref = '/db_xref="'.$dbl->dbname().':
'.$dbl->primary_id().'"';
822 my $go_db_xref = '/db_xref="'.$dbl->primary_id().'"';
823 $value = ($dbl->dbname()=~/GO/) ? $go_db_xref : $db_xref;
824 $self->write(@ff, '', $value);
827 my $pep = $transcript->translate();
829 $value = '/translation="'.$pep->seq().'"';
830 $self->write(@ff, '', $value);
834 $value = $self->features2location($transcript->get_all_Exons);
835 $self->write(@ff, 'misc_RNA', $value);
836 $self->write(@ff,'' , '/gene="'.$gene->stable_id_version().'"');
837 foreach my $dbl (@{$transcript->get_all_DBLinks}) {
838 $value = '/db_xref="'.$dbl->dbname().':
'.$dbl->primary_id().'"';
839 $self->write(@ff, '', $value);
841 $self->write(@ff,'' , '/note="'.$transcript->biotype().'"');
843 ,'/standard_name="'.$transcript->stable_id_version().'"');
849 foreach my $gene (@{$gene_slice->get_all_Genes(undef,undef,1)}) {
850 foreach my $exon (@{$gene->get_all_Exons}) {
851 $self->write(@ff,'exon', $self->features2location([$exon]));
852 $self->write(@ff,'' , '/note="exon_id=
'.$exon->stable_id_version().'"');
860 if($self->is_enabled('genscan')) {
862 my @transcripts = @{$slice->get_all_PredictionTranscripts(undef,1)};
863 while(my $transcript = shift @transcripts) {
864 my $exons = $transcript->get_all_Exons();
865 push @genscan_exons, @$exons;
866 $self->write(@ff, 'mRNA', $self->features2location($exons));
867 $self->write(@ff, '', '/product="'.$transcript->translate()->seq().'"');
868 $self->write(@ff, '', '/note="identifier=
'.$transcript->stable_id_version.'"');
869 $self->write(@ff, '', '/note="Derived by automated computational
' .
870 ' analysis
using gene prediction method:
' .
871 $transcript->analysis->logic_name . '"');
878 if($self->is_enabled('variation') && $slice->can('get_all_VariationFeatures')) {
879 # $slice->adaptor->db->add_db_adaptor('lite', $lite) if $lite;
881 my @variations = @{$slice->get_all_VariationFeatures()};
882 while(my $snp = shift @variations) {
883 my $ss = $snp->start;
885 #skip snps that hang off edge of slice
886 next if($ss < 1 || $se > $slice->length);
888 $self->write(@ff, 'variation', "$ss..$se
");
889 $self->write(@ff, '' , '/replace="'.$snp->allele_string.'"');
890 #$self->write(@ff, '' , '/evidence="'.$snp->status.'"');
891 my $rs_id = $snp->variation_name();
892 my $db = $snp->source_name();
893 # foreach my $link ($snp->each_DBLink) {
894 # my $id = $link->primary_id;
895 # my $db = $link->database;
896 $self->write(@ff, '', "/db_xref=\
"$db:$rs_id\"");
900 # $slice->adaptor->db->remove_db_adaptor('lite') if $lite;
904 # similarity features
906 if($self->is_enabled(
'similarity')) {
907 foreach my $sim (@{$slice->get_all_SimilarityFeatures}) {
908 $self->write(@ff,
'misc_feature', $self->features2location([$sim]));
909 $self->write(@ff,
'' ,
'/note="match: '.$sim->hseqname.
910 ' : '.$sim->hstart.
'..'.$sim->hend.
'('.$sim->hstrand.
')"');
917 if($self->is_enabled(
'repeat')) {
918 my @rfs = @{$slice->get_all_RepeatFeatures()};
920 while(my $repeat = shift @rfs) {
921 $self->write(@ff,
'repeat_region', $self->features2location([$repeat]));
922 $self->write(@ff,
'' ,
'/note="' . $repeat->repeat_consensus->name.
923 ' repeat: matches ' . $repeat->hstart.
'..'.$repeat->hend .
924 '('.$repeat->hstrand.
') of consensus"');
932 if($self->is_enabled(
'marker') && $slice->can(
'get_all_MarkerFeatures')) {
933 my @markers = @{$slice->get_all_MarkerFeatures()};
934 while(my $mf = shift @markers) {
935 $self->write(@ff,
'STS', $self->features2location([$mf]));
936 if($mf->marker->display_MarkerSynonym) {
937 $self->write(@ff,
'' ,
'/standard_name="' .
938 $mf->marker->display_MarkerSynonym->name .
'"');
942 #grep out synonyms without a source
943 my @synonyms = @{$mf->marker->get_all_MarkerSynonyms};
944 @synonyms = grep {$_->source } @synonyms;
945 foreach my $synonym (@synonyms) {
946 $self->write(@ff,
'',
'/db_xref="'.$synonym->source.
947 ':'.$synonym->name.
'"');
949 $self->write(@ff,
'',
'/note="map_weight='.$mf->map_weight.
'"');
956 if($self->is_enabled(
'contig')) {
957 foreach my $segment (@{$slice->project(
'seqlevel')}) {
958 my ($start, $end, $slice) = @$segment;
959 $self->write(@ff,
'misc_feature',
961 $self->write(@ff,
'',
'/note="contig '.$slice->seq_region_name .
962 ' ' . $slice->start .
'..' . $slice->end .
963 '(' . $slice->strand .
')"');
967 $slice->adaptor->db->add_db_adaptor(
'lite', $lite)
if $lite;
971 # /codon_start= is the first base to start translating from. This maps
972 # Ensembl start exon phase to this concept. Here we present the usage
973 # of phase in this concept where each row shows the sequence the
974 # spliced_seq() method will return
978 # Phase == 0 ...+++### codon_start=0 // start from 1st A
979 # Phase == 1 ..+++### codon_start=3 // start from 2nd A (base 3 in the given spliced sequence)
980 # Phase == 2 .+++### codon_start=2 // start from 2nd A (base 2 in the spliced sequence)
982 # In the case of the final 2 phases we will generate a X codon
985 sub transcript_to_codon_start {
986 my ($self, $transcript) = @_;
987 my $start_phase = $transcript->start_Exon()->phase();
988 return ( $start_phase == 1 ) ? 3 :
989 ( $start_phase == 2 ) ? 2 :
990 ( $start_phase == 0 ) ? 1 :
995 =head2 _get_codon_table_id
999 Description: Helper method to get codon_table seq region attribute
1000 codon_table defines the genetic code table used
if other than the universal genetic code table (1)
1001 By default it is 1 and is not shown in flat files.
1002 If it is not equal to 1, then it is shown as a transl_table qualifier on the CDS feature.
1008 sub _get_codon_table_id{
1009 my ($self, $slice) = @_;
1010 my $codon_table_id = 1;
1012 if (@{$codon_table_attributes}) {
1013 $codon_table_id = $codon_table_attributes->[0]->value;
1015 return $codon_table_id;
1021 Arg [2] : IO::File $FH
1022 Example : $seq_dumper->dump_fasta($slice, $FH);
1023 Description: Dumps an FASTA flat file to an open file handle
1036 my $seqtype =
'dna';
1037 my $idtype = $slice->coord_system->name;
1038 my $location = $slice->name;
1040 my $end = $slice->length();
1042 my $header =
">$id $seqtype:$idtype $location\n";
1043 $self->print( $FH, $header );
1045 #set the formatting to FASTA
1046 my $FORMAT =
'^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
1049 #chunk the sequence in 60kb chunks to use less memory
1051 while($cur <= $end) {
1052 my $to = $cur + 59_999;
1053 $to = $end
if($to > $end);
1054 my $seq = $slice->subseq($cur, $to);
1056 $self->write($FH, $FORMAT, $seq);
1062 =head2 features2location
1064 Arg [1] : listref of Bio::EnsEMBL::SeqFeatures
1065 Example : $location = $self->features2location(\@features);
1066 Description: Constructs an EMBL location
string from a list of features
1073 sub features2location {
1075 my $features = shift;
1079 foreach my $f (@$features) {
1080 my $slice = $f->slice;
1081 my $start = $f->start();
1082 my $end = $f->end();
1083 my $strand = $f->strand();
1085 if($start >= 1 && $end <= $slice->length) {
1086 #this feature in on a slice and doesn't lie outside the boundary
1089 push @join,
"$start..$end";
1091 push @join,
"complement($start..$end)";
1095 #this feature is outside the boundary of the dump,
1096 # yet implemented and 'seqlevel' is guaranteed to be 1step
1097 my $projection = $f->project(
'seqlevel');
1098 foreach my $segment (@$projection) {
1099 my $slice = $segment->[2];
1100 my $slc_start = $slice->start();
1101 my $slc_end = $slice->end();
1102 my $seq_reg = $slice->seq_region_name();
1103 if($slice->strand == 1) {
1104 push @join,
"$seq_reg:$slc_start..$slc_end";
1106 push @join,
"complement($seq_reg:$slc_start..$slc_end)";
1112 my $out = join
',', @join;
1114 if(scalar @join > 1) {
1115 $out =
"join($out)";
1121 =head2 annotation_source
1124 Example : $seq_dumper->annotation_source($meta_container);
1125 Description: Constructs a
string with gene annotation sources
1128 Caller : dump_embl, dump_genbank
1132 sub annotation_source {
1133 my ($self, $meta_container) = @_;
1136 my @provider_urls = ();
1138 foreach ( @{$meta_container->list_value_by_key(
'annotation.provider_name')} ) {
1139 push @providers, $_
if $_ ne
'';
1141 foreach ( @{$meta_container->list_value_by_key(
'annotation.provider_url')} ) {
1142 push @provider_urls, $_
if $_ ne
'';
1144 if ( ! scalar(@providers) ) {
1145 foreach ( @{$meta_container->list_value_by_key(
'assembly.provider_name')} ) {
1146 push @providers, $_
if $_ ne
'';
1148 foreach ( @{$meta_container->list_value_by_key(
'assembly.provider_url')} ) {
1149 push @provider_urls, $_
if $_ ne
'';
1152 if ( ! scalar(@providers) ) {
1153 push @providers,
'Ensembl';
1156 my $annotation_source = join(
' and ', @providers);
1157 if (@provider_urls) {
1158 $annotation_source .=
' ' . sprintf(q{(%s)}, join(
', ', @provider_urls));
1161 return $annotation_source;
1167 my ($sec, $min, $hour, $mday,$mon, $year) = localtime(time());
1169 my $month = (
'JAN',
'FEB',
'MAR',
'APR',
'MAY',
'JUN',
'JUL',
1170 'AUG',
'SEP',
'OCT',
'NOV',
'DEC')[$mon];
1173 return "$mday-$month-$year";
1177 my ($self, $FH, $FORMAT, @values) = @_;
1179 #while the last value still contains something
1180 while(defined($values[-1]) and $values[-1] ne
'') {
1181 formline($FORMAT, @values);
1182 $self->print( $FH, $^A );
1187 sub write_genbank_seq {
1191 my $base_total = shift;
1196 '@>>>>>>>> ^<<<<<<<<< ^<<<<<<<<< ^<<<<<<<<< ^<<<<<<<<< ^<<<<<<<<< ^<<<<<<<<<~
1199 my $total = -59 + $base_total;
1200 #keep track of total and print lines of 60 bases with spaces every 10bp
1203 formline($GENBANK_SEQ,$total, $$seq, $$seq, $$seq, $$seq, $$seq, $$seq);
1204 $self->print( $FH, $^A );
1209 sub write_genbank_seq {
1217 my $chunk_size = $self->{
'chunk_factor'} * $width;
1218 $chunk_size > 0 or
throw "Invalid chunk size: check chunk_factor parameter";
1221 my $end = $slice->length;
1224 # chunk the sequence to conserve memory, and print
1227 '@>>>>>>>> ^<<<<<<<<< ^<<<<<<<<< ^<<<<<<<<< ^<<<<<<<<< ^<<<<<<<<< ^<<<<<<<<<~
1231 while($here <= $end) {
1232 my $there = $here + $chunk_size - 1;
1233 $there = $end
if($there > $end);
1234 my $sseq = $slice->subseq($here, $there);
1236 $acgt->{a} += $sseq =~ tr/Aa
1237 $acgt->{c} += $sseq =~ tr/Cc
1238 $acgt->{g} += $sseq =~ tr/Gg
1239 $acgt->{t} += $sseq =~ tr/Tt
1243 formline($GENBANK_SEQ, $total, $sseq, $sseq, $sseq, $sseq, $sseq, $sseq);
1244 $self->print($FH, $^A);
1251 $acgt->{n} = $end - ($acgt->{a} + $acgt->{c} + $acgt->{g} + $acgt->{t});
1252 $acgt->{tot} = $end;
1258 sub write_embl_seq {
1266 my $chunk_size = $self->{
'chunk_factor'} * $width;
1267 $chunk_size > 0 or
throw "Invalid chunk size: check chunk_factor parameter";
1270 my $end = $slice->length;
1273 # chunk the sequence to conserve memory, and print
1276 ' ^<<<<<<<<< ^<<<<<<<<< ^<<<<<<<<< ^<<<<<<<<< ^<<<<<<<<< ^<<<<<<<<< @>>>>>>>>>~
1280 while($here <= $end) {
1281 my $there = $here + $chunk_size - 1;
1282 $there = $end
if($there > $end);
1283 my $sseq = $slice->subseq($here, $there);
1285 $acgt->{a} += $sseq =~ tr/Aa
1286 $acgt->{c} += $sseq =~ tr/Cc
1287 $acgt->{g} += $sseq =~ tr/Gg
1288 $acgt->{t} += $sseq =~ tr/Tt
1292 $total = 0
if $total < 0;
1294 $sseq, $sseq, $sseq, $sseq, $sseq, $sseq,
1296 $self->print($FH, $^A);
1303 $acgt->{n} = $end - ($acgt->{a} + $acgt->{c} + $acgt->{g} + $acgt->{t});
1304 $acgt->{tot} = $end;
1310 my( $self, $FH, $string ) = @_;
1311 if(!print $FH $string){
1312 print STDERR
"Problem writing to disk\n";
1313 print STDERR
"the string is $string\n";
1314 die
"Could not write to file handle";