3 See the NOTICE file distributed with
this work
for additional information
4 regarding copyright ownership.
6 Licensed under the Apache License, Version 2.0 (the
"License");
7 you may not use
this file except in compliance with the License.
8 You may obtain a copy of the License at
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an
"AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License
for the specific language governing permissions and
16 limitations under the License.
24 Parser
for JGI-1.0 protein files with gene description, FASTA format.
26 WARNING:
this is an extremely simplistic implementation of a FASTA
27 parser,
for instance it does not treat strings beginning with ; as
28 comments. As of September 2019 it (still) works
for JGI data, though.
36 files => [
"ciona.prot.fasta.gz" ],
41 package XrefParser::JGI_ProteinParser;
43 # For non-destructive substitutions in regexps (/r flag)
56 Arg [1] : HashRef standard list of arguments from ParseSource
57 Example : $jgi_parser->run({ ... });
58 Description: Parse FASTA input file containing JGI-1.0 protein data,
59 extract seq xrefs and add them to the xref DB
60 Return type: Int; 0 upon success
61 Exceptions : throws on all processing errors
62 Caller : ParseSource in the xref pipeline
67 my ( $self, $ref_arg ) = @_;
69 my $source_id = $ref_arg->{source_id};
70 my $species_id = $ref_arg->{species_id};
71 my $files = $ref_arg->{files};
72 my $verbose = $ref_arg->{verbose}
73 my $dbi = $ref_arg->{dbi}
75 if ( ( !defined $source_id ) or
76 ( !defined $species_id ) or
79 confess
'Need to pass source_id, species_id and files as pairs';
82 my $file = @{$files}[0];
84 my $file_io = $self->get_filehandle($file);
85 if ( !defined $file_io ) {
86 confess
"Could not open $file\n";
88 IO::Handle->input_record_separator(
"\n>");
93 while ( my $input_data = $file_io->getline() ) {
95 my ( $accession, $sequence )
97 # Header line. The first record will
98 # have a > but since we use "\n>" as
99 # record separator, further ones will not
101 \A >? \s* ci0100 ( \w+? ) \n
103 # Sequence data. Can span multiple
104 # lines. Err on the side of caution and
105 # assume there CAN be records with no
106 # sequence data at all (hence the *), such
107 # records would be useless for xref
108 # generation but at least they shouldn't
109 # trigger parsing errors. By specifying
110 # "not >" as our character class we avoid
111 # having to chomp the input record.
115 if ( !defined $accession ) {
116 # Is it the file header? If so, just skip it
117 if ( $input_data =~ m{ \A File: }msx ) {
120 # Otherwise, alert the user of parsing problems
122 confess
"Can't parse FASTA entry: $input_data";
126 # Build an xref object (getting rid of whitespace from the
127 # sequence in the process) and store it
129 { ACCESSION => $accession,
130 SEQUENCE => ( $sequence =~ s{ \s }{}grmsx ),
131 SOURCE_ID => $source_id,
132 SPECIES_ID => $species_id,
133 SEQUENCE_TYPE =>
'peptide',
136 } ## end
while ( my $input_data = $file_io...)
140 $self->upload_xref_object_graphs( \@xrefs, $dbi );
143 print scalar(@xrefs) .
" JGI_ xrefs succesfully parsed\n";