# Parse a FASTA file of peptide records and upload one xref per record.
#
# Arguments (passed as a single hash reference):
#   source_id  - required; stored as SOURCE_ID on every xref
#   species_id - required; stored as SPECIES_ID on every xref
#   files      - required; array reference, only the first element is read
#   verbose    - optional; when true, print the number of parsed xrefs
#   dbi        - optional; handed through to upload_xref_object_graphs()
#
# Confesses if a required argument is missing, if the file cannot be
# opened, or if a record is neither a "ci0100<accession>" entry nor the
# "File:" header. Returns 0 once all xrefs have been handed to
# upload_xref_object_graphs().
my ( $self, $ref_arg ) = @_;

my $source_id  = $ref_arg->{source_id};
my $species_id = $ref_arg->{species_id};
my $files      = $ref_arg->{files};
my $verbose    = $ref_arg->{verbose};
my $dbi        = $ref_arg->{dbi};

if (    ( !defined $source_id )
     or ( !defined $species_id )
     or ( !defined $files ) )
{
    confess 'Need to pass source_id, species_id and files as pairs';
}

# Only the first file of the list is processed.
my $file = $files->[0];

my $file_io = $self->get_filehandle($file);
if ( !defined $file_io ) {
    confess "Could not open $file\n";
}

my @xrefs;

{
    # Read one FASTA record at a time. The separator is a process-wide
    # setting, so it is localised to this block and restored as soon as
    # reading is over (including when confess unwinds out of the loop).
    local $/ = "\n>";

    RECORD:
    # Test for definedness explicitly: a method call in a while
    # condition does not get the implicit defined() that <FH> does.
    while ( defined( my $input_data = $file_io->getline() ) ) {

        my ( $accession, $sequence )
          = ( $input_data =~ m{
                # Header line. The first record will
                # have a > but since we use "\n>" as
                # record separator, further ones will not
                # contain it.
                \A >? \s* ci0100 ( \w+? ) \n
                # Sequence data. Can span multiple
                # lines. Err on the side of caution and
                # assume there CAN be records with no
                # sequence data at all (hence the *), such
                # records would be useless for xref
                # generation but at least they shouldn't
                # trigger parsing errors. By specifying
                # "not >" as our character class we avoid
                # having to chomp the input record.
                ( [^>]* )
            }msx );

        if ( !defined $accession ) {
            # Is it the file header? If so, just skip it
            next RECORD if ( $input_data =~ m{ \A File: }msx );

            # Otherwise, alert the user of parsing problems
            confess "Can't parse FASTA entry: $input_data";
        }

        # Build an xref object (getting rid of whitespace from the
        # sequence in the process) and store it
        push @xrefs,
          { ACCESSION     => $accession,
            SEQUENCE      => ( $sequence =~ s{ \s }{}grmsx ),
            SOURCE_ID     => $source_id,
            SPECIES_ID    => $species_id,
            SEQUENCE_TYPE => 'peptide',
          };

    } ## end while ( defined( my $input_data...))
}

$file_io->close();

$self->upload_xref_object_graphs( \@xrefs, $dbi );

if ( $verbose ) {
    print scalar(@xrefs) . " JGI_ xrefs succesfully parsed\n";
}

return 0;