ensembl-hive  2.8.1
JGI_ProteinParser.pm
Go to the documentation of this file.
1 =head1 LICENSE
2 
3 See the NOTICE file distributed with this work for additional information
4 regarding copyright ownership.
5 
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9 
10  http://www.apache.org/licenses/LICENSE-2.0
11 
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 
18 =head1 NAME
19 
20 XrefParser::JGI_ProteinParser
21 
22 =head1 DESCRIPTION
23 
24 Parser for JGI-1.0 protein files with gene description, FASTA format.
25 
26 WARNING: this is an extremely simplistic implementation of a FASTA
27 parser; for instance, it does not treat lines beginning with ';' as
28 comments. As of September 2019 it (still) works for JGI data, though.
29 
30 =head1 SYNOPSIS
31 
32  my $parser = XrefParser::JGI_ProteinParser->new($db->dbh);
33  $parser->run({
34  source_id => 70,
35  species_id => 7719,
36  files => [ "ciona.prot.fasta.gz" ],
37  });
38 
39 =cut
40 
41 package XrefParser::JGI_ProteinParser;
42 
43 # For non-destructive substitutions in regexps (/r flag)
44 require 5.014_000;
45 
46 use strict;
47 use warnings;
48 
49 use Carp;
50 
51 use parent qw( XrefParser::BaseParser );
52 
53 
54 =head2 run
55 
56  Arg [] : HashRef standard list of arguments from ParseSource
57  Example : $jgi_parser->run({ ... });
58  Description: Parse FASTA input file containing JGI-1.0 protein data,
59  extract seq xrefs and add them to the xref DB
60  Return type: Int; 0 upon success
61  Exceptions : throws on all processing errors
62  Caller : ParseSource in the xref pipeline
63 
64 =cut
65 
# Parse the first file listed in $ref_arg->{files} as JGI-1.0 protein
# FASTA and upload one peptide sequence xref per record.
#
# Recognised keys of $ref_arg: source_id, species_id and files (all
# mandatory), verbose (defaults to 0) and dbi (defaults to $self->dbi).
# Returns 0 on success; confesses when a mandatory argument is missing,
# when the file cannot be opened and when a record cannot be parsed.
sub run {
  my ( $self, $ref_arg ) = @_;

  my $source_id  = $ref_arg->{source_id};
  my $species_id = $ref_arg->{species_id};
  my $files      = $ref_arg->{files};
  my $verbose    = $ref_arg->{verbose} // 0;
  my $dbi        = $ref_arg->{dbi} // $self->dbi;

  if ( ( !defined $source_id ) or
       ( !defined $species_id ) or
       ( !defined $files ) )
  {
    confess 'Need to pass source_id, species_id and files as pairs';
  }

  # Only the first entry of the file list is processed.
  my $file = $files->[0];

  my $file_io = $self->get_filehandle($file);
  if ( !defined $file_io ) {
    confess "Could not open $file\n";
  }

  my @xrefs;

  {
    # Read one FASTA record at a time. $/ is a process-wide global, so
    # localise the change to this block: the previous separator is
    # restored before the upload below, and also if we confess halfway
    # through the file.
    local $/ = "\n>";

    RECORD:
    while ( my $input_data = $file_io->getline() ) {

      # Header line: the first record of the file may still carry its
      # leading '>', later ones will not because "\n>" is consumed as
      # the record separator.
      #
      # Sequence data: can span multiple lines. Err on the side of
      # caution and assume there CAN be records with no sequence data at
      # all (hence the *); such records would be useless for xref
      # generation but at least they shouldn't trigger parsing errors.
      # Capturing "anything but >" means the trailing separator never
      # ends up in the sequence, so the record needs no chomp.
      my ( $accession, $sequence )
        = ( $input_data =~ m{
                              \A >? \s* ci0100 ( \w+? ) \n
                              ( [^>]* )
                          }msx );

      if ( !defined $accession ) {
        # Is it the file header? If so, just skip it
        if ( $input_data =~ m{ \A File: }msx ) {
          next RECORD;
        }
        # Otherwise, alert the user of parsing problems
        else {
          confess "Can't parse FASTA entry: $input_data";
        }
      }

      # Build an xref object (getting rid of whitespace from the
      # sequence in the process) and store it
      push @xrefs,
        { ACCESSION     => $accession,
          SEQUENCE      => ( $sequence =~ s{ \s }{}grmsx ),
          SOURCE_ID     => $source_id,
          SPECIES_ID    => $species_id,
          SEQUENCE_TYPE => 'peptide',
        };

    } ## end while ( my $input_data = $file_io...)
  }

  $file_io->close();

  $self->upload_xref_object_graphs( \@xrefs, $dbi );

  if ( $verbose ) {
    print scalar(@xrefs) . " JGI_ xrefs succesfully parsed\n";
  }

  return 0;
} ## end sub run
148 
149 
150 1;
XrefParser::JGI_ProteinParser::run
public Int run()
XrefParser::BaseParser
Definition: BaseParser.pm:8
run
public run()
XrefParser::JGI_ProteinParser
Definition: JGI_ProteinParser.pm:23