See the NOTICE file distributed with this work for additional information
regarding copyright ownership.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
This parser will read and create dependent xrefs from a simple
comma-delimited file downloaded from the EntrezGene database.
35 files => [
"gene_info.gz" ],
40 package XrefParser::EntrezGeneParser;
51 my $EXPECTED_NUMBER_OF_COLUMNS = 16;
  Arg [1]    : HashRef standard list of arguments from ParseSource
  Description: Add dependent xrefs from EntrezGene to the xref database
  Return type: Int; 0 upon success
  Exceptions : throws on all processing errors
  Caller     : ParseSource in the xref pipeline
67 my ( $self, $ref_arg ) = @_;
68 my $source_id = $ref_arg->{source_id};
69 my $species_id = $ref_arg->{species_id};
70 my $species_name = $ref_arg->{species};
71 my $files = $ref_arg->{files};
72 my $verbose = $ref_arg->{verbose}
73 my $dbi = $ref_arg->{dbi}
75 if ( ( !defined $source_id ) or
76 ( !defined $species_id ) or
79 confess
'Need to pass source_id, species_id and files';
82 my $file = @{$files}[0];
85 $self->get_source_id_for_source_name(
'WikiGene', undef, $dbi );
87 my $eg_io = $self->get_filehandle($file);
88 if ( !defined $eg_io ) {
89 confess
"Could not open $file";
92 my $input_file = Text::CSV->new({
95 allow_loose_quotes => 1
97 || confess
"Cannot use file $file: " . Text::CSV->error_diag();
100 if ( ! is_file_header_valid( $input_file->header( $eg_io ) ) ) {
101 confess
"Malformed or unexpected header in EntrezGene file '${file}'";
106 my %seen; # record already processed xrefs
108 # read data and load xrefs
110 while ( my $data = $input_file->getline($eg_io) ) {
111 my ( $tax_id, $acc, $symbol, undef, $synonyms, undef, undef, undef, $desc ) = @{ $data };
113 # species_id corresponds to the species taxonomy id, see:
114 # https://github.com/Ensembl/ensembl-xref/pull/31#issuecomment-445838474
115 if ( $tax_id ne $species_id ) {
119 if ( exists $seen{$acc} ) {
127 source_id => $source_id,
128 species_id => $species_id,
130 info_type =>
'DEPENDENT'
136 source_id => $wiki_source_id,
137 species_id => $species_id,
139 info_type =>
'DEPENDENT'
143 my @syn = split qr{ \| }msx, $synonyms;
144 foreach my $synonym ( @syn ) {
145 if ( $synonym ne q{-} ) {
146 $self->add_to_syn( $acc, $source_id, $synonym, $species_id, $dbi );
152 } ## end
while ( my $data = $input_file...)
155 confess
"Error parsing file $file, should be EOF: " . $input_file->error_diag();
159 print $xref_count .
" EntrezGene Xrefs added with $syn_count synonyms\n";
=head2 is_file_header_valid

  Arg [1..N] : list of column names provided by Text::CSV::header()
  Example    : if ( ! is_file_header_valid( $csv->header( $fh ) ) ) {
                 confess 'Bad header';
               }
  Description: Verifies if the header of an EntrezGene file follows expected
181 sub is_file_header_valid {
184 # Don't bother with parsing column names if their number does not
185 # match to begin with
186 if ( scalar @header != $EXPECTED_NUMBER_OF_COLUMNS ) {
192 qr{ \A [#]? \s* tax_id }msx,
199 qr{ map_location }msx,
200 qr{ description }msx,
201 qr{ type_of_gene }msx,
202 qr{ symbol_from_nomenclature_authority }msx,
203 qr{ full_name_from_nomenclature_authority }msx,
204 qr{ nomenclature_status }msx,
205 qr{ other_designations }msx,
206 qr{ modification_date }msx,
207 qr{ feature_type }msx,
211 foreach my $pattern (@field_patterns) {
212 $header_field = shift @header;
213 # Make sure we run the regex match in scalar context
214 return 0 unless scalar ( $header_field =~ m{ $pattern }msx );
217 # If we have made it this far, all should be in order