ensembl-hive  2.7.0
EntrezGeneParser.pm
Go to the documentation of this file.
1 =head1 LICENSE
2 
3 See the NOTICE file distributed with this work for additional information
4 regarding copyright ownership.
5 
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9 
10  http://www.apache.org/licenses/LICENSE-2.0
11 
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 
18 =cut
19 
20 =head1 NAME
21 
22 XrefParser::EntrezGeneParser
23 
24 =head1 DESCRIPTION
25 
26 This parser will read and create dependent xrefs from a simple
27 tab-delimited file downloaded from the EntrezGene database.
28 
29 =head1 SYNOPSIS
30 
31  my $parser = XrefParser::EntrezGeneParser->new($db->dbh);
32  $parser->run({
33  source_id => 11,
34  species_id => 9606,
35  files => [ "gene_info.gz" ],
36  });
37 
38 =cut
39 
40 package XrefParser::EntrezGeneParser;
41 
42 use strict;
43 use warnings;
44 
45 use Carp;
46 use Text::CSV;
47 
48 use parent qw( XrefParser::BaseParser );
49 
50 
51 my $EXPECTED_NUMBER_OF_COLUMNS = 16;
52 
53 
54 
=head2 run

  Arg [1]    : HashRef standard list of arguments from ParseSource.
               Required keys: source_id, species_id, files (array
               reference; only the first file is read).
               Optional keys: verbose (defaults to 0), dbi (defaults
               to $self->dbi).
  Description: Add dependent xrefs from EntrezGene to the xref database.
               For every record whose tax_id equals species_id, one
               xref is added for source_id and one for the 'WikiGene'
               source, followed by the record's synonyms (attached to
               source_id only). Each accession is processed once.
  Return type: Int; 0 upon success
  Exceptions : throws on all processing errors
  Caller     : ParseSource in the xref pipeline

=cut

sub run {
  my ( $self, $ref_arg ) = @_;

  my $source_id  = $ref_arg->{source_id};
  my $species_id = $ref_arg->{species_id};
  my $files      = $ref_arg->{files};
  my $verbose    = $ref_arg->{verbose} // 0;
  my $dbi        = $ref_arg->{dbi} // $self->dbi;

  # An empty file list is rejected here as well, rather than letting an
  # undefined file name reach get_filehandle().
  if ( ( !defined $source_id ) or
       ( !defined $species_id ) or
       ( !defined $files ) or
       ( !defined $files->[0] ) )
  {
    confess 'Need to pass source_id, species_id and files';
  }

  my $file = $files->[0];

  my $wiki_source_id =
    $self->get_source_id_for_source_name( 'WikiGene', undef, $dbi );

  my $eg_io = $self->get_filehandle($file);
  if ( !defined $eg_io ) {
    confess "Could not open $file";
  }

  my $input_file = Text::CSV->new({
    sep_char           => "\t",
    empty_is_undef     => 1,
    allow_loose_quotes => 1
  })
    or confess "Cannot use file $file: " . Text::CSV->error_diag();

  # process header
  if ( ! is_file_header_valid( $input_file->header( $eg_io ) ) ) {
    confess "Malformed or unexpected header in EntrezGene file '${file}'";
  }

  my $xref_count = 0;
  my $syn_count  = 0;
  my %seen; # record already processed xrefs

  # read data and load xrefs
 RECORD:
  while ( my $data = $input_file->getline($eg_io) ) {
    my ( $tax_id, $acc, $symbol, undef, $synonyms, undef, undef, undef, $desc )
      = @{ $data };

    # species_id corresponds to the species taxonomy id, see:
    # https://github.com/Ensembl/ensembl-xref/pull/31#issuecomment-445838474
    # Because of empty_is_undef an empty tax_id column arrives as undef;
    # such a record cannot belong to the requested species.
    if ( !defined $tax_id or $tax_id ne $species_id ) {
      next RECORD;
    }

    if ( exists $seen{$acc} ) {
      next RECORD;
    }

    # The same xref is stored twice: first against the requested source,
    # then against WikiGene.
    foreach my $xref_source_id ( $source_id, $wiki_source_id ) {
      $self->add_xref({
        acc        => $acc,
        label      => $symbol,
        desc       => $desc,
        source_id  => $xref_source_id,
        species_id => $species_id,
        dbi        => $dbi,
        info_type  => 'DEPENDENT'
      });
    }
    $xref_count += 1;

    # Synonyms are pipe-separated; a lone '-' is skipped. An empty column
    # is undef (empty_is_undef), so fall back to the empty string to
    # avoid an 'uninitialized value' warning in split.
    my @syn = split qr{ \| }msx, ( $synonyms // q{} );
    foreach my $synonym ( @syn ) {
      if ( $synonym ne q{-} ) {
        $self->add_to_syn( $acc, $source_id, $synonym, $species_id, $dbi );
        $syn_count += 1;
      }
    }

    $seen{$acc} = 1;
  } ## end while ( my $data = $input_file...)

  # getline() returns false both at end of file and on a parse error;
  # only the former is acceptable.
  $input_file->eof
    or confess "Error parsing file $file, should be EOF: " . $input_file->error_diag();
  $eg_io->close();

  if ( $verbose ) {
    print $xref_count . " EntrezGene Xrefs added with $syn_count synonyms\n";
  }

  return 0;
} ## end sub run
164 
165 
=head2 is_file_header_valid

  Arg [1..N] : flat list of column names, e.g. the list returned by
               Text::CSV::header() in list context
  Example    : if ( ! is_file_header_valid( $csv->header( $fh ) ) ) {
                 confess 'Bad header';
               }
  Description: Verifies if the header of a EntrezGene file follows expected
               syntax: the number of columns must match
               $EXPECTED_NUMBER_OF_COLUMNS and every column name must
               match the pattern expected at its position. Undefined
               column names are treated as a mismatch.
  Return type: boolean
  Exceptions : none
  Caller     : internal
  Status     : Stable

=cut

sub is_file_header_valid {
  my ( @header ) = @_;

  # Don't bother with parsing column names if their number does not
  # match to begin with
  if ( scalar @header != $EXPECTED_NUMBER_OF_COLUMNS ) {
    return 0;
  }

  # One pattern per column, in file order. The patterns are lower-case
  # and case-sensitive, so the caller is expected to supply lower-cased
  # names (NOTE(review): Text::CSV::header() lower-cases by default -
  # confirm against the Text::CSV version in use).
  my @field_patterns
    = (
       qr{ \A [#]? \s* tax_id }msx,
       qr{ geneid }msx,
       qr{ symbol }msx,
       qr{ locustag }msx,
       qr{ synonyms }msx,
       qr{ dbxrefs }msx,
       qr{ chromosome }msx,
       qr{ map_location }msx,
       qr{ description }msx,
       qr{ type_of_gene }msx,
       qr{ symbol_from_nomenclature_authority }msx,
       qr{ full_name_from_nomenclature_authority }msx,
       qr{ nomenclature_status }msx,
       qr{ other_designations }msx,
       qr{ modification_date }msx,
       qr{ feature_type }msx,
     );

  foreach my $pattern (@field_patterns) {
    my $header_field = shift @header;

    # An undefined field can never be a valid column name; reject it
    # explicitly instead of matching against undef, which would emit an
    # 'uninitialized value' warning.
    if ( !defined $header_field or $header_field !~ m{ $pattern }msx ) {
      return 0;
    }
  }

  # If we have made it this far, all should be in order
  return 1;
}
220 
221 
222 1;
XrefParser::EntrezGeneParser::run
public Int run()
XrefParser::BaseParser
Definition: BaseParser.pm:8
XrefParser::EntrezGeneParser
Definition: EntrezGeneParser.pm:21
run
public run()