ensembl-hive  2.7.0
VGNCParser.pm
Go to the documentation of this file.
1 =head1 LICENSE
2 
3 See the NOTICE file distributed with this work for additional information
4 regarding copyright ownership.
5 
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9 
10  http://www.apache.org/licenses/LICENSE-2.0
11 
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 
18 =head1 CONTACT
19 
20  Please email comments or questions to the public Ensembl
21  developers list at <http://lists.ensembl.org/mailman/listinfo/dev>.
22 
23  Questions may also be sent to the Ensembl help desk at
24  <http://www.ensembl.org/Help/Contact>.
25 
26 =head1 NAME
27 
29 
30 =head1 DESCRIPTION
31 
32 A parser class to parse the VGNC source.
33 VGNC is the official naming source for some vertebrates species
34 
35 -data_uri = https://ftp.ebi.ac.uk/pub/databases/genenames/vgnc/tsv/vgnc_gene_set_All.txt.gz
36 -file_format = TSV
37 -columns = [
38  taxon_id
39  vgnc_id
40  symbol
41  name
42  locus_group
43  locus_type
44  status
45  location
46  location_sortable:
47  alias_symbol
48  alias_name
49  prev_symbol
50  prev_name
51  gene_family
52  gene_family_id
53  date_approved_reserved
54  date_symbol_changed
55  date_name_changed
56  date_modified
57  entrez_id
58  ensembl_gene_id
59  uniprot_ids
60  ]
61 
62 Only columns listed in @required_columns are mandatory.
63 
64 =head1 SYNOPSIS
65 
66  my $parser = XrefParser::VGNCParser->new($db->dbh);
67 
68  my $parser->run( {
69  source_id => 144,
70  species_id => 9598,
71  files => ['VGNC/vgnc_gene_set_All.txt.gz'],
72  } );
73 
74 =cut
75 
76 package XrefParser::VGNCParser;
77 
78 use strict;
79 use warnings;
80 use Carp;
81 use Text::CSV;
82 
83 use parent qw( XrefParser::HGNCParser );
84 
85 
86 =head2 run
87  Description: Runs the VGNCParser
88  Return type: none
89  Exceptions : throws on all processing errors
90  Caller : ParseSource in the xref pipeline
91 =cut
92 
93 sub run {
94  my ($self, $ref_arg) = @_;
95 
96  my $source_id = $ref_arg->{source_id};
97  my $species_id = $ref_arg->{species_id};
98  my $files = $ref_arg->{files};
99  my $verbose = $ref_arg->{verbose} // 0;
100  my $dbi = $ref_arg->{dbi} // $self->dbi;
101 
102 
103  if ( (!defined $source_id) || (!defined $species_id) || (!defined $files) ) {
104  confess "Need to pass source_id, species_id and files as pairs";
105  }
106 
107  my $file = @{$files}[0];
108 
109  my $count = 0;
110 
111  my $file_io = $self->get_filehandle($file);
112 
113  if ( !defined $file_io ) {
114  confess "Can't open VGNC file '$file'\n";
115  }
116 
117  my $source_name = $self->get_source_name_for_source_id($source_id, $dbi);
118 
119  # Create a hash of all valid taxon_ids for this species
120  my %species2tax = $self->species_id2taxonomy($dbi);
121  push @{$species2tax{$species_id}}, $species_id;
122  my @tax_ids = @{$species2tax{$species_id}};
123  my %taxonomy2species_id = map{ $_=>$species_id } @tax_ids;
124 
125  my $input_file = Text::CSV->new({
126  sep_char => "\t",
127  empty_is_undef => 1,
128  binary => 1
129  }) or confess "Cannot use file '$file': ".Text::CSV->error_diag();
130 
131  # header must contain these columns
132  my @required_columns = qw(
133  taxon_id
134  ensembl_gene_id
135  vgnc_id
136  symbol
137  name
138  alias_symbol
139  prev_symbol
140  );
141 
142  # get header columns
143  my @columns = @{ $input_file->getline( $file_io ) };
144 
145  # die if some required_column is not in columns
146  foreach my $colname (@required_columns) {
147  if ( !grep { /$colname/xms } @columns ) {
148  confess "Can't find required column '$colname' in VGNC file '$file'\n";
149  }
150  }
151 
152  $input_file->column_names( @columns );
153 
154  while ( my $data = $input_file->getline_hr( $file_io ) ) {
155 
156  # skip data for other species
157  next if ( !exists $taxonomy2species_id{$data->{'taxon_id'}} );
158 
159  if ( $data->{'ensembl_gene_id'} ) { # Ensembl direct xref
160  $self->add_to_direct_xrefs({
161  stable_id => $data->{'ensembl_gene_id'},
162  type => 'gene',
163  acc => $data->{'vgnc_id'},
164  label => $data->{'symbol'},
165  desc => $data->{'name'},
166  dbi => $dbi,
167  source_id => $source_id,
168  species_id => $species_id
169  });
170 
171  $self->add_synonyms_for_hgnc({
172  source_id => $source_id,
173  name => $data->{'vgnc_id'},
174  species_id => $species_id,
175  dbi => $dbi,
176  dead => $data->{'alias_symbol'},
177  alias => $data->{'prev_symbol'}
178  });
179 
180  $count++;
181  }
182 
183  }
184 
185  $input_file->eof or confess "Error parsing file '$file': " . $input_file->error_diag();
186  $file_io->close();
187 
188  if($verbose){
189  print "Loaded a total of $count VGNC xrefs\n";
190  }
191 
192  return 0; # successful
193 }
194 
195 1;
map
public map()
XrefParser::VGNCParser
Definition: VGNCParser.pm:51
XrefParser::VGNCParser::run
public void run()
run
public run()
XrefParser::HGNCParser
Definition: HGNCParser.pm:42