ensembl-hive  2.8.1
HGNCParser.pm
Go to the documentation of this file.
1 =head1 LICENSE
2 
3 See the NOTICE file distributed with this work for additional information
4 regarding copyright ownership.
5 
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9 
10  http://www.apache.org/licenses/LICENSE-2.0
11 
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 
18 =head1 CONTACT
19 
20  Please email comments or questions to the public Ensembl
21  developers list at <http://lists.ensembl.org/mailman/listinfo/dev>.
22 
23  Questions may also be sent to the Ensembl help desk at
24  <http://www.ensembl.org/Help/Contact>.
25 
26 =head1 NAME
27 
28 XrefParser::HGNCParser
29 
30 =head1 DESCRIPTION
31 
32 A parser class to parse the HGNC source.
33 HGNC is the official naming source for Human.
34 
35 -data_uri = https://www.genenames.org/cgi-bin/download?col=gd_hgnc_id&col=gd_app_sym&col=gd_app_name&col=gd_prev_sym&col=gd_aliases&col=gd_pub_eg_id&col=gd_pub_ensembl_id&col=gd_pub_refseq_ids&col=gd_ccds_ids&col=gd_lsdb_links&status=Approved&status_opt=2&where=&order_by=gd_app_sym_sort&format=text&limit=&hgnc_dbtag=on&submit=submit
36 -file_format = TSV
37 -columns = [
38  HGNC ID
39  Approved symbol
40  Approved name
41  Previous symbols
42  Alias symbols
43  NCBI Gene ID
44  Ensembl gene ID
45  RefSeq IDs
46  CCDS IDs
47  Locus specific databases
48  ]
49 
50 A core database adaptor is required.
51 
52 =head1 SYNOPSIS
53 
54  my $parser = XrefParser::HGNCParser->new($db->dbh);
55 
56  $parser->run_script( {
57  source_id => 46,
58  species_id => 9606,
59  file => 'hgnc_data.tsv',
60  dba => $core_dba,
61  } );
62 
63 =cut
64 
65 
66 
67 package XrefParser::HGNCParser;
68 
69 use strict;
70 use warnings;
71 use Carp;
72 use Text::CSV;
73 use Encode;
74 use Text::Unidecode;
75 
76 use parent qw( XrefParser::BaseParser );
77 
78 
# Priority-description names of the HGNC sub-sources; run_script resolves a
# source id for each of them before parsing the file.
my @SOURCES = qw(
  ccds
  entrezgene_manual
  refseq_manual
  ensembl_manual
  desc_only
);
87 
88 
=head2 run_script
  Arg [1]    : hashref with the keys
                 source_id  : (required) id of the HGNC source being parsed
                 species_id : (required) species the xrefs are stored for
                 file       : (required) path of the HGNC TSV file, or a
                              'key=>value,...' parameter string understood by
                              parse_file_string (host, port, user, dbname, pass, wget)
                 dba        : (optional) core database adaptor; when absent the
                              connection details from the file string are used
                 dbi        : (optional) xref database handle, defaults to $self->dbi
                 verbose    : (optional) print per-source counts when true, defaults to 0
  Description: Runs the HGNCParser: reads the HGNC table (from disk or via the
               'wget' URL) and stores, for every row, direct xrefs (CCDS, LRG,
               Ensembl gene), dependent xrefs (GeneCards, RefSeq, EntrezGene)
               and their synonyms. Rows that match nothing are stored as
               description-only xrefs.
  Return type: 0 on success
  Exceptions : throws on all processing errors
  Caller     : ParseSource in the xref pipeline
=cut

sub run_script {
  my ($self, $ref_arg) = @_;

  my $source_id  = $ref_arg->{source_id};
  my $species_id = $ref_arg->{species_id};
  my $file       = $ref_arg->{file};
  my $db         = $ref_arg->{dba};
  my $verbose    = $ref_arg->{verbose} // 0;
  my $dbi        = $ref_arg->{dbi} // $self->dbi;

  if ( (!defined $source_id) or (!defined $species_id) or (!defined $file) ) {
    confess "Need to pass source_id, species_id, file as pairs";
  }

  # parse the file string and set default user
  my $file_params = $self->parse_file_string($file);
  $file_params->{user} //= 'ensro';

  # Prepare lookup lists
  # NOTE(review): %swissprot is never read below; the lookup is kept because
  # get_valid_codes is opaque from here - confirm it can be dropped.
  my (%swissprot) = %{$self->get_valid_codes('Uniprot/SWISSPROT', $species_id, $dbi)};
  my (%refseq)    = %{$self->get_valid_codes('refseq', $species_id, $dbi)};
  my @list = ('refseq_peptide', 'refseq_mRNA');
  my (%entrezgene) = %{$self->get_valid_xrefs_for_dependencies('EntrezGene', $dbi, @list)};

  # Prepare sources
  my $self_source_name = $self->get_source_name_for_source_id($source_id, $dbi);

  # resolve a source id for every HGNC sub-source
  foreach my $source_name (@SOURCES) {
    $self->{source_ids}->{$source_name} = $self->get_source_id_for_source_name( $self_source_name, $source_name, $dbi );
  }
  $self->{source_ids}->{'lrg'}       = $self->get_source_id_for_source_name( 'LRG_HGNC_notransfer', undef, $dbi );
  $self->{source_ids}->{'genecards'} = $self->get_source_id_for_source_name( 'GeneCards', undef, $dbi );

  # statistics counts
  my %name_count;
  my $mismatch = 0;

  # Get CCDS data from core db: prefer the adaptor, fall back to the
  # connection details given in the file string
  my $core_db;
  if (defined $db) {
    $core_db = $db->dbc();
  } elsif (defined $file_params->{host}) {
    $core_db = XrefParser::Database->new({
      host   => $file_params->{host},
      port   => $file_params->{port},
      user   => $file_params->{user},
      dbname => $file_params->{dbname},
      pass   => $file_params->{pass}
    })->dbi;
  } else {
    confess "No ensembl core database provided\n";
  }

  if (!defined $core_db) {
    confess "No ensembl core database!\n";
  }

  my $sql =(<<'CCDS');
    SELECT ta.value, t.stable_id
    FROM transcript t
    INNER JOIN transcript_attrib ta ON t.transcript_id = ta.transcript_id
    INNER JOIN attrib_type a ON ta.attrib_type_id = a.attrib_type_id
    WHERE a.code = 'ccds_transcript';
CCDS

  # CCDS accession (version stripped) => Ensembl transcript stable id
  my %ccds_to_ens;
  my $sth = $core_db->prepare($sql);
  $sth->execute() or croak( $core_db->errstr() );
  while ( my ($ccds_id, $ens_id) = $sth->fetchrow_array() ) {
    # Remove version
    $ccds_id =~ s/\.\d+//x;
    $ccds_to_ens{$ccds_id} = $ens_id;
  }
  $sth->finish;

  # in memory HGNC file
  my $mem_file;

  # use wget link to get file
  if (defined $file_params->{wget}) {
    my $ua = LWP::UserAgent->new();
    $ua->timeout(10);
    $ua->env_proxy();
    my $request = HTTP::Request->new(
      GET => $file_params->{wget}
    );
    my $response = $ua->request($request);

    if ( !$response->is_success() ) {
      confess $response->status_line;
    }

    $mem_file = $response->decoded_content;

  # else get file from disk
  } else {
    my $disk_fh = $self->get_filehandle($file);
    if ( !defined $disk_fh ) {
      confess "Can't open HGNC file '$file'\n";
    }
    $mem_file = do { local $/; <$disk_fh> };
  }

  my $input_file = Text::CSV->new({
    sep_char       => "\t",
    empty_is_undef => 1,
    binary         => 1,
    auto_diag      => 1
  }) or croak "Cannot use file $file: ".Text::CSV->error_diag ();

  # make sure it's utf8
  $mem_file = Encode::encode("UTF-8", $mem_file);
  # get rid of non-conventional " used in the Locus specific databases field
  $mem_file =~ s/"//xg;

  open my $fh, '<', \$mem_file or confess "Can't open HGNC in-memory file: $!\n";

  # The first line names the columns. getline returns undef on an empty or
  # unparsable input, so fail with a clear message instead of dereferencing it.
  my $header = $input_file->getline( $fh );
  if ( !defined $header ) {
    confess "HGNC file '$file' is empty or its header line could not be parsed\n";
  }
  $input_file->column_names( @{ $header } );

  # loop through each row
  while ( my $data = $input_file->getline_hr( $fh ) ) {

    my $acc              = $data->{'HGNC ID'};
    my $symbol           = $data->{'Approved symbol'};
    my $name             = $data->{'Approved name'};
    my $previous_symbols = $data->{'Previous symbols'};
    my $synonyms         = $data->{'Alias symbols'};

    # set once the row is attached to an Ensembl gene, RefSeq or EntrezGene
    # xref; CCDS and LRG matches deliberately do not set it
    my $seen = 0;

    # Direct CCDS to ENST mappings
    my $ccds = $data->{'CCDS IDs'};
    my @ccds_list;

    if ( defined $ccds ) {
      @ccds_list = split( /,\s/x, $ccds );
    }

    CCDS:
    foreach my $ccds (@ccds_list) {
      my $enst_id = $ccds_to_ens{$ccds};

      # CCDS id unknown to the core database
      if (!defined $enst_id) {
        next CCDS;
      }

      $self->add_to_direct_xrefs({
        stable_id  => $enst_id,
        type       => 'gene',
        acc        => $acc,
        label      => $symbol,
        desc       => $name,
        source_id  => $self->{source_ids}->{'ccds'},
        dbi        => $dbi,
        species_id => $species_id
      });

      $self->add_synonyms_for_hgnc({
        source_id  => $self->{source_ids}->{'ccds'},
        name       => $acc,
        species_id => $species_id,
        dbi        => $dbi,
        dead       => $previous_symbols,
        alias      => $synonyms
      });
      $name_count{'ccds'}++;
    }

    # Direct LRG mappings: the LRG id is extracted from the
    # 'Locus specific databases' field
    my $lrg_id = $data->{'Locus specific databases'};

    if ( defined $lrg_id && $lrg_id =~ m/(LRG_\d+)\|/x){
      $lrg_id = $1;
      $self->add_to_direct_xrefs({
        stable_id  => $lrg_id,
        type       => 'gene',
        acc        => $acc,
        label      => $symbol,
        desc       => $name,
        source_id  => $self->{source_ids}->{'lrg'},
        dbi        => $dbi,
        species_id => $species_id
      });

      $self->add_synonyms_for_hgnc({
        source_id  => $self->{source_ids}->{'lrg'},
        name       => $acc,
        species_id => $species_id,
        dbi        => $dbi,
        dead       => $previous_symbols,
        alias      => $synonyms
      });
      $name_count{'lrg'}++;
    }

    # Direct Ensembl mappings
    my $ensg_id = $data->{'Ensembl gene ID'};
    if ( defined $ensg_id ){
      $seen = 1;

      $self->add_to_direct_xrefs({
        stable_id  => $ensg_id,
        type       => 'gene',
        acc        => $acc,
        label      => $symbol,
        desc       => $name,
        dbi        => $dbi,
        source_id  => $self->{source_ids}->{'ensembl_manual'},
        species_id => $species_id
      });

      $self->add_synonyms_for_hgnc({
        source_id  => $self->{source_ids}->{'ensembl_manual'},
        name       => $acc,
        species_id => $species_id,
        dead       => $previous_symbols,
        dbi        => $dbi,
        alias      => $synonyms
      });
      $name_count{'ensembl_manual'}++;

      # GeneCards: dependent on the direct Ensembl xref just stored, using
      # the numeric part of the HGNC accession as its own accession
      my $direct_id = $self->get_xref($acc, $self->{source_ids}->{'ensembl_manual'}, $species_id, $dbi);
      my ($hgnc_id) = $acc =~ /HGNC:(\d+)/;
      $self->add_dependent_xref({
        master_xref_id => $direct_id,
        acc            => $hgnc_id,
        label          => $symbol,
        desc           => $name,
        source_id      => $self->{source_ids}->{'genecards'},
        dbi            => $dbi,
        species_id     => $species_id
      });

      $self->add_synonyms_for_hgnc({
        source_id  => $self->{source_ids}->{'genecards'},
        name       => $hgnc_id,
        species_id => $species_id,
        dbi        => $dbi,
        dead       => $previous_symbols,
        alias      => $synonyms
      });
      $name_count{'genecards'}++;
    }

    # RefSeq
    # NOTE(review): the whole 'RefSeq IDs' field is used as a single lookup
    # key - confirm the column never carries more than one accession.
    my $refseq_id = $data->{'RefSeq IDs'};
    if ($refseq_id) {
      if ( defined $refseq{$refseq_id} ){
        $seen = 1;
        foreach my $xref_id ( @{$refseq{$refseq_id}} ){
          $self->add_dependent_xref({
            master_xref_id => $xref_id,
            acc            => $acc,
            label          => $symbol,
            desc           => $name,
            source_id      => $self->{source_ids}->{'refseq_manual'},
            dbi            => $dbi,
            species_id     => $species_id
          });
          $name_count{'refseq_manual'}++;
        }

        $self->add_synonyms_for_hgnc({
          source_id  => $self->{source_ids}->{'refseq_manual'},
          name       => $acc,
          species_id => $species_id,
          dbi        => $dbi,
          dead       => $previous_symbols,
          alias      => $synonyms
        });
      }
    }

    # EntrezGene
    my $entrez_id = $data->{'NCBI Gene ID'};
    if ( defined $entrez_id ){
      if ( defined $entrezgene{$entrez_id} ){
        $seen = 1;
        $self->add_dependent_xref({
          master_xref_id => $entrezgene{$entrez_id},
          acc            => $acc,
          label          => $symbol,
          desc           => $name,
          source_id      => $self->{source_ids}->{'entrezgene_manual'},
          dbi            => $dbi,
          species_id     => $species_id
        });

        $self->add_synonyms_for_hgnc({
          source_id  => $self->{source_ids}->{'entrezgene_manual'},
          name       => $acc,
          species_id => $species_id,
          dead       => $previous_symbols,
          dbi        => $dbi,
          alias      => $synonyms
        });
        $name_count{'entrezgene_manual'}++;
      }
    }

    # Store to keep descriptions if not stored yet
    if ( !$seen ){
      $self->add_xref({
        acc        => $acc,
        label      => $symbol,
        desc       => $name,
        source_id  => $self->{source_ids}->{'desc_only'},
        species_id => $species_id,
        dbi        => $dbi,
        info_type  => "MISC"
      });

      $self->add_synonyms_for_hgnc({
        source_id  => $self->{source_ids}->{'desc_only'},
        name       => $acc,
        species_id => $species_id,
        dbi        => $dbi,
        dead       => $previous_symbols,
        alias      => $synonyms
      });
      $mismatch++;
    }

  }

  close $fh;

  if ( $verbose ){
    print "HGNC xrefs loaded:\n";
    foreach my $type (sort keys %name_count){
      print "\t$type\t$name_count{$type}\n";
    }
    print "$mismatch HGNC ids could not be associated in xrefs\n";
  }
  return 0; # successful
}
435 
436 
437 
=head2 add_synonyms_for_hgnc
  Arg [1]    : hashref : source_id, name, species_id, dead, alias, dbi
  Description: Specialized method to add synonyms from HGNC and VGNC data.
               'dead' (previous symbols) and 'alias' are ', '-separated
               lists; each entry is UTF-8 decoded, upper-cased, passed
               through unidecode and handed to add_to_syn as a synonym of
               'name'. Undefined lists are skipped.
  Return type: N/A
  Caller     : internal
=cut

sub add_synonyms_for_hgnc {
  my ($self, $ref_arg) = @_;

  my $source_id  = $ref_arg->{source_id};
  my $name       = $ref_arg->{name};
  my $species_id = $ref_arg->{species_id};
  my $dbi        = $ref_arg->{dbi};

  # Copies of the two symbol lists: previous (dead) symbols are handled
  # first, alias symbols second.
  my @symbol_lists = ( $ref_arg->{dead}, $ref_arg->{alias} );

  LIST:
  foreach my $symbol_list (@symbol_lists) {
    next LIST if !defined $symbol_list;

    # drop any double quotes before splitting the list
    $symbol_list =~ tr/"//d;

    foreach my $raw_symbol ( split( /,\s/, $symbol_list ) ) {
      my $decoded = Encode::decode("UTF-8", $raw_symbol);
      my $synonym = unidecode( uc($decoded) );
      $self->add_to_syn($name, $source_id, $synonym, $species_id, $dbi);
    }
  }

  return;
}
479 
480 
481 
=head2 parse_file_string
  Arg [1]    : string : input file string
  Description: parses the input string $file into a hash
               string $file is in the format as the example:
               script:project=>ensembl,host=>ens-staging1,dbname=>homo_sapiens_core_70_37,ofhost=>ens-staging1,...
               a leading 'word:' prefix is ignored, the hash is built from the
               comma-separated key=>value pairs. Only the first '=>' of a pair
               separates key from value, so values may themselves contain '=>'.
               A pair without '=>' yields the key with an undefined value;
               empty pairs are skipped.
  Return type: params hashref (empty when the string holds no pairs)
  Caller     : internal
=cut

sub parse_file_string {
  my ($self, $file_string) = @_;

  # always hand back a hashref, even for missing or empty input
  my $params = {};
  return $params if !defined $file_string;

  # drop the leading 'script:' style prefix
  $file_string =~ s/\A\w+://x;

  # set provided values
  PAIR:
  foreach my $pair ( split( /,/x, $file_string ) ) {
    # an empty pair (e.g. from ',,') carries no key
    next PAIR if $pair eq '';

    # LIMIT 2: keep everything after the first '=>' as the value
    my ($key, $value) = split( /=>/x, $pair, 2 );
    $params->{$key} = $value;
  }

  return $params;
}
509 
510 
511 
512 1;
transcript
public transcript()
XrefParser::BaseParser
Definition: BaseParser.pm:8
XrefParser::HGNCParser::run_script
public void run_script()
XrefParser::Database::new
public new()
XrefParser::HGNCParser
Definition: HGNCParser.pm:42
XrefParser::HGNCParser::parse_file_string
public Params parse_file_string()
XrefParser::Database
Definition: Database.pm:8