ensembl-hive  2.7.0
UniProtParser.pm
Go to the documentation of this file.
1 =head1 LICENSE
2 
3 See the NOTICE file distributed with this work for additional information
4 regarding copyright ownership.
5 
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9 
10  http://www.apache.org/licenses/LICENSE-2.0
11 
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 
18 =cut
19 
20 # Parse UniProt (SwissProt & SPTrEMBL) files to create xrefs.
21 #
22 # Files actually contain both types of xref, distinguished by ID line;
23 #
24 # ID CYC_PIG Reviewed; 104 AA. Swissprot
25 # ID Q3ASY8_CHLCH Unreviewed; 36805 AA. SPTrEMBL
26 
27 
28 
29 package XrefParser::UniProtParser;
30 
31 use strict;
32 use warnings;
33 use Carp;
34 use POSIX qw(strftime);
35 use File::Basename;
36 use Text::CSV;
37 
38 use base qw( XrefParser::BaseParser );
39 
40 my $hgnc_file;
41 
42 sub run {
43 
44  my ($self, $ref_arg) = @_;
45  my $source_id = $ref_arg->{source_id};
46  my $species_id = $ref_arg->{species_id};
47  my $species_name = $ref_arg->{species};
48  my $files = $ref_arg->{files};
49  my $release_file = $ref_arg->{rel_file};
50  my $verbose = $ref_arg->{verbose};
51  my $dbi = $ref_arg->{dbi};
52  $dbi = $self->dbi unless defined $dbi;
53 
54  $hgnc_file = $ref_arg->{hgnc_file} || undef;
55 
56  if((!defined $source_id) or (!defined $species_id) or (!defined $files)){
57  croak "Need to pass source_id, species_id, files and rel_file as pairs";
58  }
59  $verbose |=0;
60 
61  my $file = @{$files}[0];
62 
63  my ( $sp_source_id, $sptr_source_id, $sp_release, $sptr_release, $sptr_non_display_source_id, $sp_direct_source_id, $sptr_direct_source_id, $isoform_source_id );
64 
65  $sp_source_id =
66  $self->get_source_id_for_source_name('Uniprot/SWISSPROT','sequence_mapped', $dbi);
67  $sptr_source_id =
68  $self->get_source_id_for_source_name('Uniprot/SPTREMBL', 'sequence_mapped', $dbi);
69 
70  $sptr_non_display_source_id =
71  $self->get_source_id_for_source_name('Uniprot/SPTREMBL', 'protein_evidence_gt_2', $dbi);
72 
73  $sp_direct_source_id = $self->get_source_id_for_source_name('Uniprot/SWISSPROT', 'direct', $dbi);
74  $sptr_direct_source_id = $self->get_source_id_for_source_name('Uniprot/SPTREMBL', 'direct', $dbi);
75 
76  $isoform_source_id = $self->get_source_id_for_source_name('Uniprot_isoform');
77 
78  print "SwissProt source id for $file: $sp_source_id\n" if ($verbose);
79  print "SpTREMBL source id for $file: $sptr_source_id\n" if ($verbose);
80  print "SpTREMBL protein_evidence > 2 source id for $file: $sptr_non_display_source_id\n" if ($verbose);
81  print "SwissProt direct source id for $file: $sp_direct_source_id\n" if ($verbose);
82  print "SpTREMBL direct source id for $file: $sptr_direct_source_id\n" if ($verbose);
83 
84  $self->create_xrefs( $sp_source_id, $sptr_source_id, $sptr_non_display_source_id, $species_id,
85  $file, $verbose, $sp_direct_source_id, $sptr_direct_source_id, $isoform_source_id, $dbi );
86 
87  if ( defined $release_file ) {
88  # Parse Swiss-Prot and SpTrEMBL release info from
89  # $release_file.
90  my $release_io = $self->get_filehandle($release_file);
91  while ( defined( my $line = $release_io->getline() ) ) {
92  if ( $line =~ m#(UniProtKB/Swiss-Prot Release .*)# ) {
93  $sp_release = $1;
94  print "Swiss-Prot release is '$sp_release'\n" if($verbose);
95  } elsif ( $line =~ m#(UniProtKB/TrEMBL Release .*)# ) {
96  $sptr_release = $1;
97  print "SpTrEMBL release is '$sptr_release'\n" if($verbose);
98  }
99  }
100  $release_io->close();
101 
102  # Set releases
103  $self->set_release( $sp_source_id, $sp_release, $dbi );
104  $self->set_release( $sptr_source_id, $sptr_release, $dbi );
105  $self->set_release( $sptr_non_display_source_id, $sptr_release, $dbi );
106  $self->set_release( $sp_direct_source_id, $sp_release, $dbi );
107  $self->set_release( $sptr_direct_source_id,$sptr_release, $dbi );
108  }
109 
110 
111  return 0; # successfull
112 }
113 
114 
115 # --------------------------------------------------------------------------------
116 # Parse file into array of xref objects
117 
118 sub create_xrefs {
119  my ($self, $sp_source_id, $sptr_source_id, $sptr_non_display_source_id, $species_id, $file, $verbose, $sp_direct_source_id, $sptr_direct_source_id, $isoform_source_id, $dbi ) = @_;
120 
121  my $num_sp = 0;
122  my $num_sptr = 0;
123  my $num_sp_pred = 0;
124  my $num_sptr_pred = 0;
125  my $num_sptr_non_display = 0;
126  my $num_direct_sp = 0;
127  my $num_direct_sptr = 0;
128 
129  my %dependent_sources = $self->get_xref_sources($dbi);
130 
131  my (%genemap) =
132  %{ $self->get_valid_codes( "mim_gene", $species_id, $dbi ) };
133  my (%morbidmap) =
134  %{ $self->get_valid_codes( "mim_morbid", $species_id, $dbi ) };
135 
136  # Extract descriptions from hgnc
137  my %hgnc_descriptions;
138  if ($hgnc_file) {
139  %hgnc_descriptions = $self->get_hgnc_descriptions($hgnc_file);
140  }
141 
142  my $uniprot_io = $self->get_filehandle($file);
143  if ( !defined $uniprot_io ) { return }
144 
145  my @xrefs;
146 
147  local $/ = "//\n";
148 
149  # Create a hash of all valid taxon_ids for this species
150  my %species2tax = $self->species_id2taxonomy($dbi);
151  push @{$species2tax{$species_id}}, $species_id;
152  my @tax_ids = @{$species2tax{$species_id}};
153  my %taxonomy2species_id = map{ $_=>$species_id } @tax_ids;
154 
155  my %dependent_xrefs;
156  my $ensembl_derived_protein_count = 0;
157 
158  # Counter to process file in batches
159  my $count = 0;
160 
161  while ( $_ = $uniprot_io->getline() ) {
162 
163  # if an OX line exists, only store the xref if the taxonomy ID that the OX
164  # line refers to is in the species table
165  # due to some records having more than one tax_id, we need to check them
166  # all and only proceed if one of them matches.
167  #OX NCBI_TaxID=158878, 158879;
168  #OX NCBI_TaxID=103690;
169 
170  my ($ox) = $_ =~ /OX\s+[a-zA-Z_]+=([0-9 ,]+).*;/;
171  my @ox = ();
172  my $found = 0;
173 
174  if ( defined $ox ) {
175  @ox = split /\, /, $ox;
176 
177  # my %taxonomy2species_id = $self->taxonomy2species_id();
178 
179  foreach my $taxon_id_from_file (@ox) {
180  $taxon_id_from_file =~ s/\s//;
181  if ( exists $taxonomy2species_id{$taxon_id_from_file} ){
182  $found = 1;
183  $count++;
184  }
185  }
186  }
187 
188  next if (!$found); # no taxon_id's match, so skip to next record
189  my $xref;
190 
191  # set accession (and synonyms if more than one)
192  # AC line may have primary accession and possibly several ; separated synonyms
193  # May also be more than one AC line
194  my ($acc) = $_ =~ /(\nAC\s+.+)/s; # will match first AC line and everything else
195 
196  my @all_lines = split /\n/, $acc;
197 
198  # Check for CC (caution) lines containing certain text
199  # If sequence is from Ensembl, do not use
200  my $ensembl_derived_protein = 0;
201  if ($_ =~ /CAUTION: The sequence shown here is derived from an Ensembl/) {
202  $ensembl_derived_protein = 1;
203  $ensembl_derived_protein_count++;
204  }
205 
206  # extract ^AC lines only & build list of accessions
207  my @accessions;
208  foreach my $line (@all_lines) {
209  my ($accessions_only) = $line =~ /^AC\s+(.+)/;
210  push(@accessions, (split /;\s*/, $accessions_only)) if ($accessions_only);
211 
212  }
213 
214 
215  if(lc($accessions[0]) eq "unreviewed"){
216  print "WARNING: entries with accession of $acc not allowed will be skipped\n";
217  next;
218  }
219  $xref->{INFO_TYPE} = "SEQUENCE_MATCH";
220  $xref->{ACCESSION} = $accessions[0];
221  for (my $a=1; $a <= $#accessions; $a++) {
222  push(@{$xref->{"SYNONYMS"} }, $accessions[$a]);
223  }
224 
225  my ($label, $sp_type) = $_ =~ /ID\s+(\w+)\s+(\w+)/;
226  my ($protein_evidence_code) = $_ =~ /PE\s+(\d+)/;
227  # Capture line with entry version
228  # Example: DT 22-APR-2020, entry version 1.
229  my ($version) = $_ =~ /DT\s+\d+-\w+-\d+, entry version (\d+)/;
230 
231  # SwissProt/SPTrEMBL are differentiated by having STANDARD/PRELIMINARY here
232  if ($sp_type =~ /^Reviewed/i) {
233 
234  $xref->{SOURCE_ID} = $sp_source_id;
235  $num_sp++;
236  } elsif ($sp_type =~ /Unreviewed/i) {
237 
238  #Use normal source only if it is PE levels 1 & 2
239  if (defined($protein_evidence_code) && $protein_evidence_code < 3) {
240  $xref->{SOURCE_ID} = $sptr_source_id;
241  $num_sptr++;
242  } else {
243  $xref->{SOURCE_ID} = $sptr_non_display_source_id;
244  $num_sptr_non_display++;
245  }
246 
247  } else {
248 
249  next; # ignore if it's neither one nor t'other
250 
251  }
252 
253 
254 
255  # some straightforward fields
256  # the previous $label flag of type BRCA2_HUMAN is not used in Uniprot any more, use accession instead
257  $xref->{LABEL} = $accessions[0] ."." . $version;
258  $xref->{VERSION} = $version;
259  $xref->{SPECIES_ID} = $species_id;
260  $xref->{SEQUENCE_TYPE} = 'peptide';
261  $xref->{STATUS} = 'experimental';
262 
263  # May have multi-line descriptions
264  my ($description_and_rest) = $_ =~ /(DE\s+.*)/s;
265  @all_lines = split /\n/, $description_and_rest;
266 
267  # extract ^DE lines only & build cumulative description string
268  my $description = "";
269  my $name = "";
270  my $sub_description = "";
271 
272  foreach my $line (@all_lines) {
273 
274  next if(!($line =~ /^DE/));
275 
276  # get the data
277  if($line =~ /^DE RecName: Full=(.*);/){
278  $name .= '; ' if $name ne q{}; #separate multiple sub-names with a '; '
279  $name .= $1;
280  }
281  elsif($line =~ /RecName: Full=(.*);/){
282  $description .= ' ' if $description ne q{}; #separate the description bit with just a space
283  $description .= $1;
284  }
285  elsif($line =~ /SubName: Full=(.*);/){
286  $name .= '; ' if $name ne q{}; #separate multiple sub-names with a '; '
287  $name .= $1;
288  }
289 
290 
291  $description =~ s/^\s*//g;
292  $description =~ s/\s*$//g;
293 
294 
295  my $desc = $name.' '.$description;
296  if(!length($desc)){
297  $desc = $sub_description;
298  }
299 
300  $desc =~ s/\s*\{ECO:.*?\}//g;
301  $xref->{DESCRIPTION} = $desc;
302 
303  # Parse the EC_NUMBER line, only for S.cerevisiae for now
304 
305  if (($line =~ /EC=/) && ($species_id == 4932)) {
306 
307  #print STDERR "EC Number line: $line\n";
308 
309  $line =~ /^DE\s+EC=([^;]+);/;
310 
311  # Get the EC Number and make it an xref for S.cer if any
312 
313  my $EC = $1;
314 
315  #print STDERR "EC after processing: $EC\n";
316 
317  my %depe;
318  $depe{LABEL} = $EC;
319  $depe{ACCESSION} = $EC;
320 
321  $depe{SOURCE_NAME} = "EC_NUMBER";
322 
323  $depe{SOURCE_ID} = $dependent_sources{"EC_NUMBER"};
324  $depe{LINKAGE_SOURCE_ID} = $xref->{SOURCE_ID};
325  push @{$xref->{DEPENDENT_XREFS}}, \%depe;
326  $dependent_xrefs{"EC_NUMBER"}++;
327  }
328 
329  }
330 
331  # extract sequence
332  my ($seq) = $_ =~ /SQ\s+(.+)/s; # /s allows . to match newline
333  my @seq_lines = split /\n/, $seq;
334  my $parsed_seq = "";
335  foreach my $x (@seq_lines) {
336  $parsed_seq .= $x;
337  }
338  $parsed_seq =~ s/\/\///g; # remove trailing end-of-record character
339  $parsed_seq =~ s/\s//g; # remove whitespace
340  $parsed_seq =~ s/^.*;//g; # remove everything before last ;
341 
342  $xref->{SEQUENCE} = $parsed_seq;
343  #print "Adding " . $xref->{ACCESSION} . " " . $xref->{LABEL} ."\n";
344 
345 
346  my ($gns) = $_ =~ /(GN\s+.+)/s;
347  my @gn_lines = ();
348  if (defined $gns) {
349  foreach my $gn_line (split("\n", $gns)) {
350  if ($gn_line !~ /^GN/) {last;}
351 
352  $gn_line =~ s/^GN\s+//g;
353  push(@gn_lines, $gn_line);
354  }
355 
356  $gns = join('', @gn_lines);
357  @gn_lines = split /;/, $gns;
358  }
359 
360  # Do not allow the addition of UniProt Gene Name dependent Xrefs
361  # if the protein was imported from Ensembl. Otherwise we will
362  # re-import previously set symbols
363  if(! $ensembl_derived_protein) {
364  my %depe;
365  foreach my $gn (@gn_lines){
366  my $gene_name = undef;
367 
368  if ($gn =~ /Name=([A-Za-z0-9_\-\.\s:]+)/s) { #/s for multi-line entries ; is the delimiter
369 # Example line
370 # GN Name=ctrc {ECO:0000313|Xenbase:XB-GENE-5790348};
371  my $name = $1;
372  $name =~ s/\s+$//g; # Remove white spaces that are left over at the end if there was an evidence code
373  $depe{LABEL} = $name; # leave name as is, upper/lower case is relevant in gene names
374  $depe{ACCESSION} = $self->get_name($xref->{ACCESSION},$depe{LABEL});
375  $gene_name = $depe{ACCESSION};
376 
377  $depe{SOURCE_NAME} = "Uniprot_gn";
378  $depe{SOURCE_ID} = $dependent_sources{"Uniprot_gn"};
379  $depe{LINKAGE_SOURCE_ID} = $xref->{SOURCE_ID};
380  $depe{DESCRIPTION} = $hgnc_descriptions{$name} if ($hgnc_file && defined($hgnc_descriptions{$name}));
381  push @{$xref->{DEPENDENT_XREFS}}, \%depe;
382  $dependent_xrefs{"Uniprot_gn"}++;
383  }
384  my @syn;
385  if($gn =~ /Synonyms=(.*)/s){ # use of /s as synonyms can be across more than one line
386 # Example line
387 # GN Synonyms=cela2a {ECO:0000313|Ensembl:ENSXETP00000014934},
388 # GN MGC79767 {ECO:0000313|EMBL:AAH80976.1}
389  my $syn = $1;
390  $syn =~ s/{.*}//g; # Remove any potential evidence codes
391  $syn =~ s/\n//g; # Remove return carriages, as entry can span several lines
392  $syn =~ s/\s+$//g; # Remove white spaces that are left over at the end if there was an evidence code
393  #$syn =~ s/^\s+//g; # Remove white spaces that are left over at the beginning if there was an evidence code
394  $syn =~ s/\s+,/,/g; # Remove white spaces that are left over before the comma if there was an evidence code
395  @syn = split(/, /,$syn);
396  push (@{$depe{"SYNONYMS"}}, @syn);
397  }
398  }
399  }
400 
401  # dependent xrefs - only store those that are from sources listed in the source table
402  my ($deps) = $_ =~ /(DR\s+.+)/s; # /s allows . to match newline
403 
404  my @dep_lines = ();
405  if ( defined $deps ) { @dep_lines = split /\n/, $deps }
406 
407  my %seen=(); # per record basis
408 
409  foreach my $dep (@dep_lines) {
410  #both GO and UniGene have the own sources so ignore those in the uniprot files
411  #as the uniprot data should be older
412  if($dep =~ /GO/ || $dep =~ /UniGene/){
413  next;
414  }
415  if ($dep =~ /^DR\s+(.+)/) {
416  my ($source, $acc, @extra) = split /;\s*/, $1;
417  if($source =~ "RGD"){ #using RGD file now instead.
418  next;
419  }
420  if($source =~ "CCDS"){
421  next;
422  }
423  if($source =~ "IPI"){
424  next;
425  }
426  if($source =~ "UCSC"){
427  next;
428  }
429  if($source =~ "SGD"){
430  next;
431  }
432  if($source =~ "HGNC"){
433  next;
434  }
435  # We get the mappings directly from the source
436  if($source =~ "MGI"){
437  next;
438  }
439  # Nomenclature data is imported directly from the source
440  if($source =~ "VGNC"){
441  next;
442  }
443  if($source =~ "Orphanet"){
444  #we don't want to parse Orphanet xrefs via Uniprot, we get them from Orphanet with descriptions
445  next;
446  }
447  if($source =~ "ArrayExpress"){
448  next;
449  }
450  if($source =~ "GenomeRNAi" || $source =~ "EPD"){
451  next;
452  }
453  if($source =~ "Xenbase"){
454  next;
455  }
456 # Uniprot get Reactome links from Reactome, so we want to get the info from Reactome directly
457  if($source =~ "Reactome"){
458  next;
459  }
460 # MIM xrefs are already imported separately, ignore from Uniprot
461 # Also, Uniprot deals with proteins, not appropriate for gene level xrefs
462  if ($source =~ "MIM_GENE" || $source =~ "MIM_MORBID" || $source =~ "MIM") {
463  next;
464  }
465  # GeneCards xrefs are imported through the HGNC file
466  if ($source =~ "GeneCards") {
467  next;
468  }
469 # If mapped to Ensembl, add as direct xref
470  if ($source eq "Ensembl") {
471 # Example line:
472 # DR Ensembl; ENST00000380152; ENSP00000369497; ENSG00000139618.
473 # DR Ensembl; ENST00000372839; ENSP00000361930; ENSG00000166913. [P31946-1]
474 # $source is Ensembl, $acc is ENST00000380152 and @extra is the rest of the line
475 # If the UniProt accession is repeated here, it links to a specific isoform
476  my %direct;
477  my $isoform;
478 
479  my $stable_id = $extra[0];
480  $stable_id =~ s/\.[0-9]+//;
481  $direct{STABLE_ID} = $stable_id;
482  $direct{ENSEMBL_TYPE} = 'Translation';
483  $direct{LINKAGE_TYPE} = 'DIRECT';
484  if ($xref->{SOURCE_ID} == $sp_source_id) {
485  $direct{SOURCE_ID} = $sp_direct_source_id;
486  $num_direct_sp++;
487  } else {
488  $direct{SOURCE_ID} = $sptr_direct_source_id;
489  $num_direct_sptr++;
490  }
491  push @{$xref->{DIRECT_XREFS}}, \%direct;
492 
493  my $uniprot_acc = $accessions[0];
494  if ($extra[1] =~ /($accessions[0]-[0-9]+)/) {
495  $isoform = $1;
496  $self->add_to_direct_xrefs({
497  stable_id => $stable_id,
498  type => 'translation',
499  acc => $isoform,
500  label => $isoform,
501  dbi => $dbi,
502  source_id => $isoform_source_id,
503  linkage => 'DIRECT',
504  species_id => $species_id
505  });
506  }
507  }
508  if (exists $dependent_sources{$source} ) {
509  # create dependent xref structure & store it
510  my %dep;
511  $dep{SOURCE_NAME} = $source;
512  $dep{LINKAGE_SOURCE_ID} = $xref->{SOURCE_ID};
513  $dep{SOURCE_ID} = $dependent_sources{$source};
514 
515  if($source =~ /HGNC/){
516  $acc =~ s/HGNC://;
517  $extra[0] =~ s/[.]//;
518  $dep{LABEL} = $extra[0];
519  }
520  $dep{ACCESSION} = $acc;
521 
522 # $dep{ACCESSION} = $acc;
523  $dependent_xrefs{ $dep{SOURCE_NAME} }++; # get count of depenent xrefs.
524  if(!defined($seen{$dep{SOURCE_NAME}.":".$dep{ACCESSION}})){
525  push @{$xref->{DEPENDENT_XREFS}}, \%dep; # array of hashrefs
526  $seen{$dep{SOURCE_NAME}.":".$dep{ACCESSION}} =1;
527  }
528  if($dep =~ /EMBL/ && !($dep =~ /ChEMBL/)){
529  my ($protein_id) = $extra[0];
530  if(($protein_id ne "-") and (!defined($seen{$source.":".$protein_id}))){
531  my %dep2;
532  $dep2{SOURCE_NAME} = $source;
533  $dep2{SOURCE_ID} = $dependent_sources{"protein_id"};
534  $dep2{LINKAGE_SOURCE_ID} = $xref->{SOURCE_ID};
535  # store accession unversioned
536  $dep2{LABEL} = $protein_id;
537  my ($prot_acc, $prot_version) = $protein_id =~ /([^.]+)\.([^.]+)/;
538  $dep2{ACCESSION} = $prot_acc;
539  $dependent_xrefs{ $dep2{SOURCE_NAME} }++; # get count of dependent xrefs.
540  $seen{$source.":".$protein_id} = 1;
541  push @{$xref->{DEPENDENT_XREFS}}, \%dep2; # array of hashrefs
542  }
543  }
544  }
545  }
546  }
547 
548  push @xrefs, $xref;
549 
550  if ($count > 1000) {
551  $self->upload_xref_object_graphs(\@xrefs, $dbi);
552  $count = 0;
553  undef @xrefs;
554  }
555 
556  }
557 
558  $self->upload_xref_object_graphs(\@xrefs, $dbi) if scalar(@xrefs) > 0;
559 
560  $uniprot_io->close();
561 
562  print "Read $num_sp SwissProt xrefs, $num_sptr SPTrEMBL xrefs with protein evidence codes 1-2, and $num_sptr_non_display SPTrEMBL xrefs with protein evidence codes > 2 from $file\n" if($verbose);
563  print "Added $num_direct_sp direct SwissProt xrefs and $num_direct_sptr direct SPTrEMBL xrefs\n" if ($verbose);
564  print "Found $num_sp_pred predicted SwissProt xrefs and $num_sptr_pred predicted SPTrEMBL xrefs\n" if (($num_sp_pred > 0 || $num_sptr_pred > 0) and $verbose);
565  print "Skipped $ensembl_derived_protein_count ensembl annotations as Gene names\n";
566 
567 
568 # print "$kount gene anmes added\n";
569 
570  print "Added the following dependent xrefs:-\n" if($verbose);
571  foreach my $key (keys %dependent_xrefs){
572  print $key."\t".$dependent_xrefs{$key}."\n" if($verbose);
573  }
574  print "End.\n" if ($verbose);
575 
576  #TODO - currently include records from other species - filter on OX line??
577 }
578 
579 sub get_name {
580  my $self = shift;
581  my $acc = shift;
582  my $label = shift;
583 
584  return $acc;
585 }
586 
587 sub get_hgnc_descriptions {
588  my ($self, $hgnc_file) = @_;
589  my %descriptions;
590 
591  my $hgnc_fh = $self->get_filehandle($hgnc_file);
592  if ( !defined $hgnc_fh ) {confess "Can't open HGNC file '$hgnc_file'\n";}
593  $hgnc_file = do { local $/; <$hgnc_fh> };
594 
595  my $input_file = Text::CSV->new({
596  sep_char => "\t",
597  empty_is_undef => 1,
598  binary => 1,
599  auto_diag => 1
600  }) or croak "Cannot use file $hgnc_file: ".Text::CSV->error_diag ();
601 
602  $hgnc_file = Encode::encode("UTF-8", $hgnc_file);
603  $hgnc_file =~ s/"//xg;
604 
605  open my $hgnc_io, '<', \$hgnc_file or confess "Can't open HGNC file: $!\n";
606 
607  $input_file->column_names( @{ $input_file->getline( $hgnc_io ) } );
608 
609  while ( my $data = $input_file->getline_hr( $hgnc_io ) ) {
610  my $gene_name = $data->{'Approved symbol'};
611  my $description = $data->{'Approved name'};
612 
613  $descriptions{$gene_name} = $description;
614  }
615 
616  close $hgnc_io;
617 
618  return %descriptions;
619 }
620 
621 1;
map
public map()
XrefParser::BaseParser
Definition: BaseParser.pm:8
run
public run()