ensembl-hive  2.8.1
PHIbaseParser.pm
Go to the documentation of this file.
1 =head1 LICENSE
2 
3 See the NOTICE file distributed with this work for additional information
4 regarding copyright ownership.
5 
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9 
10  http://www.apache.org/licenses/LICENSE-2.0
11 
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 
18 =cut
19 
20 package XrefParser::PHIbaseParser;
21 
22 use strict;
23 use warnings;
24 use Carp;
25 use POSIX qw(strftime);
26 use File::Basename;
27 
28 use XML::LibXML;
29 
30 use base qw( XrefParser::BaseParser );
31 
# Parse a PHI-base XML file and register each PHI-base identifier as a
# dependent xref of the UniProt xref(s) it is mapped to.
#
# Arguments: a single hash reference with the keys
#   source_id  - required; replaced below by the id looked up for the
#                source name "PHIbase" before any xref is added
#   species_id - required; used to select the UniProt xrefs and the tax ids
#   files      - required; array reference, only the first entry is parsed
#   verbose    - optional; when true, progress notes are written to STDERR
#
# Returns 0 once the file has been processed.
# Croaks when a required argument is missing or no file name was supplied.
# Errors raised by the XML parser are not caught here.
sub run {
  my ($self, $ref_arg) = @_;

  my $source_id  = $ref_arg->{source_id};
  my $species_id = $ref_arg->{species_id};
  my $files      = $ref_arg->{files};
  my $verbose    = $ref_arg->{verbose};

  if ((!defined $source_id) or (!defined $species_id) or (!defined $files)) {
    croak "Need to pass source_id, species_id and file as pairs";
  }
  $verbose |= 0;

  my $phi_xml_file = $files->[0];
  if (!defined $phi_xml_file) {
    croak "Need to pass at least one PHIbase XML file in files";
  }

  print STDERR "PhiBase file to parse, $phi_xml_file\n" if ($verbose);

  # UniProt accession => { -phi_ids => [PHI-base ids], -tax_id => tax id or undef }
  my %phi_mapping;
  # Set of every tax id seen in the file (only used for the summary line).
  my %taxIds;

  my $phi_parser = XML::LibXML->new();
  my $phi_doc    = $phi_parser->parse_file($phi_xml_file);

  my $concepts_count = 0;

  CONCEPT:
  foreach my $concept ($phi_doc->findnodes('ondex:ondexdata/ondexdataseq/concepts/concept')) {

    my ($pid_node) = $concept->findnodes('./pid');
    if (!defined $pid_node) {
      # Without a pid there is nothing to attach, so skip instead of dying.
      print STDERR "Skipping a concept without a pid element\n" if ($verbose);
      next CONCEPT;
    }
    my $pid = $pid_node->to_literal;

    # Pick the accession whose elementOf mentions UniProt; when several
    # match, the last one wins.
    my $uniprot_acc = undef;
    foreach my $concept_accession ($concept->findnodes('./coaccessions/concept_accession')) {
      my ($element_of) = $concept_accession->findnodes('./elementOf');
      next if (!defined $element_of);
      next if ($element_of->to_literal !~ /UPROT/);

      my ($accession) = $concept_accession->findnodes('./accession');
      $uniprot_acc = $accession->to_literal if (defined $accession);
    }

    # Concepts without a UniProt accession are still counted and still
    # contribute their tax id to the summary, but yield no mapping.
    if (defined $uniprot_acc) {
      if (!defined $phi_mapping{$uniprot_acc}) {
        $phi_mapping{$uniprot_acc} = {
          -phi_ids => [],
          -tax_id  => undef,
        };
      }
      push @{ $phi_mapping{$uniprot_acc}{-phi_ids} }, $pid;
    }

    # Get the TaxId
    foreach my $concept_gds ($concept->findnodes('./cogds/concept_gds')) {
      my ($attrname) = $concept_gds->findnodes('./attrname');
      next if (!defined $attrname);
      next if ($attrname->to_literal !~ /TAXID/);

      my ($value) = $concept_gds->findnodes('./value');
      next if (!defined $value);

      my $tax_id = $value->to_literal;
      $tax_id =~ s/\D//g;
      # A value without any digit cannot match a configured tax id.
      next if ($tax_id eq '');

      $taxIds{$tax_id} = 1;

      if (defined $uniprot_acc) {
        $phi_mapping{$uniprot_acc}{-tax_id} = $tax_id;
      }
    }

    $concepts_count++;
  }

  print "Parsed $concepts_count concepts\n";
  print "Found " . scalar(keys %phi_mapping) . " with UniProt mapping!\n";
  print "Found " . scalar(keys %taxIds) . " different taxIds\n";

  # get the "main" PHIbase source id.
  $source_id = $self->get_source_id_for_source_name("PHIbase");

  # get the mapping that are already there so that we don't get lots of duplicates.
  # stored in the global hash xref_dependent_mapped.
  $self->get_dependent_mappings($source_id);

  # UniProt accession => array reference of master xref ids (as used below).
  my %swiss = %{ $self->get_valid_codes("uniprot/", $species_id) };

  print "got " . scalar(keys %swiss) . " Uniprot entries\n";
  print "species_id, source_id: $species_id, $source_id\n";

  # Don't check only the species_id, but all taxIds specified in xref_config.ini
  my %species2tax  = $self->species_id2taxonomy();
  my $tax_ids_aref = $species2tax{$species_id};
  # A species without a taxonomy entry simply matches nothing.
  my @tax_ids = defined $tax_ids_aref ? @{$tax_ids_aref} : ();

  print "tax_ids from xref_config.ini file: " . join(', ', @tax_ids) . "\n";

  # Built once so the loop below does a lookup instead of a scan per accession.
  my %is_wanted_tax_id = map { $_ => 1 } @tax_ids;

  my $added = 0;

  UNIPROT:
  foreach my $uniprot_acc (keys %phi_mapping) {
    my $phis_href = $phi_mapping{$uniprot_acc};
    my $tax_id    = $phis_href->{-tax_id};

    # Entries whose concept carried no usable TAXID can never match.
    next UNIPROT if (!defined $tax_id);
    next UNIPROT if (!$is_wanted_tax_id{$tax_id});

    if (!defined $swiss{$uniprot_acc}) {
      print STDERR "failed to get the master_xref_if for UniProt, $uniprot_acc!\n";
      # one reason it happens is that the UniProt identifier is attached to
      # a different tax node in Phibase that it is in UniProt
      next UNIPROT;
    }

    foreach my $master_xref_id (@{ $swiss{$uniprot_acc} }) {
      foreach my $phibase_id (@{ $phis_href->{-phi_ids} }) {
        $self->add_dependent_xref({ master_xref_id => $master_xref_id,
                                    acc            => $phibase_id,
                                    label          => $phibase_id,
                                    source_id      => $source_id,
                                    species_id     => $species_id });
        $added++;
      }
    }
  }

  print "Added $added PHIbase xrefs\n";

  return 0;
}
193 
194 1;
XrefParser::BaseParser
Definition: BaseParser.pm:8
run
public run()