ensembl-hive  2.7.0
UniProtDatabaseParser.pm
Go to the documentation of this file.
1 =head1 LICENSE
2 
3 See the NOTICE file distributed with this work for additional information
4 regarding copyright ownership.
5 
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9 
10  http://www.apache.org/licenses/LICENSE-2.0
11 
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 
18 =cut
19 
20 # Parse UniProt (SwissProt & SPTrEMBL) files to create xrefs.
21 #
22 # Files actually contain both types of xref, distinguished by ID line;
23 #
24 # ID CYC_PIG Reviewed; 104 AA. Swissprot
25 # ID Q3ASY8_CHLCH Unreviewed; 36805 AA. SPTrEMBL
26 
27 
28 
29 package XrefParser::UniProtDatabaseParser;
30 
31 use strict;
32 use warnings;
33 use Carp;
34 use POSIX qw(strftime);
35 use File::Basename;
36 
37 use base qw( XrefParser::BaseParser );
38 
39 
40 
# Copy the UniProt xrefs of one species from an xref source database and
# hand them, together with their attached data, to
# upload_xref_object_graphs().
#
# Takes a single hash reference with the keys:
#   source_id   - required; only checked for presence here
#   species_id  - required; species whose xrefs are read
#   xref_source - required; handle whose prepare() is used for every read
#   dbi         - not validated here; passed on unchanged to every source
#                 lookup and every upload call
#
# For each xref found under one of the six UniProt sources, a hash is built
# holding the xref columns plus, when rows exist, its sequence, SYNONYMS,
# DIRECT_XREFS and DEPENDENT_XREFS (dependent xrefs of another species are
# skipped). The hashes are uploaded in batches.
#
# Croaks if source_id, species_id or xref_source is missing.
# Returns 0 on success.
sub run {
  my ($self, $ref_arg) = @_;

  my $source_id   = $ref_arg->{source_id};
  my $species_id  = $ref_arg->{species_id};
  my $dbi         = $ref_arg->{dbi};
  my $xref_source = $ref_arg->{xref_source};

  if (!defined $source_id or !defined $species_id or !defined $xref_source) {
    croak "Need to pass source_id, species_id and xref_source";
  }

  # Source name / priority description pairs, in the order they are processed.
  my @source_lookups = (
    [ 'Uniprot/SWISSPROT', 'sequence_mapped' ],
    [ 'Uniprot/SPTREMBL',  'sequence_mapped' ],
    [ 'Uniprot/SPTREMBL',  'protein_evidence_gt_2' ],
    [ 'Uniprot/SWISSPROT', 'direct' ],
    [ 'Uniprot/SPTREMBL',  'direct' ],
    [ 'Uniprot_isoform',   undef ],
  );

  # Every lookup, including the isoform one, is given the caller's $dbi so
  # that all source ids are resolved through the same handle. 'scalar' keeps
  # exactly one id per lookup whatever the method returns in list context.
  my @source_ids =
    map { scalar $self->get_source_id_for_source_name($_->[0], $_->[1], $dbi) }
    @source_lookups;

  my $get_xref_sth = $xref_source->prepare(
    "SELECT xref_id, accession, version, label, description, info_type ".
    "FROM xref WHERE species_id = ? AND source_id = ?"
  );
  my $get_dependent_sth = $xref_source->prepare(
    "SELECT x.xref_id, x.accession, x.version, x.label, x.description, x.source_id, x.species_id, dx.linkage_source_id FROM xref x, dependent_xref dx ".
    "WHERE dx.dependent_xref_id = x.xref_id and dx.master_xref_id = ?"
  );
  my $get_sequence_sth = $xref_source->prepare(
    "SELECT sequence, sequence_type, status FROM primary_xref WHERE xref_id = ?"
  );
  my $get_synonym_sth = $xref_source->prepare(
    "SELECT synonym FROM synonym WHERE xref_id = ?"
  );
  my $get_direct_sth = $xref_source->prepare(
    "SELECT ensembl_stable_id, linkage_xref FROM translation_direct_xref WHERE general_xref_id = ?"
  );

  # A flush happens once the pending count exceeds this value.
  my $flush_threshold = 1000;

  my @xrefs;
  my $count = 0;

  foreach my $xref_source_id (@source_ids) {
    # Columns are bound after each execute(), as in the DBI recommendation
    # for driver portability.
    my ($xref_id, $accession, $version, $label, $description, $info_type);
    $get_xref_sth->execute($species_id, $xref_source_id);
    $get_xref_sth->bind_columns(\$xref_id, \$accession, \$version, \$label, \$description, \$info_type);

    while ($get_xref_sth->fetch) {
      $count++;

      my $xref = {
        ACCESSION   => $accession,
        LABEL       => $label,
        VERSION     => $version,
        SPECIES_ID  => $species_id,
        INFO_TYPE   => $info_type,
        SOURCE_ID   => $xref_source_id,
        DESCRIPTION => $description,
      };

      # Add sequence if there is some; if several rows exist the last wins.
      my ($parsed_seq, $type, $status);
      $get_sequence_sth->execute($xref_id);
      $get_sequence_sth->bind_columns(\$parsed_seq, \$type, \$status);
      while ($get_sequence_sth->fetch) {
        $xref->{SEQUENCE_TYPE} = $type;
        $xref->{STATUS}        = $status;
        $xref->{SEQUENCE}      = $parsed_seq;
      }

      # Look for synonyms; the key is only created when at least one exists.
      my $synonym;
      $get_synonym_sth->execute($xref_id);
      $get_synonym_sth->bind_columns(\$synonym);
      while ($get_synonym_sth->fetch) {
        push @{ $xref->{SYNONYMS} }, $synonym;
      }

      # Look for direct xrefs
      my ($stable_id, $linkage_xref);
      $get_direct_sth->execute($xref_id);
      $get_direct_sth->bind_columns(\$stable_id, \$linkage_xref);
      while ($get_direct_sth->fetch) {
        push @{ $xref->{DIRECT_XREFS} }, {
          STABLE_ID    => $stable_id,
          ENSEMBL_TYPE => 'Translation',
          LINKAGE_TYPE => $linkage_xref,
          SOURCE_ID    => $xref_source_id,
        };
      }

      # Add any dependent xrefs
      my ($dep_xref_id, $dep_accession, $dep_version, $dep_label,
          $dep_description, $dep_source_id, $dep_species_id, $linkage_source_id);
      $get_dependent_sth->execute($xref_id);
      $get_dependent_sth->bind_columns(\$dep_xref_id, \$dep_accession, \$dep_version, \$dep_label, \$dep_description, \$dep_source_id, \$dep_species_id, \$linkage_source_id);
      while ($get_dependent_sth->fetch) {
        # Only keep dependent xrefs of the species being processed.
        next if $dep_species_id != $species_id;

        my %dep = (
          ACCESSION         => $dep_accession,
          LABEL             => $dep_label,
          VERSION           => $dep_version,
          DESCRIPTION       => $dep_description,
          SOURCE_ID         => $dep_source_id,
          LINKAGE_SOURCE_ID => $linkage_source_id,
        );

        # The synonym handle is free again here: the master xref's synonyms
        # were fully fetched above.
        my $dep_synonym;
        $get_synonym_sth->execute($dep_xref_id);
        $get_synonym_sth->bind_columns(\$dep_synonym);
        while ($get_synonym_sth->fetch) {
          push @{ $dep{SYNONYMS} }, $dep_synonym;
        }

        push @{ $xref->{DEPENDENT_XREFS} }, \%dep;
      }

      push @xrefs, $xref;

      # Upload in batches so the pending list does not grow without bound.
      if ($count > $flush_threshold) {
        $self->upload_xref_object_graphs(\@xrefs, $dbi);
        $count = 0;
        @xrefs = ();
      }
    }
  }

  # Upload whatever is left over from the last, incomplete batch.
  $self->upload_xref_object_graphs(\@xrefs, $dbi) if scalar(@xrefs) > 0;

  return 0; # successful
}
160 
161 1;
XrefParser::BaseParser
Definition: BaseParser.pm:8
run
public run()