ensembl-hive  2.7.0
RefSeqDatabaseParser.pm
Go to the documentation of this file.
1 =head1 LICENSE
2 
3 See the NOTICE file distributed with this work for additional information
4 regarding copyright ownership.
5 
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9 
10  http://www.apache.org/licenses/LICENSE-2.0
11 
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 
18 =cut
19 
20 # Parse RefSeq data from the central database to create species-specific xrefs.
21 
22 package XrefParser::RefSeqDatabaseParser;
23 
24 use strict;
25 use warnings;
26 use Carp;
27 use File::Basename;
28 
29 use base qw( XrefParser::BaseParser );
30 
# run - copy the RefSeq xrefs of one species from the central xref source
# database into the xref database being built.
#
# Takes a single hash ref with the keys:
#   source_id   - id of the source being parsed. Its name decides which RefSeq
#                 source names are copied: the mRNA/ncRNA ones for a name
#                 matching RefSeq_dna, the peptide ones for RefSeq_peptide.
#   species_id  - species whose xrefs are copied.
#   dbi         - handle passed on to the BaseParser helpers; also used to
#                 insert dependent_xref rows and to commit them.
#   xref_source - handle the xrefs, sequences, synonyms, pairs and dependent
#                 xrefs are read from.
#
# Returns 0 on success. Croaks when source_id, species_id or xref_source is
# not defined.
sub run {

  my ($self, $ref_arg) = @_;
  my $source_id   = $ref_arg->{source_id};
  my $species_id  = $ref_arg->{species_id};
  my $dbi         = $ref_arg->{dbi};
  my $xref_source = $ref_arg->{xref_source};

  if ((!defined $source_id) or (!defined $species_id) or (!defined $xref_source)) {
    croak "Need to pass source_id, species_id and xref_source";
  }

  my $source_name = $self->get_source_name_for_source_id($source_id, $dbi);

  # Each entry is [ source name, second argument of
  # get_source_id_for_source_name ] (presumably the priority description --
  # confirm against BaseParser).
  my @wanted_sources;
  if ($source_name =~ /RefSeq_dna/) {
    @wanted_sources = (
      [ 'RefSeq_mRNA',            'refseq' ],
      [ 'RefSeq_mRNA_predicted',  'refseq' ],
      [ 'RefSeq_ncRNA',           undef ],
      [ 'RefSeq_ncRNA_predicted', undef ],
    );
  } elsif ($source_name =~ /RefSeq_peptide/) {
    @wanted_sources = (
      [ 'RefSeq_peptide',           undef ],
      [ 'RefSeq_peptide_predicted', undef ],
    );
  }

  my @source_ids;
  foreach my $wanted (@wanted_sources) {
    my ($wanted_name, $priority_desc) = @{$wanted};
    my $wanted_id = $self->get_source_id_for_source_name($wanted_name, $priority_desc, $dbi);
    push @source_ids, $wanted_id;
  }

  my $entrez_source_id = $self->get_source_id_for_source_name('EntrezGene', undef, $dbi);
  my $wiki_source_id   = $self->get_source_id_for_source_name('WikiGene', undef, $dbi);

  # Existing EntrezGene accession => label
  my (%entrez) = %{$self->get_acc_to_label("EntrezGene", $species_id, undef, $dbi)};

  # Existing mrna, entrezgene and wikigene accession => list of xref_ids.
  # Only loaded for peptides, so the mRNA dependent links below are only ever
  # created (and the insert statement only ever used) in that case.
  my (%refseq_ids, %entrez_ids, %wiki_ids, $add_dependent_xref_sth);
  if ($source_name =~ /RefSeq_peptide/) {
    (%refseq_ids) = %{ $self->get_valid_codes("RefSeq_mRNA", $species_id, $dbi) };
    (%entrez_ids) = %{ $self->get_valid_codes("EntrezGene", $species_id, $dbi) };
    (%wiki_ids)   = %{ $self->get_valid_codes("WikiGene", $species_id, $dbi) };
    $add_dependent_xref_sth = $dbi->prepare("INSERT INTO dependent_xref (master_xref_id, dependent_xref_id, linkage_source_id) VALUES (?,?, $entrez_source_id)");
  }

  my $get_xref_sql = "SELECT xref_id, accession, version, label, description, info_type ".
                     "FROM xref WHERE species_id = ? AND source_id = ?";
  my $get_xref_sth = $xref_source->prepare($get_xref_sql);
  my $get_dependent_sql = "SELECT x.xref_id, x.accession, x.version, x.label, x.description, x.source_id, x.species_id, dx.linkage_source_id FROM xref x, dependent_xref dx ".
                          "WHERE dx.dependent_xref_id = x.xref_id and dx.master_xref_id = ?";
  my $get_dependent_sth = $xref_source->prepare($get_dependent_sql);
  my $get_sequence_sql = "SELECT sequence, sequence_type, status FROM primary_xref WHERE xref_id = ?";
  my $get_sequence_sth = $xref_source->prepare($get_sequence_sql);
  my $get_synonym_sql = "SELECT synonym FROM synonym WHERE xref_id = ?";
  my $get_synonym_sth = $xref_source->prepare($get_synonym_sql);
  my $get_pair_sql = "SELECT accession2 FROM pairs where accession1 = ?";
  my $get_pair_sth = $xref_source->prepare($get_pair_sql);

  # Targets of the bind_columns calls below
  my ($xref_id, $accession, $version, $label, $description, $info_type);
  my ($parsed_seq, $type, $status, $synonym);
  my ($dep_xref_id, $dep_accession, $dep_version, $dep_label, $dep_description,
      $dep_source_id, $dep_species_id, $linkage_source_id);

  my @xrefs;
  my $count = 0;
  foreach my $xref_source_id (@source_ids) {
    $get_xref_sth->execute($species_id, $xref_source_id);
    $get_xref_sth->bind_columns(\$xref_id, \$accession, \$version, \$label, \$description, \$info_type);
    while ($get_xref_sth->fetch()) {
      $count++;

      my $xref = {
        ACCESSION   => $accession,
        LABEL       => $label,
        VERSION     => $version,
        SPECIES_ID  => $species_id,
        INFO_TYPE   => $info_type,
        SOURCE_ID   => $xref_source_id,
        DESCRIPTION => $description,
      };

      # Add sequence if there is some
      $get_sequence_sth->execute($xref_id);
      $get_sequence_sth->bind_columns(\$parsed_seq, \$type, \$status);
      while ($get_sequence_sth->fetch) {
        $xref->{SEQUENCE_TYPE} = $type;
        $xref->{STATUS}        = $status;
        $xref->{SEQUENCE}      = $parsed_seq;
      }

      # Add pair information if there is some.
      # The bound variable is scoped to this xref on purpose: a fetch that
      # returns no row leaves a bound variable untouched, so a variable shared
      # between iterations would still carry the previous xref's pair and link
      # that mRNA to this xref's genes.
      my $refseq_pair;
      $get_pair_sth->execute($accession);
      $get_pair_sth->bind_columns(\$refseq_pair);
      while ($get_pair_sth->fetch) {
        $xref->{PAIR} = $refseq_pair;
      }

      # Pair accession with its version suffix removed; %refseq_ids is looked
      # up with this form. PAIR above keeps the value as fetched.
      my $paired_mrna = $refseq_pair;
      $paired_mrna =~ s/\.[0-9]*// if $paired_mrna;

      # Look for synonyms
      $get_synonym_sth->execute($xref_id);
      $get_synonym_sth->bind_columns(\$synonym);
      while ($get_synonym_sth->fetch) {
        push @{ $xref->{SYNONYMS} }, $synonym;
      }

      # master (RefSeq mRNA) and dependent (EntrezGene/WikiGene) xref ids to
      # insert for this xref; the two lists are kept in step.
      my (@master_ids, @dependent_ids);

      # Add any dependent xrefs
      $get_dependent_sth->execute($xref_id);
      $get_dependent_sth->bind_columns(\$dep_xref_id, \$dep_accession, \$dep_version, \$dep_label, \$dep_description, \$dep_source_id, \$dep_species_id, \$linkage_source_id);
      while ($get_dependent_sth->fetch) {
        next if $dep_species_id != $species_id;
        next if !defined $entrez{$dep_accession};

        # The same gene is attached twice: once as EntrezGene, once as WikiGene
        foreach my $gene_source_id ($entrez_source_id, $wiki_source_id) {
          push @{ $xref->{DEPENDENT_XREFS} }, {
            ACCESSION         => $dep_accession,
            LABEL             => $entrez{$dep_accession},
            VERSION           => $dep_version,
            DESCRIPTION       => $dep_description,
            SOURCE_ID         => $gene_source_id,
            LINKAGE_SOURCE_ID => $linkage_source_id
          };
        }

        # Add dependent xrefs for RefSeq mRNA as well where available
        # only after they are added in priority 1
        next if !defined $paired_mrna;
        next if !$refseq_ids{$paired_mrna};
        foreach my $refseq_id (@{ $refseq_ids{$paired_mrna} }) {
          foreach my $entrez_id (@{ $entrez_ids{$dep_accession} }) {
            push @master_ids,    $refseq_id;
            push @dependent_ids, $entrez_id;
          }
          foreach my $wiki_id (@{ $wiki_ids{$dep_accession} }) {
            push @master_ids,    $refseq_id;
            push @dependent_ids, $wiki_id;
          }
        }
      }

      push @xrefs, $xref;

      # Non-empty only when %refseq_ids was loaded, i.e. when the insert
      # statement has been prepared.
      if (@master_ids) {
        my @tuple_status;
        $add_dependent_xref_sth->execute_array(
          { ArrayTupleStatus => \@tuple_status },
          \@master_ids,
          \@dependent_ids,
        );
        $dbi->commit;
      }

      # Upload in batches so the list of pending xrefs stays bounded
      if ($count > 1000) {
        $self->upload_xref_object_graphs(\@xrefs, $dbi);
        $count = 0;
        @xrefs = ();
      }
    }
  }
  $get_xref_sth->finish();

  # Upload whatever is left over from the last, partial batch
  $self->upload_xref_object_graphs(\@xrefs, $dbi) if scalar(@xrefs) > 0;

  return 0; # successful

}
205 
206 1;
XrefParser::BaseParser
Definition: BaseParser.pm:8
run
public run()