3 See the NOTICE file distributed with
this work
for additional information
4 regarding copyright ownership.
6 Licensed under the Apache License, Version 2.0 (the
"License");
7 you may not use
this file except in compliance with the License.
8 You may obtain a copy of the License at
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an
"AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License
for the specific language governing permissions and
16 limitations under the License.
20 # Parse RefSeq data from the central database to create species-specific xrefs.
22 package XrefParser::RefSeqDatabaseParser;
33 my ($self, $ref_arg) = @_;
34 my $source_id = $ref_arg->{source_id};
35 my $species_id = $ref_arg->{species_id};
36 my $dbi = $ref_arg->{dbi};
37 my $xref_source = $ref_arg->{xref_source};
39 if((!defined $source_id) or (!defined $species_id) or (!defined $xref_source)){
40 croak
"Need to pass source_id, species_id and xref_source";
43 my $source_name = $self->get_source_name_for_source_id($source_id, $dbi);
46 if ($source_name =~ /RefSeq_dna/) {
47 my $mrna_source_id = $self->get_source_id_for_source_name(
'RefSeq_mRNA',
'refseq', $dbi);
48 push @source_ids, $mrna_source_id;
49 my $pred_mrna_source_id = $self->get_source_id_for_source_name(
'RefSeq_mRNA_predicted',
'refseq', $dbi);
50 push @source_ids, $pred_mrna_source_id;
51 my $ncrna_source_id = $self->get_source_id_for_source_name(
'RefSeq_ncRNA', undef, $dbi);
52 push @source_ids, $ncrna_source_id;
53 my $pred_ncrna_source_id = $self->get_source_id_for_source_name(
'RefSeq_ncRNA_predicted', undef, $dbi);
54 push @source_ids, $pred_ncrna_source_id;
55 } elsif ($source_name =~ /RefSeq_peptide/) {
56 my $peptide_source_id = $self->get_source_id_for_source_name(
'RefSeq_peptide', undef, $dbi);
57 push @source_ids, $peptide_source_id;
58 my $pred_peptide_source_id = $self->get_source_id_for_source_name(
'RefSeq_peptide_predicted', undef, $dbi);
59 push @source_ids, $pred_peptide_source_id;
62 my $entrez_source_id = $self->get_source_id_for_source_name(
'EntrezGene', undef, $dbi);
63 my $wiki_source_id = $self->get_source_id_for_source_name(
'WikiGene', undef, $dbi);
65 # Retrieve existing EntrezGene (NCBIGene) xrefs as an accession => label map
66 my (%entrez) = %{$self->get_acc_to_label(
"EntrezGene",$species_id, undef, $dbi)};
68 # Get existing RefSeq_mRNA, EntrezGene and WikiGene accession => xref_id lists (only needed for RefSeq_peptide)
69 my (%refseq_ids, %entrez_ids, %wiki_ids, $add_dependent_xref_sth);
70 if ($source_name =~ /RefSeq_peptide/) {
71 (%refseq_ids) = %{ $self->get_valid_codes(
"RefSeq_mRNA", $species_id, $dbi) };
72 (%entrez_ids) = %{ $self->get_valid_codes(
"EntrezGene", $species_id, $dbi) };
73 (%wiki_ids) = %{ $self->get_valid_codes(
"WikiGene", $species_id, $dbi) };
74 $add_dependent_xref_sth = $dbi->prepare(
"INSERT INTO dependent_xref (master_xref_id, dependent_xref_id, linkage_source_id) VALUES (?,?, $entrez_source_id)");
77 my $get_xref_sql =
"SELECT xref_id, accession, version, label, description, info_type ".
78 "FROM xref WHERE species_id = ? AND source_id = ?";
79 my $get_xref_sth = $xref_source->prepare($get_xref_sql);
80 my $get_dependent_sql =
"SELECT x.xref_id, x.accession, x.version, x.label, x.description, x.source_id, x.species_id, dx.linkage_source_id FROM xref x, dependent_xref dx ".
81 "WHERE dx.dependent_xref_id = x.xref_id and dx.master_xref_id = ?";
82 my $get_dependent_sth = $xref_source->prepare($get_dependent_sql);
83 my $get_sequence_sql =
"SELECT sequence, sequence_type, status FROM primary_xref WHERE xref_id = ?";
84 my $get_sequence_sth = $xref_source->prepare($get_sequence_sql);
85 my $get_synonym_sql =
"SELECT synonym FROM synonym WHERE xref_id = ?";
86 my $get_synonym_sth = $xref_source->prepare($get_synonym_sql);
87 my $get_pair_sql =
"SELECT accession2 FROM pairs where accession1 = ?";
88 my $get_pair_sth = $xref_source->prepare($get_pair_sql);
89 my ($xref_id, $accession, $version, $label, $description, $info_type, $parsed_seq, $type, $status, $dep_xref_id, $dep_accession, $dep_version, $dep_label, $dep_description, $dep_source_id, $dep_species_id, $linkage_source_id, $synonym, $refseq_pair);
95 foreach my $xref_source_id (@source_ids) {
96 $get_xref_sth->execute($species_id, $xref_source_id);
97 $get_xref_sth->bind_columns(\$xref_id, \$accession, \$version, \$label, \$description, \$info_type);
98 while ($get_xref_sth->fetch()) {
102 $xref->{ACCESSION} = $accession;
103 $xref->{LABEL} = $label;
104 $xref->{VERSION} = $version;
105 $xref->{SPECIES_ID} = $species_id;
106 $xref->{INFO_TYPE} = $info_type;
107 $xref->{SOURCE_ID} = $xref_source_id;
108 $xref->{DESCRIPTION} = $description;
110 # Add the sequence, if one is stored for this xref
111 $get_sequence_sth->execute($xref_id);
112 $get_sequence_sth->bind_columns(\$parsed_seq, \$type, \$status);
113 while ($get_sequence_sth->fetch) {
114 $xref->{SEQUENCE_TYPE} = $type;
115 $xref->{STATUS} = $status;
116 $xref->{SEQUENCE} = $parsed_seq;
119 # Add pair information, if any exists for this accession
120 $get_pair_sth->execute($accession);
121 $get_pair_sth->bind_columns(\$refseq_pair);
122 while ($get_pair_sth->fetch) {
123 $xref->{PAIR} = $refseq_pair;
127 $get_synonym_sth->execute($xref_id);
128 $get_synonym_sth->bind_columns(\$synonym);
129 while ($get_synonym_sth->fetch) {
130 push (@{$xref->{SYNONYMS} }, $synonym);
133 # Add any dependent xrefs
134 $get_dependent_sth->execute($xref_id);
135 $get_dependent_sth->bind_columns(\$dep_xref_id, \$dep_accession, \$dep_version, \$dep_label, \$dep_description, \$dep_source_id, \$dep_species_id, \$linkage_source_id);
136 while ($get_dependent_sth->fetch) {
137 if ($dep_species_id != $species_id) { next; }
138 if (defined $entrez{$dep_accession}) {
139 push(@{$xref->{DEPENDENT_XREFS}}, {
140 ACCESSION => $dep_accession,
141 LABEL => $entrez{$dep_accession},
142 VERSION => $dep_version,
143 DESCRIPTION => $dep_description,
144 SOURCE_ID => $entrez_source_id,
145 LINKAGE_SOURCE_ID => $linkage_source_id
148 push(@{$xref->{DEPENDENT_XREFS}}, {
149 ACCESSION => $dep_accession,
150 LABEL => $entrez{$dep_accession},
151 VERSION => $dep_version,
152 DESCRIPTION => $dep_description,
153 SOURCE_ID => $wiki_source_id,
154 LINKAGE_SOURCE_ID => $linkage_source_id
157 # Add dependent xrefs for RefSeq mRNA as well where available
158 # only after they are added in priority 1
159 $refseq_pair =~ s/\.[0-9]*
160 if (defined $refseq_pair) {
161 if ($refseq_ids{$refseq_pair}) {
162 foreach my $refseq_id (@{ $refseq_ids{$refseq_pair} }) {
163 foreach my $entrez_id (@{ $entrez_ids{$dep_accession} }) {
164 push(@dep_refseq_ids, $refseq_id);
165 push(@dep_entrez_ids, $entrez_id);
167 foreach my $wiki_id (@{ $wiki_ids{$dep_accession} }) {
168 push(@dep_refseq_ids, $refseq_id);
169 push(@dep_entrez_ids, $wiki_id);
179 if (scalar(@dep_refseq_ids) > 0) {
181 my $tuples = $add_dependent_xref_sth->execute_array(
182 { ArrayTupleStatus => \@tuple_status },
187 undef @dep_refseq_ids;
188 undef @dep_entrez_ids;
192 $self->upload_xref_object_graphs( \@xrefs, $dbi );
198 $get_xref_sth->finish();
200 $self->upload_xref_object_graphs(\@xrefs, $dbi)
if scalar(@xrefs) > 0;
202 return 0; # successful