See the NOTICE file distributed with this work for additional information
regarding copyright ownership.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
20 package XrefParser::ZFINParser;
25 use File::Basename; # provides dirname
26 use File::Spec::Functions;
# Unpack the invocant and the hash reference carrying the named arguments.
# NOTE(review): the enclosing sub declaration is not visible in this excerpt.
32 my ($self, $ref_arg) = @_;
33 my $source_id = $ref_arg->{source_id};
34 my $species_id = $ref_arg->{species_id};
35 my $files = $ref_arg->{files};
36 my $verbose = $ref_arg->{verbose};
37 my $dbi = $ref_arg->{dbi};
# Fall back to the object's own database handle when the caller passed none.
38 $dbi = $self->dbi unless defined $dbi;
# source_id, species_id and files are mandatory; abort when any is undefined.
40 if((!defined $source_id) or (!defined $species_id) or (!defined $files) ){
41 croak
"Need to pass source_id, species_id and files as pairs";
# Only the first entry of the file list is read here. Its directory is where
# the individual ZFIN data files (ensembl_1_to_1.txt, uniprot.txt, refseq.txt,
# aliases.txt) are looked up further down.
45 my $file = @{$files}[0];
46 my $dir = dirname($file);
48 # Get the ZFIN source ids
# Three ids are resolved for the source name 'ZFIN_ID':
#   $direct_src_id      - used for the direct (Ensembl) xrefs,
#   $dependent_src_id   - used for xrefs dependent on UniProt/RefSeq entries,
#   $description_src_id - used to select the stored descriptions.
# NOTE(review): the second argument presumably selects the source variant
# (priority description) - confirm against get_source_id_for_source_name.
49 my $direct_src_id = $self->get_source_id_for_source_name(
'ZFIN_ID',
'direct', $dbi);
50 my $dependent_src_id = $self->get_source_id_for_source_name(
'ZFIN_ID',
'uniprot/refseq', $dbi);
51 my $description_src_id = $self->get_source_id_for_source_name(
'ZFIN_ID',
'description_only', $dbi);
53 # Get the ZFIN descriptions
# Select every accession/description pair stored under the description-only
# source and cache it in %description, keyed by accession.
# NOTE(review): the declarations of %description, $acc and $desc are not
# visible in this excerpt.
56 my $sth = $dbi->prepare(
"select accession, description from xref where source_id=?");
57 $sth->execute($description_src_id);
59 my $zfin_loaded_count = 0;
# The result columns are bound to $acc and $desc; @row only drives the loop
# condition, the values are read through the bound variables.
60 $sth->bind_columns(\$acc, \$desc);
61 while (my @row = $sth->fetchrow_array()) {
# Rows without a description are not cached.
62 $description{$acc} = $desc
if(defined($desc));
67 # Get the Uniprot and RefSeq accessions
# get_valid_codes returns a hash reference that is copied into a plain hash.
# Further down each value is dereferenced as an array of master xref ids,
# looked up by accession.
68 my (%swiss) = %{$self->get_valid_codes(
"uniprot/swissprot",$species_id, $dbi)};
69 my (%refseq) = %{$self->get_valid_codes(
"refseq",$species_id, $dbi)};
71 # Process ZFIN to Ensembl mappings
# Open ensembl_1_to_1.txt from the input directory; an undefined handle is
# treated as a fatal error.
73 my $zfin_io = $self->get_filehandle(catfile($dir,
'ensembl_1_to_1.txt'));
74 if (!defined($zfin_io)) {
75 croak
"ERROR: Could not open " . catfile($dir,
'ensembl_1_to_1.txt') .
"\n";
# Build the parser for this file.
# NOTE(review): the Text::CSV constructor options are not visible in this
# excerpt.
78 my $zfin_csv = Text::CSV->new({
82 }) or croak
"Could not use zfin file: " . Text::CSV->error_diag();
# Name the four columns so that each row can be fetched as a hash reference.
84 $zfin_csv->column_names([
'zfin',
'so',
'label',
'ensembl_id']);
# One direct xref is added per row, attached to the row's Ensembl stable id.
86 while (my $zfin_line = $zfin_csv->getline_hr($zfin_io)) {
87 my ($zfin_acc, $so, $label, $ensembl_id) = @{$zfin_line}{qw(zfin so label ensembl_id)};
# The description comes from the cache built earlier and is undef when the
# accession has no stored description.
89 $self->add_to_direct_xrefs({
90 stable_id => $ensembl_id,
94 desc => $description{$zfin_acc},
96 source_id => $direct_src_id,
97 species_id => $species_id
# Remember directly mapped accessions; the UniProt and RefSeq sections skip
# any accession recorded here.
100 $zfin{$zfin_acc} = 1;
109 # Process ZFIN to Uniprot mappings
# Open uniprot.txt from the input directory; an undefined handle is fatal.
110 my $swissprot_io = $self->get_filehandle( catfile( $dir,
'uniprot.txt' ) );
111 if ( !defined $swissprot_io ) {
112 croak
"ERROR: Could not open " . catfile( $dir,
'uniprot.txt' ).
"\n" ;
# NOTE(review): the Text::CSV constructor options are not visible in this
# excerpt. The failure message interpolates $file, which is the first entry
# of the input file list and not necessarily uniprot.txt.
115 my $swissprot_csv = Text::CSV->new({
119 }) or croak
"Could not use swissprot file $file: " . Text::CSV->error_diag();
# Name the four columns so that each row can be fetched as a hash reference.
121 $swissprot_csv->column_names([
'zfin',
'so',
'label',
'acc' ]);
123 #swissprot file format (in uniprot.txt)
124 #ZDB-GENE-000112-47 SO:0000704 ppardb Q90Z66
125 #ZDB-GENE-000125-12 SO:0000704 igfbp2a Q9PTH3
126 #ZDB-GENE-000125-4 SO:0000704 dlc B3DFM3
128 while ( my $swissprot_line = $swissprot_csv->getline_hr( $swissprot_io ) ) {
129 my ($zfin_acc, $so, $label, $acc) = @{$swissprot_line}{qw(zfin so label acc)};
# Only rows whose accession is a known Swiss-Prot code are used, and only
# when the ZFIN accession was not already mapped directly to Ensembl.
131 if(defined($swiss{$acc}) && !defined($zfin{$zfin_acc})){
# One dependent xref is added per master xref id registered for the accession.
132 foreach my $xref_id (@{$swiss{$acc}}){
133 $self->add_dependent_xref({
134 master_xref_id => $xref_id,
137 desc => $description{$zfin_acc},
138 source_id => $dependent_src_id,
140 species_id => $species_id
149 $swissprot_io->close();
151 # Process ZFIN to RefSeq mappings
# Open refseq.txt from the input directory; an undefined handle is fatal.
# NOTE(review): croak receives a comma-separated list here, whereas the
# UniProt section builds its message with '.' concatenation.
152 my $refseq_io = $self->get_filehandle( catfile( $dir,
'refseq.txt' ) );
153 if ( !defined $refseq_io ) {
154 croak
"ERROR: Could not open " . catfile( $dir,
'refseq.txt' ),
"\n" ;
# NOTE(review): the Text::CSV constructor options are not visible in this
# excerpt. The failure message interpolates $file, which is the first entry
# of the input file list and not necessarily refseq.txt.
157 my $refseq_csv = Text::CSV->new({
161 }) or croak
"could not use refseq file $file: " . Text::CSV->error_diag();
# Name the four columns so that each row can be fetched as a hash reference.
163 $refseq_csv->column_names([
'zfin',
'so',
'label',
'acc' ]);
# Sample rows of refseq.txt:
165 #ZDB-GENE-000125-12 SO:0000704 igfbp2a NP_571533
166 #ZDB-GENE-000125-4 SO:0000704 dlc NM_130944
167 #ZDB-GENE-000125-4 SO:0000704 dlc NP_571019
168 #ZDB-GENE-000128-11 SO:0000704 dbx1b NM_131178
170 while ( my $refseq_line = $refseq_csv->getline_hr( $refseq_io ) ) {
171 my ($zfin_acc, $so, $label, $acc) = @{$refseq_line}{qw(zfin so label acc)};
172 # Ignore mappings to predicted RefSeq
173 if ($acc =~ /^XP_/ || $acc =~ /^XM_/ || $acc =~ /^XR_/) { next; }
# Only rows whose accession is a known RefSeq code are used, and only when
# the ZFIN accession was not already mapped directly to Ensembl.
175 if(defined($refseq{$acc}) && !defined($zfin{$zfin_acc})){
# One dependent xref is added per master xref id registered for the accession.
176 foreach my $xref_id (@{$refseq{$acc}}){
177 $self->add_dependent_xref({
178 master_xref_id => $xref_id,
181 desc => $description{$zfin_acc},
182 source_id => $dependent_src_id,
184 species_id => $species_id
195 # Get the added ZFINs again (with deps)
# %zfin is overwritten with the codes registered under "zfin" for this
# species; from here on it no longer holds the direct-mapping markers.
196 (%zfin) = %{$self->get_valid_codes(
"zfin", $species_id, $dbi)};
198 # Process the synonyms
# Open aliases.txt from the input directory; an undefined handle is fatal.
# NOTE(review): croak receives a comma-separated list here, whereas the
# UniProt section builds its message with '.' concatenation.
199 my $aliases_io = $self->get_filehandle( catfile( $dir,
'aliases.txt' ) );
200 if ( !defined $aliases_io ) {
201 croak
"ERROR: Could not open " . catfile( $dir,
'aliases.txt' ),
"\n" ;
# NOTE(review): the Text::CSV constructor options are not visible in this
# excerpt. The failure message interpolates $file, which is the first entry
# of the input file list and not necessarily aliases.txt.
204 my $aliases_csv = Text::CSV->new({
208 }) or croak
"could not use zfin file $file: " . Text::CSV->error_diag();
# Name the five columns; only 'acc' and 'syn' are read in the loop below.
210 $aliases_csv->column_names([
'acc',
'cur_name',
'cur_symbol',
'syn',
'so' ]);
# Sample rows of aliases.txt:
212 #ZDB-ALT-000717-2 zc1Tg zc1Tg zc1 SO:0001218
213 #ZDB-ALT-000717-4 zc3Tg zc3Tg Tg(NBT:MAPT-GFP) SO:0001218
# Look up every source id whose name matches ZFIN_ID.
# NOTE(review): the execute call, the declaration of $s1 and the body of the
# fetch loop are not visible; presumably the ids are collected into $sources,
# which is passed to add_to_syn_for_mult_sources below - confirm.
217 $sth = $dbi->prepare(
'SELECT source_id from source where name like "ZFIN_ID"');
220 $sth->bind_columns(\$s1);
222 while($sth->fetch()){
# A synonym is added only for accessions present in the reloaded %zfin.
227 while ( my $aliases_line = $aliases_csv->getline_hr( $aliases_io ) ) {
228 my ($acc, $syn) = @{$aliases_line}{qw(acc syn)};
229 if(defined($zfin{$acc})){
230 $self->add_to_syn_for_mult_sources($acc, $sources, $syn, $species_id, $dbi);
235 $aliases_io->close();
# Print a summary of what was loaded.
# NOTE(review): the declarations and updates of $spcount, $rscount, $syncount
# and $mismatch are not visible in this excerpt, and the output is not guarded
# by $verbose in the lines shown. "succesfully" is misspelled in the message.
238 print
"\t$spcount xrefs from UniProt and\n";
239 print
"\t$rscount xrefs from RefSeq succesfully loaded\n";
240 print
"\t$syncount synonyms loaded\n";
241 print
"\t$mismatch xrefs ignored\n";