ensembl-hive  2.8.1
ZFINParser.pm
Go to the documentation of this file.
1 =head1 LICENSE
2 
3 See the NOTICE file distributed with this work for additional information
4 regarding copyright ownership.
5 
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9 
10  http://www.apache.org/licenses/LICENSE-2.0
11 
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 
18 =cut
19 
20 package XrefParser::ZFINParser;
21 
22 use strict;
23 use warnings;
24 use Carp;
25 use File::Basename; # provides dirname
26 use File::Spec::Functions;
27 use Text::CSV;
28 
29 use parent qw( XrefParser::BaseParser );
30 
# Load ZFIN xrefs from the ZFIN download files.
#
# Arguments (a single hash reference):
#   source_id  - required; only checked for presence here
#   species_id - required; passed on to every xref/synonym that is stored
#   files      - required; array reference, only the directory of the first
#                entry is used to locate the input files
#   verbose    - optional; when true a summary is printed at the end
#   dbi        - optional; defaults to $self->dbi
#
# The following tab-separated files are read from that directory:
#   ensembl_1_to_1.txt - ZFIN to Ensembl gene mappings, stored as direct xrefs
#   uniprot.txt        - ZFIN to UniProt mappings, stored as dependent xrefs
#   refseq.txt         - ZFIN to RefSeq mappings, stored as dependent xrefs
#   aliases.txt        - synonyms for the ZFIN accessions
#
# Returns 0. Croaks when a required argument is missing, when an input file
# cannot be opened, or when an input file stops parsing before end of file.
sub run {
  my ($self, $ref_arg) = @_;
  my $source_id  = $ref_arg->{source_id};
  my $species_id = $ref_arg->{species_id};
  my $files      = $ref_arg->{files};
  my $verbose    = $ref_arg->{verbose};
  my $dbi        = $ref_arg->{dbi};
  $dbi = $self->dbi unless defined $dbi;

  if ( !defined $source_id or !defined $species_id or !defined $files ) {
    croak "Need to pass source_id, species_id and files as pairs";
  }
  $verbose //= 0;

  my $file = $files->[0];
  my $dir  = dirname($file);

  # Get the ZFIN source ids
  my $direct_src_id      = $self->get_source_id_for_source_name('ZFIN_ID', 'direct', $dbi);
  my $dependent_src_id   = $self->get_source_id_for_source_name('ZFIN_ID', 'uniprot/refseq', $dbi);
  my $description_src_id = $self->get_source_id_for_source_name('ZFIN_ID', 'description_only', $dbi);

  # Get the ZFIN descriptions that are already stored under the
  # description-only source, keyed by accession.
  my %description;

  my $desc_sth = $dbi->prepare("select accession, description from xref where source_id=?");
  $desc_sth->execute($description_src_id);
  my ($desc_acc, $desc);
  $desc_sth->bind_columns(\$desc_acc, \$desc);
  while ( $desc_sth->fetch ) {
    $description{$desc_acc} = $desc if defined $desc;
  }
  $desc_sth->finish;

  # Get the Uniprot and RefSeq accessions. The values are dereferenced as
  # lists of xref ids when the dependent xrefs are created.
  my %swiss  = %{ $self->get_valid_codes("uniprot/swissprot", $species_id, $dbi) };
  my %refseq = %{ $self->get_valid_codes("refseq", $species_id, $dbi) };

  # Process ZFIN to ensEMBL mappings
  my %zfin;
  my ($zfin_csv, $zfin_io, $zfin_path) =
    $self->_open_zfin_tsv($dir, 'ensembl_1_to_1.txt', [qw(zfin so label ensembl_id)]);

  while ( my $zfin_line = $zfin_csv->getline_hr($zfin_io) ) {
    my ($zfin_acc, $label, $ensembl_id) = @{$zfin_line}{qw(zfin label ensembl_id)};

    $self->add_to_direct_xrefs({
      stable_id  => $ensembl_id,
      type       => 'gene',
      acc        => $zfin_acc,
      label      => $label,
      desc       => $description{$zfin_acc},
      dbi        => $dbi,
      source_id  => $direct_src_id,
      species_id => $species_id
    });

    # Remember the accession so that no dependent xref is added for it below.
    $zfin{$zfin_acc} = 1;
  }
  # getline_hr() returns undef for parse errors as well as for end of file.
  $zfin_csv->eof or croak "Error parsing file $zfin_path: " . $zfin_csv->error_diag();
  $zfin_io->close();

  my %dependent_args = (
    dir         => $dir,
    direct      => \%zfin,
    description => \%description,
    source_id   => $dependent_src_id,
    species_id  => $species_id,
    dbi         => $dbi,
  );

  # Process ZFIN to Uniprot mappings
  #swissprot file format (in uniprot.txt)
  #ZDB-GENE-000112-47 SO:0000704 ppardb Q90Z66
  #ZDB-GENE-000125-12 SO:0000704 igfbp2a Q9PTH3
  #ZDB-GENE-000125-4 SO:0000704 dlc B3DFM3
  my ($spcount, $sp_ignored) = $self->_load_zfin_dependent_xrefs({
    %dependent_args,
    file       => 'uniprot.txt',
    master_for => \%swiss,
  });

  # Process ZFIN to RefSeq mappings
  #ZDB-GENE-000125-12 SO:0000704 igfbp2a NP_571533
  #ZDB-GENE-000125-4 SO:0000704 dlc NM_130944
  #ZDB-GENE-000125-4 SO:0000704 dlc NP_571019
  #ZDB-GENE-000128-11 SO:0000704 dbx1b NM_131178
  my ($rscount, $rs_ignored) = $self->_load_zfin_dependent_xrefs({
    %dependent_args,
    file       => 'refseq.txt',
    master_for => \%refseq,
    # Ignore mappings to predicted RefSeq (XP_, XM_ and XR_ accessions)
    skip_re    => qr/^X[PMR]_/,
  });

  my $mismatch = $sp_ignored + $rs_ignored;

  # Get the added ZFINs again (with deps)
  %zfin = %{ $self->get_valid_codes("zfin", $species_id, $dbi) };

  # Collect every source id whose name matches ZFIN_ID; synonyms are added
  # for all of them.
  my $src_sth = $dbi->prepare('SELECT source_id from source where name like "ZFIN_ID"');
  $src_sth->execute;
  my $zfin_source_id;
  $src_sth->bind_columns(\$zfin_source_id);
  my @sources;
  while ( $src_sth->fetch() ) {
    push @sources, $zfin_source_id;
  }
  $src_sth->finish;

  # Process the synonyms
  #ZDB-ALT-000717-2 zc1Tg zc1Tg zc1 SO:0001218
  #ZDB-ALT-000717-4 zc3Tg zc3Tg Tg(NBT:MAPT-GFP) SO:0001218
  my ($aliases_csv, $aliases_io, $aliases_path) =
    $self->_open_zfin_tsv($dir, 'aliases.txt', [qw(acc cur_name cur_symbol syn so)]);

  my $syncount = 0;

  while ( my $aliases_line = $aliases_csv->getline_hr($aliases_io) ) {
    my ($acc, $syn) = @{$aliases_line}{qw(acc syn)};
    if ( defined $acc && defined $zfin{$acc} ) {
      $self->add_to_syn_for_mult_sources($acc, \@sources, $syn, $species_id, $dbi);
      $syncount++;
    }
  }
  $aliases_csv->eof or croak "Error parsing file $aliases_path: " . $aliases_csv->error_diag();
  $aliases_io->close();

  if ($verbose) {
    print "\t$spcount xrefs from UniProt and\n";
    print "\t$rscount xrefs from RefSeq succesfully loaded\n";
    print "\t$syncount synonyms loaded\n";
    print "\t$mismatch xrefs ignored\n";
  }
  return 0;
}

# Open one tab-separated ZFIN input file and prepare a Text::CSV reader
# for it.
#
# Arguments: directory, file name inside that directory, and an array
# reference with the column names to use for getline_hr().
# Returns the list (Text::CSV object, file handle, full path).
# Croaks when the file cannot be opened or the reader cannot be created.
sub _open_zfin_tsv {
  my ($self, $dir, $basename, $columns) = @_;

  my $path = catfile($dir, $basename);
  my $io   = $self->get_filehandle($path);
  if ( !defined $io ) {
    croak "ERROR: Could not open " . $path . "\n";
  }

  # The separator must be a real TAB: keep it in double quotes, a
  # single-quoted '\t' is a backslash followed by the letter t.
  my $csv = Text::CSV->new({
    sep_char       => "\t",
    empty_is_undef => 1,
    strict         => 1,
  }) or croak "Could not use ZFIN file $path: " . Text::CSV->error_diag();

  $csv->column_names($columns);

  return ($csv, $io, $path);
}

# Read one ZFIN mapping file (columns: zfin, so, label, acc) and add a
# dependent ZFIN xref for every master xref id listed for the mapped
# accession.
#
# Arguments (a single hash reference):
#   dir, file    - location of the input file
#   master_for   - hash reference: accession => array reference of xref ids
#   direct       - hash reference of ZFIN accessions that must be left alone
#   description  - hash reference: ZFIN accession => description
#   source_id, species_id, dbi - passed on to add_dependent_xref()
#   skip_re      - optional regex; rows whose accession matches are skipped
#                  and not counted at all
#
# Returns the list (number of dependent xrefs added, number of rows ignored).
# Croaks when the file cannot be opened or stops parsing before end of file.
sub _load_zfin_dependent_xrefs {
  my ($self, $arg) = @_;

  my ($csv, $io, $path) =
    $self->_open_zfin_tsv($arg->{dir}, $arg->{file}, [qw(zfin so label acc)]);

  my $master_for = $arg->{master_for};
  my $direct     = $arg->{direct};
  my $skip_re    = $arg->{skip_re};

  my $loaded  = 0;
  my $ignored = 0;

  ROW:
  while ( my $row = $csv->getline_hr($io) ) {
    my ($zfin_acc, $label, $acc) = @{$row}{qw(zfin label acc)};

    next ROW if defined $skip_re && defined $acc && $acc =~ $skip_re;

    # empty_is_undef can leave the accession undefined; such a row can
    # never match a master xref and is counted as ignored.
    my $master_ids = defined $acc ? $master_for->{$acc} : undef;

    if ( defined $master_ids && !defined $direct->{$zfin_acc} ) {
      foreach my $xref_id ( @{$master_ids} ) {
        $self->add_dependent_xref({
          master_xref_id => $xref_id,
          acc            => $zfin_acc,
          label          => $label,
          desc           => $arg->{description}{$zfin_acc},
          source_id      => $arg->{source_id},
          dbi            => $arg->{dbi},
          species_id     => $arg->{species_id}
        });
        $loaded++;
      }
    }
    else {
      $ignored++;
    }
  }
  # getline_hr() returns undef for parse errors as well as for end of file.
  $csv->eof or croak "Error parsing file $path: " . $csv->error_diag();
  $io->close();

  return ($loaded, $ignored);
}
245 
246 1;
XrefParser::BaseParser
Definition: BaseParser.pm:8
run
public run()