ensembl-hive  2.8.1
RGDParser.pm
Go to the documentation of this file.
1 =head1 LICENSE
2 
3 See the NOTICE file distributed with this work for additional information
4 regarding copyright ownership.
5 
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9 
10  http://www.apache.org/licenses/LICENSE-2.0
11 
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 
18 =head1 DESCRIPTION
19 
Designed to parse the Rat Genome Database download file, historically hosted at
ftp://ftp.rgd.mcw.edu/pub/data_release/GENES_RAT.txt. The file comprises 40+ columns
in a tab-separated format.
23 
24 It contains RGD IDs (which are numeric), and associates them either with Ensembl genes or
25 RefSeq records (mainly transcripts).
26 
27 =cut
28 
29 package XrefParser::RGDParser;
30 
31 use strict;
32 use warnings;
33 
34 use Carp;
35 use Text::CSV;
36 
37 use parent qw( XrefParser::BaseParser );
38 
39 
=head2 run

Arg [1]     : Hashref of named arguments:
                source_id  - source ID used for dependent and unlinked RGD xrefs (required)
                species_id - species ID the xrefs belong to (required)
                files      - arrayref of input files; only the first is read (required)
                verbose    - when true, print a summary of what was loaded (default 0)
                dbi        - database handle (defaults to $self->dbi)
Description : Triggers the parsing of the RGD file specified in the files parameter.
              It uses Text::CSV to consume the source file. For every usable record:
                * a dependent xref is added for each preloaded RefSeq xref matching
                  the best-ranked GENBANK_NUCLEOTIDE accession,
                * a direct gene xref is added for each ENSEMBL_ID,
                * if neither applies, a plain xref with info_type 'MISC' is added.
              Records with a blank GENE_RGD_ID, or whose SYMBOL contains 'ENSRNO',
              are skipped.
Returntype  : Int - 0 once the file has been fully parsed
Exceptions  : confesses when a required argument is missing, the file cannot be
              opened, no column header can be read or parsed, or Text::CSV stops
              before reaching the end of the file

=cut

sub run {
    my ( $self, $ref_arg ) = @_;
    my $source_id  = $ref_arg->{source_id};
    my $species_id = $ref_arg->{species_id};
    my $files      = $ref_arg->{files};
    my $verbose    = $ref_arg->{verbose} // 0;
    my $dbi        = $ref_arg->{dbi} // $self->dbi;

    if (    ( !defined $source_id )
         or ( !defined $species_id )
         or ( !defined $files ) )
    {
        confess 'Need to pass source_id, species_id and files as pairs';
    }

    # Direct (Ensembl-linked) xrefs are stored under a separate RGD source.
    # NOTE(review): nothing checks that a row came back; $direct_source_id
    # is presumably left undef if that source is not registered - confirm.
    my $source_sql = "select source_id from source where name = 'RGD' and priority_description = 'direct_xref'";
    my $sth = $dbi->prepare($source_sql);
    $sth->execute();
    my ($direct_source_id);
    $sth->bind_columns(\$direct_source_id);
    $sth->fetch();
    $sth->finish();

    my $file = $files->[0];

    # Used to assign dbIDs for when RGD Xrefs are dependent on RefSeq xrefs
    my (%preloaded_refseq) =
      %{ $self->get_valid_codes( 'refseq', $species_id, $dbi ) };

    my $rgd_io = $self->get_filehandle($file);

    if ( !defined $rgd_io ) {
        confess "Could not open $file when trying to parse RGD";
    }

    # WARNING - Text::CSV does not like the GENES_RAT.txt file. It is
    # improperly formatted and contains a non-ASCII character.
    # Make sure binary is turned on or it silently fails and you get 1/3rd
    # of the records.
    # strict is left off to prevent failure on a blank line at the end.
    my $csv = Text::CSV->new({
        sep                => "\t",
        blank_is_undef     => 1,
        auto_diag          => 1,
        binary             => 1,
        allow_loose_quotes => 1,
    }) || confess 'Cannot use CSV: ' . Text::CSV->error_diag();

    # Skip the leading '#' comment lines; the first line that is not a
    # comment holds the column names. Stop cleanly at end-of-file instead
    # of handing undef to substr/parse.
    my $line;
    while ( defined( $line = $rgd_io->getline ) ) {
        last if substr( $line, 0, 1 ) ne q{#};
    }
    if ( !defined $line ) {
        confess "No column header line found in $file when trying to parse RGD";
    }
    if ( !$csv->parse($line) ) {
        confess 'Failed to parse RGD column header: ' . $csv->error_diag();
    }
    my @column_names = $csv->fields();
    # Columns we want
    # GENE_RGD_ID => 0,
    # SYMBOL => 1,
    # NAME => 2,
    # GENBANK_NUCLEOTIDE => 23,
    # OLD_SYMBOL => 29,
    # ENSEMBL_ID => 37

    my $count         = 0;
    my $ensembl_count = 0;
    my $mismatch      = 0;
    my $syn_count     = 0;

    # Each getline() call refreshes the values of %{$cols} in place, keyed
    # by the column names read from the header.
    my $cols = {};    # Digested columns from CSV
    $csv->bind_columns( \@{$cols}{@column_names} );

    RECORD:
    while ( $csv->getline($rgd_io) ) {
        my $rgd_id = $cols->{GENE_RGD_ID};
        my $symbol = $cols->{SYMBOL};

        # blank_is_undef turns empty fields into undef, so test definedness
        # before comparing to avoid "uninitialized value" warnings.
        next RECORD
          if exists $cols->{GENE_RGD_ID}
          && ( !defined $rgd_id || $rgd_id eq q{} );

        # Some RGD annotation is directly copied from Ensembl
        next RECORD if defined $symbol && $symbol =~ /ENSRNO/;

        my @nucs;
        if ( defined $cols->{GENBANK_NUCLEOTIDE} ) {
            @nucs = split qr{ ; }msx, $cols->{GENBANK_NUCLEOTIDE};
        }
        my $done = 0;

        # @nucs are sorted in the file in alphabetical order. Filter them down
        # to a higher quality subset, then add dependent Xrefs where possible.
        # Only the first accession that yields at least one xref is used.
        NUC:
        foreach my $nuc ( $self->sort_refseq_accessions(@nucs) ) {
            next NUC if !exists $preloaded_refseq{$nuc};

            foreach my $xref ( @{ $preloaded_refseq{$nuc} } ) {
                my $xref_id = $self->add_dependent_xref({
                    master_xref_id => $xref,
                    acc            => $rgd_id,
                    label          => $symbol,
                    desc           => $cols->{NAME},
                    source_id      => $source_id,
                    dbi            => $dbi,
                    species_id     => $species_id,
                });
                $count++;
                $syn_count +=
                  $self->process_synonyms( $xref_id, $cols->{OLD_SYMBOL},
                                           $dbi );
                $done = 1;
            }
            last NUC if $done;
        }

        if ( defined $cols->{ENSEMBL_ID} ) {
            my @ensembl_ids = split qr{ ; }msx, $cols->{ENSEMBL_ID};

            foreach my $id (@ensembl_ids) {
                $ensembl_count++;
                $self->add_to_direct_xrefs({
                    stable_id  => $id,
                    type       => 'gene',
                    acc        => $rgd_id,
                    label      => $symbol,
                    desc       => $cols->{NAME},
                    dbi        => $dbi,
                    source_id  => $direct_source_id,
                    species_id => $species_id,
                });
                my $xref_id =
                  $self->get_xref( $rgd_id, $direct_source_id,
                                   $species_id, $dbi );
                $syn_count +=
                  $self->process_synonyms( $xref_id, $cols->{OLD_SYMBOL},
                                           $dbi );
                $done = 1;
            }
        }

        # Neither a RefSeq dependency nor an Ensembl link was found: keep the
        # RGD record as an unlinked xref.
        if ( !$done ) {
            $self->add_xref({
                acc        => $rgd_id,
                label      => $symbol,
                desc       => $cols->{NAME},
                source_id  => $source_id,
                species_id => $species_id,
                dbi        => $dbi,
                info_type  => 'MISC',
            });
            $mismatch++;
        }

    } ## end while ( $csv->getline($rgd_io) )

    # getline() also returns false on a parse error; only a real end-of-file
    # counts as success.
    if ( !$csv->eof ) {
        confess 'Failed to finish parsing RGD file: ' . $csv->error_diag();
    }
    $rgd_io->close();

    if ($verbose) {
        print "$count xrefs succesfully loaded and dependent on refseq\n" .
              "$mismatch xrefs added but with NO dependencies\n" .
              "$ensembl_count direct xrefs successfully loaded\n" .
              "Tried to add $syn_count synonyms, including duplicates\n";
    }
    return 0;
} ## end sub run
205 
# Rank of each RefSeq accession prefix we are prepared to use; a lower number
# means a more valued accession type. Prefixes absent from this table are
# discarded by sort_refseq_accessions().
my %refseq_priorities = (
    NM => 1, NP => 1, NR => 1,
    XM => 2, XP => 2, XR => 2,
);


=head2 sort_refseq_accessions

Arg [1..n]  : Original list of accessions
Description : Discard every accession whose two-character prefix is not one of
              the ranked RefSeq prefixes (the column also holds EMBL accessions
              and other things we cannot make Xrefs to), then order what is
              left by prefix rank, breaking ties by string comparison of the
              accessions themselves.
Returntype  : List of sorted and filtered accessions

=cut

sub sort_refseq_accessions {
    my ( $self, @accessions ) = @_;

    # Pair each accepted accession with its rank so the sort does not have
    # to recompute the prefix lookup.
    my @ranked;
    foreach my $accession (@accessions) {
        my $prefix = substr $accession, 0, 2;
        next if !exists $refseq_priorities{$prefix};
        push @ranked, [ $refseq_priorities{$prefix}, $accession ];
    }

    my @ordered =
      map  { $_->[1] }
      sort { $a->[0] <=> $b->[0] || $a->[1] cmp $b->[1] } @ranked;

    return @ordered;
}
231 
232 
=head2 process_synonyms

Arg [1]     : Xref dbID to attach synonyms to
Arg [2]     : Synonym string as read from file
Arg [3]     : Database handle, passed through unchanged to add_synonym
Description : Process the synonym column into potentially many items and add
              them to the synonym table. Synonyms are ';' separated. Empty
              items (e.g. from a doubled ';') are ignored. Nothing is added
              when either the xref dbID or the synonym string is undefined.
Returntype  : Int - the count of synonyms added

=cut

sub process_synonyms {
    my ( $self, $xref_id, $synonym_string, $dbi ) = @_;

    return 0 if !defined $synonym_string || !defined $xref_id;

    # split only discards trailing empty fields; filter the rest so that
    # input like 'a;;b' does not create an empty-string synonym.
    my @syns = grep { length $_ } split qr{ ; }msx, $synonym_string;

    my $syn_count = 0;
    foreach my $syn (@syns) {
        $self->add_synonym( $xref_id, $syn, $dbi );
        $syn_count++;
    }
    return $syn_count;
}
255 
256 
257 1;
XrefParser::BaseParser
Definition: BaseParser.pm:8
run
public run()
XrefParser::Database
Definition: Database.pm:8