3 See the NOTICE file distributed with this work for additional information
4 regarding copyright ownership.
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
20 Designed to parse the Rat Genome Database download file, historically hosted at
24 It contains RGD IDs (which are numeric), and associates them either with Ensembl genes or
25 RefSeq records (mainly transcripts).
29 package XrefParser::RGDParser;
42 Description: Triggers the parsing of the RGD file specified in the files parameter.
43 It uses Text::CSV to consume the source file.
49 my ( $self, $ref_arg ) = @_;
50 my $source_id = $ref_arg->{source_id};
51 my $species_id = $ref_arg->{species_id};
52 my $files = $ref_arg->{files};
53 my $verbose = $ref_arg->{verbose}
54 my $dbi = $ref_arg->{dbi}
56 if ( ( !defined $source_id ) or
57 ( !defined $species_id ) or
60 confess
'Need to pass source_id, species_id and files as pairs';
63 my $source_sql =
"select source_id from source where name = 'RGD' and priority_description = 'direct_xref'";
64 my $sth = $dbi->prepare($source_sql);
66 my ($direct_source_id);
67 $sth->bind_columns(\$direct_source_id);
71 my $file = @{$files}[0];
73 # Used to assign dbIDs for when RGD Xrefs are dependent on RefSeq xrefs
74 my (%preloaded_refseq) =
75 %{ $self->get_valid_codes(
'refseq', $species_id, $dbi ) };
77 my $rgd_io = $self->get_filehandle($file);
79 if ( !defined $rgd_io ) {
80 confess
"Could not open $file when trying to parse RGD";
82 my $csv = Text::CSV->new({
87 allow_loose_quotes => 1,
88 }) || confess
'Cannot use CSV: ' . Text::CSV->error_diag();
89 # WARNING - Text::CSV does not like the GENES-RAT.txt file. It is improperly formatted and contains a non-ASCII character
90 # Make sure binary is turned on or it silently fails and you get 1/3rd of the records.
91 # strict is turned off to prevent failure on a blank line at the end
94 while ( substr( $line, 0, 1 ) eq q{#} ) {
95 $line = $rgd_io->getline;
98 my @column_names = $csv->fields();
103 # GENBANK_NUCLEOTIDE => 23,
108 my $ensembl_count = 0;
112 my $cols = {}; # Digested columns from CSV
113 $csv->bind_columns( \@{$cols}{@column_names} );
115 while ( $csv->getline($rgd_io) ) {
117 if exists $cols->{GENE_RGD_ID} &&
118 ( $cols->{GENE_RGD_ID} eq q{} || !defined $cols->{GENE_RGD_ID} );
120 # Some RGD annotation is directly copied from Ensembl
121 if ($cols->{SYMBOL} =~ /ENSRNO/) { next; }
124 if ( defined $cols->{GENBANK_NUCLEOTIDE} ) {
125 @nucs = split qr{ ; }msx, $cols->{GENBANK_NUCLEOTIDE};
128 # @nucs are sorted in the file in alphabetical order. Filter them down
129 # to a higher quality subset, then add dependent Xrefs where possible
130 foreach my $nuc ( $self->sort_refseq_accessions(@nucs) ) {
132 if ( !$done && exists $preloaded_refseq{$nuc} ) {
134 foreach my $xref ( @{ $preloaded_refseq{$nuc} } ) {
136 $self->add_dependent_xref({
137 master_xref_id => $xref,
138 acc => $cols->{GENE_RGD_ID},
139 label => $cols->{SYMBOL},
140 desc => $cols->{NAME},
141 source_id => $source_id,
143 species_id => $species_id,
147 $self->process_synonyms( $xref_id, $cols->{OLD_SYMBOL},
154 if ( defined $cols->{ENSEMBL_ID} ) {
155 my @ensembl_ids = split qr{ ; }msx, $cols->{ENSEMBL_ID};
157 foreach my $id (@ensembl_ids) {
159 $self->add_to_direct_xrefs({
162 acc => $cols->{GENE_RGD_ID},
163 label => $cols->{SYMBOL},
164 desc => $cols->{NAME},
166 source_id => $direct_source_id,
167 species_id => $species_id,
170 $self->get_xref( $cols->{GENE_RGD_ID}, $direct_source_id,
173 $self->process_synonyms( $xref_id, $cols->{OLD_SYMBOL},
180 acc => $cols->{GENE_RGD_ID},
181 label => $cols->{SYMBOL},
182 desc => $cols->{NAME},
183 source_id => $source_id,
184 species_id => $species_id,
191 } ## end while ( $csv->getline($rgd_io) )
193 confess
'Failed to finish parsing RGD file: ' . $csv->error_diag();
198 print
"$count xrefs succesfully loaded and dependent on refseq\n" .
199 "$mismatch xrefs added but with NO dependencies\n" .
200 "$ensembl_count direct xrefs successfully loaded\n" .
201 "Tried to add $syn_count synonyms, including duplicates\n";
206 # Predefined importance levels for the most valued RefSeq accession types
207 my %refseq_priorities =
208 ( NM => 1, NP => 1, NR => 1, XM => 2, XP => 2, XR => 2, );
211 =head2 sort_refseq_accessions
213 Arg [1..n] : Original list of accessions
214 Description : Filter out any accessions which are not in the "normal" set of
215 genomic features. The column in question contains EMBL accessions
216 as well as other things, and we don't have the ability to make
218 Returntype : List of sorted and filtered accessions
222 sub sort_refseq_accessions {
223 my ( $self, @accessions ) = @_;
225 $refseq_priorities{ substr $a, 0, 2 }
226 <=> $refseq_priorities{ substr $b, 0, 2 } ||
228 } grep { exists $refseq_priorities{ substr $_, 0, 2 } } @accessions;
233 =head2 process_synonyms
234 Arg [1] : Xref dbID to attach synonyms to
235 Arg [2] : Synonym string as read from file
236 Description : Process the synonym column into potentially many items and add
237 them to the synonym table. Synonyms are separated by ';'.
238 Returntype : Int - the count of synonyms added
241 sub process_synonyms {
242 my ( $self, $xref_id, $synonym_string, $dbi ) = @_;
244 if ( ( !defined $synonym_string ) || ( !defined $xref_id ) ) {
248 my @syns = split qr{ ; }msx, $synonym_string;
249 foreach my $syn (@syns) {
250 $self->add_synonym( $xref_id, $syn, $dbi );