# Parse an RGD gene file and load its records as xrefs.
#
# Arguments (passed as a single hash reference):
#   source_id  - source id used for dependent and unlinked RGD xrefs (required)
#   species_id - species id stored with every xref (required)
#   files      - array reference; only its first entry is parsed (required)
#   verbose    - when true, print a summary of the load
#   dbi        - database handle used for every query
#
# For each record:
#   * if one of its GenBank nucleotide accessions matches a preloaded RefSeq
#     xref, RGD xrefs dependent on that RefSeq xref are added;
#   * a direct xref is added for every Ensembl ID listed;
#   * if neither of the above happened, a plain 'MISC' xref is added.
#
# Returns 0. Confesses on missing arguments, a file that cannot be opened,
# a missing or unparsable header line, or a parse failure before end of file.
my ( $self, $ref_arg ) = @_;
my $source_id  = $ref_arg->{source_id};
my $species_id = $ref_arg->{species_id};
my $files      = $ref_arg->{files};
my $verbose    = $ref_arg->{verbose};
my $dbi        = $ref_arg->{dbi};

if ( !defined $source_id || !defined $species_id || !defined $files ) {
    confess 'Need to pass source_id, species_id and files as pairs';
}

# NOTE(review): presumably the parser object can supply its own handle via a
# dbi() accessor - confirm. The fallback is only taken when that method exists.
if ( !defined $dbi && $self->can('dbi') ) {
    $dbi = $self->dbi;
}
if ( !defined $dbi ) {
    confess 'Need a database handle (dbi) to parse RGD';
}

# Look up the source id used for direct (Ensembl stable ID) xrefs.
# $direct_source_id stays undefined if no row matches.
my $source_sql = "select source_id from source where name = 'RGD' and priority_description = 'direct_xref'";
my $sth = $dbi->prepare($source_sql);
$sth->execute();
my ($direct_source_id);
$sth->bind_columns( \$direct_source_id );
$sth->fetch();
$sth->finish();

my $file = $files->[0];

# Used to assign dbIDs for when RGD Xrefs are dependent on RefSeq xrefs
my (%preloaded_refseq) =
    %{ $self->get_valid_codes( 'refseq', $species_id, $dbi ) };

my $rgd_io = $self->get_filehandle($file);
if ( !defined $rgd_io ) {
    confess "Could not open $file when trying to parse RGD";
}

my $csv = Text::CSV->new({
    sep                => "\t",
    blank_is_undef     => 1,
    auto_diag          => 1,
    binary             => 1,
    allow_loose_quotes => 1,
}) || confess 'Cannot use CSV: ' . Text::CSV->error_diag();

# WARNING - Text::CSV does not like the GENES-RAT.txt file. It is improperly
# formatted and contains a non-ASCII character.
# Make sure binary is turned on or it silently fails and you get 1/3rd of the
# records.
# strict is turned off to prevent failure on a blank line at the end

# Skip the leading '#' comment lines; the first other line is the header.
my $line = $rgd_io->getline;
while ( defined $line && substr( $line, 0, 1 ) eq q{#} ) {
    $line = $rgd_io->getline;
}
if ( !defined $line ) {
    confess "No header line found in $file when trying to parse RGD";
}
$csv->parse($line)
    or confess 'Could not parse RGD header line: ' . $csv->error_diag();
my @column_names = $csv->fields();

# Columns we want
# GENE_RGD_ID => 0,
# SYMBOL => 1,
# NAME => 2,
# GENBANK_NUCLEOTIDE => 23,
# OLD_SYMBOL => 29,
# ENSEMBL_ID => 37
my $count         = 0;
my $ensembl_count = 0;
my $mismatch      = 0;
my $syn_count     = 0;

# Bind one hash slot per header name so that every getline() call refreshes
# %{$cols} in place.
my $cols = {};    # Digested columns from CSV
$csv->bind_columns( \@{$cols}{@column_names} );

while ( $csv->getline($rgd_io) ) {
    # Skip records without an RGD ID. Blank fields arrive as undef
    # (blank_is_undef), so test definedness before comparing.
    next
        if exists $cols->{GENE_RGD_ID} &&
        ( !defined $cols->{GENE_RGD_ID} || $cols->{GENE_RGD_ID} eq q{} );

    # Some RGD annotation is directly copied from Ensembl
    next if defined $cols->{SYMBOL} && $cols->{SYMBOL} =~ /ENSRNO/;

    my @nucs;
    if ( defined $cols->{GENBANK_NUCLEOTIDE} ) {
        @nucs = split qr{ ; }msx, $cols->{GENBANK_NUCLEOTIDE};
    }

    my $done = 0;

    # @nucs are sorted in the file in alphabetical order. Filter them down
    # to a higher quality subset, then add dependent Xrefs where possible.
    # Only the first accession that yields at least one xref is used.
    foreach my $nuc ( $self->sort_refseq_accessions(@nucs) ) {
        next if !exists $preloaded_refseq{$nuc};

        foreach my $xref ( @{ $preloaded_refseq{$nuc} } ) {
            my $xref_id = $self->add_dependent_xref({
                master_xref_id => $xref,
                acc            => $cols->{GENE_RGD_ID},
                label          => $cols->{SYMBOL},
                desc           => $cols->{NAME},
                source_id      => $source_id,
                dbi            => $dbi,
                species_id     => $species_id,
            });
            $count++;
            $syn_count +=
                $self->process_synonyms( $xref_id, $cols->{OLD_SYMBOL},
                                         $dbi );
            $done = 1;
        }

        last if $done;
    }

    if ( defined $cols->{ENSEMBL_ID} ) {
        my @ensembl_ids = split qr{ ; }msx, $cols->{ENSEMBL_ID};
        foreach my $id (@ensembl_ids) {
            $ensembl_count++;
            $self->add_to_direct_xrefs({
                stable_id  => $id,
                type       => 'gene',
                acc        => $cols->{GENE_RGD_ID},
                label      => $cols->{SYMBOL},
                desc       => $cols->{NAME},
                dbi        => $dbi,
                source_id  => $direct_source_id,
                species_id => $species_id,
            });

            # Synonyms are processed once per Ensembl ID, which is why the
            # summary below reports a count "including duplicates".
            my $xref_id =
                $self->get_xref( $cols->{GENE_RGD_ID}, $direct_source_id,
                                 $species_id, $dbi );
            $syn_count +=
                $self->process_synonyms( $xref_id, $cols->{OLD_SYMBOL},
                                         $dbi );
            $done = 1;
        }
    }

    # Neither a RefSeq dependency nor an Ensembl ID: keep the record as an
    # unlinked xref.
    if ( !$done ) {
        $self->add_xref({
            acc        => $cols->{GENE_RGD_ID},
            label      => $cols->{SYMBOL},
            desc       => $cols->{NAME},
            source_id  => $source_id,
            species_id => $species_id,
            dbi        => $dbi,
            info_type  => 'MISC',
        });
        $mismatch++;
    }
} ## end while ( $csv->getline($rgd_io) )

# getline() also returns false on a parse error; only a clean EOF is success.
if ( !$csv->eof ) {
    confess 'Failed to finish parsing RGD file: ' . $csv->error_diag();
}
$rgd_io->close();

if ($verbose) {
    print "$count xrefs succesfully loaded and dependent on refseq\n" .
        "$mismatch xrefs added but with NO dependencies\n" .
        "$ensembl_count direct xrefs successfully loaded\n" .
        "Tried to add $syn_count synonyms, including duplicates\n";
}

return 0;