# Unpack the named arguments: the second positional argument is a hash
# reference holding source_id, species_id, files, verbose and dbi.
my ( $self, $ref_arg ) = @_;
my $source_id  = $ref_arg->{source_id};
my $species_id = $ref_arg->{species_id};
my $files      = $ref_arg->{files};
my $verbose    = $ref_arg->{verbose};
my $dbi        = $ref_arg->{dbi};

# source_id, species_id and files are mandatory. verbose and dbi are used
# exactly as passed in; no default is applied to them here.
if ( ( !defined $source_id ) or
     ( !defined $species_id ) or
     ( !defined $files ) )
{
  croak 'Need to pass source_id, species_id and files as pairs';
}

my $csv = Text::CSV->new()
  || confess 'Failed to initialise CSV parser: ' . Text::CSV->error_diag();

# Only the first entry of the files list is read.
my $filename = $files->[0];

my $file_io = $self->get_filehandle($filename);
if ( !defined($file_io) ) {
  confess "Failed to acquire a file handle for '${filename}'";
}
confess "Malformed or unexpected header in DBASS file '${filename}'";
}
# Running totals. They are reported at the end when $verbose is set and are
# also used to work out the number of the input line quoted in errors.
my $processed_count = 0;
my $unmapped_count = 0;

# Read the input one CSV record at a time. getline() returns undef both at
# end of file and on a parse error; the two are told apart after the loop.
while ( defined( my $line = $csv->getline( $file_io ) ) ) {

  # NOTE(review): only rows with too few columns are rejected here, rows
  # with extra columns pass even though the message says "incorrect
  # number" - confirm whether a strict equality check was intended.
  if ( scalar @{ $line } < $EXPECTED_NUMBER_OF_COLUMNS ) {
    # The offset of 2 presumably accounts for a header line plus 1-based
    # line numbering - TODO confirm.
    confess 'Line ' . (2 + $processed_count + $unmapped_count)
      . " of input file '${filename}' has an incorrect number of columns";
  }

  # Strip trailing whitespace from every field. Do not modify the contents
  # of @{$line}, only the output - hence the /r. Only the first four fields
  # are kept; $dbass_full_name is unpacked but not used below.
  my ( $dbass_gene_id, $dbass_gene_name, $dbass_full_name, $ensembl_id )
    = map { s{\s+\z}{}rmsx } @{ $line };

  # Do not attempt to create unmapped xrefs. Checking truthiness is good
  # enough here because the only non-empty string evaluating as false is
  # not a valid Ensembl stable ID.
  if ( $ensembl_id ) {

    # DBASS files list synonyms in two ways: either "FOO (BAR)" (with or
    # without space) or "FOO/BAR". Both forms are relevant to us.
    # Without a synonym the whole gene name is the label.
    my ( $label, $synonym ) = ( $dbass_gene_name, undef );
    if ( ( $dbass_gene_name =~ m{
                                  (.*)
                                  \s?\/\s?  # typically no ws here but just in case
                                  (.*)
                              }msx ) ||
         ( $dbass_gene_name =~ m{
                                  (.*)
                                  \s?  # there are entries both with and without ws
                                  [(] (.*) [)]
                              }msx ) ) {
      # Copy the captures straight away: the substitution below resets
      # $1 and $2 when it matches.
      ( $label, $synonym ) = ( $1, $2 );

      # The greedy first group swallows any whitespace in front of the
      # separator (the optional \s? after it then matches nothing), so
      # "FOO (BAR)" would otherwise give the label "FOO " with a
      # trailing space.
      $label =~ s{\s+\z}{}msx;
    }

    my $type = 'gene';
    my $version = '1';

    # Reuse the xref for this accession if get_xref() finds one, otherwise
    # create it.
    my $xref_id =
      $self->get_xref( $dbass_gene_id, $source_id, $species_id, $dbi );

    if ( ( ! defined $xref_id ) || ( $xref_id eq q{} ) ) {
      $xref_id = $self->add_xref({
                                   acc        => $dbass_gene_id,
                                   version    => $version,
                                   label      => $label,
                                   source_id  => $source_id,
                                   dbi        => $dbi,
                                   species_id => $species_id,
                                   info_type  => 'DIRECT'
                                 });
    }

    if ( defined($synonym) ) {
      $self->add_synonym( $xref_id, $synonym, $dbi );
    }

    $self->add_direct_xref( $xref_id, $ensembl_id, $type, undef, $dbi );

    ++$processed_count;
  }
  else {
    ++$unmapped_count;
  }

} ## end while ( defined( my $line...))

# getline() also returns undef when it fails to parse a record, so make
# sure the loop really stopped at end of file instead of silently
# dropping the remainder of the input.
$csv->eof
  or confess "Error parsing CSV file '${filename}': " . $csv->error_diag();

$file_io->close();

if ($verbose) {
  printf( "%d direct xrefs succesfully processed\n", $processed_count );
  printf( "Skipped %d unmapped xrefs\n", $unmapped_count );
}

# The caller receives 0 once the whole file has been read.
return 0;