# Parse a tab-separated UCSC transcript file and hand every row to
# add_xref().
#
# Arguments (hash reference in $ref_arg):
#   source_id  - required; passed through to add_xref()
#   species_id - required; passed through to add_xref()
#   files      - required; array reference, only its first entry is read
#   verbose    - optional; when true a summary line is printed at the end
#   dbi        - optional; forwarded untouched to add_xref()
#
# Returns 0 once the whole file has been processed. Every failure path
# (missing arguments, file that cannot be opened, CSV parse error)
# confesses instead of returning.
my ( $self, $ref_arg ) = @_;

my $source_id  = $ref_arg->{source_id};
my $species_id = $ref_arg->{species_id};
my $files      = $ref_arg->{files};
my $verbose    = $ref_arg->{verbose};
my $dbi        = $ref_arg->{dbi};

if ( (!defined $source_id) || (!defined $species_id) || (!defined $files) ) {
  confess 'Need to pass source_id, species_id and files as pairs';
}

# Only the first file of the list is processed.
my $file = $files->[0];

my $count = 0;

my $file_io = $self->get_filehandle($file);
if ( !defined $file_io ) {
  confess "Can't open UCSC file $file\n";
}

# Tab-separated input; empty fields come back as undef, and 'strict'
# makes rows with an inconsistent number of fields a parse error.
my $input_file = Text::CSV->new({
  sep_char       => "\t",
  empty_is_undef => 1,
  strict         => 1,
}) || confess "Cannot use file $file: " . Text::CSV->error_diag();

while ( my $data = $input_file->getline( $file_io ) ) {
  # Columns 1, 8 and 11 of each row are not used.
  my (undef, $chromosome, $strand, $tx_start, $tx_end, $cds_start, $cds_end,
      undef, $exon_starts, $exon_ends, undef, $accession) = @{ $data };

  # UCSC uses slightly different chromosome names, at least for
  # human and mouse, so chop off the 'chr' in the beginning.  We do
  # not yet translate the names of the special chromosomes, e.g.
  # "chr6_cox_hap1" (UCSC) into "c6_COX" (Ensembl).
  $chromosome =~ s{ \A chr }{}msx;

  # They also use '+' and '-' for the strand, instead of -1, 0, or 1.
  if ( $strand eq q{+} ) {
    $strand = 1;
  }
  elsif ( $strand eq q{-} ) {
    $strand = -1;
  }
  else {
    $strand = 0;
  }

  # ... and non-coding transcripts have cds_start == cds_end.  We would
  # like these to be stored as NULLs.
  if ( $cds_start == $cds_end ) {
    undef $cds_start;
    undef $cds_end;
  }

  # $exon_starts and $exon_ends usually (always?) have trailing commas,
  # remove them.
  $exon_starts =~ s{ , \z }{}msx;
  $exon_ends   =~ s{ , \z }{}msx;

  # ... and they use the same kind of "inbetween" coordinates as e.g.
  # exonerate, so increment all start coordinates by one.
  $tx_start += 1;
  if ( defined $cds_start ) {
    $cds_start += 1;
  }

  # The string exon_starts is a comma-separated list of start coordinates
  # for subsequent exons and we must increment each one.  Split the string
  # on commas, use map() to apply the "+1" transformation to every
  # element of the resulting array, then join the result into a new
  # comma-separated list.
  $exon_starts =
    join q{,},
    map { $_ + 1 } split qr{ , }msx, $exon_starts;

  # NOTE(review): the accession was read from the row but never handed
  # on, so every xref was submitted without its identifier.  It is
  # forwarded here under the key 'accession' - confirm that this is the
  # key add_xref() reads.
  $self->add_xref( $source_id, $species_id, {
    accession  => $accession,
    chromosome => $chromosome,
    strand     => $strand,
    txStart    => $tx_start,
    txEnd      => $tx_end,
    cdsStart   => $cds_start,
    cdsEnd     => $cds_end,
    exonStarts => $exon_starts,
    exonEnds   => $exon_ends,
    dbi        => $dbi,
  });

  $count += 1;
}

# getline() returns false both at end of file and on a parse error;
# only the former is acceptable.
$input_file->eof || confess "Error parsing file $file: " . $input_file->error_diag();

$file_io->close();

if ($verbose) {
  print "Loaded a total of $count UCSC xrefs\n";
}

return 0;
}