my ( $self, $entry, $name2species_id, $taxonomy2species_id,
$pred_mrna_source_id, $pred_ncrna_source_id,
$mrna_source_id, $ncrna_source_id,
$pred_peptide_source_id, $peptide_source_id,
$entrez_source_id, $wiki_source_id, $add_dependent_xref_sth,
$species_id, $type, $refseq_ids,$entrez_ids,$wiki_ids
) = @_;
chomp $entry;
my ($species) = $entry =~ /\s+ORGANISM\s+(.*)\n/;
$species = lc $species;
$species =~ s/^\s*
$species =~ s/\s*\(.+\)
$species =~ s/\s+/_/g;
$species =~ s/\n
my $species_id_check = $name2species_id->{$species};
# Try going through the taxon ID if species check didn't work.
if ( !defined $species_id_check ) {
my ($taxon_id) = $entry =~ /db_xref="taxon:(\d+)"/;
$species_id_check = $taxonomy2species_id->{$taxon_id};
}
# skip xrefs for species that aren't in the species table
if ( defined $species_id
&& defined $species_id_check
&& $species_id == $species_id_check )
{
my $xref = {};
my ($acc) = $entry =~ /ACCESSION\s+(\S+)/;
my ($ver) = $entry =~ /VERSION\s+(\S+)/;
my ($refseq_pair) = $entry =~ /DBSOURCE\s+REFSEQ:
accession (\S+)/;
# get the right source ID based on $type and whether this is predicted (X*) or not
my $source_id;
if ($type =~ /dna/) {
if ($acc =~ /^XM_/ ){
$source_id = $pred_mrna_source_id;
} elsif( $acc =~ /^XR/) {
$source_id = $pred_ncrna_source_id;
} elsif( $acc =~ /^NM/) {
$source_id = $mrna_source_id;
} elsif( $acc =~ /^NR/) {
$source_id = $ncrna_source_id;
}
}
elsif ($type =~ /peptide/) {
if ($acc =~ /^XP_/) {
$source_id = $pred_peptide_source_id;
} else {
$source_id = $peptide_source_id;
}
}
print "Warning: can't get source ID for $type $acc\n" if (!$source_id);
# Description - may be multi-line
my ($description) = $entry =~ /DEFINITION\s+([^[]+)/s;
print $entry if (length($description) == 0);
$description =~ s/\nACCESSION.*
$description =~ s/\n
$description =~ s/{.*}-like
$description =~ s/{.*}
$description =~ s/\s+/ /g;
$description = substr($description, 0, 255) if (length($description) > 255);
my ($seq) = $entry =~ /^\s*ORIGIN\s+(.+)/ms; # /s allows . to match newline
my @seq_lines = split /\n/, $seq;
my $parsed_seq = "";
foreach my $x (@seq_lines) {
my ($seq_only) = $x =~ /^\s*\d+\s+(.*)$/;
next if (!defined $seq_only);
$parsed_seq .= $seq_only;
}
$parsed_seq =~ s#
$parsed_seq =~ s#\s##g; # remove whitespace
( my $acc_no_ver, $ver ) = split( /\./, $ver );
$xref->{ACCESSION} = $acc;
if($acc eq $acc_no_ver){
$xref->{VERSION} = $ver;
}
else{
print "$acc NE $acc_no_ver\n";
}
$xref->{LABEL} = $acc . "\." . $ver;
$xref->{DESCRIPTION} = $description;
$xref->{SOURCE_ID} = $source_id;
$xref->{SEQUENCE} = $parsed_seq;
$xref->{SEQUENCE_TYPE} = $type;
$xref->{SPECIES_ID} = $species_id;
$xref->{INFO_TYPE} = "SEQUENCE_MATCH";
# TODO experimental/predicted
my @EntrezGeneIDline = $entry =~ /db_xref=.GeneID:(\d+)/g;
# my @SGDGeneIDline = $entry =~ /db_xref=.SGD:(S\d+)/g;
my @protein_id = $entry =~ /\/protein_id=.(\S+_\d+)/g;
my @coded_by = $entry =~ /\/coded_by=.(\w+_\d+)/g;
foreach my $cb (@coded_by){
$xref->{PAIR} = $cb;
}
if (!defined $xref->{PAIR}) {
$xref->{PAIR} = $refseq_pair;
}
foreach my $pi (@protein_id){
$xref->{PROTEIN} = $pi;
}
foreach my $ll (@EntrezGeneIDline) {
my %dep;
#my $entrez_source = $dependent_sources{EntrezGene} || die( 'No source for EntrezGene!' );
#my $wiki_source = $dependent_sources{WikiGene} || die( 'No source for WikiGene!' );
if (defined $entrez{$ll}) {
$dep{SOURCE_ID} = $entrez_source_id;
$dep{LINKAGE_SOURCE_ID} = $source_id;
$dep{ACCESSION} = $ll;
$dep{LABEL} = $entrez{$ll};
push @{$xref->{DEPENDENT_XREFS}}, \%dep;
my %dep2;
$dep2{SOURCE_ID} = $wiki_source_id;
$dep2{LINKAGE_SOURCE_ID} = $source_id;
$dep2{ACCESSION} = $ll;
$dep2{LABEL} = $entrez{$ll};
push @{$xref->{DEPENDENT_XREFS}}, \%dep2;
# Add xrefs for RefSeq mRNA as well where available
$refseq_pair =~ s/\.[0-9]*
if (defined $refseq_pair) {
if ($refseq_ids->{$refseq_pair}) {
foreach my $refseq_id (@{ $refseq_ids->{$refseq_pair} }) {
foreach my $entrez_id (@{ $entrez_ids->{$ll} }) {
$add_dependent_xref_sth->execute($refseq_id, $entrez_id);
}
foreach my $wiki_id (@{ $wiki_ids->{$ll} }) {
$add_dependent_xref_sth->execute($refseq_id, $wiki_id);
}
}
}
}
}
}
# Don't add SGD Xrefs, as they are mapped directly from SGD ftp site
# Refseq's do not tell whether the mim is for the gene of morbid so ignore for now.
return $xref;
}
}