my ($self, $sp_source_id, $sptr_source_id, $sptr_non_display_source_id, $species_id, $file, $verbose, $sp_direct_source_id, $sptr_direct_source_id, $isoform_source_id, $dbi ) = @_;
my $num_sp = 0;
my $num_sptr = 0;
my $num_sp_pred = 0;
my $num_sptr_pred = 0;
my $num_sptr_non_display = 0;
my $num_direct_sp = 0;
my $num_direct_sptr = 0;
my %dependent_sources = $self->get_xref_sources($dbi);
my (%genemap) =
%{ $self->get_valid_codes( "mim_gene", $species_id, $dbi ) };
my (%morbidmap) =
%{ $self->get_valid_codes( "mim_morbid", $species_id, $dbi ) };
# Extract descriptions from hgnc
my %hgnc_descriptions;
if ($hgnc_file) {
%hgnc_descriptions = $self->get_hgnc_descriptions($hgnc_file);
}
my $uniprot_io = $self->get_filehandle($file);
if ( !defined $uniprot_io ) { return }
my @xrefs;
local $/ = "//\n";
# Create a hash of all valid taxon_ids for this species
my %species2tax = $self->species_id2taxonomy($dbi);
push @{$species2tax{$species_id}}, $species_id;
my @tax_ids = @{$species2tax{$species_id}};
my %taxonomy2species_id =
map{ $_=>$species_id } @tax_ids;
my %dependent_xrefs;
my $ensembl_derived_protein_count = 0;
# Counter to process file in batches
my $count = 0;
while ( $_ = $uniprot_io->getline() ) {
# if an OX line exists, only store the xref if the taxonomy ID that the OX
# line refers to is in the species table
# due to some records having more than one tax_id, we need to check them
# all and only proceed if one of them matches.
#OX NCBI_TaxID=158878, 158879;
#OX NCBI_TaxID=103690;
my ($ox) = $_ =~ /OX\s+[a-zA-Z_]+=([0-9 ,]+).*;/;
my @ox = ();
my $found = 0;
if ( defined $ox ) {
@ox = split /\, /, $ox;
# my %taxonomy2species_id = $self->taxonomy2species_id();
foreach my $taxon_id_from_file (@ox) {
$taxon_id_from_file =~ s/\s
if ( exists $taxonomy2species_id{$taxon_id_from_file} ){
$found = 1;
$count++;
}
}
}
next if (!$found); # no taxon_id's match, so skip to next record
my $xref;
# set accession (and synonyms if more than one)
# AC line may have primary accession and possibly several ; separated synonyms
# May also be more than one AC line
my ($acc) = $_ =~ /(\nAC\s+.+)/s; # will match first AC line and everything else
my @all_lines = split /\n/, $acc;
# Check for CC (caution) lines containing certain text
# If sequence is from Ensembl, do not use
my $ensembl_derived_protein = 0;
if ($_ =~ /CAUTION: The sequence shown here is derived from an Ensembl/) {
$ensembl_derived_protein = 1;
$ensembl_derived_protein_count++;
}
# extract ^AC lines only & build list of accessions
my @accessions;
foreach my $line (@all_lines) {
my ($accessions_only) = $line =~ /^AC\s+(.+)/;
push(@accessions, (split /;\s*/, $accessions_only)) if ($accessions_only);
}
if(lc($accessions[0]) eq "unreviewed"){
print "WARNING: entries with accession of $acc not allowed will be skipped\n";
next;
}
$xref->{INFO_TYPE} = "SEQUENCE_MATCH";
$xref->{ACCESSION} = $accessions[0];
for (my $a=1; $a <= $#accessions; $a++) {
push(@{$xref->{"SYNONYMS"} }, $accessions[$a]);
}
my ($label, $sp_type) = $_ =~ /ID\s+(\w+)\s+(\w+)/;
my ($protein_evidence_code) = $_ =~ /PE\s+(\d+)/;
# Capture line with entry version
# Example: DT 22-APR-2020, entry version 1.
my ($version) = $_ =~ /DT\s+\d+-\w+-\d+, entry version (\d+)/;
# SwissProt/SPTrEMBL are differentiated by having STANDARD/PRELIMINARY here
if ($sp_type =~ /^Reviewed/i) {
$xref->{SOURCE_ID} = $sp_source_id;
$num_sp++;
} elsif ($sp_type =~ /Unreviewed/i) {
#Use normal source only if it is PE levels 1 & 2
if (defined($protein_evidence_code) && $protein_evidence_code < 3) {
$xref->{SOURCE_ID} = $sptr_source_id;
$num_sptr++;
} else {
$xref->{SOURCE_ID} = $sptr_non_display_source_id;
$num_sptr_non_display++;
}
} else {
next; # ignore if it's neither one nor t'other
}
# some straightforward fields
# the previous $label flag of type BRCA2_HUMAN is not used in Uniprot any more, use accession instead
$xref->{LABEL} = $accessions[0] ."." . $version;
$xref->{VERSION} = $version;
$xref->{SPECIES_ID} = $species_id;
$xref->{SEQUENCE_TYPE} = 'peptide';
$xref->{STATUS} = 'experimental';
# May have multi-line descriptions
my ($description_and_rest) = $_ =~ /(DE\s+.*)/s;
@all_lines = split /\n/, $description_and_rest;
# extract ^DE lines only & build cumulative description string
my $description = "";
my $name = "";
my $sub_description = "";
foreach my $line (@all_lines) {
next if(!($line =~ /^DE/));
# get the data
if($line =~ /^DE RecName: Full=(.*);/){
$name .= '; ' if $name ne q{}; #separate multiple sub-names with a '; '
$name .= $1;
}
elsif($line =~ /RecName: Full=(.*);/){
$description .= ' ' if $description ne q{}; #separate the description bit with just a space
$description .= $1;
}
elsif($line =~ /SubName: Full=(.*);/){
$name .= '; ' if $name ne q{}; #separate multiple sub-names with a '; '
$name .= $1;
}
$description =~ s/^\s*//g;
$description =~ s/\s*$//g;
my $desc = $name.' '.$description;
if(!length($desc)){
$desc = $sub_description;
}
$desc =~ s/\s*\{ECO:.*?\}//g;
$xref->{DESCRIPTION} = $desc;
# Parse the EC_NUMBER line, only for S.cerevisiae for now
if (($line =~ /EC=/) && ($species_id == 4932)) {
#print STDERR "EC Number line: $line\n";
$line =~ /^DE\s+EC=([^;]+);/;
# Get the EC Number and make it an xref for S.cer if any
my $EC = $1;
#print STDERR "EC after processing: $EC\n";
my %depe;
$depe{LABEL} = $EC;
$depe{ACCESSION} = $EC;
$depe{SOURCE_NAME} = "EC_NUMBER";
$depe{SOURCE_ID} = $dependent_sources{"EC_NUMBER"};
$depe{LINKAGE_SOURCE_ID} = $xref->{SOURCE_ID};
push @{$xref->{DEPENDENT_XREFS}}, \%depe;
$dependent_xrefs{"EC_NUMBER"}++;
}
}
# extract sequence
my ($seq) = $_ =~ /SQ\s+(.+)/s; # /s allows . to match newline
my @seq_lines = split /\n/, $seq;
my $parsed_seq = "";
foreach my $x (@seq_lines) {
$parsed_seq .= $x;
}
$parsed_seq =~ s/\/\///g; # remove trailing end-of-record character
$parsed_seq =~ s/\s//g; # remove whitespace
$parsed_seq =~ s/^.*;//g; # remove everything before last ;
$xref->{SEQUENCE} = $parsed_seq;
#print "Adding " . $xref->{ACCESSION} . " " . $xref->{LABEL} ."\n";
my ($gns) = $_ =~ /(GN\s+.+)/s;
my @gn_lines = ();
if (defined $gns) {
foreach my $gn_line (split("\n", $gns)) {
if ($gn_line !~ /^GN/) {last;}
$gn_line =~ s/^GN\s+//g;
push(@gn_lines, $gn_line);
}
$gns = join('', @gn_lines);
@gn_lines = split /;/, $gns;
}
# Do not allow the addition of UniProt Gene Name dependent Xrefs
# if the protein was imported from Ensembl. Otherwise we will
# re-import previously set symbols
if(! $ensembl_derived_protein) {
my %depe;
foreach my $gn (@gn_lines){
my $gene_name = undef;
if ($gn =~ /Name=([A-Za-z0-9_\-\.\s:]+)/s) { #/s for multi-line entries ; is the delimiter
# Example line
# GN Name=ctrc {ECO:0000313|Xenbase:XB-GENE-5790348};
my $name = $1;
$name =~ s/\s+$//g; # Remove white spaces that are left over at the end if there was an evidence code
$depe{LABEL} = $name; # leave name as is, upper/lower case is relevant in gene names
$depe{ACCESSION} = $self->get_name($xref->{ACCESSION},$depe{LABEL});
$gene_name = $depe{ACCESSION};
$depe{SOURCE_NAME} = "Uniprot_gn";
$depe{SOURCE_ID} = $dependent_sources{"Uniprot_gn"};
$depe{LINKAGE_SOURCE_ID} = $xref->{SOURCE_ID};
$depe{DESCRIPTION} = $hgnc_descriptions{$name} if ($hgnc_file && defined($hgnc_descriptions{$name}));
push @{$xref->{DEPENDENT_XREFS}}, \%depe;
$dependent_xrefs{"Uniprot_gn"}++;
}
my @syn;
if($gn =~ /Synonyms=(.*)/s){ # use of /s as synonyms can be across more than one line
# Example line
# GN Synonyms=cela2a {ECO:0000313|Ensembl:ENSXETP00000014934},
# GN MGC79767 {ECO:0000313|EMBL:AAH80976.1}
my $syn = $1;
$syn =~ s/{.*}//g; # Remove any potential evidence codes
$syn =~ s/\n//g; # Remove return carriages, as entry can span several lines
$syn =~ s/\s+$//g; # Remove white spaces that are left over at the end if there was an evidence code
#$syn =~ s/^\s+//g; # Remove white spaces that are left over at the beginning if there was an evidence code
$syn =~ s/\s+,/,/g; # Remove white spaces that are left over before the comma if there was an evidence code
@syn = split(/, /,$syn);
push (@{$depe{"SYNONYMS"}}, @syn);
}
}
}
# dependent xrefs - only store those that are from sources listed in the source table
my ($deps) = $_ =~ /(DR\s+.+)/s; # /s allows . to match newline
my @dep_lines = ();
if ( defined $deps ) { @dep_lines = split /\n/, $deps }
my %seen=(); # per record basis
foreach my $dep (@dep_lines) {
#both GO and UniGene have the own sources so ignore those in the uniprot files
#as the uniprot data should be older
if($dep =~ /GO/ || $dep =~ /UniGene/){
next;
}
if ($dep =~ /^DR\s+(.+)/) {
my ($source, $acc, @extra) = split /;\s*/, $1;
if($source =~ "RGD"){ #using RGD file now instead.
next;
}
if($source =~ "CCDS"){
next;
}
if($source =~ "IPI"){
next;
}
if($source =~ "UCSC"){
next;
}
if($source =~ "SGD"){
next;
}
if($source =~ "HGNC"){
next;
}
# We get the mappings directly from the source
if($source =~ "MGI"){
next;
}
# Nomenclature data is imported directly from the source
if($source =~ "VGNC"){
next;
}
if($source =~ "Orphanet"){
#we don't want to parse Orphanet xrefs via Uniprot, we get them from Orphanet with descriptions
next;
}
if($source =~ "ArrayExpress"){
next;
}
if($source =~ "GenomeRNAi" || $source =~ "EPD"){
next;
}
if($source =~ "Xenbase"){
next;
}
# Uniprot get Reactome links from Reactome, so we want to get the info from Reactome directly
if($source =~ "Reactome"){
next;
}
# MIM xrefs are already imported separately, ignore from Uniprot
# Also, Uniprot deals with proteins, not appropriate for gene level xrefs
if ($source =~ "MIM_GENE" || $source =~ "MIM_MORBID" || $source =~ "MIM") {
next;
}
# GeneCards xrefs are imported through the HGNC file
if ($source =~ "GeneCards") {
next;
}
# If mapped to Ensembl, add as direct xref
if ($source eq "Ensembl") {
# Example line:
# DR Ensembl; ENST00000380152; ENSP00000369497; ENSG00000139618.
# DR Ensembl; ENST00000372839; ENSP00000361930; ENSG00000166913. [P31946-1]
# $source is Ensembl, $acc is ENST00000380152 and @extra is the rest of the line
# If the UniProt accession is repeated here, it links to a specific isoform
my %direct;
my $isoform;
my $stable_id = $extra[0];
$stable_id =~ s/\.[0-9]+
$direct{STABLE_ID} = $stable_id;
$direct{ENSEMBL_TYPE} = 'Translation';
$direct{LINKAGE_TYPE} = 'DIRECT';
if ($xref->{SOURCE_ID} == $sp_source_id) {
$direct{SOURCE_ID} = $sp_direct_source_id;
$num_direct_sp++;
} else {
$direct{SOURCE_ID} = $sptr_direct_source_id;
$num_direct_sptr++;
}
push @{$xref->{DIRECT_XREFS}}, \%direct;
my $uniprot_acc = $accessions[0];
if ($extra[1] =~ /($accessions[0]-[0-9]+)/) {
$isoform = $1;
$self->add_to_direct_xrefs({
stable_id => $stable_id,
type => 'translation',
acc => $isoform,
label => $isoform,
dbi => $dbi,
source_id => $isoform_source_id,
linkage => 'DIRECT',
species_id => $species_id
});
}
}
if (exists $dependent_sources{$source} ) {
# create dependent xref structure & store it
my %dep;
$dep{SOURCE_NAME} = $source;
$dep{LINKAGE_SOURCE_ID} = $xref->{SOURCE_ID};
$dep{SOURCE_ID} = $dependent_sources{$source};
if($source =~ /HGNC/){
$acc =~ s/HGNC:
$extra[0] =~ s/[.]
$dep{LABEL} = $extra[0];
}
$dep{ACCESSION} = $acc;
# $dep{ACCESSION} = $acc;
$dependent_xrefs{ $dep{SOURCE_NAME} }++; # get count of depenent xrefs.
if(!defined($seen{$dep{SOURCE_NAME}.":".$dep{ACCESSION}})){
push @{$xref->{DEPENDENT_XREFS}}, \%dep; # array of hashrefs
$seen{$dep{SOURCE_NAME}.":".$dep{ACCESSION}} =1;
}
if($dep =~ /EMBL/ && !($dep =~ /ChEMBL/)){
my ($protein_id) = $extra[0];
if(($protein_id ne "-") and (!defined($seen{$source.":".$protein_id}))){
my %dep2;
$dep2{SOURCE_NAME} = $source;
$dep2{SOURCE_ID} = $dependent_sources{"protein_id"};
$dep2{LINKAGE_SOURCE_ID} = $xref->{SOURCE_ID};
# store accession unversioned
$dep2{LABEL} = $protein_id;
my ($prot_acc, $prot_version) = $protein_id =~ /([^.]+)\.([^.]+)/;
$dep2{ACCESSION} = $prot_acc;
$dependent_xrefs{ $dep2{SOURCE_NAME} }++; # get count of dependent xrefs.
$seen{$source.":".$protein_id} = 1;
push @{$xref->{DEPENDENT_XREFS}}, \%dep2; # array of hashrefs
}
}
}
}
}
push @xrefs, $xref;
if ($count > 1000) {
$self->upload_xref_object_graphs(\@xrefs, $dbi);
$count = 0;
undef @xrefs;
}
}
$self->upload_xref_object_graphs(\@xrefs, $dbi) if scalar(@xrefs) > 0;
$uniprot_io->close();
print "Read $num_sp SwissProt xrefs, $num_sptr SPTrEMBL xrefs with protein evidence codes 1-2, and $num_sptr_non_display SPTrEMBL xrefs with protein evidence codes > 2 from $file\n" if($verbose);
print "Added $num_direct_sp direct SwissProt xrefs and $num_direct_sptr direct SPTrEMBL xrefs\n" if ($verbose);
print "Found $num_sp_pred predicted SwissProt xrefs and $num_sptr_pred predicted SPTrEMBL xrefs\n" if (($num_sp_pred > 0 || $num_sptr_pred > 0) and $verbose);
print "Skipped $ensembl_derived_protein_count ensembl annotations as Gene names\n";
# print "$kount gene anmes added\n";
print "Added the following dependent xrefs:-\n" if($verbose);
foreach my $key (keys %dependent_xrefs){
print $key."\t".$dependent_xrefs{$key}."\n" if($verbose);
}
print "End.\n" if ($verbose);
#TODO - currently include records from other species - filter on OX line??
}