version-2.8/perl/UniProtParser_8pm_source.html

=head1 LICENSE


See the NOTICE file distributed with this work for additional information

regarding copyright ownership.


Licensed under the Apache License, Version 2.0 (the "License");

you may not use this file except in compliance with the License.

You may obtain a copy of the License at


     http://www.apache.org/licenses/LICENSE-2.0


Unless required by applicable law or agreed to in writing, software

distributed under the License is distributed on an "AS IS" BASIS,

WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

See the License for the specific language governing permissions and

limitations under the License.


=cut


# Parse UniProt (SwissProt & SPTrEMBL) files to create xrefs.

#

# Files actually contain both types of xref, distinguished by ID line;

#

# ID   CYC_PIG                 Reviewed;         104 AA.  Swissprot

# ID   Q3ASY8_CHLCH            Unreviewed;     36805 AA.  SPTrEMBL


package XrefParser::UniProtParser;


use strict;

use warnings;

use Carp;

use POSIX qw(strftime);

use File::Basename;

use Text::CSV;


use base qw( XrefParser::BaseParser );


my $hgnc_file;


sub run {


  my ($self, $ref_arg) = @_;

  my $source_id    = $ref_arg->{source_id};

  my $species_id   = $ref_arg->{species_id};

  my $species_name = $ref_arg->{species};

  my $files        = $ref_arg->{files};

  my $release_file = $ref_arg->{rel_file};

  my $verbose      = $ref_arg->{verbose};

  my $dbi          = $ref_arg->{dbi};

  $dbi = $self->dbi unless defined $dbi;


  $hgnc_file = $ref_arg->{hgnc_file} || undef;


  if((!defined $source_id) or (!defined $species_id) or (!defined $files)){

    croak "Need to pass source_id, species_id, files and rel_file as pairs";

  }

  $verbose |=0;


  my $file = @{$files}[0];


  my ( $sp_source_id, $sptr_source_id, $sp_release, $sptr_release, $sptr_non_display_source_id, $sp_direct_source_id, $sptr_direct_source_id, $isoform_source_id );


  $sp_source_id =

    $self->get_source_id_for_source_name('Uniprot/SWISSPROT','sequence_mapped', $dbi);

  $sptr_source_id =

    $self->get_source_id_for_source_name('Uniprot/SPTREMBL', 'sequence_mapped', $dbi);


  $sptr_non_display_source_id =

    $self->get_source_id_for_source_name('Uniprot/SPTREMBL', 'protein_evidence_gt_2', $dbi);


  $sp_direct_source_id = $self->get_source_id_for_source_name('Uniprot/SWISSPROT', 'direct', $dbi);

  $sptr_direct_source_id = $self->get_source_id_for_source_name('Uniprot/SPTREMBL', 'direct', $dbi);


  $isoform_source_id = $self->get_source_id_for_source_name('Uniprot_isoform');


  print "SwissProt source id for $file: $sp_source_id\n" if ($verbose);

  print "SpTREMBL source id for $file: $sptr_source_id\n" if ($verbose);

  print "SpTREMBL protein_evidence > 2 source id for $file: $sptr_non_display_source_id\n" if ($verbose);

  print "SwissProt direct source id for $file: $sp_direct_source_id\n" if ($verbose);

  print "SpTREMBL direct source id for $file: $sptr_direct_source_id\n" if ($verbose);


  $self->create_xrefs( $sp_source_id, $sptr_source_id, $sptr_non_display_source_id, $species_id,

      $file, $verbose, $sp_direct_source_id, $sptr_direct_source_id, $isoform_source_id, $dbi );


    if ( defined $release_file ) {

        # Parse Swiss-Prot and SpTrEMBL release info from

        # $release_file.

        my $release_io = $self->get_filehandle($release_file);

        while ( defined( my $line = $release_io->getline() ) ) {

            if ( $line =~ m#(UniProtKB/Swiss-Prot Release .*)# ) {

                $sp_release = $1;

                print "Swiss-Prot release is '$sp_release'\n" if($verbose);

            } elsif ( $line =~ m#(UniProtKB/TrEMBL Release .*)# ) {

                $sptr_release = $1;

                print "SpTrEMBL release is '$sptr_release'\n" if($verbose);

            }

        }

        $release_io->close();


        # Set releases

        $self->set_release( $sp_source_id,        $sp_release, $dbi );

        $self->set_release( $sptr_source_id,      $sptr_release, $dbi );

    $self->set_release( $sptr_non_display_source_id, $sptr_release, $dbi );

        $self->set_release( $sp_direct_source_id, $sp_release, $dbi );

        $self->set_release( $sptr_direct_source_id,$sptr_release, $dbi );

    }


  return 0; # successfull

}


# --------------------------------------------------------------------------------

# Parse file into array of xref objects


sub create_xrefs {

  my ($self, $sp_source_id, $sptr_source_id, $sptr_non_display_source_id, $species_id, $file, $verbose, $sp_direct_source_id, $sptr_direct_source_id, $isoform_source_id, $dbi ) = @_;


  my $num_sp = 0;

  my $num_sptr = 0;

  my $num_sp_pred = 0;

  my $num_sptr_pred = 0;

  my $num_sptr_non_display = 0;

  my $num_direct_sp = 0;

  my $num_direct_sptr = 0;


  my %dependent_sources = $self->get_xref_sources($dbi);


  my (%genemap) =

    %{ $self->get_valid_codes( "mim_gene", $species_id, $dbi ) };

  my (%morbidmap) =

    %{ $self->get_valid_codes( "mim_morbid", $species_id, $dbi ) };


  # Extract descriptions from hgnc

  my %hgnc_descriptions;

  if ($hgnc_file) {

    %hgnc_descriptions = $self->get_hgnc_descriptions($hgnc_file);

  }


  my $uniprot_io = $self->get_filehandle($file);

  if ( !defined $uniprot_io ) { return }


  my @xrefs;


  local $/ = "//\n";


  # Create a hash of all valid taxon_ids for this species

  my %species2tax = $self->species_id2taxonomy($dbi);

  push @{$species2tax{$species_id}}, $species_id;

  my @tax_ids = @{$species2tax{$species_id}};

  my %taxonomy2species_id = map{ $_=>$species_id } @tax_ids;


  my %dependent_xrefs;

  my $ensembl_derived_protein_count = 0;


  # Counter to process file in batches

  my $count = 0;


  while ( $_ = $uniprot_io->getline() ) {


    # if an OX line exists, only store the xref if the taxonomy ID that the OX

    # line refers to is in the species table

    # due to some records having more than one tax_id, we need to check them

    # all and only proceed if one of them matches.

    #OX   NCBI_TaxID=158878, 158879;

    #OX   NCBI_TaxID=103690;


    my ($ox) = $_ =~ /OX\s+[a-zA-Z_]+=([0-9 ,]+).*;/;

    my @ox = ();

    my $found = 0;


    if ( defined $ox ) {

        @ox = split /\, /, $ox;


        # my %taxonomy2species_id = $self->taxonomy2species_id();


        foreach my $taxon_id_from_file (@ox) {

          $taxon_id_from_file =~ s/\s//;

          if ( exists $taxonomy2species_id{$taxon_id_from_file} ){

            $found = 1;

            $count++;

          }

        }

    }


    next if (!$found); # no taxon_id's match, so skip to next record

    my $xref;


    # set accession (and synonyms if more than one)

    # AC line may have primary accession and possibly several ; separated synonyms

    # May also be more than one AC line

    my ($acc) = $_ =~ /(\nAC\s+.+)/s; # will match first AC line and everything else


    my @all_lines = split /\n/, $acc;


    # Check for CC (caution) lines containing certain text

    # If sequence is from Ensembl, do not use

    my $ensembl_derived_protein = 0;

    if ($_ =~ /CAUTION: The sequence shown here is derived from an Ensembl/) {

      $ensembl_derived_protein = 1;

      $ensembl_derived_protein_count++;

    }


    # extract ^AC lines only & build list of accessions

    my @accessions;

    foreach my $line (@all_lines) {

      my ($accessions_only) = $line =~ /^AC\s+(.+)/;

      push(@accessions, (split /;\s*/, $accessions_only)) if ($accessions_only);


    }


    if(lc($accessions[0]) eq "unreviewed"){

      print "WARNING: entries with accession of $acc not allowed will be skipped\n";

      next;

    }

    $xref->{INFO_TYPE} = "SEQUENCE_MATCH";

    $xref->{ACCESSION} = $accessions[0];

    for (my $a=1; $a <= $#accessions; $a++) {

      push(@{$xref->{"SYNONYMS"} }, $accessions[$a]);

    }


    my ($label, $sp_type) = $_ =~ /ID\s+(\w+)\s+(\w+)/;

    my ($protein_evidence_code) = $_ =~ /PE\s+(\d+)/;

    # Capture line with entry version

    # Example: DT   22-APR-2020, entry version 1.

    my ($version) = $_ =~ /DT\s+\d+-\w+-\d+, entry version (\d+)/;


    # SwissProt/SPTrEMBL are differentiated by having STANDARD/PRELIMINARY here

    if ($sp_type =~ /^Reviewed/i) {


      $xref->{SOURCE_ID} = $sp_source_id;

      $num_sp++;

    } elsif ($sp_type =~ /Unreviewed/i) {


    #Use normal source only if it is PE levels 1 & 2

      if (defined($protein_evidence_code) && $protein_evidence_code < 3) {

          $xref->{SOURCE_ID} = $sptr_source_id;

          $num_sptr++;

      } else {

          $xref->{SOURCE_ID} = $sptr_non_display_source_id;

          $num_sptr_non_display++;

      }


    } else {


      next; # ignore if it's neither one nor t'other


    }


    # some straightforward fields

    # the previous $label flag of type BRCA2_HUMAN is not used in Uniprot any more, use accession instead

    $xref->{LABEL} = $accessions[0] ."." . $version;

    $xref->{VERSION} = $version;

    $xref->{SPECIES_ID} = $species_id;

    $xref->{SEQUENCE_TYPE} = 'peptide';

    $xref->{STATUS} = 'experimental';


    # May have multi-line descriptions

    my ($description_and_rest) = $_ =~ /(DE\s+.*)/s;

    @all_lines = split /\n/, $description_and_rest;


    # extract ^DE lines only & build cumulative description string

    my $description = "";

    my $name        = "";

    my $sub_description = "";


    foreach my $line (@all_lines) {


      next if(!($line =~ /^DE/));


      # get the data

      if($line =~ /^DE   RecName: Full=(.*);/){

        $name .= '; ' if $name ne q{}; #separate multiple sub-names with a '; '

        $name .= $1;

      }

      elsif($line =~ /RecName: Full=(.*);/){

        $description .= ' ' if $description ne q{}; #separate the description bit with just a space

        $description .= $1;

      }

      elsif($line =~ /SubName: Full=(.*);/){

        $name .= '; ' if $name ne q{}; #separate multiple sub-names with a '; '

        $name .= $1;

      }


      $description =~ s/^\s*//g;

      $description =~ s/\s*$//g;


      my $desc = $name.' '.$description;

      if(!length($desc)){

    $desc = $sub_description;

      }


      $desc =~ s/\s*\{ECO:.*?\}//g;

      $xref->{DESCRIPTION} = $desc;


      # Parse the EC_NUMBER line, only for S.cerevisiae for now


      if (($line =~ /EC=/) && ($species_id == 4932)) {


      #print STDERR "EC Number line: $line\n";


      $line =~ /^DE\s+EC=([^;]+);/;


      # Get the EC Number and make it an xref for S.cer if any


      my $EC = $1;


      #print STDERR "EC after processing: $EC\n";


      my %depe;

      $depe{LABEL} = $EC;

      $depe{ACCESSION} = $EC;


      $depe{SOURCE_NAME} = "EC_NUMBER";


      $depe{SOURCE_ID} = $dependent_sources{"EC_NUMBER"};

      $depe{LINKAGE_SOURCE_ID} = $xref->{SOURCE_ID};

      push @{$xref->{DEPENDENT_XREFS}}, \%depe;

      $dependent_xrefs{"EC_NUMBER"}++;

      }


    }


    # extract sequence

    my ($seq) = $_ =~ /SQ\s+(.+)/s; # /s allows . to match newline

      my @seq_lines = split /\n/, $seq;

    my $parsed_seq = "";

    foreach my $x (@seq_lines) {

      $parsed_seq .= $x;

    }

    $parsed_seq =~ s/\/\///g;   # remove trailing end-of-record character

    $parsed_seq =~ s/\s//g;     # remove whitespace

    $parsed_seq =~ s/^.*;//g;   # remove everything before last ;


    $xref->{SEQUENCE} = $parsed_seq;

    #print "Adding " . $xref->{ACCESSION} . " " . $xref->{LABEL} ."\n";


    my ($gns) = $_ =~ /(GN\s+.+)/s;

    my @gn_lines = ();

    if (defined $gns) {

      foreach my $gn_line (split("\n", $gns)) {

        if ($gn_line !~ /^GN/) {last;}


        $gn_line =~ s/^GN\s+//g;

        push(@gn_lines, $gn_line);

      }


      $gns = join('', @gn_lines);

      @gn_lines = split /;/, $gns;

    }


    # Do not allow the addition of UniProt Gene Name dependent Xrefs

    # if the protein was imported from Ensembl. Otherwise we will

    # re-import previously set symbols

    if(! $ensembl_derived_protein) {

      my %depe;

      foreach my $gn (@gn_lines){

        my $gene_name = undef;


        if ($gn =~ /Name=([A-Za-z0-9_\-\.\s:]+)/s) { #/s for multi-line entries ; is the delimiter

# Example line

# GN   Name=ctrc {ECO:0000313|Xenbase:XB-GENE-5790348};

          my $name = $1;

          $name =~ s/\s+$//g; # Remove white spaces that are left over at the end if there was an evidence code

          $depe{LABEL} = $name; # leave name as is, upper/lower case is relevant in gene names

          $depe{ACCESSION} = $self->get_name($xref->{ACCESSION},$depe{LABEL});

          $gene_name = $depe{ACCESSION};


          $depe{SOURCE_NAME} = "Uniprot_gn";

          $depe{SOURCE_ID} = $dependent_sources{"Uniprot_gn"};

          $depe{LINKAGE_SOURCE_ID} = $xref->{SOURCE_ID};

          $depe{DESCRIPTION} = $hgnc_descriptions{$name} if ($hgnc_file && defined($hgnc_descriptions{$name}));

          push @{$xref->{DEPENDENT_XREFS}}, \%depe;

          $dependent_xrefs{"Uniprot_gn"}++;

        }

        my @syn;

        if($gn =~ /Synonyms=(.*)/s){ # use of /s as synonyms can be across more than one line

# Example line

# GN   Synonyms=cela2a {ECO:0000313|Ensembl:ENSXETP00000014934},

# GN   MGC79767 {ECO:0000313|EMBL:AAH80976.1}

          my $syn = $1;

          $syn =~ s/{.*}//g;  # Remove any potential evidence codes

          $syn =~ s/\n//g;    # Remove return carriages, as entry can span several lines

          $syn =~ s/\s+$//g;  # Remove white spaces that are left over at the end if there was an evidence code

          #$syn =~ s/^\s+//g;  # Remove white spaces that are left over at the beginning if there was an evidence code

          $syn =~ s/\s+,/,/g;  # Remove white spaces that are left over before the comma if there was an evidence code

          @syn = split(/, /,$syn);

          push (@{$depe{"SYNONYMS"}}, @syn);

        }

      }

    }


    # dependent xrefs - only store those that are from sources listed in the source table

    my ($deps) = $_ =~ /(DR\s+.+)/s; # /s allows . to match newline


    my @dep_lines = ();

    if ( defined $deps ) { @dep_lines = split /\n/, $deps }


    my %seen=();  # per record basis


    foreach my $dep (@dep_lines) {

      #both GO and UniGene have the own sources so ignore those in the uniprot files

      #as the uniprot data should be older

      if($dep =~ /GO/ || $dep =~ /UniGene/){

    next;

      }

      if ($dep =~ /^DR\s+(.+)/) {

        my ($source, $acc, @extra) = split /;\s*/, $1;

        if($source =~ "RGD"){  #using RGD file now instead.

          next;

        }

        if($source =~ "CCDS"){

          next;

        }

        if($source =~ "IPI"){

          next;

        }

        if($source =~ "UCSC"){

          next;

        }

        if($source =~ "SGD"){

          next;

        }

        if($source =~ "HGNC"){

          next;

        }

        # We get the mappings directly from the source

        if($source =~ "MGI"){

          next;

        }

        # Nomenclature data is imported directly from the source

        if($source =~ "VGNC"){

          next;

        }

        if($source =~ "Orphanet"){

          #we don't want to parse Orphanet xrefs via Uniprot, we get them from Orphanet with descriptions

          next;

        }

        if($source =~ "ArrayExpress"){

            next;

        }

        if($source =~ "GenomeRNAi" || $source =~ "EPD"){

            next;

        }

        if($source =~ "Xenbase"){

            next;

        }

# Uniprot get Reactome links from Reactome, so we want to get the info from Reactome directly

        if($source =~ "Reactome"){

            next;

        }

# MIM xrefs are already imported separately, ignore from Uniprot

# Also, Uniprot deals with proteins, not appropriate for gene level xrefs

        if ($source =~ "MIM_GENE" || $source =~ "MIM_MORBID" || $source =~ "MIM") {

            next;

        }

        # GeneCards xrefs are imported through the HGNC file

        if ($source =~ "GeneCards") {

          next;

        }

# If mapped to Ensembl, add as direct xref

        if ($source eq "Ensembl") {

# Example line:

# DR   Ensembl; ENST00000380152; ENSP00000369497; ENSG00000139618.

# DR   Ensembl; ENST00000372839; ENSP00000361930; ENSG00000166913. [P31946-1]

# $source is Ensembl, $acc is ENST00000380152 and @extra is the rest of the line

# If the UniProt accession is repeated here, it links to a specific isoform

          my %direct;

          my $isoform;


          my $stable_id = $extra[0];

          $stable_id =~ s/\.[0-9]+//;

          $direct{STABLE_ID} = $stable_id;

          $direct{ENSEMBL_TYPE} = 'Translation';

          $direct{LINKAGE_TYPE} = 'DIRECT';

          if ($xref->{SOURCE_ID} == $sp_source_id) {

            $direct{SOURCE_ID} = $sp_direct_source_id;

            $num_direct_sp++;

          } else {

            $direct{SOURCE_ID} = $sptr_direct_source_id;

            $num_direct_sptr++;

          }

          push @{$xref->{DIRECT_XREFS}}, \%direct;


          my $uniprot_acc = $accessions[0];

          if ($extra[1] =~ /($accessions[0]-[0-9]+)/) {

            $isoform = $1;

            $self->add_to_direct_xrefs({

              stable_id  => $stable_id,

              type       => 'translation',

              acc        => $isoform,

              label      => $isoform,

              dbi        => $dbi,

              source_id  => $isoform_source_id,

              linkage    => 'DIRECT',

              species_id => $species_id

            });

          }

        }

       if (exists $dependent_sources{$source} ) {

      # create dependent xref structure & store it

      my %dep;

          $dep{SOURCE_NAME} = $source;

          $dep{LINKAGE_SOURCE_ID} = $xref->{SOURCE_ID};

          $dep{SOURCE_ID} = $dependent_sources{$source};


      if($source =~ /HGNC/){

        $acc =~ s/HGNC://;

        $extra[0] =~ s/[.]//;

        $dep{LABEL} = $extra[0];

      }

      $dep{ACCESSION} = $acc;


#     $dep{ACCESSION} = $acc;

      $dependent_xrefs{ $dep{SOURCE_NAME} }++; # get count of depenent xrefs.

      if(!defined($seen{$dep{SOURCE_NAME}.":".$dep{ACCESSION}})){

        push @{$xref->{DEPENDENT_XREFS}}, \%dep; # array of hashrefs

        $seen{$dep{SOURCE_NAME}.":".$dep{ACCESSION}} =1;

      }

      if($dep =~ /EMBL/ && !($dep =~ /ChEMBL/)){

        my ($protein_id) = $extra[0];

        if(($protein_id ne "-") and (!defined($seen{$source.":".$protein_id}))){

          my %dep2;

          $dep2{SOURCE_NAME} = $source;

          $dep2{SOURCE_ID} = $dependent_sources{"protein_id"};

          $dep2{LINKAGE_SOURCE_ID} = $xref->{SOURCE_ID};

          # store accession unversioned

          $dep2{LABEL} = $protein_id;

          my ($prot_acc, $prot_version) = $protein_id =~ /([^.]+)\.([^.]+)/;

          $dep2{ACCESSION} = $prot_acc;

          $dependent_xrefs{ $dep2{SOURCE_NAME} }++; # get count of dependent xrefs.

          $seen{$source.":".$protein_id} = 1;

          push @{$xref->{DEPENDENT_XREFS}}, \%dep2; # array of hashrefs

        }

      }

    }

      }

    }


    push @xrefs, $xref;


    if ($count > 1000) {

      $self->upload_xref_object_graphs(\@xrefs, $dbi);

      $count = 0;

      undef @xrefs;

    }


  }


  $self->upload_xref_object_graphs(\@xrefs, $dbi) if scalar(@xrefs) > 0;


  $uniprot_io->close();


  print "Read $num_sp SwissProt xrefs, $num_sptr SPTrEMBL xrefs with protein evidence codes 1-2, and $num_sptr_non_display SPTrEMBL xrefs with protein evidence codes > 2 from $file\n" if($verbose);

  print "Added $num_direct_sp direct SwissProt xrefs and $num_direct_sptr direct SPTrEMBL xrefs\n" if ($verbose);

  print "Found $num_sp_pred predicted SwissProt xrefs and $num_sptr_pred predicted SPTrEMBL xrefs\n" if (($num_sp_pred > 0 || $num_sptr_pred > 0) and $verbose);

  print "Skipped $ensembl_derived_protein_count ensembl annotations as Gene names\n";


#  print "$kount gene anmes added\n";


  print "Added the following dependent xrefs:-\n" if($verbose);

  foreach my $key (keys %dependent_xrefs){

    print $key."\t".$dependent_xrefs{$key}."\n" if($verbose);

  }

  print "End.\n" if ($verbose);


  #TODO - currently include records from other species - filter on OX line??

}


sub get_name {

  my $self = shift;

  my $acc  = shift;

  my $label = shift;


  return $acc;

}


sub get_hgnc_descriptions {

  my ($self, $hgnc_file) = @_;

  my %descriptions;


  my $hgnc_fh = $self->get_filehandle($hgnc_file);

  if ( !defined $hgnc_fh ) {confess "Can't open HGNC file '$hgnc_file'\n";}

  $hgnc_file = do { local $/; <$hgnc_fh> };


  my $input_file = Text::CSV->new({

    sep_char       => "\t",

    empty_is_undef => 1,

    binary         => 1,

    auto_diag      => 1

  }) or croak "Cannot use file $hgnc_file: ".Text::CSV->error_diag ();


  $hgnc_file = Encode::encode("UTF-8", $hgnc_file);

  $hgnc_file =~ s/"//xg;


  open my $hgnc_io, '<', \$hgnc_file or confess "Can't open HGNC file: $!\n";


  $input_file->column_names( @{ $input_file->getline( $hgnc_io ) } );


  while ( my $data = $input_file->getline_hr( $hgnc_io ) ) {

    my $gene_name   = $data->{'Approved symbol'};

    my $description = $data->{'Approved name'};


    $descriptions{$gene_name} = $description;

  }


  close $hgnc_io;


  return %descriptions;

}


1;