3 See the NOTICE file distributed with
this work
for additional information
4 regarding copyright ownership.
6 Licensed under the Apache License, Version 2.0 (the
"License");
7 you may not use
this file except in compliance with the License.
8 You may obtain a copy of the License at
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an
"AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License
for the specific language governing permissions and
16 limitations under the License.
20 # Parse UniProt (SwissProt & SPTrEMBL) files to create xrefs.
22 # Files actually contain both types of xref, distinguished by ID line;
24 # ID CYC_PIG Reviewed; 104 AA. Swissprot
25 # ID Q3ASY8_CHLCH Unreviewed; 36805 AA. SPTrEMBL
29 package XrefParser::UniProtParser;
34 use POSIX qw(strftime);
44 my ($self, $ref_arg) = @_;
45 my $source_id = $ref_arg->{source_id};
46 my $species_id = $ref_arg->{species_id};
47 my $species_name = $ref_arg->{species};
48 my $files = $ref_arg->{files};
49 my $release_file = $ref_arg->{rel_file};
50 my $verbose = $ref_arg->{verbose};
51 my $dbi = $ref_arg->{dbi};
52 $dbi = $self->dbi unless defined $dbi;
54 $hgnc_file = $ref_arg->{hgnc_file} || undef;
56 if((!defined $source_id) or (!defined $species_id) or (!defined $files)){
57 croak
"Need to pass source_id, species_id, files and rel_file as pairs";
61 my $file = @{$files}[0];
63 my ( $sp_source_id, $sptr_source_id, $sp_release, $sptr_release, $sptr_non_display_source_id, $sp_direct_source_id, $sptr_direct_source_id, $isoform_source_id );
66 $self->get_source_id_for_source_name(
'Uniprot/SWISSPROT',
'sequence_mapped', $dbi);
68 $self->get_source_id_for_source_name(
'Uniprot/SPTREMBL',
'sequence_mapped', $dbi);
70 $sptr_non_display_source_id =
71 $self->get_source_id_for_source_name(
'Uniprot/SPTREMBL',
'protein_evidence_gt_2', $dbi);
73 $sp_direct_source_id = $self->get_source_id_for_source_name(
'Uniprot/SWISSPROT',
'direct', $dbi);
74 $sptr_direct_source_id = $self->get_source_id_for_source_name(
'Uniprot/SPTREMBL',
'direct', $dbi);
76 $isoform_source_id = $self->get_source_id_for_source_name(
'Uniprot_isoform');
78 print
"SwissProt source id for $file: $sp_source_id\n" if ($verbose);
79 print
"SpTREMBL source id for $file: $sptr_source_id\n" if ($verbose);
80 print
"SpTREMBL protein_evidence > 2 source id for $file: $sptr_non_display_source_id\n" if ($verbose);
81 print
"SwissProt direct source id for $file: $sp_direct_source_id\n" if ($verbose);
82 print
"SpTREMBL direct source id for $file: $sptr_direct_source_id\n" if ($verbose);
84 $self->create_xrefs( $sp_source_id, $sptr_source_id, $sptr_non_display_source_id, $species_id,
85 $file, $verbose, $sp_direct_source_id, $sptr_direct_source_id, $isoform_source_id, $dbi );
87 if ( defined $release_file ) {
88 # Parse Swiss-Prot and SpTrEMBL release info from
90 my $release_io = $self->get_filehandle($release_file);
91 while ( defined( my $line = $release_io->getline() ) ) {
92 if ( $line =~ m#(UniProtKB/Swiss-Prot Release .*)# ) {
94 print
"Swiss-Prot release is '$sp_release'\n" if($verbose);
95 } elsif ( $line =~ m#(UniProtKB/TrEMBL Release .*)# ) {
97 print
"SpTrEMBL release is '$sptr_release'\n" if($verbose);
100 $release_io->close();
103 $self->set_release( $sp_source_id, $sp_release, $dbi );
104 $self->set_release( $sptr_source_id, $sptr_release, $dbi );
105 $self->set_release( $sptr_non_display_source_id, $sptr_release, $dbi );
106 $self->set_release( $sp_direct_source_id, $sp_release, $dbi );
107 $self->set_release( $sptr_direct_source_id,$sptr_release, $dbi );
111 return 0; # successfull
115 # --------------------------------------------------------------------------------
116 # Parse file into array of xref objects
119 my ($self, $sp_source_id, $sptr_source_id, $sptr_non_display_source_id, $species_id, $file, $verbose, $sp_direct_source_id, $sptr_direct_source_id, $isoform_source_id, $dbi ) = @_;
124 my $num_sptr_pred = 0;
125 my $num_sptr_non_display = 0;
126 my $num_direct_sp = 0;
127 my $num_direct_sptr = 0;
129 my %dependent_sources = $self->get_xref_sources($dbi);
132 %{ $self->get_valid_codes(
"mim_gene", $species_id, $dbi ) };
134 %{ $self->get_valid_codes(
"mim_morbid", $species_id, $dbi ) };
136 # Extract descriptions from hgnc
137 my %hgnc_descriptions;
139 %hgnc_descriptions = $self->get_hgnc_descriptions($hgnc_file);
142 my $uniprot_io = $self->get_filehandle($file);
143 if ( !defined $uniprot_io ) {
return }
149 # Create a hash of all valid taxon_ids for this species
150 my %species2tax = $self->species_id2taxonomy($dbi);
151 push @{$species2tax{$species_id}}, $species_id;
152 my @tax_ids = @{$species2tax{$species_id}};
153 my %taxonomy2species_id =
map{ $_=>$species_id } @tax_ids;
156 my $ensembl_derived_protein_count = 0;
158 # Counter to process file in batches
161 while ( $_ = $uniprot_io->getline() ) {
163 # if an OX line exists, only store the xref if the taxonomy ID that the OX
164 # line refers to is in the species table
165 # due to some records having more than one tax_id, we need to check them
166 # all and only proceed if one of them matches.
167 #OX NCBI_TaxID=158878, 158879;
168 #OX NCBI_TaxID=103690;
170 my ($ox) = $_ =~ /OX\s+[a-zA-Z_]+=([0-9 ,]+).*;/;
175 @ox = split /\, /, $ox;
177 # my %taxonomy2species_id = $self->taxonomy2species_id();
179 foreach my $taxon_id_from_file (@ox) {
180 $taxon_id_from_file =~ s/\s
181 if ( exists $taxonomy2species_id{$taxon_id_from_file} ){
188 next
if (!$found); # no taxon_id
's match, so skip to next record
191 # set accession (and synonyms if more than one)
192 # AC line may have primary accession and possibly several ; separated synonyms
193 # May also be more than one AC line
194 my ($acc) = $_ =~ /(\nAC\s+.+)/s; # will match first AC line and everything else
196 my @all_lines = split /\n/, $acc;
198 # Check for CC (caution) lines containing certain text
199 # If sequence is from Ensembl, do not use
200 my $ensembl_derived_protein = 0;
201 if ($_ =~ /CAUTION: The sequence shown here is derived from an Ensembl/) {
202 $ensembl_derived_protein = 1;
203 $ensembl_derived_protein_count++;
206 # extract ^AC lines only & build list of accessions
208 foreach my $line (@all_lines) {
209 my ($accessions_only) = $line =~ /^AC\s+(.+)/;
210 push(@accessions, (split /;\s*/, $accessions_only)) if ($accessions_only);
215 if(lc($accessions[0]) eq "unreviewed"){
216 print "WARNING: entries with accession of $acc not allowed will be skipped\n";
219 $xref->{INFO_TYPE} = "SEQUENCE_MATCH";
220 $xref->{ACCESSION} = $accessions[0];
221 for (my $a=1; $a <= $#accessions; $a++) {
222 push(@{$xref->{"SYNONYMS"} }, $accessions[$a]);
225 my ($label, $sp_type) = $_ =~ /ID\s+(\w+)\s+(\w+)/;
226 my ($protein_evidence_code) = $_ =~ /PE\s+(\d+)/;
227 # Capture line with entry version
228 # Example: DT 22-APR-2020, entry version 1.
229 my ($version) = $_ =~ /DT\s+\d+-\w+-\d+, entry version (\d+)/;
231 # SwissProt/SPTrEMBL are differentiated by having STANDARD/PRELIMINARY here
232 if ($sp_type =~ /^Reviewed/i) {
234 $xref->{SOURCE_ID} = $sp_source_id;
236 } elsif ($sp_type =~ /Unreviewed/i) {
238 #Use normal source only if it is PE levels 1 & 2
239 if (defined($protein_evidence_code) && $protein_evidence_code < 3) {
240 $xref->{SOURCE_ID} = $sptr_source_id;
243 $xref->{SOURCE_ID} = $sptr_non_display_source_id;
244 $num_sptr_non_display++;
249 next; # ignore if it's neither one nor t
'other
255 # some straightforward fields
256 # the previous $label flag of type BRCA2_HUMAN is not used in Uniprot any more, use accession instead
257 $xref->{LABEL} = $accessions[0] ."." . $version;
258 $xref->{VERSION} = $version;
259 $xref->{SPECIES_ID} = $species_id;
260 $xref->{SEQUENCE_TYPE} = 'peptide
';
261 $xref->{STATUS} = 'experimental
';
263 # May have multi-line descriptions
264 my ($description_and_rest) = $_ =~ /(DE\s+.*)/s;
265 @all_lines = split /\n/, $description_and_rest;
267 # extract ^DE lines only & build cumulative description string
268 my $description = "";
270 my $sub_description = "";
272 foreach my $line (@all_lines) {
274 next if(!($line =~ /^DE/));
277 if($line =~ /^DE RecName: Full=(.*);/){
278 $name .= ';
' if $name ne q{}; #separate multiple sub-names with a ';
'
281 elsif($line =~ /RecName: Full=(.*);/){
282 $description .= ' ' if $description ne q{}; #separate the description bit with just a space
285 elsif($line =~ /SubName: Full=(.*);/){
286 $name .= ';
' if $name ne q{}; #separate multiple sub-names with a ';
'
291 $description =~ s/^\s*//g;
292 $description =~ s/\s*$//g;
295 my $desc = $name.' '.$description;
297 $desc = $sub_description;
300 $desc =~ s/\s*\{ECO:.*?\}//g;
301 $xref->{DESCRIPTION} = $desc;
303 # Parse the EC_NUMBER line, only for S.cerevisiae for now
305 if (($line =~ /EC=/) && ($species_id == 4932)) {
307 #print STDERR "EC Number line: $line\n";
309 $line =~ /^DE\s+EC=([^;]+);/;
311 # Get the EC Number and make it an xref for S.cer if any
315 #print STDERR "EC after processing: $EC\n";
319 $depe{ACCESSION} = $EC;
321 $depe{SOURCE_NAME} = "EC_NUMBER";
323 $depe{SOURCE_ID} = $dependent_sources{"EC_NUMBER"};
324 $depe{LINKAGE_SOURCE_ID} = $xref->{SOURCE_ID};
325 push @{$xref->{DEPENDENT_XREFS}}, \%depe;
326 $dependent_xrefs{"EC_NUMBER"}++;
332 my ($seq) = $_ =~ /SQ\s+(.+)/s; # /s allows . to match newline
333 my @seq_lines = split /\n/, $seq;
335 foreach my $x (@seq_lines) {
338 $parsed_seq =~ s/\/\///g; # remove trailing end-of-record character
339 $parsed_seq =~ s/\s//g; # remove whitespace
340 $parsed_seq =~ s/^.*;//g; # remove everything before last ;
342 $xref->{SEQUENCE} = $parsed_seq;
343 #print "Adding " . $xref->{ACCESSION} . " " . $xref->{LABEL} ."\n";
346 my ($gns) = $_ =~ /(GN\s+.+)/s;
349 foreach my $gn_line (split("\n", $gns)) {
350 if ($gn_line !~ /^GN/) {last;}
352 $gn_line =~ s/^GN\s+//g;
353 push(@gn_lines, $gn_line);
356 $gns = join('', @gn_lines);
357 @gn_lines = split /;/, $gns;
360 # Do not allow the addition of UniProt Gene Name dependent Xrefs
361 # if the protein was imported from Ensembl. Otherwise we will
362 # re-import previously set symbols
363 if(! $ensembl_derived_protein) {
365 foreach my $gn (@gn_lines){
366 my $gene_name = undef;
368 if ($gn =~ /Name=([A-Za-z0-9_\-\.\s:]+)/s) { #/s for multi-line entries ; is the delimiter
370 # GN Name=ctrc {ECO:0000313|Xenbase:XB-GENE-5790348};
372 $name =~ s/\s+$//g; # Remove white spaces that are left over at the end if there was an evidence code
373 $depe{LABEL} = $name; # leave name as is, upper/lower case is relevant in gene names
374 $depe{ACCESSION} = $self->get_name($xref->{ACCESSION},$depe{LABEL});
375 $gene_name = $depe{ACCESSION};
377 $depe{SOURCE_NAME} = "Uniprot_gn";
378 $depe{SOURCE_ID} = $dependent_sources{"Uniprot_gn"};
379 $depe{LINKAGE_SOURCE_ID} = $xref->{SOURCE_ID};
380 $depe{DESCRIPTION} = $hgnc_descriptions{$name} if ($hgnc_file && defined($hgnc_descriptions{$name}));
381 push @{$xref->{DEPENDENT_XREFS}}, \%depe;
382 $dependent_xrefs{"Uniprot_gn"}++;
385 if($gn =~ /Synonyms=(.*)/s){ # use of /s as synonyms can be across more than one line
387 # GN Synonyms=cela2a {ECO:0000313|Ensembl:ENSXETP00000014934},
388 # GN MGC79767 {ECO:0000313|EMBL:AAH80976.1}
390 $syn =~ s/{.*}//g; # Remove any potential evidence codes
391 $syn =~ s/\n//g; # Remove return carriages, as entry can span several lines
392 $syn =~ s/\s+$//g; # Remove white spaces that are left over at the end if there was an evidence code
393 #$syn =~ s/^\s+//g; # Remove white spaces that are left over at the beginning if there was an evidence code
394 $syn =~ s/\s+,/,/g; # Remove white spaces that are left over before the comma if there was an evidence code
395 @syn = split(/, /,$syn);
396 push (@{$depe{"SYNONYMS"}}, @syn);
401 # dependent xrefs - only store those that are from sources listed in the source table
402 my ($deps) = $_ =~ /(DR\s+.+)/s; # /s allows . to match newline
405 if ( defined $deps ) { @dep_lines = split /\n/, $deps }
407 my %seen=(); # per record basis
409 foreach my $dep (@dep_lines) {
410 #both GO and UniGene have the own sources so ignore those in the uniprot files
411 #as the uniprot data should be older
412 if($dep =~ /GO/ || $dep =~ /UniGene/){
415 if ($dep =~ /^DR\s+(.+)/) {
416 my ($source, $acc, @extra) = split /;\s*/, $1;
417 if($source =~ "RGD"){ #using RGD file now instead.
420 if($source =~ "CCDS"){
423 if($source =~ "IPI"){
426 if($source =~ "UCSC"){
429 if($source =~ "SGD"){
432 if($source =~ "HGNC"){
435 # We get the mappings directly from the source
436 if($source =~ "MGI"){
439 # Nomenclature data is imported directly from the source
440 if($source =~ "VGNC"){
443 if($source =~ "Orphanet"){
444 #we don't want to parse Orphanet xrefs via Uniprot, we get them from Orphanet with descriptions
447 if($source =~
"ArrayExpress"){
450 if($source =~
"GenomeRNAi" || $source =~
"EPD"){
453 if($source =~
"Xenbase"){
456 # Uniprot get Reactome links from Reactome, so we want to get the info from Reactome directly
457 if($source =~
"Reactome"){
460 # MIM xrefs are already imported separately, ignore from Uniprot
461 # Also, Uniprot deals with proteins, not appropriate for gene level xrefs
462 if ($source =~
"MIM_GENE" || $source =~
"MIM_MORBID" || $source =~
"MIM") {
465 # GeneCards xrefs are imported through the HGNC file
466 if ($source =~
"GeneCards") {
469 # If mapped to Ensembl, add as direct xref
470 if ($source eq
"Ensembl") {
472 # DR Ensembl; ENST00000380152; ENSP00000369497; ENSG00000139618.
473 # DR Ensembl; ENST00000372839; ENSP00000361930; ENSG00000166913. [P31946-1]
474 # $source is Ensembl, $acc is ENST00000380152 and @extra is the rest of the line
475 # If the UniProt accession is repeated here, it links to a specific isoform
479 my $stable_id = $extra[0];
480 $stable_id =~ s/\.[0-9]+
481 $direct{STABLE_ID} = $stable_id;
482 $direct{ENSEMBL_TYPE} =
'Translation';
483 $direct{LINKAGE_TYPE} =
'DIRECT';
484 if ($xref->{SOURCE_ID} == $sp_source_id) {
485 $direct{SOURCE_ID} = $sp_direct_source_id;
488 $direct{SOURCE_ID} = $sptr_direct_source_id;
491 push @{$xref->{DIRECT_XREFS}}, \%direct;
493 my $uniprot_acc = $accessions[0];
494 if ($extra[1] =~ /($accessions[0]-[0-9]+)/) {
496 $self->add_to_direct_xrefs({
497 stable_id => $stable_id,
498 type =>
'translation',
502 source_id => $isoform_source_id,
504 species_id => $species_id
508 if (exists $dependent_sources{$source} ) {
509 # create dependent xref structure & store it
511 $dep{SOURCE_NAME} = $source;
512 $dep{LINKAGE_SOURCE_ID} = $xref->{SOURCE_ID};
513 $dep{SOURCE_ID} = $dependent_sources{$source};
515 if($source =~ /HGNC/){
518 $dep{LABEL} = $extra[0];
520 $dep{ACCESSION} = $acc;
522 # $dep{ACCESSION} = $acc;
523 $dependent_xrefs{ $dep{SOURCE_NAME} }++; # get count of depenent xrefs.
524 if(!defined($seen{$dep{SOURCE_NAME}.
":".$dep{ACCESSION}})){
525 push @{$xref->{DEPENDENT_XREFS}}, \%dep; # array of hashrefs
526 $seen{$dep{SOURCE_NAME}.
":".$dep{ACCESSION}} =1;
528 if($dep =~ /EMBL/ && !($dep =~ /ChEMBL/)){
529 my ($protein_id) = $extra[0];
530 if(($protein_id ne
"-") and (!defined($seen{$source.
":".$protein_id}))){
532 $dep2{SOURCE_NAME} = $source;
533 $dep2{SOURCE_ID} = $dependent_sources{
"protein_id"};
534 $dep2{LINKAGE_SOURCE_ID} = $xref->{SOURCE_ID};
535 # store accession unversioned
536 $dep2{LABEL} = $protein_id;
537 my ($prot_acc, $prot_version) = $protein_id =~ /([^.]+)\.([^.]+)/;
538 $dep2{ACCESSION} = $prot_acc;
539 $dependent_xrefs{ $dep2{SOURCE_NAME} }++; # get count of dependent xrefs.
540 $seen{$source.
":".$protein_id} = 1;
541 push @{$xref->{DEPENDENT_XREFS}}, \%dep2; # array of hashrefs
551 $self->upload_xref_object_graphs(\@xrefs, $dbi);
558 $self->upload_xref_object_graphs(\@xrefs, $dbi)
if scalar(@xrefs) > 0;
560 $uniprot_io->close();
562 print
"Read $num_sp SwissProt xrefs, $num_sptr SPTrEMBL xrefs with protein evidence codes 1-2, and $num_sptr_non_display SPTrEMBL xrefs with protein evidence codes > 2 from $file\n" if($verbose);
563 print
"Added $num_direct_sp direct SwissProt xrefs and $num_direct_sptr direct SPTrEMBL xrefs\n" if ($verbose);
564 print
"Found $num_sp_pred predicted SwissProt xrefs and $num_sptr_pred predicted SPTrEMBL xrefs\n" if (($num_sp_pred > 0 || $num_sptr_pred > 0) and $verbose);
565 print
"Skipped $ensembl_derived_protein_count ensembl annotations as Gene names\n";
568 # print "$kount gene anmes added\n";
570 print
"Added the following dependent xrefs:-\n" if($verbose);
571 foreach my $key (keys %dependent_xrefs){
572 print $key.
"\t".$dependent_xrefs{$key}.
"\n" if($verbose);
574 print
"End.\n" if ($verbose);
576 #TODO - currently include records from other species - filter on OX line??
587 sub get_hgnc_descriptions {
588 my ($self, $hgnc_file) = @_;
591 my $hgnc_fh = $self->get_filehandle($hgnc_file);
592 if ( !defined $hgnc_fh ) {confess
"Can't open HGNC file '$hgnc_file'\n";}
593 $hgnc_file =
do { local $/; <$hgnc_fh> };
595 my $input_file = Text::CSV->new({
600 }) or croak
"Cannot use file $hgnc_file: ".Text::CSV->error_diag ();
602 $hgnc_file = Encode::encode(
"UTF-8", $hgnc_file);
603 $hgnc_file =~ s/
"//xg;
605 open my $hgnc_io, '<', \$hgnc_file or confess "Can
't open HGNC file: $!\n";
607 $input_file->column_names( @{ $input_file->getline( $hgnc_io ) } );
609 while ( my $data = $input_file->getline_hr( $hgnc_io ) ) {
610 my $gene_name = $data->{'Approved symbol
'};
611 my $description = $data->{'Approved name
'};
613 $descriptions{$gene_name} = $description;
618 return %descriptions;