3 See the NOTICE file distributed with
this work
for additional information
4 regarding copyright ownership.
6 Licensed under the Apache License, Version 2.0 (the
"License");
7 you may not use
this file except in compliance with the License.
8 You may obtain a copy of the License at
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an
"AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License
for the specific language governing permissions and
16 limitations under the License.
20 # Parse RefSeq files to create xrefs.
22 package XrefParser::RefSeqParser;
34 my ($self, $ref_arg) = @_;
35 my $source_id = $ref_arg->{source_id};
36 my $species_id = $ref_arg->{species_id};
37 my $files = $ref_arg->{files};
38 my $release_file = $ref_arg->{rel_file};
39 my $verbose = $ref_arg->{verbose};
40 my $dbi = $ref_arg->{dbi};
41 $dbi = $self->dbi unless defined $dbi;
43 if((!defined $source_id) or (!defined $species_id) or (!defined $files) ){
44 croak
"Need to pass source_id, species_id and files as pairs";
48 my @files = @{$files};
51 my $peptide_source_id =
52 $self->get_source_id_for_source_name(
'RefSeq_peptide', undef, $dbi);
54 $self->get_source_id_for_source_name(
'RefSeq_mRNA', undef, $dbi);
56 $self->get_source_id_for_source_name(
'RefSeq_ncRNA', undef, $dbi);
58 my $pred_peptide_source_id =
59 $self->get_source_id_for_source_name(
'RefSeq_peptide_predicted', undef, $dbi);
60 my $pred_mrna_source_id =
61 $self->get_source_id_for_source_name(
'RefSeq_mRNA_predicted',
'refseq', $dbi);
62 my $pred_ncrna_source_id =
63 $self->get_source_id_for_source_name(
'RefSeq_ncRNA_predicted', undef, $dbi);
66 print
"RefSeq_peptide source ID = $peptide_source_id\n";
67 print
"RefSeq_mRNA source ID = $mrna_source_id\n";
68 print
"RefSeq_ncRNA source ID = $ncrna_source_id\n";
69 print
"RefSeq_peptide_predicted source ID = $pred_peptide_source_id\n";
70 print
"RefSeq_mRNA_predicted source ID = $pred_mrna_source_id\n" ;
71 print
"RefSeq_ncRNA_predicted source ID = $pred_ncrna_source_id\n" ;
75 foreach my $file (@files) {
78 $self->create_xrefs( $peptide_source_id,
79 $pred_peptide_source_id,
80 $mrna_source_id, $ncrna_source_id,
81 $pred_mrna_source_id, $pred_ncrna_source_id,
85 if ( !defined($xrefs) ) {
88 print
"Read " . scalar(@$xrefs) .
" xrefs from $file\n" if($verbose);
90 push @xrefs, @{$xrefs};
93 if ( !defined( $self->upload_xref_object_graphs( \@xrefs, $dbi ) ) ) {
97 if ( defined $release_file ) {
98 # Parse and set release info.
99 my $release_io = $self->get_filehandle($release_file);
101 my $release = $release_io->getline();
102 $release_io->close();
104 $release =~ s/\s{2,}/ /g;
106 s/.*(NCBI Reference Sequence.*) Distribution Release Notes.*/$1/s;
107 # Put a comma after the release number to make it more readable.
108 $release =~ s/Release (\d+)/Release $1,/;
110 print
"RefSeq release: '$release'\n";
112 $self->set_release( $source_id, $release, $dbi );
113 $self->set_release( $peptide_source_id, $release, $dbi );
114 $self->set_release( $mrna_source_id, $release, $dbi );
115 $self->set_release( $ncrna_source_id, $release, $dbi );
116 $self->set_release( $pred_peptide_source_id, $release, $dbi );
117 $self->set_release( $pred_mrna_source_id, $release, $dbi );
118 $self->set_release( $pred_ncrna_source_id, $release, $dbi );
121 return 0; # successfull
125 # --------------------------------------------------------------------------------
126 # Parse file into array of xref objects
127 # There are 2 types of RefSeq files that we are interested in:
128 # - protein sequence files *.protein.faa
129 # - mRNA sequence files *.rna.fna
130 # Slightly different formats
133 my ($self, $peptide_source_id, $pred_peptide_source_id,
134 $mrna_source_id, $ncrna_source_id,
135 $pred_mrna_source_id, $pred_ncrna_source_id, $file, $species_id, $dbi ) = @_;
137 # Create a hash of all valid names for this species
138 my %species2name = $self->species_id2name($dbi);
139 my @names = @{$species2name{$species_id}};
140 my %name2species_id =
map{ $_=>$species_id } @names;
141 # my %name2species_id = $self->name2species_id();
143 my $refseq_io = $self->get_filehandle($file);
145 if ( !defined $refseq_io ) {
146 print STDERR
"ERROR: Can't open RefSeq file $file\n";
154 while ( $_ = $refseq_io->getline() ) {
159 my ($header, $sequence) = split (/\n/, $entry, 2);
162 my @seq_lines = split (/\n/, $sequence);
163 $sequence = join(
"", @seq_lines);
165 (my $acc, my $description) = split(/\s/, $header, 2);
167 my ($species, $mrna);
168 if ($file =~ /\.faa(\.gz|\.Z)?$/) {
170 ($mrna, $description, $species) = $description =~ /(\S*)\s+(.*)\s+\[(.*)\]$/;
171 $xref->{SEQUENCE_TYPE} =
'peptide';
172 $xref->{STATUS} =
'experimental';
174 if ($acc =~ /^XP_/) {
175 $source_id = $pred_peptide_source_id;
177 $source_id = $peptide_source_id;
179 $xref->{SOURCE_ID} = $source_id;
181 } elsif ($file =~ /\.fna(\.gz|\.Z)?$/) {
183 ($species, $description) = $description =~ /\s*(\w+\s+\w+)\s+(.*)$/;
184 $xref->{SEQUENCE_TYPE} =
'dna';
185 $xref->{STATUS} =
'experimental';
187 if ($acc =~ /^XM_/ ){
188 $source_id = $pred_mrna_source_id;
189 } elsif( $acc =~ /^XR/) {
190 $source_id = $pred_ncrna_source_id;
191 } elsif( $acc =~ /^NM/) {
192 $source_id = $mrna_source_id;
193 } elsif( $acc =~ /^NR/) {
194 $source_id = $ncrna_source_id;
196 $xref->{SOURCE_ID} = $source_id;
200 if (!$species) { next; }
202 $species = lc $species;
205 my $species_id_check = $name2species_id{$species};
207 # skip xrefs for species that aren't in the species table
208 if ( defined $species_id
209 && defined $species_id_check
210 && $species_id == $species_id_check )
212 my ($acc_no_ver,$ver) = split (/\./,$acc);
213 $xref->{ACCESSION} = $acc_no_ver;
214 $xref->{VERSION} = $ver;
215 $xref->{LABEL} = $acc;
216 $xref->{DESCRIPTION} = $description;
217 $xref->{SEQUENCE} = $sequence;
218 $xref->{SPECIES_ID} = $species_id;
219 $xref->{INFO_TYPE} =
"SEQUENCE_MATCH";
221 # TODO synonyms, dependent xrefs etc
235 # --------------------------------------------------------------------------------