3 See the NOTICE file distributed with
this work
for additional information
4 regarding copyright ownership.
6 Licensed under the Apache License, Version 2.0 (the
"License");
7 you may not use
this file except in compliance with the License.
8 You may obtain a copy of the License at
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an
"AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License
for the specific language governing permissions and
16 limitations under the License.
20 # Parse RefSeq GPFF files to create xrefs.
22 package XrefParser::RefSeqGPFFParser;
30 my $peptide_source_id;
33 my $pred_peptide_source_id ;
34 my $pred_mrna_source_id ;
35 my $pred_ncrna_source_id ;
42 my ($self, $ref_arg) = @_;
43 my $source_id = $ref_arg->{source_id};
44 my $species_id = $ref_arg->{species_id};
45 my $species_name = $ref_arg->{species};
46 my $files = $ref_arg->{files};
47 my $release_file = $ref_arg->{rel_file};
48 my $verbose = $ref_arg->{verbose};
49 my $dbi = $ref_arg->{dbi};
50 $dbi = $self->dbi unless defined $dbi;
52 if((!defined $source_id) or (!defined $species_id) or (!defined $files)){
53 croak
"Need to pass source_id, species_id, files and rel_file as pairs";
57 my @files = @{$files};
61 $self->get_source_id_for_source_name(
'RefSeq_peptide', undef, $dbi);
63 $self->get_source_id_for_source_name(
'RefSeq_mRNA',
'refseq', $dbi);
65 $self->get_source_id_for_source_name(
'RefSeq_ncRNA', undef, $dbi);
67 $pred_peptide_source_id =
68 $self->get_source_id_for_source_name(
'RefSeq_peptide_predicted', undef, $dbi);
69 $pred_mrna_source_id =
70 $self->get_source_id_for_source_name(
'RefSeq_mRNA_predicted',
'refseq', $dbi);
71 $pred_ncrna_source_id =
72 $self->get_source_id_for_source_name(
'RefSeq_ncRNA_predicted', undef, $dbi);
74 $entrez_source_id = $self->get_source_id_for_source_name(
'EntrezGene', undef, $dbi);
75 $wiki_source_id = $self->get_source_id_for_source_name(
'WikiGene', undef, $dbi);
78 print
"RefSeq_peptide source ID = $peptide_source_id\n";
79 print
"RefSeq_mRNA source ID = $mrna_source_id\n";
80 print
"RefSeq_ncRNA source ID = $ncrna_source_id\n";
81 print
"RefSeq_peptide_predicted source ID = $pred_peptide_source_id\n";
82 print
"RefSeq_mRNA_predicted source ID = $pred_mrna_source_id\n" ;
83 print
"RefSeq_ncRNA_predicted source ID = $pred_ncrna_source_id\n" ;
86 (%entrez) = %{$self->get_acc_to_label(
"EntrezGene",$species_id, undef, $dbi)};
89 foreach my $file (@files) {
91 $self->create_xrefs( $file, $species_id, $verbose, $dbi, $species_name );
93 if ( !defined( $xrefs ) ) {
96 $self->upload_xref_object_graphs( $xrefs, $dbi )
99 if ( defined $release_file ) {
100 # Parse and set release info.
101 my $release_io = $self->get_filehandle($release_file);
103 my $release = $release_io->getline();
104 $release_io->close();
106 $release =~ s/\s{2,}/ /g;
107 $release =~ s/.*(NCBI Reference Sequence.*) Distribution.*/$1/s;
108 # Put a comma after the release number to make it more readable.
109 $release =~ s/Release (\d+)/Release $1,/;
111 print
"RefSeq release: '$release'\n" if($verbose);
113 $self->set_release( $source_id, $release, $dbi );
114 $self->set_release( $peptide_source_id, $release, $dbi );
115 $self->set_release( $mrna_source_id, $release, $dbi );
116 $self->set_release( $ncrna_source_id, $release, $dbi );
117 $self->set_release( $pred_mrna_source_id, $release, $dbi );
118 $self->set_release( $pred_ncrna_source_id, $release, $dbi );
119 $self->set_release( $pred_peptide_source_id, $release, $dbi );
122 return 0; # successful
125 # --------------------------------------------------------------------------------
126 # Parse file into array of xref objects
127 # There are 2 types of RefSeq files that we are interested in:
128 # - protein sequence files *.protein.faa
129 # - mRNA sequence files *.rna.fna
130 # Slightly different formats
133 my ($self, $file,$species_id, $verbose, $dbi, $species_name ) = @_;
135 # Create a hash of all valid names and taxon_ids for this species
136 my %species2name = $self->species_id2name($dbi);
137 if (defined $species_name) { push @{$species2name{$species_id}}, $species_name; }
138 if (!defined $species2name{$species_id}) {
return; }
139 my %species2tax = $self->species_id2taxonomy($dbi);
140 push @{$species2tax{$species_id}}, $species_id;
141 my @names = @{$species2name{$species_id}};
142 my @tax_ids = @{$species2tax{$species_id}};
143 my %name2species_id =
map{ $_=>$species_id } @names;
144 my %taxonomy2species_id =
map{ $_=>$species_id } @tax_ids;
146 # Retrieve existing RefSeq mRNA
147 my (%refseq_ids) = (%{ $self->get_valid_codes(
"RefSeq_mRNA", $species_id, $dbi) }, %{ $self->get_valid_codes(
"RefSeq_mRNA_predicted", $species_id, $dbi) });
148 my (%entrez_ids) = %{ $self->get_valid_codes(
"EntrezGene", $species_id, $dbi) };
149 my (%wiki_ids) = %{ $self->get_valid_codes(
"WikiGene", $species_id, $dbi) };
152 my %dependent_sources = $self->get_xref_sources($dbi);
154 my $add_dependent_xref_sth = $dbi->prepare(
"INSERT INTO dependent_xref (master_xref_id,dependent_xref_id, linkage_source_id) VALUES (?,?, $entrez_source_id)");
156 my $refseq_io = $self->get_filehandle($file);
158 if ( !defined $refseq_io ) {
159 print STDERR
"ERROR: Can't open RefSeqGPFF file $file\n";
167 my $type = $self->type_from_file($file);
170 while ( my $entry = $refseq_io->getline() ) {
172 my $xref = $self->xref_from_record(
174 \%name2species_id, \%taxonomy2species_id,
175 $pred_mrna_source_id, $pred_ncrna_source_id,
176 $mrna_source_id, $ncrna_source_id,
177 $pred_peptide_source_id, $peptide_source_id,
178 $entrez_source_id, $wiki_source_id, $add_dependent_xref_sth,
179 $species_id, $type, \%refseq_ids,\%entrez_ids,\%wiki_ids
182 push @xrefs, $xref
if $xref;
188 print
"Read " . scalar(@xrefs) .
" xrefs from $file\n" if($verbose);
194 my ($self, $file) = @_;
195 return 'peptide' if $file =~ /RefSeq_protein/;
196 return 'dna' if $file =~ /rna/;
197 return 'peptide' if $file =~ /protein/;
198 print STDERR
"Could not work out sequence type for $file\n";
201 sub xref_from_record {
202 my ( $self, $entry, $name2species_id, $taxonomy2species_id,
203 $pred_mrna_source_id, $pred_ncrna_source_id,
204 $mrna_source_id, $ncrna_source_id,
205 $pred_peptide_source_id, $peptide_source_id,
206 $entrez_source_id, $wiki_source_id, $add_dependent_xref_sth,
207 $species_id, $type, $refseq_ids,$entrez_ids,$wiki_ids
211 my ($species) = $entry =~ /\s+ORGANISM\s+(.*)\n/;
212 $species = lc $species;
214 $species =~ s/\s*\(.+\)
215 $species =~ s/\s+/_/g;
217 my $species_id_check = $name2species_id->{$species};
219 # Try going through the taxon ID if species check didn't work.
220 if ( !defined $species_id_check ) {
221 my ($taxon_id) = $entry =~ /db_xref=
"taxon:(\d+)"/;
222 $species_id_check = $taxonomy2species_id->{$taxon_id};
225 # skip xrefs for species that aren't in the species table
226 if ( defined $species_id
227 && defined $species_id_check
228 && $species_id == $species_id_check )
231 my ($acc) = $entry =~ /ACCESSION\s+(\S+)/;
232 my ($ver) = $entry =~ /VERSION\s+(\S+)/;
233 my ($refseq_pair) = $entry =~ /DBSOURCE\s+REFSEQ:
accession (\S+)/;
235 # get the right source ID based on $type and whether this is predicted (X*) or not
237 if ($type =~ /dna/) {
238 if ($acc =~ /^XM_/ ){
239 $source_id = $pred_mrna_source_id;
240 } elsif( $acc =~ /^XR/) {
241 $source_id = $pred_ncrna_source_id;
242 } elsif( $acc =~ /^NM/) {
243 $source_id = $mrna_source_id;
244 } elsif( $acc =~ /^NR/) {
245 $source_id = $ncrna_source_id;
248 elsif ($type =~ /peptide/) {
249 if ($acc =~ /^XP_/) {
250 $source_id = $pred_peptide_source_id;
252 $source_id = $peptide_source_id;
255 print
"Warning: can't get source ID for $type $acc\n" if (!$source_id);
257 # Description - may be multi-line
258 my ($description) = $entry =~ /DEFINITION\s+([^[]+)/s;
259 print $entry
if (length($description) == 0);
260 $description =~ s/\nACCESSION.*
262 $description =~ s/{.*}-like
263 $description =~ s/{.*}
264 $description =~ s/\s+/ /g;
265 $description = substr($description, 0, 255) if (length($description) > 255);
267 my ($seq) = $entry =~ /^\s*ORIGIN\s+(.+)/ms;
# /s allows . to match newline
268 my @seq_lines = split /\n/, $seq;
270 foreach my $x (@seq_lines) {
271 my ($seq_only) = $x =~ /^\s*\d+\s+(.*)$/;
272 next
if (!defined $seq_only);
273 $parsed_seq .= $seq_only;
276 $parsed_seq =~ s#\s##g; # remove whitespace
278 ( my $acc_no_ver, $ver ) = split( /\./, $ver );
280 $xref->{ACCESSION} = $acc;
281 if($acc eq $acc_no_ver){
282 $xref->{VERSION} = $ver;
285 print
"$acc NE $acc_no_ver\n";
288 $xref->{LABEL} = $acc .
"\." . $ver;
289 $xref->{DESCRIPTION} = $description;
290 $xref->{SOURCE_ID} = $source_id;
291 $xref->{SEQUENCE} = $parsed_seq;
292 $xref->{SEQUENCE_TYPE} = $type;
293 $xref->{SPECIES_ID} = $species_id;
294 $xref->{INFO_TYPE} =
"SEQUENCE_MATCH";
296 # TODO experimental/predicted
298 my @EntrezGeneIDline = $entry =~ /db_xref=.GeneID:(\d+)/g;
299 # my @SGDGeneIDline = $entry =~ /db_xref=.SGD:(S\d+)/g;
300 my @protein_id = $entry =~ /\/protein_id=.(\S+_\d+)/g;
301 my @coded_by = $entry =~ /\/coded_by=.(\w+_\d+)/g;
303 foreach my $cb (@coded_by){
306 if (!defined $xref->{PAIR}) {
307 $xref->{PAIR} = $refseq_pair;
310 foreach my $pi (@protein_id){
311 $xref->{PROTEIN} = $pi;
314 foreach my $ll (@EntrezGeneIDline) {
316 #my $entrez_source = $dependent_sources{EntrezGene} || die( 'No source for EntrezGene!' );
317 #my $wiki_source = $dependent_sources{WikiGene} || die( 'No source for WikiGene!' );
318 if (defined $entrez{$ll}) {
319 $dep{SOURCE_ID} = $entrez_source_id;
320 $dep{LINKAGE_SOURCE_ID} = $source_id;
321 $dep{ACCESSION} = $ll;
322 $dep{LABEL} = $entrez{$ll};
323 push @{$xref->{DEPENDENT_XREFS}}, \%dep;
326 $dep2{SOURCE_ID} = $wiki_source_id;
327 $dep2{LINKAGE_SOURCE_ID} = $source_id;
328 $dep2{ACCESSION} = $ll;
329 $dep2{LABEL} = $entrez{$ll};
330 push @{$xref->{DEPENDENT_XREFS}}, \%dep2;
332 # Add xrefs for RefSeq mRNA as well where available
333 $refseq_pair =~ s/\.[0-9]*
334 if (defined $refseq_pair) {
335 if ($refseq_ids->{$refseq_pair}) {
336 foreach my $refseq_id (@{ $refseq_ids->{$refseq_pair} }) {
337 foreach my $entrez_id (@{ $entrez_ids->{$ll} }) {
338 $add_dependent_xref_sth->execute($refseq_id, $entrez_id);
340 foreach my $wiki_id (@{ $wiki_ids->{$ll} }) {
341 $add_dependent_xref_sth->execute($refseq_id, $wiki_id);
349 # Don't add SGD Xrefs, as they are mapped directly from SGD ftp site
351 # Refseq's do not tell whether the mim is for the gene of morbid so ignore for now.
356 # --------------------------------------------------------------------------------