ensembl-hive  2.8.1
RefSeqGPFFParser.pm
Go to the documentation of this file.
1 =head1 LICENSE
2 
3 See the NOTICE file distributed with this work for additional information
4 regarding copyright ownership.
5 
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9 
10  http://www.apache.org/licenses/LICENSE-2.0
11 
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 
18 =cut
19 
20 # Parse RefSeq GPFF files to create xrefs.
21 
22 package XrefParser::RefSeqGPFFParser;
23 
24 use strict;
25 use warnings;
26 use Carp;
27 use File::Basename;
28 
29 use base qw( XrefParser::BaseParser );
30 my $peptide_source_id;
31 my $mrna_source_id ;
32 my $ncrna_source_id ;
33 my $pred_peptide_source_id ;
34 my $pred_mrna_source_id ;
35 my $pred_ncrna_source_id ;
36 my $entrez_source_id;
37 my $wiki_source_id;
38 my %entrez;
39 
40 sub run {
41 
42  my ($self, $ref_arg) = @_;
43  my $source_id = $ref_arg->{source_id};
44  my $species_id = $ref_arg->{species_id};
45  my $species_name = $ref_arg->{species};
46  my $files = $ref_arg->{files};
47  my $release_file = $ref_arg->{rel_file};
48  my $verbose = $ref_arg->{verbose};
49  my $dbi = $ref_arg->{dbi};
50  $dbi = $self->dbi unless defined $dbi;
51 
52  if((!defined $source_id) or (!defined $species_id) or (!defined $files)){
53  croak "Need to pass source_id, species_id, files and rel_file as pairs";
54  }
55  $verbose |=0;
56 
57  my @files = @{$files};
58 
59 
60  $peptide_source_id =
61  $self->get_source_id_for_source_name('RefSeq_peptide', undef, $dbi);
62  $mrna_source_id =
63  $self->get_source_id_for_source_name('RefSeq_mRNA','refseq', $dbi);
64  $ncrna_source_id =
65  $self->get_source_id_for_source_name('RefSeq_ncRNA', undef, $dbi);
66 
67  $pred_peptide_source_id =
68  $self->get_source_id_for_source_name('RefSeq_peptide_predicted', undef, $dbi);
69  $pred_mrna_source_id =
70  $self->get_source_id_for_source_name('RefSeq_mRNA_predicted','refseq', $dbi);
71  $pred_ncrna_source_id =
72  $self->get_source_id_for_source_name('RefSeq_ncRNA_predicted', undef, $dbi);
73 
74  $entrez_source_id = $self->get_source_id_for_source_name('EntrezGene', undef, $dbi);
75  $wiki_source_id = $self->get_source_id_for_source_name('WikiGene', undef, $dbi);
76 
77  if($verbose){
78  print "RefSeq_peptide source ID = $peptide_source_id\n";
79  print "RefSeq_mRNA source ID = $mrna_source_id\n";
80  print "RefSeq_ncRNA source ID = $ncrna_source_id\n";
81  print "RefSeq_peptide_predicted source ID = $pred_peptide_source_id\n";
82  print "RefSeq_mRNA_predicted source ID = $pred_mrna_source_id\n" ;
83  print "RefSeq_ncRNA_predicted source ID = $pred_ncrna_source_id\n" ;
84  }
85 
86  (%entrez) = %{$self->get_acc_to_label("EntrezGene",$species_id, undef, $dbi)};
87 
88  my @xrefs;
89  foreach my $file (@files) {
90  my $xrefs =
91  $self->create_xrefs( $file, $species_id, $verbose, $dbi, $species_name );
92 
93  if ( !defined( $xrefs ) ) {
94  return 1; #error
95  }
96  $self->upload_xref_object_graphs( $xrefs, $dbi )
97  }
98 
99  if ( defined $release_file ) {
100  # Parse and set release info.
101  my $release_io = $self->get_filehandle($release_file);
102  local $/ = "\n*";
103  my $release = $release_io->getline();
104  $release_io->close();
105 
106  $release =~ s/\s{2,}/ /g;
107  $release =~ s/.*(NCBI Reference Sequence.*) Distribution.*/$1/s;
108  # Put a comma after the release number to make it more readable.
109  $release =~ s/Release (\d+)/Release $1,/;
110 
111  print "RefSeq release: '$release'\n" if($verbose);
112 
113  $self->set_release( $source_id, $release, $dbi );
114  $self->set_release( $peptide_source_id, $release, $dbi );
115  $self->set_release( $mrna_source_id, $release, $dbi );
116  $self->set_release( $ncrna_source_id, $release, $dbi );
117  $self->set_release( $pred_mrna_source_id, $release, $dbi );
118  $self->set_release( $pred_ncrna_source_id, $release, $dbi );
119  $self->set_release( $pred_peptide_source_id, $release, $dbi );
120  }
121 
122  return 0; # successful
123 }
124 
125 # --------------------------------------------------------------------------------
126 # Parse file into array of xref objects
127 # There are 2 types of RefSeq files that we are interested in:
128 # - protein sequence files *.protein.faa
129 # - mRNA sequence files *.rna.fna
130 # Slightly different formats
131 
132 sub create_xrefs {
133  my ($self, $file,$species_id, $verbose, $dbi, $species_name ) = @_;
134 
135  # Create a hash of all valid names and taxon_ids for this species
136  my %species2name = $self->species_id2name($dbi);
137  if (defined $species_name) { push @{$species2name{$species_id}}, $species_name; }
138  if (!defined $species2name{$species_id}) { return; }
139  my %species2tax = $self->species_id2taxonomy($dbi);
140  push @{$species2tax{$species_id}}, $species_id;
141  my @names = @{$species2name{$species_id}};
142  my @tax_ids = @{$species2tax{$species_id}};
143  my %name2species_id = map{ $_=>$species_id } @names;
144  my %taxonomy2species_id = map{ $_=>$species_id } @tax_ids;
145 
146  # Retrieve existing RefSeq mRNA
147  my (%refseq_ids) = (%{ $self->get_valid_codes("RefSeq_mRNA", $species_id, $dbi) }, %{ $self->get_valid_codes("RefSeq_mRNA_predicted", $species_id, $dbi) });
148  my (%entrez_ids) = %{ $self->get_valid_codes("EntrezGene", $species_id, $dbi) };
149  my (%wiki_ids) = %{ $self->get_valid_codes("WikiGene", $species_id, $dbi) };
150 
151 
152  my %dependent_sources = $self->get_xref_sources($dbi);
153 
154  my $add_dependent_xref_sth = $dbi->prepare("INSERT INTO dependent_xref (master_xref_id,dependent_xref_id, linkage_source_id) VALUES (?,?, $entrez_source_id)");
155 
156  my $refseq_io = $self->get_filehandle($file);
157 
158  if ( !defined $refseq_io ) {
159  print STDERR "ERROR: Can't open RefSeqGPFF file $file\n";
160  return;
161  }
162 
163  my @xrefs;
164 
165  local $/ = "\/\/\n";
166 
167  my $type = $self->type_from_file($file);
168  return unless $type;
169 
170  while ( my $entry = $refseq_io->getline() ) {
171 
172  my $xref = $self->xref_from_record(
173  $entry,
174  \%name2species_id, \%taxonomy2species_id,
175  $pred_mrna_source_id, $pred_ncrna_source_id,
176  $mrna_source_id, $ncrna_source_id,
177  $pred_peptide_source_id, $peptide_source_id,
178  $entrez_source_id, $wiki_source_id, $add_dependent_xref_sth,
179  $species_id, $type, \%refseq_ids,\%entrez_ids,\%wiki_ids
180  );
181 
182  push @xrefs, $xref if $xref;
183 
184  } # while <REFSEQ>
185 
186  $refseq_io->close();
187 
188  print "Read " . scalar(@xrefs) ." xrefs from $file\n" if($verbose);
189 
190  return \@xrefs;
191 
192 }
193 sub type_from_file {
194  my ($self, $file) = @_;
195  return 'peptide' if $file =~ /RefSeq_protein/;
196  return 'dna' if $file =~ /rna/;
197  return 'peptide' if $file =~ /protein/;
198  print STDERR "Could not work out sequence type for $file\n";
199  return;
200 }
201 sub xref_from_record {
202  my ( $self, $entry, $name2species_id, $taxonomy2species_id,
203  $pred_mrna_source_id, $pred_ncrna_source_id,
204  $mrna_source_id, $ncrna_source_id,
205  $pred_peptide_source_id, $peptide_source_id,
206  $entrez_source_id, $wiki_source_id, $add_dependent_xref_sth,
207  $species_id, $type, $refseq_ids,$entrez_ids,$wiki_ids
208 ) = @_;
209  chomp $entry;
210 
211  my ($species) = $entry =~ /\s+ORGANISM\s+(.*)\n/;
212  $species = lc $species;
213  $species =~ s/^\s*//g;
214  $species =~ s/\s*\(.+\)//; # Ditch anything in parens
215  $species =~ s/\s+/_/g;
216  $species =~ s/\n//g;
217  my $species_id_check = $name2species_id->{$species};
218 
219  # Try going through the taxon ID if species check didn't work.
220  if ( !defined $species_id_check ) {
221  my ($taxon_id) = $entry =~ /db_xref="taxon:(\d+)"/;
222  $species_id_check = $taxonomy2species_id->{$taxon_id};
223  }
224 
225  # skip xrefs for species that aren't in the species table
226  if ( defined $species_id
227  && defined $species_id_check
228  && $species_id == $species_id_check )
229  {
230  my $xref = {};
231  my ($acc) = $entry =~ /ACCESSION\s+(\S+)/;
232  my ($ver) = $entry =~ /VERSION\s+(\S+)/;
233  my ($refseq_pair) = $entry =~ /DBSOURCE\s+REFSEQ: accession (\S+)/;
234 
235  # get the right source ID based on $type and whether this is predicted (X*) or not
236  my $source_id;
237  if ($type =~ /dna/) {
238  if ($acc =~ /^XM_/ ){
239  $source_id = $pred_mrna_source_id;
240  } elsif( $acc =~ /^XR/) {
241  $source_id = $pred_ncrna_source_id;
242  } elsif( $acc =~ /^NM/) {
243  $source_id = $mrna_source_id;
244  } elsif( $acc =~ /^NR/) {
245  $source_id = $ncrna_source_id;
246  }
247  }
248  elsif ($type =~ /peptide/) {
249  if ($acc =~ /^XP_/) {
250  $source_id = $pred_peptide_source_id;
251  } else {
252  $source_id = $peptide_source_id;
253  }
254  }
255  print "Warning: can't get source ID for $type $acc\n" if (!$source_id);
256 
257  # Description - may be multi-line
258  my ($description) = $entry =~ /DEFINITION\s+([^[]+)/s;
259  print $entry if (length($description) == 0);
260  $description =~ s/\nACCESSION.*//s;
261  $description =~ s/\n//g;
262  $description =~ s/{.*}-like//g;
263  $description =~ s/{.*}//g;
264  $description =~ s/\s+/ /g;
265  $description = substr($description, 0, 255) if (length($description) > 255);
266 
267  my ($seq) = $entry =~ /^\s*ORIGIN\s+(.+)/ms; # /s allows . to match newline
268  my @seq_lines = split /\n/, $seq;
269  my $parsed_seq = "";
270  foreach my $x (@seq_lines) {
271  my ($seq_only) = $x =~ /^\s*\d+\s+(.*)$/;
272  next if (!defined $seq_only);
273  $parsed_seq .= $seq_only;
274  }
275  $parsed_seq =~ s#//##g; # remove trailing end-of-record character
276  $parsed_seq =~ s#\s##g; # remove whitespace
277 
278  ( my $acc_no_ver, $ver ) = split( /\./, $ver );
279 
280  $xref->{ACCESSION} = $acc;
281  if($acc eq $acc_no_ver){
282  $xref->{VERSION} = $ver;
283  }
284  else{
285  print "$acc NE $acc_no_ver\n";
286  }
287 
288  $xref->{LABEL} = $acc . "\." . $ver;
289  $xref->{DESCRIPTION} = $description;
290  $xref->{SOURCE_ID} = $source_id;
291  $xref->{SEQUENCE} = $parsed_seq;
292  $xref->{SEQUENCE_TYPE} = $type;
293  $xref->{SPECIES_ID} = $species_id;
294  $xref->{INFO_TYPE} = "SEQUENCE_MATCH";
295 
296  # TODO experimental/predicted
297 
298  my @EntrezGeneIDline = $entry =~ /db_xref=.GeneID:(\d+)/g;
299 # my @SGDGeneIDline = $entry =~ /db_xref=.SGD:(S\d+)/g;
300  my @protein_id = $entry =~ /\/protein_id=.(\S+_\d+)/g;
301  my @coded_by = $entry =~ /\/coded_by=.(\w+_\d+)/g;
302 
303  foreach my $cb (@coded_by){
304  $xref->{PAIR} = $cb;
305  }
306  if (!defined $xref->{PAIR}) {
307  $xref->{PAIR} = $refseq_pair;
308  }
309 
310  foreach my $pi (@protein_id){
311  $xref->{PROTEIN} = $pi;
312  }
313 
314  foreach my $ll (@EntrezGeneIDline) {
315  my %dep;
316  #my $entrez_source = $dependent_sources{EntrezGene} || die( 'No source for EntrezGene!' );
317  #my $wiki_source = $dependent_sources{WikiGene} || die( 'No source for WikiGene!' );
318  if (defined $entrez{$ll}) {
319  $dep{SOURCE_ID} = $entrez_source_id;
320  $dep{LINKAGE_SOURCE_ID} = $source_id;
321  $dep{ACCESSION} = $ll;
322  $dep{LABEL} = $entrez{$ll};
323  push @{$xref->{DEPENDENT_XREFS}}, \%dep;
324 
325  my %dep2;
326  $dep2{SOURCE_ID} = $wiki_source_id;
327  $dep2{LINKAGE_SOURCE_ID} = $source_id;
328  $dep2{ACCESSION} = $ll;
329  $dep2{LABEL} = $entrez{$ll};
330  push @{$xref->{DEPENDENT_XREFS}}, \%dep2;
331 
332  # Add xrefs for RefSeq mRNA as well where available
333  $refseq_pair =~ s/\.[0-9]*// if $refseq_pair;
334  if (defined $refseq_pair) {
335  if ($refseq_ids->{$refseq_pair}) {
336  foreach my $refseq_id (@{ $refseq_ids->{$refseq_pair} }) {
337  foreach my $entrez_id (@{ $entrez_ids->{$ll} }) {
338  $add_dependent_xref_sth->execute($refseq_id, $entrez_id);
339  }
340  foreach my $wiki_id (@{ $wiki_ids->{$ll} }) {
341  $add_dependent_xref_sth->execute($refseq_id, $wiki_id);
342  }
343  }
344  }
345  }
346  }
347  }
348 
349  # Don't add SGD Xrefs, as they are mapped directly from SGD ftp site
350 
351  # Refseq's do not tell whether the mim is for the gene of morbid so ignore for now.
352  return $xref;
353  }
354 }
355 
356 # --------------------------------------------------------------------------------
357 
358 1;
map
public map()
accession
public accession()
XrefParser::BaseParser
Definition: BaseParser.pm:8
run
public run()