ensembl-hive  2.8.1
RefSeqParser.pm
Go to the documentation of this file.
1 =head1 LICENSE
2 
3 See the NOTICE file distributed with this work for additional information
4 regarding copyright ownership.
5 
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9 
10  http://www.apache.org/licenses/LICENSE-2.0
11 
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 
18 =cut
19 
20 # Parse RefSeq files to create xrefs.
21 
22 package XrefParser::RefSeqParser;
23 
24 use strict;
25 use warnings;
26 use Carp;
27 use File::Basename;
28 
29 use base qw( XrefParser::BaseParser );
30 
31 
32 sub run {
33 
34  my ($self, $ref_arg) = @_;
35  my $source_id = $ref_arg->{source_id};
36  my $species_id = $ref_arg->{species_id};
37  my $files = $ref_arg->{files};
38  my $release_file = $ref_arg->{rel_file};
39  my $verbose = $ref_arg->{verbose};
40  my $dbi = $ref_arg->{dbi};
41  $dbi = $self->dbi unless defined $dbi;
42 
43  if((!defined $source_id) or (!defined $species_id) or (!defined $files) ){
44  croak "Need to pass source_id, species_id and files as pairs";
45  }
46  $verbose |=0;
47 
48  my @files = @{$files};
49 
50 
51  my $peptide_source_id =
52  $self->get_source_id_for_source_name('RefSeq_peptide', undef, $dbi);
53  my $mrna_source_id =
54  $self->get_source_id_for_source_name('RefSeq_mRNA', undef, $dbi);
55  my $ncrna_source_id =
56  $self->get_source_id_for_source_name('RefSeq_ncRNA', undef, $dbi);
57 
58  my $pred_peptide_source_id =
59  $self->get_source_id_for_source_name('RefSeq_peptide_predicted', undef, $dbi);
60  my $pred_mrna_source_id =
61  $self->get_source_id_for_source_name('RefSeq_mRNA_predicted','refseq', $dbi);
62  my $pred_ncrna_source_id =
63  $self->get_source_id_for_source_name('RefSeq_ncRNA_predicted', undef, $dbi);
64 
65  if($verbose){
66  print "RefSeq_peptide source ID = $peptide_source_id\n";
67  print "RefSeq_mRNA source ID = $mrna_source_id\n";
68  print "RefSeq_ncRNA source ID = $ncrna_source_id\n";
69  print "RefSeq_peptide_predicted source ID = $pred_peptide_source_id\n";
70  print "RefSeq_mRNA_predicted source ID = $pred_mrna_source_id\n" ;
71  print "RefSeq_ncRNA_predicted source ID = $pred_ncrna_source_id\n" ;
72  }
73 
74  my @xrefs;
75  foreach my $file (@files) {
76 
77  my $xrefs =
78  $self->create_xrefs( $peptide_source_id,
79  $pred_peptide_source_id,
80  $mrna_source_id, $ncrna_source_id,
81  $pred_mrna_source_id, $pred_ncrna_source_id,
82  $file,
83  $species_id, $dbi );
84 
85  if ( !defined($xrefs) ) {
86  return 1; #error
87  }
88  print "Read " . scalar(@$xrefs) ." xrefs from $file\n" if($verbose);
89 
90  push @xrefs, @{$xrefs};
91  }
92 
93  if ( !defined( $self->upload_xref_object_graphs( \@xrefs, $dbi ) ) ) {
94  return 1; # error
95  }
96 
97  if ( defined $release_file ) {
98  # Parse and set release info.
99  my $release_io = $self->get_filehandle($release_file);
100  local $/ = "\n*";
101  my $release = $release_io->getline();
102  $release_io->close();
103 
104  $release =~ s/\s{2,}/ /g;
105  $release =~
106  s/.*(NCBI Reference Sequence.*) Distribution Release Notes.*/$1/s;
107  # Put a comma after the release number to make it more readable.
108  $release =~ s/Release (\d+)/Release $1,/;
109 
110  print "RefSeq release: '$release'\n";
111 
112  $self->set_release( $source_id, $release, $dbi );
113  $self->set_release( $peptide_source_id, $release, $dbi );
114  $self->set_release( $mrna_source_id, $release, $dbi );
115  $self->set_release( $ncrna_source_id, $release, $dbi );
116  $self->set_release( $pred_peptide_source_id, $release, $dbi );
117  $self->set_release( $pred_mrna_source_id, $release, $dbi );
118  $self->set_release( $pred_ncrna_source_id, $release, $dbi );
119  }
120 
121  return 0; # successfull
122 
123 }
124 
125 # --------------------------------------------------------------------------------
126 # Parse file into array of xref objects
127 # There are 2 types of RefSeq files that we are interested in:
128 # - protein sequence files *.protein.faa
129 # - mRNA sequence files *.rna.fna
130 # Slightly different formats
131 
132 sub create_xrefs {
133  my ($self, $peptide_source_id, $pred_peptide_source_id,
134  $mrna_source_id, $ncrna_source_id,
135  $pred_mrna_source_id, $pred_ncrna_source_id, $file, $species_id, $dbi ) = @_;
136 
137  # Create a hash of all valid names for this species
138  my %species2name = $self->species_id2name($dbi);
139  my @names = @{$species2name{$species_id}};
140  my %name2species_id = map{ $_=>$species_id } @names;
141  # my %name2species_id = $self->name2species_id();
142 
143  my $refseq_io = $self->get_filehandle($file);
144 
145  if ( !defined $refseq_io ) {
146  print STDERR "ERROR: Can't open RefSeq file $file\n";
147  return;
148  }
149 
150  my @xrefs;
151 
152  local $/ = "\n>";
153 
154  while ( $_ = $refseq_io->getline() ) {
155  my $xref;
156 
157  my $entry = $_;
158  chomp $entry;
159  my ($header, $sequence) = split (/\n/, $entry, 2);
160  $sequence =~ s/^>//;
161  # remove newlines
162  my @seq_lines = split (/\n/, $sequence);
163  $sequence = join("", @seq_lines);
164 
165  (my $acc, my $description) = split(/\s/, $header, 2);
166  $acc =~ s/^>//;
167  my ($species, $mrna);
168  if ($file =~ /\.faa(\.gz|\.Z)?$/) {
169 
170  ($mrna, $description, $species) = $description =~ /(\S*)\s+(.*)\s+\[(.*)\]$/;
171  $xref->{SEQUENCE_TYPE} = 'peptide';
172  $xref->{STATUS} = 'experimental';
173  my $source_id;
174  if ($acc =~ /^XP_/) {
175  $source_id = $pred_peptide_source_id;
176  } else {
177  $source_id = $peptide_source_id;
178  }
179  $xref->{SOURCE_ID} = $source_id;
180 
181  } elsif ($file =~ /\.fna(\.gz|\.Z)?$/) {
182 
183  ($species, $description) = $description =~ /\s*(\w+\s+\w+)\s+(.*)$/;
184  $xref->{SEQUENCE_TYPE} = 'dna';
185  $xref->{STATUS} = 'experimental';
186  my $source_id;
187  if ($acc =~ /^XM_/ ){
188  $source_id = $pred_mrna_source_id;
189  } elsif( $acc =~ /^XR/) {
190  $source_id = $pred_ncrna_source_id;
191  } elsif( $acc =~ /^NM/) {
192  $source_id = $mrna_source_id;
193  } elsif( $acc =~ /^NR/) {
194  $source_id = $ncrna_source_id;
195  }
196  $xref->{SOURCE_ID} = $source_id;
197 
198  }
199 
200  if (!$species) { next; }
201 
202  $species = lc $species;
203  $species =~ s/ /_/;
204 
205  my $species_id_check = $name2species_id{$species};
206 
207  # skip xrefs for species that aren't in the species table
208  if ( defined $species_id
209  && defined $species_id_check
210  && $species_id == $species_id_check )
211  {
212  my ($acc_no_ver,$ver) = split (/\./,$acc);
213  $xref->{ACCESSION} = $acc_no_ver;
214  $xref->{VERSION} = $ver;
215  $xref->{LABEL} = $acc;
216  $xref->{DESCRIPTION} = $description;
217  $xref->{SEQUENCE} = $sequence;
218  $xref->{SPECIES_ID} = $species_id;
219  $xref->{INFO_TYPE} = "SEQUENCE_MATCH";
220 
221  # TODO synonyms, dependent xrefs etc
222 
223  push @xrefs, $xref;
224 
225  }
226 
227  }
228 
229  $refseq_io->close();
230 
231  return \@xrefs;
232 
233 }
234 
235 # --------------------------------------------------------------------------------
236 
237 1;
map
public map()
XrefParser::BaseParser
Definition: BaseParser.pm:8
run
public run()