ensembl-hive  2.8.1
TAIROntologyParser.pm
Go to the documentation of this file.
1 =head1 LICENSE
2 
3 See the NOTICE file distributed with this work for additional information
4 regarding copyright ownership.
5 
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9 
10  http://www.apache.org/licenses/LICENSE-2.0
11 
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 
18 =cut
19 
20 package XrefParser::TAIROntologyParser;
21 
22 =pod
23 
24 =head1 NAME
25 
26 XrefParser::TAIROntologyParser - create GO/PO ontology xrefs from TAIR GAF annotation files
27 
28 =head1 DESCRIPTION
29 
30 Parse ontology files. File format (and example data):
31 
32  1: DB, database contributing the file (always "TAIR" for this file).
33  2: DB_Object_ID (TAIR's unique identifier for genes).
34  3: DB_Object_Symbol, see below
35  4: Qualifier (optional), one or more of 'NOT', 'contributes_to',
36  'colocalizes_with' as qualifier(s) for a GO annotation, when needed,
37  multiples separated by pipe (|)
38  5: GO ID, unique numeric identifier for the GO term
39  6: DB:Reference(|DB:Reference), the reference associated with the GO
40  annotation
41  7: Evidence, the evidence code for the GO annotation
42  8: With (or) From (optional), any With or From qualifier for the GO
43  annotation
44  9: Aspect, which ontology the GO term belongs (Function, Process or
45  Component)
46  10: DB_Object_Name(|Name) (optional), a name for the gene product in
47  words, e.g. 'acid phosphatase'
48  11: DB_Object_Synonym(|Synonym) (optional), see below
49  12: DB_Object_Type, type of object annotated, e.g. gene, protein, etc.
50  13: taxon(|taxon), taxonomic identifier of species encoding gene
51  product
52  14: Date, date GO annotation was made in the format YYYYMMDD
53  15: Assigned_by, source of the annotation (either "TAIR" or "TIGR")
54 
55 E.g.:
56  Field1: TAIR
57  Field2: locus:2031476
58  Field3: ENO1
59  Field4:
60  Field5: GO:0000015
61  Field6: TAIR:AnalysisReference:501748310
62  Field7: IEA
63  Field8: INTERPRO:IPR000941|INTERPRO:IPR020809|INTERPRO:IPR020810|INTERPRO:IPR020811
64  Field9: C
65  Field10: enolase 1
66  Field11: AT1G74030|AT1G74030.1|F2P9.10|F2P9_10
67  Field12: protein
68  Field13: taxon:3702
69  Field14: 20120418
70  Field15: TAIR
71  Field16:
72  Field17: TAIR:gene:2031475
73 
74  Field1: TAIR
75  Field2: locus:2081840
76  Field3: LTP12
77  Field4:
78  Field5: PO:0001009
79  Field6: TAIR:Publication:501710265|AGRICOLA_IND:IND23314018
80  Field7: IDA
81  Field8:
82  Field9: G
83  Field10: AT3G51590
84  Field11: AT3G51590|LTP12|lipid transfer protein 12|T18N14.1
85  Field12: protein
86  Field13: taxon:3702
87  Field14: 20060215
88  Field15: TAIR
89  Field16:
90  Field17: TAIR:locus:2081840
91 
92 =head1 AUTHOR
93 
94 Ken Youens-Clark E<lt>kclark@cshl.eduE<gt>.
95 
96 =cut
97 
98 use strict;
99 use warnings;
100 use autodie;
101 use Data::Dumper;
102 use File::Basename 'basename';
103 use Readonly;
104 use List::MoreUtils 'uniq';
105 
106 use base qw( XrefParser::BaseParser );
107 
# NOTE(review): $DIRECT and $GENE are not referenced by the code visible in
# this file; kept in case they are still needed -- confirm before removing.
Readonly my $DIRECT => 'DIRECT';
Readonly my $GENE   => 'Gene';

# Source name passed to get_valid_codes() to fetch the master xrefs that the
# ontology xrefs are attached to.
Readonly my $TAIR_TRANSLATION => 'TAIR_TRANSLATION';

# Names given to the tab-separated columns of a GAF line, in column order.
Readonly my @GAF_FIELDS => (
    'db',                      # column  1
    'db_object_id',            # column  2
    'db_object_symbol',        # column  3
    'qualifier',               # column  4
    'ont_id',                  # column  5
    'db_reference',            # column  6
    'evidence_code',           # column  7
    'with',                    # column  8
    'aspect',                  # column  9
    'db_object_name',          # column 10
    'db_object_synonym',       # column 11
    'db_object_type',          # column 12
    'taxon',                   # column 13
    'date',                    # column 14
    'assigned_by',             # column 15
    'annotation_extension',    # column 16
    'gene_product_form_id',    # column 17
);
130 
# Creates ontology (GO/PO) xrefs from a GAF annotation file.
# Also requires an OBO (or XML) ontology file mapping term IDs to names.
# Every ontology xref is stored as a dependent xref of the
# TAIR_TRANSLATION xref(s) whose accession matches the annotated gene.
#
# Arguments (a single hashref):
#   source_id  - ID of the ontology source being parsed
#   species_id - species ID; derived from the GAF file name when false
#   files      - arrayref holding the ontology file and the GAF file
#   rel_file   - optional file from which the release string is extracted
#   verbose    - when true, progress messages are printed to STDOUT
#
# Returns 0 on success and 1 on error; dies if "files" is not an arrayref.
#----------------------------------------------------------------------
sub run {
    my ($self, $args) = @_;
    my $source_id    = $args->{'source_id'};
    my $species_id   = $args->{'species_id'};
    my $files        = $args->{'files'};
    my $release_file = $args->{'rel_file'};
    my $notify       = sub { print @_ if $args->{'verbose'} };

    unless ( ref $files eq 'ARRAY' ) {
        die '"files" argument not an array?';
    }

    # The ontology file is recognised by its extension; the association
    # (GAF) file is the first entry that does not contain that file name.
    # \Q...\E makes the file name match literally, so regex metacharacters
    # in a path cannot change or break the match.
    my ($ont_file)   = grep { /\.(xml|obo)/ } @$files;
    my ($assoc_file) = grep { $ont_file && !/\Q$ont_file\E/ } @$files;

    if ( $assoc_file ) {
        # The file name is passed as an argument, never as part of the
        # format string, so a '%' in the path is printed verbatim.
        $notify->(
            sprintf "%s parsing file '%s'\n", __PACKAGE__, $assoc_file
        );
    }
    else {
        printf STDERR "%s called without a 'files' argument\n%s",
            __PACKAGE__, Dumper($args);
        return 1; # error
    }

    # get the "main" GO/PO source id.
    my $source_name = $self->get_source_name_for_source_id( $source_id );
    $source_id = $self->get_source_id_for_source_name( $source_name, 'main' );

    # Resolve the species before it is used to look up the master xrefs.
    $species_id ||= $self->get_species_id_for_filename($assoc_file);

    #
    # Ontologies are attached to the translations, so this gets all the
    # translation xrefs as a hash with the translation IDs as the keys
    # and (array refs of) the xref IDs as the values.
    #
    my %master_xrefs = %{
        $self->get_valid_codes( $TAIR_TRANSLATION, $species_id )
    };

    my %species_id2name = $self->species_id2name();
    my $species         = $species_id2name{$species_id}->[0];

    $notify->("Parsing $source_name for $species\n");

    #
    # Get mapping from GO/PO terms to descriptions from the XML/OBO file
    #
    my %id_to_term;
    if ( $ont_file ) {
        my $obo_io = $self->get_filehandle( $ont_file );
        if ( !defined $obo_io ) {
            print STDERR "ERROR: Could not open '$ont_file'\n";
            return 1; # error
        }

        # Both formats are read the same way: an "id" line remembers the
        # term, and the next "name" line supplies its description.
        my ($id_re, $name_re) = $ont_file =~ /\.xml$/
            ? ( qr/<id>([GP]O:\d+)<\/id>/, qr/<name>(.*)<\/name>/ )
            : ( qr/^id:\s+([GP]O:\d+)/,    qr/^name:\s+(.*)/ );

        my $term;
        while ( defined( my $line = $obo_io->getline() ) ) {
            if ( $line =~ $id_re ) {
                $term = $1;
            }
            elsif ( $line =~ $name_re ) {
                my $name = $1;
                $id_to_term{$term} = $name if defined $term;

                # A name is only ever paired with the id directly before it.
                $term = undef;
            }
        }

        $obo_io->close();

        $notify->(
            sprintf "Mapped %s terms to descriptions\n",
            scalar keys %id_to_term
        );
    }

    my $num_terms  = 0; # (term, stable ID) pairs seen; progress display only
    my $count      = 0; # GAF lines that matched at least one stable ID
    my $miss_parse = 0; # GAF lines without any recognised stable ID

    # Open the gene association GAF file
    my $gaf_io = $self->get_filehandle( $assoc_file );

    if ( !defined $gaf_io ) {
        print STDERR "ERROR: Could not open $assoc_file\n";
        return 1; # error
    }

    while ( defined( my $line = $gaf_io->getline() ) ) {
        next if $line =~ /^!/; # comment

        chomp $line;

        # Missing trailing columns become empty strings so that short or
        # blank lines do not trigger "uninitialized value" warnings below.
        my @vals = split /\t/, $line;
        my %rec;
        @rec{ @GAF_FIELDS } = map { $vals[$_] // '' } 0 .. $#GAF_FIELDS;

        next if $rec{'db'} ne 'TAIR'; # Only process TAIR annotations

        # Skip "NOT" terms entirely. Qualifiers may be combined with a
        # pipe (e.g. "NOT|contributes_to"), so each part is checked.
        next if grep { $_ eq 'NOT' } split /[|]/, $rec{'qualifier'};

        # Find the stable_id(s): AGI-style identifiers (AT<n>G...) listed
        # in the name or synonym columns that have a master xref.
        my @stable_ids = uniq(
            grep { defined $master_xrefs{ $_ } }
            grep { /^AT\d+G/ }
            map  { ( my $id = $_ ) =~ s/\'/\\\'/g; $id }
            map  { split /[|]/ }
            ( $rec{'db_object_name'}, $rec{'db_object_synonym'} )
        );

        if ( !@stable_ids ) {
            $miss_parse++;
            next;
        }

        my $ont_id = $rec{'ont_id'};
        my $desc   = $id_to_term{ $ont_id } || '';

        for my $stable_id ( @stable_ids ) {
            # Progress ticker; like all other progress output it is only
            # shown in verbose mode.
            $notify->(
                sprintf "%-70s\r", sprintf(
                    "%10s: %s => %s", ++$num_terms, $ont_id, $stable_id
                )
            );

            for my $xref_id ( @{ $master_xrefs{ $stable_id } } ) {
                $self->add_dependent_xref({
                    master_xref_id => $xref_id,
                    acc            => $ont_id,
                    label          => $ont_id,
                    desc           => $desc,
                    linkage        => $rec{'evidence_code'},
                    source_id      => $source_id,
                    species_id     => $species_id
                });
            }
        }

        $count++;
    }

    $gaf_io->close();

    $notify->(
        map { " - $_\n" }
        "$count $source_name dependent xrefs added",
        "$miss_parse lines did not contain a recognised stable_id",
    );

    if ( defined $release_file ) {
        # Parse and set release information from $release_file.
        my $release_io = $self->get_filehandle($release_file);

        if ( !defined $release_io ) {
            print STDERR "ERROR: Could not open $release_file\n";
            return 1; # error
        }

        # Slurp mode, limited to this single read.
        my $release = do { local $/; <$release_io> };
        $release_io->close();

        # An empty file yields undef; treat it as an empty release string.
        $release //= '';

        $release =~ tr/\n/ /;
        $release =~ s#.*The following table describes.*?of (POC.*?)<ul>.*#$1#;
        $release =~ s#<[^>]+>##g;

        $notify->("$source_name release: '$release'\n");
        $self->set_release( $source_id, $release );
    }

    return 0; # success
}

1;
321 
322 1;
main
public main()
XrefParser::TAIROntologyParser
Definition: TAIROntologyParser.pm:72