3 See the NOTICE file distributed with this work for additional information
4 regarding copyright ownership.
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
20 package XrefParser::TAIROntologyParser;
30 Parse ontology files. File format (and example data):
32 1: DB, database contributing the file (always "TAIR" for this file).
33 2: DB_Object_ID (TAIR's unique identifier for genes).
34 3: DB_Object_Symbol, see below
35 4: Qualifier (optional), one or more of 'NOT', 'contributes_to',
36 'colocalizes_with' as qualifier(s) for a GO annotation, when needed,
37 multiples separated by pipe (|)
38 5: GO ID, unique numeric identifier for the GO term
39 6: DB:Reference(|DB:Reference), the reference associated with the GO annotation
41 7: Evidence, the evidence code for the GO annotation
42 8: With (or) From (optional), any With or From qualifier for the GO annotation
44 9: Aspect, which ontology the GO term belongs to (Function, Process or Component)
46 10: DB_Object_Name(|Name) (optional), a name for the gene product in
47 words, e.g. 'acid phosphatase'
48 11: DB_Object_Synonym(|Synonym) (optional), see below
49 12: DB_Object_Type, type of object annotated, e.g. gene, protein, etc.
50 13: taxon(|taxon), taxonomic identifier of species encoding gene
52 14: Date, date GO annotation was made in the format
53 15: Assigned_by, source of the annotation (either "TAIR" or "TIGR")
61 Field6: TAIR:AnalysisReference:501748310
63 Field8: INTERPRO:IPR000941|INTERPRO:IPR020809|INTERPRO:IPR020810|INTERPRO:IPR020811
66 Field11: AT1G74030|AT1G74030.1|F2P9.10|F2P9_10
72 Field17: TAIR:gene:2031475
79 Field6: TAIR:Publication:501710265|AGRICOLA_IND:IND23314018
84 Field11: AT3G51590|LTP12|lipid transfer protein 12|T18N14.1
90 Field17: TAIR:locus:2081840
94 Ken Youens-Clark E<lt>kclark@cshl.eduE<gt>.
102 use File::Basename 'basename
';
104 use List::MoreUtils 'uniq
';
106 use base qw( XrefParser::BaseParser );
108 Readonly my $DIRECT => 'DIRECT
';
109 Readonly my $GENE => 'Gene
';
110 Readonly my $TAIR_TRANSLATION => 'TAIR_TRANSLATION
';
111 Readonly my @GAF_FIELDS => qw(
131 # Creates ontology xrefs from a GAF annotation file.
132 # Also requires an OBO file mapping IDs to terms.
133 # The ontology xrefs are dependent on xrefs
135 #----------------------------------------------------------------------
137 my ($self, $args) = @_;
138 my $source_id = $args->{'source_id
'};
139 my $species_id = $args->{'species_id
'};
140 my $files = $args->{'files
'};
141 my $release_file = $args->{'rel_file
'};
142 my $notify = sub { print @_ if $args->{'verbose
'} };
144 unless ( ref $files eq 'ARRAY
' ) {
145 die '"files" argument not an array?
';
148 my ($ont_file) = grep { /\.(xml|obo)/ } @$files;
149 my ($assoc_file) = grep { $ont_file && !/$ont_file/ } @$files;
152 $notify->(sprintf "%s parsing file '$assoc_file
'\n", __PACKAGE__);
155 printf STDERR "%s called without a 'files
' argument\n%s",
156 __PACKAGE__, Dumper($args);
160 # get the "main" GO/PO source id.
161 my $source_name = $self->get_source_name_for_source_id( $source_id );
162 $source_id = $self->get_source_id_for_source_name( $source_name, 'main' );
165 # Ontologies are attached to the translations, so this gets all the
166 # translation xrefs as a hash with the translation IDs as the keys
167 # and the xref IDs as the values.
169 my %master_xrefs = %{
170 $self->get_valid_codes( $TAIR_TRANSLATION , $species_id )
173 $species_id ||= $self->get_species_id_for_filename($assoc_file);
174 my %species_id2name = $self->species_id2name();
175 my $species = $species_id2name{$species_id}->[0];
177 $notify->("Parsing $source_name for $species\n");
180 # Get mapping from GO terms to descriptions from the XML/OBO file
185 my $obo_io = $self->get_filehandle( $ont_file );
186 if ( !defined $obo_io ) {
187 print STDERR "ERROR: Could not open '$ont_file
'\n";
194 if ( $ont_file =~ /\.xml$/ ) {
195 while ( $_ = $obo_io->getline() ) {
196 if (/<id>([G|P]O:\d+)<\/id>/) {
199 elsif (/<name>(.*)<\/name>/) {
200 if ( defined($term) ) {
201 $id_to_term{$term} = $1;
208 while ( $_ = $obo_io->getline() ) {
209 if (/^id:\s+([G|P]O:\d+)/) {
212 elsif (/^name:\s+(.*)/) {
213 if ( defined $term ) {
214 $id_to_term{ $term } = $1;
224 sprintf "Mapped %s terms to descriptions\n",
225 scalar keys %id_to_term
235 # Open the gene association GAF file
236 my $gaf_io = $self->get_filehandle( $assoc_file );
238 if ( !defined $gaf_io ) {
239 print STDERR "ERROR: Could not open $assoc_file\n";
243 while ( my $line = $gaf_io->getline() ) {
244 next if $line =~ /^!/; # comment
248 my @vals = split( /\t/, $line );
249 my %rec = map { $GAF_FIELDS[$_], $vals[$_] } 0..$#GAF_FIELDS;
251 next if $rec{'db
'} ne 'TAIR
'; # Only process TAIR annotations
252 next if $rec{'qualifier
'} eq 'NOT
'; # Skip "NOT" terms entirely
254 # Find the stable_id(s)
255 my @stable_ids = uniq(
256 map { defined $master_xrefs{ $_ } ? $_ : () }
258 map { s/\'/\\\'/g; $_ }
260 ( $rec{'db_object_name
'}, $rec{'db_object_synonym
'} )
263 if ( !@stable_ids ) {
268 my $ont_id = $rec{'ont_id
'};
269 my $desc = $id_to_term{ $ont_id } || '';
271 for my $stable_id ( @stable_ids ) {
272 printf "%-70s\r", sprintf(
273 "%10s: %s => %s", ++$num_terms, $ont_id, $stable_id
276 my @xref_ids = @{ $master_xrefs{ $stable_id } };
278 for my $xref_id ( @xref_ids ) {
279 $self->add_dependent_xref({
280 master_xref_id => $xref_id,
284 linkage => $rec{'evidence_code
'},
285 source_id => $source_id,
286 species_id => $species_id
298 "$count $source_name dependent xrefs added",
299 "$miss_parse lines did not contain a recognised stable_id",
302 if ( defined $release_file ) {
303 # Parse and set release information from $release_file.
304 my $release_io = $self->get_filehandle($release_file);
308 my $release = <$release_io>;
309 $release_io->close();
311 $release =~ tr/\n/ /;
312 $release =~ s#.*The following table describes.*?of (POC.*?)<ul>.*#$1#;
313 $release =~ s#<[^>]+>##g;
315 $notify->("$source_name release: '$release
'\n");
316 $self->set_release( $source_id, $release );