ensembl-hive  2.7.0
TAIRIDParser.pm
Go to the documentation of this file.
1 =head1 LICENSE
2 
3 See the NOTICE file distributed with this work for additional information
4 regarding copyright ownership.
5 
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9 
10  http://www.apache.org/licenses/LICENSE-2.0
11 
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 
18 =cut
19 
20 package XrefParser::TAIRIDParser;
21 
22 =pod
23 
24 =head1 NAME
25 
26 XrefParser::TAIRIDParser
27 
28 =head1 DESCRIPTION
29 
30 Takes the TAIR cDNA FASTA file and uses the description lines to
31 create direct xrefs.
32 
33 Sample data:
34 
35 >AT5G55930.1 | Symbols: ATOPT1, OPT1 | oligopeptide transporter 1 | chr5:22652715-22656106 FORWARD LENGTH=2820
36 CAAAATTCATGTGGTGTAAATTGTCTAAAGTCTGTATTTTTTTTTATTGACATCCATTTTTTTTGTGTCGAAAGTCTAT
37 ...
38 
39 =head1 AUTHOR
40 
41 Ken Youens-Clark E<lt>kclark@cshl.eduE<gt>.
42 
43 =cut
44 
45 use strict;
46 
47 use base qw( XrefParser::BaseParser );
48 
49 use Data::Dumper;
50 use Readonly;
51 
52 Readonly my $EMPTY_STR => q{};
53 Readonly my $DIRECT => 'DIRECT';
54 Readonly my $GENE => 'Gene';
55 Readonly my $NASC_GENE_ID => 'NASC_GENE_ID';
56 Readonly my $TAIR_LOCUS_MODEL => 'TAIR_LOCUS_MODEL';
57 Readonly my $TAIR_LOCUS => 'TAIR_LOCUS';
58 Readonly my $TAIR_SYMBOL => 'TAIR_SYMBOL';
59 Readonly my $TAIR_TRANSLATION => 'TAIR_TRANSLATION';
60 Readonly my $TRANSCRIPT => 'Transcript';
61 Readonly my $TRANSLATION => 'Translation';
62 
63 
64 # -----------------------------------------------------------------
# Parse the header lines of a TAIR cDNA FASTA file and register direct
# xrefs (plus symbol synonyms) for every locus / locus model found.
#
# Arguments (one hashref):
#   files      - arrayref; its first element is the FASTA file to read
#                (the element is shifted off the caller's array)
#   species_id - optional; looked up from the file name when not supplied
#   verbose    - optional; when true, progress and a summary go to STDOUT
#
# Header format (fields separated by '|'):
#   >ID | Symbols: SYM1, SYM2 | description | ...
# Lines that do not start with '>' are ignored.
#
# Returns 0 on success, 1 when no file was given or it could not be opened.
sub run {
    my ( $self, $args ) = @_;

    my $notify = sub { print @_, "\n" if $args->{'verbose'} };

    my $files = $args->{'files'};
    my $file  = ref($files) eq 'ARRAY' ? shift @$files : $EMPTY_STR;

    if ( !$file ) {
        printf STDERR "%s called without a 'files' argument\n%s",
            __PACKAGE__, Dumper($args);
        return 1;    # error
    }

    $notify->( sprintf "%s Processing file '%s'", __PACKAGE__, $file );

    my $tair_io = $self->get_filehandle($file);

    if ( !defined $tair_io ) {
        print STDERR "ERROR: Could not open $file\n";
        return 1;    # 1 is an error
    }

    my $species_id = $args->{'species_id'}
        || XrefParser::BaseParser->get_species_id_for_filename($file);

    my $tairg_source_id = $self->get_source_id_for_source_name($TAIR_LOCUS);
    my $tairs_source_id = $self->get_source_id_for_source_name($TAIR_SYMBOL);
    my $tairl_source_id =
        $self->get_source_id_for_source_name($TAIR_LOCUS_MODEL);
    my $tairt_source_id =
        $self->get_source_id_for_source_name($TAIR_TRANSLATION);
    my $nascg_source_id = $self->get_source_id_for_source_name($NASC_GENE_ID);

    my %xrefs_added;

    # Create one xref, attach it directly to an Ensembl object and tally it
    # under the given source name for the final summary.
    my $add_direct = sub {
        my (%p) = @_;

        my $xref_id = $self->add_xref({
            source_id  => $p{source_id},
            species_id => $species_id,
            info_type  => $DIRECT,
            acc        => $p{acc},
            label      => $p{label},
            desc       => $p{desc},
        });

        $self->add_direct_xref( $xref_id, $p{stable_id}, $p{type}, $DIRECT );

        $xrefs_added{ $p{tally} }++;

        return;
    };

    my $headers_seen = 0;

    # The implicit defined() test only applies to <FH>/readline, not to a
    # method call, so ask for it explicitly; otherwise a final line of "0"
    # without a newline would end the loop early.
    while ( defined( my $line = $tair_io->getline() ) ) {

        # Only process FASTA header lines
        next unless $line =~ s/^>//;

        # Strip the line ending including any "\r" from CRLF files
        $line =~ s/\s+\z//;

        my ( $gene_stable_id, $symbol_str, $desc ) = split /\s*\|\s*/, $line;

        next unless $gene_stable_id;

        $desc ||= $EMPTY_STR;

        if ( $args->{'verbose'} ) {
            printf "%-70s\r",
                sprintf( '%9d: Processing %s', ++$headers_seen,
                $gene_stable_id );
        }

        #
        # Transcript, e.g., "AT5G55930.1"; the part before the dot is the gene
        #
        if ( my ($locus_id) = $gene_stable_id =~ /^([A-Z0-9]+) \. (\d+)$/xms )
        {
            my $transcript_id = $gene_stable_id;
            $gene_stable_id = $locus_id;

            $add_direct->(
                source_id => $tairl_source_id,
                acc       => $transcript_id,
                label     => $transcript_id,
                desc      => $desc,
                stable_id => $transcript_id,
                type      => $TRANSCRIPT,
                tally     => $TAIR_LOCUS_MODEL,
            );

            # NOTE(review): the accession here is the locus ID, not the
            # model ID, although the xref is attached via the transcript
            # stable ID -- confirm this is intended.
            $add_direct->(
                source_id => $tairt_source_id,
                acc       => $gene_stable_id,
                label     => $gene_stable_id,
                desc      => $EMPTY_STR,
                stable_id => $transcript_id,
                type      => $TRANSLATION,
                tally     => $TAIR_TRANSLATION,
            );
        }

        #
        # Gene IDs for TAIR and NASC
        #
        $add_direct->(
            source_id => $tairg_source_id,
            acc       => $gene_stable_id,
            label     => $gene_stable_id,
            desc      => $desc,
            stable_id => $gene_stable_id,
            type      => $GENE,
            tally     => $TAIR_LOCUS,
        );

        $add_direct->(
            source_id => $nascg_source_id,
            acc       => $gene_stable_id,
            label     => $gene_stable_id . '-TAIR-G',
            desc      => $desc,
            stable_id => $gene_stable_id,
            type      => $GENE,
            tally     => $NASC_GENE_ID,
        );

        #
        # Symbols, e.g., "ATOPT1, OPT1"
        #
        next unless defined $symbol_str && length $symbol_str;

        $symbol_str =~ s/^\s*Symbols:\s*//;

        # grep on length drops empty entries without discarding a literal "0"
        my ( $main_sym, @synonyms ) =
            grep { length } split /\s*,\s*/, $symbol_str;

        next unless defined $main_sym;

        #
        # Add only first symbol to the gene
        #
        $add_direct->(
            source_id => $tairs_source_id,
            acc       => $main_sym,
            label     => $main_sym,
            desc      => $EMPTY_STR,
            stable_id => $gene_stable_id,
            type      => $GENE,
            tally     => $TAIR_SYMBOL,
        );

        #
        # Add the remainder as "external_synonym"
        #
        for my $symbol (@synonyms) {
            $self->add_to_syn(
                $main_sym, $tairs_source_id,
                $symbol,   $species_id
            );

            $xrefs_added{'SYNONYMS'}++;
        }
    }

    $tair_io->close();

    $notify->(
        join( "\n",
            $EMPTY_STR,
            map { sprintf " - Added %9d %s xrefs", $xrefs_added{$_}, $_ }
            sort keys %xrefs_added )
    );

    return 0;    # successful
}

1;
248 
249 1;
map
public map()
XrefParser::BaseParser::get_source_id_for_source_name
public get_source_id_for_source_name()
XrefParser::BaseParser
Definition: BaseParser.pm:8
run
public run()
XrefParser::TAIRIDParser
Definition: TAIRIDParser.pm:16