3 See the NOTICE file distributed with
this work
for additional information
4 regarding copyright ownership.
6 Licensed under the Apache License, Version 2.0 (the
"License");
7 you may not use
this file except in compliance with the License.
8 You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an
"AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License
for the specific language governing permissions and
16 limitations under the License.
20 package XrefParser::TAIRIDParser;
30 Takes the TAIR cDNA FASTA file and uses the description lines to create direct xrefs for the gene, transcript, translation and gene symbols named in each header.
35 >AT5G55930.1 | Symbols: ATOPT1, OPT1 | oligopeptide transporter 1 | chr5:22652715-22656106 FORWARD LENGTH=2820
36 CAAAATTCATGTGGTGTAAATTGTCTAAAGTCTGTATTTTTTTTTATTGACATCCATTTTTTTTGTGTCGAAAGTCTAT
41 Ken Youens-Clark E<lt>kclark@cshl.eduE<gt>.
# Read-only string constants used throughout the parser body below.
# $EMPTY_STR is the fallback for a missing file name or description.
52 Readonly my $EMPTY_STR => q{};
# Value passed as info_type to add_xref and as the last argument of
# add_direct_xref.
53 Readonly my $DIRECT =>
'DIRECT';
# Object-type label passed to add_direct_xref for gene-level xrefs.
54 Readonly my $GENE =>
'Gene';
# Source names: those passed to get_source_id_for_source_name are resolved
# to source ids; all of them are used as keys of the %xrefs_added tally.
55 Readonly my $NASC_GENE_ID =>
'NASC_GENE_ID';
56 Readonly my $TAIR_LOCUS_MODEL =>
'TAIR_LOCUS_MODEL';
57 Readonly my $TAIR_LOCUS =>
'TAIR_LOCUS';
58 Readonly my $TAIR_SYMBOL =>
'TAIR_SYMBOL';
59 Readonly my $TAIR_TRANSLATION =>
'TAIR_TRANSLATION';
# Object-type labels passed to add_direct_xref for transcript- and
# translation-level xrefs.
60 Readonly my $TRANSCRIPT =>
'Transcript';
61 Readonly my $TRANSLATION =>
'Translation';
64 # -----------------------------------------------------------------
# Parser entry point body (the enclosing `sub` line is not visible in this
# chunk). Reads the TAIR FASTA file named in $args->{files} and, for every
# header line, registers xrefs through add_xref / add_direct_xref.
#
# $args is a hash ref; the keys read here are:
#   files      - array ref; its first element is shifted off and used
#   verbose    - when true, progress messages are printed
#   source_id  - read below (fallback expression not visible here)
#   species_id - read below (fallback expression not visible here)
#
# Returns 1 when the file handle cannot be obtained, 0 on success.
66 my ( $self, $args ) = @_;
# Prints its arguments followed by a newline, but only when 'verbose' is set.
68 my $notify = sub { print @_,
"\n" if $args->{
'verbose'} };
70 my $files = $args->{
'files'};
# Use the first entry of 'files' when it is an array ref; otherwise fall
# back to the empty string.
71 my $file = ref $files eq
'ARRAY' ? shift @$files : $EMPTY_STR;
74 $notify->(sprintf
"%s Processing file '%s'", __PACKAGE__, $file);
# Diagnostic for a missing 'files' argument, written to STDERR together with
# a dump of $args. NOTE(review): the condition guarding this branch is not
# visible in this chunk.
77 printf STDERR
"%s called without a 'files' argument\n%s",
78 __PACKAGE__, Dumper($args);
82 my $tair_io = $self->get_filehandle($file);
# An undefined handle is reported on STDERR and ends the run with status 1.
84 if ( !defined $tair_io ) {
85 print STDERR
"ERROR: Could not open $file\n";
86 return 1; # 1 is an error
# NOTE(review): the right-hand sides of the two `||` fallbacks below are cut
# off in this chunk.
89 my $source_id = $args->{
'source_id'} ||
91 my $species_id = $args->{
'species_id'} ||
# Resolve source names to source ids for the xrefs created in the loop.
95 my $tairs_source_id = $self->get_source_id_for_source_name($TAIR_SYMBOL);
# NOTE(review): the left-hand sides of the next two lookups are not visible;
# presumably they assign $tairl_source_id and $tairt_source_id, which are
# used further down - confirm against the full file.
97 $self->get_source_id_for_source_name($TAIR_LOCUS_MODEL);
99 $self->get_source_id_for_source_name($TAIR_TRANSLATION);
100 my $nascg_source_id = $self->get_source_id_for_source_name($NASC_GENE_ID);
# Main loop: one iteration per line read from the file handle.
104 while ( my $line = $tair_io->getline() ) {
105 # Only process FASTA header lines
106 next unless $line =~ s/^>
# Header fields are pipe-separated (surrounding whitespace is consumed by
# the split pattern): id | symbols | description. Any further fields are
# discarded by the three-element list assignment.
110 my ( $gene_stable_id, $symbol_str, $desc ) = split /\s*\|\s*/, $line;
# Skip headers whose first field is empty or otherwise false.
112 next unless $gene_stable_id;
# Default a missing description to the empty string.
114 $desc ||= $EMPTY_STR;
# Per-record progress message, built only in verbose mode; $line_num is
# incremented as part of building it.
116 if ( $args->{
'verbose'} ) {
118 sprintf(
'%9d: Processing %s', ++$line_num, $gene_stable_id )
123 # Transcript, e.g., "AT5G55930.1"
# An id of the form <letters/digits>.<digits> is treated as a transcript id;
# the part before the dot ($1) replaces $gene_stable_id.
125 if ( $gene_stable_id =~ /^([A-Z0-9]+) \. (\d+)$/xms ) {
126 my $transcript_id = $gene_stable_id;
127 $gene_stable_id = $1;
# TAIR_LOCUS_MODEL xref: accession and label are the transcript id, linked
# directly to that transcript.
129 my $tairl_xref_id = $self->add_xref({
130 source_id => $tairl_source_id,
131 species_id => $species_id,
132 info_type => $DIRECT,
133 acc => $transcript_id,
134 label => $transcript_id,
138 $self->add_direct_xref(
139 $tairl_xref_id, $transcript_id, $TRANSCRIPT, $DIRECT
142 $xrefs_added{ $TAIR_LOCUS_MODEL }++;
# TAIR_TRANSLATION xref: accession and label are the gene id, while the
# direct xref is attached to $transcript_id with the Translation type.
# NOTE(review): confirm that using the gene id as accession here (rather
# than the transcript id) is intended.
144 my $tairt_xref_id = $self->add_xref({
145 source_id => $tairt_source_id,
146 species_id => $species_id,
147 info_type => $DIRECT,
148 acc => $gene_stable_id,
149 label => $gene_stable_id,
153 $self->add_direct_xref(
154 $tairt_xref_id, $transcript_id, $TRANSLATION, $DIRECT
157 $xrefs_added{ $TAIR_TRANSLATION }++;
161 # Gene IDs for TAIR and NASC
# NOTE(review): $tairg_source_id is not declared in the visible lines;
# presumably it comes from $source_id or a lookup that was cut - confirm.
163 my $tairg_xref_id = $self->add_xref({
164 source_id => $tairg_source_id,
165 species_id => $species_id,
166 info_type => $DIRECT,
167 acc => $gene_stable_id,
168 label => $gene_stable_id,
172 $self->add_direct_xref(
173 $tairg_xref_id, $gene_stable_id, $GENE, $DIRECT
176 $xrefs_added{ $TAIR_LOCUS }++;
# The NASC gene xref reuses the gene id as accession; its label is the gene
# id with '-TAIR-G' appended.
178 my $nascg_xref_id = $self->add_xref({
179 source_id => $nascg_source_id,
180 species_id => $species_id,
181 info_type => $DIRECT,
182 acc => $gene_stable_id,
183 label => $gene_stable_id .
'-TAIR-G',
187 $self->add_direct_xref(
188 $nascg_xref_id, $gene_stable_id, $GENE, $DIRECT
191 $xrefs_added{ $NASC_GENE_ID }++;
194 # Symbols, e.g., "ATOPT1, OPT1"
# Match the leading "Symbols:" tag of the second header field (the
# replacement part of this substitution is cut off in this chunk), then
# split on commas and drop false entries (empty string or "0").
197 $symbol_str =~ s/^\s*Symbols:\s*
198 if ( my @symbols =
map { $_ || () } split /\s*,\s*/, $symbol_str ) {
# The first symbol is shifted off as the main symbol and gets its own xref
# under the TAIR_SYMBOL source.
199 if ( my $main_sym = shift @symbols ) {
200 my $sym_xref_id = $self->add_xref({
201 source_id => $tairs_source_id,
202 species_id => $species_id,
203 info_type => $DIRECT,
210 # Add only first symbol to the gene
212 $self->add_direct_xref(
213 $sym_xref_id, $gene_stable_id, $GENE, $DIRECT
216 $xrefs_added{ $TAIR_SYMBOL }++;
219 # Add the remainder as "external_synonym"
# Each remaining symbol is handled together with $main_sym and the symbol
# source id (the call itself is not visible here) and tallied under the
# separate 'SYNONYMS' key.
221 for my $symbol ( @symbols ) {
223 $main_sym, $tairs_source_id,
227 $xrefs_added{
'SYNONYMS'}++;
# Summary: one formatted line per key of %xrefs_added, in sorted key order.
240 sprintf
" - Added %9d %s xrefs", $xrefs_added{ $_ }, $_
242 sort keys %xrefs_added
246 return 0; # successful