3 See the NOTICE file distributed with
this work
for additional information
4 regarding copyright ownership.
6 Licensed under the Apache License, Version 2.0 (the
"License");
7 you may not use
this file except in compliance with the License.
8 You may obtain a copy of the License at
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an
"AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License
for the specific language governing permissions and
16 limitations under the License.
22 package XrefParser::FlybaseParser;
32 # The object types we'd like to parse.
33 our %object_types = ( gene => 1,
45 # The Dbxref name 'FlyBase_Annotation_IDs' will be associated with the
46 # source_name FlyBaseCGID_{gene,transcript,translation} depending on
47 # the type of 'ID' of the line.
49 # Likewise, the source_names FlyBaseName_{gene,transcript,translation}
50 # will be associated with the 'Name' of each entry depending on the type
# of 'ID' of the line.
53 # ... and the source_names flybase_{gene,transcript,translation}_id will
54 # be associated with the 'ID' of each entry depending on the type of
# 'ID' of the line.
57 # This hash will translate the Dbxref names in the data file into source
58 # names known by the Xref system.
59 # The protein-level FlyBase annotations (UniProt, SwissProt, Interpro) are
60 # not imported; they're attached to genes, which means that when
61 # we run our protein pipeline (at the translation level), those results are
62 # shifted to the gene level, which messes up the web display.
63 our %source_name_map = (
'FlyBase' =>
'flybase_annotation_id',
64 'BIOGRID' =>
'BioGRID',
66 'flyexpress' =>
'FlyExpress',
67 'FlyReactome' =>
'FlyReactome',
68 'GenomeRNAi' =>
'GenomeRNAi',
69 'INTERACTIVEFLY' =>
'InteractiveFly',
71 'MITODROME' =>
'MitoDrome',
76 # This is for source names that depend on the type of 'ID' of the line.
77 our %special_source_name_map = (
79 'Dbxref' =>
'FlyBaseCGID_gene',
80 'Name' =>
'FlyBaseName_gene',
81 'ID' =>
'flybase_gene_id'
84 'Dbxref' =>
'FlyBaseCGID_transcript',
85 'Name' =>
'FlyBaseName_transcript',
86 'ID' =>
'flybase_transcript_id'
89 'Dbxref' =>
'FlyBaseCGID_translation',
90 'Name' =>
'FlyBaseName_translation',
91 'ID' =>
'flybase_translation_id'
94 # This hash will eventually be populated with the source_id for each
# source name looked up through get_source_id_for_source_name().
# Return the source_id for $source_name, memoised in %source_id.
#
# Arguments:
#   $self          - the parser object; the actual lookup is delegated to
#                    the parent class via SUPER::.
#   $source_name   - name of the source to look up.
#   $priority_desc - passed through unchanged to the parent-class lookup.
#                    The cache is keyed on $source_name only, so this value
#                    is consulted only on the first lookup of a given name.
#
# Returns the cached value for $source_name.  A warning is issued via
# carp() when that value is undefined or negative, but the value is still
# returned to the caller.
#
# NOTE(review): %source_id and $verbose are not declared inside this sub;
# presumably they are file-scoped variables -- confirm.
98 sub get_source_id_for_source_name {
99 my ($self, $source_name, $priority_desc) = @_;
# Only ask the parent class when this name has not been cached yet.
101 if ( !defined( $source_id{$source_name} ) ) {
102 $source_id{$source_name} =
103 $self->SUPER::get_source_id_for_source_name($source_name, $priority_desc);
# Report the freshly resolved id when verbose output is enabled.
105 printf(
"source_id for source '%s' is %d\n",
106 $source_name, $source_id{$source_name} )
if ($verbose);
# Warn (without dying) when the lookup produced no usable id.
109 if ( !defined( $source_id{$source_name} ) || $source_id{$source_name} < 0 )
111 carp( sprintf(
"Can not find source_id for source '%s'", $source_name ) );
114 return $source_id{$source_name};
119 my ($self, $ref_arg) = @_;
120 my $source_id = $ref_arg->{source_id};
121 my $species_id = $ref_arg->{species_id};
122 my $files = $ref_arg->{files};
123 my $verbose = $ref_arg->{verbose};
125 if((!defined $source_id) or (!defined $species_id) or (!defined $files) ){
126 croak
"Need to pass source_id, species_id and files as pairs";
130 # Note: The import of the GO terms from the FlyBase GFF has been removed.
131 # Only Dmel is annotated with evidence codes by FlyBase, the other flies
132 # are inferred from Interpro analysis - so can be handled equally well by
133 # the GOParser (which maps them to translations rather than genes too).
134 # In addition, the evidence codes for Dmel are not even in the GFF
135 # file, and have to be patched across further down the line. A new Dmel-
136 # specific section has been added to GOParser to automate this, in the same
137 # way that C. elegans is done, for example.
139 print
"-------------------------\n";
140 print
"FlybaseParser::run species_id $species_id\n";
141 print
"-------------------------\n\n";
143 my $data_file = @{$files}[0];
147 my $data_io = $self->get_filehandle($data_file);
149 my ( $count_read, $count_skipped, $last_count_read ) = ( 0, 0, 0 );
151 my $status_interval = 30;
152 local $SIG{ALRM} = sub {
153 printf(
"%d lines read, %d skipped, %d parsed; %d lines/s\n",
154 $count_read, $count_skipped,
155 $count_read - $count_skipped,
156 ( $count_read - $last_count_read )/$status_interval )
if($verbose);
157 $last_count_read = $count_read;
158 alarm($status_interval);
160 alarm($status_interval);
162 while ( defined( my $line = $data_io->getline() ) ) {
165 # Skip comment lines at the start of the file.
166 if ( substr( $line, 0, 1 ) eq
'#' ) { ++$count_skipped; next }
170 # Split each line into fields.
171 my @fields = split( /\t/, $line );
173 # Only pick out the interesting lines.
175 !( defined( $fields[1] )
176 && $fields[1] eq
'FlyBase'
177 && defined( $fields[2] )
178 && exists( $object_types{ $fields[2] } ) ) )
184 # Go through each attribute (from the 9th field), split them up into
185 # key-value pairs and store them.
187 foreach my $attribute ( split( /;/, $fields[8] ) ) {
188 my ( $key, $value ) = split( /=/, $attribute );
189 if ( $key ne
'' && $value ne
'' ) {
190 $attributes{$key} = $value;
194 my $id = $attributes{
'ID'};
197 if ( substr( $id, 0, 4 ) eq
'FBgn' ) { $type =
'gene' }
198 elsif ( substr( $id, 0, 4 ) eq
'FBtr' ) { $type =
'transcript' }
199 elsif ( substr( $id, 0, 4 ) eq
'FBpp' ) { $type =
'translation' }
200 else { $type =
'unknown' }
202 if ( exists( $attributes{
'Dbxref'} ) ) {
204 foreach my $subattribute ( split( /,/, $attributes{
'Dbxref'} ) ) {
205 my ( $key, $value ) = split( /:/, $subattribute, 2 );
206 push( @{ $tmphash{$key} }, $value );
209 # Replace the attribute entry with the hash.
210 $attributes{
'Dbxref'} = \%tmphash;
213 # For the 'Alias' attributes, we split them up by commas
214 # but we can't divide them into key-value pairs. So, we'll store
# them as a reference to a plain array.
216 # Aliases will be stored as synonyms and will comprise secondary
217 # IDs from FlyBase to keep track of split/merged annotations.
218 my $alias_key =
'Alias';
219 if ( exists( $attributes{$alias_key} ) ) {
220 my @tmp_array = split( /,/, $attributes{$alias_key} );
221 $attributes{$alias_key} =\@tmp_array;
224 #----------------------------------------------------------------------
225 # Store Xrefs and Direct Xrefs for all the interesting Dbxref entries.
226 #----------------------------------------------------------------------
227 my $dbxref = $attributes{
'Dbxref'};
228 foreach my $dbxref_name ( keys( %{$dbxref} ) ) {
229 if ( exists( $source_name_map{$dbxref_name} ) ) {
230 my $source_name = $source_name_map{$dbxref_name};
231 my $source_id = $self->get_source_id_for_source_name($source_name);
233 foreach my $accession ( @{ $dbxref->{$dbxref_name} } ) {
235 if ( exists( $xref_ids{$source_name}{$accession} ) ) {
236 $xref_id = $xref_ids{$source_name}{$accession};
239 $self->add_xref({ acc => $accession,
241 source_id => $source_id,
242 species_id => $species_id,
243 info_type =>
'DIRECT'}
245 $xref_ids{$source_name}{$accession} = $xref_id;
247 $self->add_direct_xref( $xref_id, $id, $type,
'' );
252 #-------------------------------------------------------------------
253 # Store Xrefs and Direct Xrefs for the 'FlyBase_Annotation_IDs'
254 # Dbxref entry (depends on type of 'ID').
255 #-------------------------------------------------------------------
256 if ( exists( $dbxref->{
'FlyBase_Annotation_IDs'} ) ) {
257 my $source_name = $special_source_name_map{$type}{
'Dbxref'};
258 my $source_id = $self->get_source_id_for_source_name($source_name);
260 foreach my $accession ( @{ $dbxref->{
'FlyBase_Annotation_IDs'} } ) {
262 if ( exists( $xref_ids{$source_name}{$accession} ) ) {
263 $xref_id = $xref_ids{$source_name}{$accession};
266 $self->add_xref({ acc => $accession,
268 source_id => $source_id,
269 species_id => $species_id,
270 info_type =>
'DIRECT'}
272 $xref_ids{$source_name}{$accession} = $xref_id;
274 $self->add_direct_xref( $xref_id, $id, $type,
'' );
278 #----------------------------------------------------------------------
279 # Store Xref and Direct Xref for the 'Name' (depends on type of 'ID').
280 #----------------------------------------------------------------------
282 my $source_name = $special_source_name_map{$type}{
'Name'};
283 my $source_id = $self->get_source_id_for_source_name($source_name);
285 my $accession = $attributes{
'Name'};
287 # Names other than D. melanogaster start with D...\ (like Dper\β3galt6)
288 $accession =~ s/^D...\\
290 my $description = (defined($attributes{
'fullname'})) ? $attributes{
'fullname'} :
'';
292 # FlyBase uses %2C to distinguish from the , separator in the GFF dump;
293 # we have to put it back.
294 $description =~ s/%2C/,/g;
296 # Embedded newlines wreak havoc further down the line
297 $description =~ s/[\n\r]
298 # Also escape backslashes so that they aren't mistakenly interpreted as control characters
299 $description =~ s/\\/\\\\/gm;
303 if ( exists( $xref_ids{$source_name}{$accession} ) ) {
304 $xref_id = $xref_ids{$source_name}{$accession};
307 $self->add_xref({ acc => $id,
309 desc => $description,
310 source_id => $source_id,
311 species_id => $species_id,
312 info_type =>
'DIRECT'}
314 $xref_ids{$source_name}{$accession} = $xref_id;
316 $self->add_direct_xref( $xref_id, $id, $type,
'' );
319 #-------------------------------------------------------------------
320 # Store Xref and Direct Xref for the 'ID' (depends on type of 'ID').
321 #-------------------------------------------------------------------
323 my $source_name = $special_source_name_map{$type}{
'ID'};
324 my $source_id = $self->get_source_id_for_source_name($source_name);
329 if ( exists( $xref_ids{$source_name}{$accession} ) ) {
330 $xref_id = $xref_ids{$source_name}{$accession};
333 $self->add_xref({ acc => $accession,
335 source_id => $source_id,
336 species_id => $species_id,
337 info_type =>
'DIRECT'}
339 $xref_ids{$source_name}{$accession} = $xref_id;
341 $self->add_direct_xref( $xref_id, $id, $type,
'' );
343 #-------------------------------------------------------------------
344 # Now, if we have aliases for this gene/transcript/translation,
345 # store them in the external_synonym table.
346 #-------------------------------------------------------------------
348 if (defined ($attributes{$alias_key})) {
349 foreach my $alias (@{$attributes{$alias_key}}) {
350 # Skip synonyms with non-ASCII characters
351 next unless $alias =~ /^[\x00-\x7F]+$/;
352 # Embedded newlines wreak havoc further down the line
354 $self->add_synonym($xref_id, $alias);
365 print(
"FlybaseParser Summary:\n");
366 print(
"--------------------------------------------------------------\n");
367 foreach my $label ( sort( keys(%xref_ids) ) ) {
368 my $accessions = $xref_ids{$label};
369 printf(
"\t%-32s %6d\n", $label, scalar( keys( %{$accessions} ) ) );
371 print(
"--------------------------------------------------------------\n");