ensembl-hive  2.7.0
FlybaseParser.pm
Go to the documentation of this file.
1 =head1 LICENSE
2 
3 See the NOTICE file distributed with this work for additional information
4 regarding copyright ownership.
5 
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9 
10  http://www.apache.org/licenses/LICENSE-2.0
11 
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 
18 =cut
19 
20 # $Id$
21 
22 package XrefParser::FlybaseParser;
23 
24 use strict;
25 use warnings;
26 
27 use Carp;
28 
29 use base qw( XrefParser::BaseParser );
# Verbosity flag consulted by get_source_id_for_source_name() before it
# prints the source id it has just looked up.
my $verbose;
31 
# GFF feature types (third column) for which xrefs are extracted; run()
# skips every line whose type is not a key of this hash.
our %object_types = map { $_ => 1 } qw(
  gene       mRNA   protein
  pseudogene miRNA  ncRNA
  pre_miRNA  rRNA   snoRNA
  snRNA      tRNA
);
44 
# The Dbxref name 'FlyBase_Annotation_IDs' will be associated with the
# source_name FlyBaseCGID_{gene,transcript,translation} depending on
# the type of 'ID' of the line.
48 #
49 # Likewise, the source_names FlyBaseName_{gene,transcript,translation}
50 # will be associated with the 'Name' of each entry depending on the type
51 # of 'ID'.
52 #
53 # ... and the source_names flybase_{gene,transcript,translation}_id will
54 # be associated with the 'ID' of each entry depending on the type of
55 # 'ID'.
56 
# Translation table from the database names used inside the 'Dbxref'
# attribute of the data file (keys) to the source names known to the Xref
# system (values).  Dbxref names that are not listed here are ignored by
# the generic Dbxref handling in run().
#
# The protein-level FlyBase annotations (UniProt, SwissProt, Interpro) are
# deliberately left out: FlyBase attaches them to genes, so importing them
# would shift the results of our protein pipeline (which works at the
# translation level) to the gene level and mess up the web display.
our %source_name_map = (
  # FlyBase's own identifiers
  'FlyBase' => 'flybase_annotation_id',

  # External resources
  'BIOGRID'        => 'BioGRID',
  'EPD'            => 'EPD',
  'FlyReactome'    => 'FlyReactome',
  'GenomeRNAi'     => 'GenomeRNAi',
  'INTERACTIVEFLY' => 'InteractiveFly',
  'MIR'            => 'miRBase',
  'MITODROME'      => 'MitoDrome',
  'Rfam'           => 'Rfam',
  'TF'             => 'TransFac',
  'flyexpress'     => 'FlyExpress',
);
75 
# Source names that depend on the kind of object a line describes.  For
# each of 'gene', 'transcript' and 'translation' the inner hash holds:
#   'Dbxref' => FlyBaseCGID_<object>   (e.g. FlyBaseCGID_gene)
#   'Name'   => FlyBaseName_<object>   (e.g. FlyBaseName_transcript)
#   'ID'     => flybase_<object>_id    (e.g. flybase_translation_id)
our %special_source_name_map = map {
  my $object = $_;
  ( $object => {
      'Dbxref' => "FlyBaseCGID_$object",
      'Name'   => "FlyBaseName_$object",
      'ID'     => "flybase_${object}_id",
    } );
} qw( gene transcript translation );
93 
# Cache of source ids, keyed by source name.  It starts out empty and is
# filled by get_source_id_for_source_name() the first time each source
# name is requested.
our %source_id;
97 
# Return the source id that belongs to a source name, remembering the
# answer in %source_id so that the parent class is asked only once per
# name.
#
#   $source_name   - name of the source to look up
#   $priority_desc - passed through unchanged to the parent implementation
#
# A warning is issued (via carp) whenever the resulting id is undefined or
# negative; the value is returned to the caller either way.
sub get_source_id_for_source_name {
  my ( $self, $source_name, $priority_desc ) = @_;

  unless ( defined $source_id{$source_name} ) {
    # Not cached yet: delegate to the parent class and keep the result.
    my $looked_up =
      $self->SUPER::get_source_id_for_source_name( $source_name,
                                                   $priority_desc );
    $source_id{$source_name} = $looked_up;

    if ($verbose) {
      printf( "source_id for source '%s' is %d\n",
              $source_name, $source_id{$source_name} );
    }
  }

  my $cached_id = $source_id{$source_name};

  carp( sprintf( "Can not find source_id for source '%s'", $source_name ) )
    if ( !defined($cached_id) || $cached_id < 0 );

  return $cached_id;
}
116 
# Parse a FlyBase GFF file and register xrefs for the genes, transcripts
# and translations it describes.
#
# Arguments: a single hash reference with the keys
#   source_id  - must be defined (the source ids that are actually stored
#                are looked up by source name for every xref)
#   species_id - stored with every xref that is created
#   files      - array reference; only the first file is parsed
#   verbose    - optional; enables progress and summary output
#
# For every line whose source column is 'FlyBase' and whose type is listed
# in %object_types, xrefs and direct xrefs are stored for
#   * the Dbxref entries listed in %source_name_map,
#   * the 'FlyBase_Annotation_IDs' Dbxref entries,
#   * the 'Name' attribute (with 'fullname' as its description), and
#   * the 'ID' attribute, whose xref also receives the 'Alias' values as
#     synonyms.
# Lines without an 'ID' attribute, or whose ID does not start with FBgn,
# FBtr or FBpp, are skipped with a warning because no source names are
# defined for them in %special_source_name_map.
#
# Returns 0 once the file has been processed.  Croaks when a required
# argument is missing or when no filehandle could be obtained; any error
# raised while parsing is re-thrown after the status alarm is cancelled.
sub run {
  my ( $self, $ref_arg ) = @_;

  my $species_id = $ref_arg->{species_id};
  my $files      = $ref_arg->{files};

  # Assign to the file-level $verbose instead of declaring a new lexical,
  # so that get_source_id_for_source_name() honours the flag as well.
  $verbose = $ref_arg->{verbose};

  if (    !defined( $ref_arg->{source_id} )
       or !defined($species_id)
       or !defined($files) )
  {
    croak "Need to pass source_id, species_id and files as pairs";
  }
  $verbose |= 0;

  # Note: The import of the GO terms from the FlyBase GFF has been removed.
  # Only Dmel is annotated with evidence codes by FlyBase, the other flies
  # are inferred from Interpro analysis - so can be handled equally well by
  # the GOParser (which maps them to translations rather than genes too).
  # In addition, the evidence codes for Dmel are not even in the GFF
  # file, and have to be patched across further down the line. A new Dmel-
  # specific section has been added to GOParser to automate this, in the
  # same way that C. elegans is done, for example.

  print "-------------------------\n";
  print "FlybaseParser::run species_id $species_id\n";
  print "-------------------------\n\n";

  my $data_file = $files->[0];

  my $data_io = $self->get_filehandle($data_file);
  if ( !defined($data_io) ) {
    # NOTE(review): assumes get_filehandle() signals failure by returning
    # undef; without this check the loop below would die with an
    # unhelpful "Can't call method" error.
    croak( sprintf( "Could not open FlyBase data file '%s'",
                    defined($data_file) ? $data_file : '(undefined)' ) );
  }

  # xref ids that have already been created: $xref_ids{source name}{key}.
  my %xref_ids;

  # Fetch the xref cached under ($source_name, $cache_key), creating it
  # from $xref_args first if it has not been seen yet, then attach it to
  # $stable_id as a direct xref.  Returns the xref id.
  my $store_direct_xref = sub {
    my ( $source_name, $cache_key, $xref_args, $stable_id, $type ) = @_;

    if ( !exists( $xref_ids{$source_name}{$cache_key} ) ) {
      $xref_ids{$source_name}{$cache_key} =
        $self->add_xref( { %{$xref_args},
                           species_id => $species_id,
                           info_type  => 'DIRECT' } );
    }

    my $xref_id = $xref_ids{$source_name}{$cache_key};
    $self->add_direct_xref( $xref_id, $stable_id, $type, '' );

    return $xref_id;
  };

  # The first four characters of the ID decide what kind of object a line
  # describes.
  my %type_for_prefix = ( 'FBgn' => 'gene',
                          'FBtr' => 'transcript',
                          'FBpp' => 'translation' );

  my ( $count_read, $count_skipped, $last_count_read ) = ( 0, 0, 0 );

  my $status_interval = 30;
  local $SIG{ALRM} = sub {
    printf( "%d lines read, %d skipped, %d parsed; %d lines/s\n",
            $count_read, $count_skipped,
            $count_read - $count_skipped,
            ( $count_read - $last_count_read )/$status_interval )
      if ($verbose);
    $last_count_read = $count_read;
    alarm($status_interval);
  };
  alarm($status_interval);

  # The parsing runs inside eval so that the alarm can be cancelled even
  # when something dies; otherwise a pending SIGALRM would be delivered
  # after the local handler above has been removed again.
  my $completed = eval {

  LINE:
    while ( defined( my $line = $data_io->getline() ) ) {
      ++$count_read;

      # Skip comment lines.
      if ( substr( $line, 0, 1 ) eq '#' ) { ++$count_skipped; next LINE }

      chomp($line);

      # Split each line into fields.
      my @fields = split( /\t/, $line );

      # Only pick out the interesting lines.
      if (
           !(    defined( $fields[1] )
              && $fields[1] eq 'FlyBase'
              && defined( $fields[2] )
              && exists( $object_types{ $fields[2] } ) ) )
      {
        ++$count_skipped;
        next LINE;
      }

      # Go through each attribute (from the 9th field), split them up
      # into key-value pairs and store them.  The limit of 2 keeps any
      # further '=' as part of the value; pairs with a missing or empty
      # key or value are ignored.
      my %attributes;
      my $attribute_field = defined( $fields[8] ) ? $fields[8] : '';
      foreach my $attribute ( split( /;/, $attribute_field ) ) {
        my ( $key, $value ) = split( /=/, $attribute, 2 );
        next if ( !defined($key)   || $key eq '' );
        next if ( !defined($value) || $value eq '' );
        $attributes{$key} = $value;
      }

      my $id = $attributes{'ID'};
      my $type =
        defined($id) ? $type_for_prefix{ substr( $id, 0, 4 ) } : undef;

      if ( !defined($type) ) {
        # There is nothing this line could be attached to.
        carp( sprintf( "Skipping line %d: missing or unrecognised ID '%s'",
                       $count_read, defined($id) ? $id : '' ) );
        ++$count_skipped;
        next LINE;
      }

      # Dbxref is a comma separated list of 'database:accession' pairs;
      # collect the accessions per database name.  Entries without an
      # accession are ignored.
      my %dbxref;
      if ( exists( $attributes{'Dbxref'} ) ) {
        foreach my $subattribute ( split( /,/, $attributes{'Dbxref'} ) ) {
          my ( $db_name, $accession ) = split( /:/, $subattribute, 2 );
          next if ( !defined($accession) || $accession eq '' );
          push( @{ $dbxref{$db_name} }, $accession );
        }
      }

      # The 'Alias' attribute is a plain comma separated list.  Aliases
      # will be stored as synonyms and will comprise secondary IDs from
      # FlyBase to keep track of split/merged annotations.
      my @aliases;
      if ( exists( $attributes{'Alias'} ) ) {
        @aliases = split( /,/, $attributes{'Alias'} );
      }

      #------------------------------------------------------------------
      # Store Xrefs and Direct Xrefs for all the interesting Dbxref
      # entries.  The names are sorted so that xrefs are created in the
      # same order on every run.
      #------------------------------------------------------------------
      foreach my $dbxref_name ( sort( keys(%dbxref) ) ) {
        next if ( !exists( $source_name_map{$dbxref_name} ) );

        my $source_name = $source_name_map{$dbxref_name};
        my $source_id = $self->get_source_id_for_source_name($source_name);

        foreach my $accession ( @{ $dbxref{$dbxref_name} } ) {
          $store_direct_xref->( $source_name,
                                $accession,
                                { acc       => $accession,
                                  label     => $accession,
                                  source_id => $source_id },
                                $id, $type );
        }
      }

      #------------------------------------------------------------------
      # Store Xrefs and Direct Xrefs for the 'FlyBase_Annotation_IDs'
      # Dbxref entry (depends on type of 'ID').
      #------------------------------------------------------------------
      if ( exists( $dbxref{'FlyBase_Annotation_IDs'} ) ) {
        my $source_name = $special_source_name_map{$type}{'Dbxref'};
        my $source_id = $self->get_source_id_for_source_name($source_name);

        foreach my $accession ( @{ $dbxref{'FlyBase_Annotation_IDs'} } ) {
          $store_direct_xref->( $source_name,
                                $accession,
                                { acc       => $accession,
                                  label     => $accession,
                                  source_id => $source_id },
                                $id, $type );
        }
      }

      #------------------------------------------------------------------
      # Store Xref and Direct Xref for the 'Name' (depends on type of
      # 'ID').  Lines without a 'Name' get no such xref.
      #------------------------------------------------------------------
      if ( defined( $attributes{'Name'} ) ) {
        my $source_name = $special_source_name_map{$type}{'Name'};
        my $source_id = $self->get_source_id_for_source_name($source_name);

        my $label = $attributes{'Name'};

        # Names other than D. melanogaster start with D...\ (like
        # Dper\β3galt6)
        $label =~ s/^D...\\//;

        my $description =
          defined( $attributes{'fullname'} ) ? $attributes{'fullname'} : '';

        # FlyBase use %2C to distinguish from the , separator in the GFF
        # dump; we have to put it back
        $description =~ s/%2C/,/g;

        # Embedded newlines wreak havoc further down the line
        $description =~ s/[\n\r]//gm;

        # And slashes to ensure that slashes aren't mistakenly
        # interpreted as control characters
        $description =~ s/\\/\\\\/gm;

        # NOTE(review): the xref is cached by its label although its
        # accession is the ID, so two IDs sharing a Name end up on the
        # same xref.  Kept as it was - confirm that this is intended.
        $store_direct_xref->( $source_name,
                              $label,
                              { acc       => $id,
                                label     => $label,
                                desc      => $description,
                                source_id => $source_id },
                              $id, $type );
      }

      #------------------------------------------------------------------
      # Store Xref and Direct Xref for the 'ID' (depends on type of 'ID').
      #------------------------------------------------------------------
      {
        my $source_name = $special_source_name_map{$type}{'ID'};
        my $source_id = $self->get_source_id_for_source_name($source_name);

        my $xref_id =
          $store_direct_xref->( $source_name,
                                $id,
                                { acc       => $id,
                                  label     => $id,
                                  source_id => $source_id },
                                $id, $type );

        #----------------------------------------------------------------
        # Now, if we have aliases for this gene/transcript/translation,
        # store them as synonyms of the ID xref.
        #----------------------------------------------------------------
        foreach my $alias (@aliases) {
          # Skip synonyms with non-ASCII characters
          next unless $alias =~ /^[\x00-\x7F]+$/;

          # Embedded newlines wreak havoc further down the line
          $alias =~ s/[\n\r]//gm;
          next if ( $alias eq '' );

          $self->add_synonym( $xref_id, $alias );
        }
      }
    } ## end LINE: while

    1;
  };
  my $error = $@;

  alarm(0);
  $data_io->close();

  if ( !$completed ) {
    # Re-throw whatever interrupted the parsing, unchanged.
    die $error;
  }

  if ($verbose) {
    print("FlybaseParser Summary:\n");
    print("--------------------------------------------------------------\n");
    foreach my $label ( sort( keys(%xref_ids) ) ) {
      my $accessions = $xref_ids{$label};
      printf( "\t%-32s %6d\n", $label, scalar( keys( %{$accessions} ) ) );
    }
    print("--------------------------------------------------------------\n");
  }

  return 0;
}
376 
377 1;
XrefParser::BaseParser
Definition: BaseParser.pm:8
run
public run()