ensembl-hive  2.6
FastaFactory.pm
Go to the documentation of this file.
1 =pod
2 
3 =head1 NAME
4 
6 
7 =head1 SYNOPSIS
8 
9  standaloneJob.pl Bio::EnsEMBL::Hive::RunnableDB::FastaFactory --inputfile reference.fasta --max_chunk_length 600000
10 
12  --inputfile reference.fasta \
13  --max_chunk_length 700000 \
14  --output_prefix ref_chunk \
15  --flow_into "{ 2 => ['mysql://ensadmin:${ENSADMIN_PSW}@127.0.0.1/lg4_split_fasta/analysis?logic_name=blast']}"
16 
17 =head1 DESCRIPTION
18 
19  This is a Bioinformatics-specific "Factory" Runnable that splits a given Fasta file into smaller chunks
20  and dataflows one job per chunk. Note that:
21  - the files are created in the current directory.
22  - the Runnable does not split the individual sequences, it only groups them in a way that none of the output files will
23  be longer than param('max_chunk_length').
24  - Thanks to BioPerl's versatility, the Runnable can in fact read many formats. Tune param('input_format') to do so.
25 
26  The following parameters are supported:
27 
28  param('inputfile'); # The original Fasta file: 'inputfile' => 'my_sequences.fasta'
29 
30  param('max_chunk_length'); # Maximum total length of sequences in a chunk: 'max_chunk_length' => '200000'
31 
32  param('max_chunk_size'); # Defines the maximum allowed number of sequences to be included in each output file.
33 
34  param('seq_filter'); # Can be used to exclude sequences from output files. e.g. '^TF' would exclude all sequences starting with TF.
35 
36  param('output_prefix'); # A common prefix for output files: 'output_prefix' => 'my_special_chunk_'
37 
38  param('output_suffix'); # A common suffix for output files: 'output_suffix' => '.nt'
39 
40  param('hash_directories'); # Boolean (default to 0): should the output files be put in different ("hashed") directories
41 
42  param('input_format'); # The format of the input file (defaults to "fasta")
43 
44  param('output_format'); # The format of the output file (defaults to the same as param('input_format'))
45 
46  param('output_dir'); # Where to create the chunks (defaults to the current directory)
47 
48 =head1 LICENSE
49 
50  Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
51  Copyright [2016-2024] EMBL-European Bioinformatics Institute
52 
53  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
54  You may obtain a copy of the License at
55 
56  http://www.apache.org/licenses/LICENSE-2.0
57 
58  Unless required by applicable law or agreed to in writing, software distributed under the License
59  is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
60  See the License for the specific language governing permissions and limitations under the License.
61 
62 =head1 CONTACT
63 
64  Please subscribe to the Hive mailing list: http://listserver.ebi.ac.uk/mailman/listinfo/ehive-users to discuss Hive-related questions or to be notified of our updates
65 
66 =cut
67 
68 
69 package Bio::EnsEMBL::Hive::RunnableDB::FastaFactory;
70 
71 use strict;
72 use warnings;
73 
74 use Bio::SeqIO;
75 use File::Path;
76 use File::Spec;
77 
78 
79 use Bio::EnsEMBL::Hive::Utils ('dir_revhash');
80 
81 use base ('Bio::EnsEMBL::Hive::Process');
82 
83 
84 =head2 param_defaults
85 
86  Description : Implements param_defaults() interface method of Bio::EnsEMBL::Hive::Process that defines module defaults for parameters.
87 
88 =cut
89 
90 sub param_defaults {
91 
92  return {
93  'max_chunk_length' => 100000,
94  'max_chunk_size' => 0,
95  'output_prefix' => 'my_chunk_',
96  'output_suffix' => '.#input_format#',
97  'seq_filter' => undef,
98  'hash_directories' => 0,
99  'input_format' => 'fasta',
100  'output_dir' => '',
101  'output_format' => '#input_format#',
102  };
103 }
104 
105 
106 =head2 fetch_input
107 
108  Description : Implements fetch_input() interface method of Bio::EnsEMBL::Hive::Process that is used to read in parameters and load data.
109  Here we only check the existence of 'inputfile' parameter and try to parse it (all other parameters have defaults).
110 
111 =cut
112 
113 sub fetch_input {
114  my $self = shift @_;
115 
116  my $inputfile = $self->param_required('inputfile');
117  die "Cannot read '$inputfile'" unless(-r $inputfile);
118 
119  my $input_seqio;
120  if($inputfile=~/\.(?:gz|Z)$/) {
121  open(my $in_fh, '-|', "gunzip -c $inputfile");
122  $input_seqio = Bio::SeqIO->new(-fh => $in_fh, -format => $self->param_required('input_format'));
123  $self->param('input_fh', $in_fh);
124  } else {
125  $input_seqio = Bio::SeqIO->new(-file => $inputfile);
126  $self->param('input_fh', undef);
127  }
128  die "Could not open or parse '$inputfile', please investigate" unless $input_seqio;
129 
130  $self->param('input_seqio', $input_seqio);
131 }
132 
133 
134 =head2 run
135 
136  Description : Implements run() interface method of Bio::EnsEMBL::Hive::Process that is used to perform the main bulk of the job (minus input and output).
137  Because we want to stream the data more efficiently, all functionality is in write_output();
138 
139 =cut
140 
141 sub run {
142 }
143 
144 
145 =head2 write_output
146 
147  Description : Implements write_output() interface method of Bio::EnsEMBL::Hive::Process that is used to deal with job's output after the execution.
148  The main bulk of this Runnable's functionality is here.
149  Iterates through all sequences in input_seqio, splits them into separate files ("chunks") using a cut-off length and dataflows one job per chunk.
150 
151 =cut
152 
153 sub write_output {
154  my $self = shift @_;
155 
156  my $input_seqio = $self->param('input_seqio');
157  my $max_chunk_length = $self->param('max_chunk_length');
158  my $max_chunk_size = $self->param('max_chunk_size');
159  my $output_prefix = $self->param('output_prefix');
160  my $output_suffix = $self->param('output_suffix');
161  my $output_dir = $self->param('output_dir');
162 
163  my $chunk_number = 1; # counts the chunks
164  my $chunk_length = 0; # total length of the current chunk
165  my $chunk_size = 0; # number of sequences in the current chunk
166  my $chunk_name = $output_prefix.$chunk_number.$output_suffix;
167  my $seq_filter = $self->param('seq_filter');
168 
169  # No need to check param('hash_directories') because even in this mode
170  # the first file is in the required directory
171  if ($output_dir) {
172  mkpath($output_dir);
173  $chunk_name = File::Spec->catfile($output_dir, $chunk_name);
174  }
175  my $chunk_seqio = Bio::SeqIO->new(-file => '>'.$chunk_name, -format => $self->param_required('output_format'));
176 
177  while (my $seq_object = $input_seqio->next_seq) {
178 
179  next if ( ( defined($seq_filter) ) && ( $seq_object->id =~ /$seq_filter/ ) );
180 
181  $chunk_seqio->write_seq( $seq_object );
182  $chunk_length += $seq_object->length();
183  $chunk_size += 1;
184 
185  if (($max_chunk_length && ($chunk_length > $max_chunk_length)) or ($max_chunk_size && ($chunk_size > $max_chunk_size))) {
186 
187  # dataflow the current chunk:
188  $self->dataflow_output_id( {
189  'chunk_name' => $chunk_name,
190  'chunk_number' => $chunk_number,
191  'chunk_length' => $chunk_length,
192  'chunk_size' => $chunk_size
193  }, 2);
194 
195  # start writing to the next one:
196  $chunk_length = 0;
197  $chunk_size = 0;
198  $chunk_number++;
199  $chunk_name = $output_prefix.$chunk_number.$output_suffix;
200 
201  my @partial_dirs;
202  if ((defined $output_dir) and ($output_dir ne '')) {
203  push @partial_dirs, $output_dir;
204  }
205  if ($self->param('hash_directories')) {
206  my $hash_dir = dir_revhash($chunk_number);
207  if ($hash_dir ne '') {
208  push @partial_dirs, $hash_dir;
209  }
210  }
211  my $dir_tree = File::Spec->catdir(@partial_dirs);
212  if ($dir_tree ne '') {
213  mkpath($dir_tree);
214  $chunk_name = File::Spec->catfile($dir_tree, $chunk_name);
215  }
216  $chunk_seqio = $chunk_seqio->new(-file => '>'.$chunk_name);
217  }
218  }
219 
220  if($chunk_size) { # flush the last chunk:
221 
222  $self->dataflow_output_id( {
223  'chunk_name' => $chunk_name,
224  'chunk_number' => $chunk_number,
225  'chunk_length' => $chunk_length,
226  'chunk_size' => $chunk_size
227  }, 2);
228 
229  } else {
230  unlink $chunk_name unless (stat($chunk_name))[7];
231  }
232 }
233 
234 
235 =head2 post_cleanup
236 
237  Description : Close the file handle open in fetch_input() even if the job fails or write_output never runs
238 
239 =cut
240 
241 sub post_cleanup {
242  my $self = shift;
243  close( $self->param('input_fh') ) if $self->param('input_fh');
244 }
245 
246 1;
247 
Bio::EnsEMBL::Hive::Process
Definition: Process.pm:77
main
public main()
Bio::EnsEMBL::Hive::RunnableDB::FastaFactory
Definition: FastaFactory.pm:53