12 --inputfile reference.fasta \
13 --max_chunk_length 700000 \
14 --output_prefix ref_chunk \
15 --flow_into "{ 2 => ['mysql://ensadmin:${ENSADMIN_PSW}@127.0.0.1/lg4_split_fasta/analysis?logic_name=blast']}"
19 This is a Bioinformatics-specific "Factory" Runnable that splits a given Fasta file into smaller chunks
20 and dataflows one job per chunk. Note that:
21 - the files are created in the current directory.
22 - the Runnable does not split the individual sequences, it only groups them in a way that none of the output files will
23 be longer than param('max_chunk_length').
24 - Thanks to BioPerl's versatility, the Runnable can in fact read many formats. Tune param('input_format') to do so.
26 The following parameters are supported:
28 param('inputfile'); # The original Fasta file: 'inputfile' => 'my_sequences.fasta'
30 param('max_chunk_length'); # Maximum total length of sequences in a chunk: 'max_chunk_length' => '200000'
32 param('max_chunk_size'); # Defines the maximum allowed number of sequences to be included in each output file.
34 param('seq_filter'); # Can be used to exclude sequences from output files, e.g. '^TF' would exclude all sequences starting with TF.
36 param('output_prefix'); # A common prefix for output files: 'output_prefix' => 'my_special_chunk_'
38 param('output_suffix'); # A common suffix for output files: 'output_suffix' => '.nt'
40 param('hash_directories'); # Boolean (defaults to 0): should the output files be put in different ("hashed") directories
42 param('input_format'); # The format of the input file (defaults to "fasta")
44 param('output_format'); # The format of the output file (defaults to the same as param('input_format'))
46 param('output_dir'); # Where to create the chunks (defaults to the current directory)
50 Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
51 Copyright [2016-2024] EMBL-European Bioinformatics Institute
53 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
54 You may obtain a copy of the License at
56 http://www.apache.org/licenses/LICENSE-2.0
58 Unless required by applicable law or agreed to in writing, software distributed under the License
59 is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
60 See the License for the specific language governing permissions and limitations under the License.
64 Please subscribe to the Hive mailing list: http://listserver.ebi.ac.uk/mailman/listinfo/ehive-users to discuss Hive-related questions or to be notified of our updates
69 package Bio::EnsEMBL::Hive::RunnableDB::FastaFactory;
79 use Bio::EnsEMBL::Hive::Utils ('dir_revhash
');
86 Description : Implements param_defaults() interface method of Bio::EnsEMBL::Hive::Process that defines module defaults for parameters.
93 'max_chunk_length
' => 100000,
94 'max_chunk_size
' => 0,
95 'output_prefix
' => 'my_chunk_
',
96 'output_suffix
' => '.#input_format#
',
97 'seq_filter
' => undef,
98 'hash_directories
' => 0,
99 'input_format
' => 'fasta
',
101 'output_format
' => '#input_format#
',
108 Description : Implements fetch_input() interface method of Bio::EnsEMBL::Hive::Process that is used to read in parameters and load data.
109 Here we only check the existence of 'inputfile
' parameter and try to parse it (all other parameters have defaults).
116 my $inputfile = $self->param_required('inputfile
');
117 die "Cannot read '$inputfile
'" unless(-r $inputfile);
120 if($inputfile=~/\.(?:gz|Z)$/) {
121 open(my $in_fh, '-|
', "gunzip -c $inputfile");
122 $input_seqio = Bio::SeqIO->new(-fh => $in_fh, -format => $self->param_required('input_format
'));
123 $self->param('input_fh
', $in_fh);
125 $input_seqio = Bio::SeqIO->new(-file => $inputfile);
126 $self->param('input_fh
', undef);
128 die "Could not open or parse '$inputfile
', please investigate" unless $input_seqio;
130 $self->param('input_seqio
', $input_seqio);
136 Description : Implements run() interface method of Bio::EnsEMBL::Hive::Process that is used to perform the main bulk of the job (minus input and output).
137 Because we want to stream the data more efficiently, all functionality is in write_output();
147 Description : Implements write_output() interface method of Bio::EnsEMBL::Hive::Process that is used to deal with job's output after the execution.
148 The main bulk of this Runnable's functionality is here.
149 Iterates through all sequences in input_seqio, splits them into separate files ("chunks") using a cut-off length and dataflows one job per chunk.
156 my $input_seqio = $self->param('input_seqio
');
157 my $max_chunk_length = $self->param('max_chunk_length
');
158 my $max_chunk_size = $self->param('max_chunk_size
');
159 my $output_prefix = $self->param('output_prefix
');
160 my $output_suffix = $self->param('output_suffix
');
161 my $output_dir = $self->param('output_dir
');
163 my $chunk_number = 1; # counts the chunks
164 my $chunk_length = 0; # total length of the current chunk
165 my $chunk_size = 0; # number of sequences in the current chunk
166 my $chunk_name = $output_prefix.$chunk_number.$output_suffix;
167 my $seq_filter = $self->param('seq_filter
');
169 # No need to check param('hash_directories
') because even in this mode
170 # the first file is in the required directory
173 $chunk_name = File::Spec->catfile($output_dir, $chunk_name);
175 my $chunk_seqio = Bio::SeqIO->new(-file => '>
'.$chunk_name, -format => $self->param_required('output_format
'));
177 while (my $seq_object = $input_seqio->next_seq) {
179 next if ( ( defined($seq_filter) ) && ( $seq_object->id =~ /$seq_filter/ ) );
181 $chunk_seqio->write_seq( $seq_object );
182 $chunk_length += $seq_object->length();
185 if (($max_chunk_length && ($chunk_length > $max_chunk_length)) or ($max_chunk_size && ($chunk_size > $max_chunk_size))) {
187 # dataflow the current chunk:
188 $self->dataflow_output_id( {
189 'chunk_name
' => $chunk_name,
190 'chunk_number
' => $chunk_number,
191 'chunk_length
' => $chunk_length,
192 'chunk_size
' => $chunk_size
195 # start writing to the next one:
199 $chunk_name = $output_prefix.$chunk_number.$output_suffix;
202 if ((defined $output_dir) and ($output_dir ne '')) {
203 push @partial_dirs, $output_dir;
205 if ($self->param('hash_directories
')) {
206 my $hash_dir = dir_revhash($chunk_number);
207 if ($hash_dir ne '') {
208 push @partial_dirs, $hash_dir;
211 my $dir_tree = File::Spec->catdir(@partial_dirs);
212 if ($dir_tree ne '') {
214 $chunk_name = File::Spec->catfile($dir_tree, $chunk_name);
216 $chunk_seqio = $chunk_seqio->new(-file => '>
'.$chunk_name);
220 if($chunk_size) { # flush the last chunk:
222 $self->dataflow_output_id( {
223 'chunk_name
' => $chunk_name,
224 'chunk_number
' => $chunk_number,
225 'chunk_length
' => $chunk_length,
226 'chunk_size
' => $chunk_size
230 unlink $chunk_name unless (stat($chunk_name))[7];
237 Description : Close the file handle open in fetch_input() even if the job fails or write_output never runs
243 close( $self->param('input_fh
') ) if $self->param('input_fh
');