# Split the sequences read from param('input_seqio') into consecutive chunk
# files and call dataflow_output_id(..., 2) once per non-empty chunk with the
# chunk's name, number, total sequence length and sequence count.
#
# Parameters read through $self:
#   input_seqio       - object whose next_seq() yields the sequences to split
#   max_chunk_length  - if true, a chunk is closed once its summed sequence
#                       length exceeds this value
#   max_chunk_size    - if true, a chunk is closed once its sequence count
#                       exceeds this value
#   output_prefix,
#   output_suffix     - the chunk file name is prefix, chunk number and suffix
#                       concatenated
#   output_dir        - optional directory that receives the chunk files
#   hash_directories  - if true, chunks after the first are placed in the
#                       sub-directory returned by dir_revhash(chunk_number)
#   seq_filter        - optional regex; sequences whose id matches are skipped
#   output_format     - (required) format handed to Bio::SeqIO for writing
#
# Both limits are tested after a sequence has been written, so the sequence
# that crosses a limit stays in the chunk it was written to.
my $self = shift @_;

my $input_seqio      = $self->param('input_seqio');
my $max_chunk_length = $self->param('max_chunk_length');
my $max_chunk_size   = $self->param('max_chunk_size');
my $output_prefix    = $self->param('output_prefix');
my $output_suffix    = $self->param('output_suffix');
my $output_dir       = $self->param('output_dir');

my $chunk_number = 1;   # counts the chunks
my $chunk_length = 0;   # total length of the current chunk
my $chunk_size   = 0;   # number of sequences in the current chunk
my $chunk_name   = $output_prefix.$chunk_number.$output_suffix;
my $seq_filter   = $self->param('seq_filter');

# Decide once whether an output directory was given, so that the first chunk
# and every later chunk are placed by exactly the same rule.
my $has_output_dir = (defined $output_dir) && ($output_dir ne '');

# No need to check param('hash_directories') because even in this mode
# the first file is in the required directory
if ($has_output_dir) {
    mkpath($output_dir);
    $chunk_name = File::Spec->catfile($output_dir, $chunk_name);
}
my $chunk_seqio = Bio::SeqIO->new(-file => '>'.$chunk_name, -format => $self->param_required('output_format'));

# Finish the current chunk: close its writer first, so the file is complete
# before the chunk is announced, then dataflow its description.
my $emit_chunk = sub {
    $chunk_seqio->close();
    $self->dataflow_output_id( {
        'chunk_name'   => $chunk_name,
        'chunk_number' => $chunk_number,
        'chunk_length' => $chunk_length,
        'chunk_size'   => $chunk_size
    }, 2);
};

while (my $seq_object = $input_seqio->next_seq) {
    next if ( ( defined($seq_filter) ) && ( $seq_object->id =~ /$seq_filter/ ) );

    $chunk_seqio->write_seq( $seq_object );
    $chunk_length += $seq_object->length();
    $chunk_size   += 1;

    # A limit that is unset (or 0) is never considered exceeded.
    my $length_exceeded = $max_chunk_length && ($chunk_length > $max_chunk_length);
    my $size_exceeded   = $max_chunk_size   && ($chunk_size   > $max_chunk_size);
    next unless ($length_exceeded or $size_exceeded);

    # dataflow the current chunk:
    $emit_chunk->();

    # start writing to the next one:
    $chunk_length = 0;
    $chunk_size   = 0;
    $chunk_number++;
    $chunk_name = $output_prefix.$chunk_number.$output_suffix;

    my @partial_dirs;
    push @partial_dirs, $output_dir if $has_output_dir;
    if ($self->param('hash_directories')) {
        my $hash_dir = dir_revhash($chunk_number);
        push @partial_dirs, $hash_dir if ($hash_dir ne '');
    }

    # With neither an output directory nor a hash directory the chunk name
    # stays a bare file name.
    my $dir_tree = File::Spec->catdir(@partial_dirs);
    if ($dir_tree ne '') {
        mkpath($dir_tree);
        $chunk_name = File::Spec->catfile($dir_tree, $chunk_name);
    }

    # new() is invoked on the previous writer object and no -format is given;
    # presumably the new writer inherits the format from that object's class
    # (confirm against Bio::SeqIO).
    $chunk_seqio = $chunk_seqio->new(-file => '>'.$chunk_name);
}

if ($chunk_size) {   # flush the last chunk:
    $emit_chunk->();
}
else {
    # Nothing was written to the most recently opened file: release the
    # handle before removing the file, and only remove it if it is empty.
    $chunk_seqio->close();
    unlink $chunk_name unless -s $chunk_name;
}
}