ensembl-hive  2.7.0
CountKmers.pm
Go to the documentation of this file.
1 =pod
2 
3 =head1 NAME
4 
6 
7 =head1 SYNOPSIS
8 
9  Please refer to Bio::EnsEMBL::Hive::Examples::Kmer::PipeConfig::Kmer_conf pipeline configuration file
10  to understand how this particular example pipeline is configured and ran.
11 
12 =head1 DESCRIPTION
13 
14  'Bio::EnsEMBL::Hive::Examples::Kmer::RunnableDB::CountKmers is a runnable that counts the number of k-mers in a DNA sequence.
15 
16  As input, it takes the name of a file with a DNA sequence,
17  the format of that file (which must be supported by Bio::SeqIO),
18  k (in bases),
19 
20  k must be >= 1.
21 
22  It flows out each k-mer seen, and the count of the k-mer in the (sub)sequence as key-value pairs.
23  The key is the k-mer, and the value being the count.
24 
25 =head1 LICENSE
26 
27  See the NOTICE file distributed with this work for additional information
28  regarding copyright ownership.
29 
30  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
31  You may obtain a copy of the License at
32 
33  http://www.apache.org/licenses/LICENSE-2.0
34 
35  Unless required by applicable law or agreed to in writing, software distributed under the License
36  is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
37  See the License for the specific language governing permissions and limitations under the License.
38 
39 =head1 CONTACT
40 
41  Please subscribe to the Hive mailing list: http://listserver.ebi.ac.uk/mailman/listinfo/ehive-users to discuss Hive-related questions or to be notified of our updates
42 
43 =cut
44 
45 package Bio::EnsEMBL::Hive::Examples::Kmer::RunnableDB::CountKmers;
46 
47 use strict;
48 use warnings;
49 
50 use Bio::SeqIO;
51 
52 use Data::Dumper;
53 
54 use base ('Bio::EnsEMBL::Hive::Process');
55 
56 =head2 fetch_input
57 
58  Description : Implements fetch_input() interface method of Bio::EnsEMBL::Hive::Process that is used to read in parameters and load data.
59  There are no hard and fast rules on whether to fetch parameters in fetch_input(), or to wait until run() to fetch them.
60  In general, fetch_input() is a place to validate parameter existence and values for errors before the worker get set into RUN state
61  from the FETCH_INPUT state. In this case, since it's a simple computation, we don't do anything in fetch_input() and instead just
62  handle the parameters in run()
63 
64 =cut
65 
66 sub fetch_input {
67 
68 }
69 
70 =head2 run
71 
72  Description : Implements run() interface method of Bio::EnsEMBL::Hive::Process that is used to perform the main bulk of the job (minus input and output).
73 
74  Here, run() reads in the filename of a file containing a DNA sequence,
75  the format of that file (e.g. 'FASTA' - the format must be supported by Bio::SeqIO),
76  and k,
77 
78 
79  param('sequence_file'): Name of the file containing the sequence
80  param('input_format'): Format of the sequence file (e.g. FASTA)
81  param('k'): k
82 
83 =cut
84 
85 sub run {
86  my $self = shift;
87 
88  my $sequence_file = $self->param_required('sequence_file');
89  my $input_format = $self->param('input_format');
90  my $k = $self->param_required('k');
91 
92  my $input_seqio;
93  if ( $sequence_file =~ /\.(?:gz|Z)$/) {
94  open(my $in_fh, '-|', "gunzip -c $sequence_file");
95  $input_seqio = Bio::SeqIO->new(-fh => $in_fh, -format => $self->param_required('input_format'));
96  $self->param('input_fh', $in_fh); # storing as a param so that it can be closed by post_cleanup if necessary
97  } else {
98  $input_seqio = Bio::SeqIO->new(-file => $sequence_file);
99  }
100  die "Could not open or parse '$sequence_file', please investigate" unless $input_seqio;
101 
102  my $kmer_counts = _count_kmers($input_seqio, $k);
103  $self->param('kmer_counts', $kmer_counts);
104 
105 }
106 
107 =head2 write_output
108 
109  Description: Implements the write_output() interface method of Bio::EnsEMBL::Hive::Process that is used to flow output to the rest of the pipeline.
110 
111  This is where we dataflow the kmer counts out to the rest of the pipeline. We do this in three different ways on three
112  different branches. Later on, we can choose a runnable to work with the counts we've generated here -- we choose
113  one of the branches to match the output format here to the input format that runnable expects.
114 
115 =cut
116 
117 sub write_output {
118  my $self = shift @_;
119 
120  my $kmer_counts = $self->param('kmer_counts');
121 
122  # Output the entire hash of kmer counts on flow 3
123  $self->dataflow_output_id( {'counts' => $kmer_counts, 'sequence_file' => $self->param('sequence_file')},
124  3);
125 
126  # Output each kmer and count, tagged with the name of the sequence file it was counted in,
127  # on flow 4. Unlike flow 3, we put the output in a loop, creating several dataflow events,
128  # one for each kmer
129  foreach my $kmer(keys(%{$kmer_counts})) {
130  $self->dataflow_output_id( {
131  'count' => $kmer_counts->{$kmer},
132  'sequence_file' => $self->param('sequence_file'),
133  'kmer' => $kmer,
134  }, 4);
135  }
136 }
137 
138 =head2 _count_kmers
139 
140  Description: Private method to identify and count k-mers in a sequence.
141 
142  Arg [1] : A Bio::SeqIO input filehandle.
143  Arg [2] : k
144 
145  Return : A hashref of k-mer counts. key = k-mer, value = count
146 
147 =cut
148 
149 sub _count_kmers {
150 
151  my ($seqio, $k) = @_;
152  my %kmer_counts;
153 
154  while (my $seqobj = $seqio->next_seq()) {
155  my $seq = $seqobj->seq();
156 
157  my $last_kmer_start = (length($seq) - $k) + 1;
158  for (my $i = 0; $i < $last_kmer_start; $i++) {
159  my $kmer = substr($seq, $i, $k);
160  $kmer_counts{$kmer}++;
161  }
162  }
163  return \%kmer_counts;
164 }
165 
166 
167 1;
Bio::EnsEMBL::Hive::Process::param
public param()
Bio::EnsEMBL::Hive::Examples::Kmer::RunnableDB::CountKmers
Definition: CountKmers.pm:30
Bio::EnsEMBL::Hive::Process
Definition: Process.pm:77