ensembl-hive  2.8.1
Cache.pm
Go to the documentation of this file.
1 =head1 LICENSE
2 
3 See the NOTICE file distributed with this work for additional information
4 regarding copyright ownership.
5 
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9 
10  http://www.apache.org/licenses/LICENSE-2.0
11 
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 
18 =cut
19 
20 
21 =head1 CONTACT
22 
23  Please email comments or questions to the public Ensembl
24  developers list at <http://lists.ensembl.org/mailman/listinfo/dev>.
25 
26  Questions may also be sent to the Ensembl help desk at
27  <http://www.ensembl.org/Help/Contact>.
28 
29 =cut
30 
31 # This package, originally distributed by CPAN, has been modified from
32 # its original version in order to be used by the ensembl project.
33 #
34 # 8 July 2002 - changed package name
35 #
36 
37 package Bio::EnsEMBL::Utils::Cache;
38 
39 use strict;
40 use vars qw(
41  $VERSION $Debug $STRUCT_SIZE $REF_SIZE
42  $BEFORE $AFTER $KEY $VALUE $BYTES $DIRTY
43 );
44 
45 $VERSION = .17;
46 $Debug = 0; # set to 1 for summary, 2 for debug output
47 $STRUCT_SIZE = 240; # per cached elem bytes overhead, approximate
48 $REF_SIZE = 16;
49 
50 # NODE ARRAY STRUCT
51 $KEY = 0;
52 $VALUE = 1;
53 $BYTES = 2;
54 $BEFORE = 3;
55 $AFTER = 4;
56 $DIRTY = 5;
57 
58 =pod
59 
60 =head1 NAME
61 
62 Tie::Cache - LRU Cache in Memory
63 
64 =head1 SYNOPSIS
65 
66  use Tie::Cache;
67  tie %cache, 'Tie::Cache', 100, { Debug => 1 };
68  tie %cache2, 'Tie::Cache', { MaxCount => 100, MaxBytes => 50000 };
69  tie %cache3, 'Tie::Cache', 100, { Debug => 1 , WriteSync => 0};
70 
71  # Options ##################################################################
72  #
73  # Debug => 0 - DEFAULT, no debugging output
74  # 1 - prints cache statistics upon destroying
75  # 2 - prints detailed debugging info
76  #
77  # MaxCount => Maximum entries in cache.
78  #
79  # MaxBytes => Maximum bytes taken in memory for cache based on approximate
80  # size of total cache structure in memory
81  #
82  # There is approximately 240 bytes used per key/value pair in the cache for
83  # the cache data structures, so a cache of 5000 entries would take
84  # at approximately 1.2M plus the size of the data being cached.
85  #
86  # MaxSize => Maximum size of each cache entry. Larger entries are not cached.
87  # This helps prevent much of the cache being flushed when
88  # you set an exceptionally large entry. Defaults to MaxBytes/10
89  #
90  # WriteSync => 1 - DEFAULT, write() when data is dirtied for
91  # TRUE CACHE (see below)
92  # 0 - write() dirty data as late as possible, when leaving
93  # cache, or when cache is being DESTROY'd
94  #
95  ############################################################################
96 
97  # cache supports normal tied hash functions
98  $cache{1} = 2; # STORE
99  print "$cache{1}\n"; # FETCH
100 
101  # FIRSTKEY, NEXTKEY
102  while(($k, $v) = each %cache) { print "$k: $v\n"; }
103 
104  delete $cache{1}; # DELETE
105  %cache = (); # CLEAR
106 
107 =head1 DESCRIPTION
108 
109 This module implements a least recently used (LRU) cache in memory
110 through a tie interface. Any time data is stored in the tied hash,
111 that key/value pair has an entry time associated with it, and
112 as the cache fills up, those members of the cache that are
113 the oldest are removed to make room for new entries.
114 
115 So, the cache only "remembers" the last written entries, up to the
116 size of the cache. This can be especially useful if you access
117 great amounts of data, but only access a minority of the data a
118 majority of the time.
119 
120 The implementation is a hash, for quick lookups,
121 overlaying a doubly linked list for quick insertion and deletion.
122 On a WinNT PII 300, writes to the hash were done at a rate
123 3100 per second, and reads from the hash at 6300 per second.
124 Work has been done to optimize refreshing cache entries that are
125 frequently read from, code like $cache{entry}, which moves the
126 entry to the end of the linked list internally.
127 
128 =cut
129 
130 sub TIEHASH {
131  my($class, $max_count, $options) = @_;
132 
133  if(ref($max_count)) {
134  $options = $max_count;
135  $max_count = $options->{MaxCount};
136  }
137 
138  unless($max_count || $options->{MaxBytes}) {
139  die('you must specify cache size with either MaxBytes or MaxCount');
140  }
141 
142  my $sync = exists($options->{WriteSync}) ? $options->{WriteSync} : 1;
143 
144  my $self = bless
145  {
146  # how many items to cache
147  max_count=> $max_count,
148 
149  # max bytes to cache
150  max_bytes => $options->{MaxBytes},
151 
152  # max size (in bytes) of an individual cache entry
153  max_size => $options->{MaxSize} || ($options->{MaxBytes} ? (int($options->{MaxBytes}/10) + 1) : 0),
154 
155  # class track, so know if overridden subs should be used
156  'class' => $class,
157  'subclass' => $class ne 'Tie::Cache' ? 1 : 0,
158 
159  # current sizes
160  count=>0,
161  bytes=>0,
162 
163  # inner structures
164  head=>0,
165  tail=>0,
166  nodes=>{},
167  'keys'=>[],
168 
169  # statistics
170  hit => 0,
171  miss => 0,
172 
173  # config
174  sync => $sync,
175  dbg => $options->{Debug} || $Debug
176 
177 
178  }, $class;
179 
180  if (($self->{max_bytes} && ! $self->{max_size})) {
181  die("MaxSize must be defined when MaxBytes is");
182  }
183 
184  if($self->{max_bytes} and $self->{max_bytes} < 1000) {
185  die("cannot set MaxBytes to under 1000, each raw entry takes $STRUCT_SIZE bytes alone");
186  }
187 
188  if($self->{max_size} && $self->{max_size} < 3) {
189  die("cannot set MaxSize to under 3 bytes, assuming error in config");
190  }
191 
192  $self;
193 }
194 
195 # override to write data leaving cache
196 sub write { undef; }
197 # commented this section out for speed
198 # my($self, $key, $value) = @_;
199 # 1;
200 #}
201 
202 # override to get data if not in cache, should return $value
203 # associated with $key
204 sub read { undef; }
205 # commented this section out for speed
206 # my($self, $key) = @_;
207 # undef;
208 #}
209 
210 sub FETCH {
211  my($self, $key) = @_;
212 
213  my $node = $self->{nodes}{$key};
214  if($node) {
215  # refresh node's entry
216  $self->{hit}++; # if $self->{dbg};
217 
218  # we used to call delete then insert, but we streamlined code
219  if(my $after = $node->[$AFTER]) {
220  $self->{dbg} > 1 and $self->print("update() node $node to tail of list");
221  # reconnect the nodes
222  my $before = $after->[$BEFORE] = $node->[$BEFORE];
223  if($before) {
224  $before->[$AFTER] = $after;
225  } else {
226  $self->{head} = $after;
227  }
228 
229  # place at the end
230  $self->{tail}[$AFTER] = $node;
231  $node->[$BEFORE] = $self->{tail};
232  $node->[$AFTER] = undef;
233  $self->{tail} = $node; # always true after this
234  } else {
235  # if there is nothing after node, then we are at the end already
236  # so don't do anything to move the nodes around
237  die("this node is the tail, so something's wrong")
238  unless($self->{tail} eq $node);
239  }
240 
241  $self->print("FETCH [$key, $node->[$VALUE]]") if ($self->{dbg} > 1);
242  $node->[$VALUE];
243  } else {
244  # we have a cache miss here
245  $self->{miss}++; # if $self->{dbg};
246 
247  # its fine to always insert a node, even when we have an undef,
248  # because even if we aren't a sub-class, we should assume use
249  # that would then set the entry. This model works well with
250  # sub-classing and reads() that might want to return undef as
251  # a valid value.
252  my $value;
253  if ($self->{subclass}) {
254  $self->print("read() for key $key") if $self->{dbg} > 1;
255  $value = $self->read($key);
256  }
257 
258  if(defined $value) {
259  my $length;
260  if($self->{max_size}) {
261  # check max size of entry, that it not exceed max size
262  $length = &_get_data_length(\$key, \$value);
263  if($length > $self->{max_size}) {
264  $self->print("direct read() [$key, $value]") if ($self->{dbg} > 1);
265  return $value;
266  }
267  }
268  # if we get here, we should insert the new node
269  $node = &create_node($self, \$key, \$value, $length);
270  &insert($self, $node);
271  $value;
272  } else {
273  undef;
274  }
275  }
276 }
277 
278 sub STORE {
279  my($self, $key, $value) = @_;
280  my $node;
281 
282  $self->print("STORE [$key,$value]") if ($self->{dbg} > 1);
283 
284  # do not cache undefined values
285  defined($value) || return(undef);
286 
287  # check max size of entry, that it not exceed max size
288  my $length;
289  if($self->{max_size}) {
290  $length = &_get_data_length(\$key, \$value);
291  if($length > $self->{max_size}) {
292  if ($self->{subclass}) {
293  $self->print("direct write() [$key, $value]") if ($self->{dbg} > 1);
294  $self->write($key, $value);
295  }
296  return $value;
297  }
298  }
299 
300  # do we have node already ?
301  if($self->{nodes}{$key}) {
302  $node = &delete($self, $key);
303 # $node = &delete($self, $key);
304 # $node->[$VALUE] = $value;
305 # $node->[$BYTES] = $length || &_get_data_length(\$key, \$value);
306  }
307 
308  # insert new node
309  $node = &create_node($self, \$key, \$value, $length);
310 # $node ||= &create_node($self, \$key, \$value, $length);
311  &insert($self, $node);
312 
313  # if the data is sync'd call write now, otherwise defer the data
314  # writing, but mark it dirty so it can be cleanup up at the end
315  if ($self->{subclass}) {
316  if($self->{sync}) {
317  $self->print("sync write() [$key, $value]") if $self->{dbg} > 1;
318  $self->write($key, $value);
319  } else {
320  $node->[$DIRTY] = 1;
321  }
322  }
323 
324  $value;
325 }
326 
327 sub DELETE {
328  my($self, $key) = @_;
329 
330  $self->print("DELETE $key") if ($self->{dbg} > 1);
331  my $node = $self->delete($key);
332  $node ? $node->[$VALUE] : undef;
333 }
334 
335 sub CLEAR {
336  my($self) = @_;
337 
338  $self->print("CLEAR CACHE") if ($self->{dbg} > 1);
339 
340  if($self->{subclass}) {
341  my $flushed = $self->flush();
342  $self->print("FLUSH COUNT $flushed") if ($self->{dbg} > 1);
343  }
344 
345  my $node;
346  while($node = $self->{head}) {
347  $self->delete($self->{head}[$KEY]);
348  }
349 
350  1;
351 }
352 
353 sub EXISTS {
354  my($self, $key) = @_;
355  exists $self->{nodes}{$key};
356 }
357 
358 # firstkey / nextkey emulate keys() and each() behavior by
359 # taking a snapshot of all the nodes at firstkey, and
360 # iterating through the keys with nextkey
361 #
362 # this method therefore will only supports one each() / keys()
363 # happening during any given time.
364 #
365 sub FIRSTKEY {
366  my($self) = @_;
367 
368  $self->{'keys'} = [];
369  my $node = $self->{head};
370  while($node) {
371  push(@{$self->{'keys'}}, $node->[$KEY]);
372  $node = $node->[$AFTER];
373  }
374 
375  shift @{$self->{'keys'}};
376 }
377 
378 sub NEXTKEY {
379  my($self, $lastkey) = @_;
380  shift @{$self->{'keys'}};
381 }
382 
383 sub DESTROY {
384  my($self) = @_;
385 
386  # if debugging, snapshot cache before clearing
387  if($self->{dbg}) {
388  if($self->{hit} || $self->{miss}) {
389  $self->{hit_ratio} =
390  sprintf("%4.3f", $self->{hit} / ($self->{hit} + $self->{miss}));
391  }
392  $self->print($self->pretty_self());
393  if($self->{dbg} > 1) {
394  $self->print($self->pretty_chains());
395  }
396  }
397 
398  $self->print("DESTROYING") if $self->{dbg} > 1;
399  $self->CLEAR();
400 
401  1;
402 }
403 
404 ####PERL##LRU##TIE##CACHE##PERL##LRU##TIE##CACHE##PERL##LRU##TIE##CACHE
405 ## Helper Routines
406 ####PERL##LRU##TIE##CACHE##PERL##LRU##TIE##CACHE##PERL##LRU##TIE##CACHE
407 
408 # we use scalar_refs for the data for speed
409 sub create_node {
410  my($self, $key, $value, $length) = @_;
411  (defined($$key) && defined($$value))
412  || die("need more localized data than $$key and $$value");
413 
414  # max_size always defined when max_bytes is
415  if (($self->{max_size})) {
416  $length = defined $length ? $length : &_get_data_length($key, $value)
417  } else {
418  $length = 0;
419  }
420 
421  # ORDER SPECIFIC, see top for NODE ARRAY STRUCT
422  my $node = [ $$key, $$value, $length ];
423 }
424 
425 sub _get_data_length {
426  my($key, $value) = @_;
427  my $length = 0;
428  my %refs;
429 
430  my @data = ($$key, $$value);
431  while(my $elem = shift @data) {
432  next if $refs{$elem};
433  $refs{$elem} = 1;
434  if(ref $elem && $elem =~ /(SCALAR|HASH|ARRAY)/) {
435  my $type = $1;
436  $length += $REF_SIZE; # guess, 16 bytes per ref, probably more
437  if (($type eq 'SCALAR')) {
438  $length += length($$elem);
439  } elsif (($type eq 'HASH')) {
440  while (my($k,$v) = each %$elem) {
441  for my $kv($k,$v) {
442  if ((ref $kv)) {
443  push(@data, $kv);
444  } else {
445  $length += length($kv);
446  }
447  }
448  }
449  } elsif (($type eq 'ARRAY')) {
450  for my $val (@$elem){
451  if ((ref $val)) {
452  push(@data, $val);
453  } else {
454  $length += length($val);
455  }
456  }
457  }
458  } else {
459  $length += length($elem);
460  }
461  }
462 
463  $length;
464 }
465 
466 sub insert {
467  my($self, $new_node) = @_;
468 
469  $new_node->[$AFTER] = 0;
470  $new_node->[$BEFORE] = $self->{tail};
471  $self->print("insert() [$new_node->[$KEY], $new_node->[$VALUE]]") if ($self->{dbg} > 1);
472 
473  $self->{nodes}{$new_node->[$KEY]} = $new_node;
474 
475  # current sizes
476  $self->{count}++;
477  $self->{bytes} += $new_node->[$BYTES] + $STRUCT_SIZE;
478 
479  if($self->{tail}) {
480  $self->{tail}[$AFTER] = $new_node;
481  } else {
482  $self->{head} = $new_node;
483  }
484  $self->{tail} = $new_node;
485 
486  ## if we are too big now, remove head
487  while(($self->{max_count} && ($self->{count} > $self->{max_count})) ||
488  ($self->{max_bytes} && ($self->{bytes} > $self->{max_bytes})))
489  {
490  if($self->{dbg} > 1) {
491  $self->print("current/max: ".
492  "bytes ($self->{bytes}/$self->{max_bytes}) ".
493  "count ($self->{count}/$self->{max_count}) "
494  );
495  }
496  my $old_node = $self->delete($self->{head}[$KEY]);
497  if ($self->{subclass}) {
498  if($old_node->[$DIRTY]) {
499  $self->print("dirty write() [$old_node->[$KEY], $old_node->[$VALUE]]")
500  if ($self->{dbg} > 1);
501  $self->write($old_node->[$KEY], $old_node->[$VALUE]);
502  }
503  }
504 # if($self->{dbg} > 1) {
505 # $self->print("after delete - bytes $self->{bytes}; count $self->{count}");
506 # }
507  }
508 
509  1;
510 }
511 
512 sub delete {
513  my($self, $key) = @_;
514  my $node = $self->{nodes}{$key} || return;
515 # return unless $node;
516 
517  $self->print("delete() [$key, $node->[$VALUE]]") if ($self->{dbg} > 1);
518 
519  my $before = $node->[$BEFORE];
520  my $after = $node->[$AFTER];
521 
522  # my($before, $after) = $node->{before,after};
523  if($before) {
524  ($before->[$AFTER] = $after);
525  } else {
526  $self->{head} = $after;
527  }
528 
529  if($after) {
530  ($after->[$BEFORE] = $before);
531  } else {
532  $self->{tail} = $before;
533  }
534 
535  delete $self->{nodes}{$key};
536  $self->{bytes} -= ($node->[$BYTES] + $STRUCT_SIZE);
537  $self->{count}--;
538 
539  $node;
540 }
541 
542 sub flush {
543  my $self = shift;
544 
545  $self->print("FLUSH CACHE") if ($self->{dbg} > 1);
546 
547  my $node = $self->{head};
548  my $flush_count = 0;
549  while($node) {
550  if($node->[$DIRTY]) {
551  $self->print("flush dirty write() [$node->[$KEY], $node->[$VALUE]]")
552  if ($self->{dbg} > 1);
553  $self->write($node->[$KEY], $node->[$VALUE]);
554  $node->[$DIRTY] = 0;
555  $flush_count++;
556  }
557  $node = $node->[$AFTER];
558  }
559 
560  $flush_count;
561 }
562 
563 sub print {
564  my($self, $msg) = @_;
565  print "$self: $msg\n";
566 }
567 
568 sub pretty_self {
569  my($self) = @_;
570 
571  my(@prints);
572  for(sort keys %{$self}) {
573  next unless defined $self->{$_};
574  push(@prints, "$_=>$self->{$_}");
575  }
576 
577  "{ " . join(", ", @prints) . " }";
578 }
579 
580 sub pretty_chains {
581  my($self) = @_;
582  my($str);
583  my $k = $self->FIRSTKEY();
584 
585  $str .= "[head]->";
586  my($curr_node) = $self->{head};
587  while($curr_node) {
588  $str .= "[$curr_node->[$KEY],$curr_node->[$VALUE]]->";
589  $curr_node = $curr_node->[$AFTER];
590  }
591  $str .= "[tail]->";
592 
593  $curr_node = $self->{tail};
594  while($curr_node) {
595  $str .= "[$curr_node->[$KEY],$curr_node->[$VALUE]]->";
596  $curr_node = $curr_node->[$BEFORE];
597  }
598  $str .= "[head]";
599 
600  $str;
601 }
602 
603 1;
604 
605 __END__
606 
607 =head1 INSTALLATION
608 
609 Tie::Cache installs easily using the make or nmake commands as
610 shown below. Otherwise, just copy Cache.pm to $PERLLIB/site/Tie
611 
612  > perl Makefile.PL
613  > make
614  > make test
615  > make install
616 
617  * use nmake for win32
618  ** you can also just copy Cache.pm to $perllib/Tie
619 
620 =head1 BENCMARKS
621 
622 There is another simpler LRU cache implementation in CPAN,
623 Tie::Cache::LRU, which has the same basic size limiting
624 functionality, and for this functionality, the exact same
625 interface.
626 
627 Through healthy competition, Michael G Schwern got
628 Tie::Cache::LRU mostly faster than Tie::Cache on reads & writes:
629 
630  Cache Size 5000 Tie::Cache 0.17 Tie::Cache::LRU 0.21
631  10000 Writes 1.55 CPU sec 1.10 CPU sec
632  40000 Reads 1.82 CPU sec 1.58 CPU sec
633  10000 Deletes 0.55 CPU sec 0.59 CPU sec
634 
635 Unless you are using TRUE CACHE or MaxBytes functionality,
636 using Tie::Cache::LRU should be an easy replacement for Tie::Cache.
637 
638 =head1 TRUE CACHE
639 
640 To use class as a true cache, which acts as the sole interface
641 for some data set, subclass the real cache off Tie::Cache,
642 with @ISA = qw( 'Tie::Cache' ) notation. Then override
643 the read() method for behavior when there is a cache miss,
644 and the write() method for behavior when the cache's data
645 changes.
646 
647 When WriteSync is 1 or TRUE (DEFAULT), write() is called immediately
648 when data in the cache is modified. If set to 0, data that has
649 been modified in the cache gets written out when the entries are deleted or
650 during the DESTROY phase of the cache object, usually at the end of
651 a script.
652 
653 To have the dirty data write() periodically while WriteSync is set to 0,
654 there is a flush() cache API call that will flush the dirty writes
655 in this way. Just call the flush() API like:
656 
657  my $write_flush_count = tied(%cache)->flush();
658 
659 The flush() API was added in the .17 release thanks to Rob Bloodgood.
660 
661 =head1 TRUE CACHE EXAMPLE
662 
663  use Tie::Cache;
664 
665  # personalize the Tie::Cache object, by inheriting from it
666  package My::Cache;
667  @ISA = qw(Tie::Cache);
668 
669  # override the read() and write() member functions
670  # these tell the cache what to do with a cache miss or flush
671  sub read {
672  my($self, $key) = @_;
673  print "cache miss for $key, read() data\n";
674  rand() * $key;
675  }
676  sub write {
677  my($self, $key, $value) = @_;
678  print "flushing [$key, $value] from cache, write() data\n";
679  }
680 
681  my $cache_size = $ARGV[0] || 2;
682  my $num_to_cache = $ARGV[1] || 4;
683  my $Debug = $ARGV[2] || 1;
684 
685  tie %cache, 'My::Cache', $cache_size, {Debug => $Debug};
686 
687  # load the cache with new data, each through its contents,
688  # and then reload in reverse order.
689  for(1..$num_to_cache) { print "read data $_: $cache{$_}\n" }
690  while(my($k, $v) = each %cache) { print "each data $k: $v\n"; }
691  for(my $i=$num_to_cache; $i>0; $i--) { print "read data $i: $cache{$i}\n"; }
692 
693  # flush writes now, trivial use since will happen in DESTROY() anyway
694  tied(%cache)->flush();
695 
696  # clear cache in 2 ways, write will flush out to disk
697  %cache = ();
698  undef %cache;
699 
700 =head1 NOTES
701 
702 Many thanks to all those who helped me make this module a reality,
703 including:
704 
705  :) Tom Hukins who provided me insight and motivation for
706  finishing this module.
707  :) Jamie McCarthy, for trying to make Tie::Cache be all
708  that it can be.
709  :) Rob Fugina who knows how to "TRULY CACHE".
710  :) Rob Bloodgood, for the TRUE CACHE flush() API
711 
712 =head1 AUTHOR
713 
714 Please send any questions or comments to Joshua Chamas
715 at chamas@alumni.stanford.org
716 
717 =head1 COPYRIGHT
718 
719 Copyright (c) 1999-2002 Joshua Chamas, Chamas Enterprises Inc.
720 Sponsored by development on NodeWorks http://www.nodeworks.com
721 
722 All rights reserved. This program is free software;
723 you can redistribute it and/or modify it under the same
724 terms as Perl itself.
725 
726 =cut
727 
728 
729 
730 
731 
732 
debug
public debug()