diff --git a/.gitignore b/.gitignore
index a3cb48d..817fe54 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,3 +4,4 @@ cmake-build-*
 .idea
 .clangd
 compile_commands.json
+.cache
diff --git a/include/gcache/ghost_cache.h b/include/gcache/ghost_cache.h
index 447d80c..7c83286 100644
--- a/include/gcache/ghost_cache.h
+++ b/include/gcache/ghost_cache.h
@@ -19,8 +19,9 @@ enum AccessMode : uint8_t {
   NOOP,     // do not update
 };
 
+template <typename SizeType = uint32_t>
 struct GhostMeta {
-  uint32_t size_idx;
+  SizeType size_idx;
 };
 
 template <typename Hash>
@@ -31,23 +32,26 @@ class GhostKvCache;
  * Templated type Meta must have a field size_idx; in almost all cases, this
  * field does not need to specified; it is only useful if there is some
  * additional per-page metadata to be carried.
+ * To support cache sizes of different scales, the numerical type used for
+ * the cache size is a template parameter.
  */
-template <typename Hash = ghash, typename Meta = GhostMeta>
+template <typename Hash = ghash, typename Meta = GhostMeta<uint32_t>,
+          typename SizeType = uint32_t, typename HashType = uint32_t>
 class GhostCache {
  protected:
-  const uint32_t tick;
-  const uint32_t min_size;
-  const uint32_t max_size;
-  const uint32_t num_ticks;
+  const SizeType tick;
+  const SizeType min_size;
+  const SizeType max_size;
+  const SizeType num_ticks;
 
   // Key is block_id/block number
   // Value is "size_idx", which is the least non-negative number such that the
   // key will in cache if the cache size is (size_idx * tick) + min_size
-  LRUCache<uint32_t, Meta, Hash> cache;
+  LRUCache<SizeType, Meta, Hash> cache;
 
  public:
-  using Handle_t = typename LRUCache<uint32_t, Meta, Hash>::Handle_t;
-  using Node_t = typename LRUCache<uint32_t, Meta, Hash>::Node_t;
+  using Handle_t = typename LRUCache<SizeType, Meta, Hash>::Handle_t;
+  using Node_t = typename LRUCache<SizeType, Meta, Hash>::Node_t;
 
  protected:
   // these must be placed after num_ticks to ensure a correct ctor order
@@ -55,18 +59,18 @@ class GhostCache {
   std::vector<CacheStat> caches_stat;
 
   // the reused distances are formatted as a histogram
-  std::vector<uint32_t> reuse_distances;  // converted to caches_stat lazily
-  uint32_t reuse_count;                   // count all access to reuse_distances
+  std::vector<size_t> reuse_distances;  // converted to caches_stat lazily
+  size_t reuse_count;                   // count all access to reuse_distances
 
-  Handle_t access_impl(uint32_t block_id, uint32_t hash, AccessMode mode);
+  Handle_t access_impl(SizeType block_id, HashType hash, AccessMode mode);
 
-  template <uint32_t S, typename H>
+  template <uint32_t S, typename H, typename ST, typename HT>
   friend class SampledGhostKvCache;
 
   void build_caches_stat();
 
  public:
-  GhostCache(uint32_t tick, uint32_t min_size, uint32_t max_size)
+  GhostCache(SizeType tick, SizeType min_size, SizeType max_size)
       : tick(tick),
         min_size(min_size),
         max_size(max_size),
@@ -83,29 +87,29 @@ class GhostCache {
     cache.init(max_size);
   }
 
-  void access(uint32_t block_id, AccessMode mode = AccessMode::DEFAULT) {
+  void access(SizeType block_id, AccessMode mode = AccessMode::DEFAULT) {
     access_impl(block_id, Hash{}(block_id), mode);
   }
 
-  [[nodiscard]] uint32_t get_tick() const { return tick; }
-  [[nodiscard]] uint32_t get_min_size() const { return min_size; }
-  [[nodiscard]] uint32_t get_max_size() const { return max_size; }
+  [[nodiscard]] SizeType get_tick() const { return tick; }
+  [[nodiscard]] SizeType get_min_size() const { return min_size; }
+  [[nodiscard]] SizeType get_max_size() const { return max_size; }
 
-  [[nodiscard]] const CacheStat& get_stat(uint32_t cache_size) {
+  [[nodiscard]] const CacheStat& get_stat(SizeType cache_size) {
     assert(cache_size >= min_size);
     assert(cache_size <= max_size);
     assert((cache_size - min_size) % tick == 0);
-    uint32_t size_idx = (cache_size - min_size) / tick;
+    SizeType size_idx = (cache_size - min_size) / tick;
     assert(size_idx < num_ticks);
     const CacheStat& stat = caches_stat[size_idx];
     if (stat.hit_cnt + stat.miss_cnt != reuse_count) build_caches_stat();
     assert(stat.hit_cnt + stat.miss_cnt == reuse_count);
     return stat;
   }
-  [[nodiscard]] double get_hit_rate(uint32_t cache_size) {
+  [[nodiscard]] double get_hit_rate(SizeType cache_size) {
     return get_stat(cache_size).get_hit_rate();
   }
-  [[nodiscard]] double get_miss_rate(uint32_t cache_size) {
+  [[nodiscard]] double get_miss_rate(SizeType cache_size) {
     return get_stat(cache_size).get_miss_rate();
   }
 
@@ -167,66 +171,71 @@ class GhostCache {
 
 // only sample 1/32 (~3.125%)
 template <uint32_t SampleShift = 5, typename Hash = ghash,
-          typename Meta = GhostMeta>
-class SampledGhostCache : public GhostCache<Hash, Meta> {
+          typename Meta = GhostMeta<uint32_t>, typename SizeType = uint32_t,
+          typename HashType = uint32_t>
+class SampledGhostCache : public GhostCache<Hash, Meta, SizeType, HashType> {
  public:
-  SampledGhostCache(uint32_t tick, uint32_t min_size, uint32_t max_size)
-      : GhostCache<Hash, Meta>(tick >> SampleShift, min_size >> SampleShift,
-                               max_size >> SampleShift) {
-    static_assert(SampleShift <= 32, "SampleShift must be no larger than 32");
-    assert(tick % (1 << SampleShift) == 0);
-    assert(min_size % (1 << SampleShift) == 0);
-    assert(max_size % (1 << SampleShift) == 0);
+  SampledGhostCache(SizeType tick, SizeType min_size, SizeType max_size)
+      : GhostCache<Hash, Meta, SizeType, HashType>(tick >> SampleShift,
+                                                   min_size >> SampleShift,
+                                                   max_size >> SampleShift) {
+    static_assert(SampleShift < std::numeric_limits<SizeType>::digits);
+    assert((tick >> SampleShift) << SampleShift == tick);
+    assert((min_size >> SampleShift) << SampleShift == min_size);
+    assert((max_size >> SampleShift) << SampleShift == max_size);
     // Left few bits used for sampling; right few used for hash.
     // Make sure they never overlap.
-    assert(std::countr_zero<uint32_t>(std::bit_ceil<uint32_t>(max_size)) <=
-           32 - static_cast<int>(SampleShift));
+    assert(std::countr_zero<SizeType>(std::bit_ceil<SizeType>(max_size)) <=
+           std::numeric_limits<SizeType>::digits -
+               static_cast<int>(SampleShift));
     assert(this->tick > 0);
   }
 
   // Only update ghost cache if the first few bits of hash is all zero
-  void access(uint32_t block_id, AccessMode mode = AccessMode::DEFAULT) {
-    uint32_t hash = Hash{}(block_id);
+  void access(SizeType block_id, AccessMode mode = AccessMode::DEFAULT) {
+    HashType hash = Hash{}(block_id);
     if constexpr (SampleShift > 0) {
-      if (hash >> (32 - SampleShift)) return;
+      if (hash >> (std::numeric_limits<HashType>::digits - SampleShift)) return;
     }
     this->access_impl(block_id, hash, mode);
   }
 
-  [[nodiscard]] uint32_t get_tick() const { return this->tick << SampleShift; }
-  [[nodiscard]] uint32_t get_min_size() const {
+  [[nodiscard]] SizeType get_tick() const { return this->tick << SampleShift; }
+  [[nodiscard]] SizeType get_min_size() const {
     return this->min_size << SampleShift;
   }
-  [[nodiscard]] uint32_t get_max_size() const {
+  [[nodiscard]] SizeType get_max_size() const {
     return this->max_size << SampleShift;
   }
 
-  [[nodiscard]] const CacheStat& get_stat(uint32_t cache_size) {
+  [[nodiscard]] const CacheStat& get_stat(SizeType cache_size) {
     return get_stat_shifted(cache_size >> SampleShift);
   }
-  [[nodiscard]] double get_hit_rate(uint32_t cache_size) {
+  [[nodiscard]] double get_hit_rate(SizeType cache_size) {
     return this->get_stat(cache_size).get_hit_rate();
   }
-  [[nodiscard]] double get_miss_rate(uint32_t cache_size) {
+  [[nodiscard]] double get_miss_rate(SizeType cache_size) {
     return this->get_stat(cache_size).get_miss_rate();
   }
 
  protected:
-  template <uint32_t S, typename H>
+  template <uint32_t S, typename H, typename ST, typename HT>
   friend class SampledGhostKvCache;
 
-  [[nodiscard]] const CacheStat& get_stat_shifted(uint32_t cache_size_shifted) {
-    return GhostCache<Hash, Meta>::get_stat(cache_size_shifted);
+  [[nodiscard]] const CacheStat& get_stat_shifted(SizeType cache_size_shifted) {
+    return GhostCache<Hash, Meta, SizeType, HashType>::get_stat(
+        cache_size_shifted);
   }
 };
 
 /**
  * When using ghost cache, we assume in_use list is always empty.
  */
-template <typename Hash, typename Meta>
-inline typename GhostCache<Hash, Meta>::Handle_t
-GhostCache<Hash, Meta>::access_impl(uint32_t block_id, uint32_t hash,
-                                    AccessMode mode) {
+template <typename Hash, typename Meta, typename SizeType, typename HashType>
+inline typename GhostCache<Hash, Meta, SizeType, HashType>::Handle_t
+GhostCache<Hash, Meta, SizeType, HashType>::access_impl(SizeType block_id,
+                                                        HashType hash,
+                                                        AccessMode mode) {
   Handle_t s;  // successor
   Handle_t h = cache.refresh(block_id, hash, s);
   assert(h);  // Since there is no handle in use, allocation must never fail.
@@ -257,7 +266,7 @@ GhostCache<Hash, Meta>::access_impl(uint32_t block_id, uint32_t hash,
    * 2) X's size_idx should be set to 0.
    * 3) if X itself is a boundary, set that boundary to X's next (sucessor).
    */
-  uint32_t size_idx;
+  SizeType size_idx;
   if (s) {  // No new insertion
     size_idx = h->size_idx;
     if (size_idx < num_ticks - 1 && boundaries[size_idx] == h.node)
@@ -274,7 +283,7 @@ GhostCache<Hash, Meta>::access_impl(uint32_t block_id, uint32_t hash,
     if (size_idx < num_ticks - 1 && cache.size() == size_idx * tick + min_size)
       boundaries[size_idx] = cache.lru_.next;
   }
-  for (uint32_t i = 0; i < size_idx; ++i) {
+  for (SizeType i = 0; i < size_idx; ++i) {
     auto& b = boundaries[i];
     if (!b) continue;
     b->value.size_idx++;
@@ -301,9 +310,9 @@ GhostCache<Hash, Meta>::access_impl(uint32_t block_id, uint32_t hash,
   return h;
 }
 
-template <typename Hash, typename Meta>
-inline void GhostCache<Hash, Meta>::build_caches_stat() {
-  uint32_t accum_hit_cnt = 0;
+template <typename Hash, typename Meta, typename SizeType, typename HashType>
+inline void GhostCache<Hash, Meta, SizeType, HashType>::build_caches_stat() {
+  size_t accum_hit_cnt = 0;
   for (size_t idx = 0; idx < caches_stat.size(); ++idx) {
     accum_hit_cnt += reuse_distances[idx];
     caches_stat[idx].hit_cnt = accum_hit_cnt;
@@ -311,9 +320,9 @@ inline void GhostCache<Hash, Meta>::build_caches_stat() {
   }
 }
 
-template <typename Hash, typename Meta>
-inline std::ostream& GhostCache<Hash, Meta>::print(std::ostream& os,
-                                                   int indent) {
+template <typename Hash, typename Meta, typename SizeType, typename HashType>
+inline std::ostream& GhostCache<Hash, Meta, SizeType, HashType>::print(
+    std::ostream& os, int indent) {
   build_caches_stat();
   os << "GhostCache (tick=" << tick << ", min=" << min_size
      << ", max=" << max_size << ", num_ticks=" << num_ticks
@@ -325,7 +334,7 @@ inline std::ostream& GhostCache<Hash, Meta>::print(std::ostream& os,
     os << boundaries[0]->key;
   else
     os << "(null)";
-  for (uint32_t i = 1; i < boundaries.size(); ++i) {
+  for (size_t i = 1; i < boundaries.size(); ++i) {
     os << ", ";
     if (boundaries[i])
       os << boundaries[i]->key;
@@ -335,7 +344,7 @@ inline std::ostream& GhostCache<Hash, Meta>::print(std::ostream& os,
   os << "]\n";
   for (int i = 0; i < indent + 1; ++i) os << '\t';
   os << "Stat:       [" << min_size << ": " << caches_stat[0];
-  for (uint32_t i = 1; i < num_ticks; ++i)
+  for (size_t i = 1; i < num_ticks; ++i)
     os << ", " << min_size + i * tick << ": " << caches_stat[i];
   os << "]\n";
   for (int i = 0; i < indent + 1; ++i) os << '\t';
diff --git a/include/gcache/ghost_kv_cache.h b/include/gcache/ghost_kv_cache.h
index c0023e6..cec1a6d 100644
--- a/include/gcache/ghost_kv_cache.h
+++ b/include/gcache/ghost_kv_cache.h
@@ -6,9 +6,10 @@
 
 namespace gcache {
 
+template <typename SizeType = uint32_t>
 struct GhostKvMeta {
-  uint32_t size_idx;
-  uint32_t kv_size;
+  SizeType size_idx;
+  SizeType kv_size;
 };
 
 /**
@@ -16,29 +17,34 @@ struct GhostKvMeta {
  * pair can be variable-length. By default support sampling (non-sampling
  * version can be acquired by setting SampleShift=0)
  */
-template <uint32_t SampleShift = 5, typename Hash = std::hash<std::string_view>>
+template <uint32_t SampleShift = 5, typename Hash = std::hash<std::string_view>,
+          typename SizeType = uint32_t, typename HashType = uint32_t>
 class SampledGhostKvCache {
-  SampledGhostCache<SampleShift, idhash, GhostKvMeta> ghost_cache;
+  SampledGhostCache<SampleShift, idhash, GhostKvMeta<SizeType>, SizeType,
+                    HashType>
+      ghost_cache;
 
  public:
   using Handle_t =
-      typename SampledGhostCache<SampleShift, idhash, GhostKvMeta>::Handle_t;
+      typename SampledGhostCache<SampleShift, idhash, GhostKvMeta<SizeType>,
+                                 SizeType, HashType>::Handle_t;
   using Node_t =
-      typename SampledGhostCache<SampleShift, idhash, GhostKvMeta>::Node_t;
+      typename SampledGhostCache<SampleShift, idhash, GhostKvMeta<SizeType>,
+                                 SizeType, HashType>::Node_t;
 
  public:
-  SampledGhostKvCache(uint32_t tick, uint32_t min_count, uint32_t max_count)
+  SampledGhostKvCache(SizeType tick, SizeType min_count, SizeType max_count)
       : ghost_cache(tick, min_count, max_count) {
     static_assert(SampleShift <= 32, "SampleShift must be no larger than 32");
   }
 
-  void access(const std::string_view key, uint32_t kv_size,
+  void access(const std::string_view key, SizeType kv_size,
               AccessMode mode = AccessMode::DEFAULT) {
-    uint32_t key_hash = Hash{}(key);
+    HashType key_hash = Hash{}(key);
     access(key_hash, kv_size, mode);
   }
 
-  void access(uint32_t key_hash, uint32_t kv_size,
+  void access(HashType key_hash, SizeType kv_size,
               AccessMode mode = AccessMode::DEFAULT) {
     // only with certain number of leading zeros is sampled
     if constexpr (SampleShift > 0) {
@@ -49,20 +55,20 @@ class SampledGhostKvCache {
   }
 
   // for compatibility with GhostCache: APIs to query by keys count
-  [[nodiscard]] uint32_t get_tick() const { return ghost_cache.get_tick(); }
-  [[nodiscard]] uint32_t get_min_count() const {
+  [[nodiscard]] SizeType get_tick() const { return ghost_cache.get_tick(); }
+  [[nodiscard]] SizeType get_min_count() const {
     return ghost_cache.get_min_size();
   }
-  [[nodiscard]] uint32_t get_max_count() const {
+  [[nodiscard]] SizeType get_max_count() const {
     return ghost_cache.get_max_size();
   }
-  [[nodiscard]] double get_hit_rate(uint32_t count) {
+  [[nodiscard]] double get_hit_rate(SizeType count) {
     return ghost_cache.get_hit_rate(count);
   }
-  [[nodiscard]] double get_miss_rate(uint32_t count) {
+  [[nodiscard]] double get_miss_rate(SizeType count) {
     return ghost_cache.get_miss_rate(count);
   }
-  [[nodiscard]] const CacheStat& get_stat(uint32_t count) {
+  [[nodiscard]] const CacheStat& get_stat(SizeType count) {
     return ghost_cache.get_stat(count);
   }
 
@@ -92,12 +98,14 @@ class SampledGhostKvCache {
     ghost_cache.for_each_until_mru(fn);
   }
 
+  // Since the aggregated kv_size can easily grow beyond 4 GB, we use size_t
+  // instead of SizeType for it.
   [[nodiscard]] const std::vector<std::tuple<
-      /*count*/ uint32_t, /*size*/ uint32_t, /*miss_rate*/ CacheStat>>
+      /*count*/ SizeType, /*size*/ size_t, /*miss_rate*/ CacheStat>>
   get_cache_stat_curve() {
-    std::vector<std::tuple<uint32_t, uint32_t, CacheStat>> curve;
-    uint32_t curr_count = 0;
-    uint32_t curr_size = 0;
+    std::vector<std::tuple<SizeType, size_t, CacheStat>> curve;
+    SizeType curr_count = 0;
+    size_t curr_size = 0;
     ghost_cache.unsafe_for_each_mru([&](Handle_t h) {
       curr_size += h->kv_size;
       ++curr_count;
diff --git a/include/gcache/lru_cache.h b/include/gcache/lru_cache.h
index bd24c6c..66f4c2e 100644
--- a/include/gcache/lru_cache.h
+++ b/include/gcache/lru_cache.h
@@ -15,7 +15,7 @@
 
 namespace gcache {
 
-template <typename Hash, typename Meta>
+template <typename Hash, typename Meta, typename SizeType, typename HashType>
 class GhostCache;
 
 template <typename Tag_t, typename Key_t, typename Value_t, typename Hash>
@@ -199,7 +199,7 @@ class LRUCache {
   // Pool for additionaly allocated handles.
   std::vector<Node_t*> extra_pool_;
 
-  template <typename H, typename M>
+  template <typename H, typename M, typename ST, typename HT>
   friend class GhostCache;
 
   template <typename T, typename K, typename V, typename H>
diff --git a/include/gcache/node.h b/include/gcache/node.h
index 03db8e1..8bbd41e 100644
--- a/include/gcache/node.h
+++ b/include/gcache/node.h
@@ -33,7 +33,7 @@ class NodeTable;
 template <typename Key_t, typename Value_t, typename Hash>
 class LRUCache;
 
-template <typename Hash, typename Meta>
+template <typename Hash, typename Meta, typename SizeType, typename HashType>
 class GhostCache;
 
 // LRUNodes forms a circular doubly linked list ordered by access time.
@@ -50,7 +50,7 @@ class LRUNode {
   template <typename K, typename V, typename H>
   friend class LRUCache;
 
-  template <typename H, typename M>
+  template <typename H, typename M, typename ST, typename HT>
   friend class GhostCache;
 
  public:
@@ -140,7 +140,7 @@ class LRUHandle : public BaseHandle<LRUNode<Key_t, Value_t>> {
   template <typename K, typename V, typename H>
   friend class LRUCache;
 
-  template <typename H, typename M>
+  template <typename H, typename M, typename ST, typename HT>
   friend class GhostCache;
 
  public:
diff --git a/include/gcache/stat.h b/include/gcache/stat.h
index 475b386..24c810b 100644
--- a/include/gcache/stat.h
+++ b/include/gcache/stat.h
@@ -1,14 +1,13 @@
 #pragma once
 
-#include <cstdint>
 #include <iomanip>
 #include <limits>
 
 namespace gcache {
 
 struct CacheStat {
-  uint64_t hit_cnt;
-  uint64_t miss_cnt;
+  size_t hit_cnt;
+  size_t miss_cnt;
 
  public:
   CacheStat() : hit_cnt(0), miss_cnt(0) {}
@@ -20,13 +19,13 @@ struct CacheStat {
   // e.g., hit_rate > 100%.
   // we don't use atomic here because we find it is too expensive.
   [[nodiscard]] double get_hit_rate() const {
-    uint64_t acc_cnt = hit_cnt + miss_cnt;
+    size_t acc_cnt = hit_cnt + miss_cnt;
     if (acc_cnt == 0) return std::numeric_limits<double>::infinity();
     return double(hit_cnt) / double(acc_cnt);
   }
 
   [[nodiscard]] double get_miss_rate() const {
-    uint64_t acc_cnt = hit_cnt + miss_cnt;
+    size_t acc_cnt = hit_cnt + miss_cnt;
     if (acc_cnt == 0) return std::numeric_limits<double>::infinity();
     return double(miss_cnt) / double(acc_cnt);
   }
@@ -37,7 +36,7 @@ struct CacheStat {
   }
 
   std::ostream& print(std::ostream& os, int width = 0) const {
-    uint64_t acc_cnt = hit_cnt + miss_cnt;
+    size_t acc_cnt = hit_cnt + miss_cnt;
     if (acc_cnt == 0)
       return os << "  NAN (" << std::setw(width) << std::fixed << hit_cnt << '/'
                 << std::setw(width) << std::fixed << acc_cnt << ')';
diff --git a/tests/test_ghost_kv.cpp b/tests/test_ghost_kv.cpp
index 9eccf2c..fe1a68a 100644
--- a/tests/test_ghost_kv.cpp
+++ b/tests/test_ghost_kv.cpp
@@ -5,6 +5,7 @@
 #include <iostream>
 #include <string>
 
+#include "gcache/ghost_cache.h"
 #include "gcache/ghost_kv_cache.h"
 #include "gcache/node.h"
 #include "util.h"
@@ -30,11 +31,10 @@ void bench1() {
   std::vector<uint32_t> reqs;
   std::vector<std::pair<uint32_t, std::string>> reqs2;
   for (uint32_t i = 0; i < bench_size; ++i) {
-    ghost_cache.access(i);
-    sampled_ghost_kv_cache.access(make_key(i), i > bench_size / 4 ? 500 : 2000);
+    ghost_cache.access(i, AccessMode::NOOP);
+    sampled_ghost_kv_cache.access(make_key(i), i > bench_size / 4 ? 500 : 2000,
+                                  AccessMode::NOOP);
   }
-  ghost_cache.reset_stat();
-  sampled_ghost_kv_cache.reset_stat();
 
   for (uint32_t i = 0; i < num_ops; ++i) reqs.emplace_back(rand() % bench_size);
   std::random_shuffle(reqs.begin(), reqs.end());
@@ -53,12 +53,12 @@ void bench1() {
   std::cout << "=== Bench 1 ===\n";
   std::cout << "w/o sampling: " << elapse_g / num_ops << " cycles/op\n";
   std::cout << "w/ sampling:  " << elapse_s / num_ops << " cycles/op\n";
-  std::cout << "========================= Hit Rate ==========================="
-            << "======================\n"
-            << " size |       w/o sampling       |        w/ sampling       |"
-            << "        kv memoy       \n"
-            << "-------------------------------------------------------------"
-            << "-----------------------\n";
+  std::cout << "==================================== Hit Rate ==============="
+               "=======================\n"
+               " size |       w/o sampling       |        w/ sampling       |"
+               "        kv memoy       \n"
+               "-------------------------------------------------------------"
+               "-----------------------\n";
 
   auto curve = sampled_ghost_kv_cache.get_cache_stat_curve();
   for (uint32_t s = tick; s <= bench_size; s += tick) {