#include <KVCache.h>

Public Member Functions
size_t	element_size () const

NNFW_TYPE	to_nnfw_type () const

bool	is_valid () const

int64_t	pos () const

void	set_pos (int pos)

void	reset_pos ()

void	advance_pos ()

void	init (const ggma::GGMAConfig &cfg, int cache_size)

void	transpose (bool is_k_cache, const char *perm, size_t seq_len, size_t num_heads, size_t head_dim)
	Transpose cache with "0213" permutation [0,2,1,3].

Data Fields
KVCacheDataType	data_type

std::vector< std::vector< uint8_t > >	k

std::vector< std::vector< uint8_t > >	v

int64_t	_pos = 0

Detailed Description

Definition at line 43 of file KVCache.h.

Member Function Documentation

◆ advance_pos()

void ggma::KVCache::advance_pos ( )

inline

Definition at line 95 of file KVCache.h.

95 { _pos++; }

ggma::KVCache::_pos

int64_t _pos

Definition KVCache.h:48

References _pos.

◆ element_size()

size_t ggma::KVCache::element_size ( ) const

inline

Definition at line 51 of file KVCache.h.

  {
    // Size in bytes of one cache element. The UINT8 cache stores one byte
    // per element; FLOAT32 and any other value of data_type are sized as a
    // float.
    if (data_type == KVCacheDataType::UINT8)
      return 1;
    return sizeof(float);
  }

References data_type, ggma::FLOAT32, and ggma::UINT8.

Referenced by init(), and transpose().

◆ init()

void ggma::KVCache::init	(	const ggma::GGMAConfig &	cfg,
		int	cache_size
	)

Definition at line 100 of file KVCache.cc.

{
  // Configures the cache element type from `cfg` and sizes one K buffer and
  // one V buffer per layer. Each buffer holds
  // hidden_size * cache_size * element_size() bytes.
  //
  // Throws std::runtime_error when n_layers is not positive, or when
  // hidden_size or cache_size is negative.
  if (cfg.model.n_layers <= 0)
    throw std::runtime_error("n_layers not properly initialized");
  if (cfg.model.hidden_size < 0)
    throw std::runtime_error("hidden_size must not be negative");
  if (cache_size < 0)
    throw std::runtime_error("cache_size must not be negative");

  // Set KV cache data type from config. This must happen before
  // element_size() is consulted below, because it reads data_type.
  data_type = cfg.kv_cache_type;

  // Widen every factor to size_t *before* multiplying so the product is not
  // evaluated in a narrower signed type (which could overflow). The value is
  // the same for every layer, so it is computed once.
  const size_t buffer_size = static_cast<size_t>(cfg.model.hidden_size) *
                             static_cast<size_t>(cache_size) * element_size();

  // Allocate space for K and V caches for each layer
  // Total: n_layers * 2 vectors (K and V for each layer)
  const size_t n_layers = static_cast<size_t>(cfg.model.n_layers);
  k.resize(n_layers);
  v.resize(n_layers);

  for (size_t i = 0; i < n_layers; ++i)
  {
    // resize() zero-fills only newly added bytes; bytes already present in a
    // buffer are kept as they are.
    k[i].resize(buffer_size, 0);
    v[i].resize(buffer_size, 0);
  }
}

References data_type, element_size(), ggma::ModelConfig::hidden_size, k, ggma::GGMAConfig::kv_cache_type, ggma::GGMAConfig::model, ggma::ModelConfig::n_layers, and v.

Referenced by ggma::Context::Context().

◆ is_valid()

bool ggma::KVCache::is_valid ( ) const

inline

Definition at line 79 of file KVCache.h.

  {
    // The cache is consistent when K and V have the same number of layers
    // and, layer by layer, the same buffer size.
    const size_t layer_count = k.size();
    if (layer_count != v.size())
      return false;

    // Walk forward until the first layer whose K and V sizes disagree.
    size_t layer = 0;
    while (layer < layer_count && k[layer].size() == v[layer].size())
      ++layer;

    // Reaching the end means no mismatch was found.
    return layer == layer_count;
  }

References k, size, and v.

◆ pos()

int64_t ggma::KVCache::pos ( ) const

inline

Definition at line 92 of file KVCache.h.

92 { return _pos; }

References _pos.

Referenced by ggma::Context::generate(), and set_pos().

◆ reset_pos()

void ggma::KVCache::reset_pos ( )

inline

Definition at line 94 of file KVCache.h.

94 { _pos = 0; }

References _pos.

Referenced by ggma::Context::generate().

◆ set_pos()

void ggma::KVCache::set_pos ( int pos )

inline

Definition at line 93 of file KVCache.h.

93 { _pos = pos; }

ggma::KVCache::pos

int64_t pos() const

Definition KVCache.h:92

References _pos, and pos().

Referenced by ggma::Context::generate().

◆ to_nnfw_type()

NNFW_TYPE ggma::KVCache::to_nnfw_type ( ) const

inline

Definition at line 65 of file KVCache.h.

  {
    // Maps the cache element type to the matching NNFW tensor type. Only the
    // UINT8 cache maps to the UINT8 tensor type; FLOAT32 and any other value
    // of data_type map to the FLOAT32 tensor type.
    const bool is_uint8 = (data_type == KVCacheDataType::UINT8);
    return is_uint8 ? NNFW_TYPE_TENSOR_UINT8 : NNFW_TYPE_TENSOR_FLOAT32;
  }

References data_type, ggma::FLOAT32, NNFW_TYPE_TENSOR_FLOAT32, NNFW_TYPE_TENSOR_UINT8, and ggma::UINT8.

Referenced by ggma::Context::prefill().

◆ transpose()

void ggma::KVCache::transpose	(	bool	is_k_cache,
		const char *	perm,
		size_t	seq_len,
		size_t	num_heads,
		size_t	head_dim
	)

Transpose cache with "0213" permutation [0,2,1,3].

Parameters

is_k_cache	true for K cache, false for V cache
perm	Permutation string (must be "0213")
seq_len	Sequence length dimension
num_heads	Number of attention heads
head_dim	Head dimension

Definition at line 68 of file KVCache.cc.

{
  // Rearranges every layer buffer of the selected cache (K or V) so that the
  // head_dim-sized chunk found at index (s, h) of a [seq_len][num_heads]
  // layout ends up at index (h, s) of a [num_heads][seq_len] layout. Bytes
  // beyond seq_len * num_heads * head_dim elements are left unchanged.
  //
  // Throws std::runtime_error when `perm` is not "0213", when the byte size
  // of one head overflows size_t, or when the requested dimensions do not
  // fit inside a layer buffer. All checks run before any buffer is modified.
  if (perm == nullptr || strcmp(perm, "0213") != 0)
    throw std::runtime_error("Only \"0213\" permutation is supported");

  std::vector<std::vector<uint8_t>> &cache_vector = is_k_cache ? k : v;
  const size_t element_bytes = element_size();
  const size_t head_bytes = head_dim * element_bytes;

  // Detect wrap-around in head_dim * element_bytes.
  if (head_dim != 0 && head_bytes / head_dim != element_bytes)
    throw std::runtime_error("head_dim is too large for the KV cache element size");

  // With any zero dimension there is no data to move.
  if (seq_len == 0 || num_heads == 0 || head_bytes == 0)
    return;

  // Bounds check, written with divisions so it cannot overflow: it is
  // equivalent to seq_len * num_heads * head_bytes <= layer.size().
  for (const std::vector<uint8_t> &layer : cache_vector)
  {
    if (layer.size() / head_bytes / num_heads < seq_len)
      throw std::runtime_error("Transpose dimensions exceed KV cache buffer size");
  }

  const size_t src_row_bytes = num_heads * head_bytes; // one sequence step, all heads
  const size_t dst_row_bytes = seq_len * head_bytes;   // one head, all sequence steps

  for (std::vector<uint8_t> &layer : cache_vector)
  {
    // Start from a full copy so bytes outside the transposed region survive.
    std::vector<uint8_t> transposed_cache = layer;
    const uint8_t *input_data = layer.data();
    uint8_t *output_data = transposed_cache.data();

    for (size_t s = 0; s < seq_len; ++s) // seq_len
    {
      for (size_t h = 0; h < num_heads; ++h) // num_heads
      {
        // source offset: s * (num_heads * head_bytes) + h * head_bytes
        // target offset: h * (seq_len * head_bytes) + s * head_bytes
        const uint8_t *src_ptr = input_data + s * src_row_bytes + h * head_bytes;
        uint8_t *dst_ptr = output_data + h * dst_row_bytes + s * head_bytes;
        memcpy(dst_ptr, src_ptr, head_bytes);
      }
    }

    layer = std::move(transposed_cache);
  }
}