ONE - On-device Neural Engine
Loading...
Searching...
No Matches
ggma::Context Class Reference

#include <Context.h>

Public Member Functions

 Context (const char *package_path)
 
GGMAConfig load_config (const std::string &package_path)
 
void prefill (ggma_token *tokens, size_t n_tokens, std::vector< uint8_t > &hidden_state)
 
void unemb (std::vector< uint8_t > &hidden_state, size_t n_tokens, std::vector< float > &logits)
 
ggma_token sample (const std::vector< float > &logits)
 
void decode (ggma_token token_id, std::vector< uint8_t > &hidden_state)
 
void decode (ggma_token token_id, std::vector< float > &logits)
 
 ~Context ()=default
 
GGMA_STATUS generate (ggma_token *tokens, size_t n_tokens, size_t n_tokens_max, size_t *n_predict)
 

Detailed Description

Definition at line 32 of file Context.h.

Constructor & Destructor Documentation

◆ Context()

ggma::Context::Context ( const char *  package_path)

Definition at line 78 of file Context.cc.

78 : _package_path(package_path)
79{
80 _cfg = load_config(_package_path);
81 _cache.init(_cfg, _cfg.cache_size);
82}
GGMAConfig load_config(const std::string &package_path)
Definition Context.cc:84
void init(const ggma::GGMAConfig &cfg, int cache_size)
Definition KVCache.cc:100

References ggma::GGMAConfig::cache_size, ggma::KVCache::init(), and load_config().

◆ ~Context()

ggma::Context::~Context ( )
default

Member Function Documentation

◆ decode() [1/2]

void ggma::Context::decode ( ggma_token  token_id,
std::vector< float > &  logits 
)

Definition at line 281 of file Context.cc.

282{
283 decode_impl<true, std::vector<float>>(token_id, logits);
284}

◆ decode() [2/2]

void ggma::Context::decode ( ggma_token  token_id,
std::vector< uint8_t > &  hidden_state 
)

Definition at line 276 of file Context.cc.

277{
278 decode_impl<false, std::vector<uint8_t>>(token_id, hidden_state);
279}

Referenced by generate().

◆ generate()

GGMA_STATUS ggma::Context::generate ( ggma_token tokens,
size_t  n_tokens,
size_t  n_tokens_max,
size_t *  n_predict 
)

Definition at line 39 of file Generate.cc.

41{
42 try
43 {
44 _cache.reset_pos();
45
46 std::vector<uint8_t> hidden;
47 std::vector<float> logits;
48 ggma_token new_token;
49
50 // 1. Prefill: run the model on the initial prompt to obtain the initial hidden state.
51 prefill(tokens, n_tokens, hidden); // hidden = prefill(tokens)
52
53 // 2. Set cache position to the length of the prompt.
54 _cache.set_pos(n_tokens);
55
56 // 3. Transpose KV caches to the layout expected by the decoder.
 57   _cache.transpose(true /* k */, "0213", _cfg.model.num_attention_heads, _cfg.cache_size,
 58                    _cfg.model.hidden_size / _cfg.model.num_attention_heads);
 59   _cache.transpose(false /* v */, "0213", _cfg.model.num_attention_heads, _cfg.cache_size,
 60                    _cfg.model.hidden_size / _cfg.model.num_attention_heads);
62 // 4. Unembed: obtain logits from the hidden state.
63 unemb(hidden, n_tokens, logits); // logits = unemb(hidden)
64
65 // 5. Determine how many tokens we can actually generate.
66 size_t n_possible = n_tokens_max - n_tokens;
67 if (*n_predict > n_possible)
68 *n_predict = n_possible;
69
70 auto is_end_token = [this](ggma_token token) {
71 return token == _cfg.model.eos_token_id.value_or(-1) || token == 0;
72 };
73
74 // 6. Autoregressive generation loop.
75 while ((_cache.pos() - n_tokens) < *n_predict)
76 {
77 // Sample the most probable token from the logits of the last position.
78 new_token = sample(logits);
79 tokens[n_tokens + (_cache.pos() - n_tokens)] = new_token;
80
81 // Stop if we hit an EOS or padding token.
82 if (is_end_token(new_token))
83 break;
84
85 // Decode: run the model for the newly generated token to update hidden state.
86 decode(new_token, hidden); // hidden = decode(new_token)
87
88 // Unembed: get logits for the next step.
89 unemb(hidden, 1, logits); // logits = unemb(hidden)
90 }
91
92 // Report how many tokens were actually generated.
93 *n_predict = _cache.pos() - n_tokens;
94 }
95 catch (const std::exception &e)
96 {
97 std::cerr << "Error in generate: " << e.what() << std::endl;
98 return GGMA_STATUS_ERROR;
99 }
100  return GGMA_STATUS_NO_ERROR;
101}
void prefill(ggma_token *tokens, size_t n_tokens, std::vector< uint8_t > &hidden_state)
Definition Context.cc:95
void decode(ggma_token token_id, std::vector< uint8_t > &hidden_state)
Definition Context.cc:276
ggma_token sample(const std::vector< float > &logits)
Definition Context.cc:294
void unemb(std::vector< uint8_t > &hidden_state, size_t n_tokens, std::vector< float > &logits)
Definition Context.cc:161
@ GGMA_STATUS_NO_ERROR
Definition ggma_types.h:37
@ GGMA_STATUS_ERROR
Definition ggma_types.h:42
int32_t ggma_token
Definition ggma_types.h:53
ModelConfig model
Definition Config.h:67
void set_pos(int pos)
Definition KVCache.h:93
void transpose(bool is_k_cache, const char *perm, size_t seq_len, size_t num_heads, size_t head_dim)
Transpose cache with "0213" permutation [0,2,1,3].
Definition KVCache.cc:68
void reset_pos()
Definition KVCache.h:94
int64_t pos() const
Definition KVCache.h:92
std::optional< int > eos_token_id
Definition Config.h:45
int num_attention_heads
Definition Config.h:41

References ggma::GGMAConfig::cache_size, decode(), ggma::ModelConfig::eos_token_id, GGMA_STATUS_ERROR, GGMA_STATUS_NO_ERROR, ggma::ModelConfig::hidden_size, ggma::GGMAConfig::model, ggma::ModelConfig::num_attention_heads, ggma::KVCache::pos(), prefill(), ggma::KVCache::reset_pos(), sample(), ggma::KVCache::set_pos(), ggma::KVCache::transpose(), and unemb().

◆ load_config()

ggma::GGMAConfig ggma::Context::load_config ( const std::string &  package_path)

Definition at line 84 of file Context.cc.

85{
86 GGMAConfig config;
87
88 // Load config from package path/config.json
89 std::filesystem::path config_path = std::filesystem::path(package_path) / "config.json";
90 config.model.load_from_file(config_path.string());
91
92 return config;
93}

Referenced by Context().

◆ prefill()

void ggma::Context::prefill ( ggma_token tokens,
size_t  n_tokens,
std::vector< uint8_t > &  hidden_state 
)

Definition at line 95 of file Context.cc.

96{
97 std::filesystem::path nnpkg_path = std::filesystem::path(_package_path) / "prefill";
98 nnfw_session *session = create_and_prepare_session(nnpkg_path.string());
99
100  nnfw_tensorinfo ti;
101
102 // Input 0: token_id
103 // shape = [n_batch, n_seq]
104 // n_batch = 1
105  NNFW_ENSURE_STATUS(nnfw_input_tensorinfo(session, 0, &ti));
106  if (ti.rank != 2 || ti.dims[0] != 1)
107 throw std::runtime_error("prefill : invalid input shape");
108
109 // TODO: Check ubatch from model is same to runtime config
110 int ubatch = ti.dims[1]; // Number of tokens after padding to align to 32 multiples
111 // Use tokens as input without copying (zero-copy)
112 NNFW_ENSURE_STATUS(nnfw_set_input(session, 0, ti.dtype, tokens, ubatch * sizeof(ggma_token)));
113
114 // Expected Output:
115 //
116 // Index | Name | Description
117 // ------|----------|---------------------------
118 // 0 | k0 | key cache for layer 0
119 // 1 | v0 | value cache for layer 0
120 // ... | ... | ...
121 // 2n-2 | k{n-1} | key cache for layer n-1
122 // 2n-1 | v{n-1} | value cache for layer n-1
123 // 2n | hidden | hidden state
124 //
125 // where n = number of layers
126
127 uint32_t num_outputs;
128 NNFW_ENSURE_STATUS(nnfw_output_size(session, &num_outputs));
129 if (num_outputs != _cfg.model.n_layers * 2 + 1)
130 throw std::runtime_error("prefill : number of outputs mismatch");
131
132 // Output 0~2n-1: KV caches
133 for (int i = 0; i < _cfg.model.n_layers; ++i)
134 {
135 if (!_cache.v[i].empty())
136 NNFW_ENSURE_STATUS(nnfw_set_output(session, 2 * i, _cache.to_nnfw_type(), _cache.v[i].data(),
137 _cache.v[i].size()));
138 if (!_cache.k[i].empty())
139 NNFW_ENSURE_STATUS(nnfw_set_output(session, 2 * i + 1, _cache.to_nnfw_type(),
140 _cache.k[i].data(), _cache.k[i].size()));
141 }
142
143 // Output 2n: hidden_state
144 // shape = [n_batch, n_seq, n_emb]
145 // n_batch = 1
146
147  NNFW_ENSURE_STATUS(nnfw_output_tensorinfo(session, num_outputs - 1, &ti));
148  if (ti.rank != 3 || ti.dims[0] != 1)
149 throw std::runtime_error("prefill : invalid hidden shape");
150
151 // Allocate output buffer
152 hidden_state.resize(bufsize_for(&ti), 0);
153 // Output buffer setup - use externally allocated hidden_state (single output for single token)
154  NNFW_ENSURE_STATUS(
155    nnfw_set_output(session, num_outputs - 1, ti.dtype, hidden_state.data(), hidden_state.size()));
156
157  NNFW_ENSURE_STATUS(nnfw_run(session));
158  nnfw_close_session(session);
159}
#define NNFW_ENSURE_STATUS(a)
Definition Context.cc:35
SessionID session(const coco::Module *m)
Definition Session.cpp:48
uint64_t bufsize_for(const nnfw_tensorinfo *ti)
Definition Context.cc:63
nnfw_session * create_and_prepare_session(const std::string &model_path)
Definition Context.cc:45
NNFW_STATUS nnfw_set_input(nnfw_session *session, uint32_t index, NNFW_TYPE type, const void *buffer, size_t length)
Set input buffer.
Definition APIImpl.cc:102
NNFW_STATUS nnfw_output_tensorinfo(nnfw_session *session, uint32_t index, nnfw_tensorinfo *tensor_info)
Get i-th output tensor info.
Definition APIImpl.cc:159
NNFW_STATUS nnfw_input_tensorinfo(nnfw_session *session, uint32_t index, nnfw_tensorinfo *tensor_info)
Get i-th input tensor info.
Definition APIImpl.cc:152
NNFW_STATUS nnfw_output_size(nnfw_session *session, uint32_t *number)
Get the number of outputs.
Definition APIImpl.cc:122
NNFW_STATUS nnfw_run(nnfw_session *session)
Run inference.
Definition APIImpl.cc:84
NNFW_STATUS nnfw_set_output(nnfw_session *session, uint32_t index, NNFW_TYPE type, void *buffer, size_t length)
Set output buffer.
Definition APIImpl.cc:109
NNFW_STATUS nnfw_close_session(nnfw_session *session)
Close a session instance.
Definition APIImpl.cc:66
std::vector< std::vector< uint8_t > > k
Definition KVCache.h:46
std::vector< std::vector< uint8_t > > v
Definition KVCache.h:47
NNFW_TYPE to_nnfw_type() const
Definition KVCache.h:65
tensor info describes the type and shape of tensors
NNFW_TYPE dtype
int32_t dims[NNFW_MAX_RANK]

References ggma::bufsize_for(), ggma::create_and_prepare_session(), nnfw_tensorinfo::dims, nnfw_tensorinfo::dtype, ggma::KVCache::k, ggma::GGMAConfig::model, ggma::ModelConfig::n_layers, nnfw_close_session(), NNFW_ENSURE_STATUS, nnfw_input_tensorinfo(), nnfw_output_size(), nnfw_output_tensorinfo(), nnfw_run(), nnfw_set_input(), nnfw_set_output(), nnfw_tensorinfo::rank, ggma::KVCache::to_nnfw_type(), and ggma::KVCache::v.

Referenced by generate().

◆ sample()

ggma_token ggma::Context::sample ( const std::vector< float > &  logits)

Definition at line 294 of file Context.cc.

295{
296 if (logits.empty())
297 throw std::runtime_error("Empty logits tensor");
298
299 // Calculate total number of float elements in logits tensor
300 size_t total_elements = logits.size();
301
302 if (total_elements % _cfg.model.vocab_size != 0)
303 throw std::runtime_error("Invalid sequence length in logits tensor");
304
305 const float *last_logits = logits.data() + (total_elements - _cfg.model.vocab_size);
306
307 // Find the token with maximum logit value from the last token's logits
308 const float *max_elem_iter = std::max_element(last_logits, last_logits + _cfg.model.vocab_size);
309
310 return std::distance(last_logits, max_elem_iter);
311}

References ggma::GGMAConfig::model, and ggma::ModelConfig::vocab_size.

Referenced by generate().

◆ unemb()

void ggma::Context::unemb ( std::vector< uint8_t > &  hidden_state,
size_t  n_tokens,
std::vector< float > &  logits 
)

Definition at line 161 of file Context.cc.

162{
163 std::filesystem::path nnpkg_path = std::filesystem::path(_package_path) / "unemb";
164 nnfw_session *session = create_and_prepare_session(nnpkg_path.string());
165
166 // Input buffer setup - use externally allocated hidden_state
167  nnfw_tensorinfo ti;
168  NNFW_ENSURE_STATUS(nnfw_input_tensorinfo(session, 0, &ti));
169  // ti[0] : n_batch
170 // ti[1] : n_seq = ubatch if padded
171 // = n_tokens if not padded
172 if (ti.rank != 3 || ti.dims[0] != 1)
173 throw std::runtime_error("unemb : invalid input shape");
174 assert(ti.dims[1] == _cfg.ubatch); // Previously, it was padded to ubatch.
175 // Handle effective (actual) tokens only.
176 ti.dims[1] = n_tokens;
177 // Update buffer and nnfw input tensor info as sequence length is adjusted.
178 hidden_state.resize(bufsize_for(&ti), 0);
179  NNFW_ENSURE_STATUS(nnfw_set_input_tensorinfo(session, 0, &ti));
180  NNFW_ENSURE_STATUS(
181    nnfw_set_input(session, 0, ti.dtype, hidden_state.data(), hidden_state.size()));
182
183 // Output buffer setup - use externally allocated logits
184  NNFW_ENSURE_STATUS(nnfw_output_tensorinfo(session, 0, &ti));
185  // Check if output data type is float
186  if (ti.dtype != NNFW_TYPE_TENSOR_FLOAT32)
187    throw std::runtime_error("unemb: output tensor must be float type");
188 // Allocate output buffer
189 // ti[0] : n_batch
190 // ti[1] : n_seq = ubatch if padded
191 // = n_tokens if not padded
192 if (ti.rank != 3 || ti.dims[0] != 1)
193 throw std::runtime_error("unemb : invalid output shape");
194 assert(ti.dims[1] == _cfg.ubatch); // Previously, it was padded to ubatch.
195 // Handle effective (actual) tokens only.
196 ti.dims[1] = n_tokens;
197 logits.resize(num_elems(&ti), 0);
198  NNFW_ENSURE_STATUS(
199    nnfw_set_output(session, 0, ti.dtype, logits.data(), logits.size() * sizeof(logits[0])));
200
201  NNFW_ENSURE_STATUS(nnfw_run(session));
202  nnfw_close_session(session);
203}
uint64_t num_elems(const nnfw_tensorinfo *tensor_info)
Definition Context.cc:55
NNFW_STATUS nnfw_set_input_tensorinfo(nnfw_session *session, uint32_t index, const nnfw_tensorinfo *tensor_info)
Set input model's tensor info for resizing.
Definition APIImpl.cc:178
@ NNFW_TYPE_TENSOR_FLOAT32
Definition onert-micro.h:77

References ggma::bufsize_for(), ggma::create_and_prepare_session(), nnfw_tensorinfo::dims, nnfw_tensorinfo::dtype, nnfw_close_session(), NNFW_ENSURE_STATUS, nnfw_input_tensorinfo(), nnfw_output_tensorinfo(), nnfw_run(), nnfw_set_input(), nnfw_set_input_tensorinfo(), nnfw_set_output(), NNFW_TYPE_TENSOR_FLOAT32, ggma::num_elems(), nnfw_tensorinfo::rank, and ggma::GGMAConfig::ubatch.

Referenced by generate().


The documentation for this class was generated from the following files: