ONE - On-device Neural Engine
#include <Context.h>
Public Member Functions
| | Context (const char *package_path) |
| GGMAConfig | load_config (const std::string &package_path) |
| void | prefill (ggma_token *tokens, size_t n_tokens, std::vector< uint8_t > &hidden_state) |
| void | unemb (std::vector< uint8_t > &hidden_state, size_t n_tokens, std::vector< float > &logits) |
| ggma_token | sample (const std::vector< float > &logits) |
| void | decode (ggma_token token_id, std::vector< uint8_t > &hidden_state) |
| void | decode (ggma_token token_id, std::vector< float > &logits) |
| | ~Context ()=default |
| GGMA_STATUS | generate (ggma_token *tokens, size_t n_tokens, size_t n_tokens_max, size_t *n_predict) |
ggma::Context::Context (const char *package_path)
Definition at line 78 of file Context.cc.
References ggma::GGMAConfig::cache_size, ggma::KVCache::init(), and load_config().
ggma::Context::~Context () [default]
void ggma::Context::decode (ggma_token token_id, std::vector< float > &logits)
Definition at line 281 of file Context.cc.
void ggma::Context::decode (ggma_token token_id, std::vector< uint8_t > &hidden_state)
Definition at line 276 of file Context.cc.
Referenced by generate().
GGMA_STATUS ggma::Context::generate (ggma_token *tokens, size_t n_tokens, size_t n_tokens_max, size_t *n_predict)
Definition at line 39 of file Generate.cc.
References ggma::GGMAConfig::cache_size, decode(), ggma::ModelConfig::eos_token_id, GGMA_STATUS_ERROR, GGMA_STATUS_NO_ERROR, ggma::ModelConfig::hidden_size, ggma::GGMAConfig::model, ggma::ModelConfig::num_attention_heads, ggma::KVCache::pos(), prefill(), ggma::KVCache::reset_pos(), sample(), ggma::KVCache::set_pos(), ggma::KVCache::transpose(), and unemb().
ggma::GGMAConfig ggma::Context::load_config (const std::string &package_path)
Definition at line 84 of file Context.cc.
Referenced by Context().
void ggma::Context::prefill (ggma_token *tokens, size_t n_tokens, std::vector< uint8_t > &hidden_state)
Definition at line 95 of file Context.cc.
References ggma::bufsize_for(), ggma::create_and_prepare_session(), nnfw_tensorinfo::dims, nnfw_tensorinfo::dtype, ggma::KVCache::k, ggma::GGMAConfig::model, ggma::ModelConfig::n_layers, nnfw_close_session(), NNFW_ENSURE_STATUS, nnfw_input_tensorinfo(), nnfw_output_size(), nnfw_output_tensorinfo(), nnfw_run(), nnfw_set_input(), nnfw_set_output(), nnfw_tensorinfo::rank, ggma::KVCache::to_nnfw_type(), and ggma::KVCache::v.
Referenced by generate().
ggma_token ggma::Context::sample (const std::vector< float > &logits)
Definition at line 294 of file Context.cc.
References ggma::GGMAConfig::model, and ggma::ModelConfig::vocab_size.
Referenced by generate().
void ggma::Context::unemb (std::vector< uint8_t > &hidden_state, size_t n_tokens, std::vector< float > &logits)
Definition at line 161 of file Context.cc.
References ggma::bufsize_for(), ggma::create_and_prepare_session(), nnfw_tensorinfo::dims, nnfw_tensorinfo::dtype, nnfw_close_session(), NNFW_ENSURE_STATUS, nnfw_input_tensorinfo(), nnfw_output_tensorinfo(), nnfw_run(), nnfw_set_input(), nnfw_set_input_tensorinfo(), nnfw_set_output(), NNFW_TYPE_TENSOR_FLOAT32, ggma::num_elems(), nnfw_tensorinfo::rank, and ggma::GGMAConfig::ubatch.
Referenced by generate().