#include "ggma_tokenize.h"
#include "ggma_types.h"
#include "tokenize/TokenizerFactory.h"
#include <string>

Functions
GGMA_STATUS	ggma_create_tokenizer (ggma_tokenizer **tokenizer, const char *tokenizer_path)
	Creates a GGMA tokenizer from a specified tokenizer path.

GGMA_STATUS	ggma_free_tokenizer (ggma_tokenizer *tokenizer)
	Frees all resources associated with a GGMA tokenizer.

GGMA_STATUS	ggma_tokenize (const ggma_tokenizer *tokenizer, const char *text, size_t text_len, int32_t *tokens, size_t n_tokens_max, size_t *n_tokens)
	Tokenizes an input text string into a sequence of token IDs.

GGMA_STATUS	ggma_detokenize (const ggma_tokenizer *tokenizer, const int32_t *tokens, size_t n_tokens, char *text, size_t text_len)
	Detokenizes a sequence of token IDs back into a text string.

Function Documentation

◆ ggma_create_tokenizer()

GGMA_STATUS ggma_create_tokenizer	(	ggma_tokenizer **	tokenizer,
		const char *	tokenizer_path
	)

Creates a GGMA tokenizer from a specified tokenizer path.

This function loads the necessary tokenizer components from the given tokenizer path and initializes a GGMA tokenizer handle.

Parameters

[out]	tokenizer	Pointer to the tokenizer object created from the given path
[in]	tokenizer_path	The path to the directory containing the tokenizer model and vocabulary

Returns: GGMA_STATUS_NO_ERROR on success, or an appropriate error code on failure: GGMA_STATUS_UNEXPECTED_NULL if tokenizer or tokenizer_path is NULL, or GGMA_STATUS_ERROR if the tokenizer cannot be created.

Definition at line 25 of file ggma_tokenize.cc.

{
  // Both the output slot and the path are mandatory.
  if (!tokenizer || !tokenizer_path)
    return GGMA_STATUS_UNEXPECTED_NULL;

  try
  {
    // The tokenizer kind is fixed here; only the path comes from the caller.
    std::string tokenizer_id = "sentencepiece";
    auto impl = ggma::TokenizerFactory::create(tokenizer_id, tokenizer_path);

    // Do not report success with a null handle: the other entry points reject
    // a null tokenizer, so the caller would only find out on the next call.
    if (!impl)
      return GGMA_STATUS_ERROR;

    // The opaque handle handed to the caller is the implementation pointer itself.
    *tokenizer = reinterpret_cast<ggma_tokenizer *>(impl);
    return GGMA_STATUS_NO_ERROR;
  }
  catch (...)
  {
    // Any exception raised while creating the tokenizer becomes a status code.
    return GGMA_STATUS_ERROR;
  }
}

References ggma::TokenizerFactory::create(), GGMA_STATUS_ERROR, GGMA_STATUS_NO_ERROR, and GGMA_STATUS_UNEXPECTED_NULL.

◆ ggma_detokenize()

GGMA_STATUS ggma_detokenize	(	const ggma_tokenizer *	tokenizer,
		const int32_t *	tokens,
		size_t	n_tokens,
		char *	text,
		size_t	text_len
	)

Detokenizes a sequence of token IDs back into a text string.

This function uses the vocabulary from the created tokenizer to convert the sequence of token IDs back into a human-readable text string.

Parameters

[in]	tokenizer	The GGMA tokenizer handle for detokenization.
[in]	tokens	A pointer to the input buffer containing the token IDs to be detokenized.
[in]	n_tokens	The number of tokens in the `tokens` buffer.
[out]	text	A pointer to the output buffer where the detokenized text will be stored.
[in]	text_len	The maximum size of the `text` buffer in bytes.

Returns: GGMA_STATUS_NO_ERROR if successful, or an appropriate error code on failure: GGMA_STATUS_UNEXPECTED_NULL if tokenizer, tokens, or text is NULL, or another error code if detokenization fails (e.g., if the output buffer is too small).

Definition at line 79 of file ggma_tokenize.cc.

{
  // The handle, the token buffer and the output text buffer must all be present.
  if (tokenizer == nullptr || tokens == nullptr || text == nullptr)
  {
    return GGMA_STATUS_UNEXPECTED_NULL;
  }

  // The opaque handle is viewed as the underlying read-only tokenizer object.
  const auto *backend = reinterpret_cast<const ggma::Tokenizer *>(tokenizer);

  try
  {
    // Whatever detokenize() returns (if anything) is not inspected here;
    // only an exception is treated as failure.
    backend->detokenize(tokens, n_tokens, text, text_len);
  }
  catch (...)
  {
    return GGMA_STATUS_ERROR;
  }

  return GGMA_STATUS_NO_ERROR;
}

References GGMA_STATUS_ERROR, GGMA_STATUS_NO_ERROR, and GGMA_STATUS_UNEXPECTED_NULL.

◆ ggma_free_tokenizer()

GGMA_STATUS ggma_free_tokenizer ( ggma_tokenizer * tokenizer )

Frees all resources associated with a GGMA tokenizer.

Parameters

[in] tokenizer The GGMA tokenizer to free. This handle will be invalid after the call.

Returns: GGMA_STATUS_NO_ERROR if successful, or an appropriate error code on failure (e.g., GGMA_STATUS_UNEXPECTED_NULL if tokenizer is NULL).

Definition at line 44 of file ggma_tokenize.cc.

{
  // Nothing to release when no handle was supplied.
  if (tokenizer == nullptr)
  {
    return GGMA_STATUS_UNEXPECTED_NULL;
  }

  // The opaque handle is viewed as the underlying tokenizer object to destroy.
  auto *backend = reinterpret_cast<ggma::Tokenizer *>(tokenizer);

  try
  {
    delete backend;
  }
  catch (...)
  {
    // An exception escaping destruction is reported as a generic error.
    return GGMA_STATUS_ERROR;
  }

  return GGMA_STATUS_NO_ERROR;
}

References GGMA_STATUS_ERROR, GGMA_STATUS_NO_ERROR, and GGMA_STATUS_UNEXPECTED_NULL.

◆ ggma_tokenize()

GGMA_STATUS ggma_tokenize	(	const ggma_tokenizer *	tokenizer,
		const char *	text,
		size_t	text_len,
		int32_t *	tokens,
		size_t	n_tokens_max,
		size_t *	n_tokens
	)

Tokenizes an input text string into a sequence of token IDs.

This function uses the vocabulary from the created tokenizer to convert the input text into a series of numerical token IDs.

Parameters

[in]	tokenizer	The GGMA tokenizer handle for tokenization.
[in]	text	The null-terminated text string to be tokenized.
[in]	text_len	The length of the text in bytes. If the text is null-terminated, this can be 0 and the length will be determined internally.
[out]	tokens	Output buffer for generated token IDs.
[in]	n_tokens_max	Maximum number of tokens the `tokens` buffer can hold.
[out]	n_tokens	A pointer to a variable that will receive the actual number of tokens written to the `tokens` buffer.

Returns: GGMA_STATUS_NO_ERROR if successful, or an appropriate error code on failure: GGMA_STATUS_UNEXPECTED_NULL if tokenizer, text, tokens, or n_tokens is NULL, or another error code if tokenization fails (e.g., if the output buffer is too small).

Definition at line 61 of file ggma_tokenize.cc.

{
  // The handle, the input text, the token buffer and the count slot must all be present.
  if (tokenizer == nullptr || text == nullptr || tokens == nullptr || n_tokens == nullptr)
  {
    return GGMA_STATUS_UNEXPECTED_NULL;
  }

  // The opaque handle is viewed as the underlying read-only tokenizer object.
  const auto *backend = reinterpret_cast<const ggma::Tokenizer *>(tokenizer);

  try
  {
    // Whatever tokenize() returns (if anything) is not inspected here;
    // only an exception is treated as failure.
    backend->tokenize(text, text_len, tokens, n_tokens_max, n_tokens);
  }
  catch (...)
  {
    return GGMA_STATUS_ERROR;
  }

  return GGMA_STATUS_NO_ERROR;
}

References GGMA_STATUS_ERROR, GGMA_STATUS_NO_ERROR, and GGMA_STATUS_UNEXPECTED_NULL.

Functions

Function Documentation

◆ ggma_create_tokenizer()

◆ ggma_detokenize()

◆ ggma_free_tokenizer()

◆ ggma_tokenize()