encoding.h
#
Functions to manage various encodings.
Authors#
TheSilvered
Macros#
Nst_CP_MULTIBYTE_MAX_SIZE
#
Description:
Maximum size of a multi-byte character across all supported encodings.
Nst_CP_BOM_MAX_SIZE
#
Description:
Maximum size of the BOM across all supported encodings.
Structs#
Nst_CP
#
Synopsis:
typedef struct _Nst_CP {
const usize ch_size;
const usize mult_max_sz;
const usize mult_min_sz;
const i8 *name;
const i8 *bom;
const usize bom_size;
const Nst_CheckBytesFunc check_bytes;
const Nst_ToUTF32Func to_utf32;
const Nst_FromUTF32Func from_utf32;
} Nst_CP
Description:
The structure that represents an encoding.
Fields:
ch_size
: the size of one unit in bytes (e.g. it is 1 in UTF-8 but 2 in UTF-16)
mult_max_sz
: the size in bytes of the longest character
mult_min_sz
: the size in bytes of the shortest character (usually the same as ch_size)
name
: the name of the encoding displayed in errors
check_bytes
: the Nst_CheckBytesFunc function of the encoding
to_utf32
: the Nst_ToUTF32Func function of the encoding
from_utf32
: the Nst_FromUTF32Func function of the encoding
Type aliases#
Nst_CheckBytesFunc
#
Synopsis:
typedef i32 (*Nst_CheckBytesFunc)(void *str, usize len)
Description:
The signature of a function that checks the length of the first character in a string of a certain encoding.
Nst_ToUTF32Func
#
Synopsis:
typedef u32 (*Nst_ToUTF32Func)(void *str)
Description:
The signature of a function that returns the code point of the first character in a string of a certain encoding, expecting a valid sequence of bytes.
Nst_FromUTF32Func
#
Synopsis:
typedef i32 (*Nst_FromUTF32Func)(u32 ch, void *buf)
Description:
The signature of a function that encodes a code point in a certain encoding and writes the output to a buffer.
Functions#
Nst_check_ascii_bytes
#
Synopsis:
i32 Nst_check_ascii_bytes(u8 *str, usize len)
Description:
Nst_CheckBytesFunc
for ASCII.
Nst_ascii_to_utf32
#
Synopsis:
u32 Nst_ascii_to_utf32(u8 *str)
Description:
Nst_ToUTF32Func
for ASCII.
Nst_ascii_from_utf32
#
Synopsis:
i32 Nst_ascii_from_utf32(u32 ch, u8 *str)
Description:
Nst_FromUTF32Func
for ASCII.
Nst_check_utf8_bytes
#
Synopsis:
i32 Nst_check_utf8_bytes(u8 *str, usize len)
Description:
Nst_CheckBytesFunc
for UTF-8.
Nst_utf8_to_utf32
#
Synopsis:
u32 Nst_utf8_to_utf32(u8 *str)
Description:
Nst_ToUTF32Func
for UTF-8.
Nst_utf8_from_utf32
#
Synopsis:
i32 Nst_utf8_from_utf32(u32 ch, u8 *str)
Description:
Nst_FromUTF32Func
for UTF-8.
Nst_check_ext_utf8_bytes
#
Synopsis:
i32 Nst_check_ext_utf8_bytes(u8 *str, usize len)
Description:
Nst_CheckBytesFunc
for extUTF-8.
Nst_ext_utf8_to_utf32
#
Synopsis:
u32 Nst_ext_utf8_to_utf32(u8 *str)
Description:
Nst_ToUTF32Func
for extUTF-8.
Nst_ext_utf8_from_utf32
#
Synopsis:
i32 Nst_ext_utf8_from_utf32(u32 ch, u8 *str)
Description:
Nst_FromUTF32Func
for extUTF-8.
Nst_check_utf16_bytes
#
Synopsis:
i32 Nst_check_utf16_bytes(u16 *str, usize len)
Description:
Nst_CheckBytesFunc
for UTF-16.
Nst_utf16_to_utf32
#
Synopsis:
u32 Nst_utf16_to_utf32(u16 *str)
Description:
Nst_ToUTF32Func
for UTF-16.
Nst_utf16_from_utf32
#
Synopsis:
i32 Nst_utf16_from_utf32(u32 ch, u16 *str)
Description:
Nst_FromUTF32Func
for UTF-16.
Nst_check_utf16be_bytes
#
Synopsis:
i32 Nst_check_utf16be_bytes(u8 *str, usize len)
Description:
Nst_CheckBytesFunc
for UTF-16BE.
Nst_utf16be_to_utf32
#
Synopsis:
u32 Nst_utf16be_to_utf32(u8 *str)
Description:
Nst_ToUTF32Func
for UTF-16BE.
Nst_utf16be_from_utf32
#
Synopsis:
i32 Nst_utf16be_from_utf32(u32 ch, u8 *str)
Description:
Nst_FromUTF32Func
for UTF-16BE.
Nst_check_utf16le_bytes
#
Synopsis:
i32 Nst_check_utf16le_bytes(u8 *str, usize len)
Description:
Nst_CheckBytesFunc
for UTF-16LE.
Nst_utf16le_to_utf32
#
Synopsis:
u32 Nst_utf16le_to_utf32(u8 *str)
Description:
Nst_ToUTF32Func
for UTF-16LE.
Nst_utf16le_from_utf32
#
Synopsis:
i32 Nst_utf16le_from_utf32(u32 ch, u8 *str)
Description:
Nst_FromUTF32Func
for UTF-16LE.
Nst_check_ext_utf16_bytes
#
Synopsis:
i32 Nst_check_ext_utf16_bytes(u16 *str, usize len)
Description:
Nst_CheckBytesFunc
for extUTF-16.
Nst_ext_utf16_to_utf32
#
Synopsis:
u32 Nst_ext_utf16_to_utf32(u16 *str)
Description:
Nst_ToUTF32Func
for extUTF-16.
Nst_ext_utf16_from_utf32
#
Synopsis:
i32 Nst_ext_utf16_from_utf32(u32 ch, u16 *str)
Description:
Nst_FromUTF32Func
for extUTF-16.
Nst_check_ext_utf16be_bytes
#
Synopsis:
i32 Nst_check_ext_utf16be_bytes(u8 *str, usize len)
Description:
Nst_CheckBytesFunc
for extUTF-16BE.
Nst_ext_utf16be_to_utf32
#
Synopsis:
u32 Nst_ext_utf16be_to_utf32(u8 *str)
Description:
Nst_ToUTF32Func
for extUTF-16BE.
Nst_ext_utf16be_from_utf32
#
Synopsis:
i32 Nst_ext_utf16be_from_utf32(u32 ch, u8 *str)
Description:
Nst_FromUTF32Func
for extUTF-16BE.
Nst_check_ext_utf16le_bytes
#
Synopsis:
i32 Nst_check_ext_utf16le_bytes(u8 *str, usize len)
Description:
Nst_CheckBytesFunc
for extUTF-16LE.
Nst_ext_utf16le_to_utf32
#
Synopsis:
u32 Nst_ext_utf16le_to_utf32(u8 *str)
Description:
Nst_ToUTF32Func
for extUTF-16LE.
Nst_ext_utf16le_from_utf32
#
Synopsis:
i32 Nst_ext_utf16le_from_utf32(u32 ch, u8 *str)
Description:
Nst_FromUTF32Func
for extUTF-16LE.
Nst_check_utf32_bytes
#
Synopsis:
i32 Nst_check_utf32_bytes(u32 *str, usize len)
Description:
Nst_CheckBytesFunc
for UTF-32.
Nst_utf32_to_utf32
#
Synopsis:
u32 Nst_utf32_to_utf32(u32 *str)
Description:
Nst_ToUTF32Func
for UTF-32.
Nst_utf32_from_utf32
#
Synopsis:
i32 Nst_utf32_from_utf32(u32 ch, u32 *str)
Description:
Nst_FromUTF32Func
for UTF-32.
Nst_check_utf32be_bytes
#
Synopsis:
i32 Nst_check_utf32be_bytes(u8 *str, usize len)
Description:
Nst_CheckBytesFunc
for UTF-32BE.
Nst_utf32be_to_utf32
#
Synopsis:
u32 Nst_utf32be_to_utf32(u8 *str)
Description:
Nst_ToUTF32Func
for UTF-32BE.
Nst_utf32be_from_utf32
#
Synopsis:
i32 Nst_utf32be_from_utf32(u32 ch, u8 *str)
Description:
Nst_FromUTF32Func
for UTF-32BE.
Nst_check_utf32le_bytes
#
Synopsis:
i32 Nst_check_utf32le_bytes(u8 *str, usize len)
Description:
Nst_CheckBytesFunc
for UTF-32LE.
Nst_utf32le_to_utf32
#
Synopsis:
u32 Nst_utf32le_to_utf32(u8 *str)
Description:
Nst_ToUTF32Func
for UTF-32LE.
Nst_utf32le_from_utf32
#
Synopsis:
i32 Nst_utf32le_from_utf32(u32 ch, u8 *str)
Description:
Nst_FromUTF32Func
for UTF-32LE.
Nst_check_1250_bytes
#
Synopsis:
i32 Nst_check_1250_bytes(u8 *str, usize len)
Description:
Nst_CheckBytesFunc
for CP1250.
Nst_1250_to_utf32
#
Synopsis:
u32 Nst_1250_to_utf32(u8 *str)
Description:
Nst_ToUTF32Func
for CP1250.
Nst_1250_from_utf32
#
Synopsis:
i32 Nst_1250_from_utf32(u32 ch, u8 *str)
Description:
Nst_FromUTF32Func
for CP1250.
Nst_check_1251_bytes
#
Synopsis:
i32 Nst_check_1251_bytes(u8 *str, usize len)
Description:
Nst_CheckBytesFunc
for CP1251.
Nst_1251_to_utf32
#
Synopsis:
u32 Nst_1251_to_utf32(u8 *str)
Description:
Nst_ToUTF32Func
for CP1251.
Nst_1251_from_utf32
#
Synopsis:
i32 Nst_1251_from_utf32(u32 ch, u8 *str)
Description:
Nst_FromUTF32Func
for CP1251.
Nst_check_1252_bytes
#
Synopsis:
i32 Nst_check_1252_bytes(u8 *str, usize len)
Description:
Nst_CheckBytesFunc
for CP1252.
Nst_1252_to_utf32
#
Synopsis:
u32 Nst_1252_to_utf32(u8 *str)
Description:
Nst_ToUTF32Func
for CP1252.
Nst_1252_from_utf32
#
Synopsis:
i32 Nst_1252_from_utf32(u32 ch, u8 *str)
Description:
Nst_FromUTF32Func
for CP1252.
Nst_check_1253_bytes
#
Synopsis:
i32 Nst_check_1253_bytes(u8 *str, usize len)
Description:
Nst_CheckBytesFunc
for CP1253.
Nst_1253_to_utf32
#
Synopsis:
u32 Nst_1253_to_utf32(u8 *str)
Description:
Nst_ToUTF32Func
for CP1253.
Nst_1253_from_utf32
#
Synopsis:
i32 Nst_1253_from_utf32(u32 ch, u8 *str)
Description:
Nst_FromUTF32Func
for CP1253.
Nst_check_1254_bytes
#
Synopsis:
i32 Nst_check_1254_bytes(u8 *str, usize len)
Description:
Nst_CheckBytesFunc
for CP1254.
Nst_1254_to_utf32
#
Synopsis:
u32 Nst_1254_to_utf32(u8 *str)
Description:
Nst_ToUTF32Func
for CP1254.
Nst_1254_from_utf32
#
Synopsis:
i32 Nst_1254_from_utf32(u32 ch, u8 *str)
Description:
Nst_FromUTF32Func
for CP1254.
Nst_check_1255_bytes
#
Synopsis:
i32 Nst_check_1255_bytes(u8 *str, usize len)
Description:
Nst_CheckBytesFunc
for CP1255.
Nst_1255_to_utf32
#
Synopsis:
u32 Nst_1255_to_utf32(u8 *str)
Description:
Nst_ToUTF32Func
for CP1255.
Nst_1255_from_utf32
#
Synopsis:
i32 Nst_1255_from_utf32(u32 ch, u8 *str)
Description:
Nst_FromUTF32Func
for CP1255.
Nst_check_1256_bytes
#
Synopsis:
i32 Nst_check_1256_bytes(u8 *str, usize len)
Description:
Nst_CheckBytesFunc
for CP1256.
Nst_1256_to_utf32
#
Synopsis:
u32 Nst_1256_to_utf32(u8 *str)
Description:
Nst_ToUTF32Func
for CP1256.
Nst_1256_from_utf32
#
Synopsis:
i32 Nst_1256_from_utf32(u32 ch, u8 *str)
Description:
Nst_FromUTF32Func
for CP1256.
Nst_check_1257_bytes
#
Synopsis:
i32 Nst_check_1257_bytes(u8 *str, usize len)
Description:
Nst_CheckBytesFunc
for CP1257.
Nst_1257_to_utf32
#
Synopsis:
u32 Nst_1257_to_utf32(u8 *str)
Description:
Nst_ToUTF32Func
for CP1257.
Nst_1257_from_utf32
#
Synopsis:
i32 Nst_1257_from_utf32(u32 ch, u8 *str)
Description:
Nst_FromUTF32Func
for CP1257.
Nst_check_1258_bytes
#
Synopsis:
i32 Nst_check_1258_bytes(u8 *str, usize len)
Description:
Nst_CheckBytesFunc
for CP1258.
Nst_1258_to_utf32
#
Synopsis:
u32 Nst_1258_to_utf32(u8 *str)
Description:
Nst_ToUTF32Func
for CP1258.
Nst_1258_from_utf32
#
Synopsis:
i32 Nst_1258_from_utf32(u32 ch, u8 *str)
Description:
Nst_FromUTF32Func
for CP1258.
Nst_check_iso8859_1_bytes
#
Synopsis:
i32 Nst_check_iso8859_1_bytes(u8 *str, usize len)
Description:
Nst_CheckBytesFunc
for ISO-8859-1.
Nst_iso8859_1_to_utf32
#
Synopsis:
u32 Nst_iso8859_1_to_utf32(u8 *str)
Description:
Nst_ToUTF32Func
for ISO-8859-1.
Nst_iso8859_1_from_utf32
#
Synopsis:
i32 Nst_iso8859_1_from_utf32(u32 ch, u8 *str)
Description:
Nst_FromUTF32Func
for ISO-8859-1.
Nst_utf16_to_utf8
#
Synopsis:
i32 Nst_utf16_to_utf8(i8 *out_str, u16 *in_str, usize in_str_len)
Description:
Translate the first character of a Unicode (UTF-16) string to UTF-8.
Parameters:
out_str
: the buffer where the UTF-8 character is written
in_str
: the input string to read
in_str_len
: the length of the input string
Returns:
The function returns the number of bytes written or -1
on error. No error is
set.
Nst_translate_cp
#
Synopsis:
bool Nst_translate_cp(Nst_CP *from, Nst_CP *to, void *from_buf, usize from_len,
void **to_buf, usize *to_len)
Description:
Translate a string to another encoding.
All pointers are expected to be valid and not NULL
except for to_len
that
can be NULL
if there is no need to get the length of the output string.
Parameters:
from
: the encoding of the given string
to
: the encoding to translate the string to
from_buf
: the initial string
from_len
: the length in units of the given string (a unit is 1 byte for char8_t strings, two bytes for char16_t strings, etc.)
to_buf
: the pointer where the newly translated string is put
to_len
: the pointer where the length of the translated string is put; it can be NULL
Returns:
true
on success and false
on failure. On failure the error is always set.
Nst_check_string_cp
#
Synopsis:
isize Nst_check_string_cp(Nst_CP *cp, void *str, usize str_len)
Description:
Checks the validity of the encoding of a string.
Parameters:
cp
: the expected encoding of the string
str
: the string to check
str_len
: the length in units of the string (a unit is 1 byte for char8_t strings, two bytes for char16_t strings, etc.)
Returns:
The index in units of the first invalid byte or -1
if the string is correctly
encoded. No error is set.
Nst_cp
#
Synopsis:
Nst_CP *Nst_cp(Nst_CPID cpid)
Returns:
The corresponding encoding structure given its ID. If an invalid ID is given,
NULL
is returned and no error is set.
Nst_acp
#
Synopsis:
Nst_CPID Nst_acp(void)
Description:
WINDOWS ONLY Returns the Nest code page ID of the local ANSI code page. If
the ANSI code page is not supported,
Nst_CP_LATIN1
is returned.
Nst_char_to_wchar_t
#
Synopsis:
wchar_t *Nst_char_to_wchar_t(i8 *str, usize len)
Description:
Translates a UTF-8 string to Unicode (UTF-16).
The new string is heap-allocated. str is assumed to be a valid non-NULL pointer.
Parameters:
str
: the string to translate
len
: the length of the string; if 0, it is calculated with strlen
Returns:
The function returns the new string or NULL on failure. If the function fails, the error is set.
Nst_wchar_t_to_char
#
Synopsis:
i8 *Nst_wchar_t_to_char(wchar_t *str, usize len)
Description:
Translates a Unicode (UTF-16) string to UTF-8.
The new string is heap-allocated. str is assumed to be a valid non-NULL pointer.
Parameters:
str
: the string to translate
len
: the length of the string; if 0, it is calculated with wcslen
Returns:
The function returns the new string or NULL
on failure. If the function fails,
the error is set.
Nst_is_valid_cp
#
Synopsis:
bool Nst_is_valid_cp(u32 cp)
Description:
Returns whether a code point is valid. A valid code point is less than or equal to U+10FFFF and is not a high or low surrogate.
Nst_is_non_character
#
Synopsis:
bool Nst_is_non_character(u32 cp)
Description:
Returns whether a code point is a non-character.
Nst_check_bom
#
Synopsis:
Nst_CPID Nst_check_bom(i8 *str, usize len, i32 *bom_size)
Returns:
The Nst_CPID
deduced from the Byte Order Mark or
Nst_CP_UNKNOWN
if no BOM was detected.
Nst_detect_encoding
#
Synopsis:
Nst_CPID Nst_detect_encoding(i8 *str, usize len, i32 *bom_size)
Description:
Detects the encoding of a file.
If no valid encoding is detected, Nst_CP_LATIN1
is returned. No error is set.
Nst_encoding_from_name
#
Synopsis:
Nst_CPID Nst_encoding_from_name(i8 *name)
Returns:
The encoding ID from a C string, if no matching encoding is found,
Nst_CP_UNKNOWN
is returned. No error is set.
Nst_single_byte_cp
#
Synopsis:
Nst_CPID Nst_single_byte_cp(Nst_CPID cpid)
Returns:
The little endian variation of a multi-byte encoding or the encoding itself, though always one with a unit size of one byte.
Enums#
Nst_CPID
#
Synopsis:
typedef enum _Nst_CPID {
Nst_CP_UNKNOWN = -1,
Nst_CP_ASCII,
Nst_CP_UTF8,
Nst_CP_EXT_UTF8,
Nst_CP_UTF16,
Nst_CP_UTF16BE,
Nst_CP_UTF16LE,
Nst_CP_EXT_UTF16,
Nst_CP_EXT_UTF16BE,
Nst_CP_EXT_UTF16LE,
Nst_CP_UTF32,
Nst_CP_UTF32BE,
Nst_CP_UTF32LE,
Nst_CP_1250,
Nst_CP_1251,
Nst_CP_1252,
Nst_CP_1253,
Nst_CP_1254,
Nst_CP_1255,
Nst_CP_1256,
Nst_CP_1257,
Nst_CP_1258,
Nst_CP_LATIN1,
Nst_CP_ISO8859_1 = Nst_CP_LATIN1
} Nst_CPID
Description:
The supported encodings in Nest.
Nst_CP_UNKNOWN
is -1,
Nst_CP_LATIN1
and
Nst_CP_ISO8859_1
are equivalent.
Note
Nst_CP_EXT_UTF8
is a UTF-8 encoding that
allows surrogates to be encoded.
Note
Nst_CP_EXT_UTF16
along with the little and
big endian versions are UTF-16 encodings that allow for unpaired surrogates
with the only constraint being that a high surrogate cannot be the last
character.