diff options
author | Michael Brown <mcb30@ipxe.org> | 2022-02-28 13:37:40 +0000 |
---|---|---|
committer | Michael Brown <mcb30@ipxe.org> | 2022-03-01 15:57:33 +0000 |
commit | 3cd3a7326178bd10fb38e09eb702b27bc463d3c6 (patch) | |
tree | a863df88bf8509fe64395d6bb479d66871043bc7 /src/include/ipxe/utf8.h | |
parent | 2acdc92994e7aca397b0d24b112e4973e82e0f91 (diff) | |
download | ipxe-3cd3a7326178bd10fb38e09eb702b27bc463d3c6.tar.gz |
[utf8] Add ability to accumulate Unicode characters from UTF-8 bytes
Signed-off-by: Michael Brown <mcb30@ipxe.org>
Diffstat (limited to 'src/include/ipxe/utf8.h')
-rw-r--r-- | src/include/ipxe/utf8.h | 69 |
1 files changed, 69 insertions, 0 deletions
diff --git a/src/include/ipxe/utf8.h b/src/include/ipxe/utf8.h new file mode 100644 index 000000000..299c25511 --- /dev/null +++ b/src/include/ipxe/utf8.h @@ -0,0 +1,69 @@ +#ifndef _IPXE_UTF8_H +#define _IPXE_UTF8_H + +/** @file + * + * UTF-8 Unicode encoding + * + */ + +FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL ); + +#include <stdint.h> + +/** Maximum length of UTF-8 sequence */ +#define UTF8_MAX_LEN 4 + +/** Minimum legal value for two-byte UTF-8 sequence */ +#define UTF8_MIN_TWO 0x80 + +/** Minimum legal value for three-byte UTF-8 sequence */ +#define UTF8_MIN_THREE 0x800 + +/** Minimum legal value for four-byte UTF-8 sequence */ +#define UTF8_MIN_FOUR 0x10000 + +/** High bit of UTF-8 bytes */ +#define UTF8_HIGH_BIT 0x80 + +/** Number of data bits in each continuation byte */ +#define UTF8_CONTINUATION_BITS 6 + +/** Bit mask for data bits in a continuation byte */ +#define UTF8_CONTINUATION_MASK ( ( 1 << UTF8_CONTINUATION_BITS ) - 1 ) + +/** Non-data bits in a continuation byte */ +#define UTF8_CONTINUATION 0x80 + +/** Check for a continuation byte + * + * @v byte UTF-8 byte + * @ret is_continuation Byte is a continuation byte + */ +#define UTF8_IS_CONTINUATION( byte ) \ + ( ( (byte) & ~UTF8_CONTINUATION_MASK ) == UTF8_CONTINUATION ) + +/** Check for an ASCII byte + * + * @v byte UTF-8 byte + * @ret is_ascii Byte is an ASCII byte + */ +#define UTF8_IS_ASCII( byte ) ( ! ( (byte) & UTF8_HIGH_BIT ) ) + +/** Invalid character returned when decoding fails */ +#define UTF8_INVALID 0xfffd + +/** A UTF-8 character accumulator */ +struct utf8_accumulator { + /** Character in progress */ + unsigned int character; + /** Number of remaining continuation bytes */ + unsigned int remaining; + /** Minimum legal character */ + unsigned int min; +}; + +extern unsigned int utf8_accumulate ( struct utf8_accumulator *utf8, + uint8_t byte ); + +#endif /* _IPXE_UTF8_H */ |