about summary refs log tree commit diff stats
path: root/src/include/ipxe/utf8.h
diff options
context:
space:
mode:
author Michael Brown <mcb30@ipxe.org> 2022-02-28 13:37:40 +0000
committer Michael Brown <mcb30@ipxe.org> 2022-03-01 15:57:33 +0000
commit 3cd3a7326178bd10fb38e09eb702b27bc463d3c6 (patch)
tree a863df88bf8509fe64395d6bb479d66871043bc7 /src/include/ipxe/utf8.h
parent 2acdc92994e7aca397b0d24b112e4973e82e0f91 (diff)
download ipxe-3cd3a7326178bd10fb38e09eb702b27bc463d3c6.tar.gz
[utf8] Add ability to accumulate Unicode characters from UTF-8 bytes
Signed-off-by: Michael Brown <mcb30@ipxe.org>
Diffstat (limited to 'src/include/ipxe/utf8.h')
-rw-r--r-- src/include/ipxe/utf8.h 69
1 files changed, 69 insertions, 0 deletions
diff --git a/src/include/ipxe/utf8.h b/src/include/ipxe/utf8.h
new file mode 100644
index 000000000..299c25511
--- /dev/null
+++ b/src/include/ipxe/utf8.h
@@ -0,0 +1,69 @@
+#ifndef _IPXE_UTF8_H
+#define _IPXE_UTF8_H
+
+/** @file
+ *
+ * UTF-8 Unicode encoding
+ *
+ */
+
+FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL );
+
+#include <stdint.h>
+
+/** Maximum length of a UTF-8 sequence, in bytes */
+#define UTF8_MAX_LEN 4
+
+/** Minimum legal character value for a two-byte UTF-8 sequence */
+#define UTF8_MIN_TWO 0x80
+
+/** Minimum legal character value for a three-byte UTF-8 sequence */
+#define UTF8_MIN_THREE 0x800
+
+/** Minimum legal character value for a four-byte UTF-8 sequence */
+#define UTF8_MIN_FOUR 0x10000
+
+/** High bit of a UTF-8 byte (clear for ASCII, see UTF8_IS_ASCII()) */
+#define UTF8_HIGH_BIT 0x80
+
+/** Number of data bits in each continuation byte */
+#define UTF8_CONTINUATION_BITS 6
+
+/** Bit mask for the data bits in a continuation byte (low six bits, 0x3f) */
+#define UTF8_CONTINUATION_MASK ( ( 1 << UTF8_CONTINUATION_BITS ) - 1 )
+
+/** Non-data (marker) bits of a continuation byte: top two bits are 10 */
+#define UTF8_CONTINUATION 0x80
+
+/** Check for a continuation byte
+ *
+ * @v byte UTF-8 byte (may be a sign-extended char; low 8 bits are used)
+ * @ret is_continuation Byte is a continuation byte
+ */
+#define UTF8_IS_CONTINUATION( byte ) \
+ ( ( (byte) & 0xff & ~UTF8_CONTINUATION_MASK ) == UTF8_CONTINUATION )
+
+/** Test whether a byte is plain ASCII
+ *
+ * @v byte UTF-8 byte
+ * @ret is_ascii Byte has its high bit clear
+ */
+#define UTF8_IS_ASCII( byte ) ( ( (byte) & UTF8_HIGH_BIT ) == 0 )
+
+/** Invalid character returned when decoding fails (U+FFFD REPLACEMENT CHARACTER) */
+#define UTF8_INVALID 0xfffd
+
+/** A UTF-8 character accumulator (state passed to utf8_accumulate()) */
+struct utf8_accumulator {
+ /** Character in progress (not yet complete) */
+ unsigned int character;
+ /** Number of continuation bytes still remaining */
+ unsigned int remaining;
+ /** Minimum legal character (NOTE(review): presumably a UTF8_MIN_* value; confirm) */
+ unsigned int min;
+};
+
+extern unsigned int utf8_accumulate ( struct utf8_accumulator *utf8,
+ uint8_t byte );
+
+#endif /* _IPXE_UTF8_H */