-
Notifications
You must be signed in to change notification settings - Fork 0
/
ujson_utf.h
168 lines (132 loc) · 3.92 KB
/
ujson_utf.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
// SPDX-License-Identifier: LGPL-2.1-or-later
/*
* Copyright (C) 2022-2024 Cyril Hrubis <[email protected]>
*/
/**
* @file ujson_utf.h
* @brief Unicode helper macros and functions.
*/
#ifndef UJSON_UTF_H
#define UJSON_UTF_H
#include <stdint.h>
#include <stddef.h>
/** Returns true if the UTF-8 byte is a single byte (ASCII) character. */
#define UJSON_UTF8_IS_ASCII(ch) (!((ch) & 0x80))
/** Returns true if the byte is a continuation byte (10xxxxxx) of a multi-byte sequence. */
#define UJSON_UTF8_IS_NBYTE(ch) (((ch) & 0xc0) == 0x80)
/** Returns true if the byte is the first byte of a two byte sequence (110xxxxx). */
#define UJSON_UTF8_IS_2BYTE(ch) (((ch) & 0xe0) == 0xc0)
/** Returns true if the byte is the first byte of a three byte sequence (1110xxxx). */
#define UJSON_UTF8_IS_3BYTE(ch) (((ch) & 0xf0) == 0xe0)
/** Returns true if the byte is the first byte of a four byte sequence (11110xxx). */
#define UJSON_UTF8_IS_4BYTE(ch) (((ch) & 0xf8) == 0xf0)
/** Mask that selects the six payload bits of a continuation byte. */
#define UJSON_UTF8_NBYTE_MASK 0x3f
/**
 * @brief Parses next unicode character in UTF-8 string.
 *
 * Decodes a single one to four byte sequence starting at *str and advances
 * *str past the bytes that were consumed.
 *
 * The first byte is always consumed, including the terminating NUL, so the
 * function must not be called again after it returned 0 for the end of the
 * string. When a continuation byte is missing, the offending byte is left
 * unconsumed.
 *
 * Only the byte structure is checked. Overlong encodings, surrogates and
 * values above U+10FFFF are not rejected.
 *
 * @param str A pointer to the C string pointer, advanced on return.
 * @return A unicode character or 0 on error or end of the string.
 */
static inline uint32_t ujson_utf8_next(const char **str)
{
	/* Read bytes as unsigned char so that plain char signedness cannot leak in. */
	uint32_t s0 = (unsigned char)**str;

	(*str)++;

	if (UJSON_UTF8_IS_ASCII(s0))
		return s0;

	uint32_t s1 = (unsigned char)**str;

	if (!UJSON_UTF8_IS_NBYTE(s1))
		return 0;

	s1 &= UJSON_UTF8_NBYTE_MASK;
	(*str)++;

	if (UJSON_UTF8_IS_2BYTE(s0))
		return (s0 & 0x1f)<<6 | s1;

	uint32_t s2 = (unsigned char)**str;

	if (!UJSON_UTF8_IS_NBYTE(s2))
		return 0;

	s2 &= UJSON_UTF8_NBYTE_MASK;
	(*str)++;

	if (UJSON_UTF8_IS_3BYTE(s0))
		return (s0 & 0x0f)<<12 | s1<<6 | s2;

	/* The fourth byte is read before the pointer moves and is validated itself. */
	uint32_t s3 = (unsigned char)**str;

	if (!UJSON_UTF8_IS_NBYTE(s3))
		return 0;

	s3 &= UJSON_UTF8_NBYTE_MASK;
	(*str)++;

	if (UJSON_UTF8_IS_4BYTE(s0))
		return (s0 & 0x07)<<18 | s1<<12 | s2<<6 | s3;

	/* Lead byte was a stray continuation byte or 0xf8 and above. */
	return 0;
}
/**
 * @brief Returns the number of bytes the next character occupies in a UTF-8 string.
 *
 * @param str A pointer to a string.
 * @param off An offset into the string, must point to a valid multibyte boundary.
 * @return Number of bytes the next character occupies, zero at the end of the
 *         string and -1 on failure.
 */
int8_t ujson_utf8_next_chsz(const char *str, size_t off);
/**
 * @brief Returns the number of bytes the previous character occupies in a UTF-8 string.
 *
 * @param str A pointer to a string.
 * @param off An offset into the string, must point to a valid multibyte boundary.
 * @return Number of bytes the previous character occupies, or -1 on failure.
 */
int8_t ujson_utf8_prev_chsz(const char *str, size_t off);
/**
 * @brief Returns the number of characters in a UTF-8 string.
 *
 * The result counts characters rather than bytes, so it may be less than or
 * equal to what strlen() reports.
 *
 * @param str A UTF-8 string.
 * @return Number of characters in the string.
 */
size_t ujson_utf8_strlen(const char *str);
/**
 * @brief Computes how many UTF-8 bytes a unicode character encodes to.
 *
 * The length starts at one byte and grows by one for each encoding threshold
 * (0x80, 0x800, 0x10000) the value reaches. Any value at or above 0x10000
 * yields four.
 *
 * @param unicode A unicode character.
 * @return Number of utf8 bytes required to store a unicode character.
 */
static inline unsigned int ujson_utf8_bytes(uint32_t unicode)
{
	unsigned int len = 1;

	if (unicode >= 0x0080)
		len++;

	if (unicode >= 0x0800)
		len++;

	if (unicode >= 0x10000)
		len++;

	return len;
}
/**
 * @brief Encodes an unicode character as UTF-8 into a byte buffer.
 *
 * The sequence length and the lead byte layout are selected from the value
 * range. Continuation bytes are then filled from the last one towards the
 * first, six bits at a time, and the remaining high bits go into the lead
 * byte.
 *
 * The buffer _must_ be large enough, up to four bytes are written and no
 * terminating NUL is appended.
 *
 * @param unicode A unicode character.
 * @param buf A byte buffer.
 * @return A number of bytes written.
 */
static inline int ujson_to_utf8(uint32_t unicode, char *buf)
{
	int len;
	uint32_t lead_tag;
	uint32_t lead_mask;

	if (unicode < 0x0080) {
		len = 1;
		lead_tag = 0x00;
		lead_mask = 0x007f;
	} else if (unicode < 0x0800) {
		len = 2;
		lead_tag = 0xc0;
		lead_mask = 0x1f;
	} else if (unicode < 0x10000) {
		len = 3;
		lead_tag = 0xe0;
		lead_mask = 0x0f;
	} else {
		len = 4;
		lead_tag = 0xf0;
		lead_mask = 0x07;
	}

	uint32_t bits = unicode;

	/* Each continuation byte carries the next six low bits. */
	for (int pos = len - 1; pos > 0; pos--) {
		buf[pos] = 0x80 | (0x3f & bits);
		bits >>= 6;
	}

	buf[0] = lead_tag | (lead_mask & bits);

	return len;
}
#endif /* UJSON_UTF_H */