-
Notifications
You must be signed in to change notification settings - Fork 1.6k
/
jv_unicode.c
96 lines (91 loc) · 2.84 KB
/
jv_unicode.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
#include <stdio.h>
#include <assert.h>
#include "jv_unicode.h"
#include "jv_utf8_tables.h"
/*
 * Decode one UTF-8 unit from the byte range [in, end).
 *
 * Returns NULL when in == end (nothing left to decode); *codepoint_ret is
 * not written in that case. Otherwise returns a pointer just past the bytes
 * consumed (always at least one byte) and stores in *codepoint_ret either
 * the decoded codepoint or -1 if the consumed bytes were not valid UTF-8.
 *
 * Cases reported as -1:
 *  - a byte the length table marks as invalid, or a stray continuation byte
 *    (one byte consumed);
 *  - a lead byte whose sequence would run past `end` (the whole remaining
 *    tail is consumed as a single invalid unit);
 *  - a lead byte followed by too few continuation bytes (only the bytes up
 *    to, but not including, the offending byte are consumed);
 *  - overlong encodings, surrogates (U+D800..U+DFFF), and values above
 *    U+10FFFF.
 */
const char* jvp_utf8_next(const char* in, const char* end, int* codepoint_ret) {
  assert(in <= end);
  if (in == end) {
    return NULL;
  }
  int codepoint = -1;
  unsigned char first = (unsigned char)in[0];
  int length = utf8_coding_length[first];
  if ((first & 0x80) == 0) {
    /* Fast-path for ASCII */
    codepoint = first;
    length = 1;
  } else if (length == 0 || length == UTF8_CONTINUATION_BYTE) {
    /* Bad single byte - either an invalid byte or an out-of-place continuation byte */
    length = 1;
  } else if (length > end - in) {
    /*
     * String ends before UTF8 sequence ends.
     * Compare against the remaining byte count rather than computing
     * `in + length`: forming a pointer beyond one-past-the-end of the
     * buffer is undefined behaviour, whereas `end - in` is always valid
     * (and non-negative, per the assert above).
     */
    length = (int)(end - in);
  } else {
    codepoint = ((unsigned)in[0]) & utf8_coding_bits[first];
    for (int i = 1; i < length; i++) {
      unsigned ch = (unsigned char)in[i];
      if (utf8_coding_length[ch] != UTF8_CONTINUATION_BYTE) {
        /* Invalid UTF8 sequence - not followed by the right number of continuation bytes */
        codepoint = -1;
        length = i;
        break;
      }
      /* Each continuation byte contributes its low six bits */
      codepoint = (codepoint << 6) | (ch & 0x3f);
    }
    if (codepoint < utf8_first_codepoint[length]) {
      /* Overlong UTF8 sequence */
      codepoint = -1;
    }
    if (0xD800 <= codepoint && codepoint <= 0xDFFF) {
      /* Surrogate codepoints can't be encoded in UTF8 */
      codepoint = -1;
    }
    if (codepoint > 0x10FFFF) {
      /* Outside Unicode range */
      codepoint = -1;
    }
  }
  assert(length > 0);
  *codepoint_ret = codepoint;
  return in + length;
}
/*
 * Report whether the byte range [in, end) is entirely valid UTF-8.
 *
 * Walks the range one unit at a time with jvp_utf8_next(). Returns 0 as soon
 * as a unit decodes to -1, and 1 once the decoder signals end of input
 * (which includes the case of an empty range).
 */
int jvp_utf8_is_valid(const char* in, const char* end) {
  const char* cursor = in;
  for (;;) {
    int decoded;
    cursor = jvp_utf8_next(cursor, end, &decoded);
    if (!cursor) {
      /* Ran off the end without seeing an invalid unit */
      return 1;
    }
    if (decoded == -1) {
      return 0;
    }
  }
}
/*
 * Return the total length in bytes of the UTF-8 sequence introduced by the
 * lead byte `startchar`, judged from its high bits:
 *
 *   0xxxxxxx -> 1
 *   110xxxxx -> 2
 *   1110xxxx -> 3
 *   anything else -> 4
 *
 * No validation is performed: continuation bytes (10xxxxxx) and bytes that
 * can never start a sequence fall into the final case and yield 4.
 */
int jvp_utf8_decode_length(char startchar) {
  /* Work on the raw byte value so the result does not depend on whether
     plain `char` is signed on this platform. */
  unsigned char lead = (unsigned char)startchar;
  if ((lead & 0x80) == 0x00) return 1;
  /* Each mask must include the terminating 0 bit of the prefix; testing
     only the leading 1 bits would also match every longer prefix. */
  else if ((lead & 0xE0) == 0xC0) return 2;
  else if ((lead & 0xF0) == 0xE0) return 3;
  else return 4;
}
/*
 * Number of bytes used to encode `codepoint` in UTF-8.
 *
 * The result is 1 for values up to 0x7F (negative values included), 2 up to
 * 0x7FF, 3 up to 0xFFFF, and 4 for everything larger. The argument is not
 * range-checked.
 */
int jvp_utf8_encode_length(int codepoint) {
  /* Start at one byte and add one for each size threshold exceeded */
  int nbytes = 1;
  nbytes += (codepoint > 0x7F);
  nbytes += (codepoint > 0x7FF);
  nbytes += (codepoint > 0xFFFF);
  return nbytes;
}
/*
 * Write the UTF-8 encoding of `codepoint` to `out` and return the number of
 * bytes written (1 to 4).
 *
 * `codepoint` must lie in [0, 0x10FFFF]; this is enforced with an assert.
 * Up to four bytes are stored starting at `out`, and no terminating NUL is
 * appended.
 */
int jvp_utf8_encode(int codepoint, char* out) {
  assert(codepoint >= 0 && codepoint <= 0x10FFFF);
  char* cursor = out;
  if (codepoint <= 0x7F) {
    /* Single byte: the value itself */
    *cursor++ = codepoint;
  } else if (codepoint <= 0x7FF) {
    /* 110xxxxx 10xxxxxx */
    *cursor++ = 0xC0 | ((codepoint >> 6) & 0x1F);
    *cursor++ = 0x80 | (codepoint & 0x3F);
  } else if (codepoint <= 0xFFFF) {
    /* 1110xxxx 10xxxxxx 10xxxxxx */
    *cursor++ = 0xE0 | ((codepoint >> 12) & 0x0F);
    *cursor++ = 0x80 | ((codepoint >> 6) & 0x3F);
    *cursor++ = 0x80 | (codepoint & 0x3F);
  } else {
    /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
    *cursor++ = 0xF0 | ((codepoint >> 18) & 0x07);
    *cursor++ = 0x80 | ((codepoint >> 12) & 0x3F);
    *cursor++ = 0x80 | ((codepoint >> 6) & 0x3F);
    *cursor++ = 0x80 | (codepoint & 0x3F);
  }
  /* Cross-check the byte count against the length helper */
  assert(cursor - out == jvp_utf8_encode_length(codepoint));
  return cursor - out;
}