@@ -68,8 +68,14 @@ TUtf8Table = record
68
68
UTF16_LOSURROGATE_MIN = $dc00;
69
69
UTF16_LOSURROGATE_MAX = $dfff;
70
70
71
+ // / replace any incoming character whose value is unrepresentable in Unicode
72
+ // - set e.g. by GetUtf8WideChar(), Utf8UpperReference() or
73
+ // RawUnicodeToUtf8() when ccfReplacementCharacterForUnmatchedSurrogate is set
74
+ // - encoded as $ef $bf $bd bytes in UTF-8
75
+ UNICODE_REPLACEMENT_CHARACTER = $fffd;
71
76
72
- // / internal function, used to retrieve a Ucs4 codepoint (>127) from UTF-8
77
+
78
+ // / internal function, used to retrieve a UCS4 CodePoint (>127) from UTF-8
73
79
// - not to be called directly, but from inlined higher-level functions
74
80
// - here U^ shall be always >= #80
75
81
// - typical use is as such:
@@ -79,12 +85,12 @@ TUtf8Table = record
79
85
// ! ch := GetHighUtf8Ucs4(P);
80
86
function GetHighUtf8Ucs4 (var U: PUtf8Char): PtrUInt;
81
87
82
- // / get the WideChar stored in P^ (decode UTF-8 if necessary)
83
- // - any surrogate (Ucs4>$ffff) will be returned as '?'
84
- function GetUtf8Char (P: PUtf8Char): cardinal;
88
+ // / decode UTF-16 WideChar from UTF-8 input buffer
89
+ // - any UCS4 CodePoint > $ffff (which would need a UTF-16 surrogate pair) is returned as UNICODE_REPLACEMENT_CHARACTER=$fffd
90
+ function GetUtf8WideChar (P: PUtf8Char): cardinal;
85
91
{ $ifdef HASINLINE} inline;{ $endif}
86
92
87
- // / get the Ucs4 char stored in P^ (decode UTF-8 if necessary)
93
+ // / get the UCS4 CodePoint stored in P^ (decode UTF-8 if necessary)
88
94
function NextUtf8Ucs4 (var P: PUtf8Char): cardinal;
89
95
{ $ifdef HASINLINE} inline;{ $endif}
90
96
@@ -94,13 +100,13 @@ function NextUtf8Ucs4(var P: PUtf8Char): cardinal;
94
100
function WideCharToUtf8 (Dest: PUtf8Char; aWideChar: PtrUInt): integer;
95
101
{ $ifdef HASINLINE} inline;{ $endif}
96
102
97
- // / UTF-8 encode one UTF-16 encoded Ucs4 character into Dest
103
+ // / UTF-8 encode one UTF-16 encoded UCS4 CodePoint into Dest
98
104
// - return the number of bytes written into Dest (i.e. from 1 up to 6)
99
105
// - Source will contain the next UTF-16 character
100
106
// - this method DOES handle UTF-16 surrogate pairs
101
107
function Utf16CharToUtf8 (Dest: PUtf8Char; var Source: PWord): integer;
102
108
103
- // / UTF-8 encode one Ucs4 character into Dest
109
+ // / UTF-8 encode one UCS4 CodePoint into Dest
104
110
// - return the number of bytes written into Dest (i.e. from 1 up to 6)
105
111
// - this method DOES handle UTF-16 surrogate pairs
106
112
function Ucs4ToUtf8 (ucs4: cardinal; Dest: PUtf8Char): PtrInt;
@@ -126,8 +132,8 @@ function RawUnicodeToUtf8(WideChar: PWideChar; WideCharCount: integer;
126
132
// since Delphi 2009+
127
133
// - append a trailing #0 to the ending PUtf8Char, unless ccfNoTrailingZero is set
128
134
// - if ccfReplacementCharacterForUnmatchedSurrogate is set, this function will identify
129
- // unmatched surrogate pairs and replace them with EF BF BD / FFFD Unicode
130
- // Replacement character - see https://en.wikipedia.org/wiki/Specials_(Unicode_block)
135
+ // unmatched surrogate pairs and replace them with UNICODE_REPLACEMENT_CHARACTER -
136
+ // see https://en.wikipedia.org/wiki/Specials_(Unicode_block)
131
137
function RawUnicodeToUtf8 (Dest: PUtf8Char; DestLen: PtrInt;
132
138
Source: PWideChar; SourceLen: PtrInt; Flags: TCharConversionFlags): PtrInt; overload;
133
139
@@ -161,10 +167,11 @@ function Utf8ToWideChar(dest: PWideChar; source: PUtf8Char;
161
167
MaxDestChars, sourceBytes: PtrInt; NoTrailingZero: boolean = false): PtrInt; overload;
162
168
163
169
// / direct conversion of a UTF-8 encoded buffer into a WinAnsi shortstring buffer
170
+ // - non WinAnsi chars are replaced by '?' placeholders
164
171
procedure Utf8ToShortString (var dest: shortstring; source: PUtf8Char);
165
172
166
173
// / calculate the UTF-16 Unicode characters count, UTF-8 encoded in source^
167
- // - count may not match the Ucs4 glyphs number , in case of UTF-16 surrogates
174
+ // - count may not match the UCS4 CodePoint count, in case of UTF-16 surrogates
168
175
// - faster than System.Utf8ToUnicode with dest=nil
169
176
function Utf8ToUnicodeLength (source: PUtf8Char): PtrUInt;
170
177
@@ -192,7 +199,7 @@ function IsValidUtf8WithoutControlChars(const source: RawUtf8): boolean; overloa
192
199
193
200
// / will truncate the supplied UTF-8 value if its length exceeds the specified
194
201
// UTF-16 Unicode characters count
195
- // - count may not match the Ucs4 glyphs number , in case of UTF-16 surrogates
202
+ // - count may not match the UCS4 CodePoint count, in case of UTF-16 surrogates
196
203
// - returns FALSE if text was not truncated, TRUE otherwise
197
204
function Utf8TruncateToUnicodeLength (var text: RawUtf8; maxUtf16: integer): boolean;
198
205
@@ -219,7 +226,7 @@ function Utf8TruncatedLength(text: PAnsiChar;
219
226
textlen, maxBytes: PtrUInt): PtrInt; overload;
220
227
221
228
// / calculate the UTF-16 Unicode characters count of the UTF-8 encoded first line
222
- // - count may not match the Ucs4 glyphs number , in case of UTF-16 surrogates
229
+ // - count may not match the UCS4 CodePoint count, in case of UTF-16 surrogates
223
230
// - end the parsing at first #13 or #10 character
224
231
function Utf8FirstLineToUnicodeLength (source: PUtf8Char): PtrInt;
225
232
@@ -377,6 +384,7 @@ TSynAnsiFixedWidth = class(TSynAnsiConvert)
377
384
// / direct conversion of an UTF-8 encoded buffer into a PAnsiChar buffer
378
385
// - Dest^ buffer must be reserved with at least SourceChars bytes
379
386
// - no trailing #0 is appended to the buffer
387
+ // - non Ansi compatible characters are replaced by '?'
380
388
function Utf8BufferToAnsi (Dest: PAnsiChar; Source: PUtf8Char;
381
389
SourceChars: cardinal): PAnsiChar; override;
382
390
// / conversion of a wide char into the corresponding Ansi character
@@ -1151,9 +1159,9 @@ function StrCompIL(P1, P2: pointer; L: PtrInt; Default: PtrInt = 0): PtrInt;
1151
1159
function StrIComp (Str1, Str2: pointer): PtrInt;
1152
1160
{ $ifdef HASINLINE} inline;{ $endif}
1153
1161
1154
- // / retrieve the next Ucs4 value stored in U, then update the U pointer
1162
+ // / retrieve the next UCS4 CodePoint stored in U, then update the U pointer
1155
1163
// - this function will decode the UTF-8 content before using NormToUpper[]
1156
- // - will return '?' if the Ucs4 value is higher than #255: so use this function
1164
+ // - will return '?' if the UCS4 CodePoint is higher than #255: so use this function
1157
1165
// only if you need to deal with ASCII characters (e.g. it's used for Soundex
1158
1166
// and for ContainsUTF8 function)
1159
1167
function GetNextUtf8Upper (var U: PUtf8Char): PtrUInt;
@@ -1384,6 +1392,7 @@ function AnsiIComp(Str1, Str2: pointer): PtrInt;
1384
1392
// - won't call the Operating System, so is consistent on all platforms,
1385
1393
// whereas UpperCaseUnicode() may vary depending on each library implementation
1386
1394
// - some codepoints grow in length, so the D^ buffer should be at least twice as large as S^
1395
+ // - any invalid input is replaced by UNICODE_REPLACEMENT_CHARACTER=$fffd
1387
1396
// - won't use temporary UTF-16 decoding, and optimized for plain ASCII content
1388
1397
function Utf8UpperReference (S, D: PUtf8Char): PUtf8Char;
1389
1398
@@ -1453,7 +1462,7 @@ function GetHighUtf8Ucs4(var U: PUtf8Char): PtrUInt;
1453
1462
result := c;
1454
1463
end ;
1455
1464
1456
- function GetUtf8Char (P: PUtf8Char): cardinal;
1465
+ function GetUtf8WideChar (P: PUtf8Char): cardinal;
1457
1466
begin
1458
1467
if P <> nil then
1459
1468
begin
@@ -1462,7 +1471,8 @@ function GetUtf8Char(P: PUtf8Char): cardinal;
1462
1471
begin
1463
1472
result := GetHighUtf8Ucs4(P);
1464
1473
if result > $ffff then
1465
- result := ord(' ?' ); // do not handle surrogates now
1474
+ // a CodePoint above $ffff would need a surrogate pair, so can't fit in a single UTF-16 WideChar
1475
+ result := UNICODE_REPLACEMENT_CHARACTER;
1466
1476
end ;
1467
1477
end
1468
1478
else
@@ -1483,7 +1493,7 @@ function NextUtf8Ucs4(var P: PUtf8Char): cardinal;
1483
1493
inc(P, 2 );
1484
1494
end
1485
1495
else
1486
- result := GetHighUtf8Ucs4(P); // handle even surrogates
1496
+ result := GetHighUtf8Ucs4(P); // handle even UTF-16 surrogates
1487
1497
end
1488
1498
else
1489
1499
result := 0 ;
@@ -1626,7 +1636,7 @@ function RawUnicodeToUtf8(Dest: PUtf8Char; DestLen: PtrInt; Source: PWideChar;
1626
1636
inc(Dest, 2 );
1627
1637
until (Source > Tail) or
1628
1638
(PtrInt(PtrUInt(Dest)) >= DestLen);
1629
- // generic loop, handling one Ucs4 char per iteration
1639
+ // generic loop, handling one UCS4 CodePoint per iteration
1630
1640
if (PtrInt(PtrUInt(Dest)) < DestLen) and
1631
1641
(PtrInt(PtrUInt(Source)) < SourceLen) then
1632
1642
repeat
@@ -1652,7 +1662,7 @@ function RawUnicodeToUtf8(Dest: PUtf8Char; DestLen: PtrInt; Source: PWideChar;
1652
1662
unmatch: if (PtrInt(PtrUInt(@Dest[3 ])) > DestLen) or
1653
1663
not (ccfReplacementCharacterForUnmatchedSurrogate in Flags) then
1654
1664
break;
1655
- PWord(Dest)^ := $BFEF; // store Unicode Replacement Char
1665
+ PWord(Dest)^ := $BFEF; // UTF-8 UNICODE_REPLACEMENT_CHARACTER
1656
1666
Dest[2 ] := AnsiChar($BD);
1657
1667
inc(Dest, 3 );
1658
1668
if (PtrInt(PtrUInt(Dest)) < DestLen) and
@@ -4986,7 +4996,7 @@ function Utf8IComp(u1, u2: PUtf8Char): PtrInt;
4986
4996
else
4987
4997
begin
4988
4998
result := GetHighUtf8Ucs4(u1);
4989
- if result and $ffffff00 = 0 then
4999
+ if result <= 255 then
4990
5000
result := table[result]; // 8 bits to upper, 32-bit as is
4991
5001
end ;
4992
5002
if c2 <= 127 then
@@ -5476,7 +5486,7 @@ function Utf8UpperCopy(Dest, Source: PUtf8Char; SourceChars: cardinal): PUtf8Cha
5476
5486
Dest[3 ] := up[ToByte(c shr 24 )];
5477
5487
inc(Dest, 4 );
5478
5488
until Source > endSourceBy4;
5479
- // generic loop, handling one Ucs4 char per iteration
5489
+ // generic loop, handling one UCS4 CodePoint per iteration
5480
5490
if Source < endSource then
5481
5491
repeat
5482
5492
By1: c := byte(Source^);
@@ -6091,7 +6101,7 @@ function Utf8UpperReference(S, D: PUtf8Char): PUtf8Char;
6091
6101
c := GetHighUtf8Ucs4(S2); // handle even surrogates
6092
6102
S := S2;
6093
6103
if c = 0 then
6094
- c := ord( ' ? ' ) ; // PlaceHolder for invalid UTF-8 input
6104
+ c := UNICODE_REPLACEMENT_CHARACTER ; // =$fffd for invalid input
6095
6105
end ;
6096
6106
if c <= UU_MAX then
6097
6107
c := tab.Ucs4Upper(c);
@@ -6239,9 +6249,9 @@ function Utf8ILCompReference(u1, u2: PUtf8Char; L1, L2: integer): PtrInt;
6239
6249
if c2 <= 127 then
6240
6250
begin
6241
6251
inc(c2, tab.Block[0 , c2]);
6242
- dec(result, c2);
6243
6252
dec(L2);
6244
6253
inc(u2);
6254
+ dec(result, c2);
6245
6255
if result <> 0 then
6246
6256
// found unmatching char
6247
6257
exit
0 commit comments