Skip to content

Commit 2a62f8e

Browse files
author
Arnaud Bouchez
committed
properly implement and document the Unicode Replacement Character
1 parent 47a3e5b commit 2a62f8e

File tree

3 files changed

+37
-26
lines changed

3 files changed

+37
-26
lines changed

Diff for: src/core/mormot.core.search.pas

+2-2
Original file line numberDiff line numberDiff line change
@@ -1105,7 +1105,7 @@ TSynValidateText = class(TSynValidate)
11051105
read fProps[9] write fProps[9];
11061106
/// defines if the length parameters expect UTF-8 or UTF-16 codepoints number
11071107
// - with default FALSE, the length is calculated with UTF-16 Unicode
1108-
// codepoints - MaxLength may not match the Ucs4 glyphs number, in case of
1108+
// codepoints - MaxLength may not match the UCS4 CodePoint count, in case of
11091109
// UTF-16 surrogates
11101110
// - you can set this property to TRUE so that the UTF-8 byte count would
11111111
// be used for truncation against the MaxLength parameter
@@ -1222,7 +1222,7 @@ TSynFilterTruncate = class(TSynFilter)
12221222
read fMaxLength write fMaxLength;
12231223
/// defines if MaxLength is stored as UTF-8 or UTF-16 codepoints number
12241224
// - with default FALSE, the length is calculated with UTF-16 Unicode
1225-
// codepoints - MaxLength may not match the Ucs4 glyphs number, in case of
1225+
// codepoints - MaxLength may not match the UCS4 CodePoint count, in case of
12261226
// UTF-16 surrogates
12271227
// - you can set this property to TRUE so that the UTF-8 byte count would
12281228
// be used for truncation against the MaxLength parameter

Diff for: src/core/mormot.core.unicode.pas

+33-23
Original file line numberDiff line numberDiff line change
@@ -68,8 +68,14 @@ TUtf8Table = record
6868
UTF16_LOSURROGATE_MIN = $dc00;
6969
UTF16_LOSURROGATE_MAX = $dfff;
7070

71+
/// replace any incoming character whose value is unrepresentable in Unicode
72+
// - set e.g. by GetUtf8WideChar(), Utf8UpperReference() or
73+
// RawUnicodeToUtf8() when ccfReplacementCharacterForUnmatchedSurrogate is set
74+
// - encoded as $ef $bf $bd bytes in UTF-8
75+
UNICODE_REPLACEMENT_CHARACTER = $fffd;
7176

72-
/// internal function, used to retrieve a Ucs4 codepoint (>127) from UTF-8
77+
78+
/// internal function, used to retrieve a UCS4 CodePoint (>127) from UTF-8
7379
// - not to be called directly, but from inlined higher-level functions
7480
// - here U^ shall be always >= #80
7581
// - typical use is as such:
@@ -79,12 +85,12 @@ TUtf8Table = record
7985
// ! ch := GetHighUtf8Ucs4(P);
8086
function GetHighUtf8Ucs4(var U: PUtf8Char): PtrUInt;
8187

82-
/// get the WideChar stored in P^ (decode UTF-8 if necessary)
83-
// - any surrogate (Ucs4>$ffff) will be returned as '?'
84-
function GetUtf8Char(P: PUtf8Char): cardinal;
88+
/// decode UTF-16 WideChar from UTF-8 input buffer
89+
// - any CodePoint needing UTF-16 surrogates (Ucs4>$ffff) is returned as UNICODE_REPLACEMENT_CHARACTER=$fffd
90+
function GetUtf8WideChar(P: PUtf8Char): cardinal;
8591
{$ifdef HASINLINE}inline;{$endif}
8692

87-
/// get the Ucs4 char stored in P^ (decode UTF-8 if necessary)
93+
/// get the UCS4 CodePoint stored in P^ (decode UTF-8 if necessary)
8894
function NextUtf8Ucs4(var P: PUtf8Char): cardinal;
8995
{$ifdef HASINLINE}inline;{$endif}
9096

@@ -94,13 +100,13 @@ function NextUtf8Ucs4(var P: PUtf8Char): cardinal;
94100
function WideCharToUtf8(Dest: PUtf8Char; aWideChar: PtrUInt): integer;
95101
{$ifdef HASINLINE}inline;{$endif}
96102

97-
/// UTF-8 encode one UTF-16 encoded Ucs4 character into Dest
103+
/// UTF-8 encode one UTF-16 encoded UCS4 CodePoint into Dest
98104
// - return the number of bytes written into Dest (i.e. from 1 up to 6)
99105
// - Source will contain the next UTF-16 character
100106
// - this method DOES handle UTF-16 surrogate pairs
101107
function Utf16CharToUtf8(Dest: PUtf8Char; var Source: PWord): integer;
102108

103-
/// UTF-8 encode one Ucs4 character into Dest
109+
/// UTF-8 encode one UCS4 CodePoint into Dest
104110
// - return the number of bytes written into Dest (i.e. from 1 up to 6)
105111
// - this method DOES handle UTF-16 surrogate pairs
106112
function Ucs4ToUtf8(ucs4: cardinal; Dest: PUtf8Char): PtrInt;
@@ -126,8 +132,8 @@ function RawUnicodeToUtf8(WideChar: PWideChar; WideCharCount: integer;
126132
// since Delphi 2009+
127133
// - append a trailing #0 to the ending PUtf8Char, unless ccfNoTrailingZero is set
128134
// - if ccfReplacementCharacterForUnmatchedSurrogate is set, this function will identify
129-
// unmatched surrogate pairs and replace them with EF BF BD / FFFD Unicode
130-
// Replacement character - see https://en.wikipedia.org/wiki/Specials_(Unicode_block)
135+
// unmatched surrogate pairs and replace them with UNICODE_REPLACEMENT_CHARACTER -
136+
// see https://en.wikipedia.org/wiki/Specials_(Unicode_block)
131137
function RawUnicodeToUtf8(Dest: PUtf8Char; DestLen: PtrInt;
132138
Source: PWideChar; SourceLen: PtrInt; Flags: TCharConversionFlags): PtrInt; overload;
133139

@@ -161,10 +167,11 @@ function Utf8ToWideChar(dest: PWideChar; source: PUtf8Char;
161167
MaxDestChars, sourceBytes: PtrInt; NoTrailingZero: boolean = false): PtrInt; overload;
162168

163169
/// direct conversion of a UTF-8 encoded buffer into a WinAnsi shortstring buffer
170+
// - non WinAnsi chars are replaced by '?' placeholders
164171
procedure Utf8ToShortString(var dest: shortstring; source: PUtf8Char);
165172

166173
/// calculate the UTF-16 Unicode characters count, UTF-8 encoded in source^
167-
// - count may not match the Ucs4 glyphs number, in case of UTF-16 surrogates
174+
// - count may not match the UCS4 CodePoint count, in case of UTF-16 surrogates
168175
// - faster than System.Utf8ToUnicode with dest=nil
169176
function Utf8ToUnicodeLength(source: PUtf8Char): PtrUInt;
170177

@@ -192,7 +199,7 @@ function IsValidUtf8WithoutControlChars(const source: RawUtf8): boolean; overloa
192199

193200
/// will truncate the supplied UTF-8 value if its length exceeds the specified
194201
// UTF-16 Unicode characters count
195-
// - count may not match the Ucs4 glyphs number, in case of UTF-16 surrogates
202+
// - count may not match the UCS4 CodePoint count, in case of UTF-16 surrogates
196203
// - returns FALSE if text was not truncated, TRUE otherwise
197204
function Utf8TruncateToUnicodeLength(var text: RawUtf8; maxUtf16: integer): boolean;
198205

@@ -219,7 +226,7 @@ function Utf8TruncatedLength(text: PAnsiChar;
219226
textlen, maxBytes: PtrUInt): PtrInt; overload;
220227

221228
/// calculate the UTF-16 Unicode characters count of the UTF-8 encoded first line
222-
// - count may not match the Ucs4 glyphs number, in case of UTF-16 surrogates
229+
// - count may not match the UCS4 CodePoint count, in case of UTF-16 surrogates
223230
// - end the parsing at first #13 or #10 character
224231
function Utf8FirstLineToUnicodeLength(source: PUtf8Char): PtrInt;
225232

@@ -377,6 +384,7 @@ TSynAnsiFixedWidth = class(TSynAnsiConvert)
377384
/// direct conversion of an UTF-8 encoded buffer into a PAnsiChar buffer
378385
// - Dest^ buffer must be reserved with at least SourceChars bytes
379386
// - no trailing #0 is appended to the buffer
387+
// - non Ansi compatible characters are replaced by '?'
380388
function Utf8BufferToAnsi(Dest: PAnsiChar; Source: PUtf8Char;
381389
SourceChars: cardinal): PAnsiChar; override;
382390
/// conversion of a wide char into the corresponding Ansi character
@@ -1151,9 +1159,9 @@ function StrCompIL(P1, P2: pointer; L: PtrInt; Default: PtrInt = 0): PtrInt;
11511159
function StrIComp(Str1, Str2: pointer): PtrInt;
11521160
{$ifdef HASINLINE}inline;{$endif}
11531161

1154-
/// retrieve the next Ucs4 value stored in U, then update the U pointer
1162+
/// retrieve the next UCS4 CodePoint stored in U, then update the U pointer
11551163
// - this function will decode the UTF-8 content before using NormToUpper[]
1156-
// - will return '?' if the Ucs4 value is higher than #255: so use this function
1164+
// - will return '?' if the UCS4 CodePoint is higher than #255: so use this function
11571165
// only if you need to deal with ASCII characters (e.g. it's used for Soundex
11581166
// and for ContainsUTF8 function)
11591167
function GetNextUtf8Upper(var U: PUtf8Char): PtrUInt;
@@ -1384,6 +1392,7 @@ function AnsiIComp(Str1, Str2: pointer): PtrInt;
13841392
// - won't call the Operating System, so is consistent on all platforms,
13851393
// whereas UpperCaseUnicode() may vary depending on each library implementation
13861394
// - some codepoints grow in length, so D^ should be at least twice the size of S^
1395+
// - any invalid input is replaced by UNICODE_REPLACEMENT_CHARACTER=$fffd
13871396
// - won't use temporary UTF-16 decoding, and optimized for plain ASCII content
13881397
function Utf8UpperReference(S, D: PUtf8Char): PUtf8Char;
13891398

@@ -1453,7 +1462,7 @@ function GetHighUtf8Ucs4(var U: PUtf8Char): PtrUInt;
14531462
result := c;
14541463
end;
14551464

1456-
function GetUtf8Char(P: PUtf8Char): cardinal;
1465+
function GetUtf8WideChar(P: PUtf8Char): cardinal;
14571466
begin
14581467
if P <> nil then
14591468
begin
@@ -1462,7 +1471,8 @@ function GetUtf8Char(P: PUtf8Char): cardinal;
14621471
begin
14631472
result := GetHighUtf8Ucs4(P);
14641473
if result > $ffff then
1465-
result := ord('?'); // do not handle surrogates now
1474+
// surrogates can't be stored in a single UTF-16 WideChar
1475+
result := UNICODE_REPLACEMENT_CHARACTER;
14661476
end;
14671477
end
14681478
else
@@ -1483,7 +1493,7 @@ function NextUtf8Ucs4(var P: PUtf8Char): cardinal;
14831493
inc(P, 2);
14841494
end
14851495
else
1486-
result := GetHighUtf8Ucs4(P); // handle even surrogates
1496+
result := GetHighUtf8Ucs4(P); // handle even UTF-16 surrogates
14871497
end
14881498
else
14891499
result := 0;
@@ -1626,7 +1636,7 @@ function RawUnicodeToUtf8(Dest: PUtf8Char; DestLen: PtrInt; Source: PWideChar;
16261636
inc(Dest, 2);
16271637
until (Source > Tail) or
16281638
(PtrInt(PtrUInt(Dest)) >= DestLen);
1629-
// generic loop, handling one Ucs4 char per iteration
1639+
// generic loop, handling one UCS4 CodePoint per iteration
16301640
if (PtrInt(PtrUInt(Dest)) < DestLen) and
16311641
(PtrInt(PtrUInt(Source)) < SourceLen) then
16321642
repeat
@@ -1652,7 +1662,7 @@ function RawUnicodeToUtf8(Dest: PUtf8Char; DestLen: PtrInt; Source: PWideChar;
16521662
unmatch: if (PtrInt(PtrUInt(@Dest[3])) > DestLen) or
16531663
not (ccfReplacementCharacterForUnmatchedSurrogate in Flags) then
16541664
break;
1655-
PWord(Dest)^ := $BFEF; // store Unicode Replacement Char
1665+
PWord(Dest)^ := $BFEF; // UTF-8 UNICODE_REPLACEMENT_CHARACTER
16561666
Dest[2] := AnsiChar($BD);
16571667
inc(Dest, 3);
16581668
if (PtrInt(PtrUInt(Dest)) < DestLen) and
@@ -4986,7 +4996,7 @@ function Utf8IComp(u1, u2: PUtf8Char): PtrInt;
49864996
else
49874997
begin
49884998
result := GetHighUtf8Ucs4(u1);
4989-
if result and $ffffff00 = 0 then
4999+
if result <= 255 then
49905000
result := table[result]; // 8 bits to upper, 32-bit as is
49915001
end;
49925002
if c2 <= 127 then
@@ -5476,7 +5486,7 @@ function Utf8UpperCopy(Dest, Source: PUtf8Char; SourceChars: cardinal): PUtf8Cha
54765486
Dest[3] := up[ToByte(c shr 24)];
54775487
inc(Dest, 4);
54785488
until Source > endSourceBy4;
5479-
// generic loop, handling one Ucs4 char per iteration
5489+
// generic loop, handling one UCS4 CodePoint per iteration
54805490
if Source < endSource then
54815491
repeat
54825492
By1: c := byte(Source^);
@@ -6091,7 +6101,7 @@ function Utf8UpperReference(S, D: PUtf8Char): PUtf8Char;
60916101
c := GetHighUtf8Ucs4(S2); // handle even surrogates
60926102
S := S2;
60936103
if c = 0 then
6094-
c := ord('?'); // PlaceHolder for invalid UTF-8 input
6104+
c := UNICODE_REPLACEMENT_CHARACTER; // =$fffd for invalid input
60956105
end;
60966106
if c <= UU_MAX then
60976107
c := tab.Ucs4Upper(c);
@@ -6239,9 +6249,9 @@ function Utf8ILCompReference(u1, u2: PUtf8Char; L1, L2: integer): PtrInt;
62396249
if c2 <= 127 then
62406250
begin
62416251
inc(c2, tab.Block[0, c2]);
6242-
dec(result, c2);
62436252
dec(L2);
62446253
inc(u2);
6254+
dec(result, c2);
62456255
if result <> 0 then
62466256
// found unmatching char
62476257
exit

Diff for: src/orm/mormot.orm.core.pas

+2-1
Original file line numberDiff line numberDiff line change
@@ -10171,7 +10171,8 @@ procedure TOrmPropInfoRttiChar.SetValue(Instance: TObject; Value: PUtf8Char;
1017110171
if (Value = nil) or (PInteger(Value)^ = NULL_LOW) then
1017210172
i := 0
1017310173
else
10174-
i := GetUtf8Char(Value);
10174+
// decode one UTF-16 or return UNICODE_REPLACEMENT_CHARACTER
10175+
i := GetUtf8WideChar(Value);
1017510176
fPropInfo.SetOrdProp(Instance, i);
1017610177
end;
1017710178

0 commit comments

Comments
 (0)