Skip to content

[stdlib] Add utf8 safeguards, fix chr method, add unicode and utf16 parsing for String #3239

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 30 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
7e4f0df
add better safeguards and fix chr method
martinvuyk Jul 13, 2024
7134f8f
update changelog
martinvuyk Jul 13, 2024
ab84608
rename to from_unicode
martinvuyk Jul 13, 2024
53d7038
move from_unicode to be static method
martinvuyk Jul 13, 2024
6d480b7
fix from_unicode
martinvuyk Jul 13, 2024
5236388
fix docstring
martinvuyk Jul 13, 2024
439aa21
fix indentation
martinvuyk Jul 13, 2024
c6f2dfb
fix list constructor
martinvuyk Jul 13, 2024
20bf017
fix use less lines
martinvuyk Jul 13, 2024
9a62b42
add utf16 decode
martinvuyk Jul 13, 2024
0bbc386
fix changelog
martinvuyk Jul 13, 2024
74e698b
fix detail
martinvuyk Jul 13, 2024
bf4093d
fix detail
martinvuyk Jul 13, 2024
5a2af26
fix detail
martinvuyk Jul 13, 2024
30c027f
fix detail
martinvuyk Jul 13, 2024
ddcbf0d
fix detail
martinvuyk Jul 13, 2024
9f5ee3b
simplify utf16 internals
martinvuyk Jul 13, 2024
fcc789c
fix detail
martinvuyk Jul 13, 2024
e08bc57
fix detail
martinvuyk Jul 13, 2024
9ffd5e6
fix detail
martinvuyk Jul 14, 2024
afb537a
fix detail
martinvuyk Jul 14, 2024
805041e
fix detail
martinvuyk Jul 14, 2024
0fcdf50
fix detail
martinvuyk Jul 14, 2024
be5a203
fix detail
martinvuyk Jul 14, 2024
fccdbcd
fix detail
martinvuyk Jul 14, 2024
f46ce80
add suggestion from @mzaks
martinvuyk Jul 14, 2024
6b47694
fix use unsafe_get
martinvuyk Jul 16, 2024
ca38ca3
Merge remote-tracking branch 'upstream/nightly' into add-utf8-safeguards
martinvuyk Jul 16, 2024
af3be58
use variant for unicode parsing
martinvuyk Jul 16, 2024
a4eedb0
Merge remote-tracking branch 'upstream/nightly' into add-utf8-safeguards
martinvuyk Jul 16, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
add utf16 decode
Signed-off-by: martinvuyk <[email protected]>
  • Loading branch information
martinvuyk committed Jul 13, 2024
commit 9a62b4212789e191471f84208f741b541904fc97
5 changes: 3 additions & 2 deletions docs/changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -167,8 +167,9 @@ and deprecated their private `_byte_length()` methods. Added a warning to
future and `StringSlice.__len__` now does return the Unicode codepoints length.
([PR #2960](https://github.com/modularml/mojo/pull/2960) by [@martinvuyk](https://github.com/martinvuyk))

- Added a `String.from_unicode(values: List[Int]) -> String` function that
returns a String containing the concatenated characters. If a Unicode codepoint
- Added `String.from_unicode(values: List[Int]) -> String` and
`String.from_utf16(values: List[UInt16]) -> String` functions that return a String
containing the concatenated characters. If a Unicode codepoint
is invalid, the parsed String has a replacement character (�) in that index.
`fn chr(c: Int) -> String` function now returns a replacement character (�)
if the Unicode codepoint is invalid.
Expand Down
81 changes: 76 additions & 5 deletions stdlib/src/builtin/string.mojo
Original file line number Diff line number Diff line change
Expand Up @@ -2300,8 +2300,7 @@ struct String(
.

Notes:
This method allocates `4 * len(values)` bytes and resizes at the
end.
This method allocates `4 * len(values)` bytes.
"""

var max_len = 4 * len(values)
Expand All @@ -2318,9 +2317,7 @@ struct String(
var curr_ptr = ptr.offset(current_offset)
_shift_unicode_to_utf8(curr_ptr, c, num_bytes)
if not _is_valid_utf8(curr_ptr, num_bytes):
debug_assert(
False, "Invalid Unicode code point at index: " + str(i)
)
debug_assert(False, "Invalid Unicode value at index: " + str(i))
num_bytes = 3
_shift_unicode_to_utf8(curr_ptr, 0xFFFD, num_bytes)
current_offset += num_bytes
Expand All @@ -2330,6 +2327,80 @@ struct String(
buf.resize(length)
return String(buf^)

@staticmethod
fn from_utf16(values: List[UInt16]) -> String:
    """Returns a String based on the given UTF-16 code units.

    Args:
        values: A List of UTF-16 code units.

    Returns:
        A String containing the concatenated characters. Every code unit
        that cannot be decoded (an unpaired or out-of-order surrogate, or a
        sequence rejected by the UTF-8 validator) is replaced by a single
        replacement character (�) at that index.

    Examples:
    ```mojo
    print(String.from_utf16(List[UInt16](97, 97, 0xD800, 97))) # "aa�a"
    ```
    .

    Notes:
        This method allocates `3 * len(values) + 1` bytes: a single UTF-16
        code unit expands to at most 3 UTF-8 bytes (a surrogate pair, i.e.
        2 code units, expands to 4), plus one byte for the null terminator.
    """

    alias low_6b = 0b0011_1111  # mask selecting the lower 6 bits
    alias c_byte = 0b1000_0000  # UTF-8 continuation byte marker

    # Worst case is 3 output bytes per input code unit (BMP code points and
    # the replacement character); +1 leaves room for the null terminator.
    var capacity = 3 * len(values) + 1
    var ptr = UnsafePointer[UInt8].alloc(capacity)
    var current_offset = 0
    var values_idx = 0

    while values_idx < len(values):
        var curr_ptr = ptr.offset(current_offset)
        # Work on a plain Int so that shifts and ORs cannot wrap at 16 bits.
        var c = int(values.unsafe_get(values_idx))
        var num_bytes: Int
        var consumed = 1  # number of UTF-16 code units used by this char
        var is_valid = True

        if c < 0x80:  # ASCII
            num_bytes = 1
            curr_ptr[0] = UInt8(c)
        elif c < 0x8_00:  # 2 byte long sequence
            num_bytes = 2
            curr_ptr[0] = UInt8(0xC0 | (c >> 6))
            curr_ptr[1] = UInt8(c_byte | (c & low_6b))
        elif c < 0xD8_00 or c >= 0xE0_00:  # 3 byte long sequence
            num_bytes = 3
            curr_ptr[0] = UInt8(0xE0 | (c >> 12))
            curr_ptr[1] = UInt8(c_byte | ((c >> 6) & low_6b))
            curr_ptr[2] = UInt8(c_byte | (c & low_6b))
        else:  # surrogate range: only high + low pairs are decodable
            var c2 = 0
            var has_pair = False
            # `c` must be a high surrogate (0xD800..0xDBFF) that is
            # immediately followed by a low surrogate (0xDC00..0xDFFF).
            if c < 0xDC_00 and values_idx + 1 < len(values):
                c2 = int(values.unsafe_get(values_idx + 1))
                has_pair = c2 >= 0xDC_00 and c2 < 0xE0_00

            if has_pair:  # 4 byte long sequence
                num_bytes = 4
                consumed = 2
                var num = 0x1_00_00 + (
                    ((c & 0x3_FF) << 10) | (c2 & 0x3_FF)
                )
                curr_ptr[0] = UInt8(0xF0 | (num >> 18))
                curr_ptr[1] = UInt8(c_byte | ((num >> 12) & low_6b))
                curr_ptr[2] = UInt8(c_byte | ((num >> 6) & low_6b))
                curr_ptr[3] = UInt8(c_byte | (num & low_6b))
            else:
                # Unpaired / out-of-order surrogate: nothing was written.
                num_bytes = 0
                is_valid = False

        # `or` short-circuits, so the validator is never called on an empty
        # sequence.
        if not is_valid or not _is_valid_utf8(curr_ptr, num_bytes):
            debug_assert(
                False, "Invalid UTF-16 value at index: " + str(values_idx)
            )
            # Replace only the offending code unit; the next one is decoded
            # on its own in the following iteration.
            num_bytes = 3
            consumed = 1
            _shift_unicode_to_utf8(curr_ptr, 0xFFFD, num_bytes)

        current_offset += num_bytes
        values_idx += consumed

    var buf = List[UInt8](
        unsafe_pointer=ptr, size=current_offset, capacity=capacity
    )
    # Append the null terminator expected by String's buffer constructor.
    buf.resize(current_offset + 1, 0)
    return String(buf^)


# ===----------------------------------------------------------------------=== #
# Utilities
Expand Down
12 changes: 8 additions & 4 deletions stdlib/test/builtin/test_string.mojo
Original file line number Diff line number Diff line change
Expand Up @@ -305,7 +305,7 @@ def test_ord():
assert_equal(ord("🔥".as_string_slice()), 128293)


def test_chr():
def test_unicode():
assert_equal("A", chr(65))
assert_equal("a", chr(97))
assert_equal("!", chr(33))
Expand All @@ -314,8 +314,12 @@ def test_chr():
assert_equal("🔥", chr(128293))
assert_equal("�", chr(0xFFFD))
assert_equal("�", chr(0x10FFFF + 1))
var items = List[Int](65, 97, 33, 945, 10175, 128293, 0xFFFD, 0x10FFFF + 1)
assert_equal("Aa!α➿🔥��", String.from_unicode(items))
var ucode = List[Int](65, 97, 33, 945, 10175, 128293, 0xFFFD, 0x10FFFF + 1)
assert_equal("Aa!α➿🔥��", String.from_unicode(ucode))
var utf16 = List[UInt16](
0x41, 0x61, 0x21, 0x3B1, 0x27BF, 0xD83D, 0xDD25, 0xFFFD, 0xD800
)
assert_equal("Aa!α➿🔥��", String.from_utf16(utf16))


def test_string_indexing():
Expand Down Expand Up @@ -1438,7 +1442,7 @@ def main():
test_stringref_from_dtypepointer()
test_stringref_strip()
test_ord()
test_chr()
test_unicode()
test_string_indexing()
test_atol()
test_atol_base_0()
Expand Down