-
Notifications
You must be signed in to change notification settings - Fork 0
/
test.zig
71 lines (59 loc) · 2.78 KB
/
test.zig
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
const std = @import("std");
const named_character_references = @import("named_character_references.zig");
/// Value returned by `parse`: the bytes it produced plus a termination status.
pub const ParseResult = struct {
    /// `parse` reports `.missing_semicolon` when it found a match but the
    /// matcher's `ends_with_semicolon` flag was false; otherwise `.ok`.
    pub const Status = enum {
        ok,
        missing_semicolon,
    };

    /// UTF-8 bytes written by the parse (a slice of the caller's buffer).
    output: []u8,
    /// Defaults to `.ok` when not set explicitly.
    status: Status = .ok,
};
/// Stripped down version of the 'Named character reference state' detailed here:
/// https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
///
/// Assumes that all inputs start with '&' and only implements enough to handle the
/// `tokenizer/namedEntities.test` test cases from https://github.com/html5lib/html5lib-tests
///
/// The bytes after the leading '&' are fed to the matcher one at a time until it
/// rejects one (or the input ends).
///
/// - If the matcher then reports codepoints, they are UTF-8 encoded into `output_buf`
///   (the first codepoint, followed by the second one when present). Any input after
///   the match is not emitted. `status` is `.missing_semicolon` when the matcher's
///   `ends_with_semicolon` flag is false.
/// - Otherwise the '&' and every byte the matcher accepted are copied to `output_buf`
///   verbatim, with the default `.ok` status.
///
/// The returned `output` is a slice of `output_buf`. Returns `error.NoSpaceLeft` when
/// `output_buf` is too small to hold the result, and propagates encoding errors from
/// `std.unicode`.
fn parse(input: []const u8, output_buf: []u8) !ParseResult {
    // Precondition: check the length first so an empty input fails the assert
    // instead of indexing out of bounds.
    std.debug.assert(input.len > 0 and input[0] == '&');

    var matcher = named_character_references.Matcher{};
    var num_pending_chars: usize = 1; // the &
    for (input[1..]) |c| {
        if (!matcher.char(c)) break;
        num_pending_chars += 1;
    }

    if (matcher.getCodepoints()) |codepoints| {
        var output_len: usize = try encodeCodepoint(codepoints.first, output_buf);
        if (codepoints.second.asInt()) |codepoint| {
            output_len += try encodeCodepoint(codepoint, output_buf[output_len..]);
        }
        return .{
            .output = output_buf[0..output_len],
            .status = if (matcher.ends_with_semicolon) .ok else .missing_semicolon,
        };
    }

    // No match: echo back the '&' plus every byte the matcher accepted.
    if (output_buf.len < num_pending_chars) return error.NoSpaceLeft;
    @memcpy(output_buf[0..num_pending_chars], input[0..num_pending_chars]);
    return .{ .output = output_buf[0..num_pending_chars] };
}

/// UTF-8 encodes `codepoint` at the start of `dest` and returns the number of bytes
/// written. Returns `error.NoSpaceLeft` when `dest` is too short, rather than relying
/// on the size assertion inside `std.unicode.utf8Encode`.
fn encodeCodepoint(codepoint: u21, dest: []u8) !usize {
    const needed = try std.unicode.utf8CodepointSequenceLength(codepoint);
    if (dest.len < needed) return error.NoSpaceLeft;
    return try std.unicode.utf8Encode(codepoint, dest);
}
// Runs `parse` over every case in the `namedEntities.test` JSON file, which is read
// relative to the current working directory.
test "namedEntities.test" {
    const allocator = std.testing.allocator;

    const test_json_contents = try std.fs.cwd().readFileAlloc(allocator, "namedEntities.test", std.math.maxInt(usize));
    defer allocator.free(test_json_contents);

    // `const`: the parsed document is only read and then released; nothing mutates it.
    const parsed = try std.json.parseFromSlice(std.json.Value, allocator, test_json_contents, .{});
    defer parsed.deinit();

    // Scratch space reused for the output of every case.
    var buf: [128]u8 = undefined;
    for (parsed.value.object.get("tests").?.array.items) |item| {
        const object = item.object;
        const input = object.get("input").?.string;
        const result = try parse(input, &buf);

        // The expected string is the second element of the first entry of "output".
        const expected_output = object.get("output").?.array.items[0].array.items[1].string;
        try std.testing.expectEqualStrings(expected_output, result.output);

        // A case with an "errors" key is expected to report a missing semicolon.
        const expected_status: ParseResult.Status = if (object.get("errors") == null) .ok else .missing_semicolon;
        try std.testing.expectEqual(expected_status, result.status);
    }
}
test "backtracking" {
    var buf: [128]u8 = undefined;
    // Should match &not, but &noti could lead to valid character references so it needs to
    // backtrack from &noti to get back to the last match (&not -> U+00AC)
    //
    // The input must be spelled with a literal '&' (not the already-decoded U+00AC
    // character), since `parse` asserts that its input starts with '&'.
    const result = try parse("&notit;", &buf);
    try std.testing.expectEqualStrings("\u{00AC}", result.output);
}