Skip to content

Commit

Permalink
Fix parsing "meta" tag with encoding attribute
Browse files Browse the repository at this point in the history
When parsing a <meta charset=""> tag, the parser reads charEncoding and
may call changeEncoding on the input stream, but the InputStreamWithMemory
wrapper didn't expose either of them. This adds both to the wrapper.

This also creates a new test set for BleachHTMLParser functionality.

Fixes #431
  • Loading branch information
willkg committed Jan 8, 2019
1 parent 93a060e commit cb156cb
Show file tree
Hide file tree
Showing 5 changed files with 78 additions and 13 deletions.
8 changes: 7 additions & 1 deletion CHANGES
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Bleach changes
==============

Version 3.0.3 (In development)
Version 3.1.0 (In development)
------------------------------

**Security fixes**
Expand All @@ -25,6 +25,12 @@ None
* Fix cases where attribute names could have invalid characters in them.
(#419)

* Fix problems with ``LinkifyFilter`` not being able to match links
across ``&amp;``. (#422)

* Fix ``InputStreamWithMemory`` when the ``BleachHTMLParser`` is
parsing ``meta`` tags. (#431)


Version 3.0.2 (October 11th, 2018)
----------------------------------
Expand Down
2 changes: 1 addition & 1 deletion bleach/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
# yyyymmdd
__releasedate__ = ''
# x.y.z or x.y.z.dev0 -- semver
__version__ = '3.0.3.dev0'
__version__ = '3.1.0.dev0'
VERSION = parse_version(__version__)


Expand Down
8 changes: 8 additions & 0 deletions bleach/html5lib_shim.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,14 @@ def __init__(self, inner_stream):
def errors(self):
return self._inner_stream.errors

@property
def charEncoding(self):
    """Return the ``charEncoding`` attribute of the wrapped inner stream.

    The wrapper stands in for the inner stream, so this attribute is
    passed straight through to ``self._inner_stream``.
    """
    return self._inner_stream.charEncoding

@property
def changeEncoding(self):
    """Return the ``changeEncoding`` attribute of the wrapped inner stream.

    This is a property that hands back whatever the inner stream exposes
    under that name rather than calling it, so ``stream.changeEncoding(...)``
    on the wrapper ends up invoking the inner stream's attribute.
    """
    return self._inner_stream.changeEncoding

def char(self):
c = self._inner_stream.char()
# char() can return None if EOF, so ignore that
Expand Down
62 changes: 62 additions & 0 deletions tests/test_html5lib_shim.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,3 +80,65 @@ def test_serializer(data, expected):
serialized = serializer.render(walker(dom))

assert serialized == expected


@pytest.mark.parametrize('parser_args, data, expected', [
    # Make sure InputStreamWithMemory has charEncoding and changeEncoding
    (
        {},
        '<meta charset="utf-8">',
        '<meta charset="utf-8">'
    ),
    # Handle consume entities False--all entities are passed along and then
    # escaped when serialized
    (
        {'consume_entities': False},
        'text &amp;&gt;&quot;',
        'text &amp;amp;&amp;gt;&amp;quot;'
    ),
    # Handle consume entities True--all entities are consumed and converted
    # to their character equivalents and then &, <, and > are escaped when
    # serialized
    (
        {'consume_entities': True},
        'text &amp;&gt;&quot;',
        'text &amp;&gt;"'
    ),
    # Test that "invalid-character-in-attribute-name" errors in tokenizing
    # result in attributes with invalid names getting dropped
    (
        {},
        '<a href="http://example.com"">',
        '<a href="http://example.com"></a>'
    ),
    # Same as the previous case, but with single-quoted attribute values
    (
        {},
        '<a href=\'http://example.com\'\'>',
        '<a href="http://example.com"></a>'
    )
])
def test_bleach_html_parser(parser_args, data, expected):
    """Parse ``data`` with ``BleachHTMLParser`` and compare the serialization.

    ``parser_args`` overrides the default parser keyword arguments below;
    ``expected`` is the exact string the serializer must produce.
    """
    # Default parser arguments; each case overrides only what it needs
    args = {
        'tags': None,
        'strip': True,
        'consume_entities': True
    }
    args.update(parser_args)

    # Build a parser, walker, and serializer just like we do in clean()
    parser = html5lib_shim.BleachHTMLParser(**args)
    walker = html5lib_shim.getTreeWalker('etree')
    serializer = html5lib_shim.BleachHTMLSerializer(
        quote_attr_values='always',
        omit_optional_tags=False,
        escape_lt_in_attrs=True,
        resolve_entities=False,
        sanitize=False,
        alphabetical_attributes=False,
    )

    # Parse, walk, and then serialize the output
    dom = parser.parseFragment(data)
    serialized = serializer.render(walker(dom))

    assert serialized == expected
11 changes: 0 additions & 11 deletions tests/test_linkify.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,17 +69,6 @@ def ft(attrs, new=False):
)


def test_invalid_attribute_names():
    """Test that "invalid-character-in-attribute-name" errors in tokenizing
    result in attributes with invalid names getting dropped.
    """
    # The stray trailing quote must not survive as an attribute; only the
    # href (plus the rel linkify adds) appears in the output.
    assert (
        linkify('<a href="http://example.com/"">') ==
        '<a href="http://example.com/" rel="nofollow"></a>'
    )


@pytest.mark.parametrize('data,parse_email,expected', [
(
'a [email protected] mailto',
Expand Down

0 comments on commit cb156cb

Please sign in to comment.