[utils] Decode HTML5 entities
Used in test_Vporn_1. Also related to #9270
This commit is contained in:
		| @@ -249,6 +249,8 @@ class TestUtil(unittest.TestCase): | |||||||
|         self.assertEqual(unescapeHTML('&#47;'), '/') |         self.assertEqual(unescapeHTML('&#47;'), '/') | ||||||
|         self.assertEqual(unescapeHTML('&eacute;'), 'é') |         self.assertEqual(unescapeHTML('&eacute;'), 'é') | ||||||
|         self.assertEqual(unescapeHTML('&#2013266066;'), '&#2013266066;') |         self.assertEqual(unescapeHTML('&#2013266066;'), '&#2013266066;') | ||||||
|  |         # HTML5 entities | ||||||
|  |         self.assertEqual(unescapeHTML('&period;&apos;'), '.\'') | ||||||
|  |  | ||||||
|     def test_date_from_str(self): |     def test_date_from_str(self): | ||||||
|         self.assertEqual(date_from_str('yesterday'), date_from_str('now-1day')) |         self.assertEqual(date_from_str('yesterday'), date_from_str('now-1day')) | ||||||
|   | |||||||
| @@ -39,6 +39,7 @@ from .compat import ( | |||||||
|     compat_chr, |     compat_chr, | ||||||
|     compat_etree_fromstring, |     compat_etree_fromstring, | ||||||
|     compat_html_entities, |     compat_html_entities, | ||||||
|  |     compat_html_entities_html5, | ||||||
|     compat_http_client, |     compat_http_client, | ||||||
|     compat_kwargs, |     compat_kwargs, | ||||||
|     compat_parse_qs, |     compat_parse_qs, | ||||||
| @@ -456,12 +457,19 @@ def orderedSet(iterable): | |||||||
|     return res |     return res | ||||||
|  |  | ||||||
|  |  | ||||||
| def _htmlentity_transform(entity): | def _htmlentity_transform(entity_with_semicolon): | ||||||
|     """Transforms an HTML entity to a character.""" |     """Transforms an HTML entity to a character.""" | ||||||
|  |     entity = entity_with_semicolon[:-1] | ||||||
|  |  | ||||||
|     # Known non-numeric HTML entity |     # Known non-numeric HTML entity | ||||||
|     if entity in compat_html_entities.name2codepoint: |     if entity in compat_html_entities.name2codepoint: | ||||||
|         return compat_chr(compat_html_entities.name2codepoint[entity]) |         return compat_chr(compat_html_entities.name2codepoint[entity]) | ||||||
|  |  | ||||||
|  |     # TODO: HTML5 allows entities without a semicolon. For example, | ||||||
|  |     # '&Eacuteric' should be decoded as 'Éric'. | ||||||
|  |     if entity_with_semicolon in compat_html_entities_html5: | ||||||
|  |         return compat_html_entities_html5[entity_with_semicolon] | ||||||
|  |  | ||||||
|     mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity) |     mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity) | ||||||
|     if mobj is not None: |     if mobj is not None: | ||||||
|         numstr = mobj.group(1) |         numstr = mobj.group(1) | ||||||
| @@ -486,7 +494,7 @@ def unescapeHTML(s): | |||||||
|     assert type(s) == compat_str |     assert type(s) == compat_str | ||||||
|  |  | ||||||
|     return re.sub( |     return re.sub( | ||||||
|         r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s) |         r'&([^;]+;)', lambda m: _htmlentity_transform(m.group(1)), s) | ||||||
|  |  | ||||||
|  |  | ||||||
| def get_subprocess_encoding(): | def get_subprocess_encoding(): | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user