Merge branch 'atomicdryad-pr-crashfix_compat_urllib_unquote'
This commit is contained in:
		| @@ -14,6 +14,7 @@ from youtube_dl.utils import get_filesystem_encoding | ||||
| from youtube_dl.compat import ( | ||||
|     compat_getenv, | ||||
|     compat_expanduser, | ||||
|     compat_urllib_parse_unquote, | ||||
| ) | ||||
|  | ||||
|  | ||||
| @@ -42,5 +43,23 @@ class TestCompat(unittest.TestCase): | ||||
|             dir(youtube_dl.compat))) - set(['unicode_literals']) | ||||
|         self.assertEqual(all_names, sorted(present_names)) | ||||
|  | ||||
|     def test_compat_urllib_parse_unquote(self): | ||||
|         self.assertEqual(compat_urllib_parse_unquote(''), '') | ||||
|         self.assertEqual(compat_urllib_parse_unquote('%'), '%') | ||||
|         self.assertEqual(compat_urllib_parse_unquote('%%'), '%%') | ||||
|         self.assertEqual(compat_urllib_parse_unquote('%%%'), '%%%') | ||||
|         self.assertEqual(compat_urllib_parse_unquote('%2F'), '/') | ||||
|         self.assertEqual(compat_urllib_parse_unquote('%2f'), '/') | ||||
|         self.assertEqual(compat_urllib_parse_unquote('%E6%B4%A5%E6%B3%A2'), '津波') | ||||
|         self.assertEqual(compat_urllib_parse_unquote(str('%E6%B4%A5%E6%B3%A2')), '津波') | ||||
|         self.assertEqual( | ||||
|             compat_urllib_parse_unquote('''<meta property="og:description" content="%E2%96%81%E2%96%82%E2%96%83%E2%96%84%25%E2%96%85%E2%96%86%E2%96%87%E2%96%88" /> | ||||
| %<a href="https://ar.wikipedia.org/wiki/%D8%AA%D8%B3%D9%88%D9%86%D8%A7%D9%85%D9%8A">%a'''), | ||||
|             '''<meta property="og:description" content="▁▂▃▄%▅▆▇█" /> | ||||
| %<a href="https://ar.wikipedia.org/wiki/تسونامي">%a''') | ||||
|         self.assertEqual( | ||||
|             compat_urllib_parse_unquote('''%28%5E%E2%97%A3_%E2%97%A2%5E%29%E3%81%A3%EF%B8%BB%E3%83%87%E2%95%90%E4%B8%80    %E2%87%80    %E2%87%80    %E2%87%80    %E2%87%80    %E2%87%80    %E2%86%B6%I%Break%25Things%'''), | ||||
|             '''(^◣_◢^)っ︻デ═一    ⇀    ⇀    ⇀    ⇀    ⇀    ↶%I%Break%Things%''') | ||||
|  | ||||
| if __name__ == '__main__': | ||||
|     unittest.main() | ||||
|   | ||||
| @@ -75,42 +75,61 @@ except ImportError: | ||||
|     import BaseHTTPServer as compat_http_server | ||||
|  | ||||
| try: | ||||
|     from urllib.parse import unquote_to_bytes as compat_urllib_parse_unquote_to_bytes | ||||
|     from urllib.parse import unquote as compat_urllib_parse_unquote | ||||
| except ImportError: | ||||
|     def compat_urllib_parse_unquote(string, encoding='utf-8', errors='replace'): | ||||
|         if string == '': | ||||
| except ImportError:  # Python 2 | ||||
|     # HACK: The following are the correct unquote_to_bytes and unquote | ||||
|     # implementations, taken from CPython 3.4.3's stdlib. Python 2's versions | ||||
|     # are apparently broken (see https://github.com/rg3/youtube-dl/pull/6244). | ||||
|  | ||||
|     def compat_urllib_parse_unquote_to_bytes(string): | ||||
|         """unquote_to_bytes('abc%20def') -> b'abc def'.""" | ||||
|         # Note: strings are encoded as UTF-8. This is only an issue if the string | ||||
|         # contains unescaped non-ASCII characters, which URIs should not contain. | ||||
|         if not string: | ||||
|             # Is it a string-like object? | ||||
|             string.split | ||||
|             return b'' | ||||
|         if isinstance(string, unicode): | ||||
|             string = string.encode('utf-8') | ||||
|         bits = string.split(b'%') | ||||
|         if len(bits) == 1: | ||||
|             return string | ||||
|         res = string.split('%') | ||||
|         if len(res) == 1: | ||||
|         res = [bits[0]] | ||||
|         append = res.append | ||||
|         for item in bits[1:]: | ||||
|             try: | ||||
|                 append(compat_urllib_parse._hextochr[item[:2]]) | ||||
|                 append(item[2:]) | ||||
|             except KeyError: | ||||
|                 append(b'%') | ||||
|                 append(item) | ||||
|         return b''.join(res) | ||||
|  | ||||
|     def compat_urllib_parse_unquote(string, encoding='utf-8', errors='replace'): | ||||
|         """Replace %xx escapes by their single-character equivalent. The optional | ||||
|         encoding and errors parameters specify how to decode percent-encoded | ||||
|         sequences into Unicode characters, as accepted by the bytes.decode() | ||||
|         method. | ||||
|         By default, percent-encoded sequences are decoded with UTF-8, and invalid | ||||
|         sequences are replaced by a placeholder character. | ||||
|  | ||||
|         unquote('abc%20def') -> 'abc def'. | ||||
|         """ | ||||
|         if '%' not in string: | ||||
|             string.split | ||||
|             return string | ||||
|         if encoding is None: | ||||
|             encoding = 'utf-8' | ||||
|         if errors is None: | ||||
|             errors = 'replace' | ||||
|         # pct_sequence: contiguous sequence of percent-encoded bytes, decoded | ||||
|         pct_sequence = b'' | ||||
|         string = res[0] | ||||
|         for item in res[1:]: | ||||
|             try: | ||||
|                 if not item: | ||||
|                     raise ValueError | ||||
|                 pct_sequence += item[:2].decode('hex') | ||||
|                 rest = item[2:] | ||||
|                 if not rest: | ||||
|                     # This segment was just a single percent-encoded character. | ||||
|                     # May be part of a sequence of code units, so delay decoding. | ||||
|                     # (Stored in pct_sequence). | ||||
|                     continue | ||||
|             except ValueError: | ||||
|                 rest = '%' + item | ||||
|             # Encountered non-percent-encoded characters. Flush the current | ||||
|             # pct_sequence. | ||||
|             string += pct_sequence.decode(encoding, errors) + rest | ||||
|             pct_sequence = b'' | ||||
|         if pct_sequence: | ||||
|             # Flush the final pct_sequence | ||||
|             string += pct_sequence.decode(encoding, errors) | ||||
|         return string | ||||
|         bits = compat_urllib_parse._asciire.split(string) | ||||
|         res = [bits[0]] | ||||
|         append = res.append | ||||
|         for i in range(1, len(bits), 2): | ||||
|             append(compat_urllib_parse_unquote_to_bytes(bits[i]).decode(encoding, errors)) | ||||
|             append(bits[i + 1]) | ||||
|         return ''.join(res) | ||||
|  | ||||
| try: | ||||
|     compat_str = unicode  # Python 2 | ||||
| @@ -422,6 +441,7 @@ __all__ = [ | ||||
|     'compat_urllib_error', | ||||
|     'compat_urllib_parse', | ||||
|     'compat_urllib_parse_unquote', | ||||
|     'compat_urllib_parse_unquote_to_bytes', | ||||
|     'compat_urllib_parse_urlparse', | ||||
|     'compat_urllib_request', | ||||
|     'compat_urlparse', | ||||
|   | ||||
| @@ -1115,7 +1115,7 @@ class GenericIE(InfoExtractor): | ||||
|         # Sometimes embedded video player is hidden behind percent encoding | ||||
|         # (e.g. https://github.com/rg3/youtube-dl/issues/2448) | ||||
|         # Unescaping the whole page allows handling those cases in a generic way | ||||
|         webpage = compat_urllib_parse.unquote(webpage) | ||||
|         webpage = compat_urllib_parse_unquote(webpage) | ||||
|  | ||||
|         # it's tempting to parse this further, but you would | ||||
|         # have to take into account all the variations like | ||||
|   | ||||
		Reference in New Issue
	
	Block a user