[generic] Add support for BOMs (Fixes #4753)
This commit is contained in:
		@@ -28,6 +28,7 @@ from youtube_dl.utils import (
 | 
			
		||||
    fix_xml_ampersands,
 | 
			
		||||
    InAdvancePagedList,
 | 
			
		||||
    intlist_to_bytes,
 | 
			
		||||
    is_html,
 | 
			
		||||
    js_to_json,
 | 
			
		||||
    limit_length,
 | 
			
		||||
    OnDemandPagedList,
 | 
			
		||||
@@ -417,5 +418,21 @@ ffmpeg version 2.4.4 Copyright (c) 2000-2014 the FFmpeg ...'''), '2.4.4')
 | 
			
		||||
        self.assertTrue(age_restricted(18, 14))
 | 
			
		||||
        self.assertFalse(age_restricted(18, 18))
 | 
			
		||||
 | 
			
		||||
    def test_is_html(self):
        """Check is_html() on plain input and on BOM-prefixed encodings."""
        # Leading non-markup bytes before the tag: not treated as HTML.
        self.assertFalse(is_html(b'\x49\x44\x43<html'))
        # No BOM: decoded as UTF-8; the stray \xaa byte must not break detection.
        self.assertTrue(is_html(b'<!DOCTYPE foo>\xaaa'))
        self.assertTrue(is_html(  # UTF-8 with BOM
            b'\xef\xbb\xbf<!DOCTYPE foo>\xaaa'))
        self.assertTrue(is_html(  # UTF-16-LE
            b'\xff\xfe<\x00h\x00t\x00m\x00l\x00>\x00\xe4\x00'
        ))
        self.assertTrue(is_html(  # UTF-16-BE
            b'\xfe\xff\x00<\x00h\x00t\x00m\x00l\x00>\x00\xe4'
        ))
        self.assertTrue(is_html(  # UTF-32-BE
            b'\x00\x00\xFE\xFF\x00\x00\x00<\x00\x00\x00h\x00\x00\x00t\x00\x00\x00m\x00\x00\x00l\x00\x00\x00>\x00\x00\x00\xe4'))
        self.assertTrue(is_html(  # UTF-32-LE
            b'\xFF\xFE\x00\x00<\x00\x00\x00h\x00\x00\x00t\x00\x00\x00m\x00\x00\x00l\x00\x00\x00>\x00\x00\x00\xe4\x00\x00\x00'))
 | 
			
		||||
 | 
			
		||||
# Run the test suite when this file is executed directly as a script.
if __name__ == '__main__':
    unittest.main()
 | 
			
		||||
 
 | 
			
		||||
@@ -17,6 +17,7 @@ from ..utils import (
 | 
			
		||||
    ExtractorError,
 | 
			
		||||
    float_or_none,
 | 
			
		||||
    HEADRequest,
 | 
			
		||||
    is_html,
 | 
			
		||||
    orderedSet,
 | 
			
		||||
    parse_xml,
 | 
			
		||||
    smuggle_url,
 | 
			
		||||
@@ -647,7 +648,7 @@ class GenericIE(InfoExtractor):
 | 
			
		||||
        # Maybe it's a direct link to a video?
 | 
			
		||||
        # Be careful not to download the whole thing!
 | 
			
		||||
        first_bytes = full_response.read(512)
 | 
			
		||||
        if not re.match(r'^\s*<', first_bytes.decode('utf-8', 'replace')):
 | 
			
		||||
        if not is_html(first_bytes):
 | 
			
		||||
            self._downloader.report_warning(
 | 
			
		||||
                'URL could be a direct video link, returning it as such.')
 | 
			
		||||
            upload_date = unified_strdate(
 | 
			
		||||
 
 | 
			
		||||
@@ -1631,3 +1631,23 @@ def age_restricted(content_limit, age_limit):
 | 
			
		||||
    if content_limit is None:
 | 
			
		||||
        return False  # Content available for everyone
 | 
			
		||||
    return age_limit < content_limit
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes.

    If the data starts with a UTF-8, UTF-16 or UTF-32 byte-order mark, the
    mark is stripped and the remainder is decoded with the matching
    encoding; otherwise the data is decoded as UTF-8. Undecodable bytes are
    replaced instead of raising.

    Returns a (truthy) match object when the decoded text starts with
    optional whitespace followed by '<', and None otherwise.
    """

    # Order matters: the UTF-32-LE mark begins with the same two bytes as
    # the UTF-16-LE mark, so the longer one has to be tested first.
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            # Drop the mark itself so it does not end up in the decoded text.
            s = first_bytes[len(bom):].decode(enc, 'replace')
            break
    else:
        # No recognised BOM: fall back to UTF-8.
        s = first_bytes.decode('utf-8', 'replace')

    return re.match(r'^\s*<', s)
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user