Merge branch 'ted_subtitles'
This commit is contained in:
		
							
								
								
									
										63
									
								
								test/test_ted_subtitles.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										63
									
								
								test/test_ted_subtitles.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,63 @@ | |||||||
|  | #!/usr/bin/env python | ||||||
|  |  | ||||||
|  | import sys | ||||||
|  | import unittest | ||||||
|  | import hashlib | ||||||
|  |  | ||||||
|  | # Allow direct execution | ||||||
|  | import os | ||||||
|  | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) | ||||||
|  |  | ||||||
|  | from youtube_dl.extractor import TEDIE | ||||||
|  | from youtube_dl.utils import * | ||||||
|  | from helper import FakeYDL | ||||||
|  |  | ||||||
def md5(s):
    """Return the hexadecimal MD5 digest of the text *s*.

    The text is encoded as UTF-8 before hashing, so the result is the
    same for equal text regardless of how the caller obtained it.
    """
    return hashlib.md5(s.encode('utf-8')).hexdigest()
|  |  | ||||||
class TestTedSubtitles(unittest.TestCase):
    """Subtitle-extraction tests for TEDIE against one fixed TED talk.

    Every test tweaks the params of a fresh FakeYDL, runs the extractor on
    self.url and inspects the result (usually the 'subtitles' entry of the
    first returned item).
    """

    def setUp(self):
        # A new downloader per test, so params set by one test never leak
        # into the next one.
        self.DL = FakeYDL()
        self.url = 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html'

    def getInfoDict(self):
        """Run TEDIE on self.url and return what extract() gives back."""
        return TEDIE(self.DL).extract(self.url)

    def getSubtitles(self):
        """Return the 'subtitles' entry of the first extracted item."""
        return self.getInfoDict()[0]['subtitles']

    def test_no_writesubtitles(self):
        # No subtitle option set: the extractor must report None.
        self.assertEqual(self.getSubtitles(), None)

    def test_subtitles(self):
        self.DL.params['writesubtitles'] = True
        subtitles = self.getSubtitles()
        # The expected value is the MD5 digest of the English subtitle text.
        self.assertEqual(md5(subtitles['en']), '2154f31ff9b9f89a0aa671537559c21d')

    def test_subtitles_lang(self):
        self.DL.params['writesubtitles'] = True
        self.DL.params['subtitleslangs'] = ['fr']
        subtitles = self.getSubtitles()
        self.assertEqual(md5(subtitles['fr']), '7616cbc6df20ec2c1204083c83871cf6')

    def test_allsubtitles(self):
        self.DL.params['writesubtitles'] = True
        self.DL.params['allsubtitles'] = True
        subtitles = self.getSubtitles()
        self.assertEqual(len(subtitles), 28)

    def test_list_subtitles(self):
        # Listing only prints the available languages; no info is returned.
        self.DL.params['listsubtitles'] = True
        self.assertEqual(self.getInfoDict(), [None])

    def test_automatic_captions(self):
        self.DL.params['writeautomaticsub'] = True
        # Same option key as the other tests ('subtitleslangs', plural).
        self.DL.params['subtitleslangs'] = ['en']
        subtitles = self.getSubtitles()
        # assertEqual reports the actual count on failure, unlike assertTrue.
        self.assertEqual(len(subtitles), 0)

    def test_multiple_langs(self):
        self.DL.params['writesubtitles'] = True
        langs = ['es', 'fr', 'de']
        self.DL.params['subtitleslangs'] = langs
        subtitles = self.getSubtitles()
        for lang in langs:
            self.assertTrue(subtitles.get(lang) is not None, u'Subtitles for \'%s\' not extracted' % lang)
|  |  | ||||||
|  | if __name__ == '__main__': | ||||||
|  |     unittest.main() | ||||||
| @@ -141,9 +141,9 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor): | |||||||
|             raise ExtractorError(u'Unable to extract video URL') |             raise ExtractorError(u'Unable to extract video URL') | ||||||
|  |  | ||||||
|         # subtitles |         # subtitles | ||||||
|         video_subtitles = self.extract_subtitles(video_id) |         video_subtitles = self.extract_subtitles(video_id, webpage) | ||||||
|         if self._downloader.params.get('listsubtitles', False): |         if self._downloader.params.get('listsubtitles', False): | ||||||
|             self._list_available_subtitles(video_id) |             self._list_available_subtitles(video_id, webpage) | ||||||
|             return |             return | ||||||
|  |  | ||||||
|         return { |         return { | ||||||
| @@ -157,7 +157,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor): | |||||||
|             'age_limit': age_limit, |             'age_limit': age_limit, | ||||||
|         } |         } | ||||||
|  |  | ||||||
|     def _get_available_subtitles(self, video_id): |     def _get_available_subtitles(self, video_id, webpage): | ||||||
|         try: |         try: | ||||||
|             sub_list = self._download_webpage( |             sub_list = self._download_webpage( | ||||||
|                 'https://api.dailymotion.com/video/%s/subtitles?fields=id,language,url' % video_id, |                 'https://api.dailymotion.com/video/%s/subtitles?fields=id,language,url' % video_id, | ||||||
|   | |||||||
| @@ -12,9 +12,9 @@ class SubtitlesInfoExtractor(InfoExtractor): | |||||||
|         return any([self._downloader.params.get('writesubtitles', False), |         return any([self._downloader.params.get('writesubtitles', False), | ||||||
|                     self._downloader.params.get('writeautomaticsub')]) |                     self._downloader.params.get('writeautomaticsub')]) | ||||||
|  |  | ||||||
|     def _list_available_subtitles(self, video_id, webpage=None): |     def _list_available_subtitles(self, video_id, webpage): | ||||||
|         """ outputs the available subtitles for the video """ |         """ outputs the available subtitles for the video """ | ||||||
|         sub_lang_list = self._get_available_subtitles(video_id) |         sub_lang_list = self._get_available_subtitles(video_id, webpage) | ||||||
|         auto_captions_list = self._get_available_automatic_caption(video_id, webpage) |         auto_captions_list = self._get_available_automatic_caption(video_id, webpage) | ||||||
|         sub_lang = ",".join(list(sub_lang_list.keys())) |         sub_lang = ",".join(list(sub_lang_list.keys())) | ||||||
|         self.to_screen(u'%s: Available subtitles for video: %s' % |         self.to_screen(u'%s: Available subtitles for video: %s' % | ||||||
| @@ -23,7 +23,7 @@ class SubtitlesInfoExtractor(InfoExtractor): | |||||||
|         self.to_screen(u'%s: Available automatic captions for video: %s' % |         self.to_screen(u'%s: Available automatic captions for video: %s' % | ||||||
|                        (video_id, auto_lang)) |                        (video_id, auto_lang)) | ||||||
|  |  | ||||||
|     def extract_subtitles(self, video_id, video_webpage=None): |     def extract_subtitles(self, video_id, webpage): | ||||||
|         """ |         """ | ||||||
|         returns {sub_lang: sub} ,{} if subtitles not found or None if the |         returns {sub_lang: sub} ,{} if subtitles not found or None if the | ||||||
|         subtitles aren't requested. |         subtitles aren't requested. | ||||||
| @@ -32,9 +32,9 @@ class SubtitlesInfoExtractor(InfoExtractor): | |||||||
|             return None |             return None | ||||||
|         available_subs_list = {} |         available_subs_list = {} | ||||||
|         if self._downloader.params.get('writeautomaticsub', False): |         if self._downloader.params.get('writeautomaticsub', False): | ||||||
|             available_subs_list.update(self._get_available_automatic_caption(video_id, video_webpage)) |             available_subs_list.update(self._get_available_automatic_caption(video_id, webpage)) | ||||||
|         if self._downloader.params.get('writesubtitles', False): |         if self._downloader.params.get('writesubtitles', False): | ||||||
|             available_subs_list.update(self._get_available_subtitles(video_id)) |             available_subs_list.update(self._get_available_subtitles(video_id, webpage)) | ||||||
|  |  | ||||||
|         if not available_subs_list:  # error, it didn't get the available subtitles |         if not available_subs_list:  # error, it didn't get the available subtitles | ||||||
|             return {} |             return {} | ||||||
| @@ -74,7 +74,7 @@ class SubtitlesInfoExtractor(InfoExtractor): | |||||||
|             return |             return | ||||||
|         return sub |         return sub | ||||||
|  |  | ||||||
|     def _get_available_subtitles(self, video_id): |     def _get_available_subtitles(self, video_id, webpage): | ||||||
|         """ |         """ | ||||||
|         returns {sub_lang: url} or {} if not available |         returns {sub_lang: url} or {} if not available | ||||||
|         Must be redefined by the subclasses |         Must be redefined by the subclasses | ||||||
|   | |||||||
| @@ -1,10 +1,9 @@ | |||||||
| import json | import json | ||||||
| import re | import re | ||||||
|  |  | ||||||
| from .common import InfoExtractor | from .subtitles import SubtitlesInfoExtractor | ||||||
|  |  | ||||||
|  | class TEDIE(SubtitlesInfoExtractor): | ||||||
| class TEDIE(InfoExtractor): |  | ||||||
|     _VALID_URL=r'''http://www\.ted\.com/ |     _VALID_URL=r'''http://www\.ted\.com/ | ||||||
|                    ( |                    ( | ||||||
|                         ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist |                         ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist | ||||||
| @@ -82,11 +81,21 @@ class TEDIE(InfoExtractor): | |||||||
|             'url': stream['file'], |             'url': stream['file'], | ||||||
|             'format': stream['id'] |             'format': stream['id'] | ||||||
|             } for stream in info['htmlStreams']] |             } for stream in info['htmlStreams']] | ||||||
|  |  | ||||||
|  |         video_id = info['id'] | ||||||
|  |  | ||||||
|  |         # subtitles | ||||||
|  |         video_subtitles = self.extract_subtitles(video_id, webpage) | ||||||
|  |         if self._downloader.params.get('listsubtitles', False): | ||||||
|  |             self._list_available_subtitles(video_id, webpage) | ||||||
|  |             return | ||||||
|  |  | ||||||
|         info = { |         info = { | ||||||
|             'id': info['id'], |             'id': video_id, | ||||||
|             'title': title, |             'title': title, | ||||||
|             'thumbnail': thumbnail, |             'thumbnail': thumbnail, | ||||||
|             'description': desc, |             'description': desc, | ||||||
|  |             'subtitles': video_subtitles, | ||||||
|             'formats': formats, |             'formats': formats, | ||||||
|         } |         } | ||||||
|  |  | ||||||
| @@ -94,3 +103,14 @@ class TEDIE(InfoExtractor): | |||||||
|         info.update(info['formats'][-1]) |         info.update(info['formats'][-1]) | ||||||
|  |  | ||||||
|         return info |         return info | ||||||
|  |  | ||||||
|  |     def _get_available_subtitles(self, video_id, webpage): | ||||||
|  |         options = self._search_regex(r'(?:<select name="subtitles_language_select" id="subtitles_language_select">)(.*?)(?:</select>)', webpage, 'subtitles_language_select', flags=re.DOTALL) | ||||||
|  |         languages = re.findall(r'(?:<option value=")(\S+)"', options) | ||||||
|  |         if languages: | ||||||
|  |             sub_lang_list = {} | ||||||
|  |             for l in languages: | ||||||
|  |                 url = 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/srt' % (video_id, l) | ||||||
|  |                 sub_lang_list[l] = url | ||||||
|  |             return sub_lang_list | ||||||
|  |         return {} | ||||||
|   | |||||||
| @@ -1094,7 +1094,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): | |||||||
|         else: |         else: | ||||||
|             raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s))) |             raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s))) | ||||||
|  |  | ||||||
|     def _get_available_subtitles(self, video_id): |     def _get_available_subtitles(self, video_id, webpage): | ||||||
|         try: |         try: | ||||||
|             sub_list = self._download_webpage( |             sub_list = self._download_webpage( | ||||||
|                 'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id, |                 'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id, | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user