standardized the use of unescapeHTML; added clean_html()
This commit is contained in:
		@@ -242,6 +242,18 @@ def htmlentity_transform(matchobj):
 | 
			
		||||
	return (u'&%s;' % entity)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def clean_html(html):
	"""Clean an HTML snippet into a readable string.

	Literal newlines in the markup are replaced by spaces, every <br> tag
	(any letter case, optionally self-closing) becomes a newline, all
	remaining tags are removed, and each "&...;" sequence is handed to
	htmlentity_transform for replacement.
	"""
	# Newline vs <br />: a newline in the markup is only whitespace, while
	# <br> is the real line break. Tag names are case-insensitive in HTML,
	# so <BR> must be recognised here before the generic tag stripping
	# below removes it without leaving a newline behind.
	html = html.replace('\n', ' ')
	html = re.sub(r'(?i)<\s*br\s*/?\s*>', '\n', html)
	# Strip html tags
	html = re.sub(r'<.*?>', '', html)
	# Replace html entities (pattern contains no backslashes, so a plain
	# unicode literal is equivalent to the raw form)
	html = re.sub(u'(?u)&(.+?);', htmlentity_transform, html)
	return html
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def sanitize_title(utitle):
 | 
			
		||||
	"""Sanitizes a video title so it could be used as part of a filename."""
 | 
			
		||||
	utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
 | 
			
		||||
@@ -3343,8 +3355,6 @@ class EscapistIE(InfoExtractor):
 | 
			
		||||
		self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
 | 
			
		||||
 | 
			
		||||
	def _real_extract(self, url):
 | 
			
		||||
		htmlParser = HTMLParser.HTMLParser()
 | 
			
		||||
 | 
			
		||||
		mobj = re.match(self._VALID_URL, url)
 | 
			
		||||
		if mobj is None:
 | 
			
		||||
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 | 
			
		||||
@@ -3360,11 +3370,11 @@ class EscapistIE(InfoExtractor):
 | 
			
		||||
			return
 | 
			
		||||
 | 
			
		||||
		descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
 | 
			
		||||
		description = htmlParser.unescape(descMatch.group(1))
 | 
			
		||||
		description = unescapeHTML(descMatch.group(1))
 | 
			
		||||
		imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
 | 
			
		||||
		imgUrl = htmlParser.unescape(imgMatch.group(1))
 | 
			
		||||
		imgUrl = unescapeHTML(imgMatch.group(1))
 | 
			
		||||
		playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
 | 
			
		||||
		playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
 | 
			
		||||
		playerUrl = unescapeHTML(playerUrlMatch.group(1))
 | 
			
		||||
		configUrlMatch = re.search('config=(.*)$', playerUrl)
 | 
			
		||||
		configUrl = urllib2.unquote(configUrlMatch.group(1))
 | 
			
		||||
 | 
			
		||||
@@ -3423,8 +3433,6 @@ class CollegeHumorIE(InfoExtractor):
 | 
			
		||||
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
 | 
			
		||||
 | 
			
		||||
	def _real_extract(self, url):
 | 
			
		||||
		htmlParser = HTMLParser.HTMLParser()
 | 
			
		||||
 | 
			
		||||
		mobj = re.match(self._VALID_URL, url)
 | 
			
		||||
		if mobj is None:
 | 
			
		||||
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 | 
			
		||||
@@ -3495,8 +3503,6 @@ class XVideosIE(InfoExtractor):
 | 
			
		||||
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
 | 
			
		||||
 | 
			
		||||
	def _real_extract(self, url):
 | 
			
		||||
		htmlParser = HTMLParser.HTMLParser()
 | 
			
		||||
 | 
			
		||||
		mobj = re.match(self._VALID_URL, url)
 | 
			
		||||
		if mobj is None:
 | 
			
		||||
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 | 
			
		||||
@@ -3585,8 +3591,6 @@ class SoundcloudIE(InfoExtractor):
 | 
			
		||||
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
 | 
			
		||||
 | 
			
		||||
	def _real_extract(self, url):
 | 
			
		||||
		htmlParser = HTMLParser.HTMLParser()
 | 
			
		||||
 | 
			
		||||
		mobj = re.match(self._VALID_URL, url)
 | 
			
		||||
		if mobj is None:
 | 
			
		||||
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 | 
			
		||||
@@ -3674,8 +3678,6 @@ class InfoQIE(InfoExtractor):
 | 
			
		||||
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
 | 
			
		||||
 | 
			
		||||
	def _real_extract(self, url):
 | 
			
		||||
		htmlParser = HTMLParser.HTMLParser()
 | 
			
		||||
 | 
			
		||||
		mobj = re.match(self._VALID_URL, url)
 | 
			
		||||
		if mobj is None:
 | 
			
		||||
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 | 
			
		||||
@@ -3909,8 +3911,6 @@ class StanfordOpenClassroomIE(InfoExtractor):
 | 
			
		||||
			except UnavailableVideoError, err:
 | 
			
		||||
				self._downloader.trouble(u'\nERROR: unable to download video')
 | 
			
		||||
		elif mobj.group('course'): # A course page
 | 
			
		||||
			unescapeHTML = HTMLParser.HTMLParser().unescape
 | 
			
		||||
 | 
			
		||||
			course = mobj.group('course')
 | 
			
		||||
			info = {
 | 
			
		||||
				'id': _simplify_title(course),
 | 
			
		||||
@@ -3947,8 +3947,6 @@ class StanfordOpenClassroomIE(InfoExtractor):
 | 
			
		||||
				assert entry['type'] == 'reference'
 | 
			
		||||
				self.extract(entry['url'])
 | 
			
		||||
		else: # Root page
 | 
			
		||||
			unescapeHTML = HTMLParser.HTMLParser().unescape
 | 
			
		||||
 | 
			
		||||
			info = {
 | 
			
		||||
				'id': 'Stanford OpenClassroom',
 | 
			
		||||
				'type': 'playlist',
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user