@@ -331,30 +331,10 @@ def invert_graph(graph):
331331##########################################################################
332332
def clean_html(html):
    """
    Unsupported: this function no longer strips HTML markup.

    The previous regex-based implementation (removing script/style
    elements, comments and remaining tags, then collapsing whitespace)
    has been dropped.  The name is kept so that existing callers get an
    explicit error pointing at the replacement instead of an
    ``AttributeError``.

    :param html: the HTML string to be cleaned (ignored)
    :type html: str
    :raise NotImplementedError: always; the message directs callers to
        BeautifulSoup's ``get_text()``
    """
    raise NotImplementedError("To remove HTML markup, use BeautifulSoup's get_text() function")
354335
def clean_url(url):
    """
    Unsupported: this function no longer fetches a URL and strips its markup.

    It previously read the document at ``url`` and passed it through
    ``clean_html``.  No network access is attempted any more; the call
    fails immediately with a pointer to the replacement.

    :param url: the address of the HTML document (ignored)
    :type url: str
    :raise NotImplementedError: always; the message directs callers to
        BeautifulSoup's ``get_text()``
    """
    raise NotImplementedError("To remove HTML markup, use BeautifulSoup's get_text() function")
358338
359339##########################################################################
360340# FLATTEN LISTS
0 commit comments