nltk
diff --git a/nltk/util.py b/nltk/util.py
Lines changed: 2 additions & 22 deletions
@@ -331,30 +331,10 @@ def invert_graph(graph):
 ##########################################################################
 
 def clean_html(html):
-    """
-    Remove HTML markup from the given string.
-
-    :param html: the HTML string to be cleaned
-    :type html: str
-    :rtype: str
-    """
-
-    # First we remove inline JavaScript/CSS:
-    cleaned = re.sub(r"(?is)<(script|style).*?>.*?(</\1>)", "", html.strip())
-    # Then we remove html comments. This has to be done before removing regular
-    # tags since comments can contain '>' characters.
-    cleaned = re.sub(r"(?s)<!--(.*?)-->[\n]?", "", cleaned)
-    # Next we can remove the remaining tags:
-    cleaned = re.sub(r"(?s)<.*?>", " ", cleaned)
-    # Finally, we deal with whitespace
-    cleaned = re.sub(r"&nbsp;", " ", cleaned)
-    cleaned = re.sub(r"  ", " ", cleaned)
-    cleaned = re.sub(r"  ", " ", cleaned)
-    return cleaned.strip()
+    raise NotImplementedError ("To remove HTML markup, use BeautifulSoup's get_text() function")
 
 def clean_url(url):
-    html = compat.urlopen(url).read()
-    return clean_html(html)
+    raise NotImplementedError ("To remove HTML markup, use BeautifulSoup's get_text() function")
 
 ##########################################################################
 # FLATTEN LISTS