Skip to content

Commit 39a303e

Browse files
committed
Dropped clean_html and clean_url; use BeautifulSoup instead
1 parent e86e83b commit 39a303e

File tree

1 file changed: 2 additions and 22 deletions.

nltk/util.py

Lines changed: 2 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -331,30 +331,10 @@ def invert_graph(graph):
331331
##########################################################################
332332

333333
def clean_html(html):
334-
"""
335-
Remove HTML markup from the given string.
336-
337-
:param html: the HTML string to be cleaned
338-
:type html: str
339-
:rtype: str
340-
"""
341-
342-
# First we remove inline JavaScript/CSS:
343-
cleaned = re.sub(r"(?is)<(script|style).*?>.*?(</\1>)", "", html.strip())
344-
# Then we remove html comments. This has to be done before removing regular
345-
# tags since comments can contain '>' characters.
346-
cleaned = re.sub(r"(?s)<!--(.*?)-->[\n]?", "", cleaned)
347-
# Next we can remove the remaining tags:
348-
cleaned = re.sub(r"(?s)<.*?>", " ", cleaned)
349-
# Finally, we deal with whitespace
350-
cleaned = re.sub(r"&nbsp;", " ", cleaned)
351-
cleaned = re.sub(r"  ", " ", cleaned)
352-
cleaned = re.sub(r"  ", " ", cleaned)
353-
return cleaned.strip()
334+
raise NotImplementedError ("To remove HTML markup, use BeautifulSoup's get_text() function")
354335

355336
def clean_url(url):
356-
html = compat.urlopen(url).read()
357-
return clean_html(html)
337+
raise NotImplementedError ("To remove HTML markup, use BeautifulSoup's get_text() function")
358338

359339
##########################################################################
360340
# FLATTEN LISTS

0 commit comments

Comments
 (0)