Skip to content

Commit 53d4bcd

Browse files
committed
Adding parser for mailman ground truth files (html)
1 parent f6cc64f commit 53d4bcd

File tree

3 files changed

+103
-1
lines changed

3 files changed

+103
-1
lines changed

jwzthreading/jwzthreading.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,14 @@ def has_descendant(self, ctr):
112112

113113
return False
114114

115+
@property
def size(self):
    """Count the containers in this subtree, including this one.

    Returns
    -------
    int
        1 for this container plus the ``size`` of every container in
        ``self.children``, computed recursively.
    """
    # A generator expression is enough here: sum() consumes the values
    # one by one, so there is no need to build a temporary list.
    return 1 + sum(child.size for child in self.children)
121+
122+
115123

116124
class Message(object):
117125
"""Represents a message to be threaded.

jwzthreading/tests/test_newsgroups.py

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,46 @@
66
from __future__ import unicode_literals
77

88
import os
9+
from unittest import SkipTest
10+
911
from jwzthreading import Message, thread, print_container
10-
from jwzthreading.utils import parse_mailman_gzfiles
12+
from jwzthreading.utils import (parse_mailman_gzfiles,
13+
parse_mailman_htmlthread)
1114

1215
BASE_DIR = os.path.dirname(__file__)
1316
DATA_DIR = os.path.join(BASE_DIR, 'data/fedora-devel-mailman')
1417

18+
# Expected number of emails and threads (NOTE: the names say JUNE2010, but the tests compare them against the 2010-January data)
19+
N_JUNE2010_THREADS = 292
20+
N_JUNE2010_EMAILS = 292
21+
22+
23+
def test_parse_mailman_gzfiles():
    """Check that the gzipped mailman archive yields the expected message count."""
    archive_path = os.path.join(DATA_DIR, '2010-January.txt.gz')
    messages = parse_mailman_gzfiles(archive_path, encoding='latin1',
                                     headersonly=True)

    assert len(messages) == N_JUNE2010_EMAILS
29+
30+
def test_parse_mailman_htmlthread():
    """Check that the mailman HTML thread index can be parsed into threads.

    The test is skipped when lxml is not installed, since the parser
    imports it.
    """
    try:
        import lxml  # noqa: F401 -- imported only to check availability
    except ImportError:
        raise SkipTest('lxml is not installed')

    index_path = os.path.join(DATA_DIR, '2010-January_thread.html')
    threads = parse_mailman_htmlthread(index_path)

    # Sanity checks instead of printing: the index must produce at least
    # one thread, and walking every thread through ``size`` must account
    # for at least one container per thread.
    assert threads, 'no threads were parsed from the html index'
    total_size = sum(el.size for el in threads)
    assert total_size >= len(threads)
44+
45+
46+
47+
48+
1549
def test_fedora():
1650
""" Test threading on the fedora-devel mailing list data"""
1751
# 2010-January https://www.redhat.com/archives/fedora-devel-list/

jwzthreading/utils.py

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,3 +53,63 @@ def parse_mailman_gzfiles(filename, encoding='utf-8', headersonly=False):
5353
out.append(msg_obj)
5454
return out
5555

56+
57+
def parse_mailman_htmlthread(filename):
    """ Parse a mailman HTML thread index into a list of threads.

    The threading structure is read from the nested ``<ul>``/``<li>``
    lists of the page: the last ``<ul>`` that is a direct child of
    ``<body>`` is taken as the thread listing, and each of its children
    becomes the root of one thread.

    Parameters
    ----------
    filename : str
       path to the HTML thread index file

    Returns
    -------

    threads : list
       a list of root ``Container`` objects, one per top-level entry.
       Each container built from an entry with a ``<strong>`` element
       carries a placeholder message whose ``subject`` is the text of the
       link inside it and whose ``id`` is the integer value of that
       link's ``name`` attribute.

    Raises
    ------
    ValueError
       if the document has no ``<body>``, if ``<body>`` has no ``<ul>``
       child, or if the lists contain an element that is not expected
    """
    # Kept as function-level imports, as in the original code (the test
    # suite treats lxml as optional and skips when it is missing).
    from lxml import etree
    from .jwzthreading import Container

    parser = etree.HTMLParser()
    # Open in binary mode so the HTML parser detects the encoding itself,
    # instead of decoding with the locale's default encoding first.
    with open(filename, 'rb') as fh:
        document = etree.parse(fh, parser)

    body = document.find('body')
    if body is None:
        raise ValueError('No <body> element found in {}'.format(filename))

    top_level_lists = [el for el in body if el.tag == 'ul']
    if not top_level_lists:
        raise ValueError('No <ul> element found in the body of {}'
                         .format(filename))

    thread_roots = list(top_level_lists[-1])  # pick last <ul> element

    class DummyMessage(object):
        """ Minimal stand-in for a message: only a subject and an id """
        subject = None
        id = None

    def create_thread(root, parent_container=None):
        """ Parse the html nested lists to produce the threading structure"""
        if root.tag != 'li':
            raise ValueError('Element {} was not expected'.format(root))

        container = Container()
        for child in root:
            if child.tag == 'strong':
                # holds the link to the actual email
                if len(child) == 0:
                    raise ValueError('Element {} has no link'.format(child))
                a_el = child[0]
                container.message = DummyMessage()
                container.message.subject = a_el.text
                container.message.id = int(a_el.get('name'))
            elif child.tag == 'em':
                pass  # email sender, ignore this line
            elif child.tag == 'ul':
                # replies to this email: build them as children
                for nested_child in child:
                    create_thread(nested_child, parent_container=container)
            else:
                raise ValueError('Unexpected element {}'.format(child))

        if parent_container is not None:
            parent_container.add_child(container)

        return container

    threads = [create_thread(el) for el in thread_roots]

    return threads
115+

0 commit comments

Comments
 (0)