Skip to content

Commit 4721ce0

Browse files
committed
Add ability to collapse empty root container
1 parent c2dfe84 commit 4721ce0

File tree

3 files changed

+101
-6
lines changed

3 files changed

+101
-6
lines changed

jwzthreading/jwzthreading.py

Lines changed: 52 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -157,26 +157,73 @@ def root(self):
157157
else:
158158
return self.parent.root
159159

160+
def collapse_empty(self, inplace=True):
    """Collapse an empty top level container.

    If multiple messages reference a non existing top level message,
    by default the JWZ threading algorithm will create an empty top level
    container to be used as the root node.

    This method removes this empty container and makes the first child
    the root message. The other messages at depth == 1 then become
    its children.

    Parameters
    ----------
    inplace : bool, default=True
        if True the original container tree is modified. ``False`` is
        not implemented.

    Returns
    -------
    container
        ``self`` when this container holds a message (or is empty and has
        no children, so there is nothing to promote); otherwise the first
        child, promoted to be the new root.

    Raises
    ------
    NotImplementedError
        if ``inplace`` is False.
    ValueError
        if this container is empty and any direct child is empty too.
    """
    if not inplace:
        raise NotImplementedError

    if self.message is not None:
        # nothing to be done
        return self

    # In the following, self.message is None
    children = self.children

    if not children:
        # An empty container without children has no message to promote;
        # return it untouched instead of failing on children[0].
        return self

    if any(el.message is None for el in children):
        raise ValueError('Children containers cannot be empty!')

    # make the 1st child the new root container
    new_root = children[0]
    siblings = children[1:]
    new_root.parent = None

    for child in siblings:
        child.parent = new_root
        new_root.children.append(child)

    # Detach the promoted messages from the discarded empty container so
    # the tree stays consistent and a repeated call cannot append the
    # same siblings to the new root a second time.
    self.children = []

    return new_root
203+
204+
205+
def to_dict(self, include=None):
    """ Convert a Container tree to a nested dict

    Parameters
    ----------
    include : iterable of str, optional
        names of attributes of ``self.message`` that are copied into the
        dict of every node, under the same name. By default no extra
        attribute is included.

    Returns
    -------
    dict
        with the keys ``'id'`` (the message index), one key per element
        of ``include``, ``'parent'`` (the message index of the parent
        or None) and ``'children'`` (a list of dicts built the same way
        for each child container).

    Raises
    ------
    ValueError
        if this container, or its parent, holds no message.
    """
    # None stands for "no extra attributes"; this avoids sharing a
    # mutable default list between calls.
    if include is None:
        include = ()

    if self.message is None:
        raise ValueError('Containers with None messages are not supported:!\n'
                         ' this: {}'.format(self))

    res = {'id': self.message.message_idx}

    for key in include:
        res[key] = getattr(self.message, key)

    if self.parent is not None:
        if self.parent.message is not None:
            res['parent'] = self.parent.message.message_idx
        else:
            raise ValueError('Containers with None messages are not supported:!\n'
                             ' this: {}\n parent: {}'.format(self, self.parent))
    else:
        res['parent'] = None

    # propagate the same attribute selection to the whole subtree
    res['children'] = [el.to_dict(include=include) for el in self.children]

    return res
182229

jwzthreading/tests/test_jwz.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -274,3 +274,11 @@ def test_thread_two_missing_parent():
274274
assert d[0].message == None
275275
assert len(d[0].children) == 2
276276
assert d[0].children[0].message == m1
277+
assert d[0].size == 3
278+
279+
# check that collapsing the empty container works
280+
container = d[0].collapse_empty()
281+
assert container.size == 2
282+
assert container.message is not None
283+
assert container.message.message_id == 'First'
284+
assert container.parent is None

jwzthreading/tests/test_newsgroups.py

Lines changed: 41 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ def test_mailbox_delimiter():
5858
assert re.match(MAILBOX_DELIMITER, line)
5959

6060

61-
def test_fedora_June2010():
61+
def test_threading_fedora_June2010():
6262
""" Test threading on the fedora-devel mailing list data
6363
from June 2010"""
6464

@@ -154,3 +154,43 @@ def test_fedora_June2010():
154154

155155

156156

157+
def test_empty_collapsing_fedora_June2010():
    """ Test collapsing of empty root containers on threads built from
    the fedora-devel mailing list data """

    # Skipped when lxml is missing, like the neighbouring threading test.
    try:
        import lxml
    except ImportError:
        raise SkipTest

    msglist = parse_mailbox(os.path.join(DATA_DIR, '2010-January.txt.gz'),
                            encoding='latin1', headersonly=True)

    assert len(msglist) == N_EMAILS_JUNE2010

    # NOTE(review): the reference threads are parsed but never compared in
    # this test; the calls are kept so that the conditions under which the
    # test fails or is skipped stay the same.
    threads_ref = parse_mailman_htmlthread(os.path.join(DATA_DIR,
                                           '2010-January_thread.html.gz'))
    threads_ref = sort_threads(threads_ref, key='subject', missing='Z')

    threads = thread([Message(el, message_idx=idx) for idx, el in enumerate(msglist)],
                     group_by_subject=False)
    # There is one single "empty root container"
    assert sum(el.message is None for el in threads) == 1

    threads = [el.collapse_empty() for el in threads]

    # The empty container was removed
    assert sum(el.message is None for el in threads) == 0

    # every returned container is a root
    assert sum(el.parent is None for el in threads) == len(threads)

0 commit comments

Comments
 (0)