Skip to content

Commit 59bda7a

Browse files
committed
Generalized the container object
1 parent 6776f97 commit 59bda7a

File tree

2 files changed

+84
-60
lines changed

2 files changed

+84
-60
lines changed

jwzthreading/jwzthreading.py

Lines changed: 55 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -45,27 +45,33 @@
4545
# models
4646
#
4747

48-
class Container(object):
49-
"""Contains a tree of messages.
48+
class Container(dict):
49+
"""Contains a tree of objects. Each container is a subclassed dict
50+
where the contents are stored.
5051
5152
Attributes:
52-
message (Message): Message corresponding to this tree node.
53-
This can be None, if a Message-Id is referenced but no
54-
message with the ID is included.
5553
children ([Container]): Possibly-empty list of child containers
5654
parent (Container): Parent container, if any
5755
"""
58-
def __init__(self):
59-
self.message = self.parent = None
56+
def __init__(self, **args):
57+
dict.__init__(self, **args)
58+
self.parent = None
6059
self.children = []
6160

6261
def __repr__(self):
6362
return '<%s %x: %r>' % (self.__class__.__name__, id(self),
64-
self.message)
63+
dict.__repr__(self))
64+
def __hash__(self):
65+
""" Make the container hashable. Care must be taken though not to change
66+
the container contents after the initialization as otherwise the hash
67+
value will change
68+
"""
69+
return hash(tuple(sorted(self.items())) + (self.parent,))
70+
6571

6672
def is_dummy(self):
67-
"""Check if Container has a message."""
68-
return self.message is None
73+
"""Check if Container has some contents."""
74+
return not len(self.keys())
6975

7076
def add_child(self, child):
7177
"""Add a child to `self`.
@@ -132,7 +138,7 @@ def depth(self):
132138
return 1 + self.parent.depth
133139

134140
def flatten(self):
135-
""" Return a flatten version of the thread
141+
""" Return a flattened version of the tree
136142
137143
Returns
138144
list [Containers]: a list of messages
@@ -160,6 +166,8 @@ def root(self):
160166
def collapse_empty(self, inplace=True):
161167
""" Collapse empty top level containers.
162168
169+
This only applies to the JWZ algorithm
170+
163171
If multiple messages reference a non existing top level message,
164172
by default the JWZ threading algorithm will create an empty top level
165173
container to be used as the root node.
@@ -178,12 +186,15 @@ def collapse_empty(self, inplace=True):
178186
if not inplace:
179187
raise NotImplementedError
180188

189+
if not 'message' in self:
190+
raise ValueError('This method is only valid when used for email threading')
191+
181192

182-
if self.message is not None:
193+
if self['message'] is not None:
183194
# nothing to be done
184195
return self
185196

186-
if any([el.message is None for el in self.children]):
197+
if any([el['message'] is None for el in self.children]):
187198
raise ValueError('Children containers cannot be empty!')
188199

189200
# In the following, self['message'] is None
@@ -203,20 +214,24 @@ def collapse_empty(self, inplace=True):
203214

204215

205216
def to_dict(self, include=[]):
206-
""" Convert a Container tree to a nested dict """
207-
if self.message is None:
208-
raise ValueError('Containers with None messages are not supported!')
217+
""" Convert a Container tree to a nested dict
218+
"""
219+
if 'message' not in self:
220+
raise ValueError('This method is currently valid with email threading, '
221+
'please overwrite it for other applications')
222+
223+
if self['message'] is None:
209224
raise ValueError('Containers with None messages are not supported:!\n'\
210225
' this: {}'.format(self))
211226

212-
res = {'id': self.message.message_idx}
227+
res = {'id': self['message'].message_idx}
213228

214229
for key in include:
215-
res[key] = getattr(self.message, key)
230+
res[key] = getattr(self['message'], key)
216231

217232
if self.parent is not None:
218-
if self.parent.message is not None:
219-
res['parent'] = self.parent.message.message_idx
233+
if self.parent['message'] is not None:
234+
res['parent'] = self.parent['message'].message_idx
220235
else:
221236
raise ValueError('Containers with None messages are not supported:!\n'\
222237
' this: {}\n parent: {}'.format(self, self.parent))
@@ -324,10 +339,10 @@ def prune_container(container):
324339
for child in new_children:
325340
container.add_child(child)
326341

327-
if container.message is None and not len(container.children):
342+
if container.get('message') is None and not len(container.children):
328343
# step 4 (a) - nuke empty containers
329344
return []
330-
elif container.message is None and (
345+
elif container.get('message') is None and (
331346
len(container.children) == 1 or container.parent is not None):
332347
# step 4 (b) - promote children
333348
children = container.children[:]
@@ -356,10 +371,10 @@ def sort_threads(threads, key='message_idx', missing=-1, reverse=False):
356371

357372
def _sort_func(el):
358373

359-
if el.message is None:
374+
if el.get('message') is None:
360375
val = missing
361376
else:
362-
val = getattr(el.message, key)
377+
val = getattr(el.get('message'), key)
363378
if val is None:
364379
val = missing
365380
return val
@@ -398,10 +413,10 @@ def thread(messages, group_by_subject=True):
398413
# step one (a)
399414
this_container = id_table.get(msg.message_id, None)
400415
if this_container is not None:
401-
this_container.message = msg
416+
this_container['message'] = msg
402417
else:
403-
this_container = Container()
404-
this_container.message = msg
418+
this_container = Container(message=None)
419+
this_container['message'] = msg
405420
id_table[msg.message_id] = this_container
406421

407422
# step one (b)
@@ -410,7 +425,7 @@ def thread(messages, group_by_subject=True):
410425
## print "Processing reference for "+repr(msg.message_id)+": "+repr(ref)
411426
container = id_table.get(ref, None)
412427
if container is None:
413-
container = Container()
428+
container = Container(message=None)
414429
id_table[ref] = container
415430

416431
if prev is not None:
@@ -466,10 +481,10 @@ def thread(messages, group_by_subject=True):
466481
# step five - group root set by subject
467482
subject_table = OrderedDict()
468483
for container in root_set:
469-
if container.message:
470-
subj = container.message.subject
484+
if container['message']:
485+
subj = container['message'].subject
471486
else:
472-
subj = container.children[0].message.subject
487+
subj = container.children[0]['message'].subject
473488

474489
subj = SUBJECT_RE.sub('', subj)
475490
if subj == '':
@@ -478,19 +493,19 @@ def thread(messages, group_by_subject=True):
478493
existing = subject_table.get(subj, None)
479494
if (existing is None or
480495
(existing.message is not None and
481-
container.message is None) or
496+
container.get('message') is None) or
482497
(existing.message is not None and
483-
container.message is not None and
484-
len(existing.message.subject) > len(container.message.subject))):
498+
container.get('message') is not None and
499+
len(existing.message.subject) > len(container['message'].subject))):
485500
subject_table[subj] = container
486501

487502

488503
# step five (c)
489504
for container in root_set:
490-
if container.message:
491-
subj = container.message.subject
505+
if container['message']:
506+
subj = container['message'].subject
492507
else:
493-
subj = container.children[0].message.subject
508+
subj = container.children[0]['message'].subject
494509

495510
subj = SUBJECT_RE.sub('', subj)
496511
ctr = subject_table.get(subj)
@@ -506,14 +521,14 @@ def thread(messages, group_by_subject=True):
506521
ctr.add_child(container)
507522
else:
508523
container.add_child(ctr)
509-
elif len(ctr.message.subject) < len(container.message.subject):
524+
elif len(ctr.message.subject) < len(container['message'].subject):
510525
# ctr has fewer levels of 're:' headers
511526
ctr.add_child(container)
512-
elif len(ctr.message.subject) > len(container.message.subject):
527+
elif len(ctr.message.subject) > len(container['message'].subject):
513528
# container has fewer levels of 're:' headers
514529
container.add_child(ctr)
515530
else:
516-
new = Container()
531+
new = Container(message=None)
517532
new.add_child(ctr)
518533
new.add_child(container)
519534
subject_table[subj] = new

jwzthreading/tests/test_jwz.py

Lines changed: 29 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,16 @@ def test_container():
5454
c2.add_child(c3)
5555
assert c3.parent == c2
5656

57+
def test_double_container():
58+
c = Container()
59+
60+
c2 = Container()
61+
c3 = Container()
62+
c.add_child(c2)
63+
c.add_child(c3)
64+
assert c.has_descendant(c2)
65+
assert c.has_descendant(c3)
66+
5767
def test_deep_container():
5868
"""Build a 100-deep list of nested Containers."""
5969

@@ -78,7 +88,6 @@ def test_deep_container():
7888
assert L[-1].depth == N
7989

8090
assert L[-1].root == L[0]
81-
8291

8392

8493
def test_unique():
@@ -133,7 +142,7 @@ def test_prune_empty():
133142
def test_prune_promote():
134143
p = Container()
135144
c1 = Container()
136-
c1.message = Message()
145+
c1['message'] = Message()
137146
p.add_child(c1)
138147
assert prune_container(p) == [c1]
139148

@@ -186,18 +195,18 @@ def test_sorting():
186195
d = thread([m2, m1, m3], group_by_subject=False)
187196

188197
d_s = sort_threads(d, key='message_id', missing=-1)
189-
assert d_s[0].message.message_id is None
190-
assert d_s[1].message.message_id == 1
198+
assert d_s[0]['message'].message_id is None
199+
assert d_s[1]['message'].message_id == 1
191200
d_s = sort_threads(d, key='subject', missing='z')
192-
assert d_s[0].message.message_id == 2
193-
assert d_s[1].message.message_id == 1
201+
assert d_s[0]['message'].message_id == 2
202+
assert d_s[1]['message'].message_id == 1
194203

195204
def test_thread_single():
196205
"""Thread a single message."""
197206
m = Message(None)
198207
m.subject = m.message_id = 'Single'
199208
d = thread([m])
200-
assert d[0].message == m
209+
assert d[0]['message'] == m
201210

202211
def test_thread_unrelated():
203212
"""Thread two unconnected messages."""
@@ -206,9 +215,9 @@ def test_thread_unrelated():
206215
m2 = Message(None)
207216
m2.subject = m2.message_id = 'Second'
208217
d = thread([m1, m2], group_by_subject=False)
209-
assert d[0].message == m1
218+
assert d[0]['message'] == m1
210219
assert d[1].children == []
211-
assert d[1].message == m2
220+
assert d[1]['message'] == m2
212221

213222
def test_thread_two():
214223
"""Thread two messages together."""
@@ -218,9 +227,9 @@ def test_thread_two():
218227
m2.subject = m2.message_id = 'Second'
219228
m2.references = ['First']
220229
d = thread([m1, m2])
221-
assert d[0].message == m1
230+
assert d[0]['message'] == m1
222231
assert len(d[0].children) == 1
223-
assert d[0].children[0].message == m2
232+
assert d[0].children[0]['message'] == m2
224233

225234
def test_thread_two_reverse():
226235
"Thread two messages together, with the child message listed first."
@@ -230,9 +239,9 @@ def test_thread_two_reverse():
230239
m2.subject = m2.message_id = 'Second'
231240
m2.references = ['First']
232241
d = thread([m2, m1], group_by_subject=False)
233-
assert d[0].message == m1
242+
assert d[0]['message'] == m1
234243
assert len(d[0].children) == 1
235-
assert d[0].children[0].message == m2
244+
assert d[0].children[0]['message'] == m2
236245

237246
def test_thread_lying_message():
238247
"Thread three messages together, with other messages lying in their references."
@@ -254,11 +263,11 @@ def test_thread_lying_message():
254263
#lying_after_m.references = ['Dummy parent','Third', 'Second', 'First']
255264
d = thread([dummy_parent_m, lying_before_m,
256265
m1, m2, m3, lying_after_m], group_by_subject=False)
257-
assert d[1].message == m1
266+
assert d[1]['message'] == m1
258267
assert len(d[1].children) == 1
259-
assert d[1].children[0].message == m2
268+
assert d[1].children[0]['message'] == m2
260269
assert len(d[1].children[0].children) == 1
261-
assert d[1].children[0].children[0].message == m3
270+
assert d[1].children[0].children[0]['message'] == m3
262271

263272
def test_thread_two_missing_parent():
264273
"Thread two messages, both children of a missing parent."
@@ -271,14 +280,14 @@ def test_thread_two_missing_parent():
271280
m2.message_id = 'Second'
272281
m2.references = ['parent']
273282
d = thread([m1, m2])
274-
assert d[0].message == None
283+
assert d[0]['message'] == None
275284
assert len(d[0].children) == 2
276-
assert d[0].children[0].message == m1
285+
assert d[0].children[0]['message'] == m1
277286
assert d[0].size == 3
278287

279288
# check that collapsing the empty container works
280289
container = d[0].collapse_empty()
281290
assert container.size == 2
282-
assert container.message is not None
283-
assert container.message.message_id == 'First'
291+
assert container['message'] is not None
292+
assert container['message'].message_id == 'First'
284293
assert container.parent is None

0 commit comments

Comments
 (0)