From c67b7838adb3af2bc5a487c03c80e5abddeb8d77 Mon Sep 17 00:00:00 2001 From: James Tauber Date: Wed, 7 Feb 2024 04:47:31 -0500 Subject: [PATCH] simpler way of adding a list of terms fixes #25 --- README.md | 17 +++++++++++++++++ termdoc/htdm.py | 4 ++++ tests.py | 10 ++++++++++ 3 files changed, 31 insertions(+) diff --git a/README.md b/README.md index d5bd66b..97fc7bb 100644 --- a/README.md +++ b/README.md @@ -98,6 +98,23 @@ And here is an example with a two-level hierarchy: Note that if the `count` is `1` you can omit it. +Entire lists of tokens can be added for a particular address in one go using `add(address, term_list)`: + +```python +>>> import termdoc +>>> c = termdoc.HTDM() +>>> c.add("1.1", ["foo", "bar", "bar", "baz"]) +>>> c.add("1.2", ["foo", "foo"]) +>>> c.get_counts()["bar"] +2 +>>> c.get_counts()["foo"] +3 +>>> c.get_counts("1.2")["foo"] +2 + +``` + + You can **prune** a HTDM to just `n` levels with the method `prune(n)`. You can iterate over the document-term counts at the leaves of the HTDM with the method `leaf_entries()` (this returns a generator yielding `(document_address, term, count)` tuples). This is effectively a traditional TDM (the document IDs will still reflect the hierarchy but the aggregate counts aren't present). diff --git a/termdoc/htdm.py b/termdoc/htdm.py index ca87dba..de6ef2e 100644 --- a/termdoc/htdm.py +++ b/termdoc/htdm.py @@ -42,6 +42,10 @@ def increment_count(self, address, term, count=1): address = self.address_sep.join(address.split(self.address_sep)[:-1]) first = False + def add(self, address, term_list): + for term in term_list: + self.increment_count(address, term) + def load(self, filename, field_sep="\t", address_sep=None, prefix=None): address_sep = address_sep or self.address_sep with open(filename) as f: diff --git a/tests.py b/tests.py index 573b543..9f0cbc5 100755 --- a/tests.py +++ b/tests.py @@ -336,6 +336,16 @@ def test_two_arg_increment_count(self): self.assertEqual(c.get_counts()["foo"], 3) self.assertEqual(c.get_counts()["bar"], 3) + def test_add(self): + import termdoc + + c = termdoc.HTDM() + c.add("1", ["foo", "bar", "bar", "baz"]) + c.add("2", ["foo", "foo", "bar"]) + self.assertEqual(c.get_counts()["foo"], 3) + self.assertEqual(c.get_counts("2")["foo"], 2) + self.assertEqual(c.get_counts("1")["bar"], 2) + if __name__ == "__main__": unittest.main()