Skip to content
This repository was archived by the owner on May 18, 2023. It is now read-only.
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions .idea/.gitignore

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions .idea/inspectionProfiles/profiles_settings.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 7 additions & 0 deletions .idea/misc.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 8 additions & 0 deletions .idea/modules.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 8 additions & 0 deletions .idea/python-tf-idf.iml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions .idea/vcs.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,10 @@ table.add_document("baz", ["kilo", "lima", "mike", "november"])

print table.similarities(["alpha", "bravo", "charlie"]) # => [['foo', 0.6875], ['bar', 0.75], ['baz', 0.0]]
```
To return only the top n results, sorted from highest to lowest similarity score, pass `top_n`:
```python
print table.similarities(["alpha", "bravo", "charlie"], top_n = 2) # => [['bar', 0.75], ['foo', 0.6875]]
```

### Run the tests

Expand Down
6 changes: 3 additions & 3 deletions test_tfidf.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,9 @@ def test_similarity(self):
table.add_document("baz", ["k", "l", "m", "n"])

self.assertEqual(
table.similarities(["a", "b", "c"]),
[["foo", 0.6875], ["bar", 0.75], ["baz", 0.0]])
table.similarities(["a", "b", "c"],2),
[["bar", 0.75],["foo", 0.6875]])


if __name__ == "__main__":
unittest.main()
unittest.main()
23 changes: 18 additions & 5 deletions tfidf.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,12 +36,12 @@ def add_document(self, doc_name, list_of_words):
# add the normalized document to the corpus
self.documents.append([doc_name, doc_dict])

def similarities(self, list_of_words):
"""Returns a list of all the [docname, similarity_score] pairs relative to a
list of words.

def similarities(self, list_of_words, top_n=-1):
"""
Returns a list of all the [docname, similarity_score] pairs relative to a
list of words.
set top_n to any positive integer value to get the top n (set by the user) results.
"""

# building the query dictionary
query_dict = {}
for w in list_of_words:
Expand All @@ -63,4 +63,17 @@ def similarities(self, list_of_words):
doc_dict[k] / self.corpus_dict[k])
sims.append([doc[0], score])

# sorting and returning the top n results
if top_n > 0:
sims.sort(key=self.__get_score, reverse=True)
return sims[:top_n]

return sims

def __get_score(self, sim_obj):
    """Return the similarity score from a ``[docname, similarity_score]`` pair.

    Private helper: serves as the ``key`` function when ``similarities``
    sorts its results for the ``top_n`` option.
    """
    # The pair layout is [docname, similarity_score]; the score is at index 1.
    score = sim_obj[1]
    return score