diff --git a/.idea/.gitignore b/.idea/.gitignore
new file mode 100644
index 0000000..73f69e0
--- /dev/null
+++ b/.idea/.gitignore
@@ -0,0 +1,8 @@
+# Default ignored files
+/shelf/
+/workspace.xml
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml
+# Editor-based HTTP Client requests
+/httpRequests/
diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml
new file mode 100644
index 0000000..105ce2d
--- /dev/null
+++ b/.idea/inspectionProfiles/profiles_settings.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
new file mode 100644
index 0000000..30621ef
--- /dev/null
+++ b/.idea/misc.xml
@@ -0,0 +1,7 @@
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/modules.xml b/.idea/modules.xml
new file mode 100644
index 0000000..9bb3bd1
--- /dev/null
+++ b/.idea/modules.xml
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/python-tf-idf.iml b/.idea/python-tf-idf.iml
new file mode 100644
index 0000000..0f9228c
--- /dev/null
+++ b/.idea/python-tf-idf.iml
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 0000000..94a25f7
--- /dev/null
+++ b/.idea/vcs.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/README.md b/README.md
index 24e7181..0a20b60 100644
--- a/README.md
+++ b/README.md
@@ -29,6 +29,10 @@ table.add_document("baz", ["kilo", "lima", "mike", "november"])
print table.similarities(["alpha", "bravo", "charlie"]) # => [['foo', 0.6875], ['bar', 0.75], ['baz', 0.0]]
```
+To return only the top n results, sorted from highest to lowest score:
+```python
+print table.similarities(["alpha", "bravo", "charlie"], top_n = 2) # => [['bar', 0.75], ['foo', 0.6875]]
+```
### Run the tests
diff --git a/test_tfidf.py b/test_tfidf.py
index 60af027..90118a4 100644
--- a/test_tfidf.py
+++ b/test_tfidf.py
@@ -10,9 +10,9 @@ def test_similarity(self):
table.add_document("baz", ["k", "l", "m", "n"])
self.assertEqual(
- table.similarities(["a", "b", "c"]),
- [["foo", 0.6875], ["bar", 0.75], ["baz", 0.0]])
+ table.similarities(["a", "b", "c"],2),
+ [["bar", 0.75],["foo", 0.6875]])
if __name__ == "__main__":
- unittest.main()
+ unittest.main()
\ No newline at end of file
diff --git a/tfidf.py b/tfidf.py
index 699c672..580ef3d 100755
--- a/tfidf.py
+++ b/tfidf.py
@@ -36,12 +36,12 @@ def add_document(self, doc_name, list_of_words):
# add the normalized document to the corpus
self.documents.append([doc_name, doc_dict])
- def similarities(self, list_of_words):
- """Returns a list of all the [docname, similarity_score] pairs relative to a
-list of words.
-
+ def similarities(self, list_of_words, top_n=-1):
+ """
+ Returns a list of all the [docname, similarity_score] pairs relative to a
+ list of words.
+ Set top_n to a positive integer to get only the top_n highest-scoring pairs, sorted by descending score; otherwise all pairs are returned without sorting.
"""
-
# building the query dictionary
query_dict = {}
for w in list_of_words:
@@ -63,4 +63,17 @@ def similarities(self, list_of_words):
doc_dict[k] / self.corpus_dict[k])
sims.append([doc[0], score])
+ # sorting and returning the top n results
+ if top_n > 0:
+ sims.sort(key=self.__get_score, reverse=True)
+ return sims[:top_n]
+
return sims
+
+ def __get_score(self, sim_obj):
+ """
+ Sort-key helper: takes a [docname, similarity_score] pair and returns the
+ similarity_score (element 1); used by similarities() to sort when top_n > 0.
+ """
+ return sim_obj[1]
+