From a58f089e295f2b717086075385e69237d1a9cb86 Mon Sep 17 00:00:00 2001 From: prabhav-pandya Date: Sun, 8 Nov 2020 12:39:44 +0530 Subject: [PATCH 1/5] Added top_n method --- test_tfidf.py | 6 +++--- tfidf.py | 23 ++++++++++++++++++----- 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/test_tfidf.py b/test_tfidf.py index 60af027..90118a4 100644 --- a/test_tfidf.py +++ b/test_tfidf.py @@ -10,9 +10,9 @@ def test_similarity(self): table.add_document("baz", ["k", "l", "m", "n"]) self.assertEqual( - table.similarities(["a", "b", "c"]), - [["foo", 0.6875], ["bar", 0.75], ["baz", 0.0]]) + table.similarities(["a", "b", "c"],2), + [["bar", 0.75],["foo", 0.6875]]) if __name__ == "__main__": - unittest.main() + unittest.main() \ No newline at end of file diff --git a/tfidf.py b/tfidf.py index 699c672..580ef3d 100755 --- a/tfidf.py +++ b/tfidf.py @@ -36,12 +36,12 @@ def add_document(self, doc_name, list_of_words): # add the normalized document to the corpus self.documents.append([doc_name, doc_dict]) - def similarities(self, list_of_words): - """Returns a list of all the [docname, similarity_score] pairs relative to a -list of words. - + def similarities(self, list_of_words, top_n=-1): + """ + Returns a list of all the [docname, similarity_score] pairs relative to a + list of words. + set top_n to any positive integer value to get the top n (set by the user) results. """ - # building the query dictionary query_dict = {} for w in list_of_words: @@ -63,4 +63,17 @@ def similarities(self, list_of_words): doc_dict[k] / self.corpus_dict[k]) sims.append([doc[0], score]) + # sorting and returning the top n results + if top_n > 0: + sims.sort(key=self.__get_score, reverse=True) + return sims[:top_n] + return sims + + def __get_score(self, sim_obj): + """ + takes [docname, similarity_score] and returns similarity_score + private method used for top_n sorting + """ + return sim_obj[1] + From 27fc46f53120f77e3f96204cf63dd9cd7493e9da Mon Sep 17 00:00:00 2001 From: prabhav-pandya Date: Sun, 8 Nov 2020 12:42:45 +0530 Subject: [PATCH 2/5] Added top_n method --- .idea/.gitignore | 8 ++++++++ .idea/inspectionProfiles/profiles_settings.xml | 6 ++++++ .idea/misc.xml | 7 +++++++ .idea/modules.xml | 8 ++++++++ .idea/python-tf-idf.iml | 8 ++++++++ .idea/vcs.xml | 6 ++++++ 6 files changed, 43 insertions(+) create mode 100644 .idea/.gitignore create mode 100644 .idea/inspectionProfiles/profiles_settings.xml create mode 100644 .idea/misc.xml create mode 100644 .idea/modules.xml create mode 100644 .idea/python-tf-idf.iml create mode 100644 .idea/vcs.xml diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..73f69e0 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,8 @@ +# Default ignored files +/shelf/ +/workspace.xml +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml +# Editor-based HTTP Client requests +/httpRequests/ diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml new file mode 100644 index 0000000..105ce2d --- /dev/null +++ b/.idea/inspectionProfiles/profiles_settings.xml @@ -0,0 +1,6 @@ + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000..30621ef --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,7 @@ + + + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..9bb3bd1 --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/python-tf-idf.iml b/.idea/python-tf-idf.iml new file mode 100644 index 0000000..0f9228c --- /dev/null +++ b/.idea/python-tf-idf.iml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..94a25f7 --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file From 82c8684a21493f8f35472641350fa75148c929c4 Mon Sep 17 00:00:00 2001 From: Prabhav Pandya <62668380+prabhav-pandya@users.noreply.github.com> Date: Sun, 8 Nov 2020 12:47:33 +0530 Subject: [PATCH 3/5] Added command for top n results --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index 24e7181..472bb17 100644 --- a/README.md +++ b/README.md @@ -29,6 +29,9 @@ table.add_document("baz", ["kilo", "lima", "mike", "november"]) print table.similarities(["alpha", "bravo", "charlie"]) # => [['foo', 0.6875], ['bar', 0.75], ['baz', 0.0]] ``` +To show top n results: +```print table.similarities(["alpha", "bravo", "charlie"], top_n = 2) # => [['bar', 0.75], ['foo', 0.6875]] +``` ### Run the tests From 06684910b860c696dfe978ff6531c8280d10860a Mon Sep 17 00:00:00 2001 From: Prabhav Pandya <62668380+prabhav-pandya@users.noreply.github.com> Date: Sun, 8 Nov 2020 12:48:13 +0530 Subject: [PATCH 4/5] Added command for top_n results --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 472bb17..3c40f7d 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,8 @@ table.add_document("baz", ["kilo", "lima", "mike", "november"]) print table.similarities(["alpha", "bravo", "charlie"]) # => [['foo', 0.6875], ['bar', 0.75], ['baz', 0.0]] ``` To show top n results: -```print table.similarities(["alpha", "bravo", "charlie"], top_n = 2) # => [['bar', 0.75], ['foo', 0.6875]] +``` +print table.similarities(["alpha", "bravo", "charlie"], top_n = 2) # => [['bar', 0.75], ['foo', 0.6875]] ``` ### Run the tests From 76ab06e410d7ace27c02cbc905ee6ac49bcfdcee Mon Sep 17 00:00:00 2001 From: Prabhav Pandya <62668380+prabhav-pandya@users.noreply.github.com> Date: Sun, 8 Nov 2020 12:48:52 +0530 Subject: [PATCH 5/5] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 3c40f7d..0a20b60 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,7 @@ table.add_document("baz", ["kilo", "lima", "mike", "november"]) print table.similarities(["alpha", "bravo", "charlie"]) # => [['foo', 0.6875], ['bar', 0.75], ['baz', 0.0]] ``` To show top n results: -``` +```python print table.similarities(["alpha", "bravo", "charlie"], top_n = 2) # => [['bar', 0.75], ['foo', 0.6875]] ```