diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..73f69e0 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,8 @@ +# Default ignored files +/shelf/ +/workspace.xml +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml +# Editor-based HTTP Client requests +/httpRequests/ diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml new file mode 100644 index 0000000..105ce2d --- /dev/null +++ b/.idea/inspectionProfiles/profiles_settings.xml @@ -0,0 +1,6 @@ + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000..30621ef --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,7 @@ + + + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..9bb3bd1 --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/python-tf-idf.iml b/.idea/python-tf-idf.iml new file mode 100644 index 0000000..0f9228c --- /dev/null +++ b/.idea/python-tf-idf.iml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..94a25f7 --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/README.md b/README.md index 24e7181..0a20b60 100644 --- a/README.md +++ b/README.md @@ -29,6 +29,10 @@ table.add_document("baz", ["kilo", "lima", "mike", "november"]) print table.similarities(["alpha", "bravo", "charlie"]) # => [['foo', 0.6875], ['bar', 0.75], ['baz', 0.0]] ``` +To show top n results: +```python +print table.similarities(["alpha", "bravo", "charlie"], top_n = 2) # => [['bar', 0.75], ['foo', 0.6875]] +``` ### Run the tests diff --git a/test_tfidf.py b/test_tfidf.py index 60af027..90118a4 100644 --- a/test_tfidf.py +++ b/test_tfidf.py @@ -10,9 +10,9 @@ def test_similarity(self): table.add_document("baz", ["k", "l", "m", "n"]) self.assertEqual( - table.similarities(["a", "b", "c"]), - [["foo", 0.6875], ["bar", 0.75], ["baz", 0.0]]) + table.similarities(["a", "b", "c"],2), + [["bar", 0.75],["foo", 0.6875]]) if __name__ == "__main__": - unittest.main() + unittest.main() \ No newline at end of file diff --git a/tfidf.py b/tfidf.py index 699c672..580ef3d 100755 --- a/tfidf.py +++ b/tfidf.py @@ -36,12 +36,12 @@ def add_document(self, doc_name, list_of_words): # add the normalized document to the corpus self.documents.append([doc_name, doc_dict]) - def similarities(self, list_of_words): - """Returns a list of all the [docname, similarity_score] pairs relative to a -list of words. - + def similarities(self, list_of_words, top_n=-1): + """ + Returns a list of all the [docname, similarity_score] pairs relative to a + list of words. + set top_n to any positive integer value to get the top n (set by the user) results. """ - # building the query dictionary query_dict = {} for w in list_of_words: @@ -63,4 +63,17 @@ def similarities(self, list_of_words): doc_dict[k] / self.corpus_dict[k]) sims.append([doc[0], score]) + # sorting and returning the top n results + if top_n > 0: + sims.sort(key=self.__get_score, reverse=True) + return sims[:top_n] + return sims + + def __get_score(self, sim_obj): + """ + takes [docname, similarity_score] and returns similarity_score + private method used for top_n sorting + """ + return sim_obj[1] +