TimSchopf · hossein-khalilian · Feb 13, 2023
diff --git a/keyphrase_vectorizers/keyphrase_count_vectorizer.py b/keyphrase_vectorizers/keyphrase_count_vectorizer.py
@@ -145,6 +145,36 @@ def __init__(self, spacy_pipeline: Union[str, spacy.Language] = 'en_core_web_sm'
         self.binary = binary
         self.dtype = dtype
 
+    def remove_stopwords(self, text: str) -> str:
+        """
+        Remove every stop word from a text.
+
+        The text is split on single spaces and each resulting word that is contained in
+        `self.stop_words` is dropped. The remaining words are joined with single spaces again.
+
+        Parameters
+        ----------
+        text : str
+            The text to remove the stop words from.
+
+        Returns
+        -------
+        text : str
+            The text without stop words. If `self.stop_words` is None or empty, the text is returned unchanged.
+        """
+
+        stop_words = self.stop_words
+        if not stop_words:
+            return text
+
+        # NOTE(review): if `self.stop_words` is a plain string (e.g. a language name), the membership
+        # test below is a substring match instead of a word lookup - confirm which type callers set
+        if not isinstance(stop_words, str):
+            # build the lookup set once instead of scanning the whole collection for every word
+            stop_words = frozenset(stop_words)
+
+        return ' '.join(word for word in text.split(' ') if word not in stop_words)
+
     def fit(self, raw_documents: List[str]) -> object:
         """
         Learn the keyphrases that match the defined part-of-speech pattern from the list of raw documents.
@@ -170,7 +175,15 @@ def fit(self, raw_documents: List[str]) -> object:
 
-        # remove keyphrases that have more than 8 words, as they are probably no real keyphrases
+        # remove keyphrases that have more than 5 words, as they are probably no real keyphrases
         # additionally this prevents memory issues during transformation to a document-keyphrase matrix
-        self.keyphrases = [keyphrase for keyphrase in self.keyphrases if len(keyphrase.split()) <= 8]
+        self.keyphrases = [keyphrase for keyphrase in self.keyphrases if len(keyphrase.split()) <= 5]
+
+        # strip stop words from each keyphrase individually, so that a keyphrase consisting only of
+        # stop words becomes an empty string that can be dropped instead of corrupting its neighbours
+        if self.stop_words is not None:
+            self.keyphrases = [self.remove_stopwords(keyphrase) for keyphrase in self.keyphrases]
+
+        # drop empty keyphrases and duplicates; sorting keeps the keyphrase order deterministic
+        self.keyphrases = sorted({keyphrase for keyphrase in self.keyphrases if keyphrase})
 
         # compute document frequencies of keyphrases
         if self.max_df or self.min_df: