From a2e3ea5b6f371316696bf7e9ce30f237254d077d Mon Sep 17 00:00:00 2001
From: PrimozGodec
Date: Mon, 24 Apr 2023 10:40:46 +0200
Subject: [PATCH] Corpus - Fix contexts to be compatible between sessions
---
orangecontrib/text/widgets/owcorpus.py | 11 +++++++++--
1 file changed, 9 insertions(+), 2 deletions(-)
diff --git a/orangecontrib/text/widgets/owcorpus.py b/orangecontrib/text/widgets/owcorpus.py
index 1ee994f92..c6e51c48d 100644
--- a/orangecontrib/text/widgets/owcorpus.py
+++ b/orangecontrib/text/widgets/owcorpus.py
@@ -1,4 +1,6 @@
+import hashlib
import os
+from typing import List
import numpy as np
from AnyQt.QtCore import Qt
@@ -51,7 +53,7 @@ def open_context(self, widget, corpus):
def new_context(self, corpus, attributes, metas):
"""Adding hash of documents to the context"""
context = super().new_context(corpus, attributes, metas)
- context.documents_hash = hash(tuple(corpus.documents))
+ context.documents_hash = self.__compute_hash(corpus.documents)
context.language = corpus.language
return context
@@ -62,7 +64,7 @@ def match(self, context, corpus, attrs, metas):
"""
if (
hasattr(context, "documents_hash")
- and context.documents_hash != hash(tuple(corpus.documents))
+ and context.documents_hash != self.__compute_hash(corpus.documents)
or hasattr(context, "language")
and context.language != corpus.language
):
@@ -73,6 +75,22 @@ def decode_setting(self, setting, value, corpus=None, *args):
"""Modifying decode setting to work with Corpus instead of domain"""
return super().decode_setting(setting, value, corpus.domain, *args)
+    @staticmethod
+    def __compute_hash(texts: List[str]) -> int:
+        """Return a digest of ``texts`` that is stable between sessions.
+
+        Each document is fed to MD5 preceded by its encoded byte length, so
+        the boundaries between documents contribute to the result: with a
+        plain space-join, ["a b"] and ["a", "b"] would hash identically.
+        """
+        digest = hashlib.md5()
+        for text in texts:
+            encoded = text.encode("utf-8")
+            # Length prefix makes the byte stream unambiguous to split.
+            digest.update(b"%d:" % len(encoded))
+            digest.update(encoded)
+        return int(digest.hexdigest(), 16)
+
class OWCorpus(OWWidget, ConcurrentWidgetMixin):
name = "Corpus"