From a2e3ea5b6f371316696bf7e9ce30f237254d077d Mon Sep 17 00:00:00 2001
From: PrimozGodec
Date: Mon, 24 Apr 2023 10:40:46 +0200
Subject: [PATCH] Corpus - Fix contexts to be compatible between sessions

---
 orangecontrib/text/widgets/owcorpus.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/orangecontrib/text/widgets/owcorpus.py b/orangecontrib/text/widgets/owcorpus.py
index 1ee994f92..c6e51c48d 100644
--- a/orangecontrib/text/widgets/owcorpus.py
+++ b/orangecontrib/text/widgets/owcorpus.py
@@ -1,4 +1,6 @@
+import hashlib
 import os
+from typing import List
 
 import numpy as np
 from AnyQt.QtCore import Qt
@@ -51,7 +53,7 @@ def open_context(self, widget, corpus):
     def new_context(self, corpus, attributes, metas):
         """Adding hash of documents to the context"""
         context = super().new_context(corpus, attributes, metas)
-        context.documents_hash = hash(tuple(corpus.documents))
+        context.documents_hash = self.__compute_hash(corpus.documents)
         context.language = corpus.language
         return context
 
@@ -62,7 +64,7 @@ def match(self, context, corpus, attrs, metas):
         """
         if (
             hasattr(context, "documents_hash")
-            and context.documents_hash != hash(tuple(corpus.documents))
+            and context.documents_hash != self.__compute_hash(corpus.documents)
             or hasattr(context, "language")
             and context.language != corpus.language
         ):
@@ -73,6 +75,11 @@ def decode_setting(self, setting, value, corpus=None, *args):
         """Modifying decode setting to work with Corpus instead of domain"""
         return super().decode_setting(setting, value, corpus.domain, *args)
 
+    @staticmethod
+    def __compute_hash(texts: List[str]) -> int:
+        texts = " ".join(texts)
+        return int(hashlib.md5(texts.encode("utf-8")).hexdigest(), 16)
+
 
 class OWCorpus(OWWidget, ConcurrentWidgetMixin):
     name = "Corpus"