[kbss-cvut/termit-ui#587] Reuse existing approved term occurrences (that were created manually) when processing annotation results.
kbss-cvut · Dec 10, 2024 · 5ad5163 · 5ad5163
1 parent 3e6aae7
commit 5ad5163
Show file tree

Hide file tree

Showing 6 changed files with 139 additions and 24 deletions.
diff --git a/src/main/java/cz/cvut/kbss/termit/model/assignment/TermOccurrence.java b/src/main/java/cz/cvut/kbss/termit/model/assignment/TermOccurrence.java
@@ -29,6 +29,7 @@
 import cz.cvut.kbss.jopa.vocabulary.DC;
 import cz.cvut.kbss.termit.model.AbstractEntity;
 import cz.cvut.kbss.termit.model.util.HasTypes;
+import cz.cvut.kbss.termit.util.Constants;
 import cz.cvut.kbss.termit.util.Vocabulary;
 import jakarta.validation.constraints.NotNull;
 
@@ -136,6 +137,12 @@ public void setElementAbout(String elementAbout) {
         this.elementAbout = elementAbout;
     }
 
+    /**
+     * Derives the {@code elementAbout} value from the identifier of this occurrence, stores it and returns it.
+     * <p>
+     * The value consists of {@link Constants#BNODE_PREFIX} followed by the part of the occurrence URI after its last
+     * slash.
+     *
+     * @return The resolved (and stored) element about value
+     */
+    public String resolveElementAbout() {
+        final String identifier = getUri().toString();
+        final String localPart = identifier.substring(identifier.lastIndexOf('/') + 1);
+        this.elementAbout = Constants.BNODE_PREFIX + localPart;
+        return this.elementAbout;
+    }
+
     /**
      * Marks this term occurrence as suggested by automation.
      * <p>

diff --git a/src/main/java/cz/cvut/kbss/termit/service/document/TermOccurrenceResolver.java b/src/main/java/cz/cvut/kbss/termit/service/document/TermOccurrenceResolver.java
@@ -29,6 +29,7 @@
 
 import java.io.InputStream;
 import java.net.URI;
+import java.util.ArrayList;
 import java.util.Collections;
 import java.util.List;
 
@@ -39,7 +40,7 @@ public abstract class TermOccurrenceResolver {
 
     protected final TermRepositoryService termService;
 
-    protected List<TermOccurrence> existingOccurrences = Collections.emptyList();
+    protected List<TermOccurrence> existingApprovedOccurrences = Collections.emptyList();
 
     protected TermOccurrenceResolver(TermRepositoryService termService) {
         this.termService = termService;
@@ -58,11 +59,14 @@ protected TermOccurrenceResolver(TermRepositoryService termService) {
 
     /**
      * Sets occurrences that already existed on previous analyses.
+     * <p>
+     * The resolver uses only those that are approved.
      *
      * @param existingOccurrences Term occurrences from the previous analysis run
      */
     public void setExistingOccurrences(List<TermOccurrence> existingOccurrences) {
-        this.existingOccurrences = existingOccurrences;
+        // Mutable private copy: HtmlTermOccurrenceResolver removes entries from it once they are matched
+        final List<TermOccurrence> approved = new ArrayList<>(existingOccurrences);
+        approved.removeIf(TermOccurrence::isSuggested);
+        this.existingApprovedOccurrences = approved;
     }
 
     /**

diff --git a/src/main/java/cz/cvut/kbss/termit/service/document/TermOccurrenceSelectorCreator.java b/src/main/java/cz/cvut/kbss/termit/service/document/TermOccurrenceSelectorCreator.java
@@ -10,6 +10,7 @@
 import cz.cvut.kbss.termit.model.selector.Selector;
 import cz.cvut.kbss.termit.service.business.ResourceService;
 import cz.cvut.kbss.termit.service.document.html.HtmlSelectorGenerators;
+import cz.cvut.kbss.termit.util.Constants;
 import jakarta.annotation.Nonnull;
 import org.jsoup.Jsoup;
 import org.jsoup.nodes.Document;
@@ -20,8 +21,6 @@
 import java.util.Objects;
 import java.util.Set;
 
-import static cz.cvut.kbss.termit.service.document.html.HtmlTermOccurrenceResolver.BNODE_PREFIX;
-
 /**
  * Creates selectors for a new term occurrence.
  */
@@ -45,7 +44,7 @@ public TermOccurrenceSelectorCreator(HtmlSelectorGenerators selectorGenerators,
      * Creates selectors for a term occurrence in the specified target represented by an HTML element with the specified
      * id.
      *
-     * @param target    Asset in which the occurrence is to be found
+     * @param target       Asset in which the occurrence is to be found
      * @param elementAbout Value of the {@literal about} attribute of the occurrence element
      * @return Set of generated selectors for the occurrence element
      * @throws UnsupportedOperationException If the specified target is not supported
@@ -59,7 +58,8 @@ public Set<Selector> createSelectors(@Nonnull OccurrenceTarget target, @Nonnull
         }
         final FileOccurrenceTarget ft = (FileOccurrenceTarget) target;
         final Document targetContent = loadTargetContent(ft);
-        final Elements elements = targetContent.select("[about=" + BNODE_PREFIX + elementAbout + "]");
+        final Elements elements = targetContent.select(
+                "[" + Constants.RDFa.ABOUT + "=" + Constants.BNODE_PREFIX + elementAbout + "]");
         if (elements.isEmpty()) {
             throw new SelectorGenerationException("No element with id " + elementAbout + " found in " + ft.getSource());
         }

diff --git a/src/main/java/cz/cvut/kbss/termit/service/document/html/HtmlTermOccurrenceResolver.java b/src/main/java/cz/cvut/kbss/termit/service/document/html/HtmlTermOccurrenceResolver.java
@@ -25,15 +25,19 @@
 import cz.cvut.kbss.termit.model.assignment.TermOccurrence;
 import cz.cvut.kbss.termit.model.resource.File;
 import cz.cvut.kbss.termit.model.selector.Selector;
+import cz.cvut.kbss.termit.model.selector.TextQuoteSelector;
 import cz.cvut.kbss.termit.service.document.DocumentManager;
 import cz.cvut.kbss.termit.service.document.TermOccurrenceResolver;
 import cz.cvut.kbss.termit.service.repository.TermRepositoryService;
 import cz.cvut.kbss.termit.util.Configuration;
 import cz.cvut.kbss.termit.util.Constants;
 import cz.cvut.kbss.termit.util.Utils;
+import cz.cvut.kbss.termit.util.Vocabulary;
 import org.jsoup.Jsoup;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
+import org.jsoup.nodes.Node;
+import org.jsoup.nodes.TextNode;
 import org.jsoup.select.Elements;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -50,6 +54,7 @@
 import java.nio.charset.StandardCharsets;
 import java.util.HashMap;
 import java.util.HashSet;
+import java.util.Iterator;
 import java.util.Map;
 import java.util.Optional;
 import java.util.Set;
@@ -63,13 +68,10 @@
 @Scope(value = ConfigurableBeanFactory.SCOPE_PROTOTYPE)
 public class HtmlTermOccurrenceResolver extends TermOccurrenceResolver {
 
-    /**
-     * Blank node prefix.
-     */
-    public static final String BNODE_PREFIX = "_:";
-
     private static final String SCORE_ATTRIBUTE = "score";
 
+    private static final String ANNOTATION_ELEMENT = "span";
+
     private static final Logger LOG = LoggerFactory.getLogger(HtmlTermOccurrenceResolver.class);
 
     private final HtmlSelectorGenerators selectorGenerators;
@@ -203,6 +205,13 @@ public void findTermOccurrences(OccurrenceConsumer resultConsumer) {
                 }
             });
         }
+        try {
+            addRemainingExistingApprovedOccurrences(resultConsumer);
+        } catch (InterruptedException e) {
+            LOG.error("Thread interrupted while resolving term occurrences.");
+            Thread.currentThread().interrupt();
+            throw new TermItException(e);
+        }
     }
 
     private Optional<TermOccurrence> resolveAnnotation(Element rdfaElem, Asset<?> source) {
@@ -246,8 +255,8 @@ private void verifyTermExists(Element rdfaElem, URI termUri, String termId) {
     private URI resolveOccurrenceId(Element rdfaElem, Asset<?> source) {
+        // Identifier = result of TermOccurrence.resolveContext for the source URI + "/" + value of the about attribute
         final String base = TermOccurrence.resolveContext(source.getUri()) + "/";
         String about = rdfaElem.attr("about");
-        if (about.startsWith(BNODE_PREFIX)) {
-            about = about.substring(BNODE_PREFIX.length());
+        // A leading blank node prefix in the attribute value is not made part of the identifier
+        if (about.startsWith(Constants.BNODE_PREFIX)) {
+            about = about.substring(Constants.BNODE_PREFIX.length());
         }
         return URI.create(base + about);
     }
@@ -256,21 +265,93 @@ private boolean existsApproved(TermOccurrence newOccurrence) {
         final OccurrenceTarget target = newOccurrence.getTarget();
         assert target != null;
         final Set<Selector> selectors = target.getSelectors();
-        for (TermOccurrence to : existingOccurrences) {
+        final Iterator<TermOccurrence> it = existingApprovedOccurrences.iterator();
+        while (it.hasNext()) {
+            final TermOccurrence to = it.next();
             if (!to.getTerm().equals(newOccurrence.getTerm())) {
                 continue;
             }
             final OccurrenceTarget existingTarget = to.getTarget();
             assert existingTarget != null;
             assert existingTarget.getSource().equals(target.getSource());
             // Same term, contains at least one identical selector
-            if (existingTarget.getSelectors().stream().anyMatch(selectors::contains) && !to.isSuggested()) {
+            if (existingTarget.getSelectors().stream().anyMatch(selectors::contains)) {
+                it.remove();
                 return true;
             }
         }
         return false;
     }
 
+    /**
+     * Tries to add the remaining existing approved term occurrences to the content.
+     * <p>
+     * For each occurrence, the matching text is located using its {@link TextQuoteSelector} (as it is more resilient
+     * to minor changes in the content file) and a corresponding annotation element is inserted around the exact match.
+     * <p>
+     * The text is searched for via {@link Element#getElementsContainingText(String)} and
+     * {@link Element#getElementsContainingOwnText(String)} instead of string-built {@code :contains(...)} selectors, so
+     * parentheses, quotes or backslashes in the stored text cannot break or alter the query.
+     * <p>
+     * An occurrence is passed to the consumer (just as a new one would be) only if its annotation element was actually
+     * inserted. Occurrences without a usable text quote selector, or whose text cannot be located, are skipped.
+     *
+     * @param consumer Consumer of the occurrences
+     * @throws InterruptedException Presumably raised by the consumer while accepting an occurrence (NOTE(review):
+     *                              confirm against {@code OccurrenceConsumer})
+     */
+    private void addRemainingExistingApprovedOccurrences(OccurrenceConsumer consumer) throws InterruptedException {
+        LOG.debug("Adding existing approved occurrences to content.");
+        for (TermOccurrence to : existingApprovedOccurrences) {
+            final Optional<TextQuoteSelector> tqSelector = to.getTarget().getSelectors().stream()
+                                                             .filter(TextQuoteSelector.class::isInstance)
+                                                             .map(TextQuoteSelector.class::cast)
+                                                             .findFirst();
+            if (tqSelector.isEmpty()) {
+                LOG.trace("Existing approved occurrence does not have a {}. Skipping it.",
+                          TextQuoteSelector.class.getSimpleName());
+                continue;
+            }
+            final TextQuoteSelector tqs = tqSelector.get();
+            final String exactMatch = tqs.getExactMatch();
+            if (exactMatch == null || exactMatch.isEmpty()) {
+                // An empty needle would match every element and produce an empty annotation
+                LOG.trace("{} has no exact match text. Skipping term occurrence.",
+                          TextQuoteSelector.class.getSimpleName());
+                continue;
+            }
+            final Elements containing = document.getElementsContainingText(
+                    emptyIfNull(tqs.getPrefix()) + exactMatch + emptyIfNull(tqs.getSuffix()));
+            if (containing.isEmpty()) {
+                LOG.trace("{} did not find any matching elements. Skipping term occurrence.",
+                          TextQuoteSelector.class.getSimpleName());
+                continue;
+            }
+            // Last should be the most specific one
+            final Element elem = containing.last();
+            final Element containingExactMatch = elem.getElementsContainingOwnText(exactMatch).first();
+            if (containingExactMatch == null) {
+                // E.g., the exact match is split among several child elements
+                LOG.trace("No element directly contains the exact match text. Skipping term occurrence.");
+                continue;
+            }
+            LOG.debug("Adding existing approved term occurrence {} to content.", to);
+            final Element annotationNode = createAnnotationElement(to, tqs);
+            if (!replaceContentWithAnnotation(containingExactMatch, tqs, annotationNode)) {
+                // The jsoup search above is case-insensitive, the text node search is exact, so they may disagree
+                LOG.trace("Exact match text not found in any text node. Skipping term occurrence.");
+                continue;
+            }
+            consumer.accept(to);
+        }
+    }
+
+    /**
+     * Returns the specified string, or an empty string if it is {@code null}.
+     * <p>
+     * Prevents a missing selector prefix/suffix from being concatenated as the literal {@code "null"}.
+     */
+    private static String emptyIfNull(String value) {
+        return value == null ? "" : value;
+    }
+
+    /**
+     * Creates the annotation element representing the specified term occurrence.
+     * <p>
+     * Note that this resolves (and thus sets) the {@code elementAbout} value of the occurrence.
+     *
+     * @param to  Term occurrence to represent
+     * @param tqs Selector providing the annotated text
+     * @return New annotation element, not yet attached to any document
+     */
+    private static Element createAnnotationElement(TermOccurrence to, TextQuoteSelector tqs) {
+        final Element annotationNode = new Element(ANNOTATION_ELEMENT, "");
+        annotationNode.text(tqs.getExactMatch());
+        annotationNode.attr(Constants.RDFa.ABOUT, to.resolveElementAbout());
+        annotationNode.attr(Constants.RDFa.RESOURCE, to.getTerm().toString());
+        annotationNode.attr(Constants.RDFa.TYPE, Vocabulary.s_c_vyskyt_termu);
+        annotationNode.attr(Constants.RDFa.PROPERTY, Vocabulary.s_p_je_prirazenim_termu);
+        return annotationNode;
+    }
+
+    /**
+     * Replaces the exact match text in one of the direct text nodes of the specified element with the annotation
+     * element, keeping the surrounding text in place.
+     * <p>
+     * A text node containing the whole quote (prefix, exact match, suffix) is preferred, so that the right occurrence
+     * is annotated when the exact match text appears several times. If there is none, the first text node containing
+     * the exact match text is used.
+     *
+     * @param containingExactMatch Element whose direct text nodes are searched
+     * @param tqs                  Selector describing the text to annotate
+     * @param annotationNode       Annotation element to insert
+     * @return {@code true} if the annotation was inserted, {@code false} if no text node contains the exact match
+     */
+    private static boolean replaceContentWithAnnotation(Element containingExactMatch, TextQuoteSelector tqs,
+                                                        Element annotationNode) {
+        final String exactMatch = tqs.getExactMatch();
+        final String prefix = emptyIfNull(tqs.getPrefix());
+        final String quoteWithContext = prefix + exactMatch + emptyIfNull(tqs.getSuffix());
+        TextNode textNode = findOwnTextNodeContaining(containingExactMatch, quoteWithContext);
+        final int exactMatchStart;
+        if (textNode != null) {
+            exactMatchStart = textNode.getWholeText().indexOf(quoteWithContext) + prefix.length();
+        } else {
+            textNode = findOwnTextNodeContaining(containingExactMatch, exactMatch);
+            if (textNode == null) {
+                return false;
+            }
+            exactMatchStart = textNode.getWholeText().indexOf(exactMatch);
+        }
+        final String wholeText = textNode.getWholeText();
+        final int exactMatchEnd = exactMatchStart + exactMatch.length();
+        final TextNode prefixNode = new TextNode(wholeText.substring(0, exactMatchStart));
+        final TextNode suffixNode = new TextNode(wholeText.substring(exactMatchEnd));
+        // Each call inserts right after the original node, hence the reverse order
+        textNode.after(suffixNode);
+        textNode.after(annotationNode);
+        textNode.after(prefixNode);
+        textNode.remove();
+        return true;
+    }
+
+    /**
+     * Finds the first direct child text node of the specified element whose (non-normalized) text contains the
+     * specified string.
+     *
+     * @return Matching text node or {@code null} if there is none
+     */
+    private static TextNode findOwnTextNodeContaining(Element parent, String text) {
+        for (Node n : parent.childNodes()) {
+            if (n instanceof TextNode textNode && textNode.getWholeText().contains(text)) {
+                return textNode;
+            }
+        }
+        return null;
+    }
+
+
     @Override
     public boolean supports(Asset<?> source) {
         if (source instanceof Term) {

diff --git a/src/main/java/cz/cvut/kbss/termit/util/Constants.java b/src/main/java/cz/cvut/kbss/termit/util/Constants.java
@@ -160,8 +160,8 @@ public class Constants {
     public static final int WEBSOCKET_SEND_BUFFER_SIZE_LIMIT = Integer.MAX_VALUE;
 
     /**
-     * Set the maximum time allowed in milliseconds after the WebSocket connection is established
-     * and before the first sub-protocol message is received.
+     * Set the maximum time allowed in milliseconds after the WebSocket connection is established and before the first
+     * sub-protocol message is received.
      */
     public static final int WEBSOCKET_TIME_TO_FIRST_MESSAGE = 15 * 1000 /* 15s */;
 
@@ -170,6 +170,11 @@ public class Constants {
      */
     public static final String DEVELOPMENT_PROFILE = "development";
 
+    /**
+     * Blank node prefix.
+     */
+    public static final String BNODE_PREFIX = "_:";
+
     private Constants() {
         throw new AssertionError();
     }

diff --git a/src/test/java/cz/cvut/kbss/termit/service/document/html/HtmlTermOccurrenceResolverTest.java b/src/test/java/cz/cvut/kbss/termit/service/document/html/HtmlTermOccurrenceResolverTest.java
@@ -20,7 +20,6 @@
 import cz.cvut.kbss.termit.environment.Generator;
 import cz.cvut.kbss.termit.model.Term;
 import cz.cvut.kbss.termit.model.assignment.TermOccurrence;
-import cz.cvut.kbss.termit.model.resource.Document;
 import cz.cvut.kbss.termit.model.resource.File;
 import cz.cvut.kbss.termit.model.selector.Selector;
 import cz.cvut.kbss.termit.model.selector.TextPositionSelector;
@@ -30,6 +29,7 @@
 import cz.cvut.kbss.termit.util.Configuration;
 import cz.cvut.kbss.termit.util.Vocabulary;
 import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
 import org.jsoup.select.Elements;
 import org.junit.jupiter.api.Test;
 import org.junit.jupiter.api.extension.ExtendWith;
@@ -42,6 +42,7 @@
 import java.io.InputStream;
 import java.net.URI;
 import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
 import java.util.List;
 import java.util.Optional;
 import java.util.Set;
@@ -98,13 +99,8 @@ void supportsReturnsTrueForFileWithHtmLabelExtension() {
 
     @Test
     void supportsReturnsTrueForHtmlFileWithoutExtension() {
-        final Document document = new Document();
-        document.setLabel("testDocument");
-        document.setUri(Generator.generateUri());
         final File file = new File();
         file.setLabel("test");
-        file.setDocument(document);
-        document.addFile(file);
+        // The label has no extension; the document manager is stubbed to report an HTML content type
         when(documentManager.getContentType(file)).thenReturn(Optional.of(MediaType.TEXT_HTML_VALUE));
         assertTrue(sut.supports(file));
     }
@@ -195,9 +191,31 @@ void findTermOccurrencesSetsFoundOccurrencesAsApprovedWhenCorrespondingExistingO
             assertThat(to.getTypes(), not(hasItem(Vocabulary.s_c_navrzeny_vyskyt_termu)));
         });
         assertEquals(1, resultSize.get());
-        final org.jsoup.nodes.Document document = Jsoup.parse(sut.getContent(), StandardCharsets.UTF_8.name(), "");
+        final Document document = Jsoup.parse(sut.getContent(), StandardCharsets.UTF_8.name(), "");
         final Elements annotations = document.select("span[about]");
         assertEquals(1, annotations.size());
         assertFalse(annotations.get(0).hasAttr("score"));
     }
+
+    @Test
+    void findTermOccurrencesReusesExistingApprovedOccurrencesThatAreNotPresentInAnnotatedContent() throws Exception {
+        when(termService.exists(TERM_URI)).thenReturn(true);
+        final File file = initFile();
+        final String occurrenceId = "r2d2";
+        // Existing occurrence whose URI ends with the id expected in the about attribute of the added annotation
+        final TermOccurrence existing = Generator.generateTermOccurrence(new Term(TERM_URI), file, false);
+        existing.setUri(URI.create(Vocabulary.s_c_vyskyt_termu + "/" + occurrenceId));
+        final Selector textQuote = new TextQuoteSelector("Prahy", " hlavního města ", ".");
+        final Selector textPosition = new TextPositionSelector(57, 62);
+        existing.getTarget().setSelectors(Set.of(textQuote, textPosition));
+        final InputStream content = cz.cvut.kbss.termit.environment.Environment.loadFile("data/rdfa-simple.html");
+        sut.parseContent(content, file);
+        sut.setExistingOccurrences(List.of(existing));
+
+        final List<TermOccurrence> consumed = new ArrayList<>();
+        sut.findTermOccurrences(consumed::add);
+
+        // The existing occurrence is reported to the consumer...
+        assertThat(consumed, hasItem(existing));
+        // ...and the output content contains an annotation element for it
+        final Document output = Jsoup.parse(sut.getContent(), StandardCharsets.UTF_8.name(), "");
+        final Elements added = output.select("span[about=_:" + occurrenceId + "]");
+        assertFalse(added.isEmpty());
+    }
 }