Skip to content

Commit

Permalink
[kbss-cvut/termit-ui#587] Reuse existing approved term occurrences (t…
Browse files Browse the repository at this point in the history
…hat were created manually) when processing annotation results.
  • Loading branch information
ledsoft committed Dec 10, 2024
1 parent 3e6aae7 commit 5ad5163
Show file tree
Hide file tree
Showing 6 changed files with 139 additions and 24 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
import cz.cvut.kbss.jopa.vocabulary.DC;
import cz.cvut.kbss.termit.model.AbstractEntity;
import cz.cvut.kbss.termit.model.util.HasTypes;
import cz.cvut.kbss.termit.util.Constants;
import cz.cvut.kbss.termit.util.Vocabulary;
import jakarta.validation.constraints.NotNull;

Expand Down Expand Up @@ -136,6 +137,12 @@ public void setElementAbout(String elementAbout) {
this.elementAbout = elementAbout;
}

/**
 * Derives the value of the {@literal about} attribute from this occurrence's identifier and stores it in this
 * instance.
 * <p>
 * The value is the blank node prefix followed by the part of the identifier that comes after its last slash.
 *
 * @return The resolved (and newly stored) element about value
 */
public String resolveElementAbout() {
    final String identifier = getUri().toString();
    final int localPartStart = identifier.lastIndexOf('/') + 1;
    final String resolved = Constants.BNODE_PREFIX + identifier.substring(localPartStart);
    this.elementAbout = resolved;
    return resolved;
}

/**
* Marks this term occurrence as suggested by automation.
* <p>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@

import java.io.InputStream;
import java.net.URI;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

Expand All @@ -39,7 +40,7 @@ public abstract class TermOccurrenceResolver {

protected final TermRepositoryService termService;

protected List<TermOccurrence> existingOccurrences = Collections.emptyList();
protected List<TermOccurrence> existingApprovedOccurrences = Collections.emptyList();

protected TermOccurrenceResolver(TermRepositoryService termService) {
this.termService = termService;
Expand All @@ -58,11 +59,14 @@ protected TermOccurrenceResolver(TermRepositoryService termService) {

/**
* Sets occurrences that already existed on previous analyses.
* <p>
* The resolver uses only those that are approved, i.e., those for which {@code isSuggested()} returns
* {@code false}. They are kept in a separate, modifiable list.
*
* @param existingOccurrences Term occurrences from the previous analysis run
*/
public void setExistingOccurrences(List<TermOccurrence> existingOccurrences) {
this.existingOccurrences = existingOccurrences;
// Stream.toList() returns an unmodifiable list, so the result is copied into an ArrayList to allow removing
// entries from it later
this.existingApprovedOccurrences = new ArrayList<>(
existingOccurrences.stream().filter(to -> !to.isSuggested()).toList());
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import cz.cvut.kbss.termit.model.selector.Selector;
import cz.cvut.kbss.termit.service.business.ResourceService;
import cz.cvut.kbss.termit.service.document.html.HtmlSelectorGenerators;
import cz.cvut.kbss.termit.util.Constants;
import jakarta.annotation.Nonnull;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
Expand All @@ -20,8 +21,6 @@
import java.util.Objects;
import java.util.Set;

import static cz.cvut.kbss.termit.service.document.html.HtmlTermOccurrenceResolver.BNODE_PREFIX;

/**
* Creates selectors for a new term occurrence.
*/
Expand All @@ -45,7 +44,7 @@ public TermOccurrenceSelectorCreator(HtmlSelectorGenerators selectorGenerators,
* Creates selectors for a term occurrence in the specified target represented by an HTML element with the specified
* id.
*
* @param target Asset in which the occurrence is to be found
* @param target Asset in which the occurrence is to be found
* @param elementAbout Value of the {@literal about} attribute of the occurrence element
* @return Set of generated selectors for the occurrence element
* @throws UnsupportedOperationException If the specified target is not supported
Expand All @@ -59,7 +58,8 @@ public Set<Selector> createSelectors(@Nonnull OccurrenceTarget target, @Nonnull
}
final FileOccurrenceTarget ft = (FileOccurrenceTarget) target;
final Document targetContent = loadTargetContent(ft);
final Elements elements = targetContent.select("[about=" + BNODE_PREFIX + elementAbout + "]");
final Elements elements = targetContent.select(
"[" + Constants.RDFa.ABOUT + "=" + Constants.BNODE_PREFIX + elementAbout + "]");
if (elements.isEmpty()) {
throw new SelectorGenerationException("No element with id " + elementAbout + " found in " + ft.getSource());
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,15 +25,19 @@
import cz.cvut.kbss.termit.model.assignment.TermOccurrence;
import cz.cvut.kbss.termit.model.resource.File;
import cz.cvut.kbss.termit.model.selector.Selector;
import cz.cvut.kbss.termit.model.selector.TextQuoteSelector;
import cz.cvut.kbss.termit.service.document.DocumentManager;
import cz.cvut.kbss.termit.service.document.TermOccurrenceResolver;
import cz.cvut.kbss.termit.service.repository.TermRepositoryService;
import cz.cvut.kbss.termit.util.Configuration;
import cz.cvut.kbss.termit.util.Constants;
import cz.cvut.kbss.termit.util.Utils;
import cz.cvut.kbss.termit.util.Vocabulary;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
Expand All @@ -50,6 +54,7 @@
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
Expand All @@ -63,13 +68,10 @@
@Scope(value = ConfigurableBeanFactory.SCOPE_PROTOTYPE)
public class HtmlTermOccurrenceResolver extends TermOccurrenceResolver {

/**
* Blank node prefix.
*/
public static final String BNODE_PREFIX = "_:";

private static final String SCORE_ATTRIBUTE = "score";

private static final String ANNOTATION_ELEMENT = "span";

private static final Logger LOG = LoggerFactory.getLogger(HtmlTermOccurrenceResolver.class);

private final HtmlSelectorGenerators selectorGenerators;
Expand Down Expand Up @@ -203,6 +205,13 @@ public void findTermOccurrences(OccurrenceConsumer resultConsumer) {
}
});
}
try {
addRemainingExistingApprovedOccurrences(resultConsumer);
} catch (InterruptedException e) {
LOG.error("Thread interrupted while resolving term occurrences.");
Thread.currentThread().interrupt();
throw new TermItException(e);
}
}

private Optional<TermOccurrence> resolveAnnotation(Element rdfaElem, Asset<?> source) {
Expand Down Expand Up @@ -246,8 +255,8 @@ private void verifyTermExists(Element rdfaElem, URI termUri, String termId) {
private URI resolveOccurrenceId(Element rdfaElem, Asset<?> source) {
final String base = TermOccurrence.resolveContext(source.getUri()) + "/";
String about = rdfaElem.attr("about");
if (about.startsWith(BNODE_PREFIX)) {
about = about.substring(BNODE_PREFIX.length());
if (about.startsWith(Constants.BNODE_PREFIX)) {
about = about.substring(Constants.BNODE_PREFIX.length());
}
return URI.create(base + about);
}
Expand All @@ -256,21 +265,93 @@ private boolean existsApproved(TermOccurrence newOccurrence) {
final OccurrenceTarget target = newOccurrence.getTarget();
assert target != null;
final Set<Selector> selectors = target.getSelectors();
for (TermOccurrence to : existingOccurrences) {
// An explicit iterator is used so that a matched occurrence can be removed while iterating
final Iterator<TermOccurrence> it = existingApprovedOccurrences.iterator();
while (it.hasNext()) {
final TermOccurrence to = it.next();
// Only occurrences of the same term are candidates for a match
if (!to.getTerm().equals(newOccurrence.getTerm())) {
continue;
}
final OccurrenceTarget existingTarget = to.getTarget();
assert existingTarget != null;
assert existingTarget.getSource().equals(target.getSource());
// Same term, contains at least one identical selector
if (existingTarget.getSelectors().stream().anyMatch(selectors::contains) && !to.isSuggested()) {
if (existingTarget.getSelectors().stream().anyMatch(selectors::contains)) {
// Drop the matched occurrence from the list so that it is not matched again and is not treated as a
// remaining (unmatched) approved occurrence afterwards
it.remove();
return true;
}
}
return false;
}

/**
 * Tries to add existing approved term occurrences to the content.
 * <p>
 * This means finding matching text using the {@link TextQuoteSelector} (as it is more resilient to minor changes in
 * the content file) and inserting a corresponding annotation element into the text.
 * <p>
 * If a matching element can be created in the text, the existing term occurrence is processed just as a new one
 * would be. Occurrences that have no {@link TextQuoteSelector}, or whose quoted text cannot be located in the
 * content, are skipped and not passed to the consumer.
 *
 * @param consumer Consumer of the occurrences
 * @throws InterruptedException If processing is interrupted while an occurrence is being passed to the consumer
 */
private void addRemainingExistingApprovedOccurrences(OccurrenceConsumer consumer) throws InterruptedException {
    LOG.debug("Adding existing approved occurrences to content.");
    for (TermOccurrence to : existingApprovedOccurrences) {
        final Optional<TextQuoteSelector> tqSelector = to.getTarget().getSelectors().stream()
                                                         .filter(TextQuoteSelector.class::isInstance)
                                                         .map(TextQuoteSelector.class::cast)
                                                         .findFirst();
        if (tqSelector.isEmpty()) {
            LOG.trace("Existing approved occurrence does not have a {}. Skipping it.",
                      TextQuoteSelector.class.getSimpleName());
            continue;
        }
        final TextQuoteSelector tqs = tqSelector.get();
        final String exactMatch = tqs.getExactMatch();
        if (exactMatch == null || exactMatch.isEmpty()) {
            // Nothing to search for - an empty search text would match every element
            LOG.trace("{} has no exact match. Skipping term occurrence.", TextQuoteSelector.class.getSimpleName());
            continue;
        }
        // Prefix and suffix are treated as optional - a missing one must not become the literal "null"
        final String quotedText = nullToEmpty(tqs.getPrefix()) + exactMatch + nullToEmpty(tqs.getSuffix());
        // Search through the element API instead of a concatenated selector query, so that quoted text containing
        // selector syntax (e.g., parentheses) cannot break or alter the query
        final Elements containing = document.getElementsContainingText(quotedText);
        if (containing.isEmpty()) {
            LOG.trace("{} did not find any matching elements. Skipping term occurrence.",
                      TextQuoteSelector.class.getSimpleName());
            continue;
        }
        // Last should be the most specific one
        final Element elem = containing.last();
        final Element containingExactMatch = elem.getElementsContainingOwnText(exactMatch).first();
        if (containingExactMatch == null) {
            // The quoted text spans several elements and none of them directly owns the exact match
            LOG.trace("No element directly contains the exact match of {}. Skipping term occurrence.", to);
            continue;
        }
        final Element annotationNode = createAnnotationElement(to, tqs);
        replaceContentWithAnnotation(containingExactMatch, tqs, annotationNode);
        if (annotationNode.parent() == null) {
            // The element search is more lenient than the verbatim text replacement, so the annotation may not
            // have been inserted. Such an occurrence has no element in the content and must not be reported.
            LOG.trace("Unable to insert annotation element for {}. Skipping term occurrence.", to);
            continue;
        }
        LOG.debug("Adding existing approved term occurrence {} to content.", to);
        consumer.accept(to);
    }
}

/**
 * Returns the specified string, or an empty string if it is {@code null}.
 */
private static String nullToEmpty(String value) {
    return value != null ? value : "";
}

/**
 * Creates an annotation element representing the specified term occurrence.
 * <p>
 * The element's text is the exact match of the selector and its RDFa attributes reference the occurrence and its
 * term. Note that resolving the {@literal about} value also stores it in the occurrence.
 *
 * @param to  Term occurrence to represent
 * @param tqs Selector providing the annotated text
 * @return New annotation element, not attached to any document
 */
private static Element createAnnotationElement(TermOccurrence to, TextQuoteSelector tqs) {
    return new Element(ANNOTATION_ELEMENT, "")
            .text(tqs.getExactMatch())
            .attr(Constants.RDFa.ABOUT, to.resolveElementAbout())
            .attr(Constants.RDFa.RESOURCE, to.getTerm().toString())
            .attr(Constants.RDFa.TYPE, Vocabulary.s_c_vyskyt_termu)
            .attr(Constants.RDFa.PROPERTY, Vocabulary.s_p_je_prirazenim_termu);
}

/**
 * Replaces the exact match of the selector in the text directly owned by the specified element with the
 * annotation element.
 * <p>
 * Only the first child text node containing the exact match is processed. It is replaced by the text preceding
 * the match, the annotation element and the text following the match (in this order). If no child text node
 * contains the exact match, the element is left unchanged.
 *
 * @param containingExactMatch Element whose child text nodes are searched
 * @param tqs                  Selector providing the exact match
 * @param annotationNode       Annotation element to insert in place of the matching text
 */
private static void replaceContentWithAnnotation(Element containingExactMatch, TextQuoteSelector tqs,
                                                 Element annotationNode) {
    for (Node child : containingExactMatch.childNodes()) {
        if (!(child instanceof TextNode textChild)) {
            continue;
        }
        final String wholeText = textChild.getWholeText();
        final String exactMatch = tqs.getExactMatch();
        final int matchStart = wholeText.indexOf(exactMatch);
        if (matchStart < 0) {
            continue;
        }
        final int matchEnd = matchStart + exactMatch.length();
        final TextNode leadingText = new TextNode(wholeText.substring(0, matchStart));
        final TextNode trailingText = new TextNode(wholeText.substring(matchEnd));
        // Each node is inserted right after the original one, hence the reverse order of insertion
        child.after(trailingText);
        child.after(annotationNode);
        child.after(leadingText);
        child.remove();
        return;
    }
}

@Override
public boolean supports(Asset<?> source) {
if (source instanceof Term) {
Expand Down
9 changes: 7 additions & 2 deletions src/main/java/cz/cvut/kbss/termit/util/Constants.java
Original file line number Diff line number Diff line change
Expand Up @@ -160,8 +160,8 @@ public class Constants {
public static final int WEBSOCKET_SEND_BUFFER_SIZE_LIMIT = Integer.MAX_VALUE;

/**
* Set the maximum time allowed in milliseconds after the WebSocket connection is established
* and before the first sub-protocol message is received.
* Set the maximum time allowed in milliseconds after the WebSocket connection is established and before the first
* sub-protocol message is received.
*/
public static final int WEBSOCKET_TIME_TO_FIRST_MESSAGE = 15 * 1000 /* 15s */;

Expand All @@ -170,6 +170,11 @@ public class Constants {
*/
public static final String DEVELOPMENT_PROFILE = "development";

/**
* Blank node prefix.
*/
public static final String BNODE_PREFIX = "_:";

private Constants() {
throw new AssertionError();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
import cz.cvut.kbss.termit.environment.Generator;
import cz.cvut.kbss.termit.model.Term;
import cz.cvut.kbss.termit.model.assignment.TermOccurrence;
import cz.cvut.kbss.termit.model.resource.Document;
import cz.cvut.kbss.termit.model.resource.File;
import cz.cvut.kbss.termit.model.selector.Selector;
import cz.cvut.kbss.termit.model.selector.TextPositionSelector;
Expand All @@ -30,6 +29,7 @@
import cz.cvut.kbss.termit.util.Configuration;
import cz.cvut.kbss.termit.util.Vocabulary;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.extension.ExtendWith;
Expand All @@ -42,6 +42,7 @@
import java.io.InputStream;
import java.net.URI;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;
import java.util.Set;
Expand Down Expand Up @@ -98,13 +99,8 @@ void supportsReturnsTrueForFileWithHtmLabelExtension() {

@Test
void supportsReturnsTrueForHtmlFileWithoutExtension() {
final Document document = new Document();
document.setLabel("testDocument");
document.setUri(Generator.generateUri());
final File file = new File();
// The file label has no extension
file.setLabel("test");
file.setDocument(document);
document.addFile(file);
// The document manager is stubbed to report HTML content type for the file
when(documentManager.getContentType(file)).thenReturn(Optional.of(MediaType.TEXT_HTML_VALUE));
assertTrue(sut.supports(file));
}
Expand Down Expand Up @@ -195,9 +191,31 @@ void findTermOccurrencesSetsFoundOccurrencesAsApprovedWhenCorrespondingExistingO
assertThat(to.getTypes(), not(hasItem(Vocabulary.s_c_navrzeny_vyskyt_termu)));
});
assertEquals(1, resultSize.get());
final org.jsoup.nodes.Document document = Jsoup.parse(sut.getContent(), StandardCharsets.UTF_8.name(), "");
final Document document = Jsoup.parse(sut.getContent(), StandardCharsets.UTF_8.name(), "");
final Elements annotations = document.select("span[about]");
assertEquals(1, annotations.size());
assertFalse(annotations.get(0).hasAttr("score"));
}

@Test
void findTermOccurrencesReusesExistingApprovedOccurrencesThatAreNotPresentInAnnotatedContent() throws Exception {
    when(termService.exists(TERM_URI)).thenReturn(true);
    final File targetFile = initFile();
    final String id = "r2d2";
    // Existing occurrence (presumably approved, given the false flag) with both a quote and a position selector
    final TermOccurrence existing = Generator.generateTermOccurrence(new Term(TERM_URI), targetFile, false);
    existing.setUri(URI.create(Vocabulary.s_c_vyskyt_termu + "/" + id));
    final Selector textQuote = new TextQuoteSelector("Prahy", " hlavního města ", ".");
    final Selector textPosition = new TextPositionSelector(57, 62);
    existing.getTarget().setSelectors(Set.of(textQuote, textPosition));
    final InputStream content = cz.cvut.kbss.termit.environment.Environment.loadFile("data/rdfa-simple.html");
    sut.parseContent(content, targetFile);
    sut.setExistingOccurrences(List.of(existing));

    final List<TermOccurrence> found = new ArrayList<>();
    sut.findTermOccurrences(found::add);

    // The existing occurrence is reported and an annotation element with its id is present in the output
    assertThat(found, hasItem(existing));
    final Document output = Jsoup.parse(sut.getContent(), StandardCharsets.UTF_8.name(), "");
    final Elements matchingAnnotations = output.select("span[about=_:" + id + "]");
    assertFalse(matchingAnnotations.isEmpty());
}
}

0 comments on commit 5ad5163

Please sign in to comment.