Skip to content

Commit

Permalink
GH-3654 JSON-LD 1.1 security and caching (#4957)
Browse files Browse the repository at this point in the history
* GH-3654 add caching in document loader

* GH-3654 update javadocs

* remove problematic test

* fix copyright

* try to fix junit issues

* try to fix junit issues
  • Loading branch information
hmottestad authored Apr 19, 2024
1 parent 26be6a5 commit 641d65c
Show file tree
Hide file tree
Showing 10 changed files with 449 additions and 47 deletions.

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@
*******************************************************************************/
package org.eclipse.rdf4j.rio.helpers;

import java.util.List;
import java.util.Set;

import org.eclipse.rdf4j.rio.RioSetting;

import com.github.jsonldjava.core.DocumentLoader;
Expand Down Expand Up @@ -153,6 +156,66 @@ public class JSONLDSettings {
public static final RioSetting<Boolean> HIERARCHICAL_VIEW = new BooleanRioSetting(
"org.eclipse.rdf4j.rio.jsonld.hierarchical_view", "Hierarchical representation of the JSON", Boolean.FALSE);

/**
* Whitelist of remote/local resources that the JSON-LD parser can retrieve. Set of URIs as strings.
* <p>
* The whitelist is consulted when {@link #SECURE_MODE} is enabled. Entries are matched as exact strings against the
* requested URI, so for example the {@code http} and {@code https} variants of a context must each be listed
* separately.
* <p>
* Default:
* {@code Set.of("http://www.w3.org/ns/anno.jsonld", "http://www.w3.org/ns/activitystreams.jsonld", "http://www.w3.org/ns/ldp.jsonld", "http://www.w3.org/ns/oa.jsonld", "http://www.w3.org/ns/hydra/context.jsonld", "http://schema.org/", "https://w3id.org/security/v1", "https://w3c.github.io/json-ld-rc/context.jsonld", "https://www.w3.org/2018/credentials/v1", "https://health-lifesci.schema.org/", "https://auto.schema.org/", "https://bib.schema.org/", "http://xmlns.com/foaf/spec/index.jsonld", "https://pending.schema.org/", "https://schema.org/", "https://schema.org/docs/jsonldcontext.jsonld", "https://schema.org/version/latest/schemaorg-current-https.jsonld", "https://schema.org/version/latest/schemaorg-all-http.jsonld", "https://schema.org/version/latest/schemaorg-all-https.jsonld", "https://schema.org/version/latest/schemaorg-current-http.jsonld", "https://schema.org/version/latest/schemaorg-all.jsonld", "https://schema.org/version/latest/schemaorg-current.jsonld", "https://project-open-data.cio.gov/v1.1/schema/catalog.jsonld", "https://geojson.org/geojson-ld/geojson-context.jsonld", "https://www.w3.org/2019/wot/td/v1")}
*/
public static final RioSetting<Set<String>> WHITELIST = new RioSettingImpl<>(
"org.eclipse.rdf4j.rio.jsonld_whitelist",
"Whitelist of remote/local resources that the JSON-LD parser can retrieve. Set of URIs as strings.",
Set.of(
"http://www.w3.org/ns/anno.jsonld",
"http://www.w3.org/ns/activitystreams.jsonld",
"http://www.w3.org/ns/ldp.jsonld",
"http://www.w3.org/ns/oa.jsonld",
"http://www.w3.org/ns/hydra/context.jsonld",
"http://schema.org/",
"https://w3id.org/security/v1",
"https://w3c.github.io/json-ld-rc/context.jsonld",
"https://www.w3.org/2018/credentials/v1",
"https://health-lifesci.schema.org/",
"https://auto.schema.org/",
"https://bib.schema.org/",
"http://xmlns.com/foaf/spec/index.jsonld",
"https://pending.schema.org/",
"https://schema.org/",
"https://schema.org/docs/jsonldcontext.jsonld",
"https://schema.org/version/latest/schemaorg-current-https.jsonld",
"https://schema.org/version/latest/schemaorg-all-http.jsonld",
"https://schema.org/version/latest/schemaorg-all-https.jsonld",
"https://schema.org/version/latest/schemaorg-current-http.jsonld",
"https://schema.org/version/latest/schemaorg-all.jsonld",
"https://schema.org/version/latest/schemaorg-current.jsonld",
"https://project-open-data.cio.gov/v1.1/schema/catalog.jsonld",
"https://geojson.org/geojson-ld/geojson-context.jsonld",
"https://www.w3.org/2019/wot/td/v1"
));

/**
* Secure mode only allows loading remote/local resources (e.g. a context referenced by URL) that are listed in
* {@link #WHITELIST}. When disabled, any resource may be loaded.
* <p>
* Default: true
*/
// NOTE(review): HIERARCHICAL_VIEW above is declared with BooleanRioSetting while this Boolean setting uses
// RioSettingImpl; confirm whether that difference is intentional.
public static final RioSetting<Boolean> SECURE_MODE = new RioSettingImpl<>(
"org.eclipse.rdf4j.rio.jsonld_secure_mode",
"Secure mode only allows loading remote/local resources (ex. context from url) that are whitelisted.",
Boolean.TRUE);

/**
* Controls whether documents loaded by the JSON-LD parser, such as remote contexts, are cached.
* <p>
* The document loader cache is enabled by default. All loaded documents are cached for 1 hour, or until the cache is
* full. The cache holds up to 1000 documents and is shared between all JSONLDParsers. The cache can be disabled by
* setting this value to {@code false}.
* <p>
* Default: true
*/
public static final RioSetting<Boolean> DOCUMENT_LOADER_CACHE = new RioSettingImpl<>(
"org.eclipse.rdf4j.rio.jsonld_document_loader_cache",
"The document loader cache is enabled by default. All loaded documents, such as remote contexts, are cached for 1 hour, or until the cache is full. The cache holds up to 1000 documents. The cache is shared between all JSONLDParsers. The cache can be disabled by setting this value to false.",
Boolean.TRUE);

/**
* Private default constructor.
*/
Expand Down
4 changes: 4 additions & 0 deletions core/rio/jsonld/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,10 @@
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
</dependency>
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
</dependency>
<dependency>
<groupId>${project.groupId}</groupId>
<artifactId>rdf4j-rio-api</artifactId>
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
/*******************************************************************************
* Copyright (c) 2024 Eclipse RDF4J contributors.
*
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the Eclipse Distribution License v1.0
* which accompanies this distribution, and is available at
* http://www.eclipse.org/org/documents/edl-v10.php.
*
* SPDX-License-Identifier: BSD-3-Clause
******************************************************************************/

package org.eclipse.rdf4j.rio.jsonld;

import java.net.URI;
import java.util.Set;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.TimeUnit;

import org.eclipse.rdf4j.rio.RDFParseException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.cache.CacheBuilder;
import com.google.common.cache.CacheLoader;
import com.google.common.cache.LoadingCache;

import no.hasmac.jsonld.JsonLdError;
import no.hasmac.jsonld.document.Document;
import no.hasmac.jsonld.loader.DocumentLoader;
import no.hasmac.jsonld.loader.DocumentLoaderOptions;
import no.hasmac.jsonld.loader.SchemeRouter;

/**
 * A {@link DocumentLoader} that can restrict loading to a whitelist of URIs and can serve documents from a cache that
 * is shared by all instances of this class.
 * <p>
 * Every failure, including a rejected (non-whitelisted) URI, is logged at error level and reported as an
 * {@link RDFParseException}.
 */
public class CachingDocumentLoader implements DocumentLoader {
	private static final DocumentLoader defaultLoader = SchemeRouter.defaultInstance();
	private static final Logger logger = LoggerFactory.getLogger(CachingDocumentLoader.class);

	// Static, hence shared by every CachingDocumentLoader instance. Entries are keyed by URI only and are always
	// loaded with fresh default DocumentLoaderOptions, so caller-supplied options do not influence cached documents.
	private static final LoadingCache<URI, Document> cache = CacheBuilder.newBuilder()
			.maximumSize(1000) // Maximum 1000 documents in cache
			.expireAfterWrite(1, TimeUnit.HOURS) // Expire after 1 hour
			.concurrencyLevel(8) // Optimize for 8 concurrent threads
			.build(new CacheLoader<>() {
				@Override
				public Document load(URI url) throws Exception {
					return defaultLoader.loadDocument(url, new DocumentLoaderOptions());
				}
			});

	private final boolean secureMode;
	private final Set<String> whitelist;
	private final boolean documentLoaderCache;

	/**
	 * Creates a new loader.
	 *
	 * @param secureMode          if {@code true}, only URIs whose string form is contained in {@code whitelist} may
	 *                            be loaded
	 * @param whitelist           the URIs (as strings) that may be loaded in secure mode; {@code null} is treated as
	 *                            an empty whitelist
	 * @param documentLoaderCache if {@code true}, documents are served from the shared cache
	 */
	public CachingDocumentLoader(boolean secureMode, Set<String> whitelist, boolean documentLoaderCache) {
		this.secureMode = secureMode;
		// A missing whitelist must reject everything in secure mode instead of failing with a NullPointerException.
		this.whitelist = whitelist != null ? whitelist : Set.of();
		this.documentLoaderCache = documentLoaderCache;
	}

	/**
	 * Loads the document identified by {@code uri}.
	 *
	 * @param uri     the location of the document
	 * @param options loader options; only used when the cache is disabled, because cached documents are loaded with
	 *                default options
	 * @return the loaded document
	 * @throws RDFParseException if secure mode is enabled and the URI is not whitelisted, or if the document could
	 *                           not be loaded
	 */
	@Override
	public Document loadDocument(URI uri, DocumentLoaderOptions options) {
		try {
			if (secureMode && !whitelist.contains(uri.toString())) {
				throw new RDFParseException("Could not load document from " + uri
						+ " because it is not whitelisted. See: JSONLDSettings.WHITELIST and JSONLDSettings.SECURE_MODE");
			}
			return documentLoaderCache ? loadFromCache(uri) : loadDirectly(uri, options);
		} catch (RDFParseException e) {
			logger.error(e.getMessage(), e);
			throw e;
		}
	}

	/** Returns the document from the shared cache, loading it on a cache miss. */
	private Document loadFromCache(URI uri) {
		try {
			return cache.get(uri);
		} catch (ExecutionException e) {
			// Report the loader's own exception rather than the cache's wrapper when it is available.
			Throwable cause = e.getCause() != null ? e.getCause() : e;
			throw new RDFParseException("Could not load document from " + uri, cause);
		} catch (RuntimeException e) {
			// The cache reports unchecked loader failures (and null results from the loader) through its own
			// unchecked exception types; convert them as well so callers only ever see RDFParseException.
			throw new RDFParseException("Could not load document from " + uri, e);
		}
	}

	/** Loads the document with the default loader, bypassing the cache. */
	private Document loadDirectly(URI uri, DocumentLoaderOptions options) {
		try {
			return defaultLoader.loadDocument(uri, options);
		} catch (JsonLdError e) {
			throw new RDFParseException("Could not load document from " + uri, e);
		}
	}
}
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,18 @@
*******************************************************************************/
package org.eclipse.rdf4j.rio.jsonld;

import static org.eclipse.rdf4j.rio.helpers.JSONLDSettings.DOCUMENT_LOADER_CACHE;
import static org.eclipse.rdf4j.rio.helpers.JSONLDSettings.SECURE_MODE;
import static org.eclipse.rdf4j.rio.helpers.JSONLDSettings.WHITELIST;

import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.Collection;
import java.util.Optional;
import java.util.Set;
import java.util.function.BiConsumer;

import org.eclipse.rdf4j.model.IRI;
Expand Down Expand Up @@ -48,8 +53,6 @@
import no.hasmac.jsonld.document.JsonDocument;
import no.hasmac.jsonld.lang.Keywords;
import no.hasmac.jsonld.loader.DocumentLoader;
import no.hasmac.jsonld.loader.DocumentLoaderOptions;
import no.hasmac.jsonld.loader.SchemeRouter;
import no.hasmac.rdf.RdfConsumer;
import no.hasmac.rdf.RdfValueFactory;

Expand Down Expand Up @@ -126,12 +129,21 @@ private void parse(InputStream in, Reader reader, String baseURI)
BasicParserSettings.FAIL_ON_UNKNOWN_LANGUAGES);
}

boolean secureMode = getParserConfig().get(SECURE_MODE);
boolean documentLoaderCache = getParserConfig().get(DOCUMENT_LOADER_CACHE);

Set<String> whitelist = getParserConfig().get(WHITELIST);

JsonLdOptions opts = new JsonLdOptions();
opts.setUriValidation(false);
opts.setExceptionOnWarning(getParserConfig().get(JSONLDSettings.EXCEPTION_ON_WARNING));

Document context = getParserConfig().get(JSONLDSettings.EXPAND_CONTEXT);

DocumentLoader defaultDocumentLoader = opts.getDocumentLoader();
CachingDocumentLoader cachingDocumentLoader = new CachingDocumentLoader(secureMode, whitelist,
documentLoaderCache);

if (context != null) {

opts.setExpandContext(context);
Expand All @@ -142,22 +154,21 @@ private void parse(InputStream in, Reader reader, String baseURI)
throw new RDFParseException("Expand context is not a valid JSON document");
}
opts.getContextCache().put(context.getDocumentUrl().toString(), jsonContent.get());
opts.setDocumentLoader(new DocumentLoader() {

private final DocumentLoader defaultDocumentLoader = SchemeRouter.defaultInstance();

@Override
public Document loadDocument(URI url, DocumentLoaderOptions options) throws JsonLdError {
if (url.equals(context.getDocumentUrl())) {
return context;
}
return defaultDocumentLoader.loadDocument(url, options);
opts.setDocumentLoader((uri, options) -> {
if (uri.equals(context.getDocumentUrl())) {
return context;
}

return cachingDocumentLoader.loadDocument(uri, options);
});
}

}

if (secureMode && opts.getDocumentLoader() == defaultDocumentLoader) {
opts.setDocumentLoader(cachingDocumentLoader);
}

if (baseURI != null && !baseURI.isEmpty()) {
URI uri = new URI(baseURI);
opts.setBase(uri);
Expand Down
Loading

0 comments on commit 641d65c

Please sign in to comment.