Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement dir and hdt parser #160

Merged
merged 1 commit into from
May 10, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 25 additions & 4 deletions hdt-api/src/main/java/org/rdfhdt/hdt/enums/RDFNotation.java
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,9 @@
package org.rdfhdt.hdt.enums;

import java.io.File;
import java.nio.file.Files;
import java.nio.file.InvalidPathException;
import java.nio.file.Path;

/**
* Enumeration of the different valid notations for RDF data.
Expand Down Expand Up @@ -93,8 +96,13 @@ public enum RDFNotation {
/**
* Directory with RDF content
*/
DIR

DIR,

/**
* HDT file
*/
HDT

;

public static RDFNotation parse(String str) {
Expand All @@ -120,12 +128,22 @@ public static RDFNotation parse(String str) {
return ZIP;
} else if(str.equals("list")) {
return LIST;
} else if(str.equals("hdt")) {
return HDT;
}
throw new IllegalArgumentException();
}

public static RDFNotation guess(String fileName) throws IllegalArgumentException {
String str = fileName.toLowerCase();

try {
if (Files.isDirectory(Path.of(fileName))) {
return DIR;
}
} catch (InvalidPathException e) {
// not a valid path, so can't be a directory, ignore
}

int idx = str.lastIndexOf('.');
if(idx!=-1) {
Expand All @@ -152,8 +170,11 @@ public static RDFNotation guess(String fileName) throws IllegalArgumentException
} else if(str.endsWith("zip")){
return ZIP;
} else if(str.endsWith("list")){
return LIST;
}
return LIST;
} else if(str.endsWith("hdt")){
return HDT;
}

throw new IllegalArgumentException("Could not guess the format for "+fileName);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@

import org.rdfhdt.hdt.enums.RDFNotation;
import org.rdfhdt.hdt.exceptions.NotImplementedException;
import org.rdfhdt.hdt.rdf.parsers.RDFParserDir;
import org.rdfhdt.hdt.rdf.parsers.RDFParserHDT;
import org.rdfhdt.hdt.rdf.parsers.RDFParserList;
import org.rdfhdt.hdt.rdf.parsers.RDFParserRAR;
import org.rdfhdt.hdt.rdf.parsers.RDFParserRIOT;
Expand All @@ -50,8 +52,7 @@ public static RDFParserCallback getParserCallback(RDFNotation notation) {
case RDFXML:
return new RDFParserRIOT();
case DIR:
// FIXME: Implement
throw new NotImplementedException("RDFParserDir not implemented");
return new RDFParserDir();
case LIST:
return new RDFParserList();
case ZIP:
Expand All @@ -60,6 +61,8 @@ public static RDFParserCallback getParserCallback(RDFNotation notation) {
return new RDFParserTar();
case RAR:
return new RDFParserRAR();
case HDT:
return new RDFParserHDT();
case JSONLD:
// FIXME: Implement
throw new NotImplementedException("RDFParserJSONLD not implemented");
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
package org.rdfhdt.hdt.rdf.parsers;

import org.rdfhdt.hdt.enums.RDFNotation;
import org.rdfhdt.hdt.exceptions.NotImplementedException;
import org.rdfhdt.hdt.exceptions.ParserException;
import org.rdfhdt.hdt.rdf.RDFParserCallback;
import org.rdfhdt.hdt.rdf.RDFParserFactory;
import org.rdfhdt.hdt.util.ContainerException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.InvalidPathException;
import java.nio.file.Path;

/**
 * Parser for the {@link RDFNotation#DIR} notation: lists a directory, recurses into its
 * sub-directories and hands every file whose notation can be guessed to the parser
 * returned by {@link RDFParserFactory#getParserCallback(RDFNotation)}.
 * Files whose notation cannot be guessed are logged and skipped.
 *
 * @author Antoine Willerval
 */
public class RDFParserDir implements RDFParserCallback {
    private static final Logger log = LoggerFactory.getLogger(RDFParserDir.class);

    /**
     * Parses every recognised file found under the directory {@code fileName}.
     *
     * @param fileName  path of the directory to read
     * @param baseUri   base URI forwarded unchanged to the per-file parsers
     * @param notation  must be {@link RDFNotation#DIR}
     * @param keepBNode forwarded unchanged to the per-file parsers
     * @param callback  receives the triples produced by the per-file parsers
     * @throws ParserException          if {@code fileName} is not a valid path, the directory
     *                                  cannot be listed, or a per-file parser fails
     * @throws IllegalArgumentException if {@code notation} is not {@link RDFNotation#DIR}
     */
    @Override
    public void doParse(String fileName, String baseUri, RDFNotation notation, boolean keepBNode, RDFCallback callback) throws ParserException {
        try {
            doParse(Path.of(fileName), baseUri, notation, keepBNode, callback);
        } catch (InvalidPathException e) {
            throw new ParserException(e);
        }
    }

    /**
     * Lists the directory {@code p} and parses each of its children.
     */
    private void doParse(Path p, String baseUri, RDFNotation notation, boolean keepBNode, RDFCallback callback) throws ParserException {
        if (notation != RDFNotation.DIR) {
            throw new IllegalArgumentException("Can't parse notation different than " + RDFNotation.DIR + "!");
        }
        // Files.list holds an open directory handle until the stream is closed, so it has to
        // live in a try-with-resources; otherwise every visited directory leaks a handle.
        try (var children = Files.list(p)) {
            children.forEach(child -> {
                try {
                    parseChild(child, baseUri, keepBNode, callback);
                } catch (ParserException e) {
                    // checked exceptions can't cross the lambda boundary; unwrapped below
                    throw new ContainerException(e);
                }
            });
        } catch (IOException | SecurityException e) {
            throw new ParserException(e);
        } catch (ContainerException e) {
            Throwable cause = e.getCause();
            if (cause instanceof ParserException) {
                throw (ParserException) cause;
            }
            // not one of ours (e.g. thrown by the callback): rethrow instead of failing on a blind cast
            throw e;
        }
    }

    /**
     * Parses one directory entry: recurses into directories, skips files whose notation
     * can't be guessed, and delegates every other file to its notation's parser.
     */
    private void parseChild(Path child, String baseUri, boolean keepBNode, RDFCallback callback) throws ParserException {
        if (Files.isDirectory(child)) {
            doParse(child, baseUri, RDFNotation.DIR, keepBNode, callback);
            return;
        }
        RDFParserCallback rdfParserCallback;
        RDFNotation childNotation;
        try {
            // get the notation of the file
            childNotation = RDFNotation.guess(child.toFile());
            rdfParserCallback = RDFParserFactory.getParserCallback(childNotation);
        } catch (IllegalArgumentException e) {
            log.warn("Ignore file {}", child, e);
            return;
        }
        log.debug("parse {}", child);
        // we can parse it, parsing it
        rdfParserCallback.doParse(child.toAbsolutePath().toString(), baseUri, childNotation, keepBNode, callback);
    }

    /**
     * Not supported: a directory cannot be read from a stream.
     *
     * @throws NotImplementedException always
     */
    @Override
    public void doParse(InputStream in, String baseUri, RDFNotation notation, boolean keepBNode, RDFCallback callback) throws ParserException {
        throw new NotImplementedException("Can't parse a stream of directory!");
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
package org.rdfhdt.hdt.rdf.parsers;

import org.rdfhdt.hdt.enums.RDFNotation;
import org.rdfhdt.hdt.exceptions.NotFoundException;
import org.rdfhdt.hdt.exceptions.ParserException;
import org.rdfhdt.hdt.hdt.HDT;
import org.rdfhdt.hdt.hdt.HDTManager;
import org.rdfhdt.hdt.rdf.RDFParserCallback;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardCopyOption;

/**
 * Parser that reads an HDT file and replays the triples it contains to a callback.
 *
 * @author Antoine Willerval
 */
public class RDFParserHDT implements RDFParserCallback {
    private static final Logger log = LoggerFactory.getLogger(RDFParserHDT.class);

    /**
     * Maps the HDT file {@code fileName} and passes every triple returned by
     * {@code search("", "", "")} to {@code callback}, always with position {@code 0}.
     * {@code baseUri}, {@code notation} and {@code keepBNode} are not used.
     *
     * @throws ParserException if the HDT can't be mapped or searched
     */
    @Override
    public void doParse(String fileName, String baseUri, RDFNotation notation, boolean keepBNode, RDFCallback callback) throws ParserException {
        try (HDT hdt = HDTManager.mapHDT(fileName)) {
            hdt.search("", "", "").forEachRemaining(t -> callback.processTriple(t, 0));
        } catch (IOException | NotFoundException e) {
            log.error("Unexpected exception.", e);
            throw new ParserException(e);
        }
    }

    /**
     * Copies the stream into a temporary file, parses that file with
     * {@link #doParse(String, String, RDFNotation, boolean, RDFCallback)} and deletes it afterwards.
     *
     * @throws ParserException if the temporary file can't be written or the HDT can't be read
     */
    @Override
    public void doParse(InputStream in, String baseUri, RDFNotation notation, boolean keepBNode, RDFCallback callback) throws ParserException {
        try {
            // create a temp
            Path tempFile = Files.createTempFile("hdtjava-reader", ".hdt");
            log.warn("Create temp file to store the HDT stream {}", tempFile);
            try {
                // createTempFile has already created the (empty) file; without REPLACE_EXISTING
                // Files.copy refuses to write to an existing target and always fails here.
                Files.copy(in, tempFile, StandardCopyOption.REPLACE_EXISTING);
                doParse(tempFile.toAbsolutePath().toString(), baseUri, notation, keepBNode, callback);
            } finally {
                Files.deleteIfExists(tempFile);
            }
        } catch (IOException e) {
            log.error("Unexpected exception.", e);
            throw new ParserException(e);
        }
    }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
package org.rdfhdt.hdt.util;

/**
 * Unchecked wrapper whose only job is to carry another {@link Throwable} as its cause,
 * e.g. to get a checked exception out of a lambda that can't declare it.
 * The wrapped exception is available through {@link #getCause()}.
 *
 * @author Antoine Willerval
 */
public class ContainerException extends RuntimeException {

    /**
     * Wraps the given throwable.
     *
     * @param cause the throwable to carry
     */
    public ContainerException(Throwable cause) {
        super(cause);
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
package org.rdfhdt.hdt.rdf.parsers;

import org.junit.Assert;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.TemporaryFolder;
import org.rdfhdt.hdt.enums.RDFNotation;
import org.rdfhdt.hdt.exceptions.ParserException;
import org.rdfhdt.hdt.header.HeaderUtil;
import org.rdfhdt.hdt.rdf.RDFParserCallback;
import org.rdfhdt.hdt.rdf.RDFParserFactory;
import org.rdfhdt.hdt.triples.TripleString;
import org.rdfhdt.hdt.util.LargeFakeDataSetStreamSupplier;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

/**
 * Checks that {@link RDFParserDir} parses the N-Triples files of a directory tree,
 * including nested directories, and skips files that are not RDF.
 */
public class RDFParserDirTest {

    @Rule
    public TemporaryFolder tempDir = new TemporaryFolder();

    @Test
    public void dirTest() throws IOException, ParserException {
        // newFolder() already creates the directory
        Path root = tempDir.newFolder().toPath();

        Path testDir1 = root.resolve("testDir1");
        Path testDir2 = root.resolve("testDir2");
        Path testDir3 = root.resolve("testDir3");
        Path testDir4 = testDir3.resolve("testDir4");

        Files.createDirectories(testDir1);
        Files.createDirectories(testDir2);
        Files.createDirectories(testDir3);
        Files.createDirectories(testDir4);

        LargeFakeDataSetStreamSupplier supplier = LargeFakeDataSetStreamSupplier
                .createSupplierWithMaxTriples(20, 34);

        supplier.createNTFile(root.resolve("test.nt").toAbsolutePath().toString());
        supplier.createNTFile(testDir1.resolve("test1.nt").toAbsolutePath().toString());
        supplier.createNTFile(testDir2.resolve("test21.nt").toAbsolutePath().toString());
        supplier.createNTFile(testDir2.resolve("test22.nt").toAbsolutePath().toString());
        supplier.createNTFile(testDir3.resolve("test31.nt").toAbsolutePath().toString());
        supplier.createNTFile(testDir3.resolve("test32.nt").toAbsolutePath().toString());

        // files the directory parser has to ignore
        Files.writeString(testDir2.resolve("thing.txt"), "Not parsable RDF DATA");
        Files.writeString(root.resolve("thing.py"), "print('Not parsable RDF DATA')");
        Files.writeString(testDir4.resolve("thing.sh"), "echo \"Not Parsable RDF data\"");

        supplier.reset();

        List<TripleString> expected = new ArrayList<>();
        // 6 for the 6 files
        for (int i = 0; i < 6; i++) {
            Iterator<TripleString> it = supplier.createTripleStringStream();
            while (it.hasNext()) {
                TripleString ts = it.next();
                expected.add(new TripleString(
                        HeaderUtil.cleanURI(ts.getSubject().toString()),
                        HeaderUtil.cleanURI(ts.getPredicate().toString()),
                        HeaderUtil.cleanURI(ts.getObject().toString())
                ));
            }
        }

        String filename = root.toAbsolutePath().toString();
        RDFNotation notation = RDFNotation.guess(filename);
        Assert.assertEquals(RDFNotation.DIR, notation);
        RDFParserCallback callback = RDFParserFactory.getParserCallback(notation);
        Assert.assertTrue(callback instanceof RDFParserDir);

        callback.doParse(filename, "http://example.org/#", notation, true, (triple, pos) ->
                Assert.assertTrue("triple " + triple + " wasn't expected", expected.remove(triple))
        );

        // without this check a parser that emits nothing at all would pass the test
        Assert.assertTrue("triples never returned by the parser: " + expected, expected.isEmpty());
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
package org.rdfhdt.hdt.rdf.parsers;

import org.junit.Assert;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.TemporaryFolder;
import org.rdfhdt.hdt.enums.RDFNotation;
import org.rdfhdt.hdt.exceptions.NotFoundException;
import org.rdfhdt.hdt.exceptions.ParserException;
import org.rdfhdt.hdt.hdt.HDT;
import org.rdfhdt.hdt.hdt.HDTManager;
import org.rdfhdt.hdt.options.HDTSpecification;
import org.rdfhdt.hdt.rdf.RDFParserCallback;
import org.rdfhdt.hdt.rdf.RDFParserFactory;
import org.rdfhdt.hdt.triples.IteratorTripleString;
import org.rdfhdt.hdt.util.LargeFakeDataSetStreamSupplier;

import java.io.IOException;
import java.nio.file.Path;

/**
 * Checks that {@link RDFParserHDT} replays the triples of an HDT file in the same order
 * as a search over the HDT the file was saved from.
 */
public class RDFParserHDTTest {

    @Rule
    public TemporaryFolder tempDir = new TemporaryFolder();

    @Test
    public void hdtTest() throws IOException, ParserException, NotFoundException {
        Path root = tempDir.newFile("test.hdt").toPath();
        String filename = root.toAbsolutePath().toString();

        LargeFakeDataSetStreamSupplier supplier = LargeFakeDataSetStreamSupplier
                .createSupplierWithMaxTriples(20, 34);

        // try-with-resources so the HDT is closed even when an assertion fails
        try (HDT hdt = HDTManager.generateHDT(
                supplier.createTripleStringStream(),
                "http://example.org/#",
                new HDTSpecification(),
                null
        )) {
            hdt.saveToHDT(filename, null);

            RDFNotation notation = RDFNotation.guess(filename);
            Assert.assertEquals(RDFNotation.HDT, notation);
            RDFParserCallback callback = RDFParserFactory.getParserCallback(notation);
            Assert.assertTrue(callback instanceof RDFParserHDT);

            IteratorTripleString it = hdt.search("", "", "");

            callback.doParse(filename, "http://example.org/#", notation, true, (triple, pos) -> {
                Assert.assertTrue("parser returned more triples than the HDT contains", it.hasNext());
                Assert.assertEquals(it.next(), triple);
            });

            // without this check a parser that emits nothing at all would pass the test
            Assert.assertFalse("parser returned fewer triples than the HDT contains", it.hasNext());
        }
    }
}