From 87b320647c2bbf5ac406eb9995e3b98d4535b390 Mon Sep 17 00:00:00 2001 From: Alex Osborne Date: Wed, 20 Feb 2019 20:06:49 +0900 Subject: [PATCH] Key index on (uri, date) not just uri #11, #12 --- src/org/netpreserve/jwarc/WarcServer.java | 146 +++++++++++++++------- 1 file changed, 100 insertions(+), 46 deletions(-) diff --git a/src/org/netpreserve/jwarc/WarcServer.java b/src/org/netpreserve/jwarc/WarcServer.java index 252c55c..a2fc82f 100644 --- a/src/org/netpreserve/jwarc/WarcServer.java +++ b/src/org/netpreserve/jwarc/WarcServer.java @@ -4,20 +4,25 @@ import java.io.InputStream; import java.net.ServerSocket; import java.net.Socket; +import java.net.URI; import java.net.URLConnection; import java.nio.ByteBuffer; import java.nio.channels.Channels; import java.nio.channels.FileChannel; import java.nio.file.Path; -import java.util.HashMap; -import java.util.List; -import java.util.Map; +import java.time.Duration; +import java.time.Instant; +import java.time.format.DateTimeFormatter; +import java.util.*; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import static java.nio.charset.StandardCharsets.US_ASCII; import static java.nio.charset.StandardCharsets.UTF_8; import static java.nio.file.StandardOpenOption.READ; import static java.time.ZoneOffset.UTC; import static java.time.format.DateTimeFormatter.RFC_1123_DATE_TIME; +import static java.util.Comparator.comparing; import static org.netpreserve.jwarc.HttpServer.send; /** @@ -27,11 +32,14 @@ * handled client-side by https://github.com/oduwsdl/Reconstructive */ class WarcServer { + private static final DateTimeFormatter ARC_DATE = DateTimeFormatter.ofPattern("yyyyMMddHHmmss").withZone(UTC); private static final MediaType HTML = MediaType.parse("text/html"); + private static final Pattern REPLAY_RE = Pattern.compile("/replay/([0-9]{14})/(.*)"); + private final HttpServer httpServer; - private String firstUrl = null; - private final Map index = new HashMap<>(); + private final Index index = new Index(); private byte[] script = "".getBytes(US_ASCII); + private Entry entrypoint; WarcServer(ServerSocket serverSocket, List warcs) throws IOException { httpServer = new HttpServer(serverSocket, this::handle); @@ -47,10 +55,10 @@ private void index(Path warcFile) throws IOException { WarcCaptureRecord capture = (WarcCaptureRecord) record; String scheme = capture.targetURI().getScheme(); if ("http".equalsIgnoreCase(scheme) || "https".equalsIgnoreCase(scheme)) { - String url = capture.targetURI().toString(); - index.put(url, new IndexEntry(warcFile, reader.position())); - if (firstUrl == null && HTML.equals(capture.payloadType().base())) { - firstUrl = url; + Entry entry = new Entry(capture.targetURI(), capture.date(), warcFile, reader.position()); + index.add(entry); + if (entrypoint == null && HTML.equals(capture.payloadType().base())) { + entrypoint = entry; } } } @@ -67,9 +75,13 @@ void listen() throws IOException { private void handle(Socket socket, String target, HttpRequest request) throws Exception { if (target.equals("/")) { + if (entrypoint == null) { + error(socket, 404, "Empty collection"); + return; + } send(socket, new HttpResponse.Builder(307, "Redirect") .addHeader("Connection", "close") - .addHeader("Location", "/replay/12345678901234/" + firstUrl) + .addHeader("Location", "/replay/" + ARC_DATE.format(entrypoint.date) + "/" + entrypoint.uri) .build()); } else if (target.equals("/__jwarc__/sw.js")) { serve(socket, "sw.js"); @@ -82,50 +94,58 @@ private void handle(Socket socket, String target, HttpRequest request) throws Ex .setHeader("Connection", "close") .build()); } - int i = target.indexOf('/', "/replay/".length()); - if (i != -1) { - target = target.substring(i + 1); + Matcher m = REPLAY_RE.matcher(target); + if (!m.matches()) { + error(socket, 404, "Malformed replay url"); } - replay(socket, target, true); + Instant date = Instant.from(ARC_DATE.parse(m.group(1))); + replay(socket, m.group(2), date, true); } else { - replay(socket, target, false); + Instant date = request.headers().first("Accept-Datetime") + .map(s -> Instant.from(RFC_1123_DATE_TIME.parse(s))) + .orElse(Instant.EPOCH); + replay(socket, target, date, false); } } - private void replay(Socket socket, String target, boolean inject) throws IOException { - IndexEntry entry = index.get(target); - if (entry != null) { - try (FileChannel channel = FileChannel.open(entry.file, READ)) { - channel.position(entry.position); - WarcReader reader = new WarcReader(channel); - WarcResponse record = (WarcResponse) reader.next().get(); - HttpResponse http = record.http(); - HttpResponse.Builder b = new HttpResponse.Builder(http.status(), http.reason()); - for (Map.Entry> e : http.headers().map().entrySet()) { - if (e.getKey().equalsIgnoreCase("Strict-Transport-Security")) continue; - if (e.getKey().equalsIgnoreCase("Transfer-Encoding")) continue; - if (e.getKey().equalsIgnoreCase("Public-Key-Pins")) continue; - for (String value : e.getValue()) { - b.addHeader(e.getKey(), value); - } - } - b.setHeader("Connection", "keep-alive"); - b.setHeader("Memento-Datetime", RFC_1123_DATE_TIME.format(record.date().atOffset(UTC))); - MessageBody body = http.body(); - if (inject && HTML.equals(http.contentType().base())) { - body = LengthedBody.create(body, ByteBuffer.wrap(script), script.length + body.size()); + private void replay(Socket socket, String target, Instant date, boolean inject) throws IOException { + Entry entry = index.closest(URI.create(target), date); + if (entry == null) { + error(socket, 404, "Not found in archive"); + return; + } + try (FileChannel channel = FileChannel.open(entry.file, READ)) { + channel.position(entry.position); + WarcReader reader = new WarcReader(channel); + WarcResponse record = (WarcResponse) reader.next().get(); + HttpResponse http = record.http(); + HttpResponse.Builder b = new HttpResponse.Builder(http.status(), http.reason()); + for (Map.Entry> e : http.headers().map().entrySet()) { + if (e.getKey().equalsIgnoreCase("Strict-Transport-Security")) continue; + if (e.getKey().equalsIgnoreCase("Transfer-Encoding")) continue; + if (e.getKey().equalsIgnoreCase("Public-Key-Pins")) continue; + for (String value : e.getValue()) { + b.addHeader(e.getKey(), value); } - b.body(http.contentType(), body, body.size()); - send(socket, b.build()); } - } else { - send(socket, new HttpResponse.Builder(404, "Not found") - .body(HTML, "Not found".getBytes(UTF_8)) - .setHeader("Connection", "keep-alive") - .build()); + b.setHeader("Connection", "keep-alive"); + b.setHeader("Memento-Datetime", RFC_1123_DATE_TIME.format(record.date().atOffset(UTC))); + MessageBody body = http.body(); + if (inject && HTML.equals(http.contentType().base())) { + body = LengthedBody.create(body, ByteBuffer.wrap(script), script.length + body.size()); + } + b.body(http.contentType(), body, body.size()); + send(socket, b.build()); } } + private void error(Socket socket, int status, String reason) throws IOException { + send(socket, new HttpResponse.Builder(status, reason) + .body(HTML, reason.getBytes(UTF_8)) + .setHeader("Connection", "keep-alive") + .build()); + } + private void serve(Socket socket, String resource) throws IOException { URLConnection conn = getClass().getResource(resource).openConnection(); try (InputStream stream = conn.getInputStream()) { @@ -137,11 +157,45 @@ private void serve(Socket socket, String resource) throws IOException { } } - private static class IndexEntry { + private static class Index { + NavigableSet entries = new TreeSet<>(comparing((Entry e) -> e.urikey).thenComparing(e -> e.date)); + + void add(Entry entry) { + entries.add(entry); + } + + Entry closest(URI uri, Instant date) { + NavigableSet versions = versions(uri); + Entry key = new Entry(uri, date); + Entry a = versions.floor(key); + Entry b = versions.higher(key); + if (a == null) return b; + if (b == null) return a; + Duration da = Duration.between(a.date, date); + Duration db = Duration.between(b.date, date); + return da.compareTo(db) < 0 ? a : b; + } + + NavigableSet versions(URI uri) { + return entries.subSet(new Entry(uri, Instant.MIN), true, new Entry(uri, Instant.MAX), true); + } + } + + private static class Entry { + private final String urikey; + private final URI uri; + private final Instant date; private final Path file; private final long position; - IndexEntry(Path file, long position) { + Entry(URI uri, Instant date) { + this(uri, date, null, -1); + } + + Entry(URI uri, Instant date, Path file, long position) { + urikey = uri.toString(); + this.uri = uri; + this.date = date; this.file = file; this.position = position; }