Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Tool to extract a WARC record (or its headers or payload) #41

Merged
merged 1 commit into from
May 29, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions src/org/netpreserve/jwarc/IOUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,12 @@ public static void copy(InputStream inputStream, OutputStream outputStream) thro
}
}

/**
 * Wraps the given channel in a {@code GunzipChannel}.
 *
 * NOTE(review): GunzipChannel is defined elsewhere; judging by its name it
 * presumably yields the gunzipped form of the bytes read from {@code gzipped}.
 *
 * @param gzipped the channel handed to the {@code GunzipChannel}
 * @return a new {@code GunzipChannel} over {@code gzipped}
 * @throws IOException declared by this method; the visible code does not throw it directly
 */
public static ReadableByteChannel gunzipChannel(ReadableByteChannel gzipped) throws IOException {
    final int capacity = 8192;
    ByteBuffer emptyReadBuffer = ByteBuffer.allocate(capacity);
    // Flipping a freshly allocated buffer sets its limit to zero, so the
    // buffer is handed over with no bytes remaining.
    emptyReadBuffer.flip();
    return new GunzipChannel(gzipped, emptyReadBuffer);
}

static Socket connect(String scheme, String host, int port) throws IOException {
Objects.requireNonNull(host);
if ("http".equalsIgnoreCase(scheme)) {
Expand Down
156 changes: 156 additions & 0 deletions src/org/netpreserve/jwarc/tools/ExtractTool.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
/*
* SPDX-License-Identifier: Apache-2.0
* Copyright (C) 2020 National Library of Australia and the jwarc contributors
*/

package org.netpreserve.jwarc.tools;

import org.netpreserve.jwarc.*;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.Channels;
import java.nio.channels.FileChannel;
import java.nio.channels.ReadableByteChannel;
import java.nio.channels.WritableByteChannel;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Collections;
import java.util.List;
import java.util.Optional;

import static java.nio.charset.StandardCharsets.UTF_8;

/**
 * Command-line tool that reads the WARC record found at a given byte offset
 * of a WARC file and writes the whole record, only its headers, or only its
 * payload to standard output.
 */
public class ExtractTool {

    /** What part of the record to write to standard output. */
    private enum ExtractAction { RECORD, HEADERS, PAYLOAD }

    /**
     * Writes every remaining byte of {@code data} to {@code out}.
     *
     * A single {@link WritableByteChannel#write(ByteBuffer)} call is allowed to
     * write fewer bytes than remain in the buffer, so the call is repeated
     * until the buffer is empty.
     */
    private static void writeFully(WritableByteChannel out, ByteBuffer data) throws IOException {
        while (data.hasRemaining()) {
            out.write(data);
        }
    }

    /**
     * Writes the WARC version line, the record's WARC headers and the
     * terminating empty line, encoded as UTF-8.
     */
    private static void writeWarcHeaders(WritableByteChannel out, WarcRecord record) throws IOException {
        StringBuilder sb = new StringBuilder();
        sb.append(record.version().toString()).append("\r\n");
        record.headers().appendTo(sb);
        sb.append("\r\n");
        writeFully(out, ByteBuffer.wrap(sb.toString().getBytes(UTF_8)));
    }

    /**
     * Writes the serialized HTTP header of a response or request record.
     * Records of any other type produce no output.
     */
    private static void writeHttpHeaders(WritableByteChannel out, WarcRecord record) throws IOException {
        if (record instanceof WarcResponse) {
            HttpResponse response = ((WarcResponse) record).http();
            writeFully(out, ByteBuffer.wrap(response.serializeHeader()));
        } else if (record instanceof WarcRequest) {
            HttpRequest request = ((WarcRequest) record).http();
            writeFully(out, ByteBuffer.wrap(request.serializeHeader()));
        }
    }

    /**
     * Writes the payload of the record: the HTTP message body for response
     * and request records, the record body otherwise.
     *
     * A single {@code gzip} / {@code x-gzip} Content-Encoding is decoded via
     * {@code IOUtils.gunzipChannel}. For any other encoding, or for multiple
     * encodings, a message is printed to standard error and nothing is
     * written to {@code out}.
     */
    private static void writePayload(WritableByteChannel out, WarcRecord record) throws IOException {
        MessageBody payload;
        List<String> contentEncodings = Collections.emptyList();
        if (record instanceof WarcResponse) {
            HttpResponse response = ((WarcResponse) record).http();
            payload = response.body();
            contentEncodings = response.headers().all("Content-Encoding");
        } else if (record instanceof WarcRequest) {
            HttpRequest request = ((WarcRequest) record).http();
            payload = request.body();
            contentEncodings = request.headers().all("Content-Encoding");
        } else {
            payload = record.body();
        }
        if (contentEncodings.isEmpty()) {
            writeBody(out, payload);
        } else if (contentEncodings.size() > 1) {
            System.err.println("Multiple Content-Encodings not supported: " + contentEncodings);
        } else {
            String encoding = contentEncodings.get(0);
            if (encoding.equalsIgnoreCase("gzip") || encoding.equalsIgnoreCase("x-gzip")) {
                writeBody(out, IOUtils.gunzipChannel(payload));
            } else {
                System.err.println("Content-Encoding not supported: " + encoding);
            }
        }
    }

    /**
     * Copies {@code body} to {@code out} until {@code body} reports end-of-stream.
     *
     * Uses the copy loop from the {@link ByteBuffer#compact()} documentation:
     * bytes a short write leaves behind are kept by {@code compact()} and,
     * once the source is exhausted, drained by the final loop so that no
     * trailing data is lost.
     */
    private static void writeBody(WritableByteChannel out, ReadableByteChannel body) throws IOException {
        ByteBuffer buffer = ByteBuffer.allocate(8192);
        while (body.read(buffer) > -1) {
            buffer.flip();
            out.write(buffer);
            buffer.compact();
        }
        // End of input: flush whatever earlier partial writes left in the buffer.
        buffer.flip();
        writeFully(out, buffer);
    }

    /**
     * Prints the usage message to standard error and terminates the JVM
     * with the given exit status.
     */
    private static void usage(int exitValue) {
        System.err.println("");
        System.err.println("ExtractTool [-h] [--payload | --headers] filename offset");
        System.err.println("");
        System.err.println("Options:");
        System.err.println("");
        System.err.println(" --headers\toutput only record (and HTTP) headers");
        System.err.println(" --payload\toutput only record payload, if necessary");
        System.err.println(" \tdecode transfer and/or content encoding");
        System.exit(exitValue);
    }

    /**
     * Entry point. Expects an optional {@code --headers} or {@code --payload}
     * flag followed by the WARC file name and the byte offset of the record.
     *
     * @param args command-line arguments
     * @throws IOException if reading the WARC file or writing to standard output fails
     */
    public static void main(String[] args) throws IOException {
        ExtractAction action = ExtractAction.RECORD;
        Path warcFile = null;
        long offset = -1; // -1 means "no offset given yet"
        for (String arg : args) {
            switch (arg) {
                case "-h":
                case "--help":
                    usage(0);
                    break; // usage() exits; the break guards against fall-through regardless
                case "--headers":
                    action = ExtractAction.HEADERS;
                    break;
                case "--payload":
                    action = ExtractAction.PAYLOAD;
                    break;
                default:
                    if (warcFile == null) {
                        warcFile = Paths.get(arg);
                        if (!warcFile.toFile().canRead()) {
                            System.err.println("Cannot read WARC file: " + warcFile);
                            usage(1);
                        }
                    } else if (offset == -1) {
                        try {
                            offset = Long.parseLong(arg);
                        } catch (NumberFormatException e) {
                            System.err.println(e.getMessage());
                            usage(1);
                        }
                        if (offset < 0) {
                            // FileChannel.position rejects negative positions with an
                            // IllegalArgumentException; report a usage error instead.
                            System.err.println("Invalid offset: " + arg);
                            usage(1);
                        }
                    } else {
                        System.err.println("Unknown argument: " + arg);
                        usage(1);
                    }
            }
        }
        if (warcFile == null || offset == -1) {
            usage(1);
        }
        try (FileChannel channel = FileChannel.open(warcFile);
             WarcReader reader = new WarcReader(channel.position(offset))) {
            Optional<WarcRecord> record = reader.next();
            if (!record.isPresent()) {
                System.err.println("No record found at position " + offset);
                System.exit(1);
            }
            WritableByteChannel out = Channels.newChannel(System.out);
            switch (action) {
                case RECORD:
                    writeWarcHeaders(out, record.get());
                    writeBody(out, record.get().body());
                    break;
                case HEADERS:
                    writeWarcHeaders(out, record.get());
                    writeHttpHeaders(out, record.get());
                    break;
                case PAYLOAD:
                    writePayload(out, record.get());
                    break;
            }
            // Make sure nothing stays buffered in the underlying PrintStream.
            System.out.flush();
        }
    }
}
4 changes: 4 additions & 0 deletions src/org/netpreserve/jwarc/tools/WarcTool.java
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,9 @@ public static void main(String[] args) throws Exception {
case "cdx":
CdxTool.main(rest);
break;
case "extract":
ExtractTool.main(rest);
break;
case "fetch":
FetchTool.main(rest);
break;
Expand Down Expand Up @@ -59,6 +62,7 @@ private static void usage() {
System.out.println("Commands:");
System.out.println("");
System.out.println(" cdx List records in CDX format");
System.out.println(" extract Extract record by offset");
System.out.println(" fetch Download a URL recording the request and response");
System.out.println(" filter Copy records that match a given filter expression");
System.out.println(" ls List records in WARC file(s)");
Expand Down