From d753e41d01d131b2cd00c3280d1f743866ad960a Mon Sep 17 00:00:00 2001 From: Neil Minton Date: Tue, 30 Mar 2021 12:24:27 -0400 Subject: [PATCH 1/5] Prefer UTF charsets when found. --- .../replay/charset/UniversalChardetSniffer.java | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/wayback-core/src/main/java/org/archive/wayback/replay/charset/UniversalChardetSniffer.java b/wayback-core/src/main/java/org/archive/wayback/replay/charset/UniversalChardetSniffer.java index ed6fb3846f..275ee43bf9 100644 --- a/wayback-core/src/main/java/org/archive/wayback/replay/charset/UniversalChardetSniffer.java +++ b/wayback-core/src/main/java/org/archive/wayback/replay/charset/UniversalChardetSniffer.java @@ -30,12 +30,21 @@ public String sniff(Resource resource) { detector.setText(bbuffer); CharsetMatch[] matches = detector.detectAll(); if (matches != null) { - for (int i = 0; i < matches.length; i++) { + + String charsetNextBest = null; + + for (int i = 0; i < matches.length; i++) { charsetName = matches[i].getName(); if (!isDubious(charsetName) && isCharsetSupported(charsetName)) { - return charsetName; + + if (charsetNextBest == null) { charsetNextBest = charsetName; } + + // prefer UTF character sets + if (charsetName.startsWith("UTF")) { return charsetName; } } } + + return charsetNextBest; } } catch (IOException ex) { // From 40a1ef28074426ad8a52169f8f52f846d759b938 Mon Sep 17 00:00:00 2001 From: Neil Minton Date: Tue, 30 Mar 2021 12:24:53 -0400 Subject: [PATCH 2/5] Fix spelling error. 
--- .../archive/wayback/replay/charset/UniversalChardetSniffer.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wayback-core/src/main/java/org/archive/wayback/replay/charset/UniversalChardetSniffer.java b/wayback-core/src/main/java/org/archive/wayback/replay/charset/UniversalChardetSniffer.java index 275ee43bf9..748885a191 100644 --- a/wayback-core/src/main/java/org/archive/wayback/replay/charset/UniversalChardetSniffer.java +++ b/wayback-core/src/main/java/org/archive/wayback/replay/charset/UniversalChardetSniffer.java @@ -54,7 +54,7 @@ public String sniff(Resource resource) { /* * Pretty much nothing in the wild is really UTF-32, - * yet icu4j returns that as the likeliest possiblity + * yet icu4j returns that as the likeliest possibility * for several captures... */ protected boolean isDubious(String charsetName) { From df259c0662df3e7a968085993c8640699e8bed85 Mon Sep 17 00:00:00 2001 From: Neil Minton Date: Tue, 30 Mar 2021 21:23:30 -0400 Subject: [PATCH 3/5] Prefer UTF-8 over other forms of UTF. Some content may come back with UTF-16 or better and not be decoded correctly. 
--- .../archive/wayback/replay/charset/UniversalChardetSniffer.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wayback-core/src/main/java/org/archive/wayback/replay/charset/UniversalChardetSniffer.java b/wayback-core/src/main/java/org/archive/wayback/replay/charset/UniversalChardetSniffer.java index 748885a191..461cd27c1a 100644 --- a/wayback-core/src/main/java/org/archive/wayback/replay/charset/UniversalChardetSniffer.java +++ b/wayback-core/src/main/java/org/archive/wayback/replay/charset/UniversalChardetSniffer.java @@ -40,7 +40,7 @@ public String sniff(Resource resource) { if (charsetNextBest == null) { charsetNextBest = charsetName; } // prefer UTF character sets - if (charsetName.startsWith("UTF")) { return charsetName; } + if (charsetName.startsWith("UTF-8")) { return charsetName; } } } From 8a4768557b19c628221fa48036305154695b2794 Mon Sep 17 00:00:00 2001 From: Neil Minton Date: Mon, 19 Apr 2021 16:23:14 -0400 Subject: [PATCH 4/5] Update Travis CI JDK references. Xenial is the default build for VMs currently. It does not support the older JVMs. https://docs.travis-ci.com/user/reference/xenial/#jvm-clojure-groovy-java-scala-support --- .travis.yml | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/.travis.yml b/.travis.yml index 72280adda8..a0545b4305 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,9 +1,8 @@ +dist: xenial language: java jdk: - - oraclejdk8 - # no longer available - #- oraclejdk7 - - openjdk7 + - openjdk8 + - openjdk10 branches: only: - master From 1b4f4e300939ff0aa19009c84707ea44535b49d6 Mon Sep 17 00:00:00 2001 From: Yuval Langer Date: Fri, 8 Dec 2023 23:09:21 +0200 Subject: [PATCH 5/5] Fix minor typos. 
--- wayback-cdx-server/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/wayback-cdx-server/README.md b/wayback-cdx-server/README.md index 845735b1ec..83bf348938 100644 --- a/wayback-cdx-server/README.md +++ b/wayback-cdx-server/README.md @@ -116,8 +116,8 @@ For example, if given the url: *archive.org/about/* and: The matchType may also be set implicitly by using wildcard '*' at end or beginning of the url: - * If url is ends in '/\*', eg **url=archive.org/\*** the query is equivalent to **url=archive.org/&matchType=prefix** - * if url starts with '\*.', eg **url=\*.archive.org/** the query is equivalent to **url=archive.org/&matchType=domain** + * If url is ends in '/\*', e.g. **url=archive.org/\*** the query is equivalent to **url=archive.org/&matchType=prefix** + * if url starts with '\*.', e.g. **url=\*.archive.org/** the query is equivalent to **url=archive.org/&matchType=domain** (Note: The *domain* mode is only available if the CDX is in SURT-order format.) @@ -359,7 +359,7 @@ Currently two restrictions/permission types are supported: * Access to certain fields, such as filename in the CDX. When restricted, the cdx results contain only public fields. -To allow access, the API key cookie must be explicitly set on the client, eg: +To allow access, the API key cookie must be explicitly set on the client, e.g.: ``` curl -H "Cookie: cdx-auth-token=API-Key-Secret http://mycdxserver/search/cdx?url=..."