diff --git a/.travis.yml b/.travis.yml index 72280adda8..a0545b4305 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,9 +1,8 @@ +dist: xenial language: java jdk: - - oraclejdk8 - # no longer available - #- oraclejdk7 - - openjdk7 + - openjdk8 + - openjdk10 branches: only: - master diff --git a/wayback-cdx-server/README.md b/wayback-cdx-server/README.md index 845735b1ec..83bf348938 100644 --- a/wayback-cdx-server/README.md +++ b/wayback-cdx-server/README.md @@ -116,8 +116,8 @@ For example, if given the url: *archive.org/about/* and: The matchType may also be set implicitly by using wildcard '*' at end or beginning of the url: - * If url is ends in '/\*', eg **url=archive.org/\*** the query is equivalent to **url=archive.org/&matchType=prefix** - * if url starts with '\*.', eg **url=\*.archive.org/** the query is equivalent to **url=archive.org/&matchType=domain** + * If url ends in '/\*', e.g. **url=archive.org/\*** the query is equivalent to **url=archive.org/&matchType=prefix** + * If url starts with '\*.', e.g. **url=\*.archive.org/** the query is equivalent to **url=archive.org/&matchType=domain** (Note: The *domain* mode is only available if the CDX is in SURT-order format.) @@ -359,7 +359,7 @@ Currently two restrictions/permission types are supported: * Access to certain fields, such as filename in the CDX. When restricted, the cdx results contain only public fields. -To allow access, the API key cookie must be explicitly set on the client, eg: +To allow access, the API key cookie must be explicitly set on the client, e.g.: ``` curl -H "Cookie: cdx-auth-token=API-Key-Secret http://mycdxserver/search/cdx?url=..." 
diff --git a/wayback-core/src/main/java/org/archive/wayback/replay/charset/UniversalChardetSniffer.java b/wayback-core/src/main/java/org/archive/wayback/replay/charset/UniversalChardetSniffer.java index ed6fb3846f..461cd27c1a 100644 --- a/wayback-core/src/main/java/org/archive/wayback/replay/charset/UniversalChardetSniffer.java +++ b/wayback-core/src/main/java/org/archive/wayback/replay/charset/UniversalChardetSniffer.java @@ -30,12 +30,21 @@ public String sniff(Resource resource) { detector.setText(bbuffer); CharsetMatch[] matches = detector.detectAll(); if (matches != null) { - for (int i = 0; i < matches.length; i++) { + + String charsetNextBest = null; + + for (int i = 0; i < matches.length; i++) { charsetName = matches[i].getName(); if (!isDubious(charsetName) && isCharsetSupported(charsetName)) { - return charsetName; + + if (charsetNextBest == null) { charsetNextBest = charsetName; } + + // prefer UTF character sets + if (charsetName.startsWith("UTF-8")) { return charsetName; } } } + + return charsetNextBest; } } catch (IOException ex) { // @@ -45,7 +54,7 @@ public String sniff(Resource resource) { /* * Pretty much nothing in the wild is really UTF-32, - * yet icu4j returns that as the likeliest possiblity + * yet icu4j returns that as the likeliest possibility * for several captures... */ protected boolean isDubious(String charsetName) {