Skip to content

Commit

Permalink
Clean room implementation of detectCharsetFromBOM.
Browse files Browse the repository at this point in the history
Drops dependency.
Closes GitHub issue #400.
  • Loading branch information
lapo-luchini authored and asturio committed Aug 28, 2020
1 parent 630c9c5 commit abd765e
Show file tree
Hide file tree
Showing 3 changed files with 35 additions and 11 deletions.
3 changes: 1 addition & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,6 @@ Make sure that your contributions can be released with a dual LGPL and MPL licen
## Dependencies ##
### Required Dependencies: ###
- Java 8 or later is required to use OpenPDF. All versions from Java 8 through OpenJDK 13 have been tested and are known to work.
- [Juniversalchardet](https://github.com/albfernandez/juniversalchardet)

### Optional: ###

Expand Down Expand Up @@ -94,7 +93,7 @@ Significant [Contributors to OpenPDF](https://github.com/LibrePDF/OpenPDF/graphs
[@ubermichael](https://github.com/ubermichael) - Michael Joyce
[@weiyeh](https://github.com/weiyeh)
[@SuperPat45](https://github.com/SuperPat45)
[@lapo-luchini](https://github.com/lapo-luchini)
[@lapo-luchini](https://github.com/lapo-luchini) - Lapo Luchini
[@MartinKocour](https://github.com/MartinKocour) - Martin Kocour
[@jokimaki](https://github.com/jokimaki)
[@sullis](https://github.com/sullis)
Expand Down
6 changes: 0 additions & 6 deletions openpdf/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -13,16 +13,10 @@

<properties>
<java-module-name>com.github.librepdf.openpdf</java-module-name>
<juniversalchardet.version>2.3.2</juniversalchardet.version>
<imageio-tiff.version>3.5</imageio-tiff.version>
</properties>

<dependencies>
<dependency>
<groupId>com.github.albfernandez</groupId>
<artifactId>juniversalchardet</artifactId>
<version>${juniversalchardet.version}</version>
</dependency>
<dependency>
<groupId>org.bouncycastle</groupId>
<artifactId>bcprov-jdk15on</artifactId>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,6 @@
package com.lowagie.text.xml.simpleparser;

import com.lowagie.text.error_messages.MessageLocalization;
import org.mozilla.universalchardet.UniversalDetector;

import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
Expand Down Expand Up @@ -550,7 +549,39 @@ public static void parse(SimpleXMLDocHandler doc, SimpleXMLDocHandlerComment com
SimpleXMLParser parser = new SimpleXMLParser(doc, comment, html);
parser.go(r);
}


/**
 * Detects the charset announced by a leading byte order mark (BOM), as per the
 * <a href="https://unicode.org/faq/utf_bom.html#bom4">Unicode FAQ</a>.
 *
 * <p>Recognized signatures:
 * <pre>
 * 00 00 FE FF  UTF-32BE
 * EF BB BF ..  UTF-8
 * FE FF .. ..  UTF-16BE
 * FF FE 00 00  UTF-32LE
 * FF FE .. ..  UTF-16LE
 * </pre>
 *
 * <p>Only the bytes actually present are inspected, so a {@code null} array or one
 * that is too short to hold a given signature never causes an exception; it simply
 * does not match that signature.
 *
 * @param bom the first bytes of the stream (normally four); may be {@code null} or short
 * @return the charset name implied by the BOM, or {@code null} if no BOM is recognized
 */
private static String detectCharsetFromBOM(byte[] bom) {
    // Every BOM is at least two bytes long; anything shorter cannot match.
    if (bom == null || bom.length < 2) {
        return null;
    }
    final int length = bom.length;
    switch (bom[0]) {
        case (byte) 0x00:
            if (length >= 4 && bom[1] == (byte) 0x00 && bom[2] == (byte) 0xFE && bom[3] == (byte) 0xFF) {
                return "UTF-32BE";
            }
            break;
        case (byte) 0xEF:
            if (length >= 3 && bom[1] == (byte) 0xBB && bom[2] == (byte) 0xBF) {
                return "UTF-8";
            }
            break;
        case (byte) 0xFE:
            if (bom[1] == (byte) 0xFF) {
                return "UTF-16BE";
            }
            break;
        case (byte) 0xFF:
            if (bom[1] == (byte) 0xFE) {
                // FF FE is a prefix of the UTF-32LE BOM, so the longer signature
                // must be tested first; fall back to UTF-16LE otherwise.
                if (length >= 4 && bom[2] == (byte) 0x00 && bom[3] == (byte) 0x00) {
                    return "UTF-32LE";
                }
                return "UTF-16LE";
            }
            break;
        default:
            break;
    }
    return null;
}

/**
* Parses the XML document firing the events to the handler.
* @param doc the document handler
Expand All @@ -562,7 +593,7 @@ public static void parse(SimpleXMLDocHandler doc, InputStream in) throws IOExcep
int count = in.read(b4);
if (count != 4)
throw new IOException(MessageLocalization.getComposedMessage("insufficient.length"));
String encoding = UniversalDetector.detectCharsetFromBOM(b4);
String encoding = detectCharsetFromBOM(b4);
if (encoding == null) encoding = "UTF-8"; //UTF-8 is default.

String decl = null;
Expand Down

0 comments on commit abd765e

Please sign in to comment.