Skip to content

Commit

Permalink
Use Charset.forName to better cache charset lookups
Browse files Browse the repository at this point in the history
  • Loading branch information
hazendaz authored Jun 24, 2022
1 parent 38b3224 commit b873e21
Show file tree
Hide file tree
Showing 5 changed files with 11 additions and 14 deletions.
2 changes: 1 addition & 1 deletion src/main/java/org/jsoup/helper/DataUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,7 @@ else if (first instanceof Comment) {
if (doc == null) {
if (charsetName == null)
charsetName = defaultCharsetName;
BufferedReader reader = new BufferedReader(new InputStreamReader(input, charsetName), bufferSize); // Android level does not allow us try-with-resources
BufferedReader reader = new BufferedReader(new InputStreamReader(input, Charset.forName(charsetName)), bufferSize); // Android level does not allow us try-with-resources
try {
if (bomCharset != null && bomCharset.offset) { // creating the buffered reader ignores the input pos, so must skip here
long skipped = reader.skip(1);
Expand Down
2 changes: 1 addition & 1 deletion src/main/java/org/jsoup/helper/HttpConnection.java
Original file line number Diff line number Diff line change
Expand Up @@ -1163,7 +1163,7 @@ else if (needsMultipart(req)) {

private static void writePost(final Connection.Request req, final OutputStream outputStream, @Nullable final String boundary) throws IOException {
final Collection<Connection.KeyVal> data = req.data();
final BufferedWriter w = new BufferedWriter(new OutputStreamWriter(outputStream, req.postDataCharset()));
final BufferedWriter w = new BufferedWriter(new OutputStreamWriter(outputStream, Charset.forName(req.postDataCharset())));

if (boundary != null) {
// boundary will be set if we're in multipart mode
Expand Down
12 changes: 4 additions & 8 deletions src/test/java/org/jsoup/helper/DataUtilTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import org.junit.jupiter.api.Test;

import java.io.*;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;

Expand Down Expand Up @@ -37,12 +38,7 @@ private InputStream stream(String data) {
}

private InputStream stream(String data, String charset) {
try {
return new ByteArrayInputStream(data.getBytes(charset));
} catch (UnsupportedEncodingException e) {
fail();
}
return null;
return new ByteArrayInputStream(data.getBytes(Charset.forName(charset)));
}

@Test
Expand Down Expand Up @@ -180,7 +176,7 @@ public void supportsUTF8BOM() throws IOException {

@Test
public void noExtraNULLBytes() throws IOException {
final byte[] b = "<html><head><meta charset=\"UTF-8\"></head><body><div><u>ü</u>ü</div></body></html>".getBytes("UTF-8");
final byte[] b = "<html><head><meta charset=\"UTF-8\"></head><body><div><u>ü</u>ü</div></body></html>".getBytes(StandardCharsets.UTF_8);

Document doc = Jsoup.parse(new ByteArrayInputStream(b), null, "");
assertFalse( doc.outerHtml().contains("\u0000") );
Expand All @@ -201,7 +197,7 @@ public void supportsXmlCharsetDeclaration() throws IOException {
"<?xml version=\"1.0\" encoding=\"iso-8859-1\"?>" +
"<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">" +
"<html xmlns=\"http://www.w3.org/1999/xhtml\" lang=\"en\" xml:lang=\"en\">Hellö Wörld!</html>"
).getBytes(encoding));
).getBytes(Charset.forName(encoding)));

Document doc = Jsoup.parse(soup, null, "");
assertEquals("Hellö Wörld!", doc.body().text());
Expand Down
3 changes: 2 additions & 1 deletion src/test/java/org/jsoup/parser/ParserTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;

import static org.junit.jupiter.api.Assertions.assertEquals;

Expand All @@ -31,7 +32,7 @@ public void unescapeEntitiesHandlesLargeInput() {
@Test
public void testUtf8() throws IOException {
// testcase for https://github.com/jhy/jsoup/issues/1557. no repro.
Document parsed = Jsoup.parse(new ByteArrayInputStream("<p>H\u00E9llo, w\u00F6rld!".getBytes("UTF-8")), null, "");
Document parsed = Jsoup.parse(new ByteArrayInputStream("<p>H\u00E9llo, w\u00F6rld!".getBytes(StandardCharsets.UTF_8)), null, "");
String text = parsed.selectFirst("p").wholeText();
assertEquals(text, "H\u00E9llo, w\u00F6rld!");
}
Expand Down
6 changes: 3 additions & 3 deletions src/test/java/org/jsoup/parser/TokeniserTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import org.jsoup.select.Elements;
import org.junit.jupiter.api.Test;

import java.io.UnsupportedEncodingException;
import java.nio.charset.Charset;
import java.util.Arrays;

import static org.jsoup.parser.CharacterReader.maxBufferLen;
Expand Down Expand Up @@ -165,9 +165,9 @@ public void bufferUpInAttributeVal() {
assertEquals(1, parser.getErrors().size());
}

@Test public void cp1252SubstitutionTable() throws UnsupportedEncodingException {
@Test public void cp1252SubstitutionTable() {
for (int i = 0; i < Tokeniser.win1252Extensions.length; i++) {
String s = new String(new byte[]{ (byte) (i + Tokeniser.win1252ExtensionsStart) }, "Windows-1252");
String s = new String(new byte[]{ (byte) (i + Tokeniser.win1252ExtensionsStart) }, Charset.forName("Windows-1252"));
assertEquals(1, s.length());

// some of these characters are illegal
Expand Down

0 comments on commit b873e21

Please sign in to comment.