Skip to content

Commit 530c5b0

Browse files
committed
Refactored fuzz tests to iterate all files in directory; run timeout tests
1 parent d2c455c commit 530c5b0

File tree

6 files changed

+132
-203
lines changed

6 files changed

+132
-203
lines changed

src/main/java/org/jsoup/Jsoup.java

Lines changed: 23 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -108,31 +108,48 @@ public static Connection newSession() {
108108
/**
109109
Parse the contents of a file as HTML.
110110
111-
@param in file to load HTML from
111+
@param file file to load HTML from. Supports gzipped files (ending in .z or .gz).
112112
@param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if
113113
present, or fall back to {@code UTF-8} (which is often safe to do).
114114
@param baseUri The URL where the HTML was retrieved from, to resolve relative links against.
115115
@return sane HTML
116116
117117
@throws IOException if the file could not be found, or read, or if the charsetName is invalid.
118118
*/
119-
public static Document parse(File in, @Nullable String charsetName, String baseUri) throws IOException {
120-
return DataUtil.load(in, charsetName, baseUri);
119+
public static Document parse(File file, @Nullable String charsetName, String baseUri) throws IOException {
120+
return DataUtil.load(file, charsetName, baseUri);
121121
}
122122

123123
/**
124124
Parse the contents of a file as HTML. The location of the file is used as the base URI to qualify relative URLs.
125125
126-
@param in file to load HTML from
126+
@param file file to load HTML from. Supports gzipped files (ending in .z or .gz).
127127
@param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if
128128
present, or fall back to {@code UTF-8} (which is often safe to do).
129129
@return sane HTML
130130
131131
@throws IOException if the file could not be found, or read, or if the charsetName is invalid.
132132
@see #parse(File, String, String)
133133
*/
134-
public static Document parse(File in, @Nullable String charsetName) throws IOException {
135-
return DataUtil.load(in, charsetName, in.getAbsolutePath());
134+
public static Document parse(File file, @Nullable String charsetName) throws IOException {
135+
return DataUtil.load(file, charsetName, file.getAbsolutePath());
136+
}
137+
138+
/**
139+
Parse the contents of a file using the supplied parser (for example, as XML via {@link Parser#xmlParser()}).
140+
141+
@param file file to load HTML from. Supports gzipped files (ending in .z or .gz).
142+
@param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if
143+
present, or fall back to {@code UTF-8} (which is often safe to do).
144+
@param baseUri The URL where the HTML was retrieved from, to resolve relative links against.
145+
@param parser alternate {@link Parser#xmlParser() parser} to use.
146+
@return sane HTML
147+
148+
@throws IOException if the file could not be found, or read, or if the charsetName is invalid.
149+
@since 1.14.2
150+
*/
151+
public static Document parse(File file, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
152+
return DataUtil.load(file, charsetName, baseUri, parser);
136153
}
137154

138155
/**

src/main/java/org/jsoup/helper/DataUtil.java

Lines changed: 24 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -49,20 +49,38 @@ public final class DataUtil {
4949

5050
private DataUtil() {}
5151

52+
/**
53+
* Loads and parses a file to a Document, with the HtmlParser. Files that are compressed with gzip (and end in {@code .gz} or {@code .z})
54+
* are supported in addition to uncompressed files.
55+
*
56+
* @param file file to load
57+
* @param charsetName (optional) character set of input; specify {@code null} to attempt to autodetect. A BOM in
58+
* the file will always override this setting.
59+
* @param baseUri base URI of document, to resolve relative links against
60+
* @return Document
61+
* @throws IOException on IO error
62+
*/
63+
public static Document load(File file, @Nullable String charsetName, String baseUri) throws IOException {
64+
return load(file, charsetName, baseUri, Parser.htmlParser());
65+
}
66+
5267
/**
5368
* Loads and parses a file to a Document. Files that are compressed with gzip (and end in {@code .gz} or {@code .z})
5469
* are supported in addition to uncompressed files.
5570
*
56-
* @param in file to load
71+
* @param file file to load
5772
* @param charsetName (optional) character set of input; specify {@code null} to attempt to autodetect. A BOM in
5873
* the file will always override this setting.
5974
* @param baseUri base URI of document, to resolve relative links against
75+
* @param parser alternate {@link Parser#xmlParser() parser} to use.
76+
6077
* @return Document
6178
* @throws IOException on IO error
79+
* @since 1.14.2
6280
*/
63-
public static Document load(File in, @Nullable String charsetName, String baseUri) throws IOException {
64-
InputStream stream = new FileInputStream(in);
65-
String name = Normalizer.lowerCase(in.getName());
81+
public static Document load(File file, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
82+
InputStream stream = new FileInputStream(file);
83+
String name = Normalizer.lowerCase(file.getName());
6684
if (name.endsWith(".gz") || name.endsWith(".z")) {
6785
// unfortunately file input streams don't support marks (why not?), so we will close and reopen after read
6886
boolean zipped;
@@ -72,9 +90,9 @@ public static Document load(File in, @Nullable String charsetName, String baseUr
7290
stream.close();
7391

7492
}
75-
stream = zipped ? new GZIPInputStream(new FileInputStream(in)) : new FileInputStream(in);
93+
stream = zipped ? new GZIPInputStream(new FileInputStream(file)) : new FileInputStream(file);
7694
}
77-
return parseInputStream(stream, charsetName, baseUri, Parser.htmlParser());
95+
return parseInputStream(stream, charsetName, baseUri, parser);
7896
}
7997

8098
/**
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
package org.jsoup.integration;
2+
3+
import org.jsoup.Jsoup;
4+
import org.jsoup.nodes.Document;
5+
import org.jsoup.parser.Parser;
6+
import org.junit.jupiter.api.Assertions;
7+
import org.junit.jupiter.params.ParameterizedTest;
8+
import org.junit.jupiter.params.provider.MethodSource;
9+
10+
import java.io.File;
11+
import java.io.IOException;
12+
import java.util.stream.Stream;
13+
14+
import static org.junit.jupiter.api.Assertions.assertNotNull;
15+
import static org.junit.jupiter.api.Assertions.assertTrue;
16+
17+
/**
18+
Tests fixes for issues raised by the OSS Fuzz project (https://oss-fuzz.com/testcases?project=jsoup). As some of these
19+
are timeout tests, each file is parsed {@code numIters} times and must complete within {@code timeout} seconds.
20+
*/
21+
public class FuzzFixesIT {
22+
static int numIters = 50;
23+
static int timeout = 20; // external fuzzer is set to 60 for 100 runs
24+
static File testDir = ParseTest.getFile("/fuzztests/");
25+
26+
private static Stream<File> testFiles() {
27+
File[] files = testDir.listFiles();
28+
assertNotNull(files);
29+
assertTrue(files.length > 10);
30+
31+
return Stream.of(files);
32+
}
33+
34+
@ParameterizedTest
35+
@MethodSource("testFiles")
36+
void testHtmlParse(File file) throws IOException {
37+
long startTime = System.currentTimeMillis();
38+
long completeBy = startTime + timeout * 1000L;
39+
40+
for (int i = 0; i < numIters; i++) {
41+
Document doc = Jsoup.parse(file, "UTF-8", "https://example.com/");
42+
assertNotNull(doc);
43+
if (System.currentTimeMillis() > completeBy)
44+
Assertions.fail(String.format("Timeout: only completed %d iters of [%s] in %d seconds", i, file.getName(), timeout));
45+
}
46+
}
47+
48+
@ParameterizedTest
49+
@MethodSource("testFiles")
50+
void testXmlParse(File file) throws IOException {
51+
long startTime = System.currentTimeMillis();
52+
long completeBy = startTime + timeout * 1000L;
53+
54+
for (int i = 0; i < numIters; i++) {
55+
Document doc = Jsoup.parse(file, "UTF-8", "https://example.com/", Parser.xmlParser());
56+
assertNotNull(doc);
57+
if (System.currentTimeMillis() > completeBy)
58+
Assertions.fail(String.format("Timeout: only completed %d iters of [%s] in %d seconds", i, file.getName(), timeout));
59+
}
60+
}
61+
}

0 commit comments

Comments
 (0)