diff --git a/.mvn/jvm.config b/.mvn/jvm.config new file mode 100644 index 0000000000..e2a50e0834 --- /dev/null +++ b/.mvn/jvm.config @@ -0,0 +1 @@ +--add-exports jdk.compiler/com.sun.tools.javac.api=ALL-UNNAMED --add-exports jdk.compiler/com.sun.tools.javac.file=ALL-UNNAMED --add-exports jdk.compiler/com.sun.tools.javac.parser=ALL-UNNAMED --add-exports jdk.compiler/com.sun.tools.javac.tree=ALL-UNNAMED --add-exports jdk.compiler/com.sun.tools.javac.util=ALL-UNNAMED \ No newline at end of file diff --git a/tika-core/pom.xml b/tika-core/pom.xml index 8426ac55ff..1fbef62281 100644 --- a/tika-core/pom.xml +++ b/tika-core/pom.xml @@ -34,6 +34,10 @@ Apache Tika core https://tika.apache.org/ + + 5.3 + + @@ -130,6 +134,46 @@ + + com.cosium.code + git-code-format-maven-plugin + ${git-code-format-maven-plugin.version} + + + + install-formatter-hook + + install-hooks + + + + + validate-code-format + + validate-code-format + + + + + + + com.cosium.code + google-java-format + ${git-code-format-maven-plugin.version} + + + + + true + false + false + false + + + + org.apache.felix maven-bundle-plugin diff --git a/tika-core/src/main/java/org/apache/tika/Tika.java b/tika-core/src/main/java/org/apache/tika/Tika.java index 22811f9c0b..23f4769c59 100644 --- a/tika-core/src/main/java/org/apache/tika/Tika.java +++ b/tika-core/src/main/java/org/apache/tika/Tika.java @@ -24,9 +24,6 @@ import java.net.URL; import java.nio.file.Path; import java.util.Properties; - -import org.xml.sax.SAXException; - import org.apache.tika.config.TikaConfig; import org.apache.tika.detect.Detector; import org.apache.tika.exception.TikaException; @@ -41,11 +38,12 @@ import org.apache.tika.parser.ParsingReader; import org.apache.tika.sax.BodyContentHandler; import org.apache.tika.sax.WriteOutContentHandler; +import org.xml.sax.SAXException; /** - * Facade class for accessing Tika functionality. 
This class hides much of - * the underlying complexity of the lower level Tika classes and provides - * simple methods for many common parsing and type detection operations. + * Facade class for accessing Tika functionality. This class hides much of the underlying complexity + * of the lower level Tika classes and provides simple methods for many common parsing and type + * detection operations. * * @see Parser * @see Detector @@ -53,25 +51,18 @@ */ public class Tika { - /** - * The detector instance used by this facade. - */ + /** The detector instance used by this facade. */ private final Detector detector; - /** - * The parser instance used by this facade. - */ + /** The parser instance used by this facade. */ private final Parser parser; - /** - * The Translator instance used by this facade. - */ + /** The Translator instance used by this facade. */ private final Translator translator; /** - * Maximum length of the strings returned by the parseToString methods. - * Used to prevent out of memory problems with huge input documents. - * The default setting is 100k characters. + * Maximum length of the strings returned by the parseToString methods. Used to prevent out of + * memory problems with huge input documents. The default setting is 100k characters. */ private int maxStringLength = 100 * 1000; @@ -80,7 +71,7 @@ public class Tika { * Translator. * * @param detector type detector - * @param parser document parser + * @param parser document parser * @since Apache Tika 0.8 */ public Tika(Detector detector, Parser parser) { @@ -92,8 +83,8 @@ public Tika(Detector detector, Parser parser) { /** * Creates a Tika facade using the given detector, parser, and translator instances. 
* - * @param detector type detector - * @param parser document parser + * @param detector type detector + * @param parser document parser * @param translator text translator * @since Apache Tika 1.6 */ @@ -112,16 +103,14 @@ public Tika(TikaConfig config) { this(config.getDetector(), new AutoDetectParser(config), config.getTranslator()); } - /** - * Creates a Tika facade using the default configuration. - */ + /** Creates a Tika facade using the default configuration. */ public Tika() { this(TikaConfig.getDefaultConfig()); } /** - * Creates a Tika facade using the given detector instance, the - * default parser configuration, and the default Translator. + * Creates a Tika facade using the given detector instance, the default parser configuration, + * and the default Translator. * * @param detector type detector * @since Apache Tika 0.8 @@ -130,25 +119,21 @@ public Tika(Detector detector) { this(detector, new AutoDetectParser(detector)); } - /** - * Detects the media type of the given document. The type detection is - * based on the content of the given document stream and any given - * document metadata. The document stream can be null, - * in which case only the given document metadata is used for type - * detection. - *

- * If the document stream supports the - * {@link InputStream#markSupported() mark feature}, then the stream is - * marked and reset to the original position before this method returns. - * Only a limited number of bytes are read from the stream. - *

- * The given document stream is not closed by this method. - *

- * Unlike in the {@link #parse(InputStream, Metadata)} method, the - * given document metadata is not modified by this method. + * Detects the media type of the given document. The type detection is based on the content of + * the given document stream and any given document metadata. The document stream can be + * null, in which case only the given document metadata is used for type detection. + * + *

If the document stream supports the {@link InputStream#markSupported() mark feature}, then + * the stream is marked and reset to the original position before this method returns. Only a + * limited number of bytes are read from the stream. * - * @param stream the document stream, or null + *

The given document stream is not closed by this method. + * + *

Unlike in the {@link #parse(InputStream, Metadata)} method, the given document metadata is + * not modified by this method. + * + * @param stream the document stream, or null * @param metadata document metadata * @return detected media type * @throws IOException if the stream can not be read @@ -162,19 +147,17 @@ public String detect(InputStream stream, Metadata metadata) throws IOException { } /** - * Detects the media type of the given document. The type detection is - * based on the content of the given document stream and the name of the - * document. - *

- * If the document stream supports the - * {@link InputStream#markSupported() mark feature}, then the stream is - * marked and reset to the original position before this method returns. - * Only a limited number of bytes are read from the stream. - *

- * The given document stream is not closed by this method. + * Detects the media type of the given document. The type detection is based on the content of + * the given document stream and the name of the document. + * + *

If the document stream supports the {@link InputStream#markSupported() mark feature}, then + * the stream is marked and reset to the original position before this method returns. Only a + * limited number of bytes are read from the stream. + * + *

The given document stream is not closed by this method. * * @param stream the document stream - * @param name document name + * @param name document name * @return detected media type * @throws IOException if the stream can not be read * @since Apache Tika 0.9 @@ -186,15 +169,14 @@ public String detect(InputStream stream, String name) throws IOException { } /** - * Detects the media type of the given document. The type detection is - * based on the content of the given document stream. - *

- * If the document stream supports the - * {@link InputStream#markSupported() mark feature}, then the stream is - * marked and reset to the original position before this method returns. - * Only a limited number of bytes are read from the stream. - *

- * The given document stream is not closed by this method. + * Detects the media type of the given document. The type detection is based on the content of + * the given document stream. + * + *

If the document stream supports the {@link InputStream#markSupported() mark feature}, then + * the stream is marked and reset to the original position before this method returns. Only a + * limited number of bytes are read from the stream. + * + *

The given document stream is not closed by this method. * * @param stream the document stream * @return detected media type @@ -205,16 +187,15 @@ public String detect(InputStream stream) throws IOException { } /** - * Detects the media type of the given document. The type detection is - * based on the first few bytes of a document and the document name. - *

- * For best results at least a few kilobytes of the document data - * are needed. See also the other detect() methods for better - * alternatives when you have more than just the document prefix - * available for type detection. + * Detects the media type of the given document. The type detection is based on the first few + * bytes of a document and the document name. + * + *

For best results at least a few kilobytes of the document data are needed. See also the + * other detect() methods for better alternatives when you have more than just the document + * prefix available for type detection. * * @param prefix first few bytes of the document - * @param name document name + * @param name document name * @return detected media type * @since Apache Tika 0.9 */ @@ -229,13 +210,12 @@ public String detect(byte[] prefix, String name) { } /** - * Detects the media type of the given document. The type detection is - * based on the first few bytes of a document. - *

- * For best results at least a few kilobytes of the document data - * are needed. See also the other detect() methods for better - * alternatives when you have more than just the document prefix - * available for type detection. + * Detects the media type of the given document. The type detection is based on the first few + * bytes of a document. + * + *

For best results at least a few kilobytes of the document data are needed. See also the + * other detect() methods for better alternatives when you have more than just the document + * prefix available for type detection. * * @param prefix first few bytes of the document * @return detected media type @@ -252,12 +232,11 @@ public String detect(byte[] prefix) { } /** - * Detects the media type of the file at the given path. The type - * detection is based on the document content and a potential known - * file extension. - *

- * Use the {@link #detect(String)} method when you want to detect the - * type of the document without actually accessing the file. + * Detects the media type of the file at the given path. The type detection is based on the + * document content and a potential known file extension. + * + *

Use the {@link #detect(String)} method when you want to detect the type of the document + * without actually accessing the file. * * @param path the path of the file * @return detected media type @@ -271,11 +250,11 @@ public String detect(Path path) throws IOException { } /** - * Detects the media type of the given file. The type detection is - * based on the document content and a potential known file extension. - *

- * Use the {@link #detect(String)} method when you want to detect the - * type of the document without actually accessing the file. + * Detects the media type of the given file. The type detection is based on the document content + * and a potential known file extension. + * + *

Use the {@link #detect(String)} method when you want to detect the type of the document + * without actually accessing the file. * * @param file the file * @return detected media type @@ -284,19 +263,18 @@ public String detect(Path path) throws IOException { */ public String detect(File file) throws IOException { Metadata metadata = new Metadata(); - try (@SuppressWarnings("deprecation") InputStream stream = TikaInputStream - .get(file, metadata)) { + try (@SuppressWarnings("deprecation") + InputStream stream = TikaInputStream.get(file, metadata)) { return detect(stream, metadata); } } /** - * Detects the media type of the resource at the given URL. The type - * detection is based on the document content and a potential known - * file extension included in the URL. - *

- * Use the {@link #detect(String)} method when you want to detect the - * type of the document without actually accessing the URL. + * Detects the media type of the resource at the given URL. The type detection is based on the + * document content and a potential known file extension included in the URL. + * + *

Use the {@link #detect(String)} method when you want to detect the type of the document + * without actually accessing the URL. * * @param url the URL of the resource * @return detected media type @@ -310,11 +288,11 @@ public String detect(URL url) throws IOException { } /** - * Detects the media type of a document with the given file name. - * The type detection is based on known file name extensions. - *

- * The given name can also be a URL or a full file path. In such cases - * only the file name part of the string is used for type detection. + * Detects the media type of a document with the given file name. The type detection is based on + * known file name extensions. + * + *

The given name can also be a URL or a full file path. In such cases only the file name + * part of the string is used for type detection. * * @param name the file name of the document * @return detected media type @@ -330,11 +308,11 @@ public String detect(String name) { /** * Translate the given text String to and from the given languages. * - * @param text The text to translate. + * @param text The text to translate. * @param sourceLanguage The input text language (for example, "hi"). * @param targetLanguage The desired output language (for example, "fr"). - * @return The translated text. If translation is unavailable (client keys not set), returns - * the same text back. + * @return The translated text. If translation is unavailable (client keys not set), returns the + * same text back. * @see org.apache.tika.language.translate.Translator */ public String translate(String text, String sourceLanguage, String targetLanguage) { @@ -346,13 +324,13 @@ public String translate(String text, String sourceLanguage, String targetLanguag } /** - * Translate the given text String to the given language, attempting to auto-detect the - * source language. + * Translate the given text String to the given language, attempting to auto-detect the source + * language. * - * @param text The text to translate. + * @param text The text to translate. * @param targetLanguage The desired output language (for example, "en"). - * @return The translated text. If translation is unavailable (client keys not set), returns - * the same text back. + * @return The translated text. If translation is unavailable (client keys not set), returns the + * same text back. * @see org.apache.tika.language.translate.Translator */ public String translate(String text, String targetLanguage) { @@ -363,18 +341,16 @@ public String translate(String text, String targetLanguage) { } } - /** - * Parses the given document and returns the extracted text content. 
- * Input metadata like a file name or a content type hint can be passed - * in the given metadata instance. Metadata information extracted from - * the document is returned in that same metadata instance. - *

- * The returned reader will be responsible for closing the given stream. - * The stream and any associated resources will be closed at or before - * the time when the {@link Reader#close()} method is called. - * - * @param stream the document to be parsed + * Parses the given document and returns the extracted text content. Input metadata like a file + * name or a content type hint can be passed in the given metadata instance. Metadata + * information extracted from the document is returned in that same metadata instance. + * + *

The returned reader will be responsible for closing the given stream. The stream and any + * associated resources will be closed at or before the time when the {@link Reader#close()} + * method is called. + * + * @param stream the document to be parsed * @param metadata where document's metadata will be populated * @return extracted text content * @throws IOException if the document can not be read or parsed @@ -387,10 +363,10 @@ public Reader parse(InputStream stream, Metadata metadata) throws IOException { /** * Parses the given document and returns the extracted text content. - *

- * The returned reader will be responsible for closing the given stream. - * The stream and any associated resources will be closed at or before - * the time when the {@link Reader#close()} method is called. + * + *

The returned reader will be responsible for closing the given stream. The stream and any + * associated resources will be closed at or before the time when the {@link Reader#close()} + * method is called. * * @param stream the document to be parsed * @return extracted text content @@ -402,11 +378,11 @@ public Reader parse(InputStream stream) throws IOException { /** * Parses the file at the given path and returns the extracted text content. - *

- * Metadata information extracted from the document is returned in - * the supplied metadata instance. * - * @param path the path of the file to be parsed + *

Metadata information extracted from the document is returned in the supplied metadata + * instance. + * + * @param path the path of the file to be parsed * @param metadata where document's metadata will be populated * @return extracted text content * @throws IOException if the file can not be read or parsed @@ -429,18 +405,19 @@ public Reader parse(Path path) throws IOException { /** * Parses the given file and returns the extracted text content. - *

- * Metadata information extracted from the document is returned in - * the supplied metadata instance. * - * @param file the file to be parsed + *

Metadata information extracted from the document is returned in the supplied metadata + * instance. + * + * @param file the file to be parsed * @param metadata where document's metadata will be populated * @return extracted text content * @throws IOException if the file can not be read or parsed * @see #parse(Path) */ public Reader parse(File file, Metadata metadata) throws IOException { - @SuppressWarnings("deprecation") InputStream stream = TikaInputStream.get(file, metadata); + @SuppressWarnings("deprecation") + InputStream stream = TikaInputStream.get(file, metadata); return parse(stream, metadata); } @@ -457,8 +434,7 @@ public Reader parse(File file) throws IOException { } /** - * Parses the resource at the given URL and returns the extracted - * text content. + * Parses the resource at the given URL and returns the extracted text content. * * @param url the URL of the resource to be parsed * @return extracted text content @@ -471,23 +447,21 @@ public Reader parse(URL url) throws IOException { } /** - * Parses the given document and returns the extracted text content. - * The given input stream is closed by this method. - *

- * To avoid unpredictable excess memory use, the returned string contains - * only up to {@link #getMaxStringLength()} first characters extracted - * from the input document. Use the {@link #setMaxStringLength(int)} - * method to adjust this limitation. - *

- * NOTE: Unlike most other Tika methods that take an - * {@link InputStream}, this method will close the given stream for - * you as a convenience. With other methods you are still responsible - * for closing the stream or a wrapper instance returned by Tika. - * - * @param stream the document to be parsed + * Parses the given document and returns the extracted text content. The given input stream is + * closed by this method. + * + *

To avoid unpredictable excess memory use, the returned string contains only up to {@link + * #getMaxStringLength()} first characters extracted from the input document. Use the {@link + * #setMaxStringLength(int)} method to adjust this limitation. + * + *

NOTE: Unlike most other Tika methods that take an {@link InputStream}, + * this method will close the given stream for you as a convenience. With other methods you are + * still responsible for closing the stream or a wrapper instance returned by Tika. + * + * @param stream the document to be parsed * @param metadata document metadata * @return extracted text content - * @throws IOException if the document can not be read + * @throws IOException if the document can not be read * @throws TikaException if the document can not be parsed */ public String parseToString(InputStream stream, Metadata metadata) @@ -496,24 +470,21 @@ public String parseToString(InputStream stream, Metadata metadata) } /** - * Parses the given document and returns the extracted text content. - * The given input stream is closed by this method. This method lets - * you control the maxStringLength per call. - *

- * To avoid unpredictable excess memory use, the returned string contains - * only up to maxLength (parameter) first characters extracted - * from the input document. - *

- * NOTE: Unlike most other Tika methods that take an - * {@link InputStream}, this method will close the given stream for - * you as a convenience. With other methods you are still responsible - * for closing the stream or a wrapper instance returned by Tika. - * - * @param stream the document to be parsed - * @param metadata document metadata + * Parses the given document and returns the extracted text content. The given input stream is + * closed by this method. This method lets you control the maxStringLength per call. + * + *

To avoid unpredictable excess memory use, the returned string contains only up to + * maxLength (parameter) first characters extracted from the input document. + * + *

NOTE: Unlike most other Tika methods that take an {@link InputStream}, + * this method will close the given stream for you as a convenience. With other methods you are + * still responsible for closing the stream or a wrapper instance returned by Tika. + * + * @param stream the document to be parsed + * @param metadata document metadata * @param maxLength maximum length of the returned string * @return extracted text content - * @throws IOException if the document can not be read + * @throws IOException if the document can not be read * @throws TikaException if the document can not be parsed */ public String parseToString(InputStream stream, Metadata metadata, int maxLength) @@ -535,22 +506,20 @@ public String parseToString(InputStream stream, Metadata metadata, int maxLength } /** - * Parses the given document and returns the extracted text content. - * The given input stream is closed by this method. - *

- * To avoid unpredictable excess memory use, the returned string contains - * only up to {@link #getMaxStringLength()} first characters extracted - * from the input document. Use the {@link #setMaxStringLength(int)} - * method to adjust this limitation. - *

- * NOTE: Unlike most other Tika methods that take an - * {@link InputStream}, this method will close the given stream for - * you as a convenience. With other methods you are still responsible - * for closing the stream or a wrapper instance returned by Tika. + * Parses the given document and returns the extracted text content. The given input stream is + * closed by this method. + * + *

To avoid unpredictable excess memory use, the returned string contains only up to {@link + * #getMaxStringLength()} first characters extracted from the input document. Use the {@link + * #setMaxStringLength(int)} method to adjust this limitation. + * + *

NOTE: Unlike most other Tika methods that take an {@link InputStream}, + * this method will close the given stream for you as a convenience. With other methods you are + * still responsible for closing the stream or a wrapper instance returned by Tika. * * @param stream the document to be parsed * @return extracted text content - * @throws IOException if the document can not be read + * @throws IOException if the document can not be read * @throws TikaException if the document can not be parsed */ public String parseToString(InputStream stream) throws IOException, TikaException { @@ -559,15 +528,14 @@ public String parseToString(InputStream stream) throws IOException, TikaExceptio /** * Parses the file at the given path and returns the extracted text content. - *

- * To avoid unpredictable excess memory use, the returned string contains - * only up to {@link #getMaxStringLength()} first characters extracted - * from the input document. Use the {@link #setMaxStringLength(int)} - * method to adjust this limitation. + * + *

To avoid unpredictable excess memory use, the returned string contains only up to {@link + * #getMaxStringLength()} first characters extracted from the input document. Use the {@link + * #setMaxStringLength(int)} method to adjust this limitation. * * @param path the path of the file to be parsed * @return extracted text content - * @throws IOException if the file can not be read + * @throws IOException if the file can not be read * @throws TikaException if the file can not be parsed */ public String parseToString(Path path) throws IOException, TikaException { @@ -578,36 +546,34 @@ public String parseToString(Path path) throws IOException, TikaException { /** * Parses the given file and returns the extracted text content. - *

- * To avoid unpredictable excess memory use, the returned string contains - * only up to {@link #getMaxStringLength()} first characters extracted - * from the input document. Use the {@link #setMaxStringLength(int)} - * method to adjust this limitation. + * + *

To avoid unpredictable excess memory use, the returned string contains only up to {@link + * #getMaxStringLength()} first characters extracted from the input document. Use the {@link + * #setMaxStringLength(int)} method to adjust this limitation. * * @param file the file to be parsed * @return extracted text content - * @throws IOException if the file can not be read + * @throws IOException if the file can not be read * @throws TikaException if the file can not be parsed * @see #parseToString(Path) */ public String parseToString(File file) throws IOException, TikaException { Metadata metadata = new Metadata(); - @SuppressWarnings("deprecation") InputStream stream = TikaInputStream.get(file, metadata); + @SuppressWarnings("deprecation") + InputStream stream = TikaInputStream.get(file, metadata); return parseToString(stream, metadata); } /** - * Parses the resource at the given URL and returns the extracted - * text content. - *

- * To avoid unpredictable excess memory use, the returned string contains - * only up to {@link #getMaxStringLength()} first characters extracted - * from the input document. Use the {@link #setMaxStringLength(int)} - * method to adjust this limitation. + * Parses the resource at the given URL and returns the extracted text content. + * + *

To avoid unpredictable excess memory use, the returned string contains only up to {@link + * #getMaxStringLength()} first characters extracted from the input document. Use the {@link + * #setMaxStringLength(int)} method to adjust this limitation. * * @param url the URL of the resource to be parsed * @return extracted text content - * @throws IOException if the resource can not be read + * @throws IOException if the resource can not be read * @throws TikaException if the resource can not be parsed */ public String parseToString(URL url) throws IOException, TikaException { @@ -617,8 +583,7 @@ public String parseToString(URL url) throws IOException, TikaException { } /** - * Returns the maximum length of strings returned by the - * parseToString methods. + * Returns the maximum length of strings returned by the parseToString methods. * * @return maximum string length, or -1 if the limit has been disabled * @since Apache Tika 0.7 @@ -628,11 +593,9 @@ public int getMaxStringLength() { } /** - * Sets the maximum length of strings returned by the parseToString - * methods. + * Sets the maximum length of strings returned by the parseToString methods. 
* - * @param maxStringLength maximum string length, - * or -1 to disable this limit + * @param maxStringLength maximum string length, or -1 to disable this limit * @since Apache Tika 0.7 */ public void setMaxStringLength(int maxStringLength) { @@ -669,7 +632,7 @@ public Translator getTranslator() { return translator; } - //--------------------------------------------------------------< Object > + // --------------------------------------------------------------< Object > public String toString() { return getString(); @@ -678,8 +641,9 @@ public String toString() { public static String getString() { String version = null; - try (InputStream stream = Tika.class - .getResourceAsStream("/META-INF/maven/org.apache.tika/tika-core/pom.properties")) { + try (InputStream stream = + Tika.class.getResourceAsStream( + "/META-INF/maven/org.apache.tika/tika-core/pom.properties")) { if (stream != null) { Properties properties = new Properties(); properties.load(stream); @@ -694,5 +658,4 @@ public static String getString() { return "Apache Tika"; } } - } diff --git a/tika-core/src/main/java/org/apache/tika/concurrent/ConfigurableThreadPoolExecutor.java b/tika-core/src/main/java/org/apache/tika/concurrent/ConfigurableThreadPoolExecutor.java index 1f7c4a0567..f3a5ffadf0 100644 --- a/tika-core/src/main/java/org/apache/tika/concurrent/ConfigurableThreadPoolExecutor.java +++ b/tika-core/src/main/java/org/apache/tika/concurrent/ConfigurableThreadPoolExecutor.java @@ -24,9 +24,8 @@ * @since Apache Tika 1.11 */ public interface ConfigurableThreadPoolExecutor extends ExecutorService { - + public void setMaximumPoolSize(int threads); - - public void setCorePoolSize(int threads); + public void setCorePoolSize(int threads); } diff --git a/tika-core/src/main/java/org/apache/tika/concurrent/SimpleThreadPoolExecutor.java b/tika-core/src/main/java/org/apache/tika/concurrent/SimpleThreadPoolExecutor.java index a4385e279e..6a7ee32cf4 100644 --- 
a/tika-core/src/main/java/org/apache/tika/concurrent/SimpleThreadPoolExecutor.java +++ b/tika-core/src/main/java/org/apache/tika/concurrent/SimpleThreadPoolExecutor.java @@ -25,11 +25,16 @@ * * @since Apache Tika 1.11 */ -public class SimpleThreadPoolExecutor extends ThreadPoolExecutor implements - ConfigurableThreadPoolExecutor { +public class SimpleThreadPoolExecutor extends ThreadPoolExecutor + implements ConfigurableThreadPoolExecutor { public SimpleThreadPoolExecutor() { - super(1, 2, 0L, TimeUnit.SECONDS, new LinkedBlockingQueue<>(), + super( + 1, + 2, + 0L, + TimeUnit.SECONDS, + new LinkedBlockingQueue<>(), r -> new Thread(r, "Tika Executor Thread")); } } diff --git a/tika-core/src/main/java/org/apache/tika/config/ConfigBase.java b/tika-core/src/main/java/org/apache/tika/config/ConfigBase.java index 405294faed..448f8820e9 100644 --- a/tika-core/src/main/java/org/apache/tika/config/ConfigBase.java +++ b/tika-core/src/main/java/org/apache/tika/config/ConfigBase.java @@ -29,23 +29,21 @@ import java.util.Locale; import java.util.Map; import java.util.Set; - +import org.apache.tika.exception.TikaConfigException; +import org.apache.tika.exception.TikaException; +import org.apache.tika.utils.XMLReaderUtils; import org.w3c.dom.Element; import org.w3c.dom.NamedNodeMap; import org.w3c.dom.Node; import org.w3c.dom.NodeList; import org.xml.sax.SAXException; -import org.apache.tika.exception.TikaConfigException; -import org.apache.tika.exception.TikaException; -import org.apache.tika.utils.XMLReaderUtils; - - public abstract class ConfigBase { private static Class[] SUPPORTED_PRIMITIVES = - new Class[]{String.class, boolean.class, long.class, int.class, double.class, - float.class}; + new Class[] { + String.class, boolean.class, long.class, int.class, double.class, float.class + }; /** * Use this to build a single class, where the user specifies the instance class, e.g. 
@@ -81,8 +79,9 @@ protected static T buildSingle(String itemName, Class itemClass, InputStr * @throws TikaConfigException * @throws IOException */ - protected static T buildSingle(String itemName, Class itemClass, Element properties, - T defaultValue) throws TikaConfigException, IOException { + protected static T buildSingle( + String itemName, Class itemClass, Element properties, T defaultValue) + throws TikaConfigException, IOException { NodeList children = properties.getChildNodes(); T toConfigure = null; @@ -110,10 +109,9 @@ protected static T buildSingle(String itemName, Class itemClass, Element return toConfigure; } - /** - * Use this to build a list of components for a composite item (e.g. - * CompositeMetadataFilter, FetcherManager), each with their own configurations + * Use this to build a list of components for a composite item (e.g. CompositeMetadataFilter, + * FetcherManager), each with their own configurations * * @param compositeElementName * @param itemName @@ -121,8 +119,12 @@ protected static T buildSingle(String itemName, Class itemClass, Element * @throws TikaConfigException * @throws IOException */ - protected static P buildComposite(String compositeElementName, Class

compositeClass, - String itemName, Class itemClass, InputStream is) + protected static P buildComposite( + String compositeElementName, + Class

compositeClass, + String itemName, + Class itemClass, + InputStream is) throws TikaConfigException, IOException { Element properties = null; try { @@ -132,13 +134,16 @@ protected static P buildComposite(String compositeElementName, Class

c } catch (TikaException e) { throw new TikaConfigException("problem loading xml to dom", e); } - return buildComposite(compositeElementName, compositeClass, itemName, itemClass, - properties); + return buildComposite( + compositeElementName, compositeClass, itemName, itemClass, properties); } - protected static P buildComposite(String compositeElementName, Class

compositeClass, - String itemName, Class itemClass, - Element properties) + protected static P buildComposite( + String compositeElementName, + Class

compositeClass, + String itemName, + Class itemClass, + Element properties) throws TikaConfigException, IOException { if (!properties.getLocalName().equals("properties")) { @@ -159,8 +164,10 @@ protected static P buildComposite(String compositeElementName, Class

c P composite = (P) constructor.newInstance(components); setParams(composite, child, new HashSet<>(), itemName); return composite; - } catch (NoSuchMethodException | InvocationTargetException | - InstantiationException | IllegalAccessException e) { + } catch (NoSuchMethodException + | InvocationTargetException + | InstantiationException + | IllegalAccessException e) { throw new TikaConfigException("can't build composite class", e); } } @@ -168,8 +175,8 @@ protected static P buildComposite(String compositeElementName, Class

c throw new TikaConfigException("could not find " + compositeElementName); } - private static List loadComposite(Node composite, String itemName, - Class itemClass) + private static List loadComposite( + Node composite, String itemName, Class itemClass) throws TikaConfigException { NodeList children = composite.getChildNodes(); List items = new ArrayList<>(); @@ -199,14 +206,21 @@ private static T buildClass(Node node, String elementName, Class itemClass) Class clazz = Class.forName(className); if (!itemClass.isAssignableFrom(clazz)) { throw new TikaConfigException( - elementName + " with class name " + className + " must be of type '" + - itemClass.getName() + "'"); + elementName + + " with class name " + + className + + " must be of type '" + + itemClass.getName() + + "'"); } return (T) clazz.getDeclaredConstructor().newInstance(); - } catch (InstantiationException | IllegalAccessException | ClassNotFoundException | - NoSuchMethodException | InvocationTargetException e) { - throw new TikaConfigException("problem loading " + elementName + - " with class " + itemClass.getName(), e); + } catch (InstantiationException + | IllegalAccessException + | ClassNotFoundException + | NoSuchMethodException + | InvocationTargetException e) { + throw new TikaConfigException( + "problem loading " + elementName + " with class " + itemClass.getName(), e); } } @@ -215,8 +229,9 @@ private static void setParams(Object object, Node targetNode, Set settin setParams(object, targetNode, settings, null); } - private static void setParams(Object object, Node targetNode, Set settings, - String exceptNodeName) throws TikaConfigException { + private static void setParams( + Object object, Node targetNode, Set settings, String exceptNodeName) + throws TikaConfigException { NodeList children = targetNode.getChildNodes(); List params = new ArrayList<>(); for (int i = 0; i < children.getLength(); i++) { @@ -257,7 +272,7 @@ private static void setParams(Object object, Node targetNode, Set settin if 
(isPrimitive(setterClassPair.itemClass)) { tryToSetPrimitive(object, setterClassPair, param.getTextContent()); } else { - //tryToSetPrimitive(object, localName, txt); + // tryToSetPrimitive(object, localName, txt); Object item = buildClass(param, itemName, setterClassPair.itemClass); setParams(setterClassPair.itemClass.cast(item), param, new HashSet<>()); try { @@ -298,8 +313,8 @@ private static boolean hasClass(Node param) { private static SetterClassPair findSetterClassPair(Object object, String itemName) throws TikaConfigException { - //TODO -- we could do more with info from the node -- is it complex, does it have - //a text value, does it have a class, etc... This works for now. + // TODO -- we could do more with info from the node -- is it complex, does it have + // a text value, does it have a class, etc... This works for now. String setter = "set" + itemName.substring(0, 1).toUpperCase(Locale.US) + itemName.substring(1); Class itemClass = null; @@ -308,7 +323,7 @@ private static SetterClassPair findSetterClassPair(Object object, String itemNam if (setter.equals(method.getName())) { Class[] classes = method.getParameterTypes(); if (classes.length == 1) { - //if both setX(String) and setX(Object), prefer setX(String) + // if both setX(String) and setX(Object), prefer setX(String) if (itemClass == null || classes[0].equals(String.class)) { itemClass = classes[0]; setterMethod = method; @@ -319,14 +334,14 @@ private static SetterClassPair findSetterClassPair(Object object, String itemNam if (setterMethod != null && itemClass != null) { return new SetterClassPair(setterMethod, itemClass); } - //now try adders + // now try adders String adder = "add" + itemName.substring(0, 1).toUpperCase(Locale.US) + itemName.substring(1); for (Method method : object.getClass().getMethods()) { if (adder.equals(method.getName())) { Class[] classes = method.getParameterTypes(); if (classes.length == 1) { - //if both setX(String) and setX(Object), prefer setX(String) + // if both 
setX(String) and setX(Object), prefer setX(String) if (itemClass == null || classes[0].equals(String.class)) { itemClass = classes[0]; setterMethod = method; @@ -336,8 +351,14 @@ private static SetterClassPair findSetterClassPair(Object object, String itemNam } if (setterMethod == null && itemClass == null) { throw new TikaConfigException( - "Couldn't find setter '" + setter + "' or adder '" + adder + "' for " + itemName + - " of class: " + object.getClass()); + "Couldn't find setter '" + + setter + + "' or adder '" + + adder + + "' for " + + itemName + + " of class: " + + object.getClass()); } return new SetterClassPair(setterMethod, itemClass); } @@ -385,8 +406,10 @@ private static void tryToSetClassList(Object object, Node node) throws TikaConfi Method m = object.getClass().getMethod(setter, List.class); m.invoke(object, items); - } catch (ClassNotFoundException | InvocationTargetException | NoSuchMethodException | - IllegalAccessException e) { + } catch (ClassNotFoundException + | InvocationTargetException + | NoSuchMethodException + | IllegalAccessException e) { throw new TikaConfigException("couldn't build class for " + name, e); } } @@ -415,8 +438,8 @@ private static void tryToSetStringList(Object object, Node param) throws TikaCon private static void tryToSetMap(Object object, Node param) throws TikaConfigException { String name = param.getLocalName(); - //only supports string, string at this point - //use LinkedHashMap to keep insertion order! + // only supports string, string at this point + // use LinkedHashMap to keep insertion order! 
Map map = new LinkedHashMap<>(); NodeList nodeList = param.getChildNodes(); for (int i = 0; i < nodeList.getLength(); i++) { @@ -450,7 +473,6 @@ private static void tryToSetMap(Object object, Node param) throws TikaConfigExce } map.put(key, value); } - } String setter = "set" + name.substring(0, 1).toUpperCase(Locale.US) + name.substring(1); try { @@ -467,11 +489,11 @@ private static boolean isMap(Node param) { Node n = nodeList.item(i); if (n.getNodeType() == 1) { if (n.hasAttributes()) { - if (n.getAttributes().getNamedItem("from") != null && - n.getAttributes().getNamedItem("to") != null) { + if (n.getAttributes().getNamedItem("from") != null + && n.getAttributes().getNamedItem("to") != null) { return true; - } else if (n.getAttributes().getNamedItem("k") != null && - n.getAttributes().getNamedItem("v") != null) { + } else if (n.getAttributes().getNamedItem("k") != null + && n.getAttributes().getNamedItem("v") != null) { return true; } } @@ -480,8 +502,9 @@ private static boolean isMap(Node param) { return false; } - private static void tryToSetPrimitive(Object object, SetterClassPair setterClassPair, - String value) throws TikaConfigException { + private static void tryToSetPrimitive( + Object object, SetterClassPair setterClassPair, String value) + throws TikaConfigException { try { if (setterClassPair.itemClass == int.class) { setterClassPair.setterMethod.invoke(object, Integer.parseInt(value)); @@ -501,15 +524,13 @@ private static void tryToSetPrimitive(Object object, SetterClassPair setterClass } } - /** - * This should be overridden to do something with the settings - * after loading the object. + * This should be overridden to do something with the settings after loading the object. 
* * @param settings */ protected void handleSettings(Set settings) { - //no-op + // no-op } /** @@ -559,8 +580,12 @@ public SetterClassPair(Method setterMethod, Class itemClass) { @Override public String toString() { - return "SetterClassPair{" + "setterMethod=" + setterMethod + ", itemClass=" + - itemClass + '}'; + return "SetterClassPair{" + + "setterMethod=" + + setterMethod + + ", itemClass=" + + itemClass + + '}'; } } } diff --git a/tika-core/src/main/java/org/apache/tika/config/Field.java b/tika-core/src/main/java/org/apache/tika/config/Field.java index 403ad6dcd3..bd52e5bbda 100644 --- a/tika-core/src/main/java/org/apache/tika/config/Field.java +++ b/tika-core/src/main/java/org/apache/tika/config/Field.java @@ -23,9 +23,8 @@ import java.lang.annotation.Target; /** - * Field annotation is a contract for binding {@link Param} value from - * Tika Configuration to an object. - * services + * Field annotation is a contract for binding {@link Param} value from Tika Configuration to an + * object. services * * @since Apache Tika 1.14 */ diff --git a/tika-core/src/main/java/org/apache/tika/config/Initializable.java b/tika-core/src/main/java/org/apache/tika/config/Initializable.java index f37bdd9095..df7a91d17c 100644 --- a/tika-core/src/main/java/org/apache/tika/config/Initializable.java +++ b/tika-core/src/main/java/org/apache/tika/config/Initializable.java @@ -17,18 +17,16 @@ package org.apache.tika.config; import java.util.Map; - import org.apache.tika.exception.TikaConfigException; /** - * Components that must do special processing across multiple fields - * at initialization time should implement this interface. - *

- * TikaConfig will call initialize on Initializable classes after - * setting the parameters for non-statically service loaded classes. - *

- * TikaConfig will call checkInitialization on all Initializables, - * whether loaded statically + * Components that must do special processing across multiple fields at initialization time should + * implement this interface. + * + *

TikaConfig will call initialize on Initializable classes after setting the parameters for + * non-statically service loaded classes. + * + *

TikaConfig will call checkInitialization on all Initializables, whether loaded statically */ public interface Initializable { @@ -38,15 +36,10 @@ public interface Initializable { */ void initialize(Map params) throws TikaConfigException; - /** - * @param problemHandler if there is a problem and no - * custom initializableProblemHandler has been configured - * via Initializable parameters, - * this is called to respond. + * @param problemHandler if there is a problem and no custom initializableProblemHandler has + * been configured via Initializable parameters, this is called to respond. * @throws TikaConfigException */ void checkInitialization(InitializableProblemHandler problemHandler) throws TikaConfigException; - - } diff --git a/tika-core/src/main/java/org/apache/tika/config/InitializableProblemHandler.java b/tika-core/src/main/java/org/apache/tika/config/InitializableProblemHandler.java index fdca6901c9..8f933f300d 100644 --- a/tika-core/src/main/java/org/apache/tika/config/InitializableProblemHandler.java +++ b/tika-core/src/main/java/org/apache/tika/config/InitializableProblemHandler.java @@ -16,72 +16,71 @@ */ package org.apache.tika.config; - -import org.slf4j.LoggerFactory; - import org.apache.tika.exception.TikaConfigException; - +import org.slf4j.LoggerFactory; /** - * This is to be used to handle potential recoverable problems that - * might arise during initialization. + * This is to be used to handle potential recoverable problems that might arise during + * initialization. */ public interface InitializableProblemHandler { + /** Strategy that simply ignores all problems. */ + InitializableProblemHandler IGNORE = + new InitializableProblemHandler() { + public void handleInitializableProblem(String className, String message) {} - /** - * Strategy that simply ignores all problems. 
- */ - InitializableProblemHandler IGNORE = new InitializableProblemHandler() { - public void handleInitializableProblem(String className, String message) { - } + @Override + public String toString() { + return "IGNORE"; + } + }; - @Override - public String toString() { - return "IGNORE"; - } - }; /** - * Strategy that logs warnings of all problems using a {@link org.slf4j.Logger} - * created using the given class name. + * Strategy that logs warnings of all problems using a {@link org.slf4j.Logger} created using + * the given class name. */ - InitializableProblemHandler INFO = new InitializableProblemHandler() { - public void handleInitializableProblem(String classname, String message) { - LoggerFactory.getLogger(classname).info(message); - } + InitializableProblemHandler INFO = + new InitializableProblemHandler() { + public void handleInitializableProblem(String classname, String message) { + LoggerFactory.getLogger(classname).info(message); + } + + @Override + public String toString() { + return "INFO"; + } + }; - @Override - public String toString() { - return "INFO"; - } - }; /** - * Strategy that logs warnings of all problems using a {@link org.slf4j.Logger} - * created using the given class name. + * Strategy that logs warnings of all problems using a {@link org.slf4j.Logger} created using + * the given class name. 
*/ - InitializableProblemHandler WARN = new InitializableProblemHandler() { - public void handleInitializableProblem(String classname, String message) { - LoggerFactory.getLogger(classname).warn(message); - } + InitializableProblemHandler WARN = + new InitializableProblemHandler() { + public void handleInitializableProblem(String classname, String message) { + LoggerFactory.getLogger(classname).warn(message); + } - @Override - public String toString() { - return "WARN"; - } - }; - InitializableProblemHandler THROW = new InitializableProblemHandler() { - public void handleInitializableProblem(String classname, String message) - throws TikaConfigException { - throw new TikaConfigException(message); - } + @Override + public String toString() { + return "WARN"; + } + }; - @Override - public String toString() { - return "THROW"; - } - }; + InitializableProblemHandler THROW = + new InitializableProblemHandler() { + public void handleInitializableProblem(String classname, String message) + throws TikaConfigException { + throw new TikaConfigException(message); + } + + @Override + public String toString() { + return "THROW"; + } + }; InitializableProblemHandler DEFAULT = WARN; void handleInitializableProblem(String className, String message) throws TikaConfigException; - } diff --git a/tika-core/src/main/java/org/apache/tika/config/LoadErrorHandler.java b/tika-core/src/main/java/org/apache/tika/config/LoadErrorHandler.java index 666c20d23e..c134d0689e 100644 --- a/tika-core/src/main/java/org/apache/tika/config/LoadErrorHandler.java +++ b/tika-core/src/main/java/org/apache/tika/config/LoadErrorHandler.java @@ -16,67 +16,65 @@ */ package org.apache.tika.config; - import org.slf4j.LoggerFactory; - /** - * Interface for error handling strategies in service class loading. - * You can implement this interface for a custom error handling mechanism, - * or use one of the predefined strategies. + * Interface for error handling strategies in service class loading. 
You can implement this + * interface for a custom error handling mechanism, or use one of the predefined strategies. * * @since Apache Tika 0.9 */ public interface LoadErrorHandler { - /** - * Strategy that simply ignores all problems. - */ - LoadErrorHandler IGNORE = new LoadErrorHandler() { - public void handleLoadError(String classname, Throwable throwable) { - } + /** Strategy that simply ignores all problems. */ + LoadErrorHandler IGNORE = + new LoadErrorHandler() { + public void handleLoadError(String classname, Throwable throwable) {} + + @Override + public String toString() { + return "IGNORE"; + } + }; - @Override - public String toString() { - return "IGNORE"; - } - }; /** - * Strategy that logs warnings of all problems using a {@link org.slf4j.Logger} - * created using the given class name. + * Strategy that logs warnings of all problems using a {@link org.slf4j.Logger} created using + * the given class name. */ - LoadErrorHandler WARN = new LoadErrorHandler() { - public void handleLoadError(String classname, Throwable throwable) { - LoggerFactory.getLogger(classname).warn("Unable to load {}", classname, throwable); - } + LoadErrorHandler WARN = + new LoadErrorHandler() { + public void handleLoadError(String classname, Throwable throwable) { + LoggerFactory.getLogger(classname) + .warn("Unable to load {}", classname, throwable); + } + + @Override + public String toString() { + return "WARN"; + } + }; - @Override - public String toString() { - return "WARN"; - } - }; /** - * Strategy that throws a {@link RuntimeException} with the given - * throwable as the root cause, thus interrupting the entire service - * loading operation. + * Strategy that throws a {@link RuntimeException} with the given throwable as the root cause, + * thus interrupting the entire service loading operation. 
*/ - LoadErrorHandler THROW = new LoadErrorHandler() { - public void handleLoadError(String classname, Throwable throwable) { - throw new RuntimeException("Unable to load " + classname, throwable); - } + LoadErrorHandler THROW = + new LoadErrorHandler() { + public void handleLoadError(String classname, Throwable throwable) { + throw new RuntimeException("Unable to load " + classname, throwable); + } - @Override - public String toString() { - return "THROW"; - } - }; + @Override + public String toString() { + return "THROW"; + } + }; /** - * Handles a problem encountered when trying to load the specified - * service class. The implementation can log or otherwise process - * the given error information. If the method returns normally, then - * the service loader simply skips this class and continues with the - * next one. + * Handles a problem encountered when trying to load the specified service class. The + * implementation can log or otherwise process the given error information. If the method + * returns normally, then the service loader simply skips this class and continues with the next + * one. 
* * @param classname name of the service class * @param throwable the encountered problem diff --git a/tika-core/src/main/java/org/apache/tika/config/Param.java b/tika-core/src/main/java/org/apache/tika/config/Param.java index 25d367fe16..0ebbc4ad43 100644 --- a/tika-core/src/main/java/org/apache/tika/config/Param.java +++ b/tika-core/src/main/java/org/apache/tika/config/Param.java @@ -38,24 +38,20 @@ import javax.xml.transform.TransformerException; import javax.xml.transform.dom.DOMSource; import javax.xml.transform.stream.StreamResult; - +import org.apache.tika.exception.TikaConfigException; +import org.apache.tika.exception.TikaException; +import org.apache.tika.parser.multiple.AbstractMultipleParser; +import org.apache.tika.utils.XMLReaderUtils; import org.w3c.dom.Document; import org.w3c.dom.Element; import org.w3c.dom.Node; import org.w3c.dom.NodeList; import org.xml.sax.SAXException; -import org.apache.tika.exception.TikaConfigException; -import org.apache.tika.exception.TikaException; -import org.apache.tika.parser.multiple.AbstractMultipleParser; -import org.apache.tika.utils.XMLReaderUtils; - - /** * This is a serializable model class for parameters from configuration file. * - * @param value type. Should be serializable to string and have a constructor - * with string param + * @param value type. 
Should be serializable to string and have a constructor with string param * @since Apache Tika 1.14 */ public class Param implements Serializable { @@ -88,7 +84,7 @@ public class Param implements Serializable { wellKnownMap.put("metadataPolicy", AbstractMultipleParser.MetadataPolicy.class); } - //one of these two is used for serialization + // one of these two is used for serialization private final List valueStrings = new ArrayList<>(); private final Map valueMap = new LinkedHashMap<>(); @@ -96,8 +92,7 @@ public class Param implements Serializable { private String name; private T actualValue; - public Param() { - } + public Param() {} public Param(String name, Class type, T value) { this.name = name; @@ -106,7 +101,7 @@ public Param(String name, Class type, T value) { if (List.class.isAssignableFrom(value.getClass())) { this.valueStrings.addAll((List) value); } else if (Map.class.isAssignableFrom(value.getClass())) { - valueMap.putAll((Map)value); + valueMap.putAll((Map) value); } else { this.valueStrings.add(value.toString()); } @@ -156,8 +151,8 @@ public static Param load(Node node) throws TikaConfigException { String type = typeAttr.getTextContent(); if ("class".equals(type)) { if (classAttr == null) { - throw new TikaConfigException("must specify a class attribute if " + - "type=\"class\""); + throw new TikaConfigException( + "must specify a class attribute if " + "type=\"class\""); } ret.setType(clazz); } else { @@ -180,7 +175,7 @@ public static Param load(Node node) throws TikaConfigException { } else if (Map.class.isAssignableFrom(ret.type)) { loadMap(ret, node); } else { - //allow the empty string + // allow the empty string String textContent = ""; if (value != null) { textContent = value.getTextContent(); @@ -190,12 +185,16 @@ public static Param load(Node node) throws TikaConfigException { } return ret; } - private static void loadObject(Param ret, Node root, Class clazz) throws TikaConfigException { + + private static void loadObject(Param ret, Node 
root, Class clazz) + throws TikaConfigException { try { - ret.actualValue = (T)clazz.getDeclaredConstructor().newInstance(); - } catch (InstantiationException | IllegalAccessException | NoSuchMethodException | - InvocationTargetException e) { + ret.actualValue = (T) clazz.getDeclaredConstructor().newInstance(); + } catch (InstantiationException + | IllegalAccessException + | NoSuchMethodException + | InvocationTargetException e) { throw new TikaConfigException("can't build class: " + clazz, e); } @@ -209,19 +208,23 @@ private static void loadObject(Param ret, Node root, Class clazz) throws Param param = load(params.item(j)); Method method = null; - String methodName = "set" + - param.getName().substring(0,1).toUpperCase(Locale.US) + - param.getName().substring(1); + String methodName = + "set" + + param.getName().substring(0, 1).toUpperCase(Locale.US) + + param.getName().substring(1); try { - method = ret.actualValue.getClass().getMethod(methodName, - param.getType()); + method = + ret.actualValue + .getClass() + .getMethod(methodName, param.getType()); } catch (NoSuchMethodException e) { throw new TikaConfigException("can't find method: " + methodName, e); } try { method.invoke(ret.actualValue, param.getValue()); } catch (IllegalAccessException | InvocationTargetException e) { - throw new TikaConfigException("can't set param value: " + param.getName(), e); + throw new TikaConfigException( + "can't set param value: " + param.getName(), e); } } } @@ -247,10 +250,10 @@ private static void loadMap(Param ret, Node root) throws TikaConfigExcept key = child.getLocalName(); value = child.getTextContent(); } - if (((Map)ret.actualValue).containsKey(key)) { + if (((Map) ret.actualValue).containsKey(key)) { throw new TikaConfigException("Duplicate keys are not allowed: " + key); } - ((Map)ret.actualValue).put(key, value); + ((Map) ret.actualValue).put(key, value); ret.valueMap.put(key, value); } child = child.getNextSibling(); @@ -293,8 +296,8 @@ private static T 
getTypedValue(Class type, String value) { constructor.setAccessible(true); return constructor.newInstance(value); } catch (NoSuchMethodException e) { - throw new RuntimeException(type + " doesnt have a constructor that takes String arg", - e); + throw new RuntimeException( + type + " doesnt have a constructor that takes String arg", e); } catch (IllegalAccessException | InstantiationException | InvocationTargetException e) { throw new RuntimeException(e); } @@ -344,13 +347,20 @@ public T getValue() { @Override public String toString() { - return "Param{" + "name='" + name + '\'' + ", valueStrings='" + valueStrings + '\'' + - ", actualValue=" + actualValue + '}'; + return "Param{" + + "name='" + + name + + '\'' + + ", valueStrings='" + + valueStrings + + '\'' + + ", actualValue=" + + actualValue + + '}'; } public void save(OutputStream stream) throws TransformerException, TikaException { - DocumentBuilder builder = XMLReaderUtils.getDocumentBuilder(); Document doc = builder.newDocument(); Element paramEl = doc.createElement("param"); @@ -381,9 +391,9 @@ public void save(Document doc, Node node) { el.appendChild(item); } } else if (Map.class.isAssignableFrom(actualValue.getClass())) { - for (Object key : ((Map)actualValue).keySet()) { + for (Object key : ((Map) actualValue).keySet()) { String keyString = (String) key; - String valueString = (String)((Map)actualValue).get(keyString); + String valueString = (String) ((Map) actualValue).get(keyString); Node item = doc.createElement(keyString); item.setTextContent(valueString); el.appendChild(item); @@ -392,5 +402,4 @@ public void save(Document doc, Node node) { el.setTextContent(valueStrings.get(0)); } } - } diff --git a/tika-core/src/main/java/org/apache/tika/config/ParamField.java b/tika-core/src/main/java/org/apache/tika/config/ParamField.java index 15e977ae6d..8a9707b479 100644 --- a/tika-core/src/main/java/org/apache/tika/config/ParamField.java +++ b/tika-core/src/main/java/org/apache/tika/config/ParamField.java @@ 
-22,12 +22,11 @@ import java.util.HashMap; import java.util.Locale; import java.util.Map; - import org.apache.tika.exception.TikaConfigException; /** - * This class stores metdata for {@link Field} annotation are used to map them - * to {@link Param} at runtime + * This class stores metdata for {@link Field} annotation are used to map them to {@link Param} at + * runtime * * @since Apache Tika 1.14 */ @@ -35,18 +34,20 @@ public class ParamField { public static final String DEFAULT = "#default"; - //NOTE: since (primitive type) is NOT AssignableFrom (BoxedType), + // NOTE: since (primitive type) is NOT AssignableFrom (BoxedType), // we just use boxed type for everything! // Example : short.class.isAssignableFrom(Short.class) ? false private static final Map, Class> PRIMITIVE_MAP = - new HashMap, Class>() {{ + new HashMap, Class>() { + { put(int.class, Integer.class); put(short.class, Short.class); put(boolean.class, Boolean.class); put(long.class, Long.class); put(float.class, Float.class); put(double.class, Double.class); - }}; + } + }; private final String name; private final Class type; private final boolean required; @@ -94,9 +95,9 @@ public boolean isRequired() { /** * Sets given value to the annotated field of bean * - * @param bean bean with annotation for field + * @param bean bean with annotation for field * @param value value of field - * @throws IllegalAccessException when it occurs + * @throws IllegalAccessException when it occurs * @throws InvocationTargetException when it occurs */ public void assignValue(Object bean, Object value) @@ -117,15 +118,17 @@ private Class retrieveType() throws TikaConfigException { if (params.length != 1) { String msg = "Invalid setter method. Must have one and only one parameter. 
"; if (setter.getName().startsWith("get")) { - msg += "Perhaps the annotation is misplaced on " + setter.getName() + - " while a set'X' is expected?"; + msg += + "Perhaps the annotation is misplaced on " + + setter.getName() + + " while a set'X' is expected?"; } throw new TikaConfigException(msg); } type = params[0]; } if (type.isPrimitive() && PRIMITIVE_MAP.containsKey(type)) { - type = PRIMITIVE_MAP.get(type); //primitive types have hard time + type = PRIMITIVE_MAP.get(type); // primitive types have hard time } return type; } @@ -138,8 +141,9 @@ private String retrieveParamName(Field annotation) { } else { String setterName = setter.getName(); if (setterName.startsWith("set") && setterName.length() > 3) { - name = setterName.substring(3, 4).toLowerCase(Locale.ROOT) + - setterName.substring(4); + name = + setterName.substring(3, 4).toLowerCase(Locale.ROOT) + + setterName.substring(4); } else { name = setter.getName(); } @@ -152,7 +156,14 @@ private String retrieveParamName(Field annotation) { @Override public String toString() { - return "ParamField{" + "name='" + name + '\'' + ", type=" + type + ", required=" + - required + '}'; + return "ParamField{" + + "name='" + + name + + '\'' + + ", type=" + + type + + ", required=" + + required + + '}'; } } diff --git a/tika-core/src/main/java/org/apache/tika/config/ServiceLoader.java b/tika-core/src/main/java/org/apache/tika/config/ServiceLoader.java index acc53ca885..32e624ba2b 100644 --- a/tika-core/src/main/java/org/apache/tika/config/ServiceLoader.java +++ b/tika-core/src/main/java/org/apache/tika/config/ServiceLoader.java @@ -33,7 +33,6 @@ import java.util.Map; import java.util.Set; import java.util.regex.Pattern; - import org.apache.tika.exception.TikaConfigException; import org.apache.tika.utils.ServiceLoaderUtils; @@ -45,32 +44,37 @@ public class ServiceLoader { /** - * The dynamic set of services available in an OSGi environment. 
- * Managed by the {@link TikaActivator} class and used as an additional - * source of service instances in the {@link #loadServiceProviders(Class)} - * method. + * The dynamic set of services available in an OSGi environment. Managed by the {@link + * TikaActivator} class and used as an additional source of service instances in the {@link + * #loadServiceProviders(Class)} method. */ private static final Map SERVICES = new HashMap<>(); + private static final Pattern COMMENT = Pattern.compile("#.*"); private static final Pattern WHITESPACE = Pattern.compile("\\s+"); + /** - * The default context class loader to use for all threads, or - * null to automatically select the context class loader. + * The default context class loader to use for all threads, or null to + * automatically select the context class loader. */ private static volatile ClassLoader CONTEXT_CLASS_LOADER = null; + private final ClassLoader loader; private final LoadErrorHandler handler; private final InitializableProblemHandler initializableProblemHandler; private final boolean dynamic; - public ServiceLoader(ClassLoader loader, LoadErrorHandler handler, - InitializableProblemHandler initializableProblemHandler, boolean dynamic) { + public ServiceLoader( + ClassLoader loader, + LoadErrorHandler handler, + InitializableProblemHandler initializableProblemHandler, + boolean dynamic) { this.loader = loader; this.handler = handler; this.initializableProblemHandler = initializableProblemHandler; this.dynamic = dynamic; - } + public ServiceLoader(ClassLoader loader, LoadErrorHandler handler, boolean dynamic) { this(loader, handler, InitializableProblemHandler.WARN, dynamic); } @@ -80,24 +84,27 @@ public ServiceLoader(ClassLoader loader, LoadErrorHandler handler) { } public ServiceLoader(ClassLoader loader) { - this(loader, - Boolean.getBoolean("org.apache.tika.service.error.warn") ? 
LoadErrorHandler.WARN : - LoadErrorHandler.IGNORE); + this( + loader, + Boolean.getBoolean("org.apache.tika.service.error.warn") + ? LoadErrorHandler.WARN + : LoadErrorHandler.IGNORE); } public ServiceLoader() { - this(getContextClassLoader(), - Boolean.getBoolean("org.apache.tika.service.error.warn") ? LoadErrorHandler.WARN : - LoadErrorHandler.IGNORE, true); + this( + getContextClassLoader(), + Boolean.getBoolean("org.apache.tika.service.error.warn") + ? LoadErrorHandler.WARN + : LoadErrorHandler.IGNORE, + true); } /** - * Returns the context class loader of the current thread. If such - * a class loader is not available, then the loader of this class or - * finally the system class loader is returned. + * Returns the context class loader of the current thread. If such a class loader is not + * available, then the loader of this class or finally the system class loader is returned. * - * @return context class loader, or null if no loader - * is available + * @return context class loader, or null if no loader is available * @see TIKA-441 */ static ClassLoader getContextClassLoader() { @@ -112,12 +119,11 @@ static ClassLoader getContextClassLoader() { } /** - * Sets the context class loader to use for all threads that access - * this class. Used for example in an OSGi environment to avoid problems - * with the default context class loader. + * Sets the context class loader to use for all threads that access this class. Used for example + * in an OSGi environment to avoid problems with the default context class loader. 
* - * @param loader default context class loader, - * or null to automatically pick the loader + * @param loader default context class loader, or null to automatically pick the + * loader */ public static void setContextClassLoader(ClassLoader loader) { CONTEXT_CLASS_LOADER = loader; @@ -166,8 +172,7 @@ public InitializableProblemHandler getInitializableProblemHandler() { } /** - * Returns an input stream for reading the specified resource from the - * configured class loader. + * Returns an input stream for reading the specified resource from the configured class loader. * * @param name resource name * @return input stream, or null if the resource was not found @@ -192,18 +197,16 @@ public ClassLoader getLoader() { } /** - * Loads and returns the named service class that's expected to implement - * the given interface. - *

- * Note that this class does not use the {@link LoadErrorHandler}, a - * {@link ClassNotFoundException} is always returned for unknown - * classes or classes of the wrong type + * Loads and returns the named service class that's expected to implement the given interface. + * + *

Note that this class does not use the {@link LoadErrorHandler}, a {@link + * ClassNotFoundException} is always returned for unknown classes or classes of the wrong type * * @param iface service interface - * @param name service class name + * @param name service class name * @return service class - * @throws ClassNotFoundException if the service class can not be found - * or does not implement the given interface + * @throws ClassNotFoundException if the service class can not be found or does not implement + * the given interface * @see Class#forName(String, boolean, ClassLoader) * @since Apache Tika 1.1 */ @@ -225,10 +228,8 @@ public Class getServiceClass(Class iface, String name) } /** - * Returns all the available service resources matching the - * given pattern, such as all instances of tika-mimetypes.xml - * on the classpath, or all org.apache.tika.parser.Parser - * service files. + * Returns all the available service resources matching the given pattern, such as all instances + * of tika-mimetypes.xml on the classpath, or all org.apache.tika.parser.Parser service files. */ public Enumeration findServiceResources(String filePattern) { try { @@ -243,7 +244,7 @@ public Enumeration findServiceResources(String filePattern) { /** * Returns all the available service providers of the given type. * - * As of versions after 2.4.1, this removes duplicate classes + *

As of versions after 2.4.1, this removes duplicate classes * * @param iface service provider interface * @return available service providers @@ -256,7 +257,7 @@ public List loadServiceProviders(Class iface) { List providers = new ArrayList<>(); Set seen = new HashSet<>(); for (T provider : tmp) { - if (! seen.contains(provider.getClass().getCanonicalName())) { + if (!seen.contains(provider.getClass().getCanonicalName())) { providers.add(provider); seen.add(provider.getClass().getCanonicalName()); } @@ -265,9 +266,8 @@ public List loadServiceProviders(Class iface) { } /** - * Returns the available dynamic service providers of the given type. - * The returned list is newly allocated and may be freely modified - * by the caller. + * Returns the available dynamic service providers of the given type. The returned list is newly + * allocated and may be freely modified by the caller. * * @param iface service provider interface * @return dynamic service providers @@ -294,10 +294,9 @@ public List loadDynamicServiceProviders(Class iface) { } /** - * Returns the defined static service providers of the given type, without - * attempting to load them. - * The providers are loaded using the service provider mechanism using - * the configured class loader (if any). + * Returns the defined static service providers of the given type, without attempting to load + * them. The providers are loaded using the service provider mechanism using the configured + * class loader (if any). * * @param iface service provider interface * @return static list of uninitialised service providers @@ -326,19 +325,18 @@ public List loadStaticServiceProviders(Class iface) { } /** - * Returns the available static service providers of the given type. - * The providers are loaded using the service provider mechanism using - * the configured class loader (if any). The returned list is newly - * allocated and may be freely modified by the caller. 
+ * Returns the available static service providers of the given type. The providers are loaded + * using the service provider mechanism using the configured class loader (if any). The returned + * list is newly allocated and may be freely modified by the caller. * - * @param iface service provider interface + * @param iface service provider interface * @param excludes -- do not load these classes * @return static service providers * @since Apache Tika 1.2 */ @SuppressWarnings("unchecked") - public List loadStaticServiceProviders(Class iface, - Collection> excludes) { + public List loadStaticServiceProviders( + Class iface, Collection> excludes) { List providers = new ArrayList<>(); if (loader != null) { @@ -407,7 +405,5 @@ public boolean isInstanceOf(Class iface) { public int compareTo(RankedService that) { return that.rank - rank; // highest number first } - } - } diff --git a/tika-core/src/main/java/org/apache/tika/config/TikaActivator.java b/tika-core/src/main/java/org/apache/tika/config/TikaActivator.java index e076f1c2dd..54e05ce549 100644 --- a/tika-core/src/main/java/org/apache/tika/config/TikaActivator.java +++ b/tika-core/src/main/java/org/apache/tika/config/TikaActivator.java @@ -16,6 +16,8 @@ */ package org.apache.tika.config; +import org.apache.tika.detect.Detector; +import org.apache.tika.parser.Parser; import org.osgi.framework.BundleActivator; import org.osgi.framework.BundleContext; import org.osgi.framework.Constants; @@ -23,17 +25,13 @@ import org.osgi.util.tracker.ServiceTracker; import org.osgi.util.tracker.ServiceTrackerCustomizer; -import org.apache.tika.detect.Detector; -import org.apache.tika.parser.Parser; - /** - * Bundle activator that adjust the class loading mechanism of the - * {@link ServiceLoader} class to work correctly in an OSGi environment. - *

- * Note that you should not access this class directly. - * Instead the OSGi environment (if present) will automatically invoke the - * methods of this class based on the Bundle-Activator setting in the bundle - * manifest. + * Bundle activator that adjust the class loading mechanism of the {@link ServiceLoader} class to + * work correctly in an OSGi environment. + * + *

Note that you should not access this class directly. Instead the OSGi + * environment (if present) will automatically invoke the methods of this class based on the + * Bundle-Activator setting in the bundle manifest. * * @since Apache Tika 0.9 */ @@ -44,7 +42,8 @@ public class TikaActivator implements BundleActivator, ServiceTrackerCustomizer private ServiceTracker parserTracker; private BundleContext bundleContext; - //-----------------------------------------------------< BundleActivator > + + // -----------------------------------------------------< BundleActivator > public void start(final BundleContext context) throws Exception { bundleContext = context; @@ -73,12 +72,10 @@ public Object addingService(ServiceReference reference) { return service; } - public void modifiedService(ServiceReference reference, Object service) { - } + public void modifiedService(ServiceReference reference, Object service) {} public void removedService(ServiceReference reference, Object service) { ServiceLoader.removeService(reference); bundleContext.ungetService(reference); } - } diff --git a/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java b/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java index e68ad10d65..7fe261430c 100644 --- a/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java +++ b/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java @@ -39,15 +39,6 @@ import java.util.concurrent.ExecutorService; import java.util.concurrent.atomic.AtomicInteger; import javax.imageio.spi.ServiceRegistry; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.w3c.dom.Document; -import org.w3c.dom.Element; -import org.w3c.dom.Node; -import org.w3c.dom.NodeList; -import org.xml.sax.SAXException; - import org.apache.tika.concurrent.ConfigurableThreadPoolExecutor; import org.apache.tika.concurrent.SimpleThreadPoolExecutor; import org.apache.tika.detect.CompositeDetector; @@ -81,16 +72,21 @@ import 
org.apache.tika.utils.AnnotationUtils; import org.apache.tika.utils.StringUtils; import org.apache.tika.utils.XMLReaderUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.w3c.dom.Document; +import org.w3c.dom.Element; +import org.w3c.dom.Node; +import org.w3c.dom.NodeList; +import org.xml.sax.SAXException; -/** - * Parse xml config file. - */ +/** Parse xml config file. */ public class TikaConfig { - public static int DEFAULT_MAX_JSON_STRING_FIELD_LENGTH = 20_000_000;//jackson's default + public static int DEFAULT_MAX_JSON_STRING_FIELD_LENGTH = 20_000_000; // jackson's default public static String MAX_JSON_STRING_FIELD_LENGTH_ELEMENT_NAME = "maxJsonStringFieldLength"; - //use this to look for unneeded instantiations of TikaConfig + // use this to look for unneeded instantiations of TikaConfig protected static final AtomicInteger TIMES_INSTANTIATED = new AtomicInteger(); private static final Logger LOG = LoggerFactory.getLogger(TikaConfig.class); @@ -124,13 +120,16 @@ public TikaConfig(Path path, ServiceLoader loader) public TikaConfig(File file) throws TikaException, IOException, SAXException { this(XMLReaderUtils.buildDOM(file.toPath())); } + public TikaConfig(File file, ServiceLoader loader) throws TikaException, IOException, SAXException { this(XMLReaderUtils.buildDOM(file.toPath()), loader); } + public TikaConfig(URL url) throws TikaException, IOException, SAXException { this(url, ServiceLoader.getContextClassLoader()); } + public TikaConfig(URL url, ClassLoader loader) throws TikaException, IOException, SAXException { this(XMLReaderUtils.buildDOM(url.toString()).getDocumentElement(), loader); } @@ -184,15 +183,14 @@ private TikaConfig(Element element, ServiceLoader loader) throws TikaException, } /** - * Creates a Tika configuration from the built-in media type rules - * and all the {@link Parser} implementations available through the - * {@link ServiceRegistry service provider mechanism} in the given - * class loader. 
+ * Creates a Tika configuration from the built-in media type rules and all the {@link Parser} + * implementations available through the {@link ServiceRegistry service provider mechanism} in + * the given class loader. * - * @param loader the class loader through which parser implementations - * are loaded, or null for no parsers + * @param loader the class loader through which parser implementations are loaded, or null + * for no parsers * @throws MimeTypeException if the built-in media type rules are broken - * @throws IOException if the built-in media type rules can not be read + * @throws IOException if the built-in media type rules can not be read * @since Apache Tika 0.8 */ public TikaConfig(ClassLoader loader) throws MimeTypeException, IOException { @@ -210,20 +208,21 @@ public TikaConfig(ClassLoader loader) throws MimeTypeException, IOException { } /** - * Creates a default Tika configuration. - * First checks whether an XML config file is specified, either in + * Creates a default Tika configuration. First checks whether an XML config file is specified, + * either in + * *

    - *
  1. System property "tika.config", or
  2. - *
  3. Environment variable TIKA_CONFIG
  4. + *
  5. System property "tika.config", or + *
  6. Environment variable TIKA_CONFIG *
- *

If one of these have a value, try to resolve it relative to file - * system or classpath.

- *

If XML config is not specified, initialize from the built-in media - * type rules and all the {@link Parser} implementations available through - * the {@link ServiceRegistry service provider mechanism} in the context - * class loader of the current thread.

* - * @throws IOException if the configuration can not be read + *

If one of these have a value, try to resolve it relative to file system or classpath. + * + *

If XML config is not specified, initialize from the built-in media type rules and all the + * {@link Parser} implementations available through the {@link ServiceRegistry service provider + * mechanism} in the context class loader of the current thread. + * + * @throws IOException if the configuration can not be read * @throws TikaException if problem with MimeTypes or parsing XML config */ public TikaConfig() throws TikaException, IOException { @@ -281,17 +280,16 @@ public TikaConfig() throws TikaException, IOException { this.autoDetectParserConfig = AutoDetectParserConfig.load(element); setMaxJsonStringFieldLength(element); } catch (SAXException e) { - throw new TikaException("Specified Tika configuration has syntax errors: " + config, - e); + throw new TikaException( + "Specified Tika configuration has syntax errors: " + config, e); } } TIMES_INSTANTIATED.incrementAndGet(); } /** - * * @return maximum field length when serializing String fields in Tika's metadata or metadata - * list into JSON + * list into JSON */ public static int getMaxJsonStringFieldLength() { return MAX_JSON_STRING_FIELD_LENGTH; @@ -305,8 +303,9 @@ private void setMaxJsonStringFieldLength(Element properties) throws TikaConfigEx try { MAX_JSON_STRING_FIELD_LENGTH = Integer.parseInt(n.getTextContent()); } catch (NumberFormatException e) { - throw new TikaConfigException(MAX_JSON_STRING_FIELD_LENGTH_ELEMENT_NAME + " " + - "is not an integer", e); + throw new TikaConfigException( + MAX_JSON_STRING_FIELD_LENGTH_ELEMENT_NAME + " " + "is not an integer", + e); } return; } @@ -328,8 +327,12 @@ protected static CompositeEncodingDetector getDefaultEncodingDetector(ServiceLoa protected static CompositeRenderer getDefaultRenderer(ServiceLoader loader) { return new CompositeRenderer(loader); } - private static CompositeParser getDefaultParser(MimeTypes types, ServiceLoader loader, - EncodingDetector encodingDetector, Renderer renderer) { + + private static CompositeParser getDefaultParser( + MimeTypes 
types, + ServiceLoader loader, + EncodingDetector encodingDetector, + Renderer renderer) { return new DefaultParser(types.getMediaTypeRegistry(), loader, encodingDetector, renderer); } @@ -379,9 +382,9 @@ private static String getText(Node node) { } /** - * Provides a default configuration (TikaConfig). Currently creates a - * new instance each time it's called; we may be able to have it - * return a shared instance once it is completely immutable. + * Provides a default configuration (TikaConfig). Currently creates a new instance each time + * it's called; we may be able to have it return a shared instance once it is completely + * immutable. * * @return default configuration */ @@ -406,9 +409,8 @@ private static Element getChild(Element element, String name) { return null; } - private static List getTopLevelElementChildren(Element element, String parentName, - String childrenName) - throws TikaException { + private static List getTopLevelElementChildren( + Element element, String parentName, String childrenName) throws TikaException { Node parentNode = null; if (parentName != null) { // Should be only zero or one / etc tag @@ -505,8 +507,9 @@ private static ServiceLoader serviceLoaderFromDomElement(Element element, ClassL if (loader == null) { loader = ServiceLoader.getContextClassLoader(); } - serviceLoader = new ServiceLoader(loader, loadErrorHandler, initializableProblemHandler, - dynamic); + serviceLoader = + new ServiceLoader( + loader, loadErrorHandler, initializableProblemHandler, dynamic); } else if (loader != null) { serviceLoader = new ServiceLoader(loader); } else { @@ -520,22 +523,28 @@ private static InitializableProblemHandler getInitializableProblemHandler( if (initializableProblemHandler == null || initializableProblemHandler.length() == 0) { return InitializableProblemHandler.DEFAULT; } - if (InitializableProblemHandler.IGNORE.toString() + if (InitializableProblemHandler.IGNORE + .toString() .equalsIgnoreCase(initializableProblemHandler)) { 
return InitializableProblemHandler.IGNORE; - } else if (InitializableProblemHandler.INFO.toString() + } else if (InitializableProblemHandler.INFO + .toString() .equalsIgnoreCase(initializableProblemHandler)) { return InitializableProblemHandler.INFO; - } else if (InitializableProblemHandler.WARN.toString() + } else if (InitializableProblemHandler.WARN + .toString() .equalsIgnoreCase(initializableProblemHandler)) { return InitializableProblemHandler.WARN; - } else if (InitializableProblemHandler.THROW.toString() + } else if (InitializableProblemHandler.THROW + .toString() .equalsIgnoreCase(initializableProblemHandler)) { return InitializableProblemHandler.THROW; } - throw new TikaConfigException(String.format(Locale.US, - "Couldn't parse non-null '%s'. Must be one of 'ignore', 'info', 'warn' or 'throw'", - initializableProblemHandler)); + throw new TikaConfigException( + String.format( + Locale.US, + "Couldn't parse non-null '%s'. Must be one of 'ignore', 'info', 'warn' or 'throw'", + initializableProblemHandler)); } public static void mustNotBeEmpty(String paramName, String paramValue) @@ -562,17 +571,16 @@ private void updateXMLReaderUtils(Element element) throws TikaException { } if (child.hasAttribute("maxEntityExpansions")) { - XMLReaderUtils.setMaxEntityExpansions(Integer.parseInt(child.getAttribute("maxEntityExpansions"))); + XMLReaderUtils.setMaxEntityExpansions( + Integer.parseInt(child.getAttribute("maxEntityExpansions"))); } // make sure to call this after set entity expansions if (child.hasAttribute("poolSize")) { XMLReaderUtils.setPoolSize(Integer.parseInt(child.getAttribute("poolSize"))); } - } - /** * Returns the configured parser instance. 
* @@ -633,7 +641,7 @@ public AutoDetectParserConfig getAutoDetectParserConfig() { return autoDetectParserConfig; } - private static abstract class XmlLoader { + private abstract static class XmlLoader { protected static final String PARAMS_TAG_NAME = "params"; abstract boolean supportsComposite(); @@ -655,10 +663,13 @@ abstract T preLoadOne(Class loadedClass, String classname, MimeType abstract CT createComposite(List loaded, MimeTypes mimeTypes, ServiceLoader loader); - abstract T createComposite(Class compositeClass, List children, - Set> excludeChildren, - Map params, MimeTypes mimeTypes, - ServiceLoader loader) + abstract T createComposite( + Class compositeClass, + List children, + Set> excludeChildren, + Map params, + MimeTypes mimeTypes, + ServiceLoader loader) throws InvocationTargetException, IllegalAccessException, InstantiationException; abstract T decorate(T created, Element element) @@ -670,8 +681,8 @@ CT loadOverall(Element element, MimeTypes mimeTypes, ServiceLoader loader) List loaded = new ArrayList<>(); // Find the children of the parent tag, if any - for (Element le : getTopLevelElementChildren(element, getParentTagName(), - getLoaderTagName())) { + for (Element le : + getTopLevelElementChildren(element, getParentTagName(), getLoaderTagName())) { T loadedChild = loadOne(le, mimeTypes, loader); if (loadedChild != null) { loaded.add(loadedChild); @@ -694,10 +705,11 @@ CT loadOverall(Element element, MimeTypes mimeTypes, ServiceLoader loader) return (CT) loaded.get(0); } else if (loaded.size() > 1) { throw new TikaConfigException( - "Composite not supported for " + getParentTagName() + - ". Must specify only one child!"); + "Composite not supported for " + + getParentTagName() + + ". Must specify only one child!"); } else { - //throw exception if empty? + // throw exception if empty? 
} } // Wrap the defined parsers/detectors up in a Composite @@ -724,7 +736,7 @@ T loadOne(Element element, MimeTypes mimeTypes, ServiceLoader loader) Class loadedClass = loader.getServiceClass(getLoaderClass(), name); // Do pre-load checks and short-circuits - //TODO : allow duplicate instances with different configurations + // TODO : allow duplicate instances with different configurations loaded = preLoadOne(loadedClass, name, mimeTypes); if (loaded != null) { return loaded; @@ -762,10 +774,10 @@ T loadOne(Element element, MimeTypes mimeTypes, ServiceLoader loader) Element excl = (Element) excludeChildNodes.item(i); String exclName = excl.getAttribute("class"); try { - excludeChildren - .add(loader.getServiceClass(getLoaderClass(), exclName)); + excludeChildren.add( + loader.getServiceClass(getLoaderClass(), exclName)); } catch (ClassNotFoundException e) { - //TIKA-3268 -- This should stop the world. + // TIKA-3268 -- This should stop the world. throw new TikaConfigException( "Class not found in -exclude list: " + exclName); } @@ -773,8 +785,14 @@ T loadOne(Element element, MimeTypes mimeTypes, ServiceLoader loader) } // Create the Composite - loaded = createComposite(loadedClass, children, excludeChildren, params, - mimeTypes, loader); + loaded = + createComposite( + loadedClass, + children, + excludeChildren, + params, + mimeTypes, + loader); // Default constructor fallback if (loaded == null) { @@ -787,7 +805,7 @@ T loadOne(Element element, MimeTypes mimeTypes, ServiceLoader loader) // See the thread "Configuring parsers and translators" for details } - //Assigning the params to bean fields/setters + // Assigning the params to bean fields/setters AnnotationUtils.assignFieldParams(loaded, params); if (loaded instanceof Initializable) { ((Initializable) loaded).initialize(params); @@ -817,15 +835,19 @@ T loadOne(Element element, MimeTypes mimeTypes, ServiceLoader loader) "Unable to instantiate a " + getLoaderTagName() + " class: " + name, e); } catch 
(NoSuchMethodException e) { throw new TikaException( - "Unable to find the right constructor for " + getLoaderTagName() + - " class: " + name, e); + "Unable to find the right constructor for " + + getLoaderTagName() + + " class: " + + name, + e); } } - T newInstance(Class loadedClass) - throws IllegalAccessException, InstantiationException, NoSuchMethodException, - InvocationTargetException { + throws IllegalAccessException, + InstantiationException, + NoSuchMethodException, + InvocationTargetException { return loadedClass.getDeclaredConstructor().newInstance(); } @@ -838,8 +860,8 @@ T newInstance(Class loadedClass) Map getParams(Element el) throws TikaException { Map params = new HashMap<>(); for (Node child = el.getFirstChild(); child != null; child = child.getNextSibling()) { - if (PARAMS_TAG_NAME.equals(child.getNodeName())) { //found the node - if (child.hasChildNodes()) { //it has children + if (PARAMS_TAG_NAME.equals(child.getNodeName())) { // found the node + if (child.hasChildNodes()) { // it has children NodeList childNodes = child.getChildNodes(); for (int i = 0; i < childNodes.getLength(); i++) { Node item = childNodes.item(i); @@ -849,12 +871,11 @@ Map getParams(Element el) throws TikaException { } } } - break; //only the first one is used + break; // only the first one is used } } return params; } - } private static class ParserXmlLoader extends XmlLoader { @@ -885,13 +906,16 @@ Class getLoaderClass() { } @Override - Parser preLoadOne(Class loadedClass, String classname, - MimeTypes mimeTypes) throws TikaException { + Parser preLoadOne( + Class loadedClass, String classname, MimeTypes mimeTypes) + throws TikaException { // Check for classes which can't be set in config if (AutoDetectParser.class.isAssignableFrom(loadedClass)) { // https://issues.apache.org/jira/browse/TIKA-866 - throw new TikaException("AutoDetectParser not supported in a " + - " configuration element: " + classname); + throw new TikaException( + "AutoDetectParser not supported in a " 
+ + " configuration element: " + + classname); } // Continue with normal loading return null; @@ -904,9 +928,9 @@ boolean isComposite(Parser loaded) { @Override boolean isComposite(Class loadedClass) { - return CompositeParser.class.isAssignableFrom(loadedClass) || - AbstractMultipleParser.class.isAssignableFrom(loadedClass) || - ParserDecorator.class.isAssignableFrom(loadedClass); + return CompositeParser.class.isAssignableFrom(loadedClass) + || AbstractMultipleParser.class.isAssignableFrom(loadedClass) + || ParserDecorator.class.isAssignableFrom(loadedClass); } @Override @@ -915,16 +939,20 @@ CompositeParser createDefault(MimeTypes mimeTypes, ServiceLoader loader) { } @Override - CompositeParser createComposite(List parsers, MimeTypes mimeTypes, - ServiceLoader loader) { + CompositeParser createComposite( + List parsers, MimeTypes mimeTypes, ServiceLoader loader) { MediaTypeRegistry registry = mimeTypes.getMediaTypeRegistry(); return new CompositeParser(registry, parsers); } @Override - Parser createComposite(Class parserClass, List childParsers, - Set> excludeParsers, - Map params, MimeTypes mimeTypes, ServiceLoader loader) + Parser createComposite( + Class parserClass, + List childParsers, + Set> excludeParsers, + Map params, + MimeTypes mimeTypes, + ServiceLoader loader) throws InvocationTargetException, IllegalAccessException, InstantiationException { Parser parser = null; Constructor c = null; @@ -933,47 +961,61 @@ Parser createComposite(Class parserClass, List childPa // Try the possible default and composite parser constructors if (parser == null) { try { - c = parserClass.getConstructor(MediaTypeRegistry.class, ServiceLoader.class, - Collection.class, EncodingDetector.class, Renderer.class); - parser = c.newInstance(registry, loader, excludeParsers, encodingDetector, renderer); + c = + parserClass.getConstructor( + MediaTypeRegistry.class, + ServiceLoader.class, + Collection.class, + EncodingDetector.class, + Renderer.class); + parser = + c.newInstance( + 
registry, loader, excludeParsers, encodingDetector, renderer); } catch (NoSuchMethodException me) { - //swallow + // swallow } } if (parser == null) { try { - c = parserClass.getConstructor(MediaTypeRegistry.class, ServiceLoader.class, - Collection.class, EncodingDetector.class); + c = + parserClass.getConstructor( + MediaTypeRegistry.class, + ServiceLoader.class, + Collection.class, + EncodingDetector.class); parser = c.newInstance(registry, loader, excludeParsers, encodingDetector); } catch (NoSuchMethodException me) { - //swallow + // swallow } } if (parser == null) { try { - c = parserClass.getConstructor(MediaTypeRegistry.class, ServiceLoader.class, - Collection.class); + c = + parserClass.getConstructor( + MediaTypeRegistry.class, ServiceLoader.class, Collection.class); parser = c.newInstance(registry, loader, excludeParsers); } catch (NoSuchMethodException me) { - //swallow + // swallow } } if (parser == null) { try { - c = parserClass - .getConstructor(MediaTypeRegistry.class, List.class, Collection.class); + c = + parserClass.getConstructor( + MediaTypeRegistry.class, List.class, Collection.class); parser = c.newInstance(registry, childParsers, excludeParsers); } catch (NoSuchMethodException me) { - //swallow + // swallow } } if (parser == null) { try { - c = parserClass - .getConstructor(MediaTypeRegistry.class, Collection.class, Map.class); + c = + parserClass.getConstructor( + MediaTypeRegistry.class, Collection.class, Map.class); parser = c.newInstance(registry, childParsers, params); } catch (NoSuchMethodException me) { - //swallow + // swallow } } if (parser == null) { @@ -981,7 +1023,7 @@ Parser createComposite(Class parserClass, List childPa c = parserClass.getConstructor(MediaTypeRegistry.class, List.class); parser = c.newInstance(registry, childParsers); } catch (NoSuchMethodException me) { - //swallow + // swallow } } @@ -989,8 +1031,9 @@ Parser createComposite(Class parserClass, List childPa if (parser == null && 
ParserDecorator.class.isAssignableFrom(parserClass)) { try { CompositeParser cp = null; - if (childParsers.size() == 1 && excludeParsers.size() == 0 && - childParsers.get(0) instanceof CompositeParser) { + if (childParsers.size() == 1 + && excludeParsers.size() == 0 + && childParsers.get(0) instanceof CompositeParser) { cp = (CompositeParser) childParsers.get(0); } else { cp = new CompositeParser(registry, childParsers, excludeParsers); @@ -998,7 +1041,7 @@ Parser createComposite(Class parserClass, List childPa c = parserClass.getConstructor(Parser.class); parser = c.newInstance(cp); } catch (NoSuchMethodException me) { - //swallow + // swallow } } return parser; @@ -1006,8 +1049,10 @@ Parser createComposite(Class parserClass, List childPa @Override Parser newInstance(Class loadedClass) - throws IllegalAccessException, InstantiationException, NoSuchMethodException, - InvocationTargetException { + throws IllegalAccessException, + InstantiationException, + NoSuchMethodException, + InvocationTargetException { Parser parser = null; if (AbstractEncodingDetectorParser.class.isAssignableFrom(loadedClass)) { Constructor ctor = loadedClass.getConstructor(EncodingDetector.class); @@ -1017,7 +1062,7 @@ Parser newInstance(Class loadedClass) } if (parser instanceof RenderingParser) { - ((RenderingParser)parser).setRenderer(renderer); + ((RenderingParser) parser).setRenderer(renderer); } return parser; } @@ -1040,7 +1085,6 @@ Parser decorate(Parser created, Element element) throws IOException, TikaExcepti // All done with decoration return parser; } - } private static class DetectorXmlLoader extends XmlLoader { @@ -1062,8 +1106,9 @@ Class getLoaderClass() { } @Override - Detector preLoadOne(Class loadedClass, String classname, - MimeTypes mimeTypes) throws TikaException { + Detector preLoadOne( + Class loadedClass, String classname, MimeTypes mimeTypes) + throws TikaException { // If they asked for the mime types as a detector, give // them the one we've already created. 
TIKA-1708 if (MimeTypes.class.equals(loadedClass)) { @@ -1089,18 +1134,20 @@ CompositeDetector createDefault(MimeTypes mimeTypes, ServiceLoader loader) { } @Override - CompositeDetector createComposite(List detectors, MimeTypes mimeTypes, - ServiceLoader loader) { + CompositeDetector createComposite( + List detectors, MimeTypes mimeTypes, ServiceLoader loader) { MediaTypeRegistry registry = mimeTypes.getMediaTypeRegistry(); return new CompositeDetector(registry, detectors); } @Override - Detector createComposite(Class detectorClass, - List childDetectors, - Set> excludeDetectors, - Map params, MimeTypes mimeTypes, - ServiceLoader loader) + Detector createComposite( + Class detectorClass, + List childDetectors, + Set> excludeDetectors, + Map params, + MimeTypes mimeTypes, + ServiceLoader loader) throws InvocationTargetException, IllegalAccessException, InstantiationException { Detector detector = null; Constructor c; @@ -1109,20 +1156,22 @@ Detector createComposite(Class detectorClass, // Try the possible default and composite detector constructors if (detector == null) { try { - c = detectorClass - .getConstructor(MimeTypes.class, ServiceLoader.class, Collection.class); + c = + detectorClass.getConstructor( + MimeTypes.class, ServiceLoader.class, Collection.class); detector = c.newInstance(mimeTypes, loader, excludeDetectors); } catch (NoSuchMethodException me) { - //swallow + // swallow } } if (detector == null) { try { - c = detectorClass - .getConstructor(MediaTypeRegistry.class, List.class, Collection.class); + c = + detectorClass.getConstructor( + MediaTypeRegistry.class, List.class, Collection.class); detector = c.newInstance(registry, childDetectors, excludeDetectors); } catch (NoSuchMethodException me) { - //swallow + // swallow } } if (detector == null) { @@ -1130,7 +1179,7 @@ Detector createComposite(Class detectorClass, c = detectorClass.getConstructor(MediaTypeRegistry.class, List.class); detector = c.newInstance(registry, childDetectors); } catch 
(NoSuchMethodException me) { - //swallow + // swallow } } if (detector == null) { @@ -1138,7 +1187,7 @@ Detector createComposite(Class detectorClass, c = detectorClass.getConstructor(List.class); detector = c.newInstance(childDetectors); } catch (NoSuchMethodException me) { - //swallow + // swallow } } @@ -1170,8 +1219,9 @@ Class getLoaderClass() { } @Override - Translator preLoadOne(Class loadedClass, String classname, - MimeTypes mimeTypes) throws TikaException { + Translator preLoadOne( + Class loadedClass, String classname, MimeTypes mimeTypes) + throws TikaException { // Continue with normal loading return null; } @@ -1192,17 +1242,19 @@ Translator createDefault(MimeTypes mimeTypes, ServiceLoader loader) { } @Override - Translator createComposite(List loaded, MimeTypes mimeTypes, - ServiceLoader loader) { + Translator createComposite( + List loaded, MimeTypes mimeTypes, ServiceLoader loader) { return loaded.get(0); } @Override - Translator createComposite(Class compositeClass, - List children, - Set> excludeChildren, - Map params, MimeTypes mimeTypes, - ServiceLoader loader) + Translator createComposite( + Class compositeClass, + List children, + Set> excludeChildren, + Map params, + MimeTypes mimeTypes, + ServiceLoader loader) throws InvocationTargetException, IllegalAccessException, InstantiationException { throw new InstantiationException("Only one translator supported"); } @@ -1220,14 +1272,18 @@ ConfigurableThreadPoolExecutor createComposite( Class compositeClass, List children, Set> excludeChildren, - Map params, MimeTypes mimeTypes, ServiceLoader loader) + Map params, + MimeTypes mimeTypes, + ServiceLoader loader) throws InvocationTargetException, IllegalAccessException, InstantiationException { throw new InstantiationException("Only one executor service supported"); } @Override - ConfigurableThreadPoolExecutor createComposite(List loaded, - MimeTypes mimeTypes, ServiceLoader loader) { + ConfigurableThreadPoolExecutor createComposite( + List loaded, + 
MimeTypes mimeTypes, + ServiceLoader loader) { return loaded.get(0); } @@ -1237,8 +1293,9 @@ ConfigurableThreadPoolExecutor createDefault(MimeTypes mimeTypes, ServiceLoader } @Override - ConfigurableThreadPoolExecutor decorate(ConfigurableThreadPoolExecutor created, - Element element) throws IOException, TikaException { + ConfigurableThreadPoolExecutor decorate( + ConfigurableThreadPoolExecutor created, Element element) + throws IOException, TikaException { Element maxThreadElement = getChild(element, "max-threads"); if (maxThreadElement != null) { @@ -1258,8 +1315,8 @@ Class getLoaderClass() { } @Override - ConfigurableThreadPoolExecutor loadOne(Element element, MimeTypes mimeTypes, - ServiceLoader loader) + ConfigurableThreadPoolExecutor loadOne( + Element element, MimeTypes mimeTypes, ServiceLoader loader) throws TikaException, IOException { return super.loadOne(element, mimeTypes, loader); } @@ -1291,8 +1348,10 @@ boolean isComposite(Class loadedClass) @Override ConfigurableThreadPoolExecutor preLoadOne( - Class loadedClass, String classname, - MimeTypes mimeTypes) throws TikaException { + Class loadedClass, + String classname, + MimeTypes mimeTypes) + throws TikaException { return null; } } @@ -1317,7 +1376,6 @@ Class getLoaderClass() { return EncodingDetector.class; } - @Override boolean isComposite(EncodingDetector loaded) { return loaded instanceof CompositeEncodingDetector; @@ -1329,8 +1387,11 @@ boolean isComposite(Class loadedClass) { } @Override - EncodingDetector preLoadOne(Class loadedClass, String classname, - MimeTypes mimeTypes) throws TikaException { + EncodingDetector preLoadOne( + Class loadedClass, + String classname, + MimeTypes mimeTypes) + throws TikaException { // Check for classes which can't be set in config // Continue with normal loading return null; @@ -1342,17 +1403,21 @@ EncodingDetector createDefault(MimeTypes mimeTypes, ServiceLoader loader) { } @Override - CompositeEncodingDetector createComposite(List encodingDetectors, - 
MimeTypes mimeTypes, ServiceLoader loader) { + CompositeEncodingDetector createComposite( + List encodingDetectors, + MimeTypes mimeTypes, + ServiceLoader loader) { return new CompositeEncodingDetector(encodingDetectors); } @Override - EncodingDetector createComposite(Class encodingDetectorClass, - List childEncodingDetectors, - Set> excludeDetectors, - Map params, MimeTypes mimeTypes, - ServiceLoader loader) + EncodingDetector createComposite( + Class encodingDetectorClass, + List childEncodingDetectors, + Set> excludeDetectors, + Map params, + MimeTypes mimeTypes, + ServiceLoader loader) throws InvocationTargetException, IllegalAccessException, InstantiationException { EncodingDetector encodingDetector = null; Constructor c; @@ -1363,7 +1428,8 @@ EncodingDetector createComposite(Class encodingDetec c = encodingDetectorClass.getConstructor(ServiceLoader.class, Collection.class); encodingDetector = c.newInstance(loader, excludeDetectors); } catch (NoSuchMethodException me) { - LOG.debug("couldn't find constructor for service loader + collection for {}", + LOG.debug( + "couldn't find constructor for service loader + collection for {}", encodingDetectorClass); } } @@ -1372,7 +1438,8 @@ EncodingDetector createComposite(Class encodingDetec c = encodingDetectorClass.getConstructor(List.class); encodingDetector = c.newInstance(childEncodingDetectors); } catch (NoSuchMethodException me) { - LOG.debug("couldn't find constructor for EncodingDetector(List) for {}", + LOG.debug( + "couldn't find constructor for EncodingDetector(List) for {}", encodingDetectorClass); } } @@ -1386,8 +1453,7 @@ EncodingDetector decorate(EncodingDetector created, Element element) { } } - private static class RendererXmlLoader - extends XmlLoader { + private static class RendererXmlLoader extends XmlLoader { boolean supportsComposite() { return true; @@ -1406,7 +1472,6 @@ Class getLoaderClass() { return Renderer.class; } - @Override boolean isComposite(Renderer loaded) { return loaded instanceof 
CompositeRenderer; @@ -1418,8 +1483,9 @@ boolean isComposite(Class loadedClass) { } @Override - Renderer preLoadOne(Class loadedClass, String classname, - MimeTypes mimeTypes) throws TikaException { + Renderer preLoadOne( + Class loadedClass, String classname, MimeTypes mimeTypes) + throws TikaException { // Check for classes which can't be set in config // Continue with normal loading return null; @@ -1431,17 +1497,19 @@ Renderer createDefault(MimeTypes mimeTypes, ServiceLoader loader) { } @Override - Renderer createComposite(List renderers, - MimeTypes mimeTypes, ServiceLoader loader) { + Renderer createComposite( + List renderers, MimeTypes mimeTypes, ServiceLoader loader) { return new CompositeRenderer(renderers); } @Override - Renderer createComposite(Class rendererClass, - List childRenderers, - Set> excludeRenderers, - Map params, MimeTypes mimeTypes, - ServiceLoader loader) + Renderer createComposite( + Class rendererClass, + List childRenderers, + Set> excludeRenderers, + Map params, + MimeTypes mimeTypes, + ServiceLoader loader) throws InvocationTargetException, IllegalAccessException, InstantiationException { Renderer renderer = null; Constructor c; @@ -1452,7 +1520,8 @@ Renderer createComposite(Class rendererClass, c = rendererClass.getConstructor(ServiceLoader.class, Collection.class); renderer = c.newInstance(loader, excludeRenderers); } catch (NoSuchMethodException me) { - LOG.debug("couldn't find constructor for service loader + collection for {}", + LOG.debug( + "couldn't find constructor for service loader + collection for {}", renderer); } } @@ -1461,8 +1530,7 @@ Renderer createComposite(Class rendererClass, c = rendererClass.getConstructor(List.class); renderer = c.newInstance(childRenderers); } catch (NoSuchMethodException me) { - LOG.debug("couldn't find constructor for Renderer(List) for {}", - rendererClass); + LOG.debug("couldn't find constructor for Renderer(List) for {}", rendererClass); } } return renderer; diff --git 
a/tika-core/src/main/java/org/apache/tika/config/TikaConfigSerializer.java b/tika-core/src/main/java/org/apache/tika/config/TikaConfigSerializer.java index a2313f4081..8077c2bd2d 100644 --- a/tika-core/src/main/java/org/apache/tika/config/TikaConfigSerializer.java +++ b/tika-core/src/main/java/org/apache/tika/config/TikaConfigSerializer.java @@ -39,13 +39,6 @@ import javax.xml.transform.Transformer; import javax.xml.transform.dom.DOMSource; import javax.xml.transform.stream.StreamResult; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.w3c.dom.Document; -import org.w3c.dom.Element; -import org.w3c.dom.Node; - import org.apache.tika.detect.CompositeDetector; import org.apache.tika.detect.CompositeEncodingDetector; import org.apache.tika.detect.DefaultDetector; @@ -63,6 +56,11 @@ import org.apache.tika.parser.multiple.AbstractMultipleParser; import org.apache.tika.utils.StringUtils; import org.apache.tika.utils.XMLReaderUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.w3c.dom.Document; +import org.w3c.dom.Element; +import org.w3c.dom.Node; public class TikaConfigSerializer { @@ -86,9 +84,9 @@ public class TikaConfigSerializer { } /** - * @param config config to serialize - * @param mode serialization mode - * @param writer writer + * @param config config to serialize + * @param mode serialization mode + * @param writer writer * @param charset charset * @throws Exception */ @@ -121,8 +119,8 @@ public static void serialize(TikaConfig config, Mode mode, Writer writer, Charse transformer.transform(source, result); } - private static void addExecutorService(Mode mode, Element rootElement, Document doc, - TikaConfig config) { + private static void addExecutorService( + Mode mode, Element rootElement, Document doc, TikaConfig config) { ExecutorService executor = config.getExecutorService(); // TODO Implement the reverse of ExecutorServiceXmlLoader @@ -130,8 +128,8 @@ private static void addExecutorService(Mode mode, 
Element rootElement, Document // TODO Make it possible to get the current values from ConfigurableThreadPoolExecutor } - private static void addServiceLoader(Mode mode, Element rootElement, Document doc, - TikaConfig config) { + private static void addServiceLoader( + Mode mode, Element rootElement, Document doc, TikaConfig config) { ServiceLoader loader = config.getServiceLoader(); if (mode == Mode.MINIMAL) { @@ -148,18 +146,20 @@ private static void addServiceLoader(Mode mode, Element rootElement, Document do rootElement.appendChild(dslEl); } - private static void addTranslator(Mode mode, Element rootElement, Document doc, - TikaConfig config) { + private static void addTranslator( + Mode mode, Element rootElement, Document doc, TikaConfig config) { // Unlike the other entries, TikaConfig only wants one of // these, and no outer list Translator translator = config.getTranslator(); if (mode == Mode.MINIMAL && translator instanceof DefaultTranslator) { - Node mimeComment = doc.createComment("for example: "); + Node mimeComment = + doc.createComment( + "for example: "); rootElement.appendChild(mimeComment); } else { - if (translator instanceof DefaultTranslator && - (mode == Mode.STATIC || mode == Mode.STATIC_FULL)) { + if (translator instanceof DefaultTranslator + && (mode == Mode.STATIC || mode == Mode.STATIC_FULL)) { translator = ((DefaultTranslator) translator).getTranslator(); } if (translator != null) { @@ -173,28 +173,31 @@ private static void addTranslator(Mode mode, Element rootElement, Document doc, } private static void addMimeComment(Mode mode, Element rootElement, Document doc) { - Node mimeComment = doc.createComment("for example: "); + Node mimeComment = + doc.createComment( + "for example: "); rootElement.appendChild(mimeComment); } - private static void addEncodingDetectors(Mode mode, Element rootElement, Document doc, - TikaConfig config) throws Exception { + private static void addEncodingDetectors( + Mode mode, Element rootElement, Document doc, 
TikaConfig config) throws Exception { EncodingDetector encDetector = config.getEncodingDetector(); if (mode == Mode.MINIMAL && encDetector instanceof DefaultEncodingDetector) { // Don't output anything, all using defaults - Node detComment = doc.createComment( - "for example: " + - ""); + Node detComment = + doc.createComment( + "for example: " + + ""); rootElement.appendChild(detComment); return; } Element encDetectorsElement = doc.createElement("encodingDetectors"); - if (mode == Mode.CURRENT && encDetector instanceof DefaultEncodingDetector || - !(encDetector instanceof CompositeEncodingDetector)) { + if (mode == Mode.CURRENT && encDetector instanceof DefaultEncodingDetector + || !(encDetector instanceof CompositeEncodingDetector)) { Element encDetectorElement = doc.createElement("encodingDetector"); encDetectorElement.setAttribute("class", encDetector.getClass().getCanonicalName()); encDetectorsElement.appendChild(encDetectorElement); @@ -212,21 +215,23 @@ private static void addEncodingDetectors(Mode mode, Element rootElement, Documen rootElement.appendChild(encDetectorsElement); } - private static void addDetectors(Mode mode, Element rootElement, Document doc, - TikaConfig config) throws Exception { + private static void addDetectors( + Mode mode, Element rootElement, Document doc, TikaConfig config) throws Exception { Detector detector = config.getDetector(); if (mode == Mode.MINIMAL && detector instanceof DefaultDetector) { // Don't output anything, all using defaults - Node detComment = doc.createComment("for example: "); + Node detComment = + doc.createComment( + "for example: "); rootElement.appendChild(detComment); return; } Element detectorsElement = doc.createElement("detectors"); - if (mode == Mode.CURRENT && detector instanceof DefaultDetector || - !(detector instanceof CompositeDetector)) { + if (mode == Mode.CURRENT && detector instanceof DefaultDetector + || !(detector instanceof CompositeDetector)) { Element detectorElement = 
doc.createElement("detector"); detectorElement.setAttribute("class", detector.getClass().getCanonicalName()); detectorsElement.appendChild(detectorElement); @@ -280,8 +285,8 @@ private static void addParser(Mode mode, Element rootElement, Document doc, Pars outputParser = false; } // Special case for making Default to static - if (parser instanceof DefaultParser && - (mode == Mode.STATIC || mode == Mode.STATIC_FULL)) { + if (parser instanceof DefaultParser + && (mode == Mode.STATIC || mode == Mode.STATIC_FULL)) { outputParser = false; } } else if (parser instanceof AbstractMultipleParser) { @@ -298,8 +303,9 @@ private static void addParser(Mode mode, Element rootElement, Document doc, Pars // TODO Parser Exclusions } - private static Element addParser(Mode mode, Element rootElement, Document doc, Parser parser, - ParserDecorator decorator) throws Exception { + private static Element addParser( + Mode mode, Element rootElement, Document doc, Parser parser, ParserDecorator decorator) + throws Exception { ParseContext context = new ParseContext(); Set addedTypes = new TreeSet<>(); @@ -343,7 +349,7 @@ public static void serializeParams(Document doc, Element element, Object object) Matcher setterMatcher = Pattern.compile("\\Aset([A-Z].*)").matcher(""); Matcher getterMatcher = Pattern.compile("\\A(?:get|is)([A-Z].+)\\Z").matcher(""); - //TODO -- check code base for setters with lowercase initial letters?! + // TODO -- check code base for setters with lowercase initial letters?! 
MethodTuples nonPrimitiveSetters = new MethodTuples(); MethodTuples primitiveSetters = new MethodTuples(); MethodTuples nonPrimitiveGetters = new MethodTuples(); @@ -353,18 +359,22 @@ public static void serializeParams(Document doc, Element element, Object object) if (setterMatcher.reset(method.getName()).find()) { if (!Modifier.isPublic(method.getModifiers())) { - //we could just call getMethods, but this can be helpful debugging inf + // we could just call getMethods, but this can be helpful debugging inf LOG.trace("inaccessible setter: {} in {}", method.getName(), object.getClass()); continue; } - //require @Field on setters + // require @Field on setters if (method.getAnnotation(Field.class) == null) { - // LOG.warn("unannotated setter {} in {}", method.getName(), object.getClass()); + // LOG.warn("unannotated setter {} in {}", method.getName(), object.getClass()); continue; } if (parameterTypes.length != 1) { - //TODO -- check code base for setX() zero parameters that set boolean to true - LOG.warn("setter with wrong number of params " + method.getName() + " " + parameterTypes.length); + // TODO -- check code base for setX() zero parameters that set boolean to true + LOG.warn( + "setter with wrong number of params " + + method.getName() + + " " + + parameterTypes.length); continue; } String paramName = methodToParamName(setterMatcher.group(1)); @@ -375,23 +385,23 @@ public static void serializeParams(Document doc, Element element, Object object) } } else if (getterMatcher.reset(method.getName()).find()) { if (parameterTypes.length != 0) { - //require 0 parameters for the getter + // require 0 parameters for the getter continue; } String paramName = methodToParamName(getterMatcher.group(1)); if (PRIMITIVES.containsKey(method.getReturnType())) { - primitiveGetters.add(new MethodTuple(paramName, method, method.getReturnType())); + primitiveGetters.add( + new MethodTuple(paramName, method, method.getReturnType())); } else { - nonPrimitiveGetters.add(new 
MethodTuple(paramName, method, method.getReturnType())); + nonPrimitiveGetters.add( + new MethodTuple(paramName, method, method.getReturnType())); } - } } - //TODO -- remove nonprimitive setters/getters that have a string equivalent + // TODO -- remove nonprimitive setters/getters that have a string equivalent serializePrimitives(doc, element, object, primitiveSetters, primitiveGetters); serializeNonPrimitives(doc, element, object, nonPrimitiveSetters, nonPrimitiveGetters); - } private static String methodToParamName(String name) { @@ -399,28 +409,35 @@ private static String methodToParamName(String name) { return name; } return name.substring(0, 1).toLowerCase(Locale.US) + name.substring(1); - } - private static void serializeNonPrimitives(Document doc, Element element, - Object object, - MethodTuples setterTuples, - MethodTuples getterTuples) { + private static void serializeNonPrimitives( + Document doc, + Element element, + Object object, + MethodTuples setterTuples, + MethodTuples getterTuples) { for (Map.Entry> e : setterTuples.tuples.entrySet()) { Set getters = getterTuples.tuples.get(e.getKey()); processNonPrimitive(e.getKey(), e.getValue(), getters, doc, element, object); if (!getterTuples.tuples.containsKey(e.getKey())) { - LOG.warn("no getter for setter non-primitive: {} in {}", e.getKey(), + LOG.warn( + "no getter for setter non-primitive: {} in {}", + e.getKey(), object.getClass()); continue; } } } - private static void processNonPrimitive(String name, Set setters, - Set getters, Document doc, Element element, - Object object) { + private static void processNonPrimitive( + String name, + Set setters, + Set getters, + Document doc, + Element element, + Object object) { for (MethodTuple setter : setters) { for (MethodTuple getter : getters) { if (setter.singleParam.equals(getter.singleParam)) { @@ -431,9 +448,13 @@ private static void processNonPrimitive(String name, Set setters, } } - private static void serializeObject(String name, Document doc, 
Element element, - MethodTuple setter, - MethodTuple getter, Object object) { + private static void serializeObject( + String name, + Document doc, + Element element, + MethodTuple setter, + MethodTuple getter, + Object object) { Object item = null; try { @@ -451,17 +472,20 @@ private static void serializeObject(String name, Document doc, Element element, serializeParams(doc, element, item); } - private static void serializePrimitives(Document doc, Element root, - Object object, - MethodTuples setterTuples, MethodTuples getterTuples) { + private static void serializePrimitives( + Document doc, + Element root, + Object object, + MethodTuples setterTuples, + MethodTuples getterTuples) { Element paramsElement = null; if (object instanceof AbstractMultipleParser) { paramsElement = doc.createElement("params"); Element paramElement = doc.createElement("param"); paramElement.setAttribute("name", "metadataPolicy"); - paramElement.setAttribute("value", - ((AbstractMultipleParser) object).getMetadataPolicy().toString()); + paramElement.setAttribute( + "value", ((AbstractMultipleParser) object).getMetadataPolicy().toString()); paramsElement.appendChild(paramElement); root.appendChild(paramsElement); } @@ -504,10 +528,10 @@ private static void serializePrimitives(Document doc, Element root, param.setAttribute("name", getterTuple.name); param.setAttribute("type", PRIMITIVES.get(getterTuple.singleParam)); if (List.class.isAssignableFrom(getterTuple.singleParam)) { - //this outputs even empty list elements, which I think is good. + // this outputs even empty list elements, which I think is good. addList(param, doc, getterTuple, (List) value); } else if (Map.class.isAssignableFrom(getterTuple.singleParam)) { - //this outputs even empty lists, which I think is good. + // this outputs even empty lists, which I think is good. 
addMap(param, doc, getterTuple, (Map) value); } else { param.setTextContent(valString); @@ -520,19 +544,18 @@ private static void serializePrimitives(Document doc, Element root, } } - private static void addMap(Element param, Document doc, MethodTuple getterTuple, - Map object) { + private static void addMap( + Element param, Document doc, MethodTuple getterTuple, Map object) { for (Map.Entry e : new TreeMap(object).entrySet()) { Element element = doc.createElement("string"); element.setAttribute("key", e.getKey()); element.setAttribute("value", e.getValue()); param.appendChild(element); } - } - private static void addList(Element param, Document doc, MethodTuple getterTuple, - List list) { + private static void addList( + Element param, Document doc, MethodTuple getterTuple, List list) { for (String s : list) { Element element = doc.createElement("string"); element.setTextContent(s); @@ -563,8 +586,8 @@ private static Method findGetter(MethodTuple setter, Object object) { } private static MethodTuple pickBestSetter(Set tuples) { - //TODO -- if both string and integer, which one do we pick? - //stub for now -- just pick the first + // TODO -- if both string and integer, which one do we pick? 
+ // stub for now -- just pick the first for (MethodTuple t : tuples) { return t; } @@ -587,6 +610,7 @@ public int getSize() { return tuples.size(); } } + private static class MethodTuple { String name; Method method; @@ -607,8 +631,9 @@ public boolean equals(Object o) { return false; } MethodTuple that = (MethodTuple) o; - return name.equals(that.name) && method.equals(that.method) && - singleParam.equals(that.singleParam); + return name.equals(that.name) + && method.equals(that.method) + && singleParam.equals(that.singleParam); } @Override @@ -616,24 +641,18 @@ public int hashCode() { return Objects.hash(name, method, singleParam); } } + public enum Mode { - /** - * Minimal version of the config, defaults where possible - */ + /** Minimal version of the config, defaults where possible */ MINIMAL, - /** - * Current config, roughly as loaded - */ + /** Current config, roughly as loaded */ CURRENT, - /** - * Static version of the config, with explicit lists of parsers/decorators/etc - */ + /** Static version of the config, with explicit lists of parsers/decorators/etc */ STATIC, /** - * Static version of the config, with explicit lists of decorators etc, - * and all parsers given with their detected supported mime types + * Static version of the config, with explicit lists of decorators etc, and all parsers + * given with their detected supported mime types */ STATIC_FULL } - } diff --git a/tika-core/src/main/java/org/apache/tika/config/package-info.java b/tika-core/src/main/java/org/apache/tika/config/package-info.java index 77a0559f7f..93b2ca8626 100644 --- a/tika-core/src/main/java/org/apache/tika/config/package-info.java +++ b/tika-core/src/main/java/org/apache/tika/config/package-info.java @@ -15,8 +15,6 @@ * limitations under the License. */ -/** - * Tika configuration tools. - */ +/** Tika configuration tools. 
*/ @aQute.bnd.annotation.Version("1.0.0") package org.apache.tika.config; diff --git a/tika-core/src/main/java/org/apache/tika/detect/AutoDetectReader.java b/tika-core/src/main/java/org/apache/tika/detect/AutoDetectReader.java index bd7d4f2a95..ee9833d53c 100644 --- a/tika-core/src/main/java/org/apache/tika/detect/AutoDetectReader.java +++ b/tika-core/src/main/java/org/apache/tika/detect/AutoDetectReader.java @@ -22,9 +22,6 @@ import java.io.InputStream; import java.io.InputStreamReader; import java.nio.charset.Charset; - -import org.xml.sax.InputSource; - import org.apache.tika.config.LoadErrorHandler; import org.apache.tika.config.ServiceLoader; import org.apache.tika.exception.TikaException; @@ -32,10 +29,11 @@ import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.mime.MediaType; import org.apache.tika.utils.CharsetUtils; +import org.xml.sax.InputSource; /** - * An input stream reader that automatically detects the character encoding - * to be used for converting bytes to characters. + * An input stream reader that automatically detects the character encoding to be used for + * converting bytes to characters. * * @since Apache Tika 1.2 */ @@ -47,8 +45,9 @@ public class AutoDetectReader extends BufferedReader { private static final EncodingDetector DEFAULT_DETECTOR; static { - DEFAULT_DETECTOR = new CompositeEncodingDetector( - DEFAULT_LOADER.loadServiceProviders(EncodingDetector.class)); + DEFAULT_DETECTOR = + new CompositeEncodingDetector( + DEFAULT_LOADER.loadServiceProviders(EncodingDetector.class)); } private final Charset charset; @@ -65,28 +64,33 @@ private AutoDetectReader(InputStream stream, Charset charset) throws IOException } /** - * @param stream stream from which to read -- make sure that it supports mark! + * @param stream stream from which to read -- make sure that it supports mark! 
* @param metadata * @param detector * @param handler * @throws IOException * @throws TikaException */ - private AutoDetectReader(InputStream stream, Metadata metadata, - EncodingDetector detector, LoadErrorHandler handler) + private AutoDetectReader( + InputStream stream, + Metadata metadata, + EncodingDetector detector, + LoadErrorHandler handler) throws IOException, TikaException { this(stream, detect(stream, metadata, detector, handler)); } - public AutoDetectReader(InputStream stream, Metadata metadata, - EncodingDetector encodingDetector) throws IOException, TikaException { - this(getBuffered(stream), metadata, encodingDetector, - DEFAULT_LOADER.getLoadErrorHandler()); + public AutoDetectReader( + InputStream stream, Metadata metadata, EncodingDetector encodingDetector) + throws IOException, TikaException { + this(getBuffered(stream), metadata, encodingDetector, DEFAULT_LOADER.getLoadErrorHandler()); } public AutoDetectReader(InputStream stream, Metadata metadata, ServiceLoader loader) throws IOException, TikaException { - this(getBuffered(stream), metadata, + this( + getBuffered(stream), + metadata, new CompositeEncodingDetector(loader.loadServiceProviders(EncodingDetector.class)), loader.getLoadErrorHandler()); } @@ -100,8 +104,11 @@ public AutoDetectReader(InputStream stream) throws IOException, TikaException { this(stream, new Metadata()); } - private static Charset detect(InputStream input, Metadata metadata, - EncodingDetector detector, LoadErrorHandler handler) + private static Charset detect( + InputStream input, + Metadata metadata, + EncodingDetector detector, + LoadErrorHandler handler) throws IOException, TikaException { // Ask all given detectors for the character encoding try { @@ -122,7 +129,8 @@ private static Charset detect(InputStream input, Metadata metadata, try { Charset cs = CharsetUtils.forName(charset); metadata.set(TikaCoreProperties.DETECTED_ENCODING, cs.name()); - metadata.set(TikaCoreProperties.ENCODING_DETECTOR, + metadata.set( + 
TikaCoreProperties.ENCODING_DETECTOR, "AutoDetectReader-charset-metadata-fallback"); return cs; } catch (IllegalArgumentException e) { @@ -141,7 +149,6 @@ private static InputStream getBuffered(InputStream stream) { return new BufferedInputStream(stream); } - public Charset getCharset() { return charset; } @@ -151,5 +158,4 @@ public InputSource asInputSource() { source.setEncoding(charset.name()); return source; } - } diff --git a/tika-core/src/main/java/org/apache/tika/detect/CompositeDetector.java b/tika-core/src/main/java/org/apache/tika/detect/CompositeDetector.java index ed53918540..13d5396d48 100644 --- a/tika-core/src/main/java/org/apache/tika/detect/CompositeDetector.java +++ b/tika-core/src/main/java/org/apache/tika/detect/CompositeDetector.java @@ -23,29 +23,26 @@ import java.util.Collection; import java.util.Collections; import java.util.List; - import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.mime.MediaType; import org.apache.tika.mime.MediaTypeRegistry; import org.apache.tika.utils.StringUtils; -/** - * Content type detector that combines multiple different detection mechanisms. - */ +/** Content type detector that combines multiple different detection mechanisms. 
*/ public class CompositeDetector implements Detector { - /** - * Serial version UID - */ + /** Serial version UID */ private static final long serialVersionUID = 5980683158436430252L; private final MediaTypeRegistry registry; private final List detectors; - public CompositeDetector(MediaTypeRegistry registry, List detectors, - Collection> excludeDetectors) { + public CompositeDetector( + MediaTypeRegistry registry, + List detectors, + Collection> excludeDetectors) { if (excludeDetectors == null || excludeDetectors.isEmpty()) { this.detectors = detectors; } else { @@ -78,8 +75,8 @@ public MediaType detect(InputStream input, Metadata metadata) throws IOException } MediaType type = MediaType.OCTET_STREAM; - //we have to iterate through all detectors because the override detector may - //be within a CompositeDetector + // we have to iterate through all detectors because the override detector may + // be within a CompositeDetector for (Detector detector : getDetectors()) { MediaType detected = detector.detect(input, metadata); if (registry.isSpecializationOf(detected, type)) { @@ -90,7 +87,6 @@ public MediaType detect(InputStream input, Metadata metadata) throws IOException } /** - * * @param metadata * @return mediaType if a parseable mediatype was sent in via user or parser overrides */ @@ -111,20 +107,19 @@ private static MediaType detectOverrides(Metadata metadata) { } return null; } - /** - * Returns the component detectors. - */ + + /** Returns the component detectors. 
*/ public List getDetectors() { return Collections.unmodifiableList(detectors); } - private boolean isExcluded(Collection> excludeDetectors, - Class d) { + private boolean isExcluded( + Collection> excludeDetectors, Class d) { return excludeDetectors.contains(d) || assignableFrom(excludeDetectors, d); } - private boolean assignableFrom(Collection> excludeDetectors, - Class d) { + private boolean assignableFrom( + Collection> excludeDetectors, Class d) { for (Class e : excludeDetectors) { if (e.isAssignableFrom(d)) { return true; diff --git a/tika-core/src/main/java/org/apache/tika/detect/CompositeEncodingDetector.java b/tika-core/src/main/java/org/apache/tika/detect/CompositeEncodingDetector.java index 7db79ccc7b..999ed60e96 100644 --- a/tika-core/src/main/java/org/apache/tika/detect/CompositeEncodingDetector.java +++ b/tika-core/src/main/java/org/apache/tika/detect/CompositeEncodingDetector.java @@ -24,29 +24,25 @@ import java.util.Collections; import java.util.LinkedList; import java.util.List; - import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; public class CompositeEncodingDetector implements EncodingDetector, Serializable { - /** - * Serial version UID - */ + /** Serial version UID */ private static final long serialVersionUID = 5980683158436430252L; private final List detectors; - public CompositeEncodingDetector(List detectors, - Collection> - excludeEncodingDetectors) { + public CompositeEncodingDetector( + List detectors, + Collection> excludeEncodingDetectors) { this.detectors = new LinkedList<>(); for (EncodingDetector encodingDetector : detectors) { if (!isExcluded(excludeEncodingDetectors, encodingDetector.getClass())) { this.detectors.add(encodingDetector); } } - } public CompositeEncodingDetector(List detectors) { @@ -55,7 +51,7 @@ public CompositeEncodingDetector(List detectors) { } /** - * @param input text document input stream, or null + * @param input text document input stream, or null * @param 
metadata input metadata for the document * @return the detected Charset or null if no charset could be detected * @throws IOException @@ -66,9 +62,10 @@ public Charset detect(InputStream input, Metadata metadata) throws IOException { Charset detected = detector.detect(input, metadata); if (detected != null) { metadata.set(TikaCoreProperties.DETECTED_ENCODING, detected.name()); - //if this has been set by a leaf detector, do not overwrite - if (! detector.getClass().getSimpleName().equals("CompositeEncodingDetector")) { - metadata.set(TikaCoreProperties.ENCODING_DETECTOR, + // if this has been set by a leaf detector, do not overwrite + if (!detector.getClass().getSimpleName().equals("CompositeEncodingDetector")) { + metadata.set( + TikaCoreProperties.ENCODING_DETECTOR, detector.getClass().getSimpleName()); } return detected; @@ -84,8 +81,8 @@ public List getDetectors() { private boolean isExcluded( Collection> excludeEncodingDetectors, Class encodingDetector) { - return excludeEncodingDetectors.contains(encodingDetector) || - assignableFrom(excludeEncodingDetectors, encodingDetector); + return excludeEncodingDetectors.contains(encodingDetector) + || assignableFrom(excludeEncodingDetectors, encodingDetector); } private boolean assignableFrom( diff --git a/tika-core/src/main/java/org/apache/tika/detect/DefaultDetector.java b/tika-core/src/main/java/org/apache/tika/detect/DefaultDetector.java index 038d274e46..755c1767e5 100644 --- a/tika-core/src/main/java/org/apache/tika/detect/DefaultDetector.java +++ b/tika-core/src/main/java/org/apache/tika/detect/DefaultDetector.java @@ -20,33 +20,32 @@ import java.util.Collections; import java.util.List; import javax.imageio.spi.ServiceRegistry; - import org.apache.tika.config.ServiceLoader; import org.apache.tika.mime.MimeTypes; import org.apache.tika.utils.ServiceLoaderUtils; /** - * A composite detector based on all the {@link Detector} implementations - * available through the {@link ServiceRegistry service provider 
mechanism}. - *

- * Detectors are loaded and returned in a specified order, of user supplied - * followed by non-MimeType Tika, followed by the Tika MimeType class. - * If you need to control the order of the Detectors, you should instead - * construct your own {@link CompositeDetector} and pass in the list + * A composite detector based on all the {@link Detector} implementations available through the + * {@link ServiceRegistry service provider mechanism}. + * + *

Detectors are loaded and returned in a specified order, of user supplied followed by + * non-MimeType Tika, followed by the Tika MimeType class. If you need to control the order of the + * Detectors, you should instead construct your own {@link CompositeDetector} and pass in the list * of Detectors in the required order. * * @since Apache Tika 0.9 */ public class DefaultDetector extends CompositeDetector { - /** - * Serial version UID - */ + /** Serial version UID */ private static final long serialVersionUID = -8170114575326908027L; - private transient final ServiceLoader loader; - public DefaultDetector(MimeTypes types, ServiceLoader loader, - Collection> excludeDetectors) { + private final transient ServiceLoader loader; + + public DefaultDetector( + MimeTypes types, + ServiceLoader loader, + Collection> excludeDetectors) { super(types.getMediaTypeRegistry(), getDefaultDetectors(types, loader, excludeDetectors)); this.loader = loader; } @@ -72,25 +71,24 @@ public DefaultDetector() { } /** - * Finds all statically loadable detectors and sort the list by name, - * rather than discovery order. Detectors are used in the given order, - * so put the Tika parsers last so that non-Tika (user supplied) - * parsers can take precedence. - *

- * If an {@link OverrideDetector} is loaded, it takes precedence over - * all other detectors. + * Finds all statically loadable detectors and sort the list by name, rather than discovery + * order. Detectors are used in the given order, so put the Tika parsers last so that non-Tika + * (user supplied) parsers can take precedence. + * + *

If an {@link OverrideDetector} is loaded, it takes precedence over all other detectors. * * @param loader service loader * @return ordered list of statically loadable detectors */ - private static List getDefaultDetectors(MimeTypes types, ServiceLoader loader, - Collection> - excludeDetectors) { + private static List getDefaultDetectors( + MimeTypes types, + ServiceLoader loader, + Collection> excludeDetectors) { List detectors = loader.loadStaticServiceProviders(Detector.class, excludeDetectors); ServiceLoaderUtils.sortLoadedClasses(detectors); - //look for the override index and put that first + // look for the override index and put that first int overrideIndex = -1; int i = 0; for (Detector detector : detectors) { @@ -123,5 +121,4 @@ public List getDetectors() { return super.getDetectors(); } } - } diff --git a/tika-core/src/main/java/org/apache/tika/detect/DefaultEncodingDetector.java b/tika-core/src/main/java/org/apache/tika/detect/DefaultEncodingDetector.java index 4cf64d5e97..347cd4db41 100644 --- a/tika-core/src/main/java/org/apache/tika/detect/DefaultEncodingDetector.java +++ b/tika-core/src/main/java/org/apache/tika/detect/DefaultEncodingDetector.java @@ -19,19 +19,18 @@ import java.util.Collection; import javax.imageio.spi.ServiceRegistry; - import org.apache.tika.config.ServiceLoader; /** - * A composite encoding detector based on all the {@link EncodingDetector} implementations - * available through the {@link ServiceRegistry service provider mechanism}. Those - * loaded via the service provider mechanism are ordered by how they appear in the - * file, if there is a single service file. If multiple, there is no guarantee of order. - *

+ * A composite encoding detector based on all the {@link EncodingDetector} implementations available + * through the {@link ServiceRegistry service provider mechanism}. Those loaded via the service + * provider mechanism are ordered by how they appear in the file, if there is a single service file. + * If multiple, there is no guarantee of order. + * *

- * If you need to control the order of the Detectors, you should instead - * construct your own {@link CompositeDetector} and pass in the list - * of Detectors in the required order. + * + *

If you need to control the order of the Detectors, you should instead construct your own + * {@link CompositeDetector} and pass in the list of Detectors in the required order. * * @since Apache Tika 1.15 */ @@ -45,10 +44,9 @@ public DefaultEncodingDetector(ServiceLoader loader) { super(loader.loadServiceProviders(EncodingDetector.class)); } - public DefaultEncodingDetector(ServiceLoader loader, - Collection> - excludeEncodingDetectors) { + public DefaultEncodingDetector( + ServiceLoader loader, + Collection> excludeEncodingDetectors) { super(loader.loadServiceProviders(EncodingDetector.class), excludeEncodingDetectors); } - } diff --git a/tika-core/src/main/java/org/apache/tika/detect/DefaultProbDetector.java b/tika-core/src/main/java/org/apache/tika/detect/DefaultProbDetector.java index b7df0b6fa6..c4de0667bf 100644 --- a/tika-core/src/main/java/org/apache/tika/detect/DefaultProbDetector.java +++ b/tika-core/src/main/java/org/apache/tika/detect/DefaultProbDetector.java @@ -17,22 +17,19 @@ package org.apache.tika.detect; import java.util.List; - import org.apache.tika.config.ServiceLoader; import org.apache.tika.mime.MimeTypes; import org.apache.tika.mime.ProbabilisticMimeDetectionSelector; import org.apache.tika.utils.ServiceLoaderUtils; /** - * A version of {@link DefaultDetector} for probabilistic mime - * detectors, which use statistical techniques to blend the - * results of differing underlying detectors when attempting - * to detect the type of a given file. - * TODO Link to documentation on configuring these probabilities + * A version of {@link DefaultDetector} for probabilistic mime detectors, which use statistical + * techniques to blend the results of differing underlying detectors when attempting to detect the + * type of a given file. 
TODO Link to documentation on configuring these probabilities */ public class DefaultProbDetector extends CompositeDetector { private static final long serialVersionUID = -8836240060532323352L; - private transient final ServiceLoader loader; + private final transient ServiceLoader loader; public DefaultProbDetector(ProbabilisticMimeDetectionSelector sel, ServiceLoader loader) { super(sel.getMediaTypeRegistry(), getDefaultDetectors(sel, loader)); @@ -55,8 +52,8 @@ public DefaultProbDetector() { this(MimeTypes.getDefaultMimeTypes()); } - private static List getDefaultDetectors(ProbabilisticMimeDetectionSelector sel, - ServiceLoader loader) { + private static List getDefaultDetectors( + ProbabilisticMimeDetectionSelector sel, ServiceLoader loader) { List detectors = loader.loadStaticServiceProviders(Detector.class); ServiceLoaderUtils.sortLoadedClasses(detectors); detectors.add(sel); diff --git a/tika-core/src/main/java/org/apache/tika/detect/Detector.java b/tika-core/src/main/java/org/apache/tika/detect/Detector.java index fc237aa5aa..ead745a3c0 100644 --- a/tika-core/src/main/java/org/apache/tika/detect/Detector.java +++ b/tika-core/src/main/java/org/apache/tika/detect/Detector.java @@ -19,41 +19,35 @@ import java.io.IOException; import java.io.InputStream; import java.io.Serializable; - import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; /** - * Content type detector. Implementations of this interface use various - * heuristics to detect the content type of a document based on given - * input metadata or the first few bytes of the document stream. + * Content type detector. Implementations of this interface use various heuristics to detect the + * content type of a document based on given input metadata or the first few bytes of the document + * stream. * * @since Apache Tika 0.3 */ public interface Detector extends Serializable { /** - * Detects the content type of the given input document. 
Returns - * application/octet-stream if the type of the document - * can not be detected. - *

- * If the document input stream is not available, then the first - * argument may be null. Otherwise the detector may - * read bytes from the start of the stream to help in type detection. - * The given stream is guaranteed to support the - * {@link InputStream#markSupported() mark feature} and the detector - * is expected to {@link InputStream#mark(int) mark} the stream before - * reading any bytes from it, and to {@link InputStream#reset() reset} - * the stream before returning. The stream must not be closed by the - * detector. - *

- * The given input metadata is only read, not modified, by the detector. + * Detects the content type of the given input document. Returns application/octet-stream + * if the type of the document can not be detected. + * + *

If the document input stream is not available, then the first argument may be null + * . Otherwise the detector may read bytes from the start of the stream to help in type + * detection. The given stream is guaranteed to support the {@link InputStream#markSupported() + * mark feature} and the detector is expected to {@link InputStream#mark(int) mark} the stream + * before reading any bytes from it, and to {@link InputStream#reset() reset} the stream before + * returning. The stream must not be closed by the detector. * - * @param input document input stream, or null + *

The given input metadata is only read, not modified, by the detector. + * + * @param input document input stream, or null * @param metadata input metadata for the document * @return detected media type, or application/octet-stream * @throws IOException if the document input stream could not be read */ MediaType detect(InputStream input, Metadata metadata) throws IOException; - } diff --git a/tika-core/src/main/java/org/apache/tika/detect/EmptyDetector.java b/tika-core/src/main/java/org/apache/tika/detect/EmptyDetector.java index 9f996301ce..c76d79e210 100644 --- a/tika-core/src/main/java/org/apache/tika/detect/EmptyDetector.java +++ b/tika-core/src/main/java/org/apache/tika/detect/EmptyDetector.java @@ -18,22 +18,16 @@ import java.io.IOException; import java.io.InputStream; - import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; -/** - * Dummy detector that returns application/octet-stream for all documents. - */ +/** Dummy detector that returns application/octet-stream for all documents. */ public class EmptyDetector implements Detector { - /** - * Singleton instance of this class. - */ + /** Singleton instance of this class. */ public static final EmptyDetector INSTANCE = new EmptyDetector(); public MediaType detect(InputStream input, Metadata metadata) throws IOException { return MediaType.OCTET_STREAM; } - } diff --git a/tika-core/src/main/java/org/apache/tika/detect/EncodingDetector.java b/tika-core/src/main/java/org/apache/tika/detect/EncodingDetector.java index 9dbad4c277..be60018448 100644 --- a/tika-core/src/main/java/org/apache/tika/detect/EncodingDetector.java +++ b/tika-core/src/main/java/org/apache/tika/detect/EncodingDetector.java @@ -20,39 +20,35 @@ import java.io.InputStream; import java.io.Serializable; import java.nio.charset.Charset; - import org.apache.tika.metadata.Metadata; /** - * Character encoding detector. 
Implementations of this interface use - * various heuristics to detect the character encoding of a text document - * based on given input metadata or the first few bytes of the document stream. + * Character encoding detector. Implementations of this interface use various heuristics to detect + * the character encoding of a text document based on given input metadata or the first few bytes of + * the document stream. * * @since Apache Tika 0.4 */ public interface EncodingDetector extends Serializable { /** - * Detects the character encoding of the given text document, or - * null if the encoding of the document can not be detected. - *

- * If the document input stream is not available, then the first - * argument may be null. Otherwise the detector may - * read bytes from the start of the stream to help in encoding detection. - * The given stream is guaranteed to support the - * {@link InputStream#markSupported() mark feature} and the detector - * is expected to {@link InputStream#mark(int) mark} the stream before - * reading any bytes from it, and to {@link InputStream#reset() reset} - * the stream before returning. The stream must not be closed by the + * Detects the character encoding of the given text document, or null if the + * encoding of the document can not be detected. + * + *

If the document input stream is not available, then the first argument may be null + * . Otherwise the detector may read bytes from the start of the stream to help in + * encoding detection. The given stream is guaranteed to support the {@link + * InputStream#markSupported() mark feature} and the detector is expected to {@link + * InputStream#mark(int) mark} the stream before reading any bytes from it, and to {@link + * InputStream#reset() reset} the stream before returning. The stream must not be closed by the * detector. - *

- * The given input metadata is only read, not modified, by the detector. * - * @param input text document input stream, or null + *

The given input metadata is only read, not modified, by the detector. + * + * @param input text document input stream, or null * @param metadata input metadata for the document * @return detected character encoding, or null * @throws IOException if the document input stream could not be read */ Charset detect(InputStream input, Metadata metadata) throws IOException; - } diff --git a/tika-core/src/main/java/org/apache/tika/detect/FileCommandDetector.java b/tika-core/src/main/java/org/apache/tika/detect/FileCommandDetector.java index 42349faec1..a3a7211537 100644 --- a/tika-core/src/main/java/org/apache/tika/detect/FileCommandDetector.java +++ b/tika-core/src/main/java/org/apache/tika/detect/FileCommandDetector.java @@ -22,10 +22,6 @@ import java.io.InputStream; import java.nio.file.Files; import java.nio.file.Path; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - import org.apache.tika.config.Field; import org.apache.tika.io.BoundedInputStream; import org.apache.tika.io.TemporaryResources; @@ -38,26 +34,27 @@ import org.apache.tika.utils.FileProcessResult; import org.apache.tika.utils.ProcessUtils; import org.apache.tika.utils.StringUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** - * This runs the linux 'file' command against a file. If - * this is called on a TikaInputStream, it will use the underlying Path - * or spool the full file to disk and then run file against that. - *

- * If this is run against any other type of InputStream, it will spool - * up to {@link #maxBytes} to disk and then run the detector. - *

- * As with all detectors, mark must be supported. - *

- * If you want to use file's mime type in the parse, e.g. - * to select the parser in AutoDetectParser, set {@link FileCommandDetector#setUseMime(boolean)} - * to true. The default behavior is to store the value as {@link FileCommandDetector#FILE_MIME} - * but rely on other detectors for the "active" mime used by Tika. + * This runs the linux 'file' command against a file. If this is called on a TikaInputStream, it + * will use the underlying Path or spool the full file to disk and then run file against that. + * + *

If this is run against any other type of InputStream, it will spool up to {@link #maxBytes} to + * disk and then run the detector. + * + *

As with all detectors, mark must be supported. + * + *

If you want to use file's mime type in the parse, e.g. to select the parser in + * AutoDetectParser, set {@link FileCommandDetector#setUseMime(boolean)} to true. The default + * behavior is to store the value as {@link FileCommandDetector#FILE_MIME} but rely on other + * detectors for the "active" mime used by Tika. */ public class FileCommandDetector implements Detector { - //TODO: file has some diff mimes names for some very common mimes - //should we map file mimes to Tika mimes, e.g. text/xml -> application/xml?? + // TODO: file has some diff mimes names for some very common mimes + // should we map file mimes to Tika mimes, e.g. text/xml -> application/xml?? public static Property FILE_MIME = Property.externalText("file:mime"); private static final Logger LOGGER = LoggerFactory.getLogger(FileCommandDetector.class); @@ -75,14 +72,13 @@ public static boolean checkHasFile() { return checkHasFile(DEFAULT_FILE_COMMAND_PATH); } - public static boolean checkHasFile(String fileCommandPath) { - String[] commandline = new String[]{fileCommandPath, "-v"}; + String[] commandline = new String[] {fileCommandPath, "-v"}; return ExternalParser.check(commandline); } /** - * @param input document input stream, or null + * @param input document input stream, or null * @param metadata input metadata for the document * @return mime as identified by the file command or application/octet-stream otherwise * @throws IOException @@ -101,8 +97,8 @@ public MediaType detect(InputStream input, Metadata metadata) throws IOException } TikaInputStream tis = TikaInputStream.cast(input); if (tis != null) { - //spool the full file to disk, if called with a TikaInputStream - //and there is no underlying file + // spool the full file to disk, if called with a TikaInputStream + // and there is no underlying file return detectOnPath(tis.getPath(), metadata); } @@ -119,8 +115,12 @@ public MediaType detect(InputStream input, Metadata metadata) throws IOException private MediaType detectOnPath(Path 
path, Metadata metadata) throws IOException { String[] args = - new String[]{ProcessUtils.escapeCommandLine(fileCommandPath), "-b", "--mime-type", - ProcessUtils.escapeCommandLine(path.toAbsolutePath().toString())}; + new String[] { + ProcessUtils.escapeCommandLine(fileCommandPath), + "-b", + "--mime-type", + ProcessUtils.escapeCommandLine(path.toAbsolutePath().toString()) + }; ProcessBuilder builder = new ProcessBuilder(args); FileProcessResult result = ProcessUtils.execute(builder, timeoutMs, 10000, 10000); if (result.isTimeout()) { @@ -149,8 +149,8 @@ private MediaType detectOnPath(Path path, Metadata metadata) throws IOException @Field public void setFilePath(String fileCommandPath) { - //this opens up a potential command vulnerability. - //Don't ever let an untrusted user set this. + // this opens up a potential command vulnerability. + // Don't ever let an untrusted user set this. this.fileCommandPath = fileCommandPath; checkHasFile(this.fileCommandPath); } @@ -163,10 +163,10 @@ public void setUseMime(boolean useMime) { public boolean isUseMime() { return useMime; } + /** - * If this is not called on a TikaInputStream, this detector - * will spool up to this many bytes to a file to be detected - * by the 'file' command. + * If this is not called on a TikaInputStream, this detector will spool up to this many bytes to + * a file to be detected by the 'file' command. 
* * @param maxBytes */ @@ -179,5 +179,4 @@ public void setMaxBytes(int maxBytes) { public void setTimeoutMs(long timeoutMs) { this.timeoutMs = timeoutMs; } - } diff --git a/tika-core/src/main/java/org/apache/tika/detect/MagicDetector.java b/tika-core/src/main/java/org/apache/tika/detect/MagicDetector.java index bb9ec1da02..b8d5205e21 100644 --- a/tika-core/src/main/java/org/apache/tika/detect/MagicDetector.java +++ b/tika-core/src/main/java/org/apache/tika/detect/MagicDetector.java @@ -27,75 +27,72 @@ import java.util.Locale; import java.util.regex.Matcher; import java.util.regex.Pattern; - import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; /** - * Content type detection based on magic bytes, i.e. type-specific patterns - * near the beginning of the document input stream. - *

- * Because this works on bytes, not characters, by default any string - * matching is done as ISO_8859_1. To use an explicit different - * encoding, supply a type other than "string" / "stringignorecase" + * Content type detection based on magic bytes, i.e. type-specific patterns near the beginning of + * the document input stream. + * + *

Because this works on bytes, not characters, by default any string matching is done as + * ISO_8859_1. To use an explicit different encoding, supply a type other than "string" / + * "stringignorecase" * * @since Apache Tika 0.3 */ public class MagicDetector implements Detector { /** - * The matching media type. Returned by the - * {@link #detect(InputStream, Metadata)} method if a match is found. + * The matching media type. Returned by the {@link #detect(InputStream, Metadata)} method if a + * match is found. */ private final MediaType type; - /** - * Length of the comparison window. - */ + + /** Length of the comparison window. */ private final int length; + /** - * The magic match pattern. If this byte pattern is equal to the - * possibly bit-masked bytes from the input stream, then the type - * detection succeeds and the configured {@link #type} is returned. + * The magic match pattern. If this byte pattern is equal to the possibly bit-masked bytes from + * the input stream, then the type detection succeeds and the configured {@link #type} is + * returned. */ private final byte[] pattern; + /** - * Length of the pattern, which in the case of regular expressions will - * not be the same as the comparison window length. + * Length of the pattern, which in the case of regular expressions will not be the same as the + * comparison window length. */ private final int patternLength; - /** - * True if pattern is a regular expression, false otherwise. - */ + + /** True if pattern is a regular expression, false otherwise. */ private final boolean isRegex; - /** - * True if we're doing a case-insensitive string match, false otherwise. - */ + + /** True if we're doing a case-insensitive string match, false otherwise. */ private final boolean isStringIgnoreCase; - /** - * Bit mask that is applied to the source bytes before pattern matching. - */ + + /** Bit mask that is applied to the source bytes before pattern matching. 
*/ private final byte[] mask; + /** - * First offset (inclusive) of the comparison window within the - * document input stream. Greater than or equal to zero. + * First offset (inclusive) of the comparison window within the document input stream. Greater + * than or equal to zero. */ private final int offsetRangeBegin; + /** - * Last offset (inclusive) of the comparison window within the document - * input stream. Greater than or equal to the - * {@link #offsetRangeBegin first offset}. - *

- * Note that this is not the offset of the last byte read from - * the document stream. Instead, the last window of bytes to be compared - * starts at this offset. + * Last offset (inclusive) of the comparison window within the document input stream. Greater + * than or equal to the {@link #offsetRangeBegin first offset}. + * + *

Note that this is not the offset of the last byte read from the document stream. + * Instead, the last window of bytes to be compared starts at this offset. */ private final int offsetRangeEnd; /** - * Creates a detector for input documents that have the exact given byte - * pattern at the beginning of the document stream. + * Creates a detector for input documents that have the exact given byte pattern at the + * beginning of the document stream. * - * @param type matching media type + * @param type matching media type * @param pattern magic match pattern */ public MagicDetector(MediaType type, byte[] pattern) { @@ -103,42 +100,46 @@ public MagicDetector(MediaType type, byte[] pattern) { } /** - * Creates a detector for input documents that have the exact given byte - * pattern at the given offset of the document stream. + * Creates a detector for input documents that have the exact given byte pattern at the given + * offset of the document stream. * - * @param type matching media type + * @param type matching media type * @param pattern magic match pattern - * @param offset offset of the pattern match + * @param offset offset of the pattern match */ public MagicDetector(MediaType type, byte[] pattern, int offset) { this(type, pattern, null, offset, offset); } /** - * Creates a detector for input documents that meet the specified magic - * match. {@code pattern} must NOT be a regular expression. - * Constructor maintained for legacy reasons. + * Creates a detector for input documents that meet the specified magic match. {@code pattern} + * must NOT be a regular expression. Constructor maintained for legacy reasons. 
*/ - public MagicDetector(MediaType type, byte[] pattern, byte[] mask, int offsetRangeBegin, - int offsetRangeEnd) { + public MagicDetector( + MediaType type, byte[] pattern, byte[] mask, int offsetRangeBegin, int offsetRangeEnd) { this(type, pattern, mask, false, offsetRangeBegin, offsetRangeEnd); } - /** - * Creates a detector for input documents that meet the specified - * magic match. - */ - public MagicDetector(MediaType type, byte[] pattern, byte[] mask, boolean isRegex, - int offsetRangeBegin, int offsetRangeEnd) { + /** Creates a detector for input documents that meet the specified magic match. */ + public MagicDetector( + MediaType type, + byte[] pattern, + byte[] mask, + boolean isRegex, + int offsetRangeBegin, + int offsetRangeEnd) { this(type, pattern, mask, isRegex, false, offsetRangeBegin, offsetRangeEnd); } - /** - * Creates a detector for input documents that meet the specified - * magic match. - */ - public MagicDetector(MediaType type, byte[] pattern, byte[] mask, boolean isRegex, - boolean isStringIgnoreCase, int offsetRangeBegin, int offsetRangeEnd) { + /** Creates a detector for input documents that meet the specified magic match. 
*/ + public MagicDetector( + MediaType type, + byte[] pattern, + byte[] mask, + boolean isRegex, + boolean isStringIgnoreCase, + int offsetRangeBegin, + int offsetRangeEnd) { if (type == null) { throw new IllegalArgumentException("Matching media type is null"); } else if (pattern == null) { @@ -183,8 +184,8 @@ public MagicDetector(MediaType type, byte[] pattern, byte[] mask, boolean isRege this.offsetRangeEnd = offsetRangeEnd; } - public static MagicDetector parse(MediaType mediaType, String type, String offset, String value, - String mask) { + public static MagicDetector parse( + MediaType mediaType, String type, String offset, String value, String mask) { int start = 0; int end = 0; if (offset != null) { @@ -204,8 +205,14 @@ public static MagicDetector parse(MediaType mediaType, String type, String offse maskBytes = decodeValue(mask, type); } - return new MagicDetector(mediaType, patternBytes, maskBytes, type.equals("regex"), - type.equals("stringignorecase"), start, end); + return new MagicDetector( + mediaType, + patternBytes, + maskBytes, + type.equals("regex"), + type.equals("stringignorecase"), + start, + end); } private static byte[] decodeValue(String value, String type) { @@ -241,29 +248,43 @@ private static byte[] decodeValue(String value, String type) { decoded = tmpVal.getBytes(UTF_8); break; case "host16": - case "little16": { - int i = Integer.parseInt(tmpVal, radix); - decoded = new byte[]{(byte) (i & 0x00FF), (byte) (i >> 8)}; - break; - } - case "big16": { - int i = Integer.parseInt(tmpVal, radix); - decoded = new byte[]{(byte) (i >> 8), (byte) (i & 0x00FF)}; - break; - } + case "little16": + { + int i = Integer.parseInt(tmpVal, radix); + decoded = new byte[] {(byte) (i & 0x00FF), (byte) (i >> 8)}; + break; + } + case "big16": + { + int i = Integer.parseInt(tmpVal, radix); + decoded = new byte[] {(byte) (i >> 8), (byte) (i & 0x00FF)}; + break; + } case "host32": - case "little32": { - long i = Long.parseLong(tmpVal, radix); - decoded = new 
byte[]{(byte) ((i & 0x000000FF)), (byte) ((i & 0x0000FF00) >> 8), - (byte) ((i & 0x00FF0000) >> 16), (byte) ((i & 0xFF000000) >> 24)}; - break; - } - case "big32": { - long i = Long.parseLong(tmpVal, radix); - decoded = new byte[]{(byte) ((i & 0xFF000000) >> 24), (byte) ((i & 0x00FF0000) >> 16), - (byte) ((i & 0x0000FF00) >> 8), (byte) ((i & 0x000000FF))}; - break; - } + case "little32": + { + long i = Long.parseLong(tmpVal, radix); + decoded = + new byte[] { + (byte) ((i & 0x000000FF)), + (byte) ((i & 0x0000FF00) >> 8), + (byte) ((i & 0x00FF0000) >> 16), + (byte) ((i & 0xFF000000) >> 24) + }; + break; + } + case "big32": + { + long i = Long.parseLong(tmpVal, radix); + decoded = + new byte[] { + (byte) ((i & 0xFF000000) >> 24), + (byte) ((i & 0x00FF0000) >> 16), + (byte) ((i & 0x0000FF00) >> 8), + (byte) ((i & 0x000000FF)) + }; + break; + } } return decoded; } @@ -295,8 +316,9 @@ private static byte[] decodeString(String value, String type) { i++; } else { int j = i + 1; - while ((j < i + 4) && (j < value.length()) && - (Character.isDigit(value.charAt(j)))) { + while ((j < i + 4) + && (j < value.length()) + && (Character.isDigit(value.charAt(j)))) { j++; } decoded.write(Short.decode("0" + value.substring(i + 1, j)).byteValue()); @@ -333,7 +355,7 @@ private static byte[] decodeString(String value, String type) { } /** - * @param input document input stream, or null + * @param input document input stream, or null * @param metadata ignored */ public MediaType detect(InputStream input, Metadata metadata) throws IOException { @@ -425,13 +447,18 @@ public int getLength() { } /** - * Returns a string representation of the Detection Rule. - * Should sort nicely by type and details, as we sometimes - * compare these. + * Returns a string representation of the Detection Rule. Should sort nicely by type and + * details, as we sometimes compare these. */ public String toString() { // Needs to be unique, as these get compared. 
- return "Magic Detection for " + type + " looking for " + pattern.length + " bytes = " + - this.pattern + " mask = " + this.mask; + return "Magic Detection for " + + type + + " looking for " + + pattern.length + + " bytes = " + + this.pattern + + " mask = " + + this.mask; } } diff --git a/tika-core/src/main/java/org/apache/tika/detect/NNExampleModelDetector.java b/tika-core/src/main/java/org/apache/tika/detect/NNExampleModelDetector.java index bcbf48feb5..76fe42dad6 100644 --- a/tika-core/src/main/java/org/apache/tika/detect/NNExampleModelDetector.java +++ b/tika-core/src/main/java/org/apache/tika/detect/NNExampleModelDetector.java @@ -27,12 +27,10 @@ import java.net.URL; import java.nio.file.Path; import java.util.Objects; - +import org.apache.tika.mime.MediaType; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.apache.tika.mime.MediaType; - public class NNExampleModelDetector extends TrainedModelDetector { private static final String EXAMPLE_NNMODEL_FILE = "tika-example.nnmodel"; @@ -68,16 +66,13 @@ public void loadDefaultModels(InputStream modelStream) { // add this model into map of trained models. 
super.registerModels(nnBuilder.getType(), nnBuilder.build()); } - } } catch (IOException e) { throw new RuntimeException("Unable to read the default media type registry", e); } } - /** - * this method gets overwritten to register load neural network models - */ + /** this method gets overwritten to register load neural network models */ @Override public void loadDefaultModels(ClassLoader classLoader) { if (classLoader == null) { @@ -91,22 +86,20 @@ public void loadDefaultModels(ClassLoader classLoader) { // Get the core URL, and all the extensions URLs URL modelURL = classLoader.getResource(classPrefix + EXAMPLE_NNMODEL_FILE); - Objects.requireNonNull(modelURL, - "required resource " + classPrefix + EXAMPLE_NNMODEL_FILE + " not found"); + Objects.requireNonNull( + modelURL, "required resource " + classPrefix + EXAMPLE_NNMODEL_FILE + " not found"); try (InputStream stream = modelURL.openStream()) { loadDefaultModels(stream); } catch (IOException e) { throw new RuntimeException("Unable to read the default media type registry", e); } - } /** - * read the comments where the model configuration is written, e.g the - * number of inputs, hiddens and output please ensure the first char in the - * given string is # In this example grb model file, there are 4 elements 1) - * type 2) number of input units 3) number of hidden units. 4) number of - * output units. + * read the comments where the model configuration is written, e.g the number of inputs, hiddens + * and output please ensure the first char in the given string is # In this example grb model + * file, there are 4 elements 1) type 2) number of input units 3) number of hidden units. 4) + * number of output units. 
*/ private void readDescription(final NNTrainedModelBuilder builder, final String line) { int numInputs; @@ -130,8 +123,8 @@ private void readDescription(final NNTrainedModelBuilder builder, final String l } /** - * Read the next line for the model parameters and populate the build which - * later will be used to instantiate the instance of TrainedModel + * Read the next line for the model parameters and populate the build which later will be used + * to instantiate the instance of TrainedModel * * @param builder * @param line diff --git a/tika-core/src/main/java/org/apache/tika/detect/NNTrainedModel.java b/tika-core/src/main/java/org/apache/tika/detect/NNTrainedModel.java index 73ee560db4..c0f06e9846 100644 --- a/tika-core/src/main/java/org/apache/tika/detect/NNTrainedModel.java +++ b/tika-core/src/main/java/org/apache/tika/detect/NNTrainedModel.java @@ -25,8 +25,8 @@ public class NNTrainedModel extends TrainedModel { private final float[][] Theta1; private final float[][] Theta2; - public NNTrainedModel(final int nInput, final int nHidden, final int nOutput, - final float[] nn_params) { + public NNTrainedModel( + final int nInput, final int nHidden, final int nOutput, final float[] nn_params) { this.numOfInputs = nInput; this.numOfHidden = nHidden; this.numOfOutputs = nOutput; @@ -64,8 +64,7 @@ public double predict(double[] unseen) { } /** - * The given input vector of unseen is m=(256 + 1) * n= 1 this returns a - * prediction probability + * The given input vector of unseen is m=(256 + 1) * n= 1 this returns a prediction probability */ @Override public float predict(float[] unseen) { @@ -74,7 +73,7 @@ public float predict(float[] unseen) { int i, j; int m = this.Theta1.length; int n = this.Theta1[0].length; - float[] hh = new float[m + 1];// hidden unit summation + float[] hh = new float[m + 1]; // hidden unit summation hh[0] = 1; for (i = 0; i < m; i++) { double h = 0; diff --git a/tika-core/src/main/java/org/apache/tika/detect/NNTrainedModelBuilder.java 
b/tika-core/src/main/java/org/apache/tika/detect/NNTrainedModelBuilder.java index 9b4eab3854..f710a9d5ec 100644 --- a/tika-core/src/main/java/org/apache/tika/detect/NNTrainedModelBuilder.java +++ b/tika-core/src/main/java/org/apache/tika/detect/NNTrainedModelBuilder.java @@ -15,7 +15,6 @@ * limitations under the License. */ - package org.apache.tika.detect; import org.apache.tika.mime.MediaType; diff --git a/tika-core/src/main/java/org/apache/tika/detect/NameDetector.java b/tika-core/src/main/java/org/apache/tika/detect/NameDetector.java index 36d01e1711..2f49ec31ee 100644 --- a/tika-core/src/main/java/org/apache/tika/detect/NameDetector.java +++ b/tika-core/src/main/java/org/apache/tika/detect/NameDetector.java @@ -23,37 +23,33 @@ import java.net.URLDecoder; import java.util.Map; import java.util.regex.Pattern; - import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.mime.MediaType; /** - * Content type detection based on the resource name. An instance of this - * class contains a set of regular expression patterns that are matched - * against the resource name potentially given as a part of the input metadata. - *

- * If a pattern matches the given name, then the media type associated with - * that pattern is returned as the likely content type of the input document. - * Otherwise the returned type is application/octet-stream. - *

- * See the {@link #detect(InputStream, Metadata)} method for more details - * of the matching algorithm. + * Content type detection based on the resource name. An instance of this class contains a set of + * regular expression patterns that are matched against the resource name potentially given as a + * part of the input metadata. + * + *

If a pattern matches the given name, then the media type associated with that pattern is + * returned as the likely content type of the input document. Otherwise the returned type is + * application/octet-stream. + * + *

See the {@link #detect(InputStream, Metadata)} method for more details of the matching + * algorithm. * * @since Apache Tika 0.3 */ public class NameDetector implements Detector { - /** - * The regular expression patterns used for type detection. - */ + /** The regular expression patterns used for type detection. */ private final Map patterns; /** - * Creates a new content type detector based on the given name patterns. - * The given pattern map is not copied, so the caller may update the - * mappings even after this detector instance has been created. However, - * the map must not be concurrently modified while this instance + * Creates a new content type detector based on the given name patterns. The given pattern map + * is not copied, so the caller may update the mappings even after this detector instance has + * been created. However, the map must not be concurrently modified while this instance * is used for type detection. * * @param patterns map from name patterns to corresponding media types @@ -63,34 +59,25 @@ public NameDetector(Map patterns) { } /** - * Detects the content type of an input document based on the document - * name given in the input metadata. The RESOURCE_NAME_KEY attribute of - * the given input metadata is expected to contain the name (normally - * a file name or a URL) of the input document. - *

- * If a resource name is given, then it is first processed as follows. + * Detects the content type of an input document based on the document name given in the input + * metadata. The RESOURCE_NAME_KEY attribute of the given input metadata is expected to contain + * the name (normally a file name or a URL) of the input document. + * + *

If a resource name is given, then it is first processed as follows. + * *

    - *
  1. - * Potential URL query (?...) and fragment identifier (#...) - * parts are removed from the end of the resource name. - *
  2. - *
  3. - * Potential leading path elements (up to the last slash or backslash) - * are removed from the beginning of the resource name. - *
  4. - *
  5. - * Potential URL encodings (%nn, in UTF-8) are decoded. - *
  6. - *
  7. - * Any leading and trailing whitespace is removed. - *
  8. + *
  9. Potential URL query (?...) and fragment identifier (#...) parts are removed from the + * end of the resource name. + *
  10. Potential leading path elements (up to the last slash or backslash) are removed from + * the beginning of the resource name. + *
  11. Potential URL encodings (%nn, in UTF-8) are decoded. + *
  12. Any leading and trailing whitespace is removed. *
- *

- * The resulting name string (if any) is then matched in sequence against - * all the configured name patterns. If a match is found, then the (first) - * matching media type is returned. * - * @param input ignored + *

The resulting name string (if any) is then matched in sequence against all the configured + * name patterns. If a match is found, then the (first) matching media type is returned. + * + * @param input ignored * @param metadata input metadata, possibly with a RESOURCE_NAME_KEY value * @return detected media type, or application/octet-stream */ @@ -147,5 +134,4 @@ public MediaType detect(InputStream input, Metadata metadata) { return MediaType.OCTET_STREAM; } - } diff --git a/tika-core/src/main/java/org/apache/tika/detect/NonDetectingEncodingDetector.java b/tika-core/src/main/java/org/apache/tika/detect/NonDetectingEncodingDetector.java index 896a795318..a473cffaff 100644 --- a/tika-core/src/main/java/org/apache/tika/detect/NonDetectingEncodingDetector.java +++ b/tika-core/src/main/java/org/apache/tika/detect/NonDetectingEncodingDetector.java @@ -21,24 +21,17 @@ import java.io.InputStream; import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; - import org.apache.tika.config.Field; import org.apache.tika.metadata.Metadata; -/** - * Always returns the charset passed in via the initializer - */ +/** Always returns the charset passed in via the initializer */ public class NonDetectingEncodingDetector implements EncodingDetector { - //would have preferred final, but need mutability for - //loading via TikaConfig + // would have preferred final, but need mutability for + // loading via TikaConfig private Charset charset = StandardCharsets.UTF_8; - /** - * Sets charset to UTF-8. - */ - public NonDetectingEncodingDetector() { - - } + /** Sets charset to UTF-8. 
*/ + public NonDetectingEncodingDetector() {} public NonDetectingEncodingDetector(Charset charset) { this.charset = charset; diff --git a/tika-core/src/main/java/org/apache/tika/detect/OverrideDetector.java b/tika-core/src/main/java/org/apache/tika/detect/OverrideDetector.java index b6c5a41f8a..ee2944d227 100644 --- a/tika-core/src/main/java/org/apache/tika/detect/OverrideDetector.java +++ b/tika-core/src/main/java/org/apache/tika/detect/OverrideDetector.java @@ -18,17 +18,16 @@ import java.io.IOException; import java.io.InputStream; - import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.mime.MediaType; /** - * Use this to force a content type detection via the - * {@link TikaCoreProperties#CONTENT_TYPE_USER_OVERRIDE} key in the metadata object. - *

- * This is also required to override detection by some parsers - * via {@link TikaCoreProperties#CONTENT_TYPE_PARSER_OVERRIDE}. + * Use this to force a content type detection via the {@link + * TikaCoreProperties#CONTENT_TYPE_USER_OVERRIDE} key in the metadata object. + * + *

This is also required to override detection by some parsers via {@link + * TikaCoreProperties#CONTENT_TYPE_PARSER_OVERRIDE}. * * @deprecated after 2.5.0 this functionality was moved to the CompositeDetector */ diff --git a/tika-core/src/main/java/org/apache/tika/detect/TextDetector.java b/tika-core/src/main/java/org/apache/tika/detect/TextDetector.java index 96583be7c2..8cfad3f564 100644 --- a/tika-core/src/main/java/org/apache/tika/detect/TextDetector.java +++ b/tika-core/src/main/java/org/apache/tika/detect/TextDetector.java @@ -19,46 +19,39 @@ import java.io.IOException; import java.io.InputStream; import java.util.Arrays; - import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; /** - * Content type detection of plain text documents. This detector looks at the - * beginning of the document input stream and considers the document to be - * a text document if no ASCII (ISO-Latin-1, UTF-8, etc.) control bytes are - * found. As a special case some control bytes (up to 2% of all characters) - * are also allowed in a text document if it also contains no or just a few - * (less than 10%) characters above the 7-bit ASCII range. - *

- * Note that text documents with a character encoding like UTF-16 are better - * detected with {@link MagicDetector} and an appropriate magic byte pattern. + * Content type detection of plain text documents. This detector looks at the beginning of the + * document input stream and considers the document to be a text document if no ASCII (ISO-Latin-1, + * UTF-8, etc.) control bytes are found. As a special case some control bytes (up to 2% of all + * characters) are also allowed in a text document if it also contains no or just a few (less than + * 10%) characters above the 7-bit ASCII range. + * + *

Note that text documents with a character encoding like UTF-16 are better detected with {@link + * MagicDetector} and an appropriate magic byte pattern. * * @since Apache Tika 0.3 */ public class TextDetector implements Detector { - /** - * Serial version UID - */ + /** Serial version UID */ private static final long serialVersionUID = 4774601079503507765L; - /** - * The number of bytes from the beginning of the document stream - * to test for control bytes. - */ + /** The number of bytes from the beginning of the document stream to test for control bytes. */ private static final int DEFAULT_NUMBER_OF_BYTES_TO_TEST = 512; /** - * Lookup table for all the ASCII/ISO-Latin/UTF-8/etc. control bytes - * in the range below 0x20 (the space character). If an entry in this - * table is true then that byte is very unlikely to occur - * in a plain text document. - *

- * The contents of this lookup table are based on the following definition - * from section 4 of the "Content-Type Processing Model" Internet-draft - * (true then that byte is very + * unlikely to occur in a plain text document. + * + *

The contents of this lookup table are based on the following definition from section 4 of + * the "Content-Type Processing Model" Internet-draft (draft-abarth-mime-sniff-01). + * *

      * +-------------------------+
      * | Binary data byte ranges |
@@ -86,29 +79,29 @@ public class TextDetector implements Detector {
     private final int bytesToTest;
 
     /**
-     * Constructs a {@link TextDetector} which will look at the default number
-     * of bytes from the beginning of the document.
+     * Constructs a {@link TextDetector} which will look at the default number of bytes from the
+     * beginning of the document.
      */
     public TextDetector() {
         this(DEFAULT_NUMBER_OF_BYTES_TO_TEST);
     }
 
     /**
-     * Constructs a {@link TextDetector} which will look at a given number of
-     * bytes from the beginning of the document.
+     * Constructs a {@link TextDetector} which will look at a given number of bytes from the
+     * beginning of the document.
      */
     public TextDetector(int bytesToTest) {
         this.bytesToTest = bytesToTest;
     }
 
     /**
-     * Looks at the beginning of the document input stream to determine
-     * whether the document is text or not.
+     * Looks at the beginning of the document input stream to determine whether the document is text
+     * or not.
      *
-     * @param input    document input stream, or null
+     * @param input document input stream, or null
      * @param metadata ignored
-     * @return "text/plain" if the input stream suggest a text document,
-     * "application/octet-stream" otherwise
+     * @return "text/plain" if the input stream suggest a text document, "application/octet-stream"
+     *     otherwise
      */
     public MediaType detect(InputStream input, Metadata metadata) throws IOException {
         if (input == null) {
@@ -137,5 +130,4 @@ public MediaType detect(InputStream input, Metadata metadata) throws IOException
             input.reset();
         }
     }
-
 }
diff --git a/tika-core/src/main/java/org/apache/tika/detect/TextStatistics.java b/tika-core/src/main/java/org/apache/tika/detect/TextStatistics.java
index 50f8d790aa..29252ccd67 100644
--- a/tika-core/src/main/java/org/apache/tika/detect/TextStatistics.java
+++ b/tika-core/src/main/java/org/apache/tika/detect/TextStatistics.java
@@ -35,11 +35,11 @@ public void addData(byte[] buffer, int offset, int length) {
     }
 
     /**
-     * Checks whether at least one byte was seen and that the bytes that
-     * were seen were mostly plain text (i.e. < 2% control, > 90% ASCII range).
+     * Checks whether at least one byte was seen and that the bytes that were seen were mostly plain
+     * text (i.e. < 2% control, > 90% ASCII range).
      *
-     * @return true if the seen bytes were mostly safe ASCII,
-     * false otherwise
+     * @return true if the seen bytes were mostly safe ASCII, false
+     *     otherwise
      * @see TIKA-483
      * @see TIKA-688
      */
@@ -53,8 +53,7 @@ public boolean isMostlyAscii() {
     /**
      * Checks whether the observed byte stream looks like UTF-8 encoded text.
      *
-     * @return true if the seen bytes look like UTF-8,
-     * false otherwise
+     * @return true if the seen bytes look like UTF-8, false otherwise
      * @since Apache Tika 1.3
      */
     public boolean looksLikeUTF8() {
@@ -63,16 +62,18 @@ public boolean looksLikeUTF8() {
         int safe = countSafeControl();
 
         int expectedContinuation = 0;
-        int[] leading = new int[]{count(0xc0, 0xe0), count(0xe0, 0xf0), count(0xf0, 0xf8)};
+        int[] leading = new int[] {count(0xc0, 0xe0), count(0xe0, 0xf0), count(0xf0, 0xf8)};
         for (int i = 0; i < leading.length; i++) {
             utf8 += leading[i];
             expectedContinuation += (i + 1) * leading[i];
         }
 
         int continuation = count(0x80, 0xc0);
-        return utf8 > 0 && continuation <= expectedContinuation &&
-                continuation >= expectedContinuation - 3 && count(0xf8, 0x100) == 0 &&
-                (control - safe) * 100 < utf8 * 2;
+        return utf8 > 0
+                && continuation <= expectedContinuation
+                && continuation >= expectedContinuation - 3
+                && count(0xf8, 0x100) == 0
+                && (control - safe) * 100 < utf8 * 2;
     }
 
     /**
@@ -95,13 +96,13 @@ public int count(int b) {
     }
 
     /**
-     * Counts control characters (i.e. < 0x20, excluding tab, CR, LF,
-     * page feed and escape).
-     * 

- * This definition of control characters is based on section 4 of the - * "Content-Type Processing Model" Internet-draft - * (This definition of control characters is based on section 4 of the "Content-Type + * Processing Model" Internet-draft (draft-abarth-mime-sniff-01). + * *

      * +-------------------------+
      * | Binary data byte ranges |
@@ -149,8 +150,10 @@ private int count(int from, int to) {
     }
 
     private int countSafeControl() {
-        return count('\t') + count('\n') + count('\r') // tab, LF, CR
-                + count(0x0c) + count(0x1b);           // new page, escape
+        return count('\t')
+                + count('\n')
+                + count('\r') // tab, LF, CR
+                + count(0x0c)
+                + count(0x1b); // new page, escape
     }
-
 }
diff --git a/tika-core/src/main/java/org/apache/tika/detect/TrainedModel.java b/tika-core/src/main/java/org/apache/tika/detect/TrainedModel.java
index 0111b233bb..6d725ee80f 100644
--- a/tika-core/src/main/java/org/apache/tika/detect/TrainedModel.java
+++ b/tika-core/src/main/java/org/apache/tika/detect/TrainedModel.java
@@ -16,7 +16,6 @@
  */
 package org.apache.tika.detect;
 
-
 public abstract class TrainedModel {
 
     public abstract double predict(double[] input);
diff --git a/tika-core/src/main/java/org/apache/tika/detect/TrainedModelDetector.java b/tika-core/src/main/java/org/apache/tika/detect/TrainedModelDetector.java
index 25b9f085be..170ed0ba02 100644
--- a/tika-core/src/main/java/org/apache/tika/detect/TrainedModelDetector.java
+++ b/tika-core/src/main/java/org/apache/tika/detect/TrainedModelDetector.java
@@ -31,7 +31,6 @@
 import java.nio.file.Path;
 import java.util.HashMap;
 import java.util.Map;
-
 import org.apache.tika.io.TemporaryResources;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
diff --git a/tika-core/src/main/java/org/apache/tika/detect/TypeDetector.java b/tika-core/src/main/java/org/apache/tika/detect/TypeDetector.java
index 60d75c7b0c..d10ef00242 100644
--- a/tika-core/src/main/java/org/apache/tika/detect/TypeDetector.java
+++ b/tika-core/src/main/java/org/apache/tika/detect/TypeDetector.java
@@ -17,27 +17,25 @@
 package org.apache.tika.detect;
 
 import java.io.InputStream;
-
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
 
 /**
- * Content type detection based on a content type hint. This detector simply
- * trusts any valid content type hint given in the input metadata, and returns
- * that as the likely type of the input document.
+ * Content type detection based on a content type hint. This detector simply trusts any valid
+ * content type hint given in the input metadata, and returns that as the likely type of the input
+ * document.
  *
  * @since Apache Tika 0.3
  */
 public class TypeDetector implements Detector {
 
     /**
-     * Detects the content type of an input document based on a type hint
-     * given in the input metadata. The CONTENT_TYPE attribute of the given
-     * input metadata is expected to contain the type of the input document.
-     * If that attribute exists and contains a valid type name, then that
-     * type is returned.
+     * Detects the content type of an input document based on a type hint given in the input
+     * metadata. The CONTENT_TYPE attribute of the given input metadata is expected to contain the
+     * type of the input document. If that attribute exists and contains a valid type name, then
+     * that type is returned.
      *
-     * @param input    ignored
+     * @param input ignored
      * @param metadata input metadata, possibly with a CONTENT_TYPE value
      * @return detected media type, or application/octet-stream
      */
@@ -52,5 +50,4 @@ public MediaType detect(InputStream input, Metadata metadata) {
         }
         return MediaType.OCTET_STREAM;
     }
-
 }
diff --git a/tika-core/src/main/java/org/apache/tika/detect/XmlRootExtractor.java b/tika-core/src/main/java/org/apache/tika/detect/XmlRootExtractor.java
index 94d8531498..83f73e1ec7 100644
--- a/tika-core/src/main/java/org/apache/tika/detect/XmlRootExtractor.java
+++ b/tika-core/src/main/java/org/apache/tika/detect/XmlRootExtractor.java
@@ -20,19 +20,17 @@
 import java.io.InputStream;
 import java.util.Arrays;
 import javax.xml.namespace.QName;
-
 import org.apache.commons.io.input.CloseShieldInputStream;
 import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.utils.XMLReaderUtils;
 import org.xml.sax.Attributes;
 import org.xml.sax.SAXException;
 import org.xml.sax.helpers.DefaultHandler;
 
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.utils.XMLReaderUtils;
-
 /**
- * Utility class that uses a {@link javax.xml.parsers.SAXParser} to determine
- * the namespace URI and local name of the root element of an XML file.
+ * Utility class that uses a {@link javax.xml.parsers.SAXParser} to determine the namespace URI and
+ * local name of the root element of an XML file.
  *
  * @since Apache Tika 0.4
  */
@@ -66,17 +64,17 @@ public QName extractRootElement(byte[] data) {
     public QName extractRootElement(InputStream stream) {
         return extractRootElement(stream, false);
     }
-    
+
     private QName extractRootElement(InputStream stream, boolean throwMalformed) {
         ExtractorHandler handler = new ExtractorHandler();
         try {
-            XMLReaderUtils.parseSAX(CloseShieldInputStream.wrap(stream),
-                    handler, EMPTY_CONTEXT);
+            XMLReaderUtils.parseSAX(CloseShieldInputStream.wrap(stream), handler, EMPTY_CONTEXT);
         } catch (SecurityException e) {
             throw e;
         } catch (Exception e) {
-            if (throwMalformed && (e instanceof CharConversionException
-                    || e.getCause() instanceof CharConversionException)) {
+            if (throwMalformed
+                    && (e instanceof CharConversionException
+                            || e.getCause() instanceof CharConversionException)) {
                 throw new MalformedCharException(e);
             }
         }
@@ -93,7 +91,6 @@ public void startElement(String uri, String local, String name, Attributes attri
             this.rootElement = new QName(uri, local);
             throw new SAXException("Aborting: root element received");
         }
-
     }
 
     private static class MalformedCharException extends RuntimeException {
@@ -101,7 +98,5 @@ private static class MalformedCharException extends RuntimeException {
         public MalformedCharException(Exception e) {
             super(e);
         }
-
     }
-
 }
diff --git a/tika-core/src/main/java/org/apache/tika/detect/ZeroSizeFileDetector.java b/tika-core/src/main/java/org/apache/tika/detect/ZeroSizeFileDetector.java
index 5ce52681e2..86f4917250 100644
--- a/tika-core/src/main/java/org/apache/tika/detect/ZeroSizeFileDetector.java
+++ b/tika-core/src/main/java/org/apache/tika/detect/ZeroSizeFileDetector.java
@@ -18,13 +18,10 @@
 
 import java.io.IOException;
 import java.io.InputStream;
-
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
 
-/**
- * Detector to identify zero length files as application/x-zerovalue
- */
+/** Detector to identify zero length files as application/x-zerovalue */
 public class ZeroSizeFileDetector implements Detector {
     public MediaType detect(InputStream stream, Metadata metadata) throws IOException {
         if (stream != null) {
diff --git a/tika-core/src/main/java/org/apache/tika/detect/package-info.java b/tika-core/src/main/java/org/apache/tika/detect/package-info.java
index dede49cfb6..b04a9d1c96 100644
--- a/tika-core/src/main/java/org/apache/tika/detect/package-info.java
+++ b/tika-core/src/main/java/org/apache/tika/detect/package-info.java
@@ -15,8 +15,6 @@
  * limitations under the License.
  */
 
-/**
- * Media type detection.
- */
+/** Media type detection. */
 @aQute.bnd.annotation.Version("1.0.0")
 package org.apache.tika.detect;
diff --git a/tika-core/src/main/java/org/apache/tika/embedder/Embedder.java b/tika-core/src/main/java/org/apache/tika/embedder/Embedder.java
index 2af59d326e..d1bead9eb0 100644
--- a/tika-core/src/main/java/org/apache/tika/embedder/Embedder.java
+++ b/tika-core/src/main/java/org/apache/tika/embedder/Embedder.java
@@ -21,7 +21,6 @@
 import java.io.OutputStream;
 import java.io.Serializable;
 import java.util.Set;
-
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
@@ -36,11 +35,11 @@
 public interface Embedder extends Serializable {
 
     /**
-     * Returns the set of media types supported by this embedder when used with
-     * the given parse context.
-     * 

- * The name differs from the precedence of {@link Parser#getSupportedTypes(ParseContext)} - * so that parser implementations may also choose to implement this interface. + * Returns the set of media types supported by this embedder when used with the given parse + * context. + * + *

The name differs from the precedence of {@link Parser#getSupportedTypes(ParseContext)} so + * that parser implementations may also choose to implement this interface. * * @param context parse context * @return immutable set of media types @@ -48,46 +47,45 @@ public interface Embedder extends Serializable { Set getSupportedEmbedTypes(ParseContext context); /** - * Embeds related document metadata from the given metadata object into the - * given output stream. - *

- * The given document stream is consumed but not closed by this method. The - * responsibility to close the stream remains on the caller. - *

- * Information about the parsing context can be passed in the context - * parameter. See the parser implementations for the kinds of context - * information they expect. - *

- * In general implementations should favor preserving the source file's metadata - * unless an update to a field is explicitly defined in the Metadata object. - * More specifically: + * Embeds related document metadata from the given metadata object into the given output stream. + * + *

The given document stream is consumed but not closed by this method. The responsibility to + * close the stream remains on the caller. + * + *

Information about the parsing context can be passed in the context parameter. See the + * parser implementations for the kinds of context information they expect. + * + *

In general implementations should favor preserving the source file's metadata unless an + * update to a field is explicitly defined in the Metadata object. More specifically: + * *

    - *
  • Embedder implementations should only attempt to update metadata fields - * present in the given Metadata object. Other fields should be left untouched.
  • - *
  • Embedder implementations should set properties as empty when the - * corresponding field in the Metadata object is an empty string, i.e. ""
  • - *
  • Embedder implementations should nullify or delete properties - * corresponding to fields with a null value in the given Metadata object.
  • - *
  • Embedder implementations should set the property - * corresponding to a particular field in the given Metadata object in all - * metadata containers whenever possible and appropriate for the file format at the time. - * If a particular metadata container falls out of use and/or is superseded by another - * (such as IIC vs XMP for IPTC) it is up to the implementation to decide if and when - * to cease embedding in the alternate container.
  • - *
  • Embedder implementations should attempt to embed as much of the metadata - * as accurately as possible. An implementation may choose a strict approach - * and throw an exception if a value to be embedded exceeds the length allowed - * or may choose to truncate the value.
  • + *
  • Embedder implementations should only attempt to update metadata fields present in the + * given Metadata object. Other fields should be left untouched. + *
  • Embedder implementations should set properties as empty when the corresponding field in + * the Metadata object is an empty string, i.e. "" + *
  • Embedder implementations should nullify or delete properties corresponding to fields + * with a null value in the given Metadata object. + *
  • Embedder implementations should set the property corresponding to a particular field in + * the given Metadata object in all metadata containers whenever possible and appropriate + * for the file format at the time. If a particular metadata container falls out of use + * and/or is superseded by another (such as IIC vs XMP for IPTC) it is up to the + * implementation to decide if and when to cease embedding in the alternate container. + *
  • Embedder implementations should attempt to embed as much of the metadata as accurately + * as possible. An implementation may choose a strict approach and throw an exception if a + * value to be embedded exceeds the length allowed or may choose to truncate the value. *
* - * @param metadata document metadata (input and output) + * @param metadata document metadata (input and output) * @param originalStream the document stream (input) - * @param outputStream the output stream to write the metadata embedded data to - * @param context parse context - * @throws IOException if the document stream could not be read + * @param outputStream the output stream to write the metadata embedded data to + * @param context parse context + * @throws IOException if the document stream could not be read * @throws TikaException if the document could not be parsed */ - void embed(Metadata metadata, InputStream originalStream, OutputStream outputStream, - ParseContext context) throws IOException, TikaException; - + void embed( + Metadata metadata, + InputStream originalStream, + OutputStream outputStream, + ParseContext context) + throws IOException, TikaException; } diff --git a/tika-core/src/main/java/org/apache/tika/embedder/ExternalEmbedder.java b/tika-core/src/main/java/org/apache/tika/embedder/ExternalEmbedder.java index b2411d7a89..a1b2cf65c1 100644 --- a/tika-core/src/main/java/org/apache/tika/embedder/ExternalEmbedder.java +++ b/tika-core/src/main/java/org/apache/tika/embedder/ExternalEmbedder.java @@ -29,10 +29,8 @@ import java.util.List; import java.util.Map; import java.util.Set; - import org.apache.commons.io.IOUtils; import org.apache.commons.io.output.UnsynchronizedByteArrayOutputStream; - import org.apache.tika.exception.TikaException; import org.apache.tika.io.TemporaryResources; import org.apache.tika.io.TikaInputStream; @@ -43,50 +41,49 @@ import org.apache.tika.parser.external.ExternalParser; /** - * Embedder that uses an external program (like sed or exiftool) to embed text - * content and metadata into a given document. + * Embedder that uses an external program (like sed or exiftool) to embed text content and metadata + * into a given document. 
* * @since Apache Tika 1.3 */ public class ExternalEmbedder implements Embedder { - /** - * Token to be replaced with a String array of metadata assignment command - * arguments - */ + /** Token to be replaced with a String array of metadata assignment command arguments */ public static final String METADATA_COMMAND_ARGUMENTS_TOKEN = "${METADATA}"; - /** - * Token to be replaced with a String array of metadata assignment command - * arguments - */ + + /** Token to be replaced with a String array of metadata assignment command arguments */ public static final String METADATA_COMMAND_ARGUMENTS_SERIALIZED_TOKEN = "${METADATA_SERIALIZED}"; + private static final long serialVersionUID = -2828829275642475697L; private final TemporaryResources tmp = new TemporaryResources(); - /** - * Media types supported by the external program. - */ + + /** Media types supported by the external program. */ private Set supportedEmbedTypes = Collections.emptySet(); - /** - * Mapping of Tika metadata to command line parameters. - */ + + /** Mapping of Tika metadata to command line parameters. */ private Map metadataCommandArguments = null; + /** * The external command to invoke. * * @see Runtime#exec(String[]) */ private String[] command = - new String[]{"sed", "-e", "$a\\\n" + METADATA_COMMAND_ARGUMENTS_SERIALIZED_TOKEN, - ExternalParser.INPUT_FILE_TOKEN}; + new String[] { + "sed", + "-e", + "$a\\\n" + METADATA_COMMAND_ARGUMENTS_SERIALIZED_TOKEN, + ExternalParser.INPUT_FILE_TOKEN + }; + private String commandAssignmentOperator = "="; private String commandAssignmentDelimeter = ", "; private String commandAppendOperator = "="; private boolean quoteAssignmentValues = false; /** - * Serializes a collection of metadata command line arguments into a single - * string. + * Serializes a collection of metadata command line arguments into a single string. 
* * @param metadataCommandArguments * @return the serialized metadata arguments string @@ -99,30 +96,28 @@ protected static String serializeMetadata(List metadataCommandArguments) } /** - * Checks to see if the command can be run. Typically used with something - * like "myapp --version" to check to see if "myapp" is installed and on the - * path. + * Checks to see if the command can be run. Typically used with something like "myapp --version" + * to check to see if "myapp" is installed and on the path. * - * @param checkCmd the check command to run + * @param checkCmd the check command to run * @param errorValue what is considered an error value? * @return whether or not the check completed without error */ public static boolean check(String checkCmd, int... errorValue) { - return check(new String[]{checkCmd}, errorValue); + return check(new String[] {checkCmd}, errorValue); } /** - * Checks to see if the command can be run. Typically used with something - * like "myapp --version" to check to see if "myapp" is installed and on the - * path. + * Checks to see if the command can be run. Typically used with something like "myapp --version" + * to check to see if "myapp" is installed and on the path. * - * @param checkCmd the check command to run + * @param checkCmd the check command to run * @param errorValue what is considered an error value? * @return whether or not the check completed without error */ public static boolean check(String[] checkCmd, int... errorValue) { if (errorValue.length == 0) { - errorValue = new int[]{127}; + errorValue = new int[] {127}; } try { @@ -155,14 +150,12 @@ public Set getSupportedEmbedTypes() { } public void setSupportedEmbedTypes(Set supportedEmbedTypes) { - this.supportedEmbedTypes = - Collections.unmodifiableSet(new HashSet<>(supportedEmbedTypes)); + this.supportedEmbedTypes = Collections.unmodifiableSet(new HashSet<>(supportedEmbedTypes)); } /** - * Gets the command to be run. 
This can include either of - * {@link ExternalParser#INPUT_FILE_TOKEN} or - * {@link ExternalParser#OUTPUT_FILE_TOKEN} if the command + * Gets the command to be run. This can include either of {@link + * ExternalParser#INPUT_FILE_TOKEN} or {@link ExternalParser#OUTPUT_FILE_TOKEN} if the command * needs filenames. * * @return @@ -172,9 +165,8 @@ public String[] getCommand() { } /** - * Sets the command to be run. This can include either of - * {@link ExternalParser#INPUT_FILE_TOKEN} or - * {@link ExternalParser#OUTPUT_FILE_TOKEN} if the command + * Sets the command to be run. This can include either of {@link + * ExternalParser#INPUT_FILE_TOKEN} or {@link ExternalParser#OUTPUT_FILE_TOKEN} if the command * needs filenames. * * @see Runtime#exec(String[]) @@ -202,8 +194,7 @@ public void setCommandAssignmentOperator(String commandAssignmentOperator) { } /** - * Gets the delimiter for multiple assignments for the command line tool, - * i.e. ", ". + * Gets the delimiter for multiple assignments for the command line tool, i.e. ", ". * * @return the assignment delimiter */ @@ -212,8 +203,7 @@ public String getCommandAssignmentDelimeter() { } /** - * Sets the delimiter for multiple assignments for the command line tool, - * i.e. ", ". + * Sets the delimiter for multiple assignments for the command line tool, i.e. ", ". * * @param commandAssignmentDelimeter */ @@ -222,8 +212,7 @@ public void setCommandAssignmentDelimeter(String commandAssignmentDelimeter) { } /** - * Gets the operator to append rather than replace a value for the command - * line tool, i.e. "+=". + * Gets the operator to append rather than replace a value for the command line tool, i.e. "+=". * * @return the append operator */ @@ -232,8 +221,7 @@ public String getCommandAppendOperator() { } /** - * Sets the operator to append rather than replace a value for the command - * line tool, i.e. "+=". + * Sets the operator to append rather than replace a value for the command line tool, i.e. "+=". 
* * @param commandAppendOperator */ @@ -242,8 +230,7 @@ public void setCommandAppendOperator(String commandAppendOperator) { } /** - * Gets whether or not to quote assignment values, i.e. tag='value'. The - * default is false. + * Gets whether or not to quote assignment values, i.e. tag='value'. The default is false. * * @return whether or not to quote assignment values */ @@ -270,8 +257,8 @@ public Map getMetadataCommandArguments() { } /** - * Sets the map of Metadata keys to command line parameters. Set this to - * null to disable Metadata embedding. + * Sets the map of Metadata keys to command line parameters. Set this to null to disable + * Metadata embedding. * * @param arguments */ @@ -280,8 +267,8 @@ public void setMetadataCommandArguments(Map arguments) { } /** - * Constructs a collection of command line arguments responsible for setting - * individual metadata fields based on the given metadata. + * Constructs a collection of command line arguments responsible for setting individual metadata + * fields based on the given metadata. 
* * @param metadata the metadata to embed * @return the metadata-related command line arguments @@ -303,18 +290,20 @@ protected List getCommandMetadataSegments(Metadata metadata) { if (quoteAssignmentValues) { assignmentValue = "'" + assignmentValue + "'"; } - commandMetadataSegments - .add(metadataCommandArgument + commandAppendOperator + - assignmentValue); + commandMetadataSegments.add( + metadataCommandArgument + + commandAppendOperator + + assignmentValue); } } else { String assignmentValue = metadata.get(metadataName); if (quoteAssignmentValues) { assignmentValue = "'" + assignmentValue + "'"; } - commandMetadataSegments - .add(metadataCommandArgument + commandAssignmentOperator + - assignmentValue); + commandMetadataSegments.add( + metadataCommandArgument + + commandAssignmentOperator + + assignmentValue); } } } @@ -325,13 +314,15 @@ protected List getCommandMetadataSegments(Metadata metadata) { } /** - * Executes the configured external command and passes the given document - * stream as a simple XHTML document to the given SAX content handler. - * Metadata is only extracted if {@link #setMetadataCommandArguments(Map)} - * has been called to set arguments. + * Executes the configured external command and passes the given document stream as a simple + * XHTML document to the given SAX content handler. Metadata is only extracted if {@link + * #setMetadataCommandArguments(Map)} has been called to set arguments. 
*/ - public void embed(final Metadata metadata, final InputStream inputStream, - final OutputStream outputStream, final ParseContext context) + public void embed( + final Metadata metadata, + final InputStream inputStream, + final OutputStream outputStream, + final ParseContext context) throws IOException, TikaException { boolean inputToStdIn = true; @@ -354,14 +345,17 @@ public void embed(final Metadata metadata, final InputStream inputStream, List cmd = new ArrayList<>(); for (String commandSegment : origCmd) { if (commandSegment.contains(ExternalParser.INPUT_FILE_TOKEN)) { - commandSegment = commandSegment.replace(ExternalParser.INPUT_FILE_TOKEN, - tikaInputStream.getFile().toString()); + commandSegment = + commandSegment.replace( + ExternalParser.INPUT_FILE_TOKEN, + tikaInputStream.getFile().toString()); inputToStdIn = false; } if (commandSegment.contains(ExternalParser.OUTPUT_FILE_TOKEN)) { tempOutputFile = tmp.createTemporaryFile(); - commandSegment = commandSegment - .replace(ExternalParser.OUTPUT_FILE_TOKEN, tempOutputFile.toString()); + commandSegment = + commandSegment.replace( + ExternalParser.OUTPUT_FILE_TOKEN, tempOutputFile.toString()); outputFromStdOut = false; } if (commandSegment.contains(METADATA_COMMAND_ARGUMENTS_SERIALIZED_TOKEN)) { @@ -382,15 +376,16 @@ public void embed(final Metadata metadata, final InputStream inputStream, int i = 0; for (String commandSegment : cmd) { if (commandSegment.contains(METADATA_COMMAND_ARGUMENTS_SERIALIZED_TOKEN)) { - commandSegment = commandSegment - .replace(METADATA_COMMAND_ARGUMENTS_SERIALIZED_TOKEN, + commandSegment = + commandSegment.replace( + METADATA_COMMAND_ARGUMENTS_SERIALIZED_TOKEN, serializeMetadata(commandMetadataSegments)); cmd.set(i, commandSegment); } i++; } - } else if (!replacedMetadataCommandArgumentsToken && - !serializeMetadataCommandArgumentsToken) { + } else if (!replacedMetadataCommandArgumentsToken + && !serializeMetadataCommandArgumentsToken) { // Tack metadata onto the end of the cmd as 
arguments cmd.addAll(commandMetadataSegments); } @@ -399,12 +394,13 @@ public void embed(final Metadata metadata, final InputStream inputStream, // Execute Process process; if (cmd.toArray().length == 1) { - process = Runtime.getRuntime().exec(cmd.toArray(new String[]{})[0]); + process = Runtime.getRuntime().exec(cmd.toArray(new String[] {})[0]); } else { - process = Runtime.getRuntime().exec(cmd.toArray(new String[]{})); + process = Runtime.getRuntime().exec(cmd.toArray(new String[] {})); } - UnsynchronizedByteArrayOutputStream stdErrOutputStream = UnsynchronizedByteArrayOutputStream.builder().get(); + UnsynchronizedByteArrayOutputStream stdErrOutputStream = + UnsynchronizedByteArrayOutputStream.builder().get(); try { sendStdErrToOutputStream(process, stdErrOutputStream); @@ -439,7 +435,7 @@ public void embed(final Metadata metadata, final InputStream inputStream, // Clean up temp output files tempOutputFile.delete(); } catch (Exception e) { - //swallow + // swallow } } if (!inputToStdIn) { @@ -450,9 +446,12 @@ public void embed(final Metadata metadata, final InputStream inputStream, IOUtils.closeQuietly(outputStream); IOUtils.closeQuietly(stdErrOutputStream); if (process.exitValue() != 0) { - throw new TikaException("There was an error executing the command line" + - "\nExecutable Command:\n\n" + cmd + "\nExecutable Error:\n\n" + - stdErrOutputStream.toString(UTF_8.name())); + throw new TikaException( + "There was an error executing the command line" + + "\nExecutable Command:\n\n" + + cmd + + "\nExecutable Error:\n\n" + + stdErrOutputStream.toString(UTF_8.name())); } } } @@ -460,28 +459,29 @@ public void embed(final Metadata metadata, final InputStream inputStream, /** * Creates a new thread for copying a given input stream to a given output stream. 
* - * @param inputStream the source input stream + * @param inputStream the source input stream * @param outputStream the target output stream */ - private void multiThreadedStreamCopy(final InputStream inputStream, - final OutputStream outputStream) { - new Thread(() -> { - try { - IOUtils.copy(inputStream, outputStream); - } catch (IOException e) { - System.out.println("ERROR: " + e.getMessage()); - } - }).start(); + private void multiThreadedStreamCopy( + final InputStream inputStream, final OutputStream outputStream) { + new Thread( + () -> { + try { + IOUtils.copy(inputStream, outputStream); + } catch (IOException e) { + System.out.println("ERROR: " + e.getMessage()); + } + }) + .start(); } /** - * Sends the contents of the given input stream to the - * standard input of the given process. Potential exceptions are - * ignored. - *

- * Note that the given input stream is not closed by this method. + * Sends the contents of the given input stream to the standard input of the given process. + * Potential exceptions are ignored. * - * @param process the process + *

Note that the given input stream is not closed by this method. + * + * @param process the process * @param inputStream the input stream to send to standard input of the process */ private void sendInputStreamToStdIn(final InputStream inputStream, final Process process) { @@ -489,13 +489,12 @@ private void sendInputStreamToStdIn(final InputStream inputStream, final Process } /** - * Sends the standard output of the given - * process to the given output stream. Potential exceptions are - * ignored. - *

- * Note that the given output stream is not closed by this method. + * Sends the standard output of the given process to the given output stream. Potential + * exceptions are ignored. + * + *

Note that the given output stream is not closed by this method. * - * @param process the process + * @param process the process * @param outputStream the putput stream to send to standard input of the process */ private void sendStdOutToOutputStream(final Process process, final OutputStream outputStream) { @@ -507,12 +506,11 @@ private void sendStdOutToOutputStream(final Process process, final OutputStream } /** - * Starts a thread that reads and discards the contents of the standard - * stream of the given process. Potential exceptions are ignored, and the - * stream is closed once fully processed. + * Starts a thread that reads and discards the contents of the standard stream of the given + * process. Potential exceptions are ignored, and the stream is closed once fully processed. * - * @param process the process - * param outputStream the output stream to send to standard error of the process + * @param process the process param outputStream the output stream to send to standard error of + * the process */ private void sendStdErrToOutputStream(final Process process, final OutputStream outputStream) { multiThreadedStreamCopy(process.getErrorStream(), outputStream); diff --git a/tika-core/src/main/java/org/apache/tika/exception/AccessPermissionException.java b/tika-core/src/main/java/org/apache/tika/exception/AccessPermissionException.java index b5f2136ea9..9042868c39 100644 --- a/tika-core/src/main/java/org/apache/tika/exception/AccessPermissionException.java +++ b/tika-core/src/main/java/org/apache/tika/exception/AccessPermissionException.java @@ -17,9 +17,8 @@ package org.apache.tika.exception; /** - * Exception to be thrown when a document does not allow content extraction. - * As of this writing, PDF documents are the only type of document that might - * cause this type of exception. + * Exception to be thrown when a document does not allow content extraction. 
As of this writing, PDF + * documents are the only type of document that might cause this type of exception. */ public class AccessPermissionException extends TikaException { public AccessPermissionException() { diff --git a/tika-core/src/main/java/org/apache/tika/exception/CorruptedFileException.java b/tika-core/src/main/java/org/apache/tika/exception/CorruptedFileException.java index 5ebad6d3a6..ede9da571b 100644 --- a/tika-core/src/main/java/org/apache/tika/exception/CorruptedFileException.java +++ b/tika-core/src/main/java/org/apache/tika/exception/CorruptedFileException.java @@ -17,8 +17,8 @@ package org.apache.tika.exception; /** - * This exception should be thrown when the parse absolutely, positively has to stop. - * This exception must not be caught and swallowed if an embedded parser throws it. + * This exception should be thrown when the parse absolutely, positively has to stop. This exception + * must not be caught and swallowed if an embedded parser throws it. */ public class CorruptedFileException extends TikaException { public CorruptedFileException(String msg) { diff --git a/tika-core/src/main/java/org/apache/tika/exception/FileTooLongException.java b/tika-core/src/main/java/org/apache/tika/exception/FileTooLongException.java index 3ec3294b31..97ebb60ebb 100644 --- a/tika-core/src/main/java/org/apache/tika/exception/FileTooLongException.java +++ b/tika-core/src/main/java/org/apache/tika/exception/FileTooLongException.java @@ -34,8 +34,11 @@ public FileTooLongException(long length, long maxLength) { } private static String msg(long length, long maxLength) { - return "File is " + length + " bytes, but " + maxLength + - " is the maximum length allowed. You can modify maxLength via " + - "the setter on the fetcher."; + return "File is " + + length + + " bytes, but " + + maxLength + + " is the maximum length allowed. 
You can modify maxLength via " + + "the setter on the fetcher."; } } diff --git a/tika-core/src/main/java/org/apache/tika/exception/RuntimeSAXException.java b/tika-core/src/main/java/org/apache/tika/exception/RuntimeSAXException.java index 4e0bc43087..853dfc958a 100644 --- a/tika-core/src/main/java/org/apache/tika/exception/RuntimeSAXException.java +++ b/tika-core/src/main/java/org/apache/tika/exception/RuntimeSAXException.java @@ -18,13 +18,10 @@ import org.xml.sax.SAXException; -/** - * Use this to throw a SAXException in subclassed methods that don't throw SAXExceptions - */ +/** Use this to throw a SAXException in subclassed methods that don't throw SAXExceptions */ public class RuntimeSAXException extends RuntimeException { public RuntimeSAXException(SAXException t) { super(t); } - } diff --git a/tika-core/src/main/java/org/apache/tika/exception/TikaConfigException.java b/tika-core/src/main/java/org/apache/tika/exception/TikaConfigException.java index 1dcd3275f9..3c6855354f 100644 --- a/tika-core/src/main/java/org/apache/tika/exception/TikaConfigException.java +++ b/tika-core/src/main/java/org/apache/tika/exception/TikaConfigException.java @@ -17,9 +17,8 @@ package org.apache.tika.exception; /** - * Tika Config Exception is an exception to occur when there is an error - * in Tika config file and/or one or more of the parsers failed to initialize - * from that erroneous config. + * Tika Config Exception is an exception to occur when there is an error in Tika config file and/or + * one or more of the parsers failed to initialize from that erroneous config. 
* * @since Apache Tika 1.14 */ diff --git a/tika-core/src/main/java/org/apache/tika/exception/TikaException.java b/tika-core/src/main/java/org/apache/tika/exception/TikaException.java index ceac19d6a6..a2bfc87cb3 100644 --- a/tika-core/src/main/java/org/apache/tika/exception/TikaException.java +++ b/tika-core/src/main/java/org/apache/tika/exception/TikaException.java @@ -16,9 +16,7 @@ */ package org.apache.tika.exception; -/** - * Tika exception - */ +/** Tika exception */ public class TikaException extends Exception { public TikaException(String msg) { @@ -28,5 +26,4 @@ public TikaException(String msg) { public TikaException(String msg, Throwable cause) { super(msg, cause); } - } diff --git a/tika-core/src/main/java/org/apache/tika/exception/TikaMemoryLimitException.java b/tika-core/src/main/java/org/apache/tika/exception/TikaMemoryLimitException.java index fbc1a95528..9730f54d7d 100644 --- a/tika-core/src/main/java/org/apache/tika/exception/TikaMemoryLimitException.java +++ b/tika-core/src/main/java/org/apache/tika/exception/TikaMemoryLimitException.java @@ -33,8 +33,11 @@ public TikaMemoryLimitException(long triedToAllocate, long maxAllowable) { } private static String msg(long triedToAllocate, long maxAllowable) { - return "Tried to allocate " + triedToAllocate + " bytes, but " + maxAllowable + - " is the maximum allowed. Please open an issue https://issues.apache.org/jira/projects/TIKA" + - " if you believe this file is not corrupt."; + return "Tried to allocate " + + triedToAllocate + + " bytes, but " + + maxAllowable + + " is the maximum allowed. 
Please open an issue https://issues.apache.org/jira/projects/TIKA" + + " if you believe this file is not corrupt."; } } diff --git a/tika-core/src/main/java/org/apache/tika/exception/TikaTimeoutException.java b/tika-core/src/main/java/org/apache/tika/exception/TikaTimeoutException.java index a53dbd6a31..97bfebed8a 100644 --- a/tika-core/src/main/java/org/apache/tika/exception/TikaTimeoutException.java +++ b/tika-core/src/main/java/org/apache/tika/exception/TikaTimeoutException.java @@ -16,9 +16,7 @@ */ package org.apache.tika.exception; -/** - * Runtime/unchecked version of {@link java.util.concurrent.TimeoutException} - */ +/** Runtime/unchecked version of {@link java.util.concurrent.TimeoutException} */ public class TikaTimeoutException extends RuntimeException { public TikaTimeoutException(String message) { super(message); diff --git a/tika-core/src/main/java/org/apache/tika/exception/UnsupportedFormatException.java b/tika-core/src/main/java/org/apache/tika/exception/UnsupportedFormatException.java index 4322e64f9b..76844ac56a 100644 --- a/tika-core/src/main/java/org/apache/tika/exception/UnsupportedFormatException.java +++ b/tika-core/src/main/java/org/apache/tika/exception/UnsupportedFormatException.java @@ -18,18 +18,15 @@ package org.apache.tika.exception; /** - * Parsers should throw this exception when they encounter - * a file format that they do not support. This should only happen - * when we're not able to differentiate versions by the mime. For example, - * At the time of this writing, "application/wordperfect" covers all versions - * of the wordperfect format; however, the parser only handles 6.x. - *

- * Whenever possible/convenient, it is better to distinguish file formats by mime - * so that unsupported formats will be handled by the - * {@link org.apache.tika.parser.EmptyParser}. - * However, if we can't differentiate by mime or we need to rely on the parser - * to distinguish the versions (in the case that magic can't distinguish), - * this exception should be thrown. + * Parsers should throw this exception when they encounter a file format that they do not support. + * This should only happen when we're not able to differentiate versions by the mime. For example, + * At the time of this writing, "application/wordperfect" covers all versions of the wordperfect + * format; however, the parser only handles 6.x. + * + *

Whenever possible/convenient, it is better to distinguish file formats by mime so that + * unsupported formats will be handled by the {@link org.apache.tika.parser.EmptyParser}. However, + * if we can't differentiate by mime or we need to rely on the parser to distinguish the versions + * (in the case that magic can't distinguish), this exception should be thrown. */ public class UnsupportedFormatException extends TikaException { diff --git a/tika-core/src/main/java/org/apache/tika/exception/WriteLimitReachedException.java b/tika-core/src/main/java/org/apache/tika/exception/WriteLimitReachedException.java index 3e661ada5c..b4559a69aa 100644 --- a/tika-core/src/main/java/org/apache/tika/exception/WriteLimitReachedException.java +++ b/tika-core/src/main/java/org/apache/tika/exception/WriteLimitReachedException.java @@ -20,29 +20,31 @@ public class WriteLimitReachedException extends SAXException { - //in case of (hopefully impossible) cyclic exception - private final static int MAX_DEPTH = 100; + // in case of (hopefully impossible) cyclic exception + private static final int MAX_DEPTH = 100; private final int writeLimit; + public WriteLimitReachedException(int writeLimit) { this.writeLimit = writeLimit; } @Override public String getMessage() { - return "Your document contained more than " + writeLimit + return "Your document contained more than " + + writeLimit + " characters, and so your requested limit has been" + " reached. To receive the full text of the document," + " increase your limit. (Text up to the limit is" + " however available)."; } + /** - * Checks whether the given exception (or any of it's root causes) was - * thrown by this handler as a signal of reaching the write limit. + * Checks whether the given exception (or any of it's root causes) was thrown by this handler as + * a signal of reaching the write limit. 
* * @param t throwable - * @return true if the write limit was reached, - * false otherwise + * @return true if the write limit was reached, false otherwise * @since Apache Tika 2.0 */ public static boolean isWriteLimitReached(Throwable t) { diff --git a/tika-core/src/main/java/org/apache/tika/exception/ZeroByteFileException.java b/tika-core/src/main/java/org/apache/tika/exception/ZeroByteFileException.java index 125bc21b90..9adcf5bd03 100644 --- a/tika-core/src/main/java/org/apache/tika/exception/ZeroByteFileException.java +++ b/tika-core/src/main/java/org/apache/tika/exception/ZeroByteFileException.java @@ -17,28 +17,24 @@ package org.apache.tika.exception; -/** - * Exception thrown by the AutoDetectParser when a file contains zero-bytes. - */ +/** Exception thrown by the AutoDetectParser when a file contains zero-bytes. */ public class ZeroByteFileException extends TikaException { - /** - * If this is in the {@link org.apache.tika.parser.ParseContext}, the - * {@link org.apache.tika.parser.AutoDetectParser} and the - * {@link org.apache.tika.parser.RecursiveParserWrapper} will - * ignore embedded files with zero-byte length inputstreams + * If this is in the {@link org.apache.tika.parser.ParseContext}, the {@link + * org.apache.tika.parser.AutoDetectParser} and the {@link + * org.apache.tika.parser.RecursiveParserWrapper} will ignore embedded files with zero-byte + * length inputstreams */ public static IgnoreZeroByteFileException IGNORE_ZERO_BYTE_FILE_EXCEPTION = new IgnoreZeroByteFileException(); - //If this is in the parse context, the AutoDetectParser and the - //RecursiveParserWrapper should ignore zero byte files - //and not throw a Zero} + // If this is in the parse context, the AutoDetectParser and the + // RecursiveParserWrapper should ignore zero byte files + // and not throw a Zero} public ZeroByteFileException(String msg) { super(msg); } - public static class IgnoreZeroByteFileException { - } + public static class IgnoreZeroByteFileException {} } 
diff --git a/tika-core/src/main/java/org/apache/tika/exception/package-info.java b/tika-core/src/main/java/org/apache/tika/exception/package-info.java index 80ab125814..af490fba56 100644 --- a/tika-core/src/main/java/org/apache/tika/exception/package-info.java +++ b/tika-core/src/main/java/org/apache/tika/exception/package-info.java @@ -15,8 +15,6 @@ * limitations under the License. */ -/** - * Tika exception. - */ +/** Tika exception. */ @aQute.bnd.annotation.Version("1.0.0") package org.apache.tika.exception; diff --git a/tika-core/src/main/java/org/apache/tika/extractor/AbstractEmbeddedDocumentBytesHandler.java b/tika-core/src/main/java/org/apache/tika/extractor/AbstractEmbeddedDocumentBytesHandler.java index 3f2f38f944..ca74e99a19 100644 --- a/tika-core/src/main/java/org/apache/tika/extractor/AbstractEmbeddedDocumentBytesHandler.java +++ b/tika-core/src/main/java/org/apache/tika/extractor/AbstractEmbeddedDocumentBytesHandler.java @@ -21,7 +21,6 @@ import java.util.ArrayList; import java.util.List; import java.util.Locale; - import org.apache.tika.io.FilenameUtils; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; @@ -32,23 +31,29 @@ public abstract class AbstractEmbeddedDocumentBytesHandler implements EmbeddedDo List ids = new ArrayList<>(); - public String getEmitKey(String containerEmitKey, int embeddedId, - EmbeddedDocumentBytesConfig embeddedDocumentBytesConfig, - Metadata metadata) { - String embeddedIdString = embeddedDocumentBytesConfig.getZeroPadName() > 0 ? - StringUtils.leftPad(Integer.toString(embeddedId), - embeddedDocumentBytesConfig.getZeroPadName(), "0") : - Integer.toString(embeddedId); - + public String getEmitKey( + String containerEmitKey, + int embeddedId, + EmbeddedDocumentBytesConfig embeddedDocumentBytesConfig, + Metadata metadata) { + String embeddedIdString = + embeddedDocumentBytesConfig.getZeroPadName() > 0 + ? 
StringUtils.leftPad( + Integer.toString(embeddedId), + embeddedDocumentBytesConfig.getZeroPadName(), + "0") + : Integer.toString(embeddedId); - StringBuilder emitKey = new StringBuilder(containerEmitKey) - .append("/") - .append(FilenameUtils.getName(containerEmitKey)) - .append(embeddedDocumentBytesConfig.getEmbeddedIdPrefix()) - .append(embeddedIdString); + StringBuilder emitKey = + new StringBuilder(containerEmitKey) + .append("/") + .append(FilenameUtils.getName(containerEmitKey)) + .append(embeddedDocumentBytesConfig.getEmbeddedIdPrefix()) + .append(embeddedIdString); - if (embeddedDocumentBytesConfig.getSuffixStrategy().equals( - EmbeddedDocumentBytesConfig.SUFFIX_STRATEGY.EXISTING)) { + if (embeddedDocumentBytesConfig + .getSuffixStrategy() + .equals(EmbeddedDocumentBytesConfig.SUFFIX_STRATEGY.EXISTING)) { String fName = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY); String suffix = FilenameUtils.getSuffixFromPath(fName); suffix = suffix.toLowerCase(Locale.US); diff --git a/tika-core/src/main/java/org/apache/tika/extractor/BasicEmbeddedBytesSelector.java b/tika-core/src/main/java/org/apache/tika/extractor/BasicEmbeddedBytesSelector.java index 1d5a239db6..c866139799 100644 --- a/tika-core/src/main/java/org/apache/tika/extractor/BasicEmbeddedBytesSelector.java +++ b/tika-core/src/main/java/org/apache/tika/extractor/BasicEmbeddedBytesSelector.java @@ -17,7 +17,6 @@ package org.apache.tika.extractor; import java.util.Set; - import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.mime.MediaType; @@ -25,17 +24,17 @@ public class BasicEmbeddedBytesSelector implements EmbeddedBytesSelector { - - private final Set includeMimes; private final Set excludeMimes; private final Set includeEmbeddedResourceTypes; private final Set excludeEmbeddedResourceTypes; - public BasicEmbeddedBytesSelector(Set includeMimes, Set excludeMimes, - Set includeEmbeddedResourceTypes, - Set excludeEmbeddedResourceTypes) { + 
public BasicEmbeddedBytesSelector( + Set includeMimes, + Set excludeMimes, + Set includeEmbeddedResourceTypes, + Set excludeEmbeddedResourceTypes) { this.includeMimes = includeMimes; this.excludeMimes = excludeMimes; this.includeEmbeddedResourceTypes = includeEmbeddedResourceTypes; @@ -47,7 +46,7 @@ public boolean select(Metadata metadata) { if (mime == null) { mime = ""; } else { - //if mime matters at all, make sure to get the mime without parameters + // if mime matters at all, make sure to get the mime without parameters if (includeMimes.size() > 0 || excludeMimes.size() > 0) { MediaType mt = MediaType.parse(mime); if (mt != null) { @@ -58,18 +57,19 @@ public boolean select(Metadata metadata) { if (excludeMimes.contains(mime)) { return false; } - if (includeMimes.size() > 0 && ! includeMimes.contains(mime)) { + if (includeMimes.size() > 0 && !includeMimes.contains(mime)) { return false; } String embeddedResourceType = metadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE); - //if a parser doesn't specify the type, treat it as ATTACHMENT - embeddedResourceType = StringUtils.isBlank(embeddedResourceType) ? "ATTACHMENT" : - embeddedResourceType; + // if a parser doesn't specify the type, treat it as ATTACHMENT + embeddedResourceType = + StringUtils.isBlank(embeddedResourceType) ? 
"ATTACHMENT" : embeddedResourceType; if (excludeEmbeddedResourceTypes.contains(embeddedResourceType)) { return false; } - if (includeEmbeddedResourceTypes.size() > 0 && includeEmbeddedResourceTypes.contains(embeddedResourceType)) { + if (includeEmbeddedResourceTypes.size() > 0 + && includeEmbeddedResourceTypes.contains(embeddedResourceType)) { return true; } return false; diff --git a/tika-core/src/main/java/org/apache/tika/extractor/BasicEmbeddedDocumentBytesHandler.java b/tika-core/src/main/java/org/apache/tika/extractor/BasicEmbeddedDocumentBytesHandler.java index cf6441b4fb..7221c9cfe1 100644 --- a/tika-core/src/main/java/org/apache/tika/extractor/BasicEmbeddedDocumentBytesHandler.java +++ b/tika-core/src/main/java/org/apache/tika/extractor/BasicEmbeddedDocumentBytesHandler.java @@ -20,27 +20,28 @@ import java.io.InputStream; import java.util.HashMap; import java.util.Map; - import org.apache.commons.io.IOUtils; import org.apache.commons.io.input.UnsynchronizedBufferedInputStream; - import org.apache.tika.metadata.Metadata; import org.apache.tika.pipes.extractor.EmbeddedDocumentBytesConfig; /** - * For now, this is an in-memory EmbeddedDocumentBytesHandler that stores - * all the bytes in memory. Users can retrieve the documents with {@link #getDocument(int)}. + * For now, this is an in-memory EmbeddedDocumentBytesHandler that stores all the bytes in memory. + * Users can retrieve the documents with {@link #getDocument(int)}. * - * We'll need to make this cache to disk at some point if there are many bytes of - * embedded documents. + *

We'll need to make this cache to disk at some point if there are many bytes of embedded + * documents. */ public class BasicEmbeddedDocumentBytesHandler extends AbstractEmbeddedDocumentBytesHandler { private final EmbeddedDocumentBytesConfig config; + public BasicEmbeddedDocumentBytesHandler(EmbeddedDocumentBytesConfig config) { this.config = config; } - //this won't scale, but let's start fully in memory for now; + + // this won't scale, but let's start fully in memory for now; Map docBytes = new HashMap<>(); + @Override public void add(int id, Metadata metadata, InputStream is) throws IOException { super.add(id, metadata, is); @@ -53,6 +54,6 @@ public InputStream getDocument(int id) throws IOException { @Override public void close() throws IOException { - //delete tmp dir or whatever here + // delete tmp dir or whatever here } } diff --git a/tika-core/src/main/java/org/apache/tika/extractor/ContainerExtractor.java b/tika-core/src/main/java/org/apache/tika/extractor/ContainerExtractor.java index cfc70b5f36..4858911621 100644 --- a/tika-core/src/main/java/org/apache/tika/extractor/ContainerExtractor.java +++ b/tika-core/src/main/java/org/apache/tika/extractor/ContainerExtractor.java @@ -18,47 +18,44 @@ import java.io.IOException; import java.io.Serializable; - import org.apache.tika.exception.TikaException; import org.apache.tika.io.TikaInputStream; /** - * Tika container extractor interface. - * Container Extractors provide access to the embedded - * resources within container formats such as .zip and .doc + * Tika container extractor interface. Container Extractors provide access to the embedded resources + * within container formats such as .zip and .doc */ public interface ContainerExtractor extends Serializable { /** - * Is this Container Extractor able to process the - * supplied container? + * Is this Container Extractor able to process the supplied container? 
* * @since Apache Tika 0.8 */ boolean isSupported(TikaInputStream input) throws IOException; /** - * Processes a container file, and extracts all the embedded - * resources from within it. - *

- * The {@link EmbeddedResourceHandler} you supply will - * be called for each embedded resource in the container. It is - * up to you whether you process the contents of the resource or not. - *

- * The given document stream is consumed but not closed by this method. - * The responsibility to close the stream remains on the caller. - *

- * If required, nested containers (such as a .docx within a .zip) - * can automatically be recursed into, and processed inline. If - * no recurseExtractor is given, the nested containers will be - * treated as with any other embedded resources. + * Processes a container file, and extracts all the embedded resources from within it. + * + *

The {@link EmbeddedResourceHandler} you supply will be called for each embedded resource + * in the container. It is up to you whether you process the contents of the resource or not. + * + *

The given document stream is consumed but not closed by this method. The responsibility to + * close the stream remains on the caller. + * + *

If required, nested containers (such as a .docx within a .zip) can automatically be + * recursed into, and processed inline. If no recurseExtractor is given, the nested containers + * will be treated as with any other embedded resources. * - * @param stream the document stream (input) + * @param stream the document stream (input) * @param recurseExtractor the extractor to use on any embedded containers - * @param handler handler for the embedded files (output) - * @throws IOException if the document stream could not be read + * @param handler handler for the embedded files (output) + * @throws IOException if the document stream could not be read * @throws TikaException if the container could not be parsed * @since Apache Tika 0.8 */ - void extract(TikaInputStream stream, ContainerExtractor recurseExtractor, - EmbeddedResourceHandler handler) throws IOException, TikaException; + void extract( + TikaInputStream stream, + ContainerExtractor recurseExtractor, + EmbeddedResourceHandler handler) + throws IOException, TikaException; } diff --git a/tika-core/src/main/java/org/apache/tika/extractor/DefaultEmbeddedStreamTranslator.java b/tika-core/src/main/java/org/apache/tika/extractor/DefaultEmbeddedStreamTranslator.java index 537c5ffa15..d8d4acada7 100644 --- a/tika-core/src/main/java/org/apache/tika/extractor/DefaultEmbeddedStreamTranslator.java +++ b/tika-core/src/main/java/org/apache/tika/extractor/DefaultEmbeddedStreamTranslator.java @@ -19,24 +19,22 @@ import java.io.IOException; import java.io.InputStream; import java.util.List; - import org.apache.tika.config.ServiceLoader; import org.apache.tika.metadata.Metadata; import org.apache.tika.utils.ServiceLoaderUtils; /** - * Loads EmbeddedStreamTranslators via service loading. Tries to run each - * in turn and returns the first non-null value. If no translation has occurred, - * this returns the original InputStream. If a translation has occurred, the - * translator will consume the InputStream but not close it. 
+ * Loads EmbeddedStreamTranslators via service loading. Tries to run each in turn and returns the + * first non-null value. If no translation has occurred, this returns the original InputStream. If a + * translation has occurred, the translator will consume the InputStream but not close it. */ public class DefaultEmbeddedStreamTranslator implements EmbeddedStreamTranslator { final List translators; private static List getDefaultFilters(ServiceLoader loader) { - List embeddedStreamTranslators - = loader.loadServiceProviders(EmbeddedStreamTranslator.class); + List embeddedStreamTranslators = + loader.loadServiceProviders(EmbeddedStreamTranslator.class); ServiceLoaderUtils.sortLoadedClasses(embeddedStreamTranslators); return embeddedStreamTranslators; } @@ -50,8 +48,9 @@ private DefaultEmbeddedStreamTranslator(List translato } /** - * This should sniff the stream to determine if it needs to be translated. - * The translator is responsible for resetting the stream if any bytes have been read. + * This should sniff the stream to determine if it needs to be translated. The translator is + * responsible for resetting the stream if any bytes have been read. + * * @param inputStream * @param metadata * @return @@ -69,6 +68,7 @@ public boolean shouldTranslate(InputStream inputStream, Metadata metadata) throw /** * This will consume the InputStream and return a new stream of translated bytes. 
+ * * @param inputStream * @param metadata * @return diff --git a/tika-core/src/main/java/org/apache/tika/extractor/DocumentSelector.java b/tika-core/src/main/java/org/apache/tika/extractor/DocumentSelector.java index aa34aa12bb..4976fe443d 100644 --- a/tika-core/src/main/java/org/apache/tika/extractor/DocumentSelector.java +++ b/tika-core/src/main/java/org/apache/tika/extractor/DocumentSelector.java @@ -19,24 +19,21 @@ import org.apache.tika.metadata.Metadata; /** - * Interface for different document selection strategies for purposes like - * embedded document extraction by a {@link ContainerExtractor} instance. - * An implementation of this interface defines some specific selection - * criteria to be applied against the document metadata passed to the - * {@link #select(Metadata)} method. + * Interface for different document selection strategies for purposes like embedded document + * extraction by a {@link ContainerExtractor} instance. An implementation of this interface defines + * some specific selection criteria to be applied against the document metadata passed to the {@link + * #select(Metadata)} method. * * @since Apache Tika 0.8 */ public interface DocumentSelector { /** - * Checks if a document with the given metadata matches the specified - * selection criteria. + * Checks if a document with the given metadata matches the specified selection criteria. 
* * @param metadata document metadata - * @return true if the document matches the selection criteria, - * false otherwise + * @return true if the document matches the selection criteria, false + * otherwise */ boolean select(Metadata metadata); - } diff --git a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedBytesSelector.java b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedBytesSelector.java index 2ec7df667e..1e8473159c 100644 --- a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedBytesSelector.java +++ b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedBytesSelector.java @@ -26,6 +26,7 @@ public boolean select(Metadata metadata) { return true; } } + EmbeddedBytesSelector ACCEPT_ALL = new AcceptAll(); boolean select(Metadata metadata); diff --git a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentByteStoreExtractorFactory.java b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentByteStoreExtractorFactory.java index f7237bd6ac..15acaac882 100644 --- a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentByteStoreExtractorFactory.java +++ b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentByteStoreExtractorFactory.java @@ -16,18 +16,15 @@ */ package org.apache.tika.extractor; - /** - * This factory creates EmbeddedDocumentExtractors that require an - * {@link EmbeddedDocumentBytesHandler} in the - * {@link org.apache.tika.parser.ParseContext} should extend this. + * This factory creates EmbeddedDocumentExtractors that require an {@link + * EmbeddedDocumentBytesHandler} in the {@link org.apache.tika.parser.ParseContext} should extend + * this. * - * This is a shim interface to signal to {@link org.apache.tika.pipes.PipesServer} - * to use the {@link @RUnpackExtractor} if the user doesn't configure a custom - * EmbeddedDocumentExtractor. + *

This is a shim interface to signal to {@link org.apache.tika.pipes.PipesServer} to use the + * {@link @RUnpackExtractor} if the user doesn't configure a custom EmbeddedDocumentExtractor. * - * TODO: Figure out how to simplify this and allow for emitting of the source document. + *

TODO: Figure out how to simplify this and allow for emitting of the source document. */ -public interface EmbeddedDocumentByteStoreExtractorFactory extends EmbeddedDocumentExtractorFactory { - -} +public interface EmbeddedDocumentByteStoreExtractorFactory + extends EmbeddedDocumentExtractorFactory {} diff --git a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentBytesHandler.java b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentBytesHandler.java index 12357a7189..e665a87a7a 100644 --- a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentBytesHandler.java +++ b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentBytesHandler.java @@ -20,11 +20,10 @@ import java.io.IOException; import java.io.InputStream; import java.util.List; - import org.apache.tika.metadata.Metadata; public interface EmbeddedDocumentBytesHandler extends Closeable { - //we need metadata for the emitter store...can we get away without it? + // we need metadata for the emitter store...can we get away without it? void add(int id, Metadata metadata, InputStream inputStream) throws IOException; List getIds(); diff --git a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentExtractor.java b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentExtractor.java index f80420868b..f4b3cbc209 100644 --- a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentExtractor.java +++ b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentExtractor.java @@ -19,18 +19,17 @@ import java.io.IOException; import java.io.InputStream; - +import org.apache.tika.metadata.Metadata; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; -import org.apache.tika.metadata.Metadata; - public interface EmbeddedDocumentExtractor { boolean shouldParseEmbedded(Metadata metadata); /** - * Processes the supplied embedded resource, calling the delegating - * parser with the appropriate details. 
+ * Processes the supplied embedded resource, calling the delegating parser with the appropriate + * details. + * * @param stream The embedded resource * @param handler The handler to use * @param metadata The metadata for the embedded resource diff --git a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentExtractorFactory.java b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentExtractorFactory.java index 4a55052aa3..3c795a893c 100644 --- a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentExtractorFactory.java +++ b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentExtractorFactory.java @@ -18,7 +18,6 @@ package org.apache.tika.extractor; import java.io.Serializable; - import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; diff --git a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java index d6e2c28a81..da7ebf6856 100644 --- a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java +++ b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java @@ -16,14 +16,9 @@ */ package org.apache.tika.extractor; - import java.io.IOException; import java.io.InputStream; import java.io.Serializable; - -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.config.TikaConfig; import org.apache.tika.detect.Detector; import org.apache.tika.io.TikaInputStream; @@ -41,21 +36,22 @@ import org.apache.tika.parser.PasswordProvider; import org.apache.tika.parser.StatefulParser; import org.apache.tika.utils.ExceptionUtils; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; /** * Utility class to handle common issues with embedded documents. - *

- * Use statically if all that is needed is getting the EmbeddedDocumentExtractor. - * Otherwise, instantiate an instance. - *

- * Note: This is not thread safe. Make sure to instantiate one per thread. + * + *

Use statically if all that is needed is getting the EmbeddedDocumentExtractor. Otherwise, + * instantiate an instance. + * + *

Note: This is not thread safe. Make sure to instantiate one per thread. */ public class EmbeddedDocumentUtil implements Serializable { - private final ParseContext context; private final EmbeddedDocumentExtractor embeddedDocumentExtractor; - //these are lazily initialized and can be null + // these are lazily initialized and can be null private TikaConfig tikaConfig; private MimeTypes mimeTypes; private Detector detector; @@ -66,12 +62,12 @@ public EmbeddedDocumentUtil(ParseContext context) { } /** - * This offers a uniform way to get an EmbeddedDocumentExtractor from a ParseContext. - * As of Tika 1.15, an AutoDetectParser will automatically be added to parse - * embedded documents if no Parser.class is specified in the ParseContext. - *

- * If you'd prefer not to parse embedded documents, set Parser.class - * to {@link org.apache.tika.parser.EmptyParser} in the ParseContext. + * This offers a uniform way to get an EmbeddedDocumentExtractor from a ParseContext. As of Tika + * 1.15, an AutoDetectParser will automatically be added to parse embedded documents if no + * Parser.class is specified in the ParseContext. + * + *

If you'd prefer not to parse embedded documents, set Parser.class to {@link + * org.apache.tika.parser.EmptyParser} in the ParseContext. * * @param context * @return EmbeddedDocumentExtractor @@ -81,8 +77,8 @@ public static EmbeddedDocumentExtractor getEmbeddedDocumentExtractor(ParseContex if (extractor != null) { return extractor; } - //ensure that an AutoDetectParser is - //available for parsing embedded docs TIKA-2096 + // ensure that an AutoDetectParser is + // available for parsing embedded docs TIKA-2096 Parser embeddedParser = context.get(Parser.class); if (embeddedParser == null) { TikaConfig tikaConfig = context.get(TikaConfig.class); @@ -98,11 +94,10 @@ public static EmbeddedDocumentExtractor getEmbeddedDocumentExtractor(ParseContex } /** - * Utility function to get the Parser that was sent in to the - * ParseContext to handle embedded documents. If it is stateful, - * unwrap it to get its stateless delegating parser. - *

- * If there is no Parser in the parser context, this will return null. + * Utility function to get the Parser that was sent in to the ParseContext to handle embedded + * documents. If it is stateful, unwrap it to get its stateless delegating parser. + * + *

If there is no Parser in the parser context, this will return null. * * @param context * @return @@ -123,7 +118,7 @@ public PasswordProvider getPasswordProvider() { } public Detector getDetector() { - //be as lazy as possible and cache + // be as lazy as possible and cache Detector localDetector = context.get(Detector.class); if (localDetector != null) { return localDetector; @@ -138,7 +133,7 @@ public Detector getDetector() { public MimeTypes getMimeTypes() { MimeTypes localMimeTypes = context.get(MimeTypes.class); - //be as lazy as possible and cache the mimeTypes + // be as lazy as possible and cache the mimeTypes if (localMimeTypes != null) { return localMimeTypes; } @@ -150,13 +145,13 @@ public MimeTypes getMimeTypes() { } /** - * @return Returns a {@link TikaConfig} -- trying to find it first in the ParseContext - * that was included during initialization, and then creating a new one from - * via {@link TikaConfig#getDefaultConfig()} if it can't find one in the - * ParseContext. This caches the default config so that it only has to be created once. + * @return Returns a {@link TikaConfig} -- trying to find it first in the ParseContext that was + * included during initialization, and then creating a new one from via {@link + * TikaConfig#getDefaultConfig()} if it can't find one in the ParseContext. This caches the + * default config so that it only has to be created once. 
*/ public TikaConfig getTikaConfig() { - //be as lazy as possible and cache the TikaConfig + // be as lazy as possible and cache the TikaConfig if (tikaConfig == null) { tikaConfig = context.get(TikaConfig.class); if (tikaConfig == null) { @@ -169,7 +164,7 @@ public TikaConfig getTikaConfig() { public String getExtension(TikaInputStream is, Metadata metadata) { String mimeString = metadata.get(Metadata.CONTENT_TYPE); - //use the buffered mimetypes as default + // use the buffered mimetypes as default MimeTypes localMimeTypes = getMimeTypes(); MimeType mimeType = null; @@ -178,7 +173,7 @@ public String getExtension(TikaInputStream is, Metadata metadata) { try { mimeType = localMimeTypes.forName(mimeString); } catch (MimeTypeException e) { - //swallow + // swallow } } if (mimeType == null) { @@ -188,12 +183,12 @@ public String getExtension(TikaInputStream is, Metadata metadata) { detected = true; is.reset(); } catch (IOException | MimeTypeException e) { - //swallow + // swallow } } if (mimeType != null) { if (detected) { - //set or correct the mime type + // set or correct the mime type metadata.set(Metadata.CONTENT_TYPE, mimeType.toString()); } return mimeType.getExtension(); @@ -219,21 +214,21 @@ private EmbeddedDocumentExtractor getEmbeddedDocumentExtractor() { return embeddedDocumentExtractor; } - public void parseEmbedded(InputStream inputStream, ContentHandler handler, Metadata metadata, - boolean outputHtml) throws IOException, SAXException { + public void parseEmbedded( + InputStream inputStream, ContentHandler handler, Metadata metadata, boolean outputHtml) + throws IOException, SAXException { embeddedDocumentExtractor.parseEmbedded(inputStream, handler, metadata, outputHtml); } /** - * Tries to find an existing parser within the ParseContext. - * It looks inside of CompositeParsers and ParserDecorators. - * The use case is when a parser needs to parse an internal stream - * that is _part_ of the document, e.g. rtf body inside an msg. - *

- * Can return null if the context contains no parser or - * the correct parser can't be found. + * Tries to find an existing parser within the ParseContext. It looks inside of CompositeParsers + * and ParserDecorators. The use case is when a parser needs to parse an internal stream that is + * _part_ of the document, e.g. rtf body inside an msg. + * + *

Can return null if the context contains no parser or the correct parser can't + * be found. * - * @param clazz parser class to search for + * @param clazz parser class to search for * @param context * @return */ diff --git a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedResourceHandler.java b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedResourceHandler.java index 23d00635d9..de20ae4c7b 100644 --- a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedResourceHandler.java +++ b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedResourceHandler.java @@ -17,21 +17,18 @@ package org.apache.tika.extractor; import java.io.InputStream; - import org.apache.tika.mime.MediaType; /** - * Tika container extractor callback interface. - * To work with a {@link ContainerExtractor}, your code needs - * to implement this interface. + * Tika container extractor callback interface. To work with a {@link ContainerExtractor}, your code + * needs to implement this interface. */ public interface EmbeddedResourceHandler { /** - * Called to process an embedded resource within the container. - * This will be called once per embedded resource within the - * container, along with whatever details are available on - * the embedded resource. - * + * Called to process an embedded resource within the container. This will be called once per + * embedded resource within the container, along with whatever details are available on the + * embedded resource. 
+ * * @since Apache Tika 0.8 * @param filename The filename of the embedded resource, if known * @param mediaType The media type of the embedded resource, if known diff --git a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedStreamTranslator.java b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedStreamTranslator.java index b2ce05db42..4ea2c77d9e 100644 --- a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedStreamTranslator.java +++ b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedStreamTranslator.java @@ -18,13 +18,11 @@ import java.io.IOException; import java.io.InputStream; - import org.apache.tika.metadata.Metadata; /** - * Interface for different filtering of embedded streams. - * Specifically, unravel OLE streams in tika-server unpack, - * and/or handle open containers in TikaInputStream + * Interface for different filtering of embedded streams. Specifically, unravel OLE streams in + * tika-server unpack, and/or handle open containers in TikaInputStream * * @since Apache Tika 2.0.0 */ @@ -32,7 +30,5 @@ public interface EmbeddedStreamTranslator { boolean shouldTranslate(InputStream inputStream, Metadata metadata) throws IOException; - InputStream translate(InputStream inputStream, - Metadata metadata) throws IOException; - + InputStream translate(InputStream inputStream, Metadata metadata) throws IOException; } diff --git a/tika-core/src/main/java/org/apache/tika/extractor/ParserContainerExtractor.java b/tika-core/src/main/java/org/apache/tika/extractor/ParserContainerExtractor.java index b2e9cd169c..e9bf87461a 100644 --- a/tika-core/src/main/java/org/apache/tika/extractor/ParserContainerExtractor.java +++ b/tika-core/src/main/java/org/apache/tika/extractor/ParserContainerExtractor.java @@ -20,11 +20,6 @@ import java.io.IOException; import java.io.InputStream; import java.util.Set; - -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; -import org.xml.sax.helpers.DefaultHandler; - import 
org.apache.tika.config.TikaConfig; import org.apache.tika.detect.DefaultDetector; import org.apache.tika.detect.Detector; @@ -38,13 +33,15 @@ import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.parser.StatefulParser; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; /** - * An implementation of {@link ContainerExtractor} powered by the regular - * {@link Parser} API. This allows you to easily extract out all the - * embedded resources from within container files supported by normal Tika - * parsers. By default the {@link AutoDetectParser} will be used, to allow - * extraction from the widest range of containers. + * An implementation of {@link ContainerExtractor} powered by the regular {@link Parser} API. This + * allows you to easily extract out all the embedded resources from within container files supported + * by normal Tika parsers. By default the {@link AutoDetectParser} will be used, to allow extraction + * from the widest range of containers. 
*/ public class ParserContainerExtractor implements ContainerExtractor { @@ -60,8 +57,7 @@ public ParserContainerExtractor() { } public ParserContainerExtractor(TikaConfig config) { - this(new AutoDetectParser(config), - new DefaultDetector(config.getMimeRepository())); + this(new AutoDetectParser(config), new DefaultDetector(config.getMimeRepository())); } public ParserContainerExtractor(Parser parser, Detector detector) { @@ -75,7 +71,8 @@ public boolean isSupported(TikaInputStream input) throws IOException { } public void extract( - TikaInputStream stream, ContainerExtractor recurseExtractor, + TikaInputStream stream, + ContainerExtractor recurseExtractor, EmbeddedResourceHandler handler) throws IOException, TikaException { ParseContext context = new ParseContext(); @@ -93,7 +90,8 @@ private class RecursiveParser extends StatefulParser { private final EmbeddedResourceHandler handler; - private RecursiveParser(Parser statelessParser, + private RecursiveParser( + Parser statelessParser, ContainerExtractor extractor, EmbeddedResourceHandler handler) { super(statelessParser); @@ -106,8 +104,7 @@ public Set getSupportedTypes(ParseContext context) { } public void parse( - InputStream stream, ContentHandler ignored, - Metadata metadata, ParseContext context) + InputStream stream, ContentHandler ignored, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { TemporaryResources tmp = new TemporaryResources(); try { @@ -118,7 +115,7 @@ public void parse( MediaType type = detector.detect(tis, metadata); if (extractor == null) { - // Let the handler process the embedded resource + // Let the handler process the embedded resource handler.handle(filename, type, tis); } else { // Use a temporary file to process the stream twice @@ -136,7 +133,5 @@ public void parse( tmp.dispose(); } } - } - } diff --git a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java 
b/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java index edcb78ff11..28f2d86703 100644 --- a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java +++ b/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java @@ -22,12 +22,7 @@ import java.io.FilenameFilter; import java.io.IOException; import java.io.InputStream; - import org.apache.commons.io.input.CloseShieldInputStream; -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; -import org.xml.sax.helpers.AttributesImpl; - import org.apache.tika.exception.CorruptedFileException; import org.apache.tika.exception.EncryptedDocumentException; import org.apache.tika.exception.TikaException; @@ -41,10 +36,13 @@ import org.apache.tika.parser.Parser; import org.apache.tika.sax.BodyContentHandler; import org.apache.tika.sax.EmbeddedContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.AttributesImpl; /** - * Helper class for parsers of package archives or other compound document - * formats that support embedded or attached component documents. + * Helper class for parsers of package archives or other compound document formats that support + * embedded or attached component documents. 
* * @since Apache Tika 0.8 */ @@ -106,13 +104,16 @@ public void parseEmbedded( newStream.setOpenContainer(container); } } - DELEGATING_PARSER.parse(newStream, new EmbeddedContentHandler(new BodyContentHandler(handler)), - metadata, context); + DELEGATING_PARSER.parse( + newStream, + new EmbeddedContentHandler(new BodyContentHandler(handler)), + metadata, + context); } catch (EncryptedDocumentException ede) { recordException(ede, context); } catch (CorruptedFileException e) { - //necessary to stop the parse to avoid infinite loops - //on corrupt sqlite3 files + // necessary to stop the parse to avoid infinite loops + // on corrupt sqlite3 files throw new IOException(e); } catch (TikaException e) { recordException(e, context); diff --git a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractorFactory.java b/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractorFactory.java index 9136228c4a..f6a3611e51 100644 --- a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractorFactory.java +++ b/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractorFactory.java @@ -20,8 +20,7 @@ import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; -public class ParsingEmbeddedDocumentExtractorFactory - implements EmbeddedDocumentExtractorFactory { +public class ParsingEmbeddedDocumentExtractorFactory implements EmbeddedDocumentExtractorFactory { private boolean writeFileNameToContent = true; @@ -32,8 +31,7 @@ public void setWriteFileNameToContent(boolean writeFileNameToContent) { @Override public EmbeddedDocumentExtractor newInstance(Metadata metadata, ParseContext parseContext) { - ParsingEmbeddedDocumentExtractor ex = - new ParsingEmbeddedDocumentExtractor(parseContext); + ParsingEmbeddedDocumentExtractor ex = new ParsingEmbeddedDocumentExtractor(parseContext); ex.setWriteFileNameToContent(writeFileNameToContent); return ex; } diff --git 
a/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractor.java b/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractor.java index 76b297dd78..7f12f6312d 100644 --- a/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractor.java +++ b/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractor.java @@ -23,14 +23,7 @@ import java.io.InputStream; import java.nio.file.Files; import java.nio.file.Path; - import org.apache.commons.io.input.CloseShieldInputStream; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; -import org.xml.sax.helpers.AttributesImpl; - import org.apache.tika.exception.CorruptedFileException; import org.apache.tika.exception.EncryptedDocumentException; import org.apache.tika.exception.TikaException; @@ -42,6 +35,11 @@ import org.apache.tika.parser.ParseContext; import org.apache.tika.sax.BodyContentHandler; import org.apache.tika.sax.EmbeddedContentHandler; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.AttributesImpl; /** * Recursive Unpacker and text and metadata extractor. 
@@ -65,7 +63,6 @@ public RUnpackExtractor(ParseContext context, long maxEmbeddedBytesForExtraction this.maxEmbeddedBytesForExtraction = maxEmbeddedBytesForExtraction; } - @Override public void parseEmbedded( InputStream stream, ContentHandler handler, Metadata metadata, boolean outputHtml) @@ -94,7 +91,8 @@ public void parseEmbedded( newStream.setOpenContainer(container); } } - EmbeddedDocumentBytesHandler bytesHandler = context.get(EmbeddedDocumentBytesHandler.class); + EmbeddedDocumentBytesHandler bytesHandler = + context.get(EmbeddedDocumentBytesHandler.class); if (bytesHandler != null) { parseWithBytes(newStream, handler, metadata); } else { @@ -103,8 +101,8 @@ public void parseEmbedded( } catch (EncryptedDocumentException ede) { recordException(ede, context); } catch (CorruptedFileException e) { - //necessary to stop the parse to avoid infinite loops - //on corrupt sqlite3 files + // necessary to stop the parse to avoid infinite loops + // on corrupt sqlite3 files throw new IOException(e); } catch (TikaException e) { recordException(e, context); @@ -117,8 +115,8 @@ public void parseEmbedded( private void parseWithBytes(TikaInputStream stream, ContentHandler handler, Metadata metadata) throws TikaException, IOException, SAXException { - //TODO -- improve the efficiency of this so that we're not - //literally writing out a file per request + // TODO -- improve the efficiency of this so that we're not + // literally writing out a file per request Path p = stream.getPath(); try { parse(stream, handler, metadata); @@ -129,15 +127,19 @@ private void parseWithBytes(TikaInputStream stream, ContentHandler handler, Meta private void parse(TikaInputStream stream, ContentHandler handler, Metadata metadata) throws TikaException, IOException, SAXException { - getDelegatingParser().parse(stream, - new EmbeddedContentHandler(new BodyContentHandler(handler)), - metadata, context); + getDelegatingParser() + .parse( + stream, + new EmbeddedContentHandler(new 
BodyContentHandler(handler)), + metadata, + context); } private void storeEmbeddedBytes(Path p, Metadata metadata) { - if (! embeddedBytesSelector.select(metadata)) { + if (!embeddedBytesSelector.select(metadata)) { if (LOGGER.isDebugEnabled()) { - LOGGER.debug("skipping embedded bytes {} <-> {}", + LOGGER.debug( + "skipping embedded bytes {} <-> {}", metadata.get(Metadata.CONTENT_TYPE), metadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE)); } @@ -148,8 +150,12 @@ private void storeEmbeddedBytes(Path p, Metadata metadata) { int id = metadata.getInt(TikaCoreProperties.EMBEDDED_ID); try (InputStream is = Files.newInputStream(p)) { if (bytesExtracted >= maxEmbeddedBytesForExtraction) { - throw new IOException("Bytes extracted (" + bytesExtracted + - ") >= max allowed (" + maxEmbeddedBytesForExtraction + ")"); + throw new IOException( + "Bytes extracted (" + + bytesExtracted + + ") >= max allowed (" + + maxEmbeddedBytesForExtraction + + ")"); } long maxToRead = maxEmbeddedBytesForExtraction - bytesExtracted; @@ -157,19 +163,23 @@ private void storeEmbeddedBytes(Path p, Metadata metadata) { embeddedDocumentBytesHandler.add(id, metadata, boundedIs); bytesExtracted += boundedIs.getPos(); if (boundedIs.hasHitBound()) { - throw new IOException("Bytes extracted (" + bytesExtracted + - ") >= max allowed (" + maxEmbeddedBytesForExtraction + "). Truncated " + - "bytes"); + throw new IOException( + "Bytes extracted (" + + bytesExtracted + + ") >= max allowed (" + + maxEmbeddedBytesForExtraction + + "). Truncated " + + "bytes"); } } } catch (IOException e) { LOGGER.warn("problem writing out embedded bytes", e); - //info in metadata doesn't actually make it back to the metadata list - //because we're filtering and cloning the metadata at the end of the parse - //which happens before we try to copy out the files. 
- //TODO fix this - //metadata.set(TikaCoreProperties.EMBEDDED_BYTES_EXCEPTION, - // ExceptionUtils.getStackTrace(e)); + // info in metadata doesn't actually make it back to the metadata list + // because we're filtering and cloning the metadata at the end of the parse + // which happens before we try to copy out the files. + // TODO fix this + // metadata.set(TikaCoreProperties.EMBEDDED_BYTES_EXCEPTION, + // ExceptionUtils.getStackTrace(e)); } } diff --git a/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractorFactory.java b/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractorFactory.java index a715ed25f4..f70ebe0cda 100644 --- a/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractorFactory.java +++ b/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractorFactory.java @@ -20,7 +20,6 @@ import java.util.HashSet; import java.util.List; import java.util.Set; - import org.apache.tika.config.Field; import org.apache.tika.exception.TikaConfigException; import org.apache.tika.metadata.Metadata; @@ -37,6 +36,7 @@ public class RUnpackExtractorFactory implements EmbeddedDocumentByteStoreExtract private Set embeddedBytesExcludeEmbeddedResourceTypes = Collections.EMPTY_SET; private long maxEmbeddedBytesForExtraction = DEFAULT_MAX_EMBEDDED_BYTES_FOR_EXTRACTION; + @Field public void setWriteFileNameToContent(boolean writeFileNameToContent) { this.writeFileNameToContent = writeFileNameToContent; @@ -52,34 +52,32 @@ public void setEmbeddedBytesIncludeMimeTypes(List includeMimeTypes) { public void setEmbeddedBytesExcludeMimeTypes(List excludeMimeTypes) { embeddedBytesExcludeMimeTypes = new HashSet<>(); embeddedBytesExcludeMimeTypes.addAll(excludeMimeTypes); - } @Field public void setEmbeddedBytesIncludeEmbeddedResourceTypes(List includeAttachmentTypes) { embeddedBytesIncludeEmbeddedResourceTypes = new HashSet<>(); embeddedBytesIncludeEmbeddedResourceTypes.addAll(includeAttachmentTypes); - } @Field public void 
setEmbeddedBytesExcludeEmbeddedResourceTypes(List excludeAttachmentTypes) { embeddedBytesExcludeEmbeddedResourceTypes = new HashSet<>(); embeddedBytesExcludeEmbeddedResourceTypes.addAll(excludeAttachmentTypes); - } /** - * Total number of bytes to write out. A good zip bomb may contain petabytes - * compressed into a few kb. Make sure that you can't fill up a disk! + * Total number of bytes to write out. A good zip bomb may contain petabytes compressed into a + * few kb. Make sure that you can't fill up a disk! * - * This does not include the container file in the count of bytes written out. - * This only counts the lengths of the embedded files. + *

This does not include the container file in the count of bytes written out. This only + * counts the lengths of the embedded files. * * @param maxEmbeddedBytesForExtraction */ @Field - public void setMaxEmbeddedBytesForExtraction(long maxEmbeddedBytesForExtraction) throws TikaConfigException { + public void setMaxEmbeddedBytesForExtraction(long maxEmbeddedBytesForExtraction) + throws TikaConfigException { if (maxEmbeddedBytesForExtraction < 0) { throw new TikaConfigException("maxEmbeddedBytesForExtraction must be >= 0"); } @@ -88,24 +86,23 @@ public void setMaxEmbeddedBytesForExtraction(long maxEmbeddedBytesForExtraction) @Override public EmbeddedDocumentExtractor newInstance(Metadata metadata, ParseContext parseContext) { - RUnpackExtractor ex = - new RUnpackExtractor(parseContext, - maxEmbeddedBytesForExtraction); + RUnpackExtractor ex = new RUnpackExtractor(parseContext, maxEmbeddedBytesForExtraction); ex.setWriteFileNameToContent(writeFileNameToContent); ex.setEmbeddedBytesSelector(createEmbeddedBytesSelector()); return ex; } - private EmbeddedBytesSelector createEmbeddedBytesSelector() { - if (embeddedBytesIncludeMimeTypes.size() == 0 && - embeddedBytesExcludeMimeTypes.size() == 0 && - embeddedBytesIncludeEmbeddedResourceTypes.size() == 0 && - embeddedBytesExcludeEmbeddedResourceTypes.size() == 0) { + if (embeddedBytesIncludeMimeTypes.size() == 0 + && embeddedBytesExcludeMimeTypes.size() == 0 + && embeddedBytesIncludeEmbeddedResourceTypes.size() == 0 + && embeddedBytesExcludeEmbeddedResourceTypes.size() == 0) { return EmbeddedBytesSelector.ACCEPT_ALL; } - return new BasicEmbeddedBytesSelector(embeddedBytesIncludeMimeTypes, - embeddedBytesExcludeMimeTypes, embeddedBytesIncludeEmbeddedResourceTypes, + return new BasicEmbeddedBytesSelector( + embeddedBytesIncludeMimeTypes, + embeddedBytesExcludeMimeTypes, + embeddedBytesIncludeEmbeddedResourceTypes, embeddedBytesExcludeEmbeddedResourceTypes); } } diff --git 
a/tika-core/src/main/java/org/apache/tika/extractor/package-info.java b/tika-core/src/main/java/org/apache/tika/extractor/package-info.java index 3d3e92b525..5917177c98 100644 --- a/tika-core/src/main/java/org/apache/tika/extractor/package-info.java +++ b/tika-core/src/main/java/org/apache/tika/extractor/package-info.java @@ -15,8 +15,6 @@ * limitations under the License. */ -/** - * Extraction of component documents. - */ +/** Extraction of component documents. */ @aQute.bnd.annotation.Version("1.0.0") package org.apache.tika.extractor; diff --git a/tika-core/src/main/java/org/apache/tika/fork/ClassLoaderProxy.java b/tika-core/src/main/java/org/apache/tika/fork/ClassLoaderProxy.java index 51b1beeff1..91707ebcd8 100644 --- a/tika-core/src/main/java/org/apache/tika/fork/ClassLoaderProxy.java +++ b/tika-core/src/main/java/org/apache/tika/fork/ClassLoaderProxy.java @@ -30,15 +30,13 @@ class ClassLoaderProxy extends ClassLoader implements ForkProxy { - /** - * Serial version UID - */ + /** Serial version UID */ private static final long serialVersionUID = -7303109260448540420L; /** - * Names of resources that could not be found. Used to avoid repeated - * lookup of commonly accessed, but often not present, resources like - * META-INF/services/javax.xml.parsers.SAXParserFactory. + * Names of resources that could not be found. Used to avoid repeated lookup of commonly + * accessed, but often not present, resources like + * META-INF/services/javax.xml.parsers.SAXParserFactory. 
*/ private final Set notFound = new HashSet<>(); @@ -149,5 +147,4 @@ private byte[] readStream() throws IOException { return stream.toByteArray(); } } - } diff --git a/tika-core/src/main/java/org/apache/tika/fork/ClassLoaderResource.java b/tika-core/src/main/java/org/apache/tika/fork/ClassLoaderResource.java index 7af85ada51..dc7e71818b 100644 --- a/tika-core/src/main/java/org/apache/tika/fork/ClassLoaderResource.java +++ b/tika-core/src/main/java/org/apache/tika/fork/ClassLoaderResource.java @@ -32,11 +32,10 @@ public ClassLoaderResource(ClassLoader loader) { } /** - * Processes a request for one (code 1) or many (code 2) class loader - * resources. The requested resources are sent preceded with a boolean - * true value. If the resource was not found (code 1) or - * when the last resource has been sent (code 2), a boolean - * false value is sent instead. + * Processes a request for one (code 1) or many (code 2) class loader resources. The requested + * resources are sent preceded with a boolean true value. If the resource was not + * found (code 1) or when the last resource has been sent (code 2), a boolean false + * value is sent instead. * * @param name resource name * @throws IOException if the resource could not be sent @@ -66,14 +65,12 @@ public Throwable process(DataInputStream input, DataOutputStream output) throws } /** - * Sends the contents of the given input stream to the given output. - * The stream is sent in chunks of less than 64kB, each preceded by - * a 16-bit integer value that indicates the length of the following - * chunk. A zero short value is sent at the end to signify the end of - * the stream. - *

- * The stream is guaranteed to be closed by this method, regardless of - * the way it returns. + * Sends the contents of the given input stream to the given output. The stream is sent in + * chunks of less than 64kB, each preceded by a 16-bit integer value that indicates the length + * of the following chunk. A zero short value is sent at the end to signify the end of the + * stream. + * + *

The stream is guaranteed to be closed by this method, regardless of the way it returns. * * @param stream the stream to be sent * @throws IOException if the stream could not be sent @@ -92,5 +89,4 @@ private void writeAndCloseStream(DataOutputStream output, InputStream stream) stream.close(); } } - } diff --git a/tika-core/src/main/java/org/apache/tika/fork/ContentHandlerProxy.java b/tika-core/src/main/java/org/apache/tika/fork/ContentHandlerProxy.java index 371dd05c57..d9430ed493 100644 --- a/tika-core/src/main/java/org/apache/tika/fork/ContentHandlerProxy.java +++ b/tika-core/src/main/java/org/apache/tika/fork/ContentHandlerProxy.java @@ -19,7 +19,6 @@ import java.io.DataInputStream; import java.io.DataOutputStream; import java.io.IOException; - import org.xml.sax.Attributes; import org.xml.sax.ContentHandler; import org.xml.sax.Locator; @@ -38,9 +37,7 @@ class ContentHandlerProxy implements ContentHandler, ForkProxy { public static final int PROCESSING_INSTRUCTION = 9; public static final int SKIPPED_ENTITY = 10; - /** - * Serial version UID - */ + /** Serial version UID */ private static final long serialVersionUID = 737511106054617524L; private final int resource; @@ -79,8 +76,8 @@ private void sendString(String string) throws SAXException { } /** - * Breaks the string in 21,845 size chunks to not - * throw UTFDataFormatException at least in Oracle JDK 8. + * Breaks the string in 21,845 size chunks to not throw UTFDataFormatException at least in + * Oracle JDK 8. 
*/ private void writeString(String string) throws IOException { int max = 65535 / 3; @@ -195,5 +192,4 @@ public void skippedEntity(String name) throws SAXException { sendString(name); doneSending(); } - } diff --git a/tika-core/src/main/java/org/apache/tika/fork/ContentHandlerResource.java b/tika-core/src/main/java/org/apache/tika/fork/ContentHandlerResource.java index f8971b9a67..6d3e8303d9 100644 --- a/tika-core/src/main/java/org/apache/tika/fork/ContentHandlerResource.java +++ b/tika-core/src/main/java/org/apache/tika/fork/ContentHandlerResource.java @@ -19,7 +19,6 @@ import java.io.DataInputStream; import java.io.DataOutputStream; import java.io.IOException; - import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; import org.xml.sax.helpers.AttributesImpl; @@ -60,8 +59,12 @@ private void internalProcess(DataInputStream input) throws IOException, SAXExcep if (n >= 0) { atts = new AttributesImpl(); for (int i = 0; i < n; i++) { - atts.addAttribute(readString(input), readString(input), readString(input), - readString(input), readString(input)); + atts.addAttribute( + readString(input), + readString(input), + readString(input), + readString(input), + readString(input)); } } handler.startElement(uri, localName, qName, atts); @@ -103,5 +106,4 @@ private String readStringUTF(DataInputStream input) throws IOException { } return sb.toString(); } - } diff --git a/tika-core/src/main/java/org/apache/tika/fork/ForkClient.java b/tika-core/src/main/java/org/apache/tika/fork/ForkClient.java index f1a47206d9..a49a53edfe 100644 --- a/tika-core/src/main/java/org/apache/tika/fork/ForkClient.java +++ b/tika-core/src/main/java/org/apache/tika/fork/ForkClient.java @@ -33,14 +33,12 @@ import java.util.jar.JarEntry; import java.util.jar.JarOutputStream; import java.util.zip.ZipEntry; - import org.apache.commons.io.IOUtils; -import org.xml.sax.ContentHandler; - import org.apache.tika.exception.TikaException; import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler; 
import org.apache.tika.sax.RecursiveParserWrapperHandler; import org.apache.tika.utils.ProcessUtils; +import org.xml.sax.ContentHandler; class ForkClient { private static final AtomicInteger CLIENT_COUNTER = new AtomicInteger(0); @@ -57,29 +55,35 @@ class ForkClient { private final DataInputStream input; - //this is used for debugging/smoke testing + // this is used for debugging/smoke testing private final int id = CLIENT_COUNTER.incrementAndGet(); private volatile int filesProcessed = 0; - public ForkClient(Path tikaDir, ParserFactoryFactory parserFactoryFactory, List java, - TimeoutLimits timeoutLimits) throws IOException, TikaException { + public ForkClient( + Path tikaDir, + ParserFactoryFactory parserFactoryFactory, + List java, + TimeoutLimits timeoutLimits) + throws IOException, TikaException { this(tikaDir, parserFactoryFactory, null, java, timeoutLimits); } /** - * @param tikaDir directory containing jars from which to start - * the child server and load the Parser - * @param parserFactoryFactory factory to send to forked process to build parser - * upon arrival - * @param classLoader class loader to use for non-parser resource - * (content-handler, etc.) - * @param java java commandline to use for the commandline server + * @param tikaDir directory containing jars from which to start the child server and load the + * Parser + * @param parserFactoryFactory factory to send to forked process to build parser upon arrival + * @param classLoader class loader to use for non-parser resource (content-handler, etc.) 
+ * @param java java commandline to use for the commandline server * @throws IOException * @throws TikaException */ - public ForkClient(Path tikaDir, ParserFactoryFactory parserFactoryFactory, - ClassLoader classLoader, List java, TimeoutLimits timeoutLimits) + public ForkClient( + Path tikaDir, + ParserFactoryFactory parserFactoryFactory, + ClassLoader classLoader, + List java, + TimeoutLimits timeoutLimits) throws IOException, TikaException { jar = null; loader = null; @@ -130,9 +134,9 @@ public ForkClient(Path tikaDir, ParserFactoryFactory parserFactoryFactory, } } - - public ForkClient(ClassLoader loader, Object object, List java, - TimeoutLimits timeoutLimits) throws IOException, TikaException { + public ForkClient( + ClassLoader loader, Object object, List java, TimeoutLimits timeoutLimits) + throws IOException, TikaException { boolean ok = false; try { this.loader = loader; @@ -168,8 +172,8 @@ public ForkClient(ClassLoader loader, Object object, List java, } /** - * Creates a temporary jar file that can be used to bootstrap the forked - * server process. Remember to remove the file when no longer used. + * Creates a temporary jar file that can be used to bootstrap the forked server process. + * Remember to remove the file when no longer used. * * @return the created jar file * @throws IOException if the bootstrap archive could not be created @@ -189,9 +193,9 @@ private static File createBootstrapJar() throws IOException { } /** - * Fills in the jar file used to bootstrap the forked server process. - * All the required .class files and a manifest with a - * Main-Class entry are written into the archive. + * Fills in the jar file used to bootstrap the forked server process. All the required + * .class files and a manifest with a Main-Class entry are written into the + * archive. 
* * @param file file to hold the bootstrap archive * @throws IOException if the bootstrap archive could not be created @@ -202,10 +206,17 @@ private static void fillBootstrapJar(File file) throws IOException { jar.putNextEntry(new ZipEntry("META-INF/MANIFEST.MF")); jar.write(manifest.getBytes(UTF_8)); - Class[] bootstrap = {ForkServer.class, ForkObjectInputStream.class, ForkProxy.class, - ClassLoaderProxy.class, MemoryURLConnection.class, MemoryURLStreamHandler.class, - MemoryURLStreamHandlerFactory.class, MemoryURLStreamRecord.class, - TikaException.class}; + Class[] bootstrap = { + ForkServer.class, + ForkObjectInputStream.class, + ForkProxy.class, + ClassLoaderProxy.class, + MemoryURLConnection.class, + MemoryURLStreamHandler.class, + MemoryURLStreamHandlerFactory.class, + MemoryURLStreamRecord.class, + TikaException.class + }; ClassLoader loader = ForkServer.class.getClassLoader(); for (Class klass : bootstrap) { String path = klass.getName().replace('.', '/') + ".class"; @@ -227,10 +238,11 @@ private void waitForStartBeacon() throws IOException { } else if (type == -1) { throw new IOException("EOF while waiting for start beacon"); } else { - //can't do this because of + // can't do this because of // ForkParserIntegrationTest // #testAttachingADebuggerOnTheForkedParserShouldWork -// throw new IOException("Unexpected byte while waiting for start beacon: "+type); + // throw new IOException("Unexpected byte while waiting for start + // beacon: "+type); } } } @@ -265,10 +277,10 @@ public int getFilesProcessed() { } /** - * Serializes the object first into an in-memory buffer and then - * writes it to the output stream with a preceding size integer. + * Serializes the object first into an in-memory buffer and then writes it to the output stream + * with a preceding size integer. 
* - * @param object object to be serialized + * @param object object to be serialized * @param resources list of fork resources, used when adding proxies * @throws IOException if the object could not be serialized */ @@ -279,12 +291,14 @@ private void sendObject(Object object, List resources) resources.add(new InputStreamResource((InputStream) object)); object = new InputStreamProxy(n); } else if (object instanceof RecursiveParserWrapperHandler) { - resources.add(new RecursiveMetadataContentHandlerResource( - (RecursiveParserWrapperHandler) object)); - object = new RecursiveMetadataContentHandlerProxy(n, - ((RecursiveParserWrapperHandler) object).getContentHandlerFactory()); - } else if (object instanceof ContentHandler && - !(object instanceof AbstractRecursiveParserWrapperHandler)) { + resources.add( + new RecursiveMetadataContentHandlerResource( + (RecursiveParserWrapperHandler) object)); + object = + new RecursiveMetadataContentHandlerProxy( + n, ((RecursiveParserWrapperHandler) object).getContentHandlerFactory()); + } else if (object instanceof ContentHandler + && !(object instanceof AbstractRecursiveParserWrapperHandler)) { resources.add(new ContentHandlerResource((ContentHandler) object)); object = new ContentHandlerProxy(n); } else if (object instanceof ClassLoader) { @@ -296,8 +310,11 @@ private void sendObject(Object object, List resources) ForkObjectInputStream.sendObject(object, output); } catch (NotSerializableException nse) { // Build a more friendly error message for this - throw new TikaException("Unable to serialize " + object.getClass().getSimpleName() + - " to pass to the Forked Parser", nse); + throw new TikaException( + "Unable to serialize " + + object.getClass().getSimpleName() + + " to pass to the Forked Parser", + nse); } waitForResponse(resources); @@ -316,10 +333,10 @@ public synchronized void close() { if (process != null) { process.destroyForcibly(); try { - //TIKA-1933 + // TIKA-1933 process.waitFor(); } catch (InterruptedException e) 
{ - //swallow + // swallow } } if (jar != null) { diff --git a/tika-core/src/main/java/org/apache/tika/fork/ForkObjectInputStream.java b/tika-core/src/main/java/org/apache/tika/fork/ForkObjectInputStream.java index 61e2dae48f..2e7b6d36bc 100644 --- a/tika-core/src/main/java/org/apache/tika/fork/ForkObjectInputStream.java +++ b/tika-core/src/main/java/org/apache/tika/fork/ForkObjectInputStream.java @@ -27,28 +27,24 @@ import java.io.ObjectStreamClass; /** - * An object input stream that uses a given class loader when deserializing - * objects. - *

- * Note that this functionality could easily be implemented as a simple - * anonymous {@link ObjectInputStream} subclass, but since the - * functionality is needed during the somewhat complicated bootstrapping - * of the stdin/out communication channel of a forked server process, - * it's better if class has a stable name that can be referenced at - * compile-time by the {@link ForkClient} class. + * An object input stream that uses a given class loader when deserializing objects. + * + *

Note that this functionality could easily be implemented as a simple anonymous {@link + * ObjectInputStream} subclass, but since the functionality is needed during the somewhat + * complicated bootstrapping of the stdin/out communication channel of a forked server process, it's + * better if class has a stable name that can be referenced at compile-time by the {@link + * ForkClient} class. */ class ForkObjectInputStream extends ObjectInputStream { - /** - * The class loader used when deserializing objects. - */ + /** The class loader used when deserializing objects. */ private final ClassLoader loader; /** - * Creates a new object input stream that uses the given class loader - * when deserializing objects. + * Creates a new object input stream that uses the given class loader when deserializing + * objects. * - * @param input underlying input stream + * @param input underlying input stream * @param loader class loader used when deserializing objects * @throws IOException if this stream could not be initiated */ @@ -58,8 +54,8 @@ public ForkObjectInputStream(InputStream input, ClassLoader loader) throws IOExc } /** - * Serializes the object first into an in-memory buffer and then - * writes it to the output stream with a preceding size integer. + * Serializes the object first into an in-memory buffer and then writes it to the output stream + * with a preceding size integer. * * @param object object to be serialized * @param output output stream @@ -77,13 +73,13 @@ public static void sendObject(Object object, DataOutputStream output) throws IOE } /** - * Deserializes an object from the given stream. The serialized object - * is expected to be preceded by a size integer, that is used for reading - * the entire serialization into a memory before deserializing it. + * Deserializes an object from the given stream. 
The serialized object is expected to be + * preceded by a size integer, that is used for reading the entire serialization into a memory + * before deserializing it. * - * @param input input stream from which the serialized object is read + * @param input input stream from which the serialized object is read * @param loader class loader to be used for loading referenced classes - * @throws IOException if the object could not be deserialized + * @throws IOException if the object could not be deserialized * @throws ClassNotFoundException if a referenced class is not found */ public static Object readObject(DataInputStream input, ClassLoader loader) @@ -108,5 +104,4 @@ public static Object readObject(DataInputStream input, ClassLoader loader) protected Class resolveClass(ObjectStreamClass desc) throws ClassNotFoundException { return Class.forName(desc.getName(), false, loader); } - } diff --git a/tika-core/src/main/java/org/apache/tika/fork/ForkParser.java b/tika-core/src/main/java/org/apache/tika/fork/ForkParser.java index 84d1156e2a..2b1c9dfc25 100644 --- a/tika-core/src/main/java/org/apache/tika/fork/ForkParser.java +++ b/tika-core/src/main/java/org/apache/tika/fork/ForkParser.java @@ -27,10 +27,6 @@ import java.util.List; import java.util.Queue; import java.util.Set; - -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.config.Field; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; @@ -41,54 +37,46 @@ import org.apache.tika.parser.Parser; import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler; import org.apache.tika.sax.TeeContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; public class ForkParser implements Parser, Closeable { - /** - * Serial version UID - */ + /** Serial version UID */ private static final long serialVersionUID = -4962742892274663950L; - //these are used by the legacy usage + // these are used by the legacy usage 
private final ClassLoader loader; private final Parser parser; - //these are used when the server builds a parser via a directory - //of jars, not via legacy bootstrap etc. + // these are used when the server builds a parser via a directory + // of jars, not via legacy bootstrap etc. private final Path tikaBin; private final ParserFactoryFactory parserFactoryFactory; private final Queue pool = new LinkedList<>(); - /** - * Java command line - */ + + /** Java command line */ private List java = Arrays.asList("java", "-Xmx32m", "-Djava.awt.headless=true"); - /** - * Process pool size - */ - @Field - private int poolSize = 5; + + /** Process pool size */ + @Field private int poolSize = 5; + private int currentlyInUse = 0; - @Field - private long serverPulseMillis = 1000; + @Field private long serverPulseMillis = 1000; - @Field - private long serverParseTimeoutMillis = 60000; + @Field private long serverParseTimeoutMillis = 60000; - @Field - private long serverWaitTimeoutMillis = 60000; + @Field private long serverWaitTimeoutMillis = 60000; - @Field - private int maxFilesProcessedPerClient = -1; + @Field private int maxFilesProcessedPerClient = -1; /** - * If you have a directory with, say, tike-app.jar and you want the - * forked process/server to build a parser - * and run it from that -- so that you can keep all of those dependencies out of + * If you have a directory with, say, tike-app.jar and you want the forked process/server to + * build a parser and run it from that -- so that you can keep all of those dependencies out of * your client code, use this initializer. 
* - * @param tikaBin directory containing the tika-app.jar or similar -- - * full jar including tika-core and all - * desired parsers and dependencies + * @param tikaBin directory containing the tika-app.jar or similar -- full jar including + * tika-core and all desired parsers and dependencies * @param factoryFactory */ public ForkParser(Path tikaBin, ParserFactoryFactory factoryFactory) { @@ -101,16 +89,14 @@ public ForkParser(Path tikaBin, ParserFactoryFactory factoryFactory) { /** * EXPERT * - * @param tikaBin directory containing the tika-app.jar or similar - * -- full jar including tika-core and all - * desired parsers and dependencies - * @param parserFactoryFactory -- the factory to use to generate the parser factory - * in the forked process/server - * @param classLoader to use for all classes besides the parser in the - * forked process/server + * @param tikaBin directory containing the tika-app.jar or similar -- full jar including + * tika-core and all desired parsers and dependencies + * @param parserFactoryFactory -- the factory to use to generate the parser factory in the + * forked process/server + * @param classLoader to use for all classes besides the parser in the forked process/server */ - public ForkParser(Path tikaBin, ParserFactoryFactory parserFactoryFactory, - ClassLoader classLoader) { + public ForkParser( + Path tikaBin, ParserFactoryFactory parserFactoryFactory, ClassLoader classLoader) { parser = null; loader = classLoader; this.tikaBin = tikaBin; @@ -124,8 +110,8 @@ public ForkParser(Path tikaBin, ParserFactoryFactory parserFactoryFactory, public ForkParser(ClassLoader loader, Parser parser) { if (parser instanceof ForkParser) { throw new IllegalArgumentException( - "The underlying parser of a ForkParser should not be a ForkParser, " + - "but a specific implementation."); + "The underlying parser of a ForkParser should not be a ForkParser, " + + "but a specific implementation."); } this.tikaBin = null; this.parserFactoryFactory = null; 
@@ -160,13 +146,11 @@ public synchronized void setPoolSize(int poolSize) { } /** - * Sets the command used to start the forked server process. - * The arguments "-jar" and "/path/to/bootstrap.jar" - * or "-cp" and "/path/to/tika_bin" are - * appended to the given command when starting the process. - * The default setting is {"java", "-Xmx32m"}. - *

- * Creates a defensive copy. + * Sets the command used to start the forked server process. The arguments "-jar" and + * "/path/to/bootstrap.jar" or "-cp" and "/path/to/tika_bin" are appended to the given command + * when starting the process. The default setting is {"java", "-Xmx32m"}. + * + *

Creates a defensive copy. * * @param java java command line */ @@ -176,8 +160,8 @@ public void setJavaCommand(List java) { /** * Returns the command used to start the forked server process. - *

- * Returned list is unmodifiable. + * + *

Returned list is unmodifiable. * * @return java command line args */ @@ -190,51 +174,45 @@ public Set getSupportedTypes(ParseContext context) { } /** - * This sends the objects to the server for parsing, and the server via - * the proxies acts on the handler as if it were updating it directly. - *

- * If using a {@link org.apache.tika.parser.RecursiveParserWrapper}, there are two options: - *

- *

- *

    - *
  1. Send in a class that extends - * {@link org.apache.tika.sax.RecursiveParserWrapperHandler}, - * and the server will proxy back the data as best it can[0].
  2. - *
  3. Send in a class that extends {@link AbstractRecursiveParserWrapperHandler} - * and the server will act on the class but not proxy back the data. This - * can be used, for example, if all you want to do is write to disc, extend - * {@link AbstractRecursiveParserWrapperHandler} to write to disc when - * {@link AbstractRecursiveParserWrapperHandler#endDocument(ContentHandler, - * Metadata)} - * is called, and the server will take care of the writing via the handler.
  4. - *
- *

+ * This sends the objects to the server for parsing, and the server via the proxies acts on the + * handler as if it were updating it directly. + * + *

If using a {@link org.apache.tika.parser.RecursiveParserWrapper}, there are two options: + * *

- * NOTE:[0] "the server will proxy back the data as best it can". - * If the handler implements Serializable and is actually serializable, the - * server will send it and the - * {@link Metadata} back upon - * {@link org.apache.tika.sax.RecursiveParserWrapperHandler# - * endEmbeddedDocument(ContentHandler, Metadata)} - * or {@link org.apache.tika.sax.RecursiveParserWrapperHandler# - * endEmbeddedDocument(ContentHandler, Metadata)}. - * If the handler does not implement {@link java.io.Serializable} or if there is a - * {@link java.io.NotSerializableException} thrown during serialization, the server will - * call {@link ContentHandler#toString()} on the ContentHandler and set that value with the - * {@link TikaCoreProperties#TIKA_CONTENT} key and then - * serialize and proxy that data back. - *

* - * @param stream the document stream (input) - * @param handler handler for the XHTML SAX events (output) + *
    + *
  1. Send in a class that extends {@link org.apache.tika.sax.RecursiveParserWrapperHandler}, + * and the server will proxy back the data as best it can[0]. + *
  2. Send in a class that extends {@link AbstractRecursiveParserWrapperHandler} and the + * server will act on the class but not proxy back the data. This can be used, for + * example, if all you want to do is write to disc, extend {@link + * AbstractRecursiveParserWrapperHandler} to write to disc when {@link + * AbstractRecursiveParserWrapperHandler#endDocument(ContentHandler, Metadata)} is called, + * and the server will take care of the writing via the handler. + *
+ * + *

NOTE:[0] "the server will proxy back the data as best it can". If the + * handler implements Serializable and is actually serializable, the server will send it and the + * {@link Metadata} back upon {@link org.apache.tika.sax.RecursiveParserWrapperHandler# + * endEmbeddedDocument(ContentHandler, Metadata)} or {@link + * org.apache.tika.sax.RecursiveParserWrapperHandler# endEmbeddedDocument(ContentHandler, + * Metadata)}. If the handler does not implement {@link java.io.Serializable} or if there is a + * {@link java.io.NotSerializableException} thrown during serialization, the server will call + * {@link ContentHandler#toString()} on the ContentHandler and set that value with the {@link + * TikaCoreProperties#TIKA_CONTENT} key and then serialize and proxy that data back. + * + * @param stream the document stream (input) + * @param handler handler for the XHTML SAX events (output) * @param metadata document metadata (input and output) - * @param context parse context + * @param context parse context * @throws IOException * @throws SAXException * @throws TikaException */ - public void parse(InputStream stream, ContentHandler handler, Metadata metadata, - ParseContext context) throws IOException, SAXException, TikaException { + public void parse( + InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) + throws IOException, SAXException, TikaException { if (stream == null) { throw new NullPointerException("null stream"); } @@ -245,8 +223,9 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ForkClient client = acquireClient(); try { ContentHandler tee = - (handler instanceof AbstractRecursiveParserWrapperHandler) ? handler : - new TeeContentHandler(handler, new MetadataContentHandler(metadata)); + (handler instanceof AbstractRecursiveParserWrapperHandler) + ? 
handler + : new TeeContentHandler(handler, new MetadataContentHandler(metadata)); t = client.call("parse", stream, tee, metadata, context); alive = true; @@ -256,10 +235,12 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, throw te; } catch (IOException e) { // Problem occurred on the other side - throw new TikaException("Failed to communicate with a forked parser process." + - " The process has most likely crashed due to some error" + - " like running out of memory. A new process will be" + - " started for the next parsing request.", e); + throw new TikaException( + "Failed to communicate with a forked parser process." + + " The process has most likely crashed due to some error" + + " like running out of memory. A new process will be" + + " started for the next parsing request.", + e); } finally { releaseClient(client, alive); } @@ -312,18 +293,23 @@ private synchronized ForkClient acquireClient() throws IOException, TikaExceptio } private ForkClient newClient() throws IOException, TikaException { - TimeoutLimits timeoutLimits = new TimeoutLimits(serverPulseMillis, serverParseTimeoutMillis, - serverWaitTimeoutMillis); + TimeoutLimits timeoutLimits = + new TimeoutLimits( + serverPulseMillis, serverParseTimeoutMillis, serverWaitTimeoutMillis); if (loader == null && parser == null && tikaBin != null && parserFactoryFactory != null) { return new ForkClient(tikaBin, parserFactoryFactory, java, timeoutLimits); - } else if (loader != null && parser != null && tikaBin == null && - parserFactoryFactory == null) { + } else if (loader != null + && parser != null + && tikaBin == null + && parserFactoryFactory == null) { return new ForkClient(loader, parser, java, timeoutLimits); - } else if (loader != null && parser == null && tikaBin != null && - parserFactoryFactory != null) { + } else if (loader != null + && parser == null + && tikaBin != null + && parserFactoryFactory != null) { return new ForkClient(tikaBin, parserFactoryFactory, loader, 
java, timeoutLimits); } else { - //TODO: make this more useful + // TODO: make this more useful throw new IllegalStateException("Unexpected combination of state items"); } } @@ -331,8 +317,8 @@ private ForkClient newClient() throws IOException, TikaException { private synchronized void releaseClient(ForkClient client, boolean alive) { currentlyInUse--; if (currentlyInUse + pool.size() < poolSize && alive) { - if (maxFilesProcessedPerClient > 0 && - client.getFilesProcessed() >= maxFilesProcessedPerClient) { + if (maxFilesProcessedPerClient > 0 + && client.getFilesProcessed() >= maxFilesProcessedPerClient) { client.close(); } else { pool.offer(client); @@ -344,10 +330,8 @@ private synchronized void releaseClient(ForkClient client, boolean alive) { } /** - * The amount of time in milliseconds that the server - * should wait before checking to see if the parse has timed out - * or if the wait has timed out - * The default is 5 seconds. + * The amount of time in milliseconds that the server should wait before checking to see if the + * parse has timed out or if the wait has timed out The default is 5 seconds. * * @param serverPulseMillis milliseconds to sleep before checking if there has been any activity */ @@ -356,9 +340,8 @@ public void setServerPulseMillis(long serverPulseMillis) { } /** - * The maximum amount of time allowed for the server to try to parse a file. - * If more than this time elapses, the server shuts down, and the ForkParser - * throws an exception. + * The maximum amount of time allowed for the server to try to parse a file. If more than this + * time elapses, the server shuts down, and the ForkParser throws an exception. * * @param serverParseTimeoutMillis */ @@ -367,9 +350,9 @@ public void setServerParseTimeoutMillis(long serverParseTimeoutMillis) { } /** - * The maximum amount of time allowed for the server to wait for a new request to parse - * a file. 
The server will shutdown after this amount of time, and a new server will have - * to be started by a new client. + * The maximum amount of time allowed for the server to wait for a new request to parse a file. + * The server will shutdown after this amount of time, and a new server will have to be started + * by a new client. * * @param serverWaitTimeoutMillis */ @@ -378,17 +361,15 @@ public void setServerWaitTimeoutMillis(long serverWaitTimeoutMillis) { } /** - * If there is a slowly building memory leak in one of the parsers, - * it is useful to set a limit on the number of files processed - * by a server before it is shutdown and restarted. Default value is -1. + * If there is a slowly building memory leak in one of the parsers, it is useful to set a limit + * on the number of files processed by a server before it is shutdown and restarted. Default + * value is -1. * - * @param maxFilesProcessedPerClient maximum number of files that a server can handle - * before the parser shuts down a client and creates - * a new process. If set to -1, the server is never restarted - * because of the number of files handled. + * @param maxFilesProcessedPerClient maximum number of files that a server can handle before the + * parser shuts down a client and creates a new process. If set to -1, the server is never + * restarted because of the number of files handled. 
*/ public void setMaxFilesProcessedPerServer(int maxFilesProcessedPerClient) { this.maxFilesProcessedPerClient = maxFilesProcessedPerClient; } - } diff --git a/tika-core/src/main/java/org/apache/tika/fork/ForkProxy.java b/tika-core/src/main/java/org/apache/tika/fork/ForkProxy.java index b10eac8ded..01aefc5b5b 100644 --- a/tika-core/src/main/java/org/apache/tika/fork/ForkProxy.java +++ b/tika-core/src/main/java/org/apache/tika/fork/ForkProxy.java @@ -23,5 +23,4 @@ public interface ForkProxy extends Serializable { void init(DataInputStream input, DataOutputStream output); - } diff --git a/tika-core/src/main/java/org/apache/tika/fork/ForkResource.java b/tika-core/src/main/java/org/apache/tika/fork/ForkResource.java index 9bbd82bdd5..89dc932890 100644 --- a/tika-core/src/main/java/org/apache/tika/fork/ForkResource.java +++ b/tika-core/src/main/java/org/apache/tika/fork/ForkResource.java @@ -23,5 +23,4 @@ public interface ForkResource { Throwable process(DataInputStream input, DataOutputStream output) throws IOException; - } diff --git a/tika-core/src/main/java/org/apache/tika/fork/ForkServer.java b/tika-core/src/main/java/org/apache/tika/fork/ForkServer.java index c3249c1d1f..21002d12f5 100644 --- a/tika-core/src/main/java/org/apache/tika/fork/ForkServer.java +++ b/tika-core/src/main/java/org/apache/tika/fork/ForkServer.java @@ -26,11 +26,9 @@ import java.lang.reflect.InvocationTargetException; import java.lang.reflect.Method; import java.net.URL; - -import org.xml.sax.SAXException; - import org.apache.tika.exception.TikaException; import org.apache.tika.parser.ParserFactory; +import org.xml.sax.SAXException; class ForkServer implements Runnable { @@ -52,36 +50,39 @@ class ForkServer implements Runnable { public static final byte INIT_LOADER_PARSER = 7; public static final byte INIT_PARSER_FACTORY_FACTORY_LOADER = 8; private final Object[] lock = new Object[0]; - /** - * Input stream for reading from the parent process - */ + + /** Input stream for reading from the 
parent process */ private final DataInputStream input; - /** - * Output stream for writing to the parent process - */ + + /** Output stream for writing to the parent process */ private final DataOutputStream output; + private final boolean active = true; - //milliseconds to sleep before checking to see if there has been any reading/writing - //If no reading or writing in this time, shutdown the server. + // milliseconds to sleep before checking to see if there has been any reading/writing + // If no reading or writing in this time, shutdown the server. private long serverPulseMillis = 5000; private long serverParserTimeoutMillis = 60000; private long serverWaitTimeoutMillis = 60000; - //can't be class Parser because then you'd - //have to include that in bootstrap jar (legacy mode) + // can't be class Parser because then you'd + // have to include that in bootstrap jar (legacy mode) private Object parser; private ClassLoader classLoader; private boolean parsing = false; private long since; + /** - * Sets up a forked server instance using the given stdin/out - * communication channel. + * Sets up a forked server instance using the given stdin/out communication channel. 
* - * @param input input stream for reading from the parent process + * @param input input stream for reading from the parent process * @param output output stream for writing to the parent process * @throws IOException if the server instance could not be created */ - public ForkServer(InputStream input, OutputStream output, long serverPulseMillis, - long serverParserTimeoutMillis, long serverWaitTimeoutMillis) + public ForkServer( + InputStream input, + OutputStream output, + long serverPulseMillis, + long serverParserTimeoutMillis, + long serverWaitTimeoutMillis) throws IOException { this.input = new DataInputStream(input); this.output = new DataOutputStream(output); @@ -93,10 +94,9 @@ public ForkServer(InputStream input, OutputStream output, long serverPulseMillis } /** - * Starts a forked server process using the standard input and output - * streams for communication with the parent process. Any attempts by - * stray code to read from standard input or write to standard output - * is redirected to avoid interfering with the communication channel. + * Starts a forked server process using the standard input and output streams for communication + * with the parent process. Any attempts by stray code to read from standard input or write to + * standard output is redirected to avoid interfering with the communication channel. 
* * @param args command line arguments, ignored * @throws Exception if the server could not be started @@ -109,7 +109,11 @@ public static void main(String[] args) throws Exception { URL.setURLStreamHandlerFactory(new MemoryURLStreamHandlerFactory()); ForkServer server = - new ForkServer(System.in, System.out, serverPulseMillis, serverParseTimeoutMillis, + new ForkServer( + System.in, + System.out, + serverPulseMillis, + serverParseTimeoutMillis, serverWaitTimeoutMillis); System.setIn(new ByteArrayInputStream(new byte[0])); System.setOut(System.err); @@ -128,8 +132,9 @@ public void run() { long elapsed = System.currentTimeMillis() - since; if (parsing && elapsed > serverParserTimeoutMillis) { break; - } else if (!parsing && serverWaitTimeoutMillis > 0 && - elapsed > serverWaitTimeoutMillis) { + } else if (!parsing + && serverWaitTimeoutMillis > 0 + && elapsed > serverWaitTimeoutMillis) { break; } } @@ -137,12 +142,12 @@ public void run() { } System.exit(0); } catch (InterruptedException e) { - //swallow + // swallow } } public void processRequests() { - //initialize + // initialize try { initializeParserAndLoader(); } catch (Throwable t) { @@ -157,7 +162,7 @@ public void processRequests() { } return; } - //main loop + // main loop try { while (true) { int request = input.read(); @@ -192,7 +197,7 @@ private void initializeParserAndLoader() switch (configIndex) { case INIT_PARSER_FACTORY_FACTORY: if (firstObject instanceof ParserFactoryFactory) { - //the user has submitted a parser factory, but no class loader + // the user has submitted a parser factory, but no class loader classLoader = ForkServer.class.getClassLoader(); ParserFactory parserFactory = ((ParserFactoryFactory) firstObject).build(); parser = parserFactory.build(); @@ -205,7 +210,7 @@ private void initializeParserAndLoader() if (firstObject instanceof ClassLoader) { classLoader = (ClassLoader) firstObject; Thread.currentThread().setContextClassLoader(classLoader); - //parser from parent process + // 
parser from parent process parser = readObject(classLoader); } else { throw new IllegalArgumentException( @@ -214,7 +219,7 @@ private void initializeParserAndLoader() break; case INIT_PARSER_FACTORY_FACTORY_LOADER: if (firstObject instanceof ParserFactoryFactory) { - //the user has submitted a parser factory and a class loader + // the user has submitted a parser factory and a class loader ParserFactory parserFactory = ((ParserFactoryFactory) firstObject).build(); parser = parserFactory.build(); classLoader = (ClassLoader) readObject(ForkServer.class.getClassLoader()); @@ -255,7 +260,6 @@ private void call(ClassLoader loader, Object object) throws Exception { te.setStackTrace(toSend.getStackTrace()); ForkObjectInputStream.sendObject(te, output); } - } } finally { synchronized (lock) { @@ -281,12 +285,12 @@ private Method getMethod(Object object, String name) { } /** - * Deserializes an object from the given stream. The serialized object - * is expected to be preceded by a size integer, that is used for reading - * the entire serialization into a memory before deserializing it. + * Deserializes an object from the given stream. The serialized object is expected to be + * preceded by a size integer, that is used for reading the entire serialization into a memory + * before deserializing it. 
* * @param loader class loader to be used for loading referenced classes - * @throws IOException if the object could not be deserialized + * @throws IOException if the object could not be deserialized * @throws ClassNotFoundException if a referenced class is not found */ private Object readObject(ClassLoader loader) throws IOException, ClassNotFoundException { diff --git a/tika-core/src/main/java/org/apache/tika/fork/InputStreamProxy.java b/tika-core/src/main/java/org/apache/tika/fork/InputStreamProxy.java index cca9b74711..d6b9ed5d4e 100644 --- a/tika-core/src/main/java/org/apache/tika/fork/InputStreamProxy.java +++ b/tika-core/src/main/java/org/apache/tika/fork/InputStreamProxy.java @@ -23,9 +23,7 @@ class InputStreamProxy extends InputStream implements ForkProxy { - /** - * Serial version UID - */ + /** Serial version UID */ private static final long serialVersionUID = 4350939227765568438L; private final int resource; @@ -69,5 +67,4 @@ public int read(byte[] b, int off, int len) throws IOException { } return n; } - } diff --git a/tika-core/src/main/java/org/apache/tika/fork/InputStreamResource.java b/tika-core/src/main/java/org/apache/tika/fork/InputStreamResource.java index 04ba93cdcb..2fd768e2c7 100644 --- a/tika-core/src/main/java/org/apache/tika/fork/InputStreamResource.java +++ b/tika-core/src/main/java/org/apache/tika/fork/InputStreamResource.java @@ -48,5 +48,4 @@ public Throwable process(DataInputStream input, DataOutputStream output) throws output.flush(); return null; } - } diff --git a/tika-core/src/main/java/org/apache/tika/fork/MemoryURLConnection.java b/tika-core/src/main/java/org/apache/tika/fork/MemoryURLConnection.java index 74a16878dc..8afea7cc34 100644 --- a/tika-core/src/main/java/org/apache/tika/fork/MemoryURLConnection.java +++ b/tika-core/src/main/java/org/apache/tika/fork/MemoryURLConnection.java @@ -31,12 +31,10 @@ class MemoryURLConnection extends URLConnection { } @Override - public void connect() { - } + public void connect() {} 
@Override public InputStream getInputStream() { return new ByteArrayInputStream(data); } - } diff --git a/tika-core/src/main/java/org/apache/tika/fork/MemoryURLStreamHandler.java b/tika-core/src/main/java/org/apache/tika/fork/MemoryURLStreamHandler.java index bfbb886c21..23a5270bdd 100644 --- a/tika-core/src/main/java/org/apache/tika/fork/MemoryURLStreamHandler.java +++ b/tika-core/src/main/java/org/apache/tika/fork/MemoryURLStreamHandler.java @@ -31,8 +31,7 @@ class MemoryURLStreamHandler extends URLStreamHandler { private static final AtomicInteger counter = new AtomicInteger(); - private static final List records = - new LinkedList<>(); + private static final List records = new LinkedList<>(); public static URL createURL(byte[] data) { try { @@ -64,5 +63,4 @@ protected URLConnection openConnection(URL u) throws IOException { } throw new IOException("Unknown URL: " + u); } - } diff --git a/tika-core/src/main/java/org/apache/tika/fork/MemoryURLStreamHandlerFactory.java b/tika-core/src/main/java/org/apache/tika/fork/MemoryURLStreamHandlerFactory.java index 5f3d818d92..4e07759567 100644 --- a/tika-core/src/main/java/org/apache/tika/fork/MemoryURLStreamHandlerFactory.java +++ b/tika-core/src/main/java/org/apache/tika/fork/MemoryURLStreamHandlerFactory.java @@ -28,5 +28,4 @@ public URLStreamHandler createURLStreamHandler(String protocol) { return null; } } - } diff --git a/tika-core/src/main/java/org/apache/tika/fork/MemoryURLStreamRecord.java b/tika-core/src/main/java/org/apache/tika/fork/MemoryURLStreamRecord.java index 8a72035fda..52a55e4df0 100644 --- a/tika-core/src/main/java/org/apache/tika/fork/MemoryURLStreamRecord.java +++ b/tika-core/src/main/java/org/apache/tika/fork/MemoryURLStreamRecord.java @@ -23,5 +23,4 @@ class MemoryURLStreamRecord { public WeakReference url; public byte[] data; - } diff --git a/tika-core/src/main/java/org/apache/tika/fork/MetadataContentHandler.java b/tika-core/src/main/java/org/apache/tika/fork/MetadataContentHandler.java index 
c1f1f5612b..1c99900fa5 100644 --- a/tika-core/src/main/java/org/apache/tika/fork/MetadataContentHandler.java +++ b/tika-core/src/main/java/org/apache/tika/fork/MetadataContentHandler.java @@ -16,12 +16,11 @@ */ package org.apache.tika.fork; +import org.apache.tika.metadata.Metadata; import org.xml.sax.Attributes; import org.xml.sax.SAXException; import org.xml.sax.helpers.DefaultHandler; -import org.apache.tika.metadata.Metadata; - class MetadataContentHandler extends DefaultHandler { private final Metadata metadata; @@ -38,5 +37,4 @@ public void startElement(String uri, String local, String name, Attributes attri metadata.add(aname, content); } } - } diff --git a/tika-core/src/main/java/org/apache/tika/fork/ParserFactoryFactory.java b/tika-core/src/main/java/org/apache/tika/fork/ParserFactoryFactory.java index 580b1ef435..4e04498869 100644 --- a/tika-core/src/main/java/org/apache/tika/fork/ParserFactoryFactory.java +++ b/tika-core/src/main/java/org/apache/tika/fork/ParserFactoryFactory.java @@ -17,23 +17,19 @@ package org.apache.tika.fork; - import java.io.Serializable; import java.lang.reflect.Constructor; import java.util.Map; - import org.apache.tika.exception.TikaException; import org.apache.tika.parser.ParserFactory; /** - * Lightweight, easily serializable class that contains enough information - * to build a {@link ParserFactory} + * Lightweight, easily serializable class that contains enough information to build a {@link + * ParserFactory} */ public class ParserFactoryFactory implements Serializable { - /** - * Serial version UID - */ + /** Serial version UID */ private static final long serialVersionUID = 4710974869988895410L; private final String className; @@ -53,5 +49,4 @@ public ParserFactory build() throws TikaException { throw new TikaException("Couldn't create factory", e); } } - } diff --git a/tika-core/src/main/java/org/apache/tika/fork/RecursiveMetadataContentHandlerProxy.java 
b/tika-core/src/main/java/org/apache/tika/fork/RecursiveMetadataContentHandlerProxy.java index 348c33d92d..05ecd93a05 100644 --- a/tika-core/src/main/java/org/apache/tika/fork/RecursiveMetadataContentHandlerProxy.java +++ b/tika-core/src/main/java/org/apache/tika/fork/RecursiveMetadataContentHandlerProxy.java @@ -23,22 +23,18 @@ import java.io.NotSerializableException; import java.io.ObjectOutputStream; import java.io.Serializable; - -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.sax.ContentHandlerFactory; import org.apache.tika.sax.RecursiveParserWrapperHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; /** - *

This class calls #toString() on the ContentHandler, inserts it into the Metadata object - * and serializes the Metadata object. - *

- * Ideally, this would serialize the ContentHandler and the Metadata object as separate objects, - * but we can't guarantee that the ContentHandler is Serializable (e.g. the StringWriter in - * the WriteOutContentHandler). + * This class calls #toString() on the ContentHandler, inserts it into the Metadata object and + * serializes the Metadata object. Ideally, this would serialize the ContentHandler and the Metadata + * object as separate objects, but we can't guarantee that the ContentHandler is Serializable (e.g. + * the StringWriter in the WriteOutContentHandler). */ class RecursiveMetadataContentHandlerProxy extends RecursiveParserWrapperHandler implements ForkProxy { @@ -49,17 +45,15 @@ class RecursiveMetadataContentHandlerProxy extends RecursiveParserWrapperHandler public static final byte METADATA_ONLY = 4; public static final byte COMPLETE = 5; - /** - * Serial version UID - */ + /** Serial version UID */ private static final long serialVersionUID = 737511106054617524L; private final int resource; private transient DataOutputStream output; - public RecursiveMetadataContentHandlerProxy(int resource, - ContentHandlerFactory contentHandlerFactory) { + public RecursiveMetadataContentHandlerProxy( + int resource, ContentHandlerFactory contentHandlerFactory) { super(contentHandlerFactory); this.resource = resource; } @@ -82,8 +76,9 @@ public void endDocument(ContentHandler contentHandler, Metadata metadata) throws proxyBackToClient(MAIN_DOCUMENT, contentHandler, metadata); } - private void proxyBackToClient(int embeddedOrMainDocument, ContentHandler contentHandler, - Metadata metadata) throws SAXException { + private void proxyBackToClient( + int embeddedOrMainDocument, ContentHandler contentHandler, Metadata metadata) + throws SAXException { try { output.write(ForkServer.RESOURCE); output.writeByte(resource); @@ -95,7 +90,7 @@ private void proxyBackToClient(int embeddedOrMainDocument, ContentHandler conten bytes = serialize(contentHandler); success = true; } 
catch (NotSerializableException e) { - //object lied + // object lied } if (success) { @@ -106,9 +101,9 @@ private void proxyBackToClient(int embeddedOrMainDocument, ContentHandler conten return; } } - //if contenthandler is not allegedly or actually Serializable - //fall back to adding contentHandler.toString() to the metadata object - //and send that. + // if contenthandler is not allegedly or actually Serializable + // fall back to adding contentHandler.toString() to the metadata object + // and send that. metadata.set(TikaCoreProperties.TIKA_CONTENT, contentHandler.toString()); output.writeByte(METADATA_ONLY); send(metadata); @@ -132,16 +127,15 @@ private void sendBytes(byte[] bytes) throws IOException { } private byte[] serialize(Object object) throws IOException { - //can't figure out why I'm getting an IllegalAccessException - //when I try to use ForkedObjectInputStream, but - //not when I do this manually ?! + // can't figure out why I'm getting an IllegalAccessException + // when I try to use ForkedObjectInputStream, but + // not when I do this manually ?! 
ByteArrayOutputStream bos = new ByteArrayOutputStream(); try (ObjectOutputStream oos = new ObjectOutputStream(bos)) { oos.writeObject(object); oos.flush(); } return bos.toByteArray(); - } private void doneSending() throws SAXException { diff --git a/tika-core/src/main/java/org/apache/tika/fork/RecursiveMetadataContentHandlerResource.java b/tika-core/src/main/java/org/apache/tika/fork/RecursiveMetadataContentHandlerResource.java index 638e24daab..77fcc5a095 100644 --- a/tika-core/src/main/java/org/apache/tika/fork/RecursiveMetadataContentHandlerResource.java +++ b/tika-core/src/main/java/org/apache/tika/fork/RecursiveMetadataContentHandlerResource.java @@ -19,14 +19,12 @@ import java.io.DataInputStream; import java.io.DataOutputStream; import java.io.IOException; - -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; -import org.xml.sax.helpers.DefaultHandler; - import org.apache.tika.metadata.Metadata; import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler; import org.apache.tika.sax.RecursiveParserWrapperHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; class RecursiveMetadataContentHandlerResource implements ForkResource { @@ -51,14 +49,14 @@ private void internalProcess(DataInputStream input) throws IOException, SAXExcep byte handlerAndMetadataOrMetadataOnly = input.readByte(); ContentHandler localContentHandler = DEFAULT_HANDLER; - if (handlerAndMetadataOrMetadataOnly == - RecursiveMetadataContentHandlerProxy.HANDLER_AND_METADATA) { + if (handlerAndMetadataOrMetadataOnly + == RecursiveMetadataContentHandlerProxy.HANDLER_AND_METADATA) { localContentHandler = (ContentHandler) readObject(input); - } else if (handlerAndMetadataOrMetadataOnly != - RecursiveMetadataContentHandlerProxy.METADATA_ONLY) { + } else if (handlerAndMetadataOrMetadataOnly + != RecursiveMetadataContentHandlerProxy.METADATA_ONLY) { throw new IllegalArgumentException( - "Expected 
HANDLER_AND_METADATA or METADATA_ONLY, but got:" + - handlerAndMetadataOrMetadataOnly); + "Expected HANDLER_AND_METADATA or METADATA_ONLY, but got:" + + handlerAndMetadataOrMetadataOnly); } Metadata metadata = (Metadata) readObject(input); @@ -82,6 +80,5 @@ private Object readObject(DataInputStream inputStream) throws IOException { } catch (ClassNotFoundException e) { throw new IOException(e); } - } } diff --git a/tika-core/src/main/java/org/apache/tika/fork/TimeoutLimits.java b/tika-core/src/main/java/org/apache/tika/fork/TimeoutLimits.java index 6610437c80..254783a191 100644 --- a/tika-core/src/main/java/org/apache/tika/fork/TimeoutLimits.java +++ b/tika-core/src/main/java/org/apache/tika/fork/TimeoutLimits.java @@ -22,7 +22,6 @@ class TimeoutLimits { private final long parseTimeoutMS; private final long waitTimeoutMS; - TimeoutLimits(long pulseMS, long parseTimeoutMS, long waitTimeoutMS) { this.pulseMS = pulseMS; this.parseTimeoutMS = parseTimeoutMS; diff --git a/tika-core/src/main/java/org/apache/tika/fork/package-info.java b/tika-core/src/main/java/org/apache/tika/fork/package-info.java index 74cdd062d2..f03be88c31 100644 --- a/tika-core/src/main/java/org/apache/tika/fork/package-info.java +++ b/tika-core/src/main/java/org/apache/tika/fork/package-info.java @@ -15,8 +15,6 @@ * limitations under the License. */ -/** - * Forked parser. - */ +/** Forked parser. */ @aQute.bnd.annotation.Version("1.0.0") package org.apache.tika.fork; diff --git a/tika-core/src/main/java/org/apache/tika/io/BoundedInputStream.java b/tika-core/src/main/java/org/apache/tika/io/BoundedInputStream.java index 31290cc1a9..5d2a4aaee9 100644 --- a/tika-core/src/main/java/org/apache/tika/io/BoundedInputStream.java +++ b/tika-core/src/main/java/org/apache/tika/io/BoundedInputStream.java @@ -21,16 +21,14 @@ import java.io.OutputStream; /** - * Very slight modification of Commons' BoundedInputStream - * so that we can figure out if this hit the bound or not. - *

- * This relies on IOUtils' skip and read to try to fully - * read/skip inputstream. + * Very slight modification of Commons' BoundedInputStream so that we can figure out if this hit the + * bound or not. + * + *

This relies on IOUtils' skip and read to try to fully read/skip inputstream. */ public class BoundedInputStream extends InputStream { - - private final static int EOF = -1; + private static final int EOF = -1; private final long max; private final InputStream in; private long pos; @@ -54,8 +52,7 @@ public int read() throws IOException { * Invokes the delegate's read(byte[]) method. * * @param b the buffer to read the bytes into - * @return the number of bytes read or -1 if the end of stream or - * the limit has been reached. + * @return the number of bytes read or -1 if the end of stream or the limit has been reached. * @throws IOException if an I/O error occurs */ @Override @@ -65,14 +62,13 @@ public int read(final byte[] b) throws IOException { /** * Invokes the delegate's read(byte[], int, int) method. - *

- * This does not have the same guarantees as IOUtil's readFully()...be careful. * - * @param b the buffer to read the bytes into + *

This does not have the same guarantees as IOUtil's readFully()...be careful. + * + * @param b the buffer to read the bytes into * @param off The start offset * @param len The number of bytes to read - * @return the number of bytes read or -1 if the end of stream or - * the limit has been reached. + * @return the number of bytes read or -1 if the end of stream or the limit has been reached. * @throws IOException if an I/O error occurs */ @Override @@ -92,9 +88,8 @@ public int read(final byte[] b, final int off, final int len) throws IOException } /** - * Invokes the delegate's skip(long) method. - * As with InputStream generally, this does not guarantee reading n bytes. - * Use IOUtils' skipFully for that functionality. + * Invokes the delegate's skip(long) method. As with InputStream generally, this + * does not guarantee reading n bytes. Use IOUtils' skipFully for that functionality. * * @param n the number of bytes to skip * @return the actual number of bytes skipped diff --git a/tika-core/src/main/java/org/apache/tika/io/EndianUtils.java b/tika-core/src/main/java/org/apache/tika/io/EndianUtils.java index 242dd8c748..14d859ba12 100644 --- a/tika-core/src/main/java/org/apache/tika/io/EndianUtils.java +++ b/tika-core/src/main/java/org/apache/tika/io/EndianUtils.java @@ -18,16 +18,15 @@ import java.io.IOException; import java.io.InputStream; - import org.apache.tika.exception.TikaException; /** * General Endian Related Utilties. - *

- * This class provides static utility methods for input/output operations - * on numbers in Big and Little Endian formats. - *

- * Origin of code: Based on the version in POI + * + *

This class provides static utility methods for input/output operations on numbers in Big and + * Little Endian formats. + * + *

Origin of code: Based on the version in POI */ public class EndianUtils { private static final int LONG_SIZE = 8; @@ -37,7 +36,7 @@ public class EndianUtils { * * @param stream the InputStream from which the short is to be read * @return the short (16-bit) value - * @throws IOException will be propagated back to the caller + * @throws IOException will be propagated back to the caller * @throws BufferUnderrunException if the stream cannot provide enough bytes */ public static short readShortLE(InputStream stream) @@ -50,7 +49,7 @@ public static short readShortLE(InputStream stream) * * @param stream the InputStream from which the short is to be read * @return the short (16-bit) value - * @throws IOException will be propagated back to the caller + * @throws IOException will be propagated back to the caller * @throws BufferUnderrunException if the stream cannot provide enough bytes */ public static short readShortBE(InputStream stream) @@ -81,7 +80,7 @@ public static int readUShortBE(InputStream stream) throws IOException, BufferUnd * * @param stream the InputStream from which the int is to be read * @return the int (32-bit) value - * @throws IOException will be propagated back to the caller + * @throws IOException will be propagated back to the caller * @throws BufferUnderrunException if the stream cannot provide enough bytes */ public static long readUIntLE(InputStream stream) throws IOException, BufferUnderrunException { @@ -100,7 +99,7 @@ public static long readUIntLE(InputStream stream) throws IOException, BufferUnde * * @param stream the InputStream from which the int is to be read * @return the int (32-bit) value - * @throws IOException will be propagated back to the caller + * @throws IOException will be propagated back to the caller * @throws BufferUnderrunException if the stream cannot provide enough bytes */ public static long readUIntBE(InputStream stream) throws IOException, BufferUnderrunException { @@ -119,7 +118,7 @@ public static long 
readUIntBE(InputStream stream) throws IOException, BufferUnde * * @param stream the InputStream from which the int is to be read * @return the int (32-bit) value - * @throws IOException will be propagated back to the caller + * @throws IOException will be propagated back to the caller * @throws BufferUnderrunException if the stream cannot provide enough bytes */ public static int readIntLE(InputStream stream) throws IOException, BufferUnderrunException { @@ -138,7 +137,7 @@ public static int readIntLE(InputStream stream) throws IOException, BufferUnderr * * @param stream the InputStream from which the int is to be read * @return the int (32-bit) value - * @throws IOException will be propagated back to the caller + * @throws IOException will be propagated back to the caller * @throws BufferUnderrunException if the stream cannot provide enough bytes */ public static int readIntBE(InputStream stream) throws IOException, BufferUnderrunException { @@ -157,7 +156,7 @@ public static int readIntBE(InputStream stream) throws IOException, BufferUnderr * * @param stream the InputStream from which the int is to be read * @return the int (32-bit) value - * @throws IOException will be propagated back to the caller + * @throws IOException will be propagated back to the caller * @throws BufferUnderrunException if the stream cannot provide enough bytes */ public static int readIntME(InputStream stream) throws IOException, BufferUnderrunException { @@ -176,7 +175,7 @@ public static int readIntME(InputStream stream) throws IOException, BufferUnderr * * @param stream the InputStream from which the long is to be read * @return the long (64-bit) value - * @throws IOException will be propagated back to the caller + * @throws IOException will be propagated back to the caller * @throws BufferUnderrunException if the stream cannot provide enough bytes */ public static long readLongLE(InputStream stream) throws IOException, BufferUnderrunException { @@ -192,9 +191,15 @@ public static long 
readLongLE(InputStream stream) throws IOException, BufferUnde throw new BufferUnderrunException(); } - return ((long) ch8 << 56) + ((long) ch7 << 48) + ((long) ch6 << 40) + ((long) ch5 << 32) + - ((long) ch4 << 24) + // cast to long to preserve bit 31 (sign bit for ints) - (ch3 << 16) + (ch2 << 8) + (ch1); + return ((long) ch8 << 56) + + ((long) ch7 << 48) + + ((long) ch6 << 40) + + ((long) ch5 << 32) + + ((long) ch4 << 24) + + // cast to long to preserve bit 31 (sign bit for ints) + (ch3 << 16) + + (ch2 << 8) + + (ch1); } /** @@ -202,7 +207,7 @@ public static long readLongLE(InputStream stream) throws IOException, BufferUnde * * @param stream the InputStream from which the long is to be read * @return the long (64-bit) value - * @throws IOException will be propagated back to the caller + * @throws IOException will be propagated back to the caller * @throws BufferUnderrunException if the stream cannot provide enough bytes */ public static long readLongBE(InputStream stream) throws IOException, BufferUnderrunException { @@ -218,14 +223,20 @@ public static long readLongBE(InputStream stream) throws IOException, BufferUnde throw new BufferUnderrunException(); } - return ((long) ch1 << 56) + ((long) ch2 << 48) + ((long) ch3 << 40) + ((long) ch4 << 32) + - ((long) ch5 << 24) + // cast to long to preserve bit 31 (sign bit for ints) - (ch6 << 16) + (ch7 << 8) + (ch8); + return ((long) ch1 << 56) + + ((long) ch2 << 48) + + ((long) ch3 << 40) + + ((long) ch4 << 32) + + ((long) ch5 << 24) + + // cast to long to preserve bit 31 (sign bit for ints) + (ch6 << 16) + + (ch7 << 8) + + (ch8); } /** - * Gets the integer value that is stored in UTF-8 like fashion, in Big Endian - * but with the high bit on each number indicating if it continues or not + * Gets the integer value that is stored in UTF-8 like fashion, in Big Endian but with the high + * bit on each number indicating if it continues or not */ public static long readUE7(InputStream stream) throws IOException { int i; @@ 
-262,7 +273,7 @@ public static short getShortLE(byte[] data) { /** * Get a LE short value from a byte array * - * @param data the byte array + * @param data the byte array * @param offset a starting offset into the byte array * @return the short (16-bit) value */ @@ -283,7 +294,7 @@ public static int getUShortLE(byte[] data) { /** * Get a LE unsigned short value from a byte array * - * @param data the byte array + * @param data the byte array * @param offset a starting offset into the byte array * @return the unsigned short (16-bit) value in an integer */ @@ -306,7 +317,7 @@ public static short getShortBE(byte[] data) { /** * Get a BE short value from a byte array * - * @param data the byte array + * @param data the byte array * @param offset a starting offset into the byte array * @return the short (16-bit) value */ @@ -327,7 +338,7 @@ public static int getUShortBE(byte[] data) { /** * Get a BE unsigned short value from a byte array * - * @param data the byte array + * @param data the byte array * @param offset a starting offset into the byte array * @return the unsigned short (16-bit) value in an integer */ @@ -350,7 +361,7 @@ public static int getIntLE(byte[] data) { /** * Get a LE int value from a byte array * - * @param data the byte array + * @param data the byte array * @param offset a starting offset into the byte array * @return the int (32-bit) value */ @@ -376,7 +387,7 @@ public static int getIntBE(byte[] data) { /** * Get a BE int value from a byte array * - * @param data the byte array + * @param data the byte array * @param offset a starting offset into the byte array * @return the int (32-bit) value */ @@ -402,7 +413,7 @@ public static long getUIntLE(byte[] data) { /** * Get a LE unsigned int value from a byte array * - * @param data the byte array + * @param data the byte array * @param offset a starting offset into the byte array * @return the unsigned int (32-bit) value in a long */ @@ -424,7 +435,7 @@ public static long getUIntBE(byte[] data) { 
/** * Get a BE unsigned int value from a byte array * - * @param data the byte array + * @param data the byte array * @param offset a starting offset into the byte array * @return the unsigned int (32-bit) value in a long */ @@ -436,7 +447,7 @@ public static long getUIntBE(byte[] data, int offset) { /** * Get a LE long value from a byte array * - * @param data the byte array + * @param data the byte array * @param offset a starting offset into the byte array * @return the long (64-bit) value */ @@ -451,8 +462,7 @@ public static long getLongLE(byte[] data, int offset) { } /** - * Convert an 'unsigned' byte to an integer. ie, don't carry across the - * sign. + * Convert an 'unsigned' byte to an integer. ie, don't carry across the sign. * * @param b Description of the Parameter * @return Description of the Return Value @@ -464,7 +474,7 @@ public static int ubyteToInt(byte b) { /** * get the unsigned value of a byte. * - * @param data the byte array. + * @param data the byte array. * @param offset a starting offset into the byte array. 
* @return the unsigned value of the byte as a 16 bit short */ @@ -472,7 +482,6 @@ public static short getUByte(byte[] data, int offset) { return (short) (data[offset] & 0xFF); } - public static class BufferUnderrunException extends TikaException { private static final long serialVersionUID = 8358288231138076276L; diff --git a/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java b/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java index 17bc9e9207..fccf0c9715 100644 --- a/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java +++ b/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java @@ -18,23 +18,18 @@ import java.util.HashSet; import java.util.Locale; - import org.apache.tika.utils.StringUtils; - public class FilenameUtils { + /** Reserved characters */ + public static final char[] RESERVED_FILENAME_CHARACTERS = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, + 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, + 0x1E, 0x1F, '?', ':', '*', '<', '>', '|' + }; - /** - * Reserved characters - */ - public final static char[] RESERVED_FILENAME_CHARACTERS = - {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, - 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, - 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, '?', ':', '*', '<', '>', '|'}; - - private final static HashSet RESERVED = new HashSet<>(38); - + private static final HashSet RESERVED = new HashSet<>(38); static { for (char reservedFilenameCharacter : RESERVED_FILENAME_CHARACTERS) { @@ -42,13 +37,12 @@ public class FilenameUtils { } } - /** - * Scans the given file name for reserved characters on different OSs and - * file systems and returns a sanitized version of the name with the - * reserved chars replaced by their hexadecimal value. - *

- * For example why?.zip will be converted into why%3F.zip + * Scans the given file name for reserved characters on different OSs and file systems and + * returns a sanitized version of the name with the reserved chars replaced by their hexadecimal + * value. + * + *

For example why?.zip will be converted into why%3F.zip * * @param name the file name to be normalized - NOT NULL * @return the normalized file name @@ -63,7 +57,8 @@ public static String normalize(final String name) { for (char c : name.toCharArray()) { if (RESERVED.contains(c)) { - sb.append('%').append((c < 16) ? "0" : "") + sb.append('%') + .append((c < 16) ? "0" : "") .append(Integer.toHexString(c).toUpperCase(Locale.ROOT)); } else { sb.append(c); @@ -74,20 +69,17 @@ public static String normalize(final String name) { } /** - * This is a duplication of the algorithm and functionality - * available in commons io FilenameUtils. If Java's File were - * able handle Windows file paths correctly in linux, - * we wouldn't need this. - *

- * The goal of this is to get a filename from a path. - * The package parsers and some other embedded doc - * extractors could put anything into TikaCoreProperties.RESOURCE_NAME_KEY. - *

- * If a careless client used that filename as if it were a - * filename and not a path when writing embedded files, - * bad things could happen. Consider: "../../../my_ppt.ppt". - *

- * Consider using this in combination with {@link #normalize(String)}. + * This is a duplication of the algorithm and functionality available in commons io + * FilenameUtils. If Java's File were able handle Windows file paths correctly in linux, we + * wouldn't need this. + * + *

The goal of this is to get a filename from a path. The package parsers and some other + * embedded doc extractors could put anything into TikaCoreProperties.RESOURCE_NAME_KEY. + * + *

If a careless client used that filename as if it were a filename and not a path when + * writing embedded files, bad things could happen. Consider: "../../../my_ppt.ppt". + * + *

Consider using this in combination with {@link #normalize(String)}. * * @param path path to strip * @return empty string or a filename, never null @@ -99,8 +91,8 @@ public static String getName(final String path) { } int unix = path.lastIndexOf("/"); int windows = path.lastIndexOf("\\"); - //some macintosh file names are stored with : as the delimiter - //also necessary to properly handle C:somefilename + // some macintosh file names are stored with : as the delimiter + // also necessary to properly handle C:somefilename int colon = path.lastIndexOf(":"); String cand = path.substring(Math.max(colon, Math.max(unix, windows)) + 1); if (cand.equals("..") || cand.equals(".")) { @@ -111,13 +103,14 @@ public static String getName(final String path) { /** * This includes the period, e.g. ".pdf" + * * @param path * @return the suffix or an empty string if one could not be found */ public static String getSuffixFromPath(String path) { String n = getName(path); int i = n.lastIndexOf("."); - //arbitrarily sets max extension length + // arbitrarily sets max extension length if (i > -1 && n.length() - i < 6) { return n.substring(i); } diff --git a/tika-core/src/main/java/org/apache/tika/io/IOUtils.java b/tika-core/src/main/java/org/apache/tika/io/IOUtils.java index 247705b0d5..f96935e1d9 100644 --- a/tika-core/src/main/java/org/apache/tika/io/IOUtils.java +++ b/tika-core/src/main/java/org/apache/tika/io/IOUtils.java @@ -36,7 +36,8 @@ public static long skip(final InputStream input, final long toSkip, byte[] buffe */ long remain = toSkip; while (remain > 0) { - // See https://issues.apache.org/jira/browse/IO-203 for why we use read() rather than delegating to skip() + // See https://issues.apache.org/jira/browse/IO-203 for why we use read() rather than + // delegating to skip() final long n = input.read(buffer, 0, (int) Math.min(remain, buffer.length)); if (n < 0) { // EOF break; diff --git a/tika-core/src/main/java/org/apache/tika/io/InputStreamFactory.java 
b/tika-core/src/main/java/org/apache/tika/io/InputStreamFactory.java index 17e416a579..6ca0e30a57 100644 --- a/tika-core/src/main/java/org/apache/tika/io/InputStreamFactory.java +++ b/tika-core/src/main/java/org/apache/tika/io/InputStreamFactory.java @@ -20,14 +20,13 @@ import java.io.InputStream; /** - *

A factory which returns a fresh {@link InputStream} for the same - * resource each time.

- *

This is typically desired where it is easier / quicker / simpler to - * fetch a fresh {@link InputStream} to re-read a given resource, rather - * than do any kind of buffering.

- *

It is typically used with {@link TikaInputStream#get(InputStreamFactory)} - * when combined with a Parser which needs to read the resource's stream - * multiple times when processing.

+ * A factory which returns a fresh {@link InputStream} for the same resource each time. + * + *

This is typically desired where it is easier / quicker / simpler to fetch a fresh {@link + * InputStream} to re-read a given resource, rather than do any kind of buffering. + * + *

It is typically used with {@link TikaInputStream#get(InputStreamFactory)} when combined with a + * Parser which needs to read the resource's stream multiple times when processing. */ public interface InputStreamFactory { InputStream getInputStream() throws IOException; diff --git a/tika-core/src/main/java/org/apache/tika/io/LookaheadInputStream.java b/tika-core/src/main/java/org/apache/tika/io/LookaheadInputStream.java index 32e671e78b..146d83008e 100644 --- a/tika-core/src/main/java/org/apache/tika/io/LookaheadInputStream.java +++ b/tika-core/src/main/java/org/apache/tika/io/LookaheadInputStream.java @@ -20,24 +20,23 @@ import java.io.InputStream; /** - * Stream wrapper that make it easy to read up to n bytes ahead from - * a stream that supports the mark feature. This class insulates the - * underlying stream from things like possible mark(), reset() and close() - * calls by external components that might otherwise invalidate the marked - * state of a stream. - *

- * The recommended usage pattern of this class is: + * Stream wrapper that make it easy to read up to n bytes ahead from a stream that supports the mark + * feature. This class insulates the underlying stream from things like possible mark(), reset() and + * close() calls by external components that might otherwise invalidate the marked state of a + * stream. + * + *

The recommended usage pattern of this class is: + * *

  *     try (InputStream lookahead = new LookaheadInputStream(stream, n)) {
  *         processStream(lookahead);
  *     }
  * 
- *

- * This usage pattern guarantees that only up to n bytes from the original - * stream can ever be read, and that the stream will have been marked and - * then reset to its original state once the above code block exits. No - * code in the fictional processStream() method can affect the the state of - * the original stream. + * + *

This usage pattern guarantees that only up to n bytes from the original stream can ever be + * read, and that the stream will have been marked and then reset to its original state once the + * above code block exits. No code in the fictional processStream() method can affect the the state + * of the original stream. * * @since Apache Tika 0.10 */ @@ -52,14 +51,13 @@ public class LookaheadInputStream extends InputStream { private int mark = 0; /** - * Creates a lookahead wrapper for the given input stream. - * The given input stream should support the mark feature, - * as otherwise the state of that stream will be undefined - * after the lookahead wrapper has been closed. As a special - * case a null stream is treated as an empty stream. + * Creates a lookahead wrapper for the given input stream. The given input stream should support + * the mark feature, as otherwise the state of that stream will be undefined after the lookahead + * wrapper has been closed. As a special case a null stream is treated as an empty + * stream. * * @param stream input stream, can be null - * @param n maximum number of bytes to look ahead + * @param n maximum number of bytes to look ahead */ public LookaheadInputStream(InputStream stream, int n) { this.stream = stream; @@ -138,5 +136,4 @@ public synchronized void mark(int readlimit) { public synchronized void reset() { position = mark; } - } diff --git a/tika-core/src/main/java/org/apache/tika/io/TailStream.java b/tika-core/src/main/java/org/apache/tika/io/TailStream.java index a1621c20b3..5f335b3ba4 100644 --- a/tika-core/src/main/java/org/apache/tika/io/TailStream.java +++ b/tika-core/src/main/java/org/apache/tika/io/TailStream.java @@ -21,66 +21,46 @@ import java.io.InputStream; /** - *

- * A specialized input stream implementation which records the last portion read - * from an underlying stream. - *

- *

- * This stream implementation is useful to deal with information which is known - * to be located at the end of a stream (e.g. ID3 v1 tags). While reading bytes - * from the underlying stream, a given number of bytes is kept in an internal - * buffer. This buffer can then be queried after the whole stream was read. It - * contains the last bytes read from the original input stream. - *

+ * A specialized input stream implementation which records the last portion read from an underlying + * stream. * - * @param in the underlying input stream + *

This stream implementation is useful to deal with information which is known to be located at + * the end of a stream (e.g. ID3 v1 tags). While reading bytes from the underlying stream, a given + * number of bytes is kept in an internal buffer. This buffer can then be queried after the whole + * stream was read. It contains the last bytes read from the original input stream. + * + * @param in the underlying input stream * @param tailSize the size of the tail buffer */ public class TailStream extends FilterInputStream { - /** - * Constant for the default skip buffer size. - */ + /** Constant for the default skip buffer size. */ private static final int SKIP_SIZE = 4096; - /** - * The buffer in which the tail data is stored. - */ + /** The buffer in which the tail data is stored. */ private final byte[] tailBuffer; - /** - * The size of the internal tail buffer. - */ + /** The size of the internal tail buffer. */ private final int tailSize; - /** - * A copy of the internal tail buffer used for mark() operations. - */ + /** A copy of the internal tail buffer used for mark() operations. */ private byte[] markBuffer; - /** - * The number of bytes that have been read so far. - */ + /** The number of bytes that have been read so far. */ private long bytesRead; - /** - * The number of bytes read at the last mark() operation. - */ + /** The number of bytes read at the last mark() operation. */ private long markBytesRead; - /** - * The current index into the tail buffer. - */ + /** The current index into the tail buffer. */ private int currentIndex; - /** - * A copy of the current index used for mark() operations. - */ + /** A copy of the current index used for mark() operations. */ private int markIndex; /** * Creates a new instance of {@code TailStream}. 
* - * @param in the underlying input stream + * @param in the underlying input stream * @param size the size of the tail buffer */ public TailStream(InputStream in, int size) { @@ -89,10 +69,7 @@ public TailStream(InputStream in, int size) { tailBuffer = new byte[size]; } - /** - * {@inheritDoc} This implementation adds the read byte to the internal tail - * buffer. - */ + /** {@inheritDoc} This implementation adds the read byte to the internal tail buffer. */ @Override public int read() throws IOException { int c = super.read(); @@ -103,9 +80,8 @@ public int read() throws IOException { } /** - * {@inheritDoc} This implementation delegates to the underlying stream and - * then adds the correct portion of the read buffer to the internal tail - * buffer. + * {@inheritDoc} This implementation delegates to the underlying stream and then adds the + * correct portion of the read buffer to the internal tail buffer. */ @Override public int read(byte[] buf) throws IOException { @@ -117,9 +93,8 @@ public int read(byte[] buf) throws IOException { } /** - * {@inheritDoc} This implementation delegates to the underlying stream and - * then adds the correct portion of the read buffer to the internal tail - * buffer. + * {@inheritDoc} This implementation delegates to the underlying stream and then adds the + * correct portion of the read buffer to the internal tail buffer. */ @Override public int read(byte[] buf, int ofs, int length) throws IOException { @@ -131,8 +106,8 @@ public int read(byte[] buf, int ofs, int length) throws IOException { } /** - * {@inheritDoc} This implementation delegates to the {@code read()} method - * to ensure that the tail buffer is also filled if data is skipped. + * {@inheritDoc} This implementation delegates to the {@code read()} method to ensure that the + * tail buffer is also filled if data is skipped. 
*/ @Override public long skip(long n) throws IOException { @@ -153,9 +128,8 @@ public long skip(long n) throws IOException { } /** - * {@inheritDoc} This implementation saves the internal state including the - * content of the tail buffer so that it can be restored when ''reset()'' is - * called later. + * {@inheritDoc} This implementation saves the internal state including the content of the tail + * buffer so that it can be restored when ''reset()'' is called later. */ @Override public void mark(int limit) { @@ -166,9 +140,9 @@ public void mark(int limit) { } /** - * {@inheritDoc} This implementation restores this stream's state to the - * state when ''mark()'' was called the last time. If ''mark()'' has not - * been called before, this method has no effect. + * {@inheritDoc} This implementation restores this stream's state to the state when ''mark()'' + * was called the last time. If ''mark()'' has not been called before, this method has no + * effect. */ @Override public void reset() { @@ -180,10 +154,9 @@ public void reset() { } /** - * Returns an array with the last data read from the underlying stream. If - * the underlying stream contained more data than the ''tailSize'' - * constructor argument, the returned array has a length of ''tailSize''. - * Otherwise, its length equals the number of bytes read. + * Returns an array with the last data read from the underlying stream. If the underlying stream + * contained more data than the ''tailSize'' constructor argument, the returned array has a + * length of ''tailSize''. Otherwise, its length equals the number of bytes read. * * @return an array with the last data read from the underlying stream */ @@ -211,8 +184,8 @@ private void appendByte(byte b) { /** * Adds the content of the given buffer to the internal tail buffer. 
* - * @param buf the buffer - * @param ofs the start offset in the buffer + * @param buf the buffer + * @param ofs the start offset in the buffer * @param length the number of bytes to be copied */ private void appendBuf(byte[] buf, int ofs, int length) { @@ -226,12 +199,12 @@ private void appendBuf(byte[] buf, int ofs, int length) { } /** - * Replaces the content of the internal tail buffer by the last portion of - * the given buffer. This method is called if a buffer was read from the - * underlying stream whose length is larger than the tail buffer. + * Replaces the content of the internal tail buffer by the last portion of the given buffer. + * This method is called if a buffer was read from the underlying stream whose length is larger + * than the tail buffer. * - * @param buf the buffer - * @param ofs the start offset in the buffer + * @param buf the buffer + * @param ofs the start offset in the buffer * @param length the number of bytes to be copied */ private void replaceTailBuffer(byte[] buf, int ofs, int length) { @@ -240,13 +213,12 @@ private void replaceTailBuffer(byte[] buf, int ofs, int length) { } /** - * Copies the given buffer into the internal tail buffer at the current - * position. This method is called if a buffer is read from the underlying - * stream whose length is smaller than the tail buffer. In this case the - * tail buffer is only partly overwritten. + * Copies the given buffer into the internal tail buffer at the current position. This method is + * called if a buffer is read from the underlying stream whose length is smaller than the tail + * buffer. In this case the tail buffer is only partly overwritten. 
* - * @param buf the buffer - * @param ofs the start offset in the buffer + * @param buf the buffer + * @param ofs the start offset in the buffer * @param length the number of bytes to be copied */ private void copyToTailBuffer(byte[] buf, int ofs, int length) { diff --git a/tika-core/src/main/java/org/apache/tika/io/TemporaryResources.java b/tika-core/src/main/java/org/apache/tika/io/TemporaryResources.java index c1565ab86d..284ba3dee2 100644 --- a/tika-core/src/main/java/org/apache/tika/io/TemporaryResources.java +++ b/tika-core/src/main/java/org/apache/tika/io/TemporaryResources.java @@ -22,20 +22,18 @@ import java.nio.file.Files; import java.nio.file.Path; import java.util.LinkedList; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.utils.StringUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** - * Utility class for tracking and ultimately closing or otherwise disposing - * a collection of temporary resources. - *

- * Note that this class is not thread-safe. + * Utility class for tracking and ultimately closing or otherwise disposing a collection of + * temporary resources. + * + *

Note that this class is not thread-safe. * * @since Apache Tika 0.10 */ @@ -43,33 +41,27 @@ public class TemporaryResources implements Closeable { private static final Logger LOG = LoggerFactory.getLogger(TemporaryResources.class); - /** - * Tracked resources in LIFO order. - */ + /** Tracked resources in LIFO order. */ private final LinkedList resources = new LinkedList<>(); - /** - * Directory for temporary files, null for the system default. - */ + /** Directory for temporary files, null for the system default. */ private Path tempFileDir = null; /** - * Sets the directory to be used for the temporary files created by - * the {@link #createTempFile(String)} method. + * Sets the directory to be used for the temporary files created by the {@link + * #createTempFile(String)} method. * - * @param tempFileDir temporary file directory, - * or null for the system default + * @param tempFileDir temporary file directory, or null for the system default */ public void setTemporaryFileDirectory(Path tempFileDir) { this.tempFileDir = tempFileDir; } /** - * Sets the directory to be used for the temporary files created by - * the {@link #createTempFile(String)} method. + * Sets the directory to be used for the temporary files created by the {@link + * #createTempFile(String)} method. * - * @param tempFileDir temporary file directory, - * or null for the system default + * @param tempFileDir temporary file directory, or null for the system default * @see #setTemporaryFileDirectory(Path) */ public void setTemporaryFileDirectory(File tempFileDir) { @@ -77,8 +69,9 @@ public void setTemporaryFileDirectory(File tempFileDir) { } /** - * Creates a temporary file that will automatically be deleted when - * the {@link #close()} method is called, returning its path. + * Creates a temporary file that will automatically be deleted when the {@link #close()} method + * is called, returning its path. + * * @param suffix -- the suffix of the file if known, starting with "." 
as in ".pdf" * @return Path to created temporary file that will be deleted after closing * @throws IOException @@ -86,17 +79,20 @@ public void setTemporaryFileDirectory(File tempFileDir) { public Path createTempFile(String suffix) throws IOException { String actualSuffix = StringUtils.isBlank(suffix) ? ".tmp" : suffix; - final Path path = tempFileDir == null ? Files.createTempFile("apache-tika-", actualSuffix) : - Files.createTempFile(tempFileDir, "apache-tika-", actualSuffix); - addResource(() -> { - try { - Files.delete(path); - } catch (IOException e) { - // delete when exit if current delete fail - LOG.warn("delete tmp file fail, will delete it on exit"); - path.toFile().deleteOnExit(); - } - }); + final Path path = + tempFileDir == null + ? Files.createTempFile("apache-tika-", actualSuffix) + : Files.createTempFile(tempFileDir, "apache-tika-", actualSuffix); + addResource( + () -> { + try { + Files.delete(path); + } catch (IOException e) { + // delete when exit if current delete fail + LOG.warn("delete tmp file fail, will delete it on exit"); + path.toFile().deleteOnExit(); + } + }); return path; } @@ -105,8 +101,8 @@ public Path createTempFile() throws IOException { } /** - * Creates a temporary file that will automatically be deleted when - * the {@link #close()} method is called, returning its path. + * Creates a temporary file that will automatically be deleted when the {@link #close()} method + * is called, returning its path. * * @return Path to created temporary file that will be deleted after closing * @throws IOException @@ -118,9 +114,10 @@ public Path createTempFile(Metadata metadata) throws IOException { } return createTempFile(FilenameUtils.getSuffixFromPath(resourceName)); } + /** - * Creates and returns a temporary file that will automatically be - * deleted when the {@link #close()} method is called. + * Creates and returns a temporary file that will automatically be deleted when the {@link + * #close()} method is called. 
* * @return Created temporary file that'll be deleted after closing * @throws IOException @@ -131,8 +128,8 @@ public File createTemporaryFile() throws IOException { } /** - * Adds a new resource to the set of tracked resources that will all be - * closed when the {@link #close()} method is called. + * Adds a new resource to the set of tracked resources that will all be closed when the {@link + * #close()} method is called. * * @param resource resource to be tracked */ @@ -141,8 +138,8 @@ public void addResource(Closeable resource) { } /** - * Returns the latest of the tracked resources that implements or - * extends the given interface or class. + * Returns the latest of the tracked resources that implements or extends the given interface or + * class. * * @param klass interface or class * @return matching resource, or null if not found @@ -158,15 +155,13 @@ public T getResource(Class klass) { } /** - * Closes all tracked resources. The resources are closed in reverse order - * from how they were added. - *

- * Any suppressed exceptions from managed resources are collected and - * then added to the first thrown exception, which is re-thrown once - * all the resources have been closed. + * Closes all tracked resources. The resources are closed in reverse order from how they were + * added. * - * @throws IOException if one or more of the tracked resources - * could not be closed + *

Any suppressed exceptions from managed resources are collected and then added to the first + * thrown exception, which is re-thrown once all the resources have been closed. + * + * @throws IOException if one or more of the tracked resources could not be closed */ public void close() throws IOException { // Release all resources and keep track of any exceptions @@ -191,12 +186,10 @@ public void close() throws IOException { } /** - * Calls the {@link #close()} method and wraps the potential - * {@link IOException} into a {@link TikaException} for convenience - * when used within Tika. + * Calls the {@link #close()} method and wraps the potential {@link IOException} into a {@link + * TikaException} for convenience when used within Tika. * - * @throws TikaException if one or more of the tracked resources - * could not be closed + * @throws TikaException if one or more of the tracked resources could not be closed */ public void dispose() throws TikaException { try { @@ -205,5 +198,4 @@ public void dispose() throws TikaException { throw new TikaException("Failed to close temporary resources", e); } } - } diff --git a/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java b/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java index a70ade4c3f..b121ad4e3c 100644 --- a/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java +++ b/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java @@ -35,97 +35,88 @@ import java.nio.file.Paths; import java.sql.Blob; import java.sql.SQLException; - import org.apache.commons.io.input.TaggedInputStream; import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream; - import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.parser.Parser; import org.apache.tika.utils.StringUtils; /** - * Input stream with extended capabilities. 
The purpose of this class is - * to allow files and other resources and information to be associated with - * the {@link InputStream} instance passed through the - * {@link org.apache.tika.parser.Parser} interface and other similar APIs. - *

- * TikaInputStream instances can be created using the various static - * get() factory methods. Most of these methods take an optional - * {@link Metadata} argument that is then filled with the available input - * metadata from the given resource. The created TikaInputStream instance - * keeps track of the original resource used to create it, while behaving - * otherwise just like a normal, buffered {@link InputStream}. - * A TikaInputStream instance is also guaranteed to support the - * {@link #mark(int)} feature. - *

- * Code that wants to access the underlying file or other resources - * associated with a TikaInputStream should first use the - * {@link #get(InputStream)} factory method to cast or wrap a given - * {@link InputStream} into a TikaInputStream instance. - *

- * TikaInputStream includes a few safety features to protect against parsers - * that may fail to check for an EOF or may incorrectly rely on the unreliable - * value returned from {@link FileInputStream#skip}. These parser failures - * can lead to infinite loops. We strongly encourage the use of - * TikaInputStream. + * Input stream with extended capabilities. The purpose of this class is to allow files and other + * resources and information to be associated with the {@link InputStream} instance passed through + * the {@link org.apache.tika.parser.Parser} interface and other similar APIs. + * + *

TikaInputStream instances can be created using the various static get() factory + * methods. Most of these methods take an optional {@link Metadata} argument that is then filled + * with the available input metadata from the given resource. The created TikaInputStream instance + * keeps track of the original resource used to create it, while behaving otherwise just like a + * normal, buffered {@link InputStream}. A TikaInputStream instance is also guaranteed to support + * the {@link #mark(int)} feature. + * + *

Code that wants to access the underlying file or other resources associated with a + * TikaInputStream should first use the {@link #get(InputStream)} factory method to cast or wrap a + * given {@link InputStream} into a TikaInputStream instance. + * + *

TikaInputStream includes a few safety features to protect against parsers that may fail to + * check for an EOF or may incorrectly rely on the unreliable value returned from {@link + * FileInputStream#skip}. These parser failures can lead to infinite loops. We strongly encourage + * the use of TikaInputStream. * * @since Apache Tika 0.8 */ public class TikaInputStream extends TaggedInputStream { private static final int MAX_CONSECUTIVE_EOFS = 1000; + /** - * Blob size threshold that limits the largest BLOB size to be - * buffered fully in memory by the {@link #get(Blob, Metadata)} - * method. + * Blob size threshold that limits the largest BLOB size to be buffered fully in memory by the + * {@link #get(Blob, Metadata)} method. */ private static final int BLOB_SIZE_THRESHOLD = 1024 * 1024; - /** - * Tracker of temporary resources. - */ + + /** Tracker of temporary resources. */ private final TemporaryResources tmp; + /** - * The Factory that can create fresh {@link InputStream}s for - * the resource this reads for, eg when needing to re-read. + * The Factory that can create fresh {@link InputStream}s for the resource this reads for, eg + * when needing to re-read. */ private InputStreamFactory streamFactory; + /** - * The path to the file that contains the contents of this stream. - * This is either the original file passed to the - * {@link #TikaInputStream(Path)} constructor or a temporary file created - * by a call to the {@link #getPath()} method. If neither has been called, - * then the value is null. + * The path to the file that contains the contents of this stream. This is either the original + * file passed to the {@link #TikaInputStream(Path)} constructor or a temporary file created by + * a call to the {@link #getPath()} method. If neither has been called, then the value is + * null. */ private Path path; - /** - * Total length of the stream, or -1 if unknown. - */ + + /** Total length of the stream, or -1 if unknown. 
*/ private long length; - /** - * Current read position within this stream. - */ + + /** Current read position within this stream. */ private long position = 0; - /** - * Marked position, or -1 if there is no current mark. - */ + + /** Marked position, or -1 if there is no current mark. */ private long mark = -1; + /** - * A opened container, such as a POIFS FileSystem - * for an OLE2 document, or a Zip file for a - * zip based (eg ooxml, odf) document. + * A opened container, such as a POIFS FileSystem for an OLE2 document, or a Zip file for a zip + * based (eg ooxml, odf) document. */ private Object openContainer; + private int consecutiveEOFs = 0; private byte[] skipBuffer; - //suffix of the file if known. This is used to create temp files - //with the right suffixes. This should include the initial . as in ".doc" + // suffix of the file if known. This is used to create temp files + // with the right suffixes. This should include the initial . as in ".doc" private String suffix = null; /** - * Creates a TikaInputStream instance. This private constructor is used - * by the static factory methods based on the available information. + * Creates a TikaInputStream instance. This private constructor is used by the static factory + * methods based on the available information. * * @param path the path to the file that contains the stream * @throws IOException if an I/O error occurs @@ -147,8 +138,8 @@ private TikaInputStream(Path path, TemporaryResources tmp, long length) throws I } /** - * Creates a TikaInputStream instance. This private constructor is used - * by the static factory methods based on the available information. + * Creates a TikaInputStream instance. This private constructor is used by the static factory + * methods based on the available information. 
* * @param file the file that contains the stream * @throws FileNotFoundException if the file does not exist @@ -161,22 +152,21 @@ private TikaInputStream(File file) throws FileNotFoundException { this.tmp = new TemporaryResources(); this.length = file.length(); this.suffix = FilenameUtils.getSuffixFromPath(path.getFileName().toString()); - } /** - * Creates a TikaInputStream instance. This private constructor is used - * by the static factory methods based on the available information. - *

- * The given stream needs to be included in the given temporary resource - * collection if the caller wants it also to get closed when the - * {@link #close()} method is invoked. + * Creates a TikaInputStream instance. This private constructor is used by the static factory + * methods based on the available information. + * + *

The given stream needs to be included in the given temporary resource collection if the + * caller wants it also to get closed when the {@link #close()} method is invoked. * * @param stream buffered stream (must support the mark feature) - * @param tmp tracker for temporary resources associated with this stream + * @param tmp tracker for temporary resources associated with this stream * @param length total length of the stream, or -1 if unknown */ - private TikaInputStream(InputStream stream, TemporaryResources tmp, long length, String suffix) { + private TikaInputStream( + InputStream stream, TemporaryResources tmp, long length, String suffix) { super(stream); this.path = null; this.tmp = tmp; @@ -185,46 +175,44 @@ private TikaInputStream(InputStream stream, TemporaryResources tmp, long length, } /** - * Checks whether the given stream is a TikaInputStream instance. - * The given stream can be null, in which case the return - * value is false. + * Checks whether the given stream is a TikaInputStream instance. The given stream can be + * null, in which case the return value is false. * * @param stream input stream, possibly null - * @return true if the stream is a TikaInputStream instance, - * false otherwise + * @return true if the stream is a TikaInputStream instance, false + * otherwise */ public static boolean isTikaInputStream(InputStream stream) { return stream instanceof TikaInputStream; } /** - * Casts or wraps the given stream to a TikaInputStream instance. - * This method can be used to access the functionality of this class - * even when given just a normal input stream instance. - *

- * The given temporary file provider is used for any temporary files, - * and should be disposed when the returned stream is no longer used. - *

- * Use this method instead of the {@link #get(InputStream)} alternative - * when you don't explicitly close the returned stream. The - * recommended access pattern is: + * Casts or wraps the given stream to a TikaInputStream instance. This method can be used to + * access the functionality of this class even when given just a normal input stream instance. + * + *

The given temporary file provider is used for any temporary files, and should be disposed + * when the returned stream is no longer used. + * + *

Use this method instead of the {@link #get(InputStream)} alternative when you + * don't explicitly close the returned stream. The recommended access pattern is: + * *

      * try (TemporaryResources tmp = new TemporaryResources()) {
      *     TikaInputStream stream = TikaInputStream.get(..., tmp);
      *     // process stream but don't close it
      * }
      * 
- *

- * The given stream instance will not be closed when the - * {@link TemporaryResources#close()} method is called by the - * try-with-resources statement. The caller is expected to explicitly - * close the original stream when it's no longer used. + * + *

The given stream instance will not be closed when the {@link + * TemporaryResources#close()} method is called by the try-with-resources statement. The caller + * is expected to explicitly close the original stream when it's no longer used. * * @param stream normal input stream * @return a TikaInputStream instance * @since Apache Tika 0.10 */ - public static TikaInputStream get(InputStream stream, TemporaryResources tmp, Metadata metadata) { + public static TikaInputStream get( + InputStream stream, TemporaryResources tmp, Metadata metadata) { if (stream == null) { throw new NullPointerException("The Stream must not be null"); } @@ -241,23 +229,22 @@ public static TikaInputStream get(InputStream stream, TemporaryResources tmp, Me } /** - * Casts or wraps the given stream to a TikaInputStream instance. - * This method can be used to access the functionality of this class - * even when given just a normal input stream instance. - *

- * Use this method instead of the - * {@link #get(InputStream, TemporaryResources, Metadata)} alternative when you - * do explicitly close the returned stream. The recommended - * access pattern is: + * Casts or wraps the given stream to a TikaInputStream instance. This method can be used to + * access the functionality of this class even when given just a normal input stream instance. + * + *

Use this method instead of the {@link #get(InputStream, TemporaryResources, Metadata)} + * alternative when you do explicitly close the returned stream. The recommended access + * pattern is: + * *

      * try (TikaInputStream stream = TikaInputStream.get(...)) {
      *     // process stream
      * }
      * 
- *

- * The given stream instance will be closed along with any other resources - * associated with the returned TikaInputStream instance when the - * {@link #close()} method is called by the try-with-resources statement. + * + *

The given stream instance will be closed along with any other resources associated with + * the returned TikaInputStream instance when the {@link #close()} method is called by the + * try-with-resources statement. * * @param stream normal input stream * @return a TikaInputStream instance @@ -267,8 +254,8 @@ public static TikaInputStream get(InputStream stream) { } /** - * Returns the given stream casts to a TikaInputStream, or - * null if the stream is not a TikaInputStream. + * Returns the given stream casts to a TikaInputStream, or null if the stream is + * not a TikaInputStream. * * @param stream normal input stream * @return a TikaInputStream instance @@ -284,9 +271,9 @@ public static TikaInputStream cast(InputStream stream) { /** * Creates a TikaInputStream from the given array of bytes. - *

- * Note that you must always explicitly close the returned stream as in - * some cases it may end up writing the given data to a temporary file. + * + *

Note that you must always explicitly close the returned stream as in some cases it may end + * up writing the given data to a temporary file. * * @param data input data * @return a TikaInputStream instance @@ -296,28 +283,31 @@ public static TikaInputStream get(byte[] data) { } /** - * Creates a TikaInputStream from the given array of bytes. The length of - * the array is stored as input metadata in the given metadata instance. - *

- * Note that you must always explicitly close the returned stream as in - * some cases it may end up writing the given data to a temporary file. + * Creates a TikaInputStream from the given array of bytes. The length of the array is stored as + * input metadata in the given metadata instance. * - * @param data input data + *

Note that you must always explicitly close the returned stream as in some cases it may end + * up writing the given data to a temporary file. + * + * @param data input data * @param metadata metadata instance * @return a TikaInputStream instance * @throws IOException */ public static TikaInputStream get(byte[] data, Metadata metadata) { metadata.set(Metadata.CONTENT_LENGTH, Integer.toString(data.length)); - return new TikaInputStream(new UnsynchronizedByteArrayInputStream(data), new TemporaryResources(), - data.length, getExtension(metadata)); + return new TikaInputStream( + new UnsynchronizedByteArrayInputStream(data), + new TemporaryResources(), + data.length, + getExtension(metadata)); } /** * Creates a TikaInputStream from the file at the given path. - *

- * Note that you must always explicitly close the returned stream to - * prevent leaking open file handles. + * + *

Note that you must always explicitly close the returned stream to prevent leaking open + * file handles. * * @param path input file * @return a TikaInputStream instance @@ -328,16 +318,16 @@ public static TikaInputStream get(Path path) throws IOException { } /** - * Creates a TikaInputStream from the file at the given path. The file name - * and length are stored as input metadata in the given metadata instance. - *

- * If there's an {@link TikaCoreProperties#RESOURCE_NAME_KEY} in the - * metadata object, this will not overwrite that value with the path's name. - *

- * Note that you must always explicitly close the returned stream to - * prevent leaking open file handles. + * Creates a TikaInputStream from the file at the given path. The file name and length are + * stored as input metadata in the given metadata instance. * - * @param path input file + *

If there's an {@link TikaCoreProperties#RESOURCE_NAME_KEY} in the metadata object, this + * will not overwrite that value with the path's name. + * + *

Note that you must always explicitly close the returned stream to prevent leaking open + * file handles. + * + * @param path input file * @param metadata metadata instance * @return a TikaInputStream instance * @throws IOException if an I/O error occurs @@ -362,15 +352,15 @@ public static TikaInputStream get(Path path, Metadata metadata, TemporaryResourc /** * Creates a TikaInputStream from the given file. - *

- * Note that you must always explicitly close the returned stream to - * prevent leaking open file handles. + * + *

Note that you must always explicitly close the returned stream to prevent leaking open + * file handles. * * @param file input file * @return a TikaInputStream instance * @throws FileNotFoundException if the file does not exist - * @deprecated use {@link #get(Path)}. In Tika 2.0, this will be removed - * or modified to throw an IOException. + * @deprecated use {@link #get(Path)}. In Tika 2.0, this will be removed or modified to throw an + * IOException. */ @Deprecated public static TikaInputStream get(File file) throws FileNotFoundException { @@ -378,19 +368,18 @@ public static TikaInputStream get(File file) throws FileNotFoundException { } /** - * Creates a TikaInputStream from the given file. The file name and - * length are stored as input metadata in the given metadata instance. - *

- * Note that you must always explicitly close the returned stream to - * prevent leaking open file handles. + * Creates a TikaInputStream from the given file. The file name and length are stored as input + * metadata in the given metadata instance. * - * @param file input file + *

Note that you must always explicitly close the returned stream to prevent leaking open + * file handles. + * + * @param file input file * @param metadata metadata instance * @return a TikaInputStream instance - * @throws FileNotFoundException if the file does not exist - * or cannot be opened for reading - * @deprecated use {@link #get(Path, Metadata)}. In Tika 2.0, - * this will be removed or modified to throw an IOException. + * @throws FileNotFoundException if the file does not exist or cannot be opened for reading + * @deprecated use {@link #get(Path, Metadata)}. In Tika 2.0, this will be removed or modified + * to throw an IOException. */ @Deprecated public static TikaInputStream get(File file, Metadata metadata) throws FileNotFoundException { @@ -402,24 +391,24 @@ public static TikaInputStream get(File file, Metadata metadata) throws FileNotFo } /** - * Creates a TikaInputStream from a Factory which can create - * fresh {@link InputStream}s for the same resource multiple times. - *

This is typically desired when working with {@link Parser}s that - * need to re-read the stream multiple times, where other forms - * of buffering (eg File) are slower than just getting a fresh - * new stream each time. + * Creates a TikaInputStream from a Factory which can create fresh {@link InputStream}s for the + * same resource multiple times. + * + *

This is typically desired when working with {@link Parser}s that need to re-read the + * stream multiple times, where other forms of buffering (eg File) are slower than just getting + * a fresh new stream each time. */ public static TikaInputStream get(InputStreamFactory factory) throws IOException { return get(factory, new TemporaryResources()); } /** - * Creates a TikaInputStream from a Factory which can create - * fresh {@link InputStream}s for the same resource multiple times. - *

This is typically desired when working with {@link Parser}s that - * need to re-read the stream multiple times, where other forms - * of buffering (eg File) are slower than just getting a fresh - * new stream each time. + * Creates a TikaInputStream from a Factory which can create fresh {@link InputStream}s for the + * same resource multiple times. + * + *

This is typically desired when working with {@link Parser}s that need to re-read the + * stream multiple times, where other forms of buffering (eg File) are slower than just getting + * a fresh new stream each time. */ public static TikaInputStream get(InputStreamFactory factory, TemporaryResources tmp) throws IOException { @@ -430,11 +419,10 @@ public static TikaInputStream get(InputStreamFactory factory, TemporaryResources /** * Creates a TikaInputStream from the given database BLOB. - *

- * Note that the result set containing the BLOB may need to be kept open - * until the returned TikaInputStream has been processed and closed. - * You must also always explicitly close the returned stream as in - * some cases it may end up writing the blob data to a temporary file. + * + *

Note that the result set containing the BLOB may need to be kept open until the returned + * TikaInputStream has been processed and closed. You must also always explicitly close the + * returned stream as in some cases it may end up writing the blob data to a temporary file. * * @param blob database BLOB * @return a TikaInputStream instance @@ -445,16 +433,14 @@ public static TikaInputStream get(Blob blob) throws SQLException { } /** - * Creates a TikaInputStream from the given database BLOB. The BLOB - * length (if available) is stored as input metadata in the given - * metadata instance. - *

- * Note that the result set containing the BLOB may need to be kept open - * until the returned TikaInputStream has been processed and closed. - * You must also always explicitly close the returned stream as in - * some cases it may end up writing the blob data to a temporary file. + * Creates a TikaInputStream from the given database BLOB. The BLOB length (if available) is + * stored as input metadata in the given metadata instance. + * + *

Note that the result set containing the BLOB may need to be kept open until the returned + * TikaInputStream has been processed and closed. You must also always explicitly close the + * returned stream as in some cases it may end up writing the blob data to a temporary file. * - * @param blob database BLOB + * @param blob database BLOB * @param metadata metadata instance * @return a TikaInputStream instance * @throws SQLException if BLOB data can not be accessed @@ -474,8 +460,10 @@ public static TikaInputStream get(Blob blob, Metadata metadata) throws SQLExcept // the offset in Blob.getBytes() starts at 1 return get(blob.getBytes(1, (int) length), metadata); } else { - return new TikaInputStream(new BufferedInputStream(blob.getBinaryStream()), - new TemporaryResources(), length, + return new TikaInputStream( + new BufferedInputStream(blob.getBinaryStream()), + new TemporaryResources(), + length, getExtension(metadata)); } } @@ -490,9 +478,9 @@ private static String getExtension(Metadata metadata) { /** * Creates a TikaInputStream from the resource at the given URI. - *

- * Note that you must always explicitly close the returned stream as in - * some cases it may end up writing the resource to a temporary file. + * + *

Note that you must always explicitly close the returned stream as in some cases it may end + * up writing the resource to a temporary file. * * @param uri resource URI * @return a TikaInputStream instance @@ -503,13 +491,13 @@ public static TikaInputStream get(URI uri) throws IOException { } /** - * Creates a TikaInputStream from the resource at the given URI. The - * available input metadata is stored in the given metadata instance. - *

- * Note that you must always explicitly close the returned stream as in - * some cases it may end up writing the resource to a temporary file. + * Creates a TikaInputStream from the resource at the given URI. The available input metadata is + * stored in the given metadata instance. + * + *

Note that you must always explicitly close the returned stream as in some cases it may end + * up writing the resource to a temporary file. * - * @param uri resource URI + * @param uri resource URI * @param metadata metadata instance * @return a TikaInputStream instance * @throws IOException if the resource can not be accessed @@ -528,9 +516,9 @@ public static TikaInputStream get(URI uri, Metadata metadata) throws IOException /** * Creates a TikaInputStream from the resource at the given URL. - *

- * Note that you must always explicitly close the returned stream as in - * some cases it may end up writing the resource to a temporary file. + * + *

Note that you must always explicitly close the returned stream as in some cases it may end + * up writing the resource to a temporary file. * * @param url resource URL * @return a TikaInputStream instance @@ -541,13 +529,13 @@ public static TikaInputStream get(URL url) throws IOException { } /** - * Creates a TikaInputStream from the resource at the given URL. The - * available input metadata is stored in the given metadata instance. - *

- * Note that you must always explicitly close the returned stream as in - * some cases it may end up writing the resource to a temporary file. + * Creates a TikaInputStream from the resource at the given URL. The available input metadata is + * stored in the given metadata instance. * - * @param url resource URL + *

Note that you must always explicitly close the returned stream as in some cases it may end + * up writing the resource to a temporary file. + * + * @param url resource URL * @param metadata metadata instance * @return a TikaInputStream instance * @throws IOException if the resource can not be accessed @@ -588,15 +576,17 @@ public static TikaInputStream get(URL url, Metadata metadata) throws IOException metadata.set(Metadata.CONTENT_LENGTH, Integer.toString(length)); } - return new TikaInputStream(new BufferedInputStream(connection.getInputStream()), - new TemporaryResources(), length, getExtension(metadata)); + return new TikaInputStream( + new BufferedInputStream(connection.getInputStream()), + new TemporaryResources(), + length, + getExtension(metadata)); } /** - * Fills the given buffer with upcoming bytes from this stream without - * advancing the current stream position. The buffer is filled up unless - * the end of stream is encountered before that. This method will block - * if not enough bytes are immediately available. + * Fills the given buffer with upcoming bytes from this stream without advancing the current + * stream position. The buffer is filled up unless the end of stream is encountered before that. + * This method will block if not enough bytes are immediately available. * * @param buffer byte buffer * @return number of bytes written to the buffer @@ -623,9 +613,8 @@ public int peek(byte[] buffer) throws IOException { } /** - * Returns the open container object if any, such as a - * POIFS FileSystem in the event of an OLE2 document - * being detected and processed by the OLE2 detector. + * Returns the open container object if any, such as a POIFS FileSystem in the event of an OLE2 + * document being detected and processed by the OLE2 detector. 
* * @return Open Container for this stream, or null if none */ @@ -634,10 +623,8 @@ public Object getOpenContainer() { } /** - * Stores the open container object against - * the stream, eg after a Zip contents - * detector has loaded the file to decide - * what it contains. + * Stores the open container object against the stream, eg after a Zip contents detector has + * loaded the file to decide what it contains. */ public void setOpenContainer(Object container) { openContainer = container; @@ -647,7 +634,6 @@ public void setOpenContainer(Object container) { } /** - * * @param closeable */ public void addCloseableResource(Closeable closeable) { @@ -659,8 +645,8 @@ public boolean hasInputStreamFactory() { } /** - * If the Stream was created from an {@link InputStreamFactory}, - * return that, otherwise null. + * If the Stream was created from an {@link InputStreamFactory}, return that, otherwise + * null. */ public InputStreamFactory getInputStreamFactory() { return streamFactory; @@ -670,12 +656,10 @@ public boolean hasFile() { return path != null; } - /** - * If the user created this TikaInputStream with a file, - * the original file will be returned. If not, the entire stream - * will be spooled to a temporary file which will be deleted - * upon the close of this TikaInputStream + * If the user created this TikaInputStream with a file, the original file will be returned. If + * not, the entire stream will be spooled to a temporary file which will be deleted upon the + * close of this TikaInputStream * * @return * @throws IOException @@ -685,11 +669,11 @@ public Path getPath() throws IOException { } /** - * @param maxBytes if this is less than 0 and if an underlying file doesn't already exist, - * the full file will be spooled to disk - * @return the original path used in the initialization of this TikaInputStream, - * a temporary file if the stream was shorter than maxBytes, or null - * if the underlying stream was longer than maxBytes. 
+ * @param maxBytes if this is less than 0 and if an underlying file doesn't already exist, the + * full file will be spooled to disk + * @return the original path used in the initialization of this TikaInputStream, a temporary + * file if the stream was shorter than maxBytes, or null if the + * underlying stream was longer than maxBytes. * @throws IOException */ public Path getPath(int maxBytes) throws IOException { @@ -701,10 +685,10 @@ public Path getPath(int maxBytes) throws IOException { if (maxBytes > -1) { this.mark(maxBytes); try (BoundedInputStream boundedInputStream = - new BoundedInputStream(maxBytes, this)) { + new BoundedInputStream(maxBytes, this)) { Files.copy(boundedInputStream, tmpFile, REPLACE_EXISTING); if (boundedInputStream.hasHitBound()) { - //tmpFile will be cleaned up when this TikaInputStream is closed + // tmpFile will be cleaned up when this TikaInputStream is closed return null; } } finally { @@ -714,7 +698,7 @@ public Path getPath(int maxBytes) throws IOException { // Spool the entire stream into a temporary file Files.copy(this, tmpFile, REPLACE_EXISTING); } - //successful so far, set tis' path to tmpFile + // successful so far, set tis' path to tmpFile path = tmpFile; // Create a new input stream and make sure it'll get closed @@ -726,12 +710,13 @@ public Path getPath(int maxBytes) throws IOException { // close() method is called. The closing of the new stream // is already being handled as noted above. final InputStream oldStream = in; - in = new BufferedInputStream(newStream) { - @Override - public void close() throws IOException { - oldStream.close(); - } - }; + in = + new BufferedInputStream(newStream) { + @Override + public void close() throws IOException { + oldStream.close(); + } + }; // Update length to file size. Update position, mark length = Files.size(path); @@ -760,11 +745,10 @@ public boolean hasLength() { } /** - * Returns the length (in bytes) of this stream. 
Note that if the length - * was not available when this stream was instantiated, then this method - * will use the {@link #getPath()} method to buffer the entire stream to - * a temporary file in order to calculate the stream length. This case - * will only work if the stream has not yet been consumed. + * Returns the length (in bytes) of this stream. Note that if the length was not available when + * this stream was instantiated, then this method will use the {@link #getPath()} method to + * buffer the entire stream to a temporary file in order to calculate the stream length. This + * case will only work if the stream has not yet been consumed. * * @return stream length * @throws IOException if the length can not be determined @@ -786,19 +770,19 @@ public long getPosition() { } /** - * This relies on {@link IOUtils#skip(InputStream, long, byte[])} to ensure - * that the alleged bytes skipped were actually skipped. + * This relies on {@link IOUtils#skip(InputStream, long, byte[])} to ensure that the alleged + * bytes skipped were actually skipped. * * @param ln the number of bytes to skip * @return the number of bytes skipped - * @throws IOException if the number of bytes requested to be skipped does not match the - * number of bytes skipped or if there's an IOException during the read. + * @throws IOException if the number of bytes requested to be skipped does not match the number + * of bytes skipped or if there's an IOException during the read. 
*/ @Override public long skip(long ln) throws IOException { - //On TIKA-3092, we found that using the static byte array buffer - //caused problems with multithreading with the FlateInputStream - //from a POIFS document stream + // On TIKA-3092, we found that using the static byte array buffer + // caused problems with multithreading with the FlateInputStream + // from a POIFS document stream if (skipBuffer == null) { skipBuffer = new byte[4096]; } @@ -847,9 +831,10 @@ protected void afterRead(int n) throws IOException { } else { consecutiveEOFs++; if (consecutiveEOFs > MAX_CONSECUTIVE_EOFS) { - throw new IOException("Read too many -1 (EOFs); there could be an infinite loop." + - "If you think your file is not corrupt, please open an issue on Tika's " + - "JIRA"); + throw new IOException( + "Read too many -1 (EOFs); there could be an infinite loop." + + "If you think your file is not corrupt, please open an issue on Tika's " + + "JIRA"); } } } diff --git a/tika-core/src/main/java/org/apache/tika/io/package-info.java b/tika-core/src/main/java/org/apache/tika/io/package-info.java index 36c7274da5..daef464ee3 100644 --- a/tika-core/src/main/java/org/apache/tika/io/package-info.java +++ b/tika-core/src/main/java/org/apache/tika/io/package-info.java @@ -15,8 +15,6 @@ * limitations under the License. */ -/** - * IO utilities. - */ +/** IO utilities. 
*/ @aQute.bnd.annotation.Version("1.0.0") package org.apache.tika.io; diff --git a/tika-core/src/main/java/org/apache/tika/language/detect/LanguageConfidence.java b/tika-core/src/main/java/org/apache/tika/language/detect/LanguageConfidence.java index e5b520ffc1..00081aa3e3 100644 --- a/tika-core/src/main/java/org/apache/tika/language/detect/LanguageConfidence.java +++ b/tika-core/src/main/java/org/apache/tika/language/detect/LanguageConfidence.java @@ -17,6 +17,8 @@ package org.apache.tika.language.detect; public enum LanguageConfidence { - - HIGH, MEDIUM, LOW, NONE // Special value when no language is detected + HIGH, + MEDIUM, + LOW, + NONE // Special value when no language is detected } diff --git a/tika-core/src/main/java/org/apache/tika/language/detect/LanguageDetector.java b/tika-core/src/main/java/org/apache/tika/language/detect/LanguageDetector.java index 722ded343b..2166cf1870 100644 --- a/tika-core/src/main/java/org/apache/tika/language/detect/LanguageDetector.java +++ b/tika-core/src/main/java/org/apache/tika/language/detect/LanguageDetector.java @@ -20,7 +20,6 @@ import java.util.List; import java.util.Map; import java.util.Set; - import org.apache.tika.config.ServiceLoader; import org.apache.tika.utils.CompareUtils; @@ -49,8 +48,8 @@ public abstract class LanguageDetector { private static final ServiceLoader DEFAULT_SERVICE_LOADER = new ServiceLoader(); - //if a user calls detect on a huge string, break it into this size - //and add sequentially until hasEnoughText() is true + // if a user calls detect on a huge string, break it into this size + // and add sequentially until hasEnoughText() is true private static final int BUFFER_LENGTH = 4096; // True if text is expected to be a mix of languages, and thus higher-resolution @@ -100,20 +99,18 @@ public LanguageDetector setShortText(boolean shortText) { } /** - * Load (or re-load) all available language models. This must - * be called after any settings that would impact the models - * being loaded (e.g. 
mixed language/short text), but - * before any of the document processing routines (below) - * are called. Note that it only needs to be called once. + * Load (or re-load) all available language models. This must be called after any settings that + * would impact the models being loaded (e.g. mixed language/short text), but before any of the + * document processing routines (below) are called. Note that it only needs to be called once. * * @return this */ public abstract LanguageDetector loadModels() throws IOException; /** - * Load (or re-load) the models specified in . These use the - * ISO 639-1 names, with an optional "-" for more - * specific specification (e.g. "zh-CN" for Chinese in China). + * Load (or re-load) the models specified in . These use the ISO 639-1 names, with an + * optional "-" for more specific specification (e.g. "zh-CN" for Chinese in + * China). * * @param languages list of target languages. * @return this @@ -121,8 +118,7 @@ public LanguageDetector setShortText(boolean shortText) { public abstract LanguageDetector loadModels(Set languages) throws IOException; /** - * Provide information about whether a model exists for a specific - * language. + * Provide information about whether a model exists for a specific language. * * @param language ISO 639-1 name for language * @return true if a model for this language exists. @@ -130,13 +126,14 @@ public LanguageDetector setShortText(boolean shortText) { public abstract boolean hasModel(String language); /** - * Set the a-priori probabilities for these languages. The provided map uses the language - * as the key, and the probability (0.0 > probability < 1.0) of text being in that language. - * Note that if the probabilities don't sum to 1.0, these values will be normalized. - *

- * If hasModel() returns false for any of the languages, an IllegalArgumentException is thrown. - *

- * Use of these probabilities is detector-specific, and thus might not impact the results at + * Set the a-priori probabilities for these languages. The provided map uses the language as the + * key, and the probability (0.0 > probability < 1.0) of text being in that language. Note that + * if the probabilities don't sum to 1.0, these values will be normalized. + * + *

If hasModel() returns false for any of the languages, an IllegalArgumentException is + * thrown. + * + *

Use of these probabilities is detector-specific, and thus might not impact the results at * all. As such, these should be viewed as a hint. * * @param languageProbabilities Map from language to probability @@ -149,26 +146,22 @@ public abstract LanguageDetector setPriors(Map languageProbabilit // The routines below are called when processing a document // ============================================================ - /** - * Reset statistics about the current document being processed - */ + /** Reset statistics about the current document being processed */ public abstract void reset(); /** - * Add statistics about this text for the current document. Note - * that we assume an implicit word break exists before/after - * each of these runs of text. + * Add statistics about this text for the current document. Note that we assume an implicit word + * break exists before/after each of these runs of text. * * @param cbuf Character buffer - * @param off Offset into cbuf to first character in the run of text - * @param len Number of characters in the run of text. + * @param off Offset into cbuf to first character in the run of text + * @param len Number of characters in the run of text. */ public abstract void addText(char[] cbuf, int off, int len); /** - * Add to the statistics being accumulated for the current - * document. Note that this is a default implementation for adding - * a string (not optimized) + * Add to the statistics being accumulated for the current document. Note that this is a + * default implementation for adding a string (not optimized) * * @param text Characters to add to current statistics. */ @@ -180,24 +173,22 @@ public void addText(CharSequence text) { return; } int start = 0; - while (! 
hasEnoughText() && start < len) { + while (!hasEnoughText() && start < len) { int end = Math.min(start + BUFFER_LENGTH, len); char[] chars = text.subSequence(start, end).toString().toCharArray(); addText(chars, 0, chars.length); start += BUFFER_LENGTH; } - } - /** - * Tell the caller whether more text is required for the current document - * before the language can be reliably detected. - *

- * Implementations can override this to do early termination of stats - * collection, which can improve performance with longer documents. - *

- * Note that detect() can be called even when this returns false + * Tell the caller whether more text is required for the current document before the language + * can be reliably detected. + * + *

Implementations can override this to do early termination of stats collection, which can + * improve performance with longer documents. + * + *

Note that detect() can be called even when this returns false * * @return true if we have enough text for reliable detection. */ @@ -208,9 +199,9 @@ public boolean hasEnoughText() { /** * Detect languages based on previously submitted text (via addText calls). * - * @return list of all possible languages with at least medium confidence, - * sorted by confidence from highest to lowest. There will always - * be at least one result, which might have a confidence of NONE. + * @return list of all possible languages with at least medium confidence, sorted by confidence + * from highest to lowest. There will always be at least one result, which might have a + * confidence of NONE. */ public abstract List detectAll(); @@ -223,8 +214,8 @@ public LanguageResult detect() { * Utility wrapper that detects the language of a given chunk of text. * * @param text String to add to current statistics. - * @return list of all possible languages with at least medium confidence, - * sorted by confidence from highest to lowest. + * @return list of all possible languages with at least medium confidence, sorted by confidence + * from highest to lowest. */ public List detectAll(String text) { reset(); @@ -237,5 +228,4 @@ public LanguageResult detect(CharSequence text) { addText(text); return detect(); } - } diff --git a/tika-core/src/main/java/org/apache/tika/language/detect/LanguageHandler.java b/tika-core/src/main/java/org/apache/tika/language/detect/LanguageHandler.java index af3e1bd02c..3ce358b438 100644 --- a/tika-core/src/main/java/org/apache/tika/language/detect/LanguageHandler.java +++ b/tika-core/src/main/java/org/apache/tika/language/detect/LanguageHandler.java @@ -17,12 +17,10 @@ package org.apache.tika.language.detect; import java.io.IOException; - import org.apache.tika.sax.WriteOutContentHandler; /** - * SAX content handler that updates a language detector based on all the - * received character content. 
+ * SAX content handler that updates a language detector based on all the received character content. * * @since Apache Tika 0.10 */ @@ -45,9 +43,8 @@ public LanguageHandler(LanguageDetector detector) { } /** - * Returns the language detector used by this content handler. - * Note that the returned detector gets updated whenever new SAX events - * are received by this content handler. + * Returns the language detector used by this content handler. Note that the returned detector + * gets updated whenever new SAX events are received by this content handler. * * @return language detector */ diff --git a/tika-core/src/main/java/org/apache/tika/language/detect/LanguageNames.java b/tika-core/src/main/java/org/apache/tika/language/detect/LanguageNames.java index ed52640bef..6e6db1a1a8 100644 --- a/tika-core/src/main/java/org/apache/tika/language/detect/LanguageNames.java +++ b/tika-core/src/main/java/org/apache/tika/language/detect/LanguageNames.java @@ -20,18 +20,20 @@ /** * Support for language tags (as defined by https://tools.ietf.org/html/bcp47) - *

- * See https://en.wikipedia.org/wiki/List_of_ISO_639-3_codes for a list of - * three character language codes. - *

- * TODO change to LanguageTag, and use these vs. strings everywhere in the - * language detector API? + * + *

See https://en.wikipedia.org/wiki/List_of_ISO_639-3_codes for a list of three character + * language codes. + * + *

TODO change to LanguageTag, and use these vs. strings everywhere in the language detector API? */ public class LanguageNames { public static String makeName(String language, String script, String region) { Locale locale = - new Locale.Builder().setLanguage(language).setScript(script).setRegion(region) + new Locale.Builder() + .setLanguage(language) + .setScript(script) + .setRegion(region) .build(); return locale.toLanguageTag(); } diff --git a/tika-core/src/main/java/org/apache/tika/language/detect/LanguageResult.java b/tika-core/src/main/java/org/apache/tika/language/detect/LanguageResult.java index dada5fda17..e231e48f34 100644 --- a/tika-core/src/main/java/org/apache/tika/language/detect/LanguageResult.java +++ b/tika-core/src/main/java/org/apache/tika/language/detect/LanguageResult.java @@ -67,16 +67,11 @@ public boolean isUnknown() { } /** - * Return true if the target language matches the detected language. We consider - * it a match if, for the precision requested or detected, it matches. This means: - *

- * target | detected | match? - * zh | en | false - * zh | zh | true - * zh | zh-CN | true - * zh-CN | zh | true - * zh-CN | zh-TW | false - * zh-CN | zh-cn | true (case-insensitive) + * Return true if the target language matches the detected language. We consider it a match if, + * for the precision requested or detected, it matches. This means: + * + *

target | detected | match? zh | en | false zh | zh | true zh | zh-CN | true zh-CN | zh | + * true zh-CN | zh-TW | false zh-CN | zh-cn | true (case-insensitive) * * @param language * @return diff --git a/tika-core/src/main/java/org/apache/tika/language/detect/LanguageWriter.java b/tika-core/src/main/java/org/apache/tika/language/detect/LanguageWriter.java index 92cd630c9d..539c8612f7 100644 --- a/tika-core/src/main/java/org/apache/tika/language/detect/LanguageWriter.java +++ b/tika-core/src/main/java/org/apache/tika/language/detect/LanguageWriter.java @@ -34,9 +34,8 @@ public LanguageWriter(LanguageDetector detector) { } /** - * Returns the language detector used by this writer. Note that - * the returned language detector gets updated whenever new characters - * are written. + * Returns the language detector used by this writer. Note that the returned language detector + * gets updated whenever new characters are written. * * @return language detector */ @@ -58,19 +57,13 @@ public void write(char[] cbuf, int off, int len) { detector.addText(cbuf, off, len); } - /** - * Ignored. - */ + /** Ignored. */ @Override - public void close() throws IOException { - } + public void close() throws IOException {} - /** - * Ignored. - */ + /** Ignored. 
*/ @Override - public void flush() { - } + public void flush() {} public void reset() { detector.reset(); diff --git a/tika-core/src/main/java/org/apache/tika/language/translate/DefaultTranslator.java b/tika-core/src/main/java/org/apache/tika/language/translate/DefaultTranslator.java index 11b45d5930..31e65d4d64 100644 --- a/tika-core/src/main/java/org/apache/tika/language/translate/DefaultTranslator.java +++ b/tika-core/src/main/java/org/apache/tika/language/translate/DefaultTranslator.java @@ -19,20 +19,18 @@ import java.io.IOException; import java.util.List; - import org.apache.tika.config.ServiceLoader; import org.apache.tika.exception.TikaException; import org.apache.tika.utils.CompareUtils; /** - * A translator which picks the first available {@link Translator} - * implementations available through the - * {@link javax.imageio.spi.ServiceRegistry service provider mechanism}. + * A translator which picks the first available {@link Translator} implementations available through + * the {@link javax.imageio.spi.ServiceRegistry service provider mechanism}. * * @since Apache Tika 1.6 */ public class DefaultTranslator implements Translator { - private transient final ServiceLoader loader; + private final transient ServiceLoader loader; public DefaultTranslator(ServiceLoader loader) { this.loader = loader; @@ -43,8 +41,8 @@ public DefaultTranslator() { } /** - * Finds all statically loadable translators and sort the list by name, - * rather than discovery order. + * Finds all statically loadable translators and sort the list by name, rather than discovery + * order. 
* * @param loader service loader * @return ordered list of statically loadable translators @@ -55,9 +53,7 @@ private static List getDefaultTranslators(ServiceLoader loader) { return translators; } - /** - * Returns the first available translator, or null if none are - */ + /** Returns the first available translator, or null if none are */ private static Translator getFirstAvailable(ServiceLoader loader) { for (Translator t : getDefaultTranslators(loader)) { if (t.isAvailable()) { @@ -67,9 +63,7 @@ private static Translator getFirstAvailable(ServiceLoader loader) { return null; } - /** - * Translate, using the first available service-loaded translator - */ + /** Translate, using the first available service-loaded translator */ public String translate(String text, String sourceLanguage, String targetLanguage) throws TikaException, IOException { Translator t = getFirstAvailable(loader); @@ -79,9 +73,7 @@ public String translate(String text, String sourceLanguage, String targetLanguag throw new TikaException("No translators currently available"); } - /** - * Translate, using the first available service-loaded translator - */ + /** Translate, using the first available service-loaded translator */ public String translate(String text, String targetLanguage) throws TikaException, IOException { Translator t = getFirstAvailable(loader); if (t != null) { @@ -90,16 +82,12 @@ public String translate(String text, String targetLanguage) throws TikaException throw new TikaException("No translators currently available"); } - /** - * Returns all available translators - */ + /** Returns all available translators */ public List getTranslators() { return getDefaultTranslators(loader); } - /** - * Returns the current translator - */ + /** Returns the current translator */ public Translator getTranslator() { return getFirstAvailable(loader); } diff --git a/tika-core/src/main/java/org/apache/tika/language/translate/EmptyTranslator.java 
b/tika-core/src/main/java/org/apache/tika/language/translate/EmptyTranslator.java index 9324af224a..10a1270f38 100644 --- a/tika-core/src/main/java/org/apache/tika/language/translate/EmptyTranslator.java +++ b/tika-core/src/main/java/org/apache/tika/language/translate/EmptyTranslator.java @@ -17,9 +17,8 @@ package org.apache.tika.language.translate; /** - * Dummy translator that always declines to give any text. Useful as a - * sentinel translator for when none others are available. - * for unknown document types. + * Dummy translator that always declines to give any text. Useful as a sentinel translator for when + * none others are available. for unknown document types. */ public class EmptyTranslator implements Translator { public String translate(String text, String sourceLanguage, String targetLanguage) { diff --git a/tika-core/src/main/java/org/apache/tika/language/translate/Translator.java b/tika-core/src/main/java/org/apache/tika/language/translate/Translator.java index 563e6c4fc5..4905d72d53 100644 --- a/tika-core/src/main/java/org/apache/tika/language/translate/Translator.java +++ b/tika-core/src/main/java/org/apache/tika/language/translate/Translator.java @@ -17,7 +17,6 @@ package org.apache.tika.language.translate; import java.io.IOException; - import org.apache.tika.exception.TikaException; /** @@ -29,11 +28,11 @@ public interface Translator { /** * Translate text between given languages. * - * @param text The text to translate. + * @param text The text to translate. * @param sourceLanguage The input text language (for example, "en"). * @param targetLanguage The desired language to translate to (for example, "fr"). * @return The translation result. If translation is unavailable, returns the same text back. - * @throws TikaException When there is an error translating. + * @throws TikaException When there is an error translating. 
* @throws java.io.IOException * @since Tika 1.6 */ @@ -41,13 +40,13 @@ String translate(String text, String sourceLanguage, String targetLanguage) throws TikaException, IOException; /** - * Translate text to the given language - * This method attempts to auto-detect the source language of the text. + * Translate text to the given language This method attempts to auto-detect the source language + * of the text. * - * @param text The text to translate. + * @param text The text to translate. * @param targetLanguage The desired language to translate to (for example, "hi"). * @return The translation result. If translation is unavailable, returns the same text back. - * @throws TikaException When there is an error translating. + * @throws TikaException When there is an error translating. * @throws java.io.IOException * @since Tika 1.6 */ diff --git a/tika-core/src/main/java/org/apache/tika/metadata/AccessPermissions.java b/tika-core/src/main/java/org/apache/tika/metadata/AccessPermissions.java index db689f9120..aa51e08619 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/AccessPermissions.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/AccessPermissions.java @@ -18,56 +18,36 @@ package org.apache.tika.metadata; /** - * Until we can find a common standard, we'll use these options. They - * were mostly derived from PDFBox's AccessPermission, but some can - * apply to other document formats, especially CAN_MODIFY and FILL_IN_FORM. + * Until we can find a common standard, we'll use these options. They were mostly derived from + * PDFBox's AccessPermission, but some can apply to other document formats, especially CAN_MODIFY + * and FILL_IN_FORM. 
*/ public interface AccessPermissions { - String PREFIX = - "access_permission" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER; + String PREFIX = "access_permission" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER; - /** - * Can any modifications be made to the document - */ + /** Can any modifications be made to the document */ Property CAN_MODIFY = Property.externalTextBag(PREFIX + "can_modify"); - /** - * Should content be extracted, generally. - */ + /** Should content be extracted, generally. */ Property EXTRACT_CONTENT = Property.externalText(PREFIX + "extract_content"); - /** - * Should content be extracted for the purposes - * of accessibility. - */ + /** Should content be extracted for the purposes of accessibility. */ Property EXTRACT_FOR_ACCESSIBILITY = Property.externalText(PREFIX + "extract_for_accessibility"); - /** - * Can the user insert/rotate/delete pages. - */ + /** Can the user insert/rotate/delete pages. */ Property ASSEMBLE_DOCUMENT = Property.externalText(PREFIX + "assemble_document"); - - /** - * Can the user fill in a form - */ + /** Can the user fill in a form */ Property FILL_IN_FORM = Property.externalText(PREFIX + "fill_in_form"); - /** - * Can the user modify annotations - */ + /** Can the user modify annotations */ Property CAN_MODIFY_ANNOTATIONS = Property.externalText(PREFIX + "modify_annotations"); - /** - * Can the user print the document - */ + /** Can the user print the document */ Property CAN_PRINT = Property.externalText(PREFIX + "can_print"); - /** - * Can the user print an image-degraded version of the document. - */ + /** Can the user print an image-degraded version of the document. 
*/ Property CAN_PRINT_FAITHFUL = Property.externalText(PREFIX + "can_print_faithful"); - } diff --git a/tika-core/src/main/java/org/apache/tika/metadata/ClimateForcast.java b/tika-core/src/main/java/org/apache/tika/metadata/ClimateForcast.java index 9ad1632837..5c9772d8e3 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/ClimateForcast.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/ClimateForcast.java @@ -17,8 +17,8 @@ package org.apache.tika.metadata; /** - * Met keys from NCAR CCSM files in the Climate Forecast Convention. + * Met keys from NCAR CCSM files in the Climate Forecast + * Convention. */ public interface ClimateForcast { @@ -51,5 +51,4 @@ public interface ClimateForcast { String COMMENT = "comment"; String MODEL_NAME_ENGLISH = "model_name_english"; - } diff --git a/tika-core/src/main/java/org/apache/tika/metadata/CreativeCommons.java b/tika-core/src/main/java/org/apache/tika/metadata/CreativeCommons.java index 122a1fc578..1a57611c4b 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/CreativeCommons.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/CreativeCommons.java @@ -28,5 +28,4 @@ public interface CreativeCommons { String LICENSE_LOCATION = "License-Location"; String WORK_TYPE = "Work-Type"; - } diff --git a/tika-core/src/main/java/org/apache/tika/metadata/DublinCore.java b/tika-core/src/main/java/org/apache/tika/metadata/DublinCore.java index 23750c35da..33c8beb435 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/DublinCore.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/DublinCore.java @@ -29,162 +29,158 @@ public interface DublinCore { String PREFIX_DC_TERMS = "dcterms"; /** - * Typically, Format may include the media-type or dimensions of the - * resource. Format may be used to determine the software, hardware or - * other equipment needed to display or operate the resource. Examples - * of dimensions include size and duration. 
Recommended best practice is - * to select a value from a controlled vocabulary (for example, the list - * of Internet Media Types [MIME] defining computer media formats). + * Typically, Format may include the media-type or dimensions of the resource. Format may be + * used to determine the software, hardware or other equipment needed to display or operate the + * resource. Examples of dimensions include size and duration. Recommended best practice is to + * select a value from a controlled vocabulary (for example, the list of Internet Media Types + * [MIME] defining computer media formats). */ - Property FORMAT = Property.internalText( - PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "format"); + Property FORMAT = + Property.internalText( + PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "format"); /** - * Recommended best practice is to identify the resource by means of - * a string or number conforming to a formal identification system. - * Example formal identification systems include the Uniform Resource - * Identifier (URI) (including the Uniform Resource Locator (URL)), - * the Digital Object Identifier (DOI) and the International Standard - * Book Number (ISBN). + * Recommended best practice is to identify the resource by means of a string or number + * conforming to a formal identification system. Example formal identification systems include + * the Uniform Resource Identifier (URI) (including the Uniform Resource Locator (URL)), the + * Digital Object Identifier (DOI) and the International Standard Book Number (ISBN). */ - Property IDENTIFIER = Property.internalText( - PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "identifier"); + Property IDENTIFIER = + Property.internalText( + PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "identifier"); - /** - * Date on which the resource was changed. 
- */ - Property MODIFIED = Property.internalDate( - PREFIX_DC_TERMS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "modified"); + /** Date on which the resource was changed. */ + Property MODIFIED = + Property.internalDate( + PREFIX_DC_TERMS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "modified"); /** - * An entity responsible for making contributions to the content of the - * resource. Examples of a Contributor include a person, an organisation, - * or a service. Typically, the name of a Contributor should be used to - * indicate the entity. + * An entity responsible for making contributions to the content of the resource. Examples of a + * Contributor include a person, an organisation, or a service. Typically, the name of a + * Contributor should be used to indicate the entity. */ - Property CONTRIBUTOR = Property.internalTextBag( - PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "contributor"); + Property CONTRIBUTOR = + Property.internalTextBag( + PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "contributor"); /** - * The extent or scope of the content of the resource. Coverage will - * typically include spatial location (a place name or geographic - * coordinates), temporal period (a period label, date, or date range) - * or jurisdiction (such as a named administrative entity). Recommended - * best practice is to select a value from a controlled vocabulary (for - * example, the Thesaurus of Geographic Names [TGN]) and that, where - * appropriate, named places or time periods be used in preference to - * numeric identifiers such as sets of coordinates or date ranges. + * The extent or scope of the content of the resource. Coverage will typically include spatial + * location (a place name or geographic coordinates), temporal period (a period label, date, or + * date range) or jurisdiction (such as a named administrative entity). 
Recommended best + * practice is to select a value from a controlled vocabulary (for example, the Thesaurus of + * Geographic Names [TGN]) and that, where appropriate, named places or time periods be used in + * preference to numeric identifiers such as sets of coordinates or date ranges. */ - Property COVERAGE = Property.internalText( - PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "coverage"); + Property COVERAGE = + Property.internalText( + PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "coverage"); /** - * An entity primarily responsible for making the content of the resource. - * Examples of a Creator include a person, an organisation, or a service. - * Typically, the name of a Creator should be used to indicate the entity. + * An entity primarily responsible for making the content of the resource. Examples of a Creator + * include a person, an organisation, or a service. Typically, the name of a Creator should be + * used to indicate the entity. */ - Property CREATOR = Property.internalTextBag( - PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "creator"); + Property CREATOR = + Property.internalTextBag( + PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "creator"); - /** - * Date of creation of the resource. - */ - Property CREATED = Property.internalDate( - PREFIX_DC_TERMS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "created"); + /** Date of creation of the resource. */ + Property CREATED = + Property.internalDate( + PREFIX_DC_TERMS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "created"); /** - * A date associated with an event in the life cycle of the resource. - * Typically, Date will be associated with the creation or availability of - * the resource. Recommended best practice for encoding the date value is - * defined in a profile of ISO 8601 [W3CDTF] and follows the YYYY-MM-DD - * format. + * A date associated with an event in the life cycle of the resource. 
Typically, Date will be + * associated with the creation or availability of the resource. Recommended best practice for + * encoding the date value is defined in a profile of ISO 8601 [W3CDTF] and follows the + * YYYY-MM-DD format. */ - Property DATE = Property.internalDate( - PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "date"); + Property DATE = + Property.internalDate( + PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "date"); /** - * An account of the content of the resource. Description may include - * but is not limited to: an abstract, table of contents, reference to - * a graphical representation of content or a free-text account of - * the content. + * An account of the content of the resource. Description may include but is not limited to: an + * abstract, table of contents, reference to a graphical representation of content or a + * free-text account of the content. */ - Property DESCRIPTION = Property.internalText( - PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "description"); + Property DESCRIPTION = + Property.internalText( + PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "description"); /** - * A language of the intellectual content of the resource. Recommended - * best practice is to use RFC 3066 [RFC3066], which, in conjunction - * with ISO 639 [ISO639], defines two- and three-letter primary language - * tags with optional subtags. Examples include "en" or "eng" for English, - * "akk" for Akkadian, and "en-GB" for English used in the United Kingdom. + * A language of the intellectual content of the resource. Recommended best practice is to use + * RFC 3066 [RFC3066], which, in conjunction with ISO 639 [ISO639], defines two- and + * three-letter primary language tags with optional subtags. Examples include "en" or "eng" for + * English, "akk" for Akkadian, and "en-GB" for English used in the United Kingdom. 
*/ - Property LANGUAGE = Property.internalText( - PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "language"); + Property LANGUAGE = + Property.internalText( + PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "language"); /** - * An entity responsible for making the resource available. Examples of - * a Publisher include a person, an organisation, or a service. Typically, - * the name of a Publisher should be used to indicate the entity. + * An entity responsible for making the resource available. Examples of a Publisher include a + * person, an organisation, or a service. Typically, the name of a Publisher should be used to + * indicate the entity. */ - Property PUBLISHER = Property.internalText( - PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "publisher"); + Property PUBLISHER = + Property.internalText( + PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "publisher"); /** - * A reference to a related resource. Recommended best practice is to - * reference the resource by means of a string or number conforming to - * a formal identification system. + * A reference to a related resource. Recommended best practice is to reference the resource by + * means of a string or number conforming to a formal identification system. */ - Property RELATION = Property.internalText( - PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "relation"); + Property RELATION = + Property.internalText( + PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "relation"); /** - * Information about rights held in and over the resource. Typically, - * a Rights element will contain a rights management statement for - * the resource, or reference a service providing such information. - * Rights information often encompasses Intellectual Property Rights - * (IPR), Copyright, and various Property Rights. 
If the Rights element - * is absent, no assumptions can be made about the status of these and - * other rights with respect to the resource. + * Information about rights held in and over the resource. Typically, a Rights element will + * contain a rights management statement for the resource, or reference a service providing such + * information. Rights information often encompasses Intellectual Property Rights (IPR), + * Copyright, and various Property Rights. If the Rights element is absent, no assumptions can + * be made about the status of these and other rights with respect to the resource. */ - Property RIGHTS = Property.internalText( - PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "rights"); + Property RIGHTS = + Property.internalText( + PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "rights"); /** - * A reference to a resource from which the present resource is derived. - * The present resource may be derived from the Source resource in whole - * or in part. Recommended best practice is to reference the resource by - * means of a string or number conforming to a formal identification + * A reference to a resource from which the present resource is derived. The present resource + * may be derived from the Source resource in whole or in part. Recommended best practice is to + * reference the resource by means of a string or number conforming to a formal identification * system. */ - Property SOURCE = Property.internalText( - PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "source"); + Property SOURCE = + Property.internalText( + PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "source"); /** - * The topic of the content of the resource. Typically, a Subject will - * be expressed as keywords, key phrases or classification codes that - * describe a topic of the resource. Recommended best practice is to - * select a value from a controlled vocabulary or formal classification - * scheme. 
+ * The topic of the content of the resource. Typically, a Subject will be expressed as keywords, + * key phrases or classification codes that describe a topic of the resource. Recommended best + * practice is to select a value from a controlled vocabulary or formal classification scheme. */ - Property SUBJECT = Property.internalTextBag( - PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "subject"); + Property SUBJECT = + Property.internalTextBag( + PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "subject"); /** - * A name given to the resource. Typically, a Title will be a name by - * which the resource is formally known. + * A name given to the resource. Typically, a Title will be a name by which the resource is + * formally known. */ - Property TITLE = Property.internalText( - PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "title"); + Property TITLE = + Property.internalText( + PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "title"); /** - * The nature or genre of the content of the resource. Type includes terms - * describing general categories, functions, genres, or aggregation levels - * for content. Recommended best practice is to select a value from a - * controlled vocabulary (for example, the DCMI Type Vocabulary - * [DCMITYPE]). To describe the physical or digital manifestation of - * the resource, use the Format element. + * The nature or genre of the content of the resource. Type includes terms describing general + * categories, functions, genres, or aggregation levels for content. Recommended best practice + * is to select a value from a controlled vocabulary (for example, the DCMI Type Vocabulary + * [DCMITYPE]). To describe the physical or digital manifestation of the resource, use the + * Format element. 
*/ - Property TYPE = Property.internalText( - PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "type"); - + Property TYPE = + Property.internalText( + PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "type"); } diff --git a/tika-core/src/main/java/org/apache/tika/metadata/Epub.java b/tika-core/src/main/java/org/apache/tika/metadata/Epub.java index c6e3c3c33a..b967e04966 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/Epub.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/Epub.java @@ -26,11 +26,12 @@ public interface Epub { String EPUB_PREFIX = "epub" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER; /** - * This is set to "pre-paginated" if any itemref on the spine or the - * metadata has a "pre-paginated" value, "reflowable" otherwise. + * This is set to "pre-paginated" if any itemref on the spine or the metadata has a + * "pre-paginated" value, "reflowable" otherwise. */ - Property RENDITION_LAYOUT = Property.externalClosedChoise(EPUB_PREFIX + "rendition:layout", - "pre-paginated", "reflowable"); + Property RENDITION_LAYOUT = + Property.externalClosedChoise( + EPUB_PREFIX + "rendition:layout", "pre-paginated", "reflowable"); Property VERSION = Property.externalText(EPUB_PREFIX + "version"); } diff --git a/tika-core/src/main/java/org/apache/tika/metadata/ExternalProcess.java b/tika-core/src/main/java/org/apache/tika/metadata/ExternalProcess.java index 8636969f12..6bd97c746b 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/ExternalProcess.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/ExternalProcess.java @@ -20,60 +20,65 @@ public interface ExternalProcess { String PREFIX_EXTERNAL_META = "external-process"; - /** - * STD_OUT - */ - Property STD_OUT = Property.externalText( - PREFIX_EXTERNAL_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "stdout"); + /** STD_OUT */ + Property STD_OUT = + Property.externalText( + PREFIX_EXTERNAL_META + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + 
"stdout"); - /** - * STD_ERR - */ - Property STD_ERR = Property.externalText( - PREFIX_EXTERNAL_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "stderr"); - - - /** - * Whether or not stdout was truncated - */ - Property STD_OUT_IS_TRUNCATED = Property.externalBoolean( - PREFIX_EXTERNAL_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + - "stdout-truncated"); + /** STD_ERR */ + Property STD_ERR = + Property.externalText( + PREFIX_EXTERNAL_META + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "stderr"); - /** - * Whether or not stderr was truncated - */ - Property STD_ERR_IS_TRUNCATED = Property.externalBoolean( - PREFIX_EXTERNAL_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + - "stderr-truncated"); + /** Whether or not stdout was truncated */ + Property STD_OUT_IS_TRUNCATED = + Property.externalBoolean( + PREFIX_EXTERNAL_META + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "stdout-truncated"); - /** - * Stdout length whether or not it was truncated. If it was truncated, - * what would its length have been; if it wasn't, what is its length. - */ - Property STD_OUT_LENGTH = Property.externalReal( - PREFIX_EXTERNAL_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + - "stdout-length"); + /** Whether or not stderr was truncated */ + Property STD_ERR_IS_TRUNCATED = + Property.externalBoolean( + PREFIX_EXTERNAL_META + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "stderr-truncated"); /** - * Stderr length whether or not it was truncated. If it was truncated, - * what would its length have been; if it wasn't, what is its length. + * Stdout length whether or not it was truncated. If it was truncated, what would its length + * have been; if it wasn't, what is its length. 
*/ - Property STD_ERR_LENGTH = Property.externalReal( - PREFIX_EXTERNAL_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + - "stderr-length"); + Property STD_OUT_LENGTH = + Property.externalReal( + PREFIX_EXTERNAL_META + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "stdout-length"); /** - * Exit value of the sub process + * Stderr length whether or not it was truncated. If it was truncated, what would its length + * have been; if it wasn't, what is its length. */ - Property EXIT_VALUE = Property.externalInteger( - PREFIX_EXTERNAL_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + - "exit-value"); + Property STD_ERR_LENGTH = + Property.externalReal( + PREFIX_EXTERNAL_META + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "stderr-length"); - /** - * Was the process timed out - */ - Property IS_TIMEOUT = Property.externalBoolean( - PREFIX_EXTERNAL_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "timeout"); + /** Exit value of the sub process */ + Property EXIT_VALUE = + Property.externalInteger( + PREFIX_EXTERNAL_META + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "exit-value"); + /** Was the process timed out */ + Property IS_TIMEOUT = + Property.externalBoolean( + PREFIX_EXTERNAL_META + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "timeout"); } diff --git a/tika-core/src/main/java/org/apache/tika/metadata/FileSystem.java b/tika-core/src/main/java/org/apache/tika/metadata/FileSystem.java index 87afab71c4..1ddda08e31 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/FileSystem.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/FileSystem.java @@ -16,9 +16,7 @@ */ package org.apache.tika.metadata; -/** - * A collection of metadata elements for file system level metadata - */ +/** A collection of metadata elements for file system level metadata */ public interface FileSystem { final String PREFIX = "fs:"; @@ -26,5 +24,4 @@ public interface FileSystem { Property CREATED = Property.externalDate(PREFIX + 
"created"); Property MODIFIED = Property.externalDate(PREFIX + "modified"); Property ACCESSED = Property.externalDate(PREFIX + "accessed"); - } diff --git a/tika-core/src/main/java/org/apache/tika/metadata/Font.java b/tika-core/src/main/java/org/apache/tika/metadata/Font.java index 706e199dbc..8e20bd9a94 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/Font.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/Font.java @@ -1,28 +1,26 @@ package org.apache.tika.metadata; /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ public interface Font { String PREFIX_FONT_META = "font"; - /** - * Basic name of a font used in a file - */ - Property FONT_NAME = Property.internalTextBag( - PREFIX_FONT_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "name"); - + /** Basic name of a font used in a file */ + Property FONT_NAME = + Property.internalTextBag( + PREFIX_FONT_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "name"); } diff --git a/tika-core/src/main/java/org/apache/tika/metadata/Geographic.java b/tika-core/src/main/java/org/apache/tika/metadata/Geographic.java index 3c4006f2ef..df6f427f01 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/Geographic.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/Geographic.java @@ -17,29 +17,20 @@ package org.apache.tika.metadata; /** - * Geographic schema. This is a collection of - * {@link Property property definition} constants for geographic - * information, as defined in the W3C Geo Vocabularies. + * Geographic schema. This is a collection of {@link Property property definition} constants for + * geographic information, as defined in the W3C Geo Vocabularies. 
* - * @see W3C Basic Geo Vocabulary + * @see W3C Basic Geo Vocabulary * @since Apache Tika 0.8 */ public interface Geographic { - /** - * The WGS84 Latitude of the Point - */ + /** The WGS84 Latitude of the Point */ Property LATITUDE = Property.internalReal("geo:lat"); - /** - * The WGS84 Longitude of the Point - */ + /** The WGS84 Longitude of the Point */ Property LONGITUDE = Property.internalReal("geo:long"); - /** - * The WGS84 Altitude of the Point - */ + /** The WGS84 Altitude of the Point */ Property ALTITUDE = Property.internalReal("geo:alt"); - } diff --git a/tika-core/src/main/java/org/apache/tika/metadata/HTML.java b/tika-core/src/main/java/org/apache/tika/metadata/HTML.java index 3e37cf6326..0c8533d4d2 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/HTML.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/HTML.java @@ -1,29 +1,28 @@ package org.apache.tika.metadata; /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ public interface HTML { String PREFIX_HTML_META = "html_meta"; - /** - * If a script element contains a src value, this value - * is set in the embedded document's metadata + * If a script element contains a src value, this value is set in the embedded document's + * metadata */ - Property SCRIPT_SOURCE = Property.internalText( - PREFIX_HTML_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "scriptSrc"); - + Property SCRIPT_SOURCE = + Property.internalText( + PREFIX_HTML_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "scriptSrc"); } diff --git a/tika-core/src/main/java/org/apache/tika/metadata/HttpHeaders.java b/tika-core/src/main/java/org/apache/tika/metadata/HttpHeaders.java index 937f365acb..b6ed1bd35f 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/HttpHeaders.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/HttpHeaders.java @@ -19,8 +19,8 @@ /** * A collection of HTTP header names. 
* - * @see Hypertext Transfer Protocol -- - * HTTP/1.1 (RFC 2616) + * @see Hypertext Transfer Protocol -- HTTP/1.1 (RFC + * 2616) */ public interface HttpHeaders { @@ -39,5 +39,4 @@ public interface HttpHeaders { String CONTENT_TYPE = "Content-Type"; String LOCATION = "Location"; - } diff --git a/tika-core/src/main/java/org/apache/tika/metadata/IPTC.java b/tika-core/src/main/java/org/apache/tika/metadata/IPTC.java index f5fa6442b6..0dc4073233 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/IPTC.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/IPTC.java @@ -22,12 +22,13 @@ /** * IPTC photo metadata schema. - *

- * A collection of - * {@link Property property definition} constants for the photo metadata + * + *

A collection of {@link Property property definition} constants for the photo metadata * properties defined in the IPTC standard. * - * @see IPTC Photo Metadata + * @see IPTC + * Photo Metadata * @since Apache Tika 1.1 */ public interface IPTC { @@ -41,69 +42,66 @@ public interface IPTC { String PREFIX_PLUS = "plus"; /** - * Name of the city the content is focussing on -- either the place shown - * in visual media or referenced by text or audio media. This element is at - * the third level of a top-down geographical hierarchy. - *

- * This is a detail of a location with blurred semantics as it does not - * clearly indicate whether it is the location in the image or the location - * the photo was taken - which can be different. Two more concise properties - * are available in IPTC Extension with Location Created and Location Shown - * in the Image. - *

- * Maps to this IIM property: 2:90 City + * Name of the city the content is focussing on -- either the place shown in visual media or + * referenced by text or audio media. This element is at the third level of a top-down + * geographical hierarchy. + * + *

This is a detail of a location with blurred semantics as it does not clearly indicate + * whether it is the location in the image or the location the photo was taken - which can be + * different. Two more concise properties are available in IPTC Extension with Location Created + * and Location Shown in the Image. + * + *

Maps to this IIM property: 2:90 City * * @see Photoshop#CITY */ Property CITY = Photoshop.CITY; /** - * Full name of the country the content is focussing on -- either the - * country shown in visual media or referenced in text or audio media. This - * element is at the top/first level of a top- down geographical hierarchy. - * The full name should be expressed as a verbal name and not as a code, a - * code should go to the element "CountryCode" - *

- * This is a detail of a location with blurred semantics as it does not - * clearly indicate whether it is the location in the image or the location - * the photo was taken - which can be different. Two more concise properties - * are available in IPTC Extension with Location Created and Location Shown - * in the Image. - *

- * Maps to this IIM property: 2:101 Country/Primary Location Name + * Full name of the country the content is focussing on -- either the country shown in visual + * media or referenced in text or audio media. This element is at the top/first level of a top- + * down geographical hierarchy. The full name should be expressed as a verbal name and not as a + * code, a code should go to the element "CountryCode" + * + *

This is a detail of a location with blurred semantics as it does not clearly indicate + * whether it is the location in the image or the location the photo was taken - which can be + * different. Two more concise properties are available in IPTC Extension with Location Created + * and Location Shown in the Image. + * + *

Maps to this IIM property: 2:101 Country/Primary Location Name * * @see Photoshop#COUNTRY */ Property COUNTRY = Photoshop.COUNTRY; /** - * Code of the country the content is focussing on -- either the country - * shown in visual media or referenced in text or audio media. This element - * is at the top/first level of a top-down geographical hierarchy. The code - * should be taken from ISO 3166 two or three letter code. The full name of - * a country should go to the "Country" element. - *

- * This is a detail of a location with blurred semantics as it does not - * clearly indicate whether it is the location in the image or the location - * the photo was taken - which can be different. Two more concise properties - * are available in IPTC Extension with Location Created and Location Shown - * in the Image. - *

- * Maps to this IIM property: 2:100 Country/Primary Location Code - */ - Property COUNTRY_CODE = Property.internalText( - PREFIX_IPTC_CORE + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "CountryCode"); - - /** - * A textual description, including captions, of the item's content, - * particularly used where the object is not text. - *

- * Note: the XMP property (dc:description) which stores the value of this - * IPTC Core property is of type Lang Alt. Hence any software agent dealing - * with this property must abide to the processing rules for - * Lang Alt value type as specified by the XMP specifications. - *

- * Maps to this IIM property: 2:120 Caption/Abstract + * Code of the country the content is focussing on -- either the country shown in visual media + * or referenced in text or audio media. This element is at the top/first level of a top-down + * geographical hierarchy. The code should be taken from ISO 3166 two or three letter code. The + * full name of a country should go to the "Country" element. + * + *

This is a detail of a location with blurred semantics as it does not clearly indicate + * whether it is the location in the image or the location the photo was taken - which can be + * different. Two more concise properties are available in IPTC Extension with Location Created + * and Location Shown in the Image. + * + *

Maps to this IIM property: 2:100 Country/Primary Location Code + */ + Property COUNTRY_CODE = + Property.internalText( + PREFIX_IPTC_CORE + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "CountryCode"); + + /** + * A textual description, including captions, of the item's content, particularly used where the + * object is not text. + * + *

Note: the XMP property (dc:description) which stores the value of this IPTC Core property + * is of type Lang Alt. Hence any software agent dealing with this property must abide to the + * processing rules for Lang Alt value type as specified by the XMP specifications. + * + *

Maps to this IIM property: 2:120 Caption/Abstract * * @see DublinCore#DESCRIPTION */ @@ -111,264 +109,246 @@ public interface IPTC { /** * A brief synopsis of the caption. Headline is not the same as Title. - *

- * Maps to this IIM property: 2:105 Headline + * + *

Maps to this IIM property: 2:105 Headline * * @see Photoshop#HEADLINE */ Property HEADLINE = Photoshop.HEADLINE; /** - * Describes the nature, intellectual, artistic or journalistic - * characteristic of a item, not specifically its content. - *

- * The IPTC recognizes that the corresponding IPTC Genre NewsCodes needs - * photo specific extension to be better usable with this field (as of the - * release of this standard in the year 2008). - *

- * Maps to this IIM property: 2:04 Object Attribute Reference + * Describes the nature, intellectual, artistic or journalistic characteristic of a item, not + * specifically its content. + * + *

The IPTC recognizes that the corresponding IPTC Genre NewsCodes needs photo specific + * extension to be better usable with this field (as of the release of this standard in the year + * 2008). + * + *

Maps to this IIM property: 2:04 Object Attribute Reference */ - Property INTELLECTUAL_GENRE = Property.internalText( - PREFIX_IPTC_CORE + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "IntellectualGenre"); + Property INTELLECTUAL_GENRE = + Property.internalText( + PREFIX_IPTC_CORE + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "IntellectualGenre"); /** - * Keywords to express the subject of the content. Keywords may be free - * text and don't have to be taken from a controlled vocabulary. Codes from - * the controlled vocabulary IPTC Subject NewsCodes must go to the - * "Subject Code" field. - *

- * Single values of this field should not be restricted to single words - * but must allow for phrases as well. - *

- * Maps to this IIM property: 2:25 Keywords + * Keywords to express the subject of the content. Keywords may be free text and don't have to + * be taken from a controlled vocabulary. Codes from the controlled vocabulary IPTC Subject + * NewsCodes must go to the "Subject Code" field. + * + *

Single values of this field should not be restricted to single words but must allow for + * phrases as well. + * + *

Maps to this IIM property: 2:25 Keywords * * @see DublinCore#SUBJECT */ Property KEYWORDS = DublinCore.SUBJECT; /** - * Name of the subregion of a country -- either called province or state or - * anything else -- the content is focussing on -- either the subregion - * shown in visual media or referenced by text or audio media. This element - * is at the second level of a top-down geographical hierarchy. - *

- * This is a detail of a location with blurred semantics as it does not - * clearly indicate whether it is the location in the image or the location - * the photo was taken - which can be different. Two more concise properties - * are available in IPTC Extension with Location Created and Location Shown - * in the Image. - *

- * Maps to this IIM property: 2:95 Province/State + * Name of the subregion of a country -- either called province or state or anything else -- the + * content is focussing on -- either the subregion shown in visual media or referenced by text + * or audio media. This element is at the second level of a top-down geographical hierarchy. + * + *

This is a detail of a location with blurred semantics as it does not clearly indicate + * whether it is the location in the image or the location the photo was taken - which can be + * different. Two more concise properties are available in IPTC Extension with Location Created + * and Location Shown in the Image. + * + *

Maps to this IIM property: 2:95 Province/State * * @see Photoshop#STATE */ Property PROVINCE_OR_STATE = Photoshop.STATE; /** - * Describes the scene of a news content. Specifies one or more terms - * from the IPTC "Scene-NewsCodes". Each Scene is represented as a string of - * 6 digits in an unordered list. - *

- * Note: Only Scene values from this IPTC taxonomy should be used here. More - * about the IPTC Scene-NewsCodes at www.newscodes.org. - */ - Property SCENE_CODE = Property.internalTextBag( - PREFIX_IPTC_CORE + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Scene"); - - /** - * Specifies one or more Subjects from the IPTC Subject-NewsCodes taxonomy - * to categorise the content. Each Subject is represented as a string of 8 - * digits in an unordered list. - *

- * Note: Only Subjects from a controlled vocabulary should be used here, - * free text has to be put into the Keyword element. More about - * IPTC Subject-NewsCodes at www.newscodes.org. - */ - Property SUBJECT_CODE = Property.internalTextBag( - PREFIX_IPTC_CORE + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "SubjectCode"); - - /** - * Name of a sublocation the content is focussing on -- either the - * location shown in visual media or referenced by text or audio media. This - * location name could either be the name of a sublocation to a city or the - * name of a well known location or (natural) monument outside a city. In - * the sense of a sublocation to a city this element is at the fourth level - * of a top-down geographical hierarchy. - *

- * This is a detail of a location with blurred semantics as it does not - * clearly indicate whether it is the location in the image or the location - * the photo was taken - which can be different. Two more concise properties - * are available in IPTC Extension with Location Created and Location Shown - * in the Image. - *

- * Maps to this IIM property: 2:92 Sublocation - */ - Property SUBLOCATION = Property.internalText( - PREFIX_IPTC_CORE + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Location"); - - /** - * Designates the date and optionally the time the intellectual content was - * created rather than the date of the creation of the physical - * representation. - *

- * If a software system requires explicit time values and no time is given - * by the Date Created property the software system should default the time - * to 00:00:00. If the software system does not require an explicit time - * value the time part should be left empty as it is. - *

- * Note 1: Any content of the IIM dataset 2:60, Time Created, should be - * merged to this element. - * Note 2: Implementers are encouraged to provide - * the creation date and time from the EXIF data of a digital - * camera to the user for entering this date for the first time. - *

- * Maps to this IIM property: 2:55 Date Created + * Describes the scene of a news content. Specifies one or more terms from the IPTC + * "Scene-NewsCodes". Each Scene is represented as a string of 6 digits in an unordered list. + * + *

Note: Only Scene values from this IPTC taxonomy should be used here. More about the IPTC + * Scene-NewsCodes at www.newscodes.org. + */ + Property SCENE_CODE = + Property.internalTextBag( + PREFIX_IPTC_CORE + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Scene"); + + /** + * Specifies one or more Subjects from the IPTC Subject-NewsCodes taxonomy to categorise the + * content. Each Subject is represented as a string of 8 digits in an unordered list. + * + *

Note: Only Subjects from a controlled vocabulary should be used here, free text has to be + * put into the Keyword element. More about IPTC Subject-NewsCodes at www.newscodes.org. + */ + Property SUBJECT_CODE = + Property.internalTextBag( + PREFIX_IPTC_CORE + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "SubjectCode"); + + /** + * Name of a sublocation the content is focussing on -- either the location shown in visual + * media or referenced by text or audio media. This location name could either be the name of a + * sublocation to a city or the name of a well known location or (natural) monument outside a + * city. In the sense of a sublocation to a city this element is at the fourth level of a + * top-down geographical hierarchy. + * + *

This is a detail of a location with blurred semantics as it does not clearly indicate + * whether it is the location in the image or the location the photo was taken - which can be + * different. Two more concise properties are available in IPTC Extension with Location Created + * and Location Shown in the Image. + * + *

Maps to this IIM property: 2:92 Sublocation + */ + Property SUBLOCATION = + Property.internalText( + PREFIX_IPTC_CORE + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Location"); + + /** + * Designates the date and optionally the time the intellectual content was created rather than + * the date of the creation of the physical representation. + * + *

If a software system requires explicit time values and no time is given by the Date + * Created property the software system should default the time to 00:00:00. If the software + * system does not require an explicit time value the time part should be left empty as it is. + * + *

Note 1: Any content of the IIM dataset 2:60, Time Created, should be merged to this + * element. Note 2: Implementers are encouraged to provide the creation date and time from the + * EXIF data of a digital camera to the user for entering this date for the first time. + * + *

Maps to this IIM property: 2:55 Date Created * * @see Photoshop#DATE_CREATED */ Property DATE_CREATED = Photoshop.DATE_CREATED; /** - * Identifier or the name of the person involved in writing, editing or - * correcting the description of the content. - *

- * Maps to this IIM property: 2:122 Writer/Editor + * Identifier or the name of the person involved in writing, editing or correcting the + * description of the content. + * + *

Maps to this IIM property: 2:122 Writer/Editor * * @see Photoshop#CAPTION_WRITER */ Property DESCRIPTION_WRITER = Photoshop.CAPTION_WRITER; /** - * Any of a number of instructions from the provider or creator to the - * receiver of the item. - *

- * Maps to this IIM property: 2:40 Special Instruction + * Any of a number of instructions from the provider or creator to the receiver of the item. + * + *

Maps to this IIM property: 2:40 Special Instruction * * @see Photoshop#INSTRUCTIONS */ Property INSTRUCTIONS = Photoshop.INSTRUCTIONS; /** - * Number or identifier for the purpose of improved workflow handling. This - * is a user created identifier related to the job for which the item is - * supplied. - *

- * Note: As this identifier references a job of the receiver's workflow it - * must first be issued by the receiver, then transmitted to the creator or - * provider of the news object and finally added by the creator - * to this field. - *

- * Maps to this IIM property: 2:103 Original Transmission Reference + * Number or identifier for the purpose of improved workflow handling. This is a user created + * identifier related to the job for which the item is supplied. + * + *

Note: As this identifier references a job of the receiver's workflow it must first be + * issued by the receiver, then transmitted to the creator or provider of the news object and + * finally added by the creator to this field. + * + *

Maps to this IIM property: 2:103 Original Transmission Reference * * @see Photoshop#TRANSMISSION_REFERENCE */ Property JOB_ID = Photoshop.TRANSMISSION_REFERENCE; /** - * A shorthand reference for the item. Title provides a short human readable - * name which can be a text and/or numeric reference. It is not the same as - * Headline. - *

- * Many use the Title field to store the filename of the image, though the - * field may be used in many ways. Formal identifiers are provided by the - * Digital Image Id, or the Registry Entry property of the IPTC Extension. - *

- * Note 1: This element aligns with the use of Dublin Core's "Title" - * element. - * Note 2: the XMP property (dc:title) which stores the value of - * this IPTC Core property is of type Lang Alt. Hence any software agent - * dealing with this property must abide to the processing rules for Lang - * Alt value type as specified by the XMP specifications. - *

- * Maps to this IIM property: 2:05 Object Name + * A shorthand reference for the item. Title provides a short human readable name which can be a + * text and/or numeric reference. It is not the same as Headline. + * + *

Many use the Title field to store the filename of the image, though the field may be used + * in many ways. Formal identifiers are provided by the Digital Image Id, or the Registry Entry + * property of the IPTC Extension. + * + *

Note 1: This element aligns with the use of Dublin Core's "Title" element. Note 2: the XMP + * property (dc:title) which stores the value of this IPTC Core property is of type Lang Alt. + * Hence any software agent dealing with this property must abide to the processing rules for + * Lang Alt value type as specified by the XMP specifications. + * + *

Maps to this IIM property: 2:05 Object Name * * @see DublinCore#TITLE */ Property TITLE = DublinCore.TITLE; /** - * Contains any necessary copyright notice for claiming the intellectual - * property for this item and should identify the current owner of the - * copyright for the item. Other entities like the creator of the item may - * be added in the corresponding field. Notes on usage rights should be + * Contains any necessary copyright notice for claiming the intellectual property for this item + * and should identify the current owner of the copyright for the item. Other entities like the + * creator of the item may be added in the corresponding field. Notes on usage rights should be * provided in "Rights usage terms". - *

- * Copyright ownership can be expressed in a more controlled way using the - * PLUS fields "Copyright Owner", "Copyright Owner ID", - * "Copyright Owner Name" of the IPTC Extension. It is the user's - * responsibility to keep the values of the four fields in sync. - *

- * Note: the XMP property (dc:rights) which stores the value of this IPTC - * Core property is of type Lang Alt. Hence any software agent dealing with - * this property must abide to the processing rules for Lang Alt - * value type as specified by the XMP specifications. - *

- * Maps to this IIM property: 2:116 Copyright Notice + * + *

Copyright ownership can be expressed in a more controlled way using the PLUS fields + * "Copyright Owner", "Copyright Owner ID", "Copyright Owner Name" of the IPTC Extension. It is + * the user's responsibility to keep the values of the four fields in sync. + * + *

Note: the XMP property (dc:rights) which stores the value of this IPTC Core property is of + * type Lang Alt. Hence any software agent dealing with this property must abide to the + * processing rules for Lang Alt value type as specified by the XMP specifications. + * + *

Maps to this IIM property: 2:116 Copyright Notice * * @see DublinCore#RIGHTS */ Property COPYRIGHT_NOTICE = DublinCore.RIGHTS; /** - * Contains the name of the person who created the content of this item, a - * photographer for photos, a graphic artist for graphics, or a writer for - * textual news, but in cases where the photographer should not be - * identified the name of a company or organisation may be appropriate. - *

- * The creator can be expressed in a more controlled way using the - * "Image Creator" of PLUS in the IPTC Extension additionally. It is the - * user's responsibility to keep the values of the IPTC Core and the PLUS - * fields in sync. - *

- * Maps to this IIM property: 2:80 By-line + * Contains the name of the person who created the content of this item, a photographer for + * photos, a graphic artist for graphics, or a writer for textual news, but in cases where the + * photographer should not be identified the name of a company or organisation may be + * appropriate. + * + *

The creator can be expressed in a more controlled way using the "Image Creator" of PLUS in + * the IPTC Extension additionally. It is the user's responsibility to keep the values of the + * IPTC Core and the PLUS fields in sync. + * + *

Maps to this IIM property: 2:80 By-line * * @see DublinCore#CREATOR */ Property CREATOR = DublinCore.CREATOR; /** - * The creator's contact information provides all necessary information to - * get in contact with the creator of this item and comprises a set of - * sub-properties for proper addressing. - *

- * The IPTC Extension Licensor fields should be used instead of these - * Creator's Contact Info fields if you are using IPTC Extension fields. If - * the creator is also the licensor his or her contact information should be - * provided in the Licensor fields. - *

- * Note 1 to user interface implementers: All sub-properties of "Creator's - * contact information" should be shown as group on the form. - * Note 2: the - * CreatorContactInfo sub-properties' naming aligns with the vCard - * specification RFC 2426. + * The creator's contact information provides all necessary information to get in contact with + * the creator of this item and comprises a set of sub-properties for proper addressing. + * + *

The IPTC Extension Licensor fields should be used instead of these Creator's Contact Info + * fields if you are using IPTC Extension fields. If the creator is also the licensor his or her + * contact information should be provided in the Licensor fields. + * + *

Note 1 to user interface implementers: All sub-properties of "Creator's contact + * information" should be shown as group on the form. Note 2: the CreatorContactInfo + * sub-properties' naming aligns with the vCard specification RFC 2426. */ - Property CREATORS_CONTACT_INFO = Property.internalText( - PREFIX_IPTC_CORE + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + - "CreatorContactInfo"); + Property CREATORS_CONTACT_INFO = + Property.internalText( + PREFIX_IPTC_CORE + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "CreatorContactInfo"); /** - * Contains the job title of the person who created the content of this - * item. As this is sort of a qualifier the Creator element has to be filled - * in as mandatory prerequisite for using Creator's Jobtitle. - *

- * Maps to this IIM property: 2:85 By-line Title + * Contains the job title of the person who created the content of this item. As this is sort of + * a qualifier the Creator element has to be filled in as mandatory prerequisite for using + * Creator's Jobtitle. + * + *

Maps to this IIM property: 2:85 By-line Title * * @see Photoshop#AUTHORS_POSITION */ Property CREATORS_JOB_TITLE = Photoshop.AUTHORS_POSITION; /** - * The credit to person(s) and/or organisation(s) required by the supplier - * of the item to be used when published. This is a free-text field. - *

- * Note 1: For more formal identifications of the creator or the owner of - * the copyrights of this image other rights properties may be used. - * Note 2: - * This property was named "Credit" by the IIM metadata, then it was renamed - * to "Provider" in IPTC Core 1.0. In IPTC Core 1.1. it has been renamed to - * "Credit Line" as the field is used for this purpose by many users. - *

- * Maps to this IIM property: 2:110 Credit + * The credit to person(s) and/or organisation(s) required by the supplier of the item to be + * used when published. This is a free-text field. + * + *

Note 1: For more formal identifications of the creator or the owner of the copyrights of + * this image other rights properties may be used. Note 2: This property was named "Credit" by + * the IIM metadata, then it was renamed to "Provider" in IPTC Core 1.0. In IPTC Core 1.1. it + * has been renamed to "Credit Line" as the field is used for this purpose by many users. + * + *

Maps to this IIM property: 2:110 Credit * * @see Photoshop#CREDIT */ @@ -376,900 +356,1048 @@ public interface IPTC { /** * The licensing parameters of the item expressed in free-text. - *

- * The PLUS fields of the IPTC Extension can be used in parallel to express - * the licensed usage in more controlled terms. + * + *

The PLUS fields of the IPTC Extension can be used in parallel to express the licensed + * usage in more controlled terms. */ Property RIGHTS_USAGE_TERMS = XMPRights.USAGE_TERMS; /** - * Identifies the original owner of the copyright for the intellectual - * content of the item. This could be an agency, a member of an agency or an - * individual. Source could be different from Creator and from the entities - * in the CopyrightNotice. - *

- * The original owner can never change. For that reason the content of this - * property should never be changed or deleted after the information is - * entered following the news object's initial creation. - *

- * Maps to this IIM property: 2:115 Source + * Identifies the original owner of the copyright for the intellectual content of the item. This + * could be an agency, a member of an agency or an individual. Source could be different from + * Creator and from the entities in the CopyrightNotice. + * + *

The original owner can never change. For that reason the content of this property should + * never be changed or deleted after the information is entered following the news object's + * initial creation. + * + *

Maps to this IIM property: 2:115 Source * * @see Photoshop#SOURCE */ Property SOURCE = Photoshop.SOURCE; /** - * The contact information address part. Comprises an optional company name - * and all required information to locate the building or postbox to which - * mail should be sent. To that end, the address is a multiline field. - *

- * Note 1: to user interface implementers: This field should be part of a - * "Contact information" group on the form. - * Note 2: the ContactInfo naming aligns with the vCard specification RFC 2426. + * The contact information address part. Comprises an optional company name and all required + * information to locate the building or postbox to which mail should be sent. To that end, the + * address is a multiline field. + * + *

Note 1: to user interface implementers: This field should be part of a "Contact + * information" group on the form. Note 2: the ContactInfo naming aligns with the vCard + * specification RFC 2426. */ - Property CONTACT_INFO_ADDRESS = Property.internalTextBag( - PREFIX_IPTC_CORE + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "CiAdrExtadr"); + Property CONTACT_INFO_ADDRESS = + Property.internalTextBag( + PREFIX_IPTC_CORE + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "CiAdrExtadr"); /** * The contact information city part. - *

- * Note 1: to user interface implementers: This field should be part of a - * "Contact information" group on the form. - * Note 2: the ContactInfo naming aligns with the vCard specification RFC 2426. + * + *

Note 1: to user interface implementers: This field should be part of a "Contact + * information" group on the form. Note 2: the ContactInfo naming aligns with the vCard + * specification RFC 2426. */ - Property CONTACT_INFO_CITY = Property.internalText( - PREFIX_IPTC_CORE + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "CiAdrCity"); + Property CONTACT_INFO_CITY = + Property.internalText( + PREFIX_IPTC_CORE + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "CiAdrCity"); /** * The contact information country part. - *

- * Note 1: to user interface implementers: This field should be part of a - * "Contact information" group on the form. - * Note 2: the ContactInfo naming aligns with the vCard specification RFC 2426. + * + *

Note 1: to user interface implementers: This field should be part of a "Contact + * information" group on the form. Note 2: the ContactInfo naming aligns with the vCard + * specification RFC 2426. */ - Property CONTACT_INFO_COUNTRY = Property.internalText( - PREFIX_IPTC_CORE + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "CiAdrCtry"); + Property CONTACT_INFO_COUNTRY = + Property.internalText( + PREFIX_IPTC_CORE + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "CiAdrCtry"); /** * The contact information email address part. - *

- * Multiple email addresses can be given. May have to be separated by a - * comma in the user interface. - *

- * Note 1: to user interface implementers: This field should be part of a - * "Contact information" group on the form. - * Note 2 to user interface - * implementers: provide sufficient space to fill in multiple e-mail - * addresses. - * Note 3: the ContactInfo naming aligns with the vCard - * specification RFC 2426. + * + *

Multiple email addresses can be given. May have to be separated by a comma in the user + * interface. + * + *

Note 1: to user interface implementers: This field should be part of a "Contact + * information" group on the form. Note 2 to user interface implementers: provide sufficient + * space to fill in multiple e-mail addresses. Note 3: the ContactInfo naming aligns with the + * vCard specification RFC 2426. */ - Property CONTACT_INFO_EMAIL = Property.internalTextBag( - PREFIX_IPTC_CORE + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "CiEmailWork"); + Property CONTACT_INFO_EMAIL = + Property.internalTextBag( + PREFIX_IPTC_CORE + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "CiEmailWork"); /** * The contact information phone number part. - *

- * Multiple numbers can be given. May have to be separated by a - * comma in the user interface. - *

- * Note 1: to user interface implementers: This field should be part of a - * "Contact information" group on the form. - * Note 2 to user interface - * implementers: provide sufficient space to fill in multiple international - * numbers. - * Note 3: the ContactInfo naming aligns with the vCard - * specification RFC 2426. + * + *

Multiple numbers can be given. May have to be separated by a comma in the user interface. + * + *

Note 1: to user interface implementers: This field should be part of a "Contact + * information" group on the form. Note 2 to user interface implementers: provide sufficient + * space to fill in multiple international numbers. Note 3: the ContactInfo naming aligns with + * the vCard specification RFC 2426. */ - Property CONTACT_INFO_PHONE = Property.internalTextBag( - PREFIX_IPTC_CORE + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "CiTelWork"); + Property CONTACT_INFO_PHONE = + Property.internalTextBag( + PREFIX_IPTC_CORE + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "CiTelWork"); /** * The contact information part denoting the local postal code. - *

- * Note 1: to user interface implementers: This field should be part of a - * "Contact information" group on the form. - * Note 2: the ContactInfo naming aligns with the vCard specification RFC 2426. + * + *

Note 1: to user interface implementers: This field should be part of a "Contact + * information" group on the form. Note 2: the ContactInfo naming aligns with the vCard + * specification RFC 2426. */ - Property CONTACT_INFO_POSTAL_CODE = Property.internalText( - PREFIX_IPTC_CORE + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "CiAdrPcode"); + Property CONTACT_INFO_POSTAL_CODE = + Property.internalText( + PREFIX_IPTC_CORE + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "CiAdrPcode"); /** * The contact information part denoting regional information such as state or province. - *

- * Note 1: to user interface implementers: This field should be part of a - * "Contact information" group on the form. - * Note 2: the ContactInfo naming aligns with the vCard specification RFC 2426. + * + *

Note 1: to user interface implementers: This field should be part of a "Contact + * information" group on the form. Note 2: the ContactInfo naming aligns with the vCard + * specification RFC 2426. */ - Property CONTACT_INFO_STATE_PROVINCE = Property.internalText( - PREFIX_IPTC_CORE + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "CiAdrRegion"); + Property CONTACT_INFO_STATE_PROVINCE = + Property.internalText( + PREFIX_IPTC_CORE + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "CiAdrRegion"); /** * The contact information web address part. Multiple addresses can be given, separated by a * comma. - *

- * Note 1: to user interface implementers: This field should be part of a - * "Contact information" group on the form. - * Note 2 to user interface - * implementers: provide sufficient space to fill in multiple URLs. - * Note 3: the ContactInfo naming aligns with the vCard + * + *

Note 1: to user interface implementers: This field should be part of a "Contact + * information" group on the form. Note 2 to user interface implementers: provide sufficient + * space to fill in multiple URLs. Note 3: the ContactInfo naming aligns with the vCard * specification RFC 2426. */ - Property CONTACT_INFO_WEB_URL = Property.internalTextBag( - PREFIX_IPTC_CORE + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "CiUrlWork"); + Property CONTACT_INFO_WEB_URL = + Property.internalTextBag( + PREFIX_IPTC_CORE + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "CiUrlWork"); /** - * As this metadata element pertains to distribution management, it was not - * adopted. However, this data is still synchronised with the XMP property - * [photoshop:Urgency], and hence, available for future use, but outside the - * IPTC Core. + * As this metadata element pertains to distribution management, it was not adopted. However, + * this data is still synchronised with the XMP property [photoshop:Urgency], and hence, + * available for future use, but outside the IPTC Core. * * @deprecated */ - @Deprecated - Property URGENCY = Photoshop.URGENCY; + @Deprecated Property URGENCY = Photoshop.URGENCY; /** - * As this metadata element was earmarked as deprecated already for IIM 4.1, - * it was not adopted. However, this data is still synchronised with the XMP - * property [photoshop:Category], and hence available for future use - but - * outside the IPTC Core. For migrating from Category codes to Subject Codes - * please read the Guideline for mapping Category Codes to Subject NewsCodes + * As this metadata element was earmarked as deprecated already for IIM 4.1, it was not adopted. + * However, this data is still synchronised with the XMP property [photoshop:Category], and + * hence available for future use - but outside the IPTC Core. For migrating from Category codes + * to Subject Codes please read the Guideline for mapping Category Codes to Subject NewsCodes * section below. 
* * @deprecated */ - @Deprecated - Property CATEGORY = Photoshop.CATEGORY; + @Deprecated Property CATEGORY = Photoshop.CATEGORY; /** - * As this metadata element was earmarked as deprecated already for IIM 4.1, - * it was not adopted. However, this data is still synchronised with the XMP - * property [photoshop:SupplementalCategories], and hence available for - * future use - but outside the IPTC Core. + * As this metadata element was earmarked as deprecated already for IIM 4.1, it was not adopted. + * However, this data is still synchronised with the XMP property + * [photoshop:SupplementalCategories], and hence available for future use - but outside the IPTC + * Core. * * @deprecated */ - @Deprecated - Property SUPPLEMENTAL_CATEGORIES = Photoshop.SUPPLEMENTAL_CATEGORIES; - - /** - * Information about the ethnicity and other facets of the model(s) in a - * model-released image. - *

- * Use the Model Age field for the age of model(s). - */ - Property ADDITIONAL_MODEL_INFO = Property.internalText( - PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "AddlModelInfo"); + @Deprecated Property SUPPLEMENTAL_CATEGORIES = Photoshop.SUPPLEMENTAL_CATEGORIES; /** - * A set of metadata about artwork or an object in the item + * Information about the ethnicity and other facets of the model(s) in a model-released image. + * + *

Use the Model Age field for the age of model(s). + */ + Property ADDITIONAL_MODEL_INFO = + Property.internalText( + PREFIX_IPTC_EXT + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "AddlModelInfo"); + + /** A set of metadata about artwork or an object in the item */ + Property ARTWORK_OR_OBJECT = + Property.internalTextBag( + PREFIX_IPTC_EXT + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "ArtworkOrObject"); + + /** A set of metadata about artwork or an object in the item */ + Property ORGANISATION_CODE = + Property.internalTextBag( + PREFIX_IPTC_EXT + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "OrganisationInImageCode"); + + /** + * A term to describe the content of the image by a value from a Controlled Vocabulary. + * + *

This property is part of the Photo Metadata 2008 specifications, but should not released + * to the public on the standard Adobe Custom Panels for IPTC metadata or other user interfaces + * unless agreed by the IPTC. */ - Property ARTWORK_OR_OBJECT = Property.internalTextBag( - PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "ArtworkOrObject"); + Property CONTROLLED_VOCABULARY_TERM = + Property.internalTextBag( + PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "CVterm"); /** - * A set of metadata about artwork or an object in the item + * A location the content of the item is about. For photos that is a location shown in the + * image. + * + *

If the location the image was taken in is different from this location the property + * Location Created should be used too. */ - Property ORGANISATION_CODE = Property.internalTextBag( - PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + - "OrganisationInImageCode"); + Property LOCATION_SHOWN = + Property.internalTextBag( + PREFIX_IPTC_EXT + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "LocationShown"); /** - * A term to describe the content of the image by a value from a Controlled - * Vocabulary. - *

- * This property is part of the Photo Metadata 2008 specifications, but - * should not released to the public on the standard Adobe Custom Panels for - * IPTC metadata or other user interfaces unless agreed by the IPTC. + * Age of the human model(s) at the time this image was taken in a model released image. + * + *

The user should be aware of any legal implications of providing ages for young models. + * Ages below 18 years should not be included. */ - Property CONTROLLED_VOCABULARY_TERM = Property.internalTextBag( - PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "CVterm"); + Property MODEL_AGE = + Property.internalTextBag( + PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "ModelAge"); /** - * A location the content of the item is about. For photos that is a - * location shown in the image. - *

- * If the location the image was taken in is different from this location - * the property Location Created should be used too. + * Name of the organisation or company which is featured in the content. + * + *

May be supplemented by values from a controlled vocabulary in the Organisation Code field. */ - Property LOCATION_SHOWN = Property.internalTextBag( - PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "LocationShown"); + Property ORGANISATION_NAME = + Property.internalTextBag( + PREFIX_IPTC_EXT + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "OrganisationInImageName"); /** - * Age of the human model(s) at the time this image was taken in a model - * released image. - *

- * The user should be aware of any legal implications of providing ages for - * young models. Ages below 18 years should not be included. + * Name of a person the content of the item is about. For photos that is a person shown in the + * image. */ - Property MODEL_AGE = Property.internalTextBag( - PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "ModelAge"); + Property PERSON = + Property.internalTextBag( + PREFIX_IPTC_EXT + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "PersonInImage"); /** - * Name of the organisation or company which is featured in the content. - *

- * May be supplemented by values from a controlled vocabulary in the - * Organisation Code field. - */ - Property ORGANISATION_NAME = Property.internalTextBag( - PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + - "OrganisationInImageName"); - - /** - * Name of a person the content of the item is about. For photos that is a - * person shown in the image. - */ - Property PERSON = Property.internalTextBag( - PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "PersonInImage"); - - /** - * Globally unique identifier for the item. It is created and applied by the - * creator of the item at the time of its creation . This value shall not be - * changed after that time. - *

- * The identifier will probably be generated by the technical means of an - * imaging device or software and should be applied to the digital image - * file as early as possible in its life cycle. This identifier does not - * identify any pictured content, particularly in case of a scan of non- - * digital images, only this digital representation. - *

- * Any algorithm to create this identifier has to comply with the technical - * requirements to create a globally unique id. Any device creating digital - * images - e.g. still image cameras, video cameras, scanners - should - * create such an identifer right at the time of the creation of the digital - * data and add the id to the set of metadata without compromising - * performance. It is recommended that this image identifier allows - * identifying the device by which the image data and the GUID were created. - * IPTC's basic requirements for unique ids are: - * - It must be globally unique. Algorithms for this purpose exist. - * - It should identify the camera body. - * - It should identify each individual photo from this camera body. - * - It should identify the date and time of the creation of the picture. - * - It should be secured against tampering. - * This field should be implemented in a way to prove it has not been changed since its - * value has been applied. If the identifier has been created by the imaging device - * its type and brand can be found in the Exif/technical metadata. - */ - Property DIGITAL_IMAGE_GUID = Property.internalText( - PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "DigImageGUID"); + * Globally unique identifier for the item. It is created and applied by the creator of the item + * at the time of its creation . This value shall not be changed after that time. + * + *

The identifier will probably be generated by the technical means of an imaging device or + * software and should be applied to the digital image file as early as possible in its life + * cycle. This identifier does not identify any pictured content, particularly in case of a scan + * of non- digital images, only this digital representation. + * + *

Any algorithm to create this identifier has to comply with the technical requirements to + * create a globally unique id. Any device creating digital images - e.g. still image cameras, + * video cameras, scanners - should create such an identifer right at the time of the creation + * of the digital data and add the id to the set of metadata without compromising performance. + * It is recommended that this image identifier allows identifying the device by which the image + * data and the GUID were created. IPTC's basic requirements for unique ids are: - It must be + * globally unique. Algorithms for this purpose exist. - It should identify the camera body. - + * It should identify each individual photo from this camera body. - It should identify the date + * and time of the creation of the picture. - It should be secured against tampering. This field + * should be implemented in a way to prove it has not been changed since its value has been + * applied. If the identifier has been created by the imaging device its type and brand can be + * found in the Exif/technical metadata. + */ + Property DIGITAL_IMAGE_GUID = + Property.internalText( + PREFIX_IPTC_EXT + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "DigImageGUID"); /** * The type of the source digital file. - *

- * The IPTC recommends not to implement this property any longer. + * + *

The IPTC recommends not to implement this property any longer. * * @deprecated */ @Deprecated - Property DIGITAL_SOURCE_FILE_TYPE = Property.internalText( - PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + - "DigitalSourcefileType"); + Property DIGITAL_SOURCE_FILE_TYPE = + Property.internalText( + PREFIX_IPTC_EXT + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "DigitalSourcefileType"); + + /** The type of the source of this digital image */ + Property DIGITAL_SOURCE_TYPE = + Property.internalText( + PREFIX_IPTC_EXT + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "DigitalSourceType"); /** - * The type of the source of this digital image + * Names or describes the specific event the content relates to. + * + *

Examples are: a press conference, dedication ceremony, etc. If this is a sub-event of a + * larger event both can be provided by the field: e.g. XXXIX Olympic Summer Games (Beijing): + * opening ceremony. Unplanned events could be named by this property too. */ - Property DIGITAL_SOURCE_TYPE = Property.internalText( - PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "DigitalSourceType"); + Property EVENT = + Property.internalText( + PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Event"); /** - * Names or describes the specific event the content relates to. - *

- * Examples are: a press conference, dedication ceremony, etc. If this is a - * sub-event of a larger event both can be provided by the field: e.g. XXXIX - * Olympic Summer Games (Beijing): opening ceremony. Unplanned events could - * be named by this property too. - */ - Property EVENT = Property.internalText( - PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Event"); - - /** - * Both a Registry Item Id and a Registry Organisation Id to record any - * registration of this item with a registry. - *

- * Typically an id from a registry is negotiated and applied after the - * creation of the digital image. - *

- * Any user interface implementation must show both sub-properties - Item Id - * and Organisation Id - as corresponding values. Further an input to both - * fields should be made mandatory. - */ - Property IMAGE_REGISTRY_ENTRY = Property.internalTextBag( - PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "RegistryId"); - - /** - * Identifies the most recent supplier of the item, who is not necessarily - * its owner or creator. - *

- * For identifying the supplier either a well known and/or registered - * company name or a URL of the company's web site may be used. This - * property succeeds the Provider property of IPTC Core 1.0 by its semantics - * as that Provider was renamed to Credit Line. - *

- * This is a PLUS version 1.2 property included in the IPTC Extension - * schema. - */ - Property IMAGE_SUPPLIER = Property.internalText( - PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "ImageSupplier"); + * Both a Registry Item Id and a Registry Organisation Id to record any registration of this + * item with a registry. + * + *

Typically an id from a registry is negotiated and applied after the creation of the + * digital image. + * + *

Any user interface implementation must show both sub-properties - Item Id and Organisation + * Id - as corresponding values. Further an input to both fields should be made mandatory. + */ + Property IMAGE_REGISTRY_ENTRY = + Property.internalTextBag( + PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "RegistryId"); + + /** + * Identifies the most recent supplier of the item, who is not necessarily its owner or creator. + * + *

For identifying the supplier either a well known and/or registered company name or a URL + * of the company's web site may be used. This property succeeds the Provider property of IPTC + * Core 1.0 by its semantics as that Provider was renamed to Credit Line. + * + *

This is a PLUS version 1.2 property included in the IPTC Extension schema. + */ + Property IMAGE_SUPPLIER = + Property.internalText( + PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "ImageSupplier"); + /** * @deprecated use {@link IPTC#IMAGE_SUPPLIER_ID} */ @Deprecated String IMAGE_SUPPLIER_ID_WRONG_CASE = PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "ImageSupplierId"; + /** - * Identifies the most recent supplier of the item, who is not necessarily - * its owner or creator. - *

- * For identifying the supplier either a well known and/or registered - * company name or a URL of the company's web site may be used. This - * property succeeds the Provider property of IPTC Core 1.0 by its semantics - * as that Provider was renamed to Credit Line. - *

- * This is a PLUS version 1.2 property included in the IPTC Extension - * schema. - */ - Property IMAGE_SUPPLIER_ID = Property.composite(Property.internalText( - PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "ImageSupplierID"), - new Property[]{Property.internalText(IPTC.IMAGE_SUPPLIER_ID_WRONG_CASE)}); - /** - * Identifies the most recent supplier of the item, who is not necessarily - * its owner or creator. - *

- * For identifying the supplier either a well known and/or registered - * company name or a URL of the company's web site may be used. This - * property succeeds the Provider property of IPTC Core 1.0 by its semantics - * as that Provider was renamed to Credit Line. - *

- * This is a PLUS version 1.2 property included in the IPTC Extension - * schema. - */ - Property IMAGE_SUPPLIER_NAME = Property.internalText( - PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "ImageSupplierName"); + * Identifies the most recent supplier of the item, who is not necessarily its owner or creator. + * + *

For identifying the supplier either a well known and/or registered company name or a URL + * of the company's web site may be used. This property succeeds the Provider property of IPTC + * Core 1.0 by its semantics as that Provider was renamed to Credit Line. + * + *

This is a PLUS version 1.2 property included in the IPTC Extension schema. + */ + Property IMAGE_SUPPLIER_ID = + Property.composite( + Property.internalText( + PREFIX_PLUS + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "ImageSupplierID"), + new Property[] {Property.internalText(IPTC.IMAGE_SUPPLIER_ID_WRONG_CASE)}); + + /** + * Identifies the most recent supplier of the item, who is not necessarily its owner or creator. + * + *

For identifying the supplier either a well known and/or registered company name or a URL + * of the company's web site may be used. This property succeeds the Provider property of IPTC + * Core 1.0 by its semantics as that Provider was renamed to Credit Line. + * + *

This is a PLUS version 1.2 property included in the IPTC Extension schema. + */ + Property IMAGE_SUPPLIER_NAME = + Property.internalText( + PREFIX_PLUS + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "ImageSupplierName"); /** * Optional identifier assigned by the Image Supplier to the image. - *

- * This is a PLUS version 1.2 property included in the IPTC Extension - * schema. + * + *

This is a PLUS version 1.2 property included in the IPTC Extension schema. */ - Property IMAGE_SUPPLIER_IMAGE_ID = Property.internalText( - PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "ImageSupplierImageID"); + Property IMAGE_SUPPLIER_IMAGE_ID = + Property.internalText( + PREFIX_PLUS + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "ImageSupplierImageID"); /** - * The date and optionally time when any of the IPTC photo metadata fields - * has been last edited - *

- * The public use of this property is deprecated by IPTC Extension version - * 1.1. It may only still be used by a private user interface for a use - * scoped to a company. If used this field should be a timestamp of the - * latest change applied to any of the fields. - *

- * The value of this property should never be set by software. XMP-aware - * software should reflect any changes to metadata by the xmp:MetadataDate - * property of the XMP Basic scheme. + * The date and optionally time when any of the IPTC photo metadata fields has been last edited + * + *

The public use of this property is deprecated by IPTC Extension version 1.1. It may only + * still be used by a private user interface for a use scoped to a company. If used this field + * should be a timestamp of the latest change applied to any of the fields. + * + *

The value of this property should never be set by software. XMP-aware software should + * reflect any changes to metadata by the xmp:MetadataDate property of the XMP Basic scheme. */ - Property IPTC_LAST_EDITED = Property.internalDate( - PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "IptcLastEdited"); + Property IPTC_LAST_EDITED = + Property.internalDate( + PREFIX_IPTC_EXT + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "IptcLastEdited"); /** * The location the content of the item was created. - *

- * If the location in the image is different from the location the photo was - * taken the IPTC Extension property Location Shown in the Image should be - * used. + * + *

If the location in the image is different from the location the photo was taken the IPTC + * Extension property Location Shown in the Image should be used. */ - Property LOCATION_CREATED = Property.internalTextBag( - PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "LocationCreated"); + Property LOCATION_CREATED = + Property.internalTextBag( + PREFIX_IPTC_EXT + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "LocationCreated"); /** - * The maximum available height in pixels of the original photo from which - * this photo has been derived by downsizing. + * The maximum available height in pixels of the original photo from which this photo has been + * derived by downsizing. */ - Property MAX_AVAIL_HEIGHT = Property.internalInteger( - PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "MaxAvailHeight"); + Property MAX_AVAIL_HEIGHT = + Property.internalInteger( + PREFIX_IPTC_EXT + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "MaxAvailHeight"); /** - * The maximum available width in pixels of the original photo from which - * this photo has been derived by downsizing. + * The maximum available width in pixels of the original photo from which this photo has been + * derived by downsizing. */ - Property MAX_AVAIL_WIDTH = Property.internalInteger( - PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "MaxAvailWidth"); + Property MAX_AVAIL_WIDTH = + Property.internalInteger( + PREFIX_IPTC_EXT + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "MaxAvailWidth"); /** - * The version number of the PLUS standards in place at the time of the - * transaction. - *

- * This property was included into the IPTC Extension schema from PLUS - * version 1.2 as all other PLUS properties. To reflect this the value of - * "PLUS Version" should be set to the string "1.2.0" + * The version number of the PLUS standards in place at the time of the transaction. + * + *

This property was included into the IPTC Extension schema from PLUS version 1.2 as all + * other PLUS properties. To reflect this the value of "PLUS Version" should be set to the + * string "1.2.0" */ - Property PLUS_VERSION = Property.internalText( - PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Version"); + Property PLUS_VERSION = + Property.internalText( + PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Version"); /** * Owner or owners of the copyright in the licensed image. - *

- * Serves to identify the rights holder/s for the image. The Copyright - * Owner, Image Creator and Licensor may be the same or different entities. - *

- * This is a PLUS version 1.2 property included in the IPTC Extension - * schema. + * + *

Serves to identify the rights holder/s for the image. The Copyright Owner, Image Creator + * and Licensor may be the same or different entities. + * + *

This is a PLUS version 1.2 property included in the IPTC Extension schema. */ - Property COPYRIGHT_OWNER = Property.internalTextBag( - PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "CopyrightOwner"); + Property COPYRIGHT_OWNER = + Property.internalTextBag( + PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "CopyrightOwner"); + /** * @deprecated use {@link IPTC#COPYRIGHT_OWNER_ID} */ @Deprecated String COPYRIGHT_OWNER_ID_WRONG_CASE = PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "CopyrightOwnerId"; + /** * The ID of the owner or owners of the copyright in the licensed image. - *

- * Serves to identify the rights holder/s for the image. The Copyright - * Owner, Image Creator and Licensor may be the same or different entities. - *

- * This is a PLUS version 1.2 property included in the IPTC Extension - * schema. + * + *

Serves to identify the rights holder/s for the image. The Copyright Owner, Image Creator + * and Licensor may be the same or different entities. + * + *

This is a PLUS version 1.2 property included in the IPTC Extension schema. */ - Property COPYRIGHT_OWNER_ID = Property.composite(Property.internalTextBag( - PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "CopyrightOwnerID"), - new Property[]{Property.internalTextBag(IPTC.COPYRIGHT_OWNER_ID_WRONG_CASE)}); + Property COPYRIGHT_OWNER_ID = + Property.composite( + Property.internalTextBag( + PREFIX_PLUS + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "CopyrightOwnerID"), + new Property[] {Property.internalTextBag(IPTC.COPYRIGHT_OWNER_ID_WRONG_CASE)}); + /** * The name of the owner or owners of the copyright in the licensed image. - *

- * Serves to identify the rights holder/s for the image. The Copyright - * Owner, Image Creator and Licensor may be the same or different entities. - *

- * This is a PLUS version 1.2 property included in the IPTC Extension - * schema. + * + *

Serves to identify the rights holder/s for the image. The Copyright Owner, Image Creator + * and Licensor may be the same or different entities. + * + *

This is a PLUS version 1.2 property included in the IPTC Extension schema. */ - Property COPYRIGHT_OWNER_NAME = Property.internalTextBag( - PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "CopyrightOwnerName"); + Property COPYRIGHT_OWNER_NAME = + Property.internalTextBag( + PREFIX_PLUS + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "CopyrightOwnerName"); /** * Creator or creators of the image. - *

- * The creator can be additionally expressed in free-text using the IPTC - * Core Creator field. In many countries, the Image Creator must be - * attributed in association with any use of the image. The Image Creator, - * Copyright Owner, Image Supplier and Licensor may be the same or different - * entities. - *

- * This is a PLUS version 1.2 property included in the IPTC Extension - * schema. - */ - Property IMAGE_CREATOR = Property.internalTextBag( - PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "ImageCreator"); + * + *

The creator can be additionally expressed in free-text using the IPTC Core Creator field. + * In many countries, the Image Creator must be attributed in association with any use of the + * image. The Image Creator, Copyright Owner, Image Supplier and Licensor may be the same or + * different entities. + * + *

This is a PLUS version 1.2 property included in the IPTC Extension schema. + */ + Property IMAGE_CREATOR = + Property.internalTextBag( + PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "ImageCreator"); + /** * @deprecated use {@link IPTC#IMAGE_CREATOR_ID} */ @Deprecated String IMAGE_CREATOR_ID_WRONG_CASE = PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "ImageCreatorId"; + /** * The ID of the creator or creators of the image. - *

- * The creator can be additionally expressed in free-text using the IPTC - * Core Creator field. In many countries, the Image Creator must be - * attributed in association with any use of the image. The Image Creator, - * Copyright Owner, Image Supplier and Licensor may be the same or different - * entities. - *

- * This is a PLUS version 1.2 property included in the IPTC Extension - * schema. - */ - Property IMAGE_CREATOR_ID = Property.composite(Property.internalTextBag( - PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "ImageCreatorID"), - new Property[]{Property.internalTextBag(IPTC.IMAGE_CREATOR_ID_WRONG_CASE)}); + * + *

The creator can be additionally expressed in free-text using the IPTC Core Creator field. + * In many countries, the Image Creator must be attributed in association with any use of the + * image. The Image Creator, Copyright Owner, Image Supplier and Licensor may be the same or + * different entities. + * + *

This is a PLUS version 1.2 property included in the IPTC Extension schema. + */ + Property IMAGE_CREATOR_ID = + Property.composite( + Property.internalTextBag( + PREFIX_PLUS + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "ImageCreatorID"), + new Property[] {Property.internalTextBag(IPTC.IMAGE_CREATOR_ID_WRONG_CASE)}); + /** * The name of the creator or creators of the image. - *

- * The creator can be additionally expressed in free-text using the IPTC - * Core Creator field. In many countries, the Image Creator must be - * attributed in association with any use of the image. The Image Creator, - * Copyright Owner, Image Supplier and Licensor may be the same or different - * entities. - *

- * This is a PLUS version 1.2 property included in the IPTC Extension - * schema. + * + *

The creator can be additionally expressed in free-text using the IPTC Core Creator field. + * In many countries, the Image Creator must be attributed in association with any use of the + * image. The Image Creator, Copyright Owner, Image Supplier and Licensor may be the same or + * different entities. + * + *

This is a PLUS version 1.2 property included in the IPTC Extension schema. */ - Property IMAGE_CREATOR_NAME = Property.internalTextBag( - PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "ImageCreatorName"); + Property IMAGE_CREATOR_NAME = + Property.internalTextBag( + PREFIX_PLUS + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "ImageCreatorName"); /** - * A person or company that should be contacted to obtain a licence for - * using the item or who has licensed the item. - *

- * This is a PLUS version 1.2 property included in the IPTC Extension - * schema. + * A person or company that should be contacted to obtain a licence for using the item or who + * has licensed the item. + * + *

This is a PLUS version 1.2 property included in the IPTC Extension schema. */ - Property LICENSOR = Property.internalTextBag( - PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Licensor"); + Property LICENSOR = + Property.internalTextBag( + PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Licensor"); + /** * @deprecated use {@link IPTC#LICENSOR_ID} */ @Deprecated String LICENSOR_ID_WRONG_CASE = PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "LicensorId"; + /** - * The ID of the person or company that should be contacted to obtain a licence for - * using the item or who has licensed the item. - *

- * This is a PLUS version 1.2 property included in the IPTC Extension - * schema. + * The ID of the person or company that should be contacted to obtain a licence for using the + * item or who has licensed the item. + * + *

This is a PLUS version 1.2 property included in the IPTC Extension schema. */ - Property LICENSOR_ID = Property.composite(Property.internalTextBag( - PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "LicensorID"), - new Property[]{Property.internalTextBag(IPTC.LICENSOR_ID_WRONG_CASE)}); + Property LICENSOR_ID = + Property.composite( + Property.internalTextBag( + PREFIX_PLUS + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "LicensorID"), + new Property[] {Property.internalTextBag(IPTC.LICENSOR_ID_WRONG_CASE)}); + /** - * The name of the person or company that should be contacted to obtain a licence for - * using the item or who has licensed the item. - *

- * This is a PLUS version 1.2 property included in the IPTC Extension - * schema. + * The name of the person or company that should be contacted to obtain a licence for using the + * item or who has licensed the item. + * + *

This is a PLUS version 1.2 property included in the IPTC Extension schema. */ - Property LICENSOR_NAME = Property.internalTextBag( - PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "LicensorName"); + Property LICENSOR_NAME = + Property.internalTextBag( + PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "LicensorName"); /** - * The city of a person or company that should be contacted to obtain a licence for - * using the item or who has licensed the item. - *

- * This is a PLUS version 1.2 property included in the IPTC Extension - * schema. + * The city of a person or company that should be contacted to obtain a licence for using the + * item or who has licensed the item. + * + *

This is a PLUS version 1.2 property included in the IPTC Extension schema. */ - Property LICENSOR_CITY = Property.internalTextBag( - PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "LicensorCity"); + Property LICENSOR_CITY = + Property.internalTextBag( + PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "LicensorCity"); /** - * The country of a person or company that should be contacted to obtain a licence for - * using the item or who has licensed the item. - *

- * This is a PLUS version 1.2 property included in the IPTC Extension - * schema. + * The country of a person or company that should be contacted to obtain a licence for using the + * item or who has licensed the item. + * + *

This is a PLUS version 1.2 property included in the IPTC Extension schema. */ - Property LICENSOR_COUNTRY = Property.internalTextBag( - PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "LicensorCountry"); + Property LICENSOR_COUNTRY = + Property.internalTextBag( + PREFIX_PLUS + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "LicensorCountry"); /** - * The email of a person or company that should be contacted to obtain a licence for - * using the item or who has licensed the item. - *

- * This is a PLUS version 1.2 property included in the IPTC Extension - * schema. + * The email of a person or company that should be contacted to obtain a licence for using the + * item or who has licensed the item. + * + *

This is a PLUS version 1.2 property included in the IPTC Extension schema. */ - Property LICENSOR_EMAIL = Property.internalTextBag( - PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "LicensorEmail"); + Property LICENSOR_EMAIL = + Property.internalTextBag( + PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "LicensorEmail"); /** * The extended address of a person or company that should be contacted to obtain a licence for * using the item or who has licensed the item. - *

- * This is a PLUS version 1.2 property included in the IPTC Extension - * schema. + * + *

This is a PLUS version 1.2 property included in the IPTC Extension schema. */ - Property LICENSOR_EXTENDED_ADDRESS = Property.internalTextBag( - PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + - "LicensorExtendedAddress"); + Property LICENSOR_EXTENDED_ADDRESS = + Property.internalTextBag( + PREFIX_PLUS + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "LicensorExtendedAddress"); /** - * The postal code of a person or company that should be contacted to obtain a licence for - * using the item or who has licensed the item. - *

- * This is a PLUS version 1.2 property included in the IPTC Extension - * schema. + * The postal code of a person or company that should be contacted to obtain a licence for using + * the item or who has licensed the item. + * + *

This is a PLUS version 1.2 property included in the IPTC Extension schema. */ - Property LICENSOR_POSTAL_CODE = Property.internalTextBag( - PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "LicensorPostalCode"); + Property LICENSOR_POSTAL_CODE = + Property.internalTextBag( + PREFIX_PLUS + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "LicensorPostalCode"); /** - * The region of a person or company that should be contacted to obtain a licence for - * using the item or who has licensed the item. - *

- * This is a PLUS version 1.2 property included in the IPTC Extension - * schema. + * The region of a person or company that should be contacted to obtain a licence for using the + * item or who has licensed the item. + * + *

This is a PLUS version 1.2 property included in the IPTC Extension schema. */ - Property LICENSOR_REGION = Property.internalTextBag( - PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "LicensorRegion"); + Property LICENSOR_REGION = + Property.internalTextBag( + PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "LicensorRegion"); /** * The street address of a person or company that should be contacted to obtain a licence for * using the item or who has licensed the item. - *

- * This is a PLUS version 1.2 property included in the IPTC Extension - * schema. + * + *

This is a PLUS version 1.2 property included in the IPTC Extension schema. */ - Property LICENSOR_STREET_ADDRESS = Property.internalTextBag( - PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "LicensorStreetAddress"); + Property LICENSOR_STREET_ADDRESS = + Property.internalTextBag( + PREFIX_PLUS + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "LicensorStreetAddress"); /** * The phone number of a person or company that should be contacted to obtain a licence for * using the item or who has licensed the item. - *

- * This is a PLUS version 1.2 property included in the IPTC Extension - * schema. + * + *

This is a PLUS version 1.2 property included in the IPTC Extension schema. */ - Property LICENSOR_TELEPHONE_1 = Property.internalTextBag( - PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "LicensorTelephone1"); + Property LICENSOR_TELEPHONE_1 = + Property.internalTextBag( + PREFIX_PLUS + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "LicensorTelephone1"); /** * The phone number of a person or company that should be contacted to obtain a licence for * using the item or who has licensed the item. - *

- * This is a PLUS version 1.2 property included in the IPTC Extension - * schema. + * + *

This is a PLUS version 1.2 property included in the IPTC Extension schema. */ - Property LICENSOR_TELEPHONE_2 = Property.internalTextBag( - PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "LicensorTelephone2"); + Property LICENSOR_TELEPHONE_2 = + Property.internalTextBag( + PREFIX_PLUS + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "LicensorTelephone2"); /** - * The URL of a person or company that should be contacted to obtain a licence for - * using the item or who has licensed the item. - *

- * This is a PLUS version 1.2 property included in the IPTC Extension - * schema. + * The URL of a person or company that should be contacted to obtain a licence for using the + * item or who has licensed the item. + * + *

This is a PLUS version 1.2 property included in the IPTC Extension schema. */ - Property LICENSOR_URL = Property.internalTextBag( - PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "LicensorURL"); + Property LICENSOR_URL = + Property.internalTextBag( + PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "LicensorURL"); /** - * Age of the youngest model pictured in the image, at the time that the - * image was made. - *

- * This age should not be displayed to the public on open web portals and - * the like. But it may be used by image repositories in a - * B2B enviroment. - *

- * This is a PLUS version 1.2 property included in the IPTC Extension - * schema. + * Age of the youngest model pictured in the image, at the time that the image was made. + * + *

This age should not be displayed to the public on open web portals and the like. But it + may be used by image repositories in a B2B environment. + + *

This is a PLUS version 1.2 property included in the IPTC Extension schema. */ - Property MINOR_MODEL_AGE_DISCLOSURE = Property.internalText( - PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + - "MinorModelAgeDisclosure"); + Property MINOR_MODEL_AGE_DISCLOSURE = + Property.internalText( + PREFIX_PLUS + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "MinorModelAgeDisclosure"); /** * Optional identifier associated with each Model Release. - *

- * This is a PLUS version 1.2 property included in the IPTC Extension - * schema. + * + *

This is a PLUS version 1.2 property included in the IPTC Extension schema. */ - Property MODEL_RELEASE_ID = Property.internalTextBag( - PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "ModelReleaseID"); + Property MODEL_RELEASE_ID = + Property.internalTextBag( + PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "ModelReleaseID"); /** - * Summarizes the availability and scope of model releases authorizing usage - * of the likenesses of persons appearing in the photograph. - *

- * It is recommended to apply the PLUS controlled value Unlimited Model - * Releases (MR- UMR) very carefully and to check the wording of the model - * release thoroughly before applying it. - *

- * This is a PLUS version 1.2 property included in the IPTC Extension - * schema. + * Summarizes the availability and scope of model releases authorizing usage of the likenesses + * of persons appearing in the photograph. + * + *

It is recommended to apply the PLUS controlled value Unlimited Model Releases (MR-UMR) + very carefully and to check the wording of the model release thoroughly before applying it. + + *

This is a PLUS version 1.2 property included in the IPTC Extension schema. */ - Property MODEL_RELEASE_STATUS = Property.internalText( - PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "ModelReleaseStatus"); + Property MODEL_RELEASE_STATUS = + Property.internalText( + PREFIX_PLUS + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "ModelReleaseStatus"); /** * Optional identifier associated with each Property Release. - *

- * This is a PLUS version 1.2 property included in the IPTC Extension - * schema. + * + *

This is a PLUS version 1.2 property included in the IPTC Extension schema. */ - Property PROPERTY_RELEASE_ID = Property.internalTextBag( - PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "PropertyReleaseID"); + Property PROPERTY_RELEASE_ID = + Property.internalTextBag( + PREFIX_PLUS + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "PropertyReleaseID"); /** - * Summarises the availability and scope of property releases authorizing - * usage of the properties appearing in the photograph. - *

- * It is recommended to apply the value PR-UPR very carefully and to check - * the wording of the property release thoroughly before applying it. - *

- * This is a PLUS version 1.2 property included in the IPTC Extension - * schema. + * Summarises the availability and scope of property releases authorizing usage of the + * properties appearing in the photograph. + * + *

It is recommended to apply the value PR-UPR very carefully and to check the wording of the + * property release thoroughly before applying it. + * + *

This is a PLUS version 1.2 property included in the IPTC Extension schema. */ - Property PROPERTY_RELEASE_STATUS = Property.internalText( - PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "PropertyReleaseStatus"); + Property PROPERTY_RELEASE_STATUS = + Property.internalText( + PREFIX_PLUS + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "PropertyReleaseStatus"); /** - * Contains any necessary copyright notice for claiming the intellectual - * property for artwork or an object in the image and should identify the - * current owner of the copyright of this work with associated intellectual - * property rights. + * Contains any necessary copyright notice for claiming the intellectual property for artwork or + * an object in the image and should identify the current owner of the copyright of this work + * with associated intellectual property rights. */ - Property ARTWORK_OR_OBJECT_DETAIL_COPYRIGHT_NOTICE = Property.internalTextBag( - PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "AOCopyrightNotice"); + Property ARTWORK_OR_OBJECT_DETAIL_COPYRIGHT_NOTICE = + Property.internalTextBag( + PREFIX_IPTC_EXT + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "AOCopyrightNotice"); - /** - * Contains the name of the artist who has created artwork or an object in the image. - */ - Property ARTWORK_OR_OBJECT_DETAIL_CREATOR = Property.internalTextBag( - PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "AOCreator"); + /** Contains the name of the artist who has created artwork or an object in the image. */ + Property ARTWORK_OR_OBJECT_DETAIL_CREATOR = + Property.internalTextBag( + PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "AOCreator"); /** - * Designates the date and optionally the time the artwork or object in the - * image was created. This relates to artwork or objects with associated - * intellectual property rights. 
+ * Designates the date and optionally the time the artwork or object in the image was created. + * This relates to artwork or objects with associated intellectual property rights. */ - Property ARTWORK_OR_OBJECT_DETAIL_DATE_CREATED = Property.internalTextBag( - PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "AODateCreated"); + Property ARTWORK_OR_OBJECT_DETAIL_DATE_CREATED = + Property.internalTextBag( + PREFIX_IPTC_EXT + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "AODateCreated"); /** - * The organisation or body holding and registering the artwork or object in - * the image for inventory purposes. + * The organisation or body holding and registering the artwork or object in the image for + * inventory purposes. */ - Property ARTWORK_OR_OBJECT_DETAIL_SOURCE = Property.internalTextBag( - PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "AOSource"); + Property ARTWORK_OR_OBJECT_DETAIL_SOURCE = + Property.internalTextBag( + PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "AOSource"); /** - * The inventory number issued by the organisation or body holding and - * registering the artwork or object in the image. + * The inventory number issued by the organisation or body holding and registering the artwork + * or object in the image. */ - Property ARTWORK_OR_OBJECT_DETAIL_SOURCE_INVENTORY_NUMBER = Property.internalTextBag( - PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "AOSourceInvNo"); + Property ARTWORK_OR_OBJECT_DETAIL_SOURCE_INVENTORY_NUMBER = + Property.internalTextBag( + PREFIX_IPTC_EXT + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "AOSourceInvNo"); + + /** A reference for the artwork or object in the image. */ + Property ARTWORK_OR_OBJECT_DETAIL_TITLE = + Property.internalTextBag( + PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "AOTitle"); /** - * A reference for the artwork or object in the image. + * Name of the city of a location. 
This element is at the fourth level of a top-down + * geographical hierarchy. */ - Property ARTWORK_OR_OBJECT_DETAIL_TITLE = Property.internalTextBag( - PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "AOTitle"); + Property LOCATION_SHOWN_CITY = + Property.internalTextBag( + PREFIX_IPTC_EXT + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "LocationShownCity"); /** - * Name of the city of a location. This element is at the fourth level of a - * top-down geographical hierarchy. + * The ISO code of a country of a location. This element is at the second level of a top-down + * geographical hierarchy. + * + *

Note 1: an implementer would have to derive from the length of the value string whether + * this is the country code from the two or three letter scheme as no explicit indication can be + * provided. */ - Property LOCATION_SHOWN_CITY = Property.internalTextBag( - PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "LocationShownCity"); + Property LOCATION_SHOWN_COUNTRY_CODE = + Property.internalTextBag( + PREFIX_IPTC_EXT + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "LocationShownCountryCode"); /** - * The ISO code of a country of a location. This element is at the second - * level of a top-down geographical hierarchy. - *

- * Note 1: an implementer would have to derive from the length of the value - * string whether this is the country code from the two or three letter - * scheme as no explicit indication can be provided. + * The name of a country of a location. This element is at the second level of a top-down + * geographical hierarchy. */ - Property LOCATION_SHOWN_COUNTRY_CODE = Property.internalTextBag( - PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + - "LocationShownCountryCode"); + Property LOCATION_SHOWN_COUNTRY_NAME = + Property.internalTextBag( + PREFIX_IPTC_EXT + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "LocationShownCountryName"); /** - * The name of a country of a location. This element is at the second level - * of a top-down geographical hierarchy. + * The name of a subregion of a country - a province or state - of a location. This element is + * at the third level of a top-down geographical hierarchy. */ - Property LOCATION_SHOWN_COUNTRY_NAME = Property.internalTextBag( - PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + - "LocationShownCountryName"); + Property LOCATION_SHOWN_PROVINCE_OR_STATE = + Property.internalTextBag( + PREFIX_IPTC_EXT + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "LocationShownProvinceState"); /** - * The name of a subregion of a country - a province or state - of a - * location. This element is at the third level of a top-down geographical + * Name of a sublocation. This sublocation name could either be the name of a sublocation to a + * city or the name of a well known location or (natural) monument outside a city. In the sense + * of a sublocation to a city this element is at the fifth level of a top-down geographical * hierarchy. 
*/ - Property LOCATION_SHOWN_PROVINCE_OR_STATE = Property.internalTextBag( - PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + - "LocationShownProvinceState"); + Property LOCATION_SHOWN_SUBLOCATION = + Property.internalTextBag( + PREFIX_IPTC_EXT + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "LocationShownSublocation"); /** - * Name of a sublocation. This sublocation name could either be the name of - * a sublocation to a city or the name of a well known location or (natural) - * monument outside a city. In the sense of a sublocation to a city this - * element is at the fifth level of a top-down geographical hierarchy. + * The name of a world region of a location. This element is at the first (topI) level of a top- + * down geographical hierarchy. */ - Property LOCATION_SHOWN_SUBLOCATION = Property.internalTextBag( - PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + - "LocationShownSublocation"); + Property LOCATION_SHOWN_WORLD_REGION = + Property.internalTextBag( + PREFIX_IPTC_EXT + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "LocationShownWorldRegion"); /** - * The name of a world region of a location. This element is at the first - * (topI) level of a top- down geographical hierarchy. + * Name of the city of a location. This element is at the fourth level of a top-down + * geographical hierarchy. */ - Property LOCATION_SHOWN_WORLD_REGION = Property.internalTextBag( - PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + - "LocationShownWorldRegion"); + Property LOCATION_CREATED_CITY = + Property.internalText( + PREFIX_IPTC_EXT + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "LocationCreatedCity"); /** - * Name of the city of a location. This element is at the fourth level of a - * top-down geographical hierarchy. + * The ISO code of a country of a location. This element is at the second level of a top-down + * geographical hierarchy. + * + *

Note 1: an implementer would have to derive from the length of the value string whether + * this is the country code from the two or three letter scheme as no explicit indication can be + * provided. */ - Property LOCATION_CREATED_CITY = Property.internalText( - PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + - "LocationCreatedCity"); + Property LOCATION_CREATED_COUNTRY_CODE = + Property.internalText( + PREFIX_IPTC_EXT + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "LocationCreatedCountryCode"); /** - * The ISO code of a country of a location. This element is at the second - * level of a top-down geographical hierarchy. - *

- * Note 1: an implementer would have to derive from the length of the value - * string whether this is the country code from the two or three letter - * scheme as no explicit indication can be provided. + * The name of a country of a location. This element is at the second level of a top-down + * geographical hierarchy. */ - Property LOCATION_CREATED_COUNTRY_CODE = Property.internalText( - PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + - "LocationCreatedCountryCode"); + Property LOCATION_CREATED_COUNTRY_NAME = + Property.internalText( + PREFIX_IPTC_EXT + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "LocationCreatedCountryName"); /** - * The name of a country of a location. This element is at the second level - * of a top-down geographical hierarchy. + * The name of a subregion of a country - a province or state - of a location. This element is + * at the third level of a top-down geographical hierarchy. */ - Property LOCATION_CREATED_COUNTRY_NAME = Property.internalText( - PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + - "LocationCreatedCountryName"); + Property LOCATION_CREATED_PROVINCE_OR_STATE = + Property.internalText( + PREFIX_IPTC_EXT + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "LocationCreatedProvinceState"); /** - * The name of a subregion of a country - a province or state - of a - * location. This element is at the third level of a top-down geographical + * Name of a sublocation. This sublocation name could either be the name of a sublocation to a + * city or the name of a well known location or (natural) monument outside a city. In the sense + * of a sublocation to a city this element is at the fifth level of a top-down geographical * hierarchy. */ - Property LOCATION_CREATED_PROVINCE_OR_STATE = Property.internalText( - PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + - "LocationCreatedProvinceState"); - - /** - * Name of a sublocation. 
This sublocation name could either be the name of - * a sublocation to a city or the name of a well known location or (natural) - * monument outside a city. In the sense of a sublocation to a city this - * element is at the fifth level of a top-down geographical hierarchy. - */ - Property LOCATION_CREATED_SUBLOCATION = Property.internalText( - PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + - "LocationCreatedSublocation"); - - /** - * The name of a world region of a location. This element is at the first - * (topI) level of a top- down geographical hierarchy. - */ - Property LOCATION_CREATED_WORLD_REGION = Property.internalText( - PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + - "LocationCreatedWorldRegion"); + Property LOCATION_CREATED_SUBLOCATION = + Property.internalText( + PREFIX_IPTC_EXT + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "LocationCreatedSublocation"); /** - * A unique identifier created by a registry and applied by the creator of - * the item. This value shall not be changed after being applied. This - * identifier is linked to a corresponding Registry Organisation Identifier. + * The name of a world region of a location. This element is at the first (topI) level of a top- + * down geographical hierarchy. */ - Property REGISTRY_ENTRY_CREATED_ITEM_ID = Property.internalTextBag( - PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "RegItemId"); + Property LOCATION_CREATED_WORLD_REGION = + Property.internalText( + PREFIX_IPTC_EXT + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "LocationCreatedWorldRegion"); /** - * An identifier for the registry which issued the corresponding Registry Image Id. + * A unique identifier created by a registry and applied by the creator of the item. This value + * shall not be changed after being applied. This identifier is linked to a corresponding + * Registry Organisation Identifier. 
*/ - Property REGISTRY_ENTRY_CREATED_ORGANISATION_ID = Property.internalTextBag( - PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "RegOrgId"); + Property REGISTRY_ENTRY_CREATED_ITEM_ID = + Property.internalTextBag( + PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "RegItemId"); + /** An identifier for the registry which issued the corresponding Registry Image Id. */ + Property REGISTRY_ENTRY_CREATED_ORGANISATION_ID = + Property.internalTextBag( + PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "RegOrgId"); Property[] PROPERTY_GROUP_IPTC_CORE = - new Property[]{CITY, COUNTRY, COUNTRY_CODE, DESCRIPTION, HEADLINE, INTELLECTUAL_GENRE, - KEYWORDS, PROVINCE_OR_STATE, SCENE_CODE, SUBJECT_CODE, SUBLOCATION, - DATE_CREATED, DESCRIPTION_WRITER, INSTRUCTIONS, JOB_ID, TITLE, COPYRIGHT_NOTICE, - CREATOR, CREATORS_JOB_TITLE, CREDIT_LINE, RIGHTS_USAGE_TERMS, SOURCE, - CONTACT_INFO_ADDRESS, CONTACT_INFO_CITY, CONTACT_INFO_COUNTRY, - CONTACT_INFO_EMAIL, CONTACT_INFO_PHONE, CONTACT_INFO_POSTAL_CODE, - CONTACT_INFO_STATE_PROVINCE, CONTACT_INFO_WEB_URL}; + new Property[] { + CITY, + COUNTRY, + COUNTRY_CODE, + DESCRIPTION, + HEADLINE, + INTELLECTUAL_GENRE, + KEYWORDS, + PROVINCE_OR_STATE, + SCENE_CODE, + SUBJECT_CODE, + SUBLOCATION, + DATE_CREATED, + DESCRIPTION_WRITER, + INSTRUCTIONS, + JOB_ID, + TITLE, + COPYRIGHT_NOTICE, + CREATOR, + CREATORS_JOB_TITLE, + CREDIT_LINE, + RIGHTS_USAGE_TERMS, + SOURCE, + CONTACT_INFO_ADDRESS, + CONTACT_INFO_CITY, + CONTACT_INFO_COUNTRY, + CONTACT_INFO_EMAIL, + CONTACT_INFO_PHONE, + CONTACT_INFO_POSTAL_CODE, + CONTACT_INFO_STATE_PROVINCE, + CONTACT_INFO_WEB_URL + }; Property[] PROPERTY_GROUP_IPTC_EXT = - new Property[]{ADDITIONAL_MODEL_INFO, ORGANISATION_CODE, CONTROLLED_VOCABULARY_TERM, - MODEL_AGE, ORGANISATION_NAME, PERSON, DIGITAL_IMAGE_GUID, DIGITAL_SOURCE_TYPE, - EVENT, IMAGE_SUPPLIER_ID, IMAGE_SUPPLIER_NAME, IMAGE_SUPPLIER_IMAGE_ID, - IPTC_LAST_EDITED, MAX_AVAIL_HEIGHT, MAX_AVAIL_WIDTH, 
PLUS_VERSION, - COPYRIGHT_OWNER_ID, COPYRIGHT_OWNER_NAME, IMAGE_CREATOR_ID, IMAGE_CREATOR_NAME, - LICENSOR_ID, LICENSOR_NAME, LICENSOR_CITY, LICENSOR_COUNTRY, LICENSOR_EMAIL, - LICENSOR_EXTENDED_ADDRESS, LICENSOR_POSTAL_CODE, LICENSOR_REGION, - LICENSOR_STREET_ADDRESS, LICENSOR_TELEPHONE_1, LICENSOR_TELEPHONE_2, - LICENSOR_URL, MINOR_MODEL_AGE_DISCLOSURE, MODEL_RELEASE_ID, - MODEL_RELEASE_STATUS, PROPERTY_RELEASE_ID, PROPERTY_RELEASE_STATUS, - ARTWORK_OR_OBJECT_DETAIL_COPYRIGHT_NOTICE, ARTWORK_OR_OBJECT_DETAIL_CREATOR, - ARTWORK_OR_OBJECT_DETAIL_DATE_CREATED, ARTWORK_OR_OBJECT_DETAIL_SOURCE, - ARTWORK_OR_OBJECT_DETAIL_SOURCE_INVENTORY_NUMBER, - ARTWORK_OR_OBJECT_DETAIL_TITLE, LOCATION_SHOWN_CITY, - LOCATION_SHOWN_COUNTRY_CODE, LOCATION_SHOWN_COUNTRY_NAME, - LOCATION_SHOWN_PROVINCE_OR_STATE, LOCATION_SHOWN_SUBLOCATION, - LOCATION_SHOWN_WORLD_REGION, LOCATION_CREATED_CITY, - LOCATION_CREATED_COUNTRY_CODE, LOCATION_CREATED_COUNTRY_NAME, - LOCATION_CREATED_PROVINCE_OR_STATE, LOCATION_CREATED_SUBLOCATION, - LOCATION_CREATED_WORLD_REGION, REGISTRY_ENTRY_CREATED_ITEM_ID, - REGISTRY_ENTRY_CREATED_ORGANISATION_ID}; + new Property[] { + ADDITIONAL_MODEL_INFO, + ORGANISATION_CODE, + CONTROLLED_VOCABULARY_TERM, + MODEL_AGE, + ORGANISATION_NAME, + PERSON, + DIGITAL_IMAGE_GUID, + DIGITAL_SOURCE_TYPE, + EVENT, + IMAGE_SUPPLIER_ID, + IMAGE_SUPPLIER_NAME, + IMAGE_SUPPLIER_IMAGE_ID, + IPTC_LAST_EDITED, + MAX_AVAIL_HEIGHT, + MAX_AVAIL_WIDTH, + PLUS_VERSION, + COPYRIGHT_OWNER_ID, + COPYRIGHT_OWNER_NAME, + IMAGE_CREATOR_ID, + IMAGE_CREATOR_NAME, + LICENSOR_ID, + LICENSOR_NAME, + LICENSOR_CITY, + LICENSOR_COUNTRY, + LICENSOR_EMAIL, + LICENSOR_EXTENDED_ADDRESS, + LICENSOR_POSTAL_CODE, + LICENSOR_REGION, + LICENSOR_STREET_ADDRESS, + LICENSOR_TELEPHONE_1, + LICENSOR_TELEPHONE_2, + LICENSOR_URL, + MINOR_MODEL_AGE_DISCLOSURE, + MODEL_RELEASE_ID, + MODEL_RELEASE_STATUS, + PROPERTY_RELEASE_ID, + PROPERTY_RELEASE_STATUS, + ARTWORK_OR_OBJECT_DETAIL_COPYRIGHT_NOTICE, + 
ARTWORK_OR_OBJECT_DETAIL_CREATOR, + ARTWORK_OR_OBJECT_DETAIL_DATE_CREATED, + ARTWORK_OR_OBJECT_DETAIL_SOURCE, + ARTWORK_OR_OBJECT_DETAIL_SOURCE_INVENTORY_NUMBER, + ARTWORK_OR_OBJECT_DETAIL_TITLE, + LOCATION_SHOWN_CITY, + LOCATION_SHOWN_COUNTRY_CODE, + LOCATION_SHOWN_COUNTRY_NAME, + LOCATION_SHOWN_PROVINCE_OR_STATE, + LOCATION_SHOWN_SUBLOCATION, + LOCATION_SHOWN_WORLD_REGION, + LOCATION_CREATED_CITY, + LOCATION_CREATED_COUNTRY_CODE, + LOCATION_CREATED_COUNTRY_NAME, + LOCATION_CREATED_PROVINCE_OR_STATE, + LOCATION_CREATED_SUBLOCATION, + LOCATION_CREATED_WORLD_REGION, + REGISTRY_ENTRY_CREATED_ITEM_ID, + REGISTRY_ENTRY_CREATED_ORGANISATION_ID + }; } diff --git a/tika-core/src/main/java/org/apache/tika/metadata/MachineMetadata.java b/tika-core/src/main/java/org/apache/tika/metadata/MachineMetadata.java index 44faa14bc7..26033a86c5 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/MachineMetadata.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/MachineMetadata.java @@ -16,10 +16,7 @@ */ package org.apache.tika.metadata; -/** - * Metadata for describing machines, such as their - * architecture, type and endian-ness - */ +/** Metadata for describing machines, such as their architecture, type and endian-ness */ public interface MachineMetadata { String PREFIX = "machine:"; @@ -40,9 +37,19 @@ public interface MachineMetadata { String PLATFORM_WINDOWS = "Windows"; Property PLATFORM = - Property.internalClosedChoise(PREFIX + "platform", PLATFORM_SYSV, PLATFORM_HPUX, - PLATFORM_NETBSD, PLATFORM_LINUX, PLATFORM_SOLARIS, PLATFORM_AIX, PLATFORM_IRIX, - PLATFORM_FREEBSD, PLATFORM_TRU64, PLATFORM_ARM, PLATFORM_EMBEDDED, + Property.internalClosedChoise( + PREFIX + "platform", + PLATFORM_SYSV, + PLATFORM_HPUX, + PLATFORM_NETBSD, + PLATFORM_LINUX, + PLATFORM_SOLARIS, + PLATFORM_AIX, + PLATFORM_IRIX, + PLATFORM_FREEBSD, + PLATFORM_TRU64, + PLATFORM_ARM, + PLATFORM_EMBEDDED, PLATFORM_WINDOWS); String MACHINE_x86_32 = "x86-32"; @@ -66,10 +73,26 @@ public interface 
MachineMetadata { String MACHINE_UNKNOWN = "Unknown"; Property MACHINE_TYPE = - Property.internalClosedChoise(PREFIX + "machineType", MACHINE_x86_32, MACHINE_x86_64, - MACHINE_IA_64, MACHINE_SPARC, MACHINE_M68K, MACHINE_M88K, MACHINE_MIPS, - MACHINE_PPC, MACHINE_S370, MACHINE_S390, MACHINE_ARM, MACHINE_VAX, - MACHINE_ALPHA, MACHINE_EFI, MACHINE_M32R, MACHINE_SH3, MACHINE_SH4, MACHINE_SH5, + Property.internalClosedChoise( + PREFIX + "machineType", + MACHINE_x86_32, + MACHINE_x86_64, + MACHINE_IA_64, + MACHINE_SPARC, + MACHINE_M68K, + MACHINE_M88K, + MACHINE_MIPS, + MACHINE_PPC, + MACHINE_S370, + MACHINE_S390, + MACHINE_ARM, + MACHINE_VAX, + MACHINE_ALPHA, + MACHINE_EFI, + MACHINE_M32R, + MACHINE_SH3, + MACHINE_SH4, + MACHINE_SH5, MACHINE_UNKNOWN); Property ENDIAN = Property.internalClosedChoise(PREFIX + "endian", Endian.LITTLE.name, Endian.BIG.name); diff --git a/tika-core/src/main/java/org/apache/tika/metadata/Message.java b/tika-core/src/main/java/org/apache/tika/metadata/Message.java index fcb1421f3c..d0d6790cd3 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/Message.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/Message.java @@ -18,8 +18,8 @@ /** * A collection of Message related property names. - *

- * See also {@link Office}'s MAPI-specific properties. + * + *

See also {@link Office}'s MAPI-specific properties. */ public interface Message { String MESSAGE_PREFIX = "Message" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER; @@ -42,51 +42,51 @@ public interface Message { String MULTIPART_BOUNDARY = "Multipart-Boundary"; /** - * Where possible, this records the value from the name field. - * Even in MAPI messages, though, this can be an email address. + * Where possible, this records the value from the name field. Even in MAPI messages, though, + * this can be an email address. */ Property MESSAGE_FROM_NAME = Property.internalTextBag(MESSAGE_PREFIX + "From-Name"); /** - * Where possible, this records the value from the name field. - * Even in MAPI messages, though, this can be a name. - *

- * Note that the value may also be an X400/x500 Exchange format: - * /o=ExchangeLabs/ou=Exchange Administrative Group/cn=Recipients/cn=someone.or.other + * Where possible, this records the value from the name field. Even in MAPI messages, though, + * this can be a name. + * + *

Note that the value may also be an X400/x500 Exchange format: /o=ExchangeLabs/ou=Exchange + * Administrative Group/cn=Recipients/cn=someone.or.other */ Property MESSAGE_FROM_EMAIL = Property.internalTextBag(MESSAGE_PREFIX + "From-Email"); /** - * In Outlook messages, there are sometimes separate fields for "to-name" and - * "to-display-name" name. + * In Outlook messages, there are sometimes separate fields for "to-name" and "to-display-name" + * name. */ Property MESSAGE_TO_NAME = Property.internalTextBag(MESSAGE_PREFIX + "To-Name"); Property MESSAGE_TO_DISPLAY_NAME = Property.internalTextBag(MESSAGE_PREFIX + "To-Display-Name"); /** - * Where possible, this records the email value in the to field. - * Even in MAPI messages, though, this can be a name. - *

- * Note that the value may also be an X400/x500 Exchange format: - * /o=ExchangeLabs/ou=Exchange Administrative Group/cn=Recipients/cn=someone.or.other + * Where possible, this records the email value in the to field. Even in MAPI messages, though, + * this can be a name. + * + *

Note that the value may also be an X400/x500 Exchange format: /o=ExchangeLabs/ou=Exchange + * Administrative Group/cn=Recipients/cn=someone.or.other */ Property MESSAGE_TO_EMAIL = Property.internalTextBag(MESSAGE_PREFIX + "To-Email"); /** - * In Outlook messages, there are sometimes separate fields for "cc-name" and - * "cc-display-name" name. + * In Outlook messages, there are sometimes separate fields for "cc-name" and "cc-display-name" + * name. */ Property MESSAGE_CC_NAME = Property.internalTextBag(MESSAGE_PREFIX + "CC-Name"); Property MESSAGE_CC_DISPLAY_NAME = Property.internalTextBag(MESSAGE_PREFIX + "CC-Display-Name"); /** - * Where possible, this records the email value in the cc field. - * Even in MAPI messages, though, this can be a name. - *

- * Note that the value may also be an X400/x500 Exchange format: - * /o=ExchangeLabs/ou=Exchange Administrative Group/cn=Recipients/cn=someone.or.other + * Where possible, this records the email value in the cc field. Even in MAPI messages, though, + * this can be a name. + * + *

Note that the value may also be an X400/x500 Exchange format: /o=ExchangeLabs/ou=Exchange + * Administrative Group/cn=Recipients/cn=someone.or.other */ Property MESSAGE_CC_EMAIL = Property.internalTextBag(MESSAGE_PREFIX + "CC-Email"); @@ -100,12 +100,11 @@ public interface Message { Property.internalTextBag(MESSAGE_PREFIX + "BCC-Display-Name"); /** - * Where possible, this records the email value in the bcc field. - * Even in MAPI messages, though, this can be a name. - *

- * Note that the value may also be an X400/x500 Exchange format: - * /o=ExchangeLabs/ou=Exchange Administrative Group/cn=Recipients/cn=someone.or.other + * Where possible, this records the email value in the bcc field. Even in MAPI messages, though, + * this can be a name. + * + *

Note that the value may also be an X400/x500 Exchange format: /o=ExchangeLabs/ou=Exchange + * Administrative Group/cn=Recipients/cn=someone.or.other */ Property MESSAGE_BCC_EMAIL = Property.internalTextBag(MESSAGE_PREFIX + "BCC-Email"); - } diff --git a/tika-core/src/main/java/org/apache/tika/metadata/Metadata.java b/tika-core/src/main/java/org/apache/tika/metadata/Metadata.java index 9b8e3b86cd..70c016261e 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/Metadata.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/Metadata.java @@ -33,75 +33,74 @@ import java.util.Objects; import java.util.Properties; import java.util.TimeZone; - import org.apache.tika.metadata.Property.PropertyType; import org.apache.tika.metadata.writefilter.MetadataWriteFilter; import org.apache.tika.utils.DateUtils; -/** - * A multi-valued metadata container. - */ +/** A multi-valued metadata container. */ public class Metadata - implements CreativeCommons, Geographic, HttpHeaders, Message, ClimateForcast, TIFF, - TikaMimeKeys, Serializable { - - - private static final MetadataWriteFilter ACCEPT_ALL = new MetadataWriteFilter() { - @Override - public void filterExisting(Map data) { - //no-op - } + implements CreativeCommons, + Geographic, + HttpHeaders, + Message, + ClimateForcast, + TIFF, + TikaMimeKeys, + Serializable { + + private static final MetadataWriteFilter ACCEPT_ALL = + new MetadataWriteFilter() { + @Override + public void filterExisting(Map data) { + // no-op + } - @Override - public void add(String field, String value, Map data) { - String[] values = data.get(field); - if (values == null) { - set(field, value, data); - } else { - data.put(field, appendValues(values, value)); - } - } + @Override + public void add(String field, String value, Map data) { + String[] values = data.get(field); + if (values == null) { + set(field, value, data); + } else { + data.put(field, appendValues(values, value)); + } + } - //legacy behavior -- remove the field if value is null - 
@Override - public void set(String field, String value, Map data) { - if (value != null) { - data.put(field, new String[]{ value }); - } else { - data.remove(field); - } - } + // legacy behavior -- remove the field if value is null + @Override + public void set(String field, String value, Map data) { + if (value != null) { + data.put(field, new String[] {value}); + } else { + data.remove(field); + } + } - private String[] appendValues(String[] values, final String value) { - if (value == null) { - return values; - } - String[] newValues = new String[values.length + 1]; - System.arraycopy(values, 0, newValues, 0, values.length); - newValues[newValues.length - 1] = value; - return newValues; - } - }; + private String[] appendValues(String[] values, final String value) { + if (value == null) { + return values; + } + String[] newValues = new String[values.length + 1]; + System.arraycopy(values, 0, newValues, 0, values.length); + newValues[newValues.length - 1] = value; + return newValues; + } + }; - /** - * Serial version UID - */ + /** Serial version UID */ private static final long serialVersionUID = 5623926545693153182L; + /** - * Some parsers will have the date as a ISO-8601 string - * already, and will set that into the Metadata object. + * Some parsers will have the date as a ISO-8601 string already, and will set that into the + * Metadata object. */ private static final DateUtils DATE_UTILS = new DateUtils(); - /** - * A map of all metadata attributes. - */ - private Map metadata = null; + /** A map of all metadata attributes. */ + private Map metadata = null; private MetadataWriteFilter writeFilter = ACCEPT_ALL; - /** - * Constructs a new, empty metadata. - */ + + /** Constructs a new, empty metadata. */ public Metadata() { metadata = new HashMap<>(); } @@ -115,8 +114,8 @@ private static DateFormat createDateFormat(String format, TimeZone timezone) { } /** - * Parses the given date string. 
This method is synchronized to prevent - * concurrent access to the thread-unsafe date formats. + * Parses the given date string. This method is synchronized to prevent concurrent access to the + * thread-unsafe date formats. * * @param date date string * @return parsed date, or null if the date can't be parsed @@ -133,8 +132,8 @@ private static synchronized Date parseDate(String date) { * @return true is named value is multivalued, false if single value or null */ public boolean isMultiValued(final Property property) { - return metadata.get(property.getName()) != null && - metadata.get(property.getName()).length > 1; + return metadata.get(property.getName()) != null + && metadata.get(property.getName()).length > 1; } /** @@ -157,8 +156,8 @@ public String[] names() { } /** - * Get the value associated to a metadata name. If many values are assiociated - * to the specified name, then the first one is returned. + * Get the value associated to a metadata name. If many values are assiociated to the specified + * name, then the first one is returned. * * @param name of the metadata. * @return the value associated to the specified metadata name. @@ -173,13 +172,12 @@ public String get(final String name) { } /** - * Sets the writeFilter that is called before {@link #set(String, String)} - * {@link #set(String, String[])}, {@link #add(String, String)}, - * {@link #add(String, String[])}. The default is {@link #ACCEPT_ALL}. + * Sets the writeFilter that is called before {@link #set(String, String)} {@link #set(String, + * String[])}, {@link #add(String, String)}, {@link #add(String, String[])}. The default is + * {@link #ACCEPT_ALL}. * - * This is intended for expert use only. Some parsers rely on metadata - * during the parse, and if the metadata they need is excluded, they - * will not function properly. + *

This is intended for expert use only. Some parsers rely on metadata during the parse, and + * if the metadata they need is excluded, they will not function properly. * * @param writeFilter * @since 2.4.0 @@ -204,8 +202,8 @@ public String get(Property property) { * Returns the value of the identified Integer based metadata property. * * @param property simple integer property definition - * @return property value as a Integer, or null if the property is not set, or - * not a valid Integer + * @return property value as a Integer, or null if the property is not set, or not + * a valid Integer * @since Apache Tika 0.8 */ public Integer getInt(Property property) { @@ -231,8 +229,8 @@ public Integer getInt(Property property) { * Returns the value of the identified Date based metadata property. * * @param property simple date property definition - * @return property value as a Date, or null if the property is not set, or not - * a valid Date + * @return property value as a Date, or null if the property is not set, or not a + * valid Date * @since Apache Tika 0.8 */ public Date getDate(Property property) { @@ -280,10 +278,10 @@ private String[] _getValues(final String name) { } /** - * Add a metadata name/value mapping. Add the specified value to the list of - * values associated to the specified metadata name. + * Add a metadata name/value mapping. Add the specified value to the list of values associated + * to the specified metadata name. * - * @param name the metadata name. + * @param name the metadata name. * @param value the metadata value. */ public void add(final String name, final String value) { @@ -291,10 +289,10 @@ public void add(final String name, final String value) { } /** - * Add a metadata name/value mapping. Add the specified value to the list of - * values associated to the specified metadata name. + * Add a metadata name/value mapping. Add the specified value to the list of values associated + * to the specified metadata name. 
* - * @param name the metadata name. + * @param name the metadata name. * @param newValues the metadata values */ protected void add(final String name, final String[] newValues) { @@ -309,11 +307,11 @@ protected void add(final String name, final String[] newValues) { } /** - * Add a metadata property/value mapping. Add the specified value to the list of - * values associated to the specified metadata property. + * Add a metadata property/value mapping. Add the specified value to the list of values + * associated to the specified metadata property. * * @param property the metadata property. - * @param value the metadata value. + * @param value the metadata value. */ public void add(final Property property, final String value) { @@ -353,17 +351,17 @@ public void setAll(Properties properties) { Enumeration names = (Enumeration) properties.propertyNames(); while (names.hasMoreElements()) { String name = names.nextElement(); - metadata.put(name, new String[]{properties.getProperty(name)}); + metadata.put(name, new String[] {properties.getProperty(name)}); } } /** - * Set metadata name/value. Associate the specified value to the specified - * metadata name. If some previous values were associated to this name, - * they are removed. If the given value is null, then the - * metadata entry is removed. + * Set metadata name/value. Associate the specified value to the specified metadata name. If + * some previous values were associated to this name, they are removed. If the given value is + * + * null, then the metadata entry is removed. * - * @param name the metadata name. + * @param name the metadata name. 
* @param value the metadata value, or null */ public void set(String name, String value) { @@ -371,8 +369,8 @@ public void set(String name, String value) { } protected void set(String name, String[] values) { - //TODO: optimize this to not copy if all - //values are to be included "as is" + // TODO: optimize this to not copy if all + // values are to be included "as is" if (values != null) { metadata.remove(name); for (String v : values) { @@ -387,7 +385,7 @@ protected void set(String name, String[] values) { * Sets the value of the identified metadata property. * * @param property property definition - * @param value property value + * @param value property value * @since Apache Tika 0.7 */ public void set(Property property, String value) { @@ -410,7 +408,7 @@ public void set(Property property, String value) { * Sets the values of the identified metadata property. * * @param property property definition - * @param values property values + * @param values property values * @since Apache Tika 1.2 */ public void set(Property property, String[] values) { @@ -433,17 +431,17 @@ public void set(Property property, String[] values) { * Sets the integer value of the identified metadata property. 
* * @param property simple integer property definition - * @param value property value + * @param value property value * @since Apache Tika 0.8 */ public void set(Property property, int value) { if (property.getPrimaryProperty().getPropertyType() != Property.PropertyType.SIMPLE) { - throw new PropertyTypeException(Property.PropertyType.SIMPLE, - property.getPrimaryProperty().getPropertyType()); + throw new PropertyTypeException( + Property.PropertyType.SIMPLE, property.getPrimaryProperty().getPropertyType()); } if (property.getPrimaryProperty().getValueType() != Property.ValueType.INTEGER) { - throw new PropertyTypeException(Property.ValueType.INTEGER, - property.getPrimaryProperty().getValueType()); + throw new PropertyTypeException( + Property.ValueType.INTEGER, property.getPrimaryProperty().getValueType()); } set(property, Integer.toString(value)); } @@ -452,35 +450,36 @@ public void set(Property property, int value) { * Sets the integer value of the identified metadata property. * * @param property simple integer property definition - * @param value property value + * @param value property value * @since Apache Tika 0.8 */ public void set(Property property, long value) { if (property.getPrimaryProperty().getPropertyType() != Property.PropertyType.SIMPLE) { - throw new PropertyTypeException(Property.PropertyType.SIMPLE, - property.getPrimaryProperty().getPropertyType()); + throw new PropertyTypeException( + Property.PropertyType.SIMPLE, property.getPrimaryProperty().getPropertyType()); } if (property.getPrimaryProperty().getValueType() != Property.ValueType.REAL) { - throw new PropertyTypeException(Property.ValueType.REAL, - property.getPrimaryProperty().getValueType()); + throw new PropertyTypeException( + Property.ValueType.REAL, property.getPrimaryProperty().getValueType()); } set(property, Long.toString(value)); } + /** * Sets the integer value of the identified metadata property. 
* * @param property simple integer property definition - * @param value property value + * @param value property value * @since Apache Tika 2.1.1 */ public void set(Property property, boolean value) { if (property.getPrimaryProperty().getPropertyType() != Property.PropertyType.SIMPLE) { - throw new PropertyTypeException(Property.PropertyType.SIMPLE, - property.getPrimaryProperty().getPropertyType()); + throw new PropertyTypeException( + Property.PropertyType.SIMPLE, property.getPrimaryProperty().getPropertyType()); } if (property.getPrimaryProperty().getValueType() != Property.ValueType.BOOLEAN) { - throw new PropertyTypeException(Property.ValueType.BOOLEAN, - property.getPrimaryProperty().getValueType()); + throw new PropertyTypeException( + Property.ValueType.BOOLEAN, property.getPrimaryProperty().getValueType()); } set(property, Boolean.toString(value)); } @@ -489,17 +488,17 @@ public void set(Property property, boolean value) { * Adds the integer value of the identified metadata property. 
* * @param property seq integer property definition - * @param value property value + * @param value property value * @since Apache Tika 1.21 */ public void add(Property property, int value) { if (property.getPrimaryProperty().getPropertyType() != PropertyType.SEQ) { - throw new PropertyTypeException(PropertyType.SEQ, - property.getPrimaryProperty().getPropertyType()); + throw new PropertyTypeException( + PropertyType.SEQ, property.getPrimaryProperty().getPropertyType()); } if (property.getPrimaryProperty().getValueType() != Property.ValueType.INTEGER) { - throw new PropertyTypeException(Property.ValueType.INTEGER, - property.getPrimaryProperty().getValueType()); + throw new PropertyTypeException( + Property.ValueType.INTEGER, property.getPrimaryProperty().getValueType()); } add(property, Integer.toString(value)); } @@ -513,12 +512,12 @@ public void add(Property property, int value) { */ public int[] getIntValues(Property property) { if (property.getPrimaryProperty().getPropertyType() != PropertyType.SEQ) { - throw new PropertyTypeException(PropertyType.SEQ, - property.getPrimaryProperty().getPropertyType()); + throw new PropertyTypeException( + PropertyType.SEQ, property.getPrimaryProperty().getPropertyType()); } if (property.getPrimaryProperty().getValueType() != Property.ValueType.INTEGER) { - throw new PropertyTypeException(Property.ValueType.INTEGER, - property.getPrimaryProperty().getValueType()); + throw new PropertyTypeException( + Property.ValueType.INTEGER, property.getPrimaryProperty().getValueType()); } String[] vals = getValues(property); int[] ret = new int[vals.length]; @@ -537,12 +536,12 @@ public int[] getIntValues(Property property) { */ public long[] getLongValues(Property property) { if (property.getPrimaryProperty().getPropertyType() != PropertyType.SEQ) { - throw new PropertyTypeException(PropertyType.SEQ, - property.getPrimaryProperty().getPropertyType()); + throw new PropertyTypeException( + PropertyType.SEQ, 
property.getPrimaryProperty().getPropertyType()); } if (property.getPrimaryProperty().getValueType() != Property.ValueType.REAL) { - throw new PropertyTypeException(Property.ValueType.REAL, - property.getPrimaryProperty().getValueType()); + throw new PropertyTypeException( + Property.ValueType.REAL, property.getPrimaryProperty().getValueType()); } String[] vals = getValues(property); long[] ret = new long[vals.length]; @@ -556,14 +555,14 @@ public long[] getLongValues(Property property) { * Sets the real or rational value of the identified metadata property. * * @param property simple real or simple rational property definition - * @param value property value + * @param value property value * @since Apache Tika 0.8 */ public void set(Property property, double value) { - if (property.getPrimaryProperty().getValueType() != Property.ValueType.REAL && - property.getPrimaryProperty().getValueType() != Property.ValueType.RATIONAL) { - throw new PropertyTypeException(Property.ValueType.REAL, - property.getPrimaryProperty().getValueType()); + if (property.getPrimaryProperty().getValueType() != Property.ValueType.REAL + && property.getPrimaryProperty().getValueType() != Property.ValueType.RATIONAL) { + throw new PropertyTypeException( + Property.ValueType.REAL, property.getPrimaryProperty().getValueType()); } set(property, Double.toString(value)); } @@ -572,17 +571,17 @@ public void set(Property property, double value) { * Sets the date value of the identified metadata property. 
* * @param property simple integer property definition - * @param date property value + * @param date property value * @since Apache Tika 0.8 */ public void set(Property property, Date date) { if (property.getPrimaryProperty().getPropertyType() != Property.PropertyType.SIMPLE) { - throw new PropertyTypeException(Property.PropertyType.SIMPLE, - property.getPrimaryProperty().getPropertyType()); + throw new PropertyTypeException( + Property.PropertyType.SIMPLE, property.getPrimaryProperty().getPropertyType()); } if (property.getPrimaryProperty().getValueType() != Property.ValueType.DATE) { - throw new PropertyTypeException(Property.ValueType.DATE, - property.getPrimaryProperty().getValueType()); + throw new PropertyTypeException( + Property.ValueType.DATE, property.getPrimaryProperty().getValueType()); } String dateString = null; if (date != null) { @@ -595,17 +594,17 @@ public void set(Property property, Date date) { * Sets the date value of the identified metadata property. * * @param property simple integer property definition - * @param date property value + * @param date property value * @since Apache Tika 0.8 */ public void set(Property property, Calendar date) { if (property.getPrimaryProperty().getPropertyType() != Property.PropertyType.SIMPLE) { - throw new PropertyTypeException(Property.PropertyType.SIMPLE, - property.getPrimaryProperty().getPropertyType()); + throw new PropertyTypeException( + Property.PropertyType.SIMPLE, property.getPrimaryProperty().getPropertyType()); } if (property.getPrimaryProperty().getValueType() != Property.ValueType.DATE) { - throw new PropertyTypeException(Property.ValueType.DATE, - property.getPrimaryProperty().getValueType()); + throw new PropertyTypeException( + Property.ValueType.DATE, property.getPrimaryProperty().getValueType()); } String dateString = null; if (date != null) { @@ -618,13 +617,13 @@ public void set(Property property, Calendar date) { * Adds the date value of the identified metadata property. 
* * @param property simple calendar property definition - * @param date property value + * @param date property value * @since Apache Tika 2.5.0 */ public void add(Property property, Calendar date) { if (property.getPrimaryProperty().getValueType() != Property.ValueType.DATE) { - throw new PropertyTypeException(Property.ValueType.DATE, - property.getPrimaryProperty().getValueType()); + throw new PropertyTypeException( + Property.ValueType.DATE, property.getPrimaryProperty().getValueType()); } String dateString = null; if (date != null) { diff --git a/tika-core/src/main/java/org/apache/tika/metadata/Office.java b/tika-core/src/main/java/org/apache/tika/metadata/Office.java index 2a9e428eb0..bbfbc6eb82 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/Office.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/Office.java @@ -17,11 +17,9 @@ package org.apache.tika.metadata; /** - * Office Document properties collection. These properties apply to - * Office / Productivity Documents of all forms, including (but not limited - * to) MS Office and OpenDocument formats. - * This is a logical collection of properties, which may be drawn from a - * few different external definitions. + * Office Document properties collection. These properties apply to Office / Productivity Documents + * of all forms, including (but not limited to) MS Office and OpenDocument formats. This is a + * logical collection of properties, which may be drawn from a few different external definitions. * * @since Apache Tika 1.2 */ @@ -31,157 +29,190 @@ public interface Office { String PREFIX_DOC_META = "meta"; /** - * For user defined metadata entries in the document, - * what prefix should be attached to the key names. - * eg Text1 becomes custom:Info1=Text1 + * For user defined metadata entries in the document, what prefix should be attached to the key + * names. 
eg Text1 becomes + * custom:Info1=Text1 */ String USER_DEFINED_METADATA_NAME_PREFIX = "custom:"; - - /** - * Keywords pertaining to a document. Also populates {@link DublinCore#SUBJECT}. - */ - Property KEYWORDS = Property.composite(Property.internalTextBag( - PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "keyword"), - new Property[]{DublinCore.SUBJECT,}); - - /** - * Name of the initial creator/author of a document - */ - Property INITIAL_AUTHOR = Property.internalText( - PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "initial-author"); - - /** - * Name of the last (most recent) author of a document - */ - Property LAST_AUTHOR = Property.internalText( - PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "last-author"); - - /** - * Name of the principal author(s) of a document - */ - Property AUTHOR = Property.internalTextBag( - PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "author"); - - - /** - * When was the document created? - */ - Property CREATION_DATE = Property.internalDate( - PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "creation-date"); - - /** - * When was the document last saved? - */ - Property SAVE_DATE = Property.internalDate( - PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "save-date"); - - /** - * When was the document last printed? 
- */ - Property PRINT_DATE = Property.internalDate( - PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "print-date"); - - - /** - * The number of Slides are there in the (presentation) document - */ - Property SLIDE_COUNT = Property.internalInteger( - PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "slide-count"); - - /** - * The number of Pages are there in the (paged) document - */ - Property PAGE_COUNT = Property.internalInteger( - PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "page-count"); - - /** - * The number of individual Paragraphs in the document - */ - Property PARAGRAPH_COUNT = Property.internalInteger( - PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "paragraph-count"); - - /** - * The number of lines in the document - */ - Property LINE_COUNT = Property.internalInteger( - PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "line-count"); - - /** - * The number of Words in the document - */ - Property WORD_COUNT = Property.internalInteger( - PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "word-count"); - - /** - * The number of Characters in the document - */ - Property CHARACTER_COUNT = Property.internalInteger( - PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "character-count"); - - /** - * The number of Characters in the document, including spaces - */ - Property CHARACTER_COUNT_WITH_SPACES = Property.internalInteger( - PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + - "character-count-with-spaces"); - - /** - * The number of Tables in the document - */ - Property TABLE_COUNT = Property.internalInteger( - PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "table-count"); - - /** - * The number of Images in the document - */ - Property IMAGE_COUNT = Property.internalInteger( - PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "image-count"); - - /** - * The number of Objects in 
the document. These are typically non-Image resources - * embedded in the document, such as other documents or non-Image media. - */ - Property OBJECT_COUNT = Property.internalInteger( - PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "object-count"); - - /** - * MAPI message class. What type of .msg/MAPI file is it? - */ - Property MAPI_MESSAGE_CLASS = Property.internalClosedChoise( - PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "mapi-message-class", - "APPOINTMENT", "CONTACT", "NOTE", "STICKY_NOTE", "POST", "TASK", "UNKNOWN", - "UNSPECIFIED"); - - Property MAPI_SENT_BY_SERVER_TYPE = Property.internalText( - PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + - "mapi-sent-by-server-type"); - - Property MAPI_FROM_REPRESENTING_NAME = Property.internalText( - PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + - "mapi-from-representing-name"); - - Property MAPI_FROM_REPRESENTING_EMAIL = Property.internalText( - PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + - "mapi-from-representing-email"); - - Property MAPI_MESSAGE_CLIENT_SUBMIT_TIME = Property.internalDate( - PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + - "mapi-msg-client-submit-time"); - - /** - * Embedded files may have a "progID" associated with them, such as - * Word.Document.12 or AcroExch.Document.DC + /** Keywords pertaining to a document. Also populates {@link DublinCore#SUBJECT}. 
*/ + Property KEYWORDS = + Property.composite( + Property.internalTextBag( + PREFIX_DOC_META + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "keyword"), + new Property[] { + DublinCore.SUBJECT, + }); + + /** Name of the initial creator/author of a document */ + Property INITIAL_AUTHOR = + Property.internalText( + PREFIX_DOC_META + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "initial-author"); + + /** Name of the last (most recent) author of a document */ + Property LAST_AUTHOR = + Property.internalText( + PREFIX_DOC_META + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "last-author"); + + /** Name of the principal author(s) of a document */ + Property AUTHOR = + Property.internalTextBag( + PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "author"); + + /** When was the document created? */ + Property CREATION_DATE = + Property.internalDate( + PREFIX_DOC_META + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "creation-date"); + + /** When was the document last saved? */ + Property SAVE_DATE = + Property.internalDate( + PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "save-date"); + + /** When was the document last printed? 
*/ + Property PRINT_DATE = + Property.internalDate( + PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "print-date"); + + /** The number of Slides are there in the (presentation) document */ + Property SLIDE_COUNT = + Property.internalInteger( + PREFIX_DOC_META + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "slide-count"); + + /** The number of Pages are there in the (paged) document */ + Property PAGE_COUNT = + Property.internalInteger( + PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "page-count"); + + /** The number of individual Paragraphs in the document */ + Property PARAGRAPH_COUNT = + Property.internalInteger( + PREFIX_DOC_META + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "paragraph-count"); + + /** The number of lines in the document */ + Property LINE_COUNT = + Property.internalInteger( + PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "line-count"); + + /** The number of Words in the document */ + Property WORD_COUNT = + Property.internalInteger( + PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "word-count"); + + /** The number of Characters in the document */ + Property CHARACTER_COUNT = + Property.internalInteger( + PREFIX_DOC_META + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "character-count"); + + /** The number of Characters in the document, including spaces */ + Property CHARACTER_COUNT_WITH_SPACES = + Property.internalInteger( + PREFIX_DOC_META + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "character-count-with-spaces"); + + /** The number of Tables in the document */ + Property TABLE_COUNT = + Property.internalInteger( + PREFIX_DOC_META + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "table-count"); + + /** The number of Images in the document */ + Property IMAGE_COUNT = + Property.internalInteger( + PREFIX_DOC_META + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "image-count"); + + /** + * The number of Objects in the document. 
These are typically non-Image resources embedded in + * the document, such as other documents or non-Image media. + */ + Property OBJECT_COUNT = + Property.internalInteger( + PREFIX_DOC_META + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "object-count"); + + /** MAPI message class. What type of .msg/MAPI file is it? */ + Property MAPI_MESSAGE_CLASS = + Property.internalClosedChoise( + PREFIX_DOC_META + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "mapi-message-class", + "APPOINTMENT", + "CONTACT", + "NOTE", + "STICKY_NOTE", + "POST", + "TASK", + "UNKNOWN", + "UNSPECIFIED"); + + Property MAPI_SENT_BY_SERVER_TYPE = + Property.internalText( + PREFIX_DOC_META + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "mapi-sent-by-server-type"); + + Property MAPI_FROM_REPRESENTING_NAME = + Property.internalText( + PREFIX_DOC_META + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "mapi-from-representing-name"); + + Property MAPI_FROM_REPRESENTING_EMAIL = + Property.internalText( + PREFIX_DOC_META + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "mapi-from-representing-email"); + + Property MAPI_MESSAGE_CLIENT_SUBMIT_TIME = + Property.internalDate( + PREFIX_DOC_META + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "mapi-msg-client-submit-time"); + + /** + * Embedded files may have a "progID" associated with them, such as Word.Document.12 or + * AcroExch.Document.DC */ Property PROG_ID = Property.internalText("msoffice:progID"); Property OCX_NAME = Property.internalText("msoffice:ocxName"); - Property MAPI_RECIPIENTS_STRING = Property.internalText(PREFIX_DOC_META + - TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "mapi-recipients-string"); - Property MAPI_IMPORTANCE = Property.internalInteger(PREFIX_DOC_META + - TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "mapi-importance"); - Property MAPI_PRIORTY = Property.internalInteger(PREFIX_DOC_META + - TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "mapi-importance"); - Property MAPI_IS_FLAGGED = 
Property.internalBoolean(PREFIX_DOC_META + - TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "mapi-is-flagged"); + Property MAPI_RECIPIENTS_STRING = + Property.internalText( + PREFIX_DOC_META + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "mapi-recipients-string"); + Property MAPI_IMPORTANCE = + Property.internalInteger( + PREFIX_DOC_META + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "mapi-importance"); + Property MAPI_PRIORTY = + Property.internalInteger( + PREFIX_DOC_META + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "mapi-importance"); + Property MAPI_IS_FLAGGED = + Property.internalBoolean( + PREFIX_DOC_META + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "mapi-is-flagged"); } diff --git a/tika-core/src/main/java/org/apache/tika/metadata/OfficeOpenXMLCore.java b/tika-core/src/main/java/org/apache/tika/metadata/OfficeOpenXMLCore.java index 1259719e16..5e3b32161b 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/OfficeOpenXMLCore.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/OfficeOpenXMLCore.java @@ -17,62 +17,58 @@ package org.apache.tika.metadata; /** - * Core properties as defined in the Office Open XML specification part Two that are not - * in the DublinCore namespace. - * There is also a keyword property definition in the specification which is omitted here, - * because Tika should stick to the DublinCore/IPTC definition. + * Core properties as defined in the Office Open XML specification part Two that are not in the + * DublinCore namespace. There is also a keyword property definition in the specification which is + * omitted here, because Tika should stick to the DublinCore/IPTC definition. 
* - * @see ISO document of Office Open XML specification - * @see ECMA document of Office Open XML specification + * @see ISO document of Office Open XML specification + * @see ECMA + * document of Office Open XML specification */ public interface OfficeOpenXMLCore { String NAMESPACE_URI = "http://schemas.openxmlformats.org/package/2006/metadata/core-properties/"; String PREFIX = "cp"; - /** - * A categorization of the content of this package. - */ - Property CATEGORY = Property.externalText( - PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "category"); + /** A categorization of the content of this package. */ + Property CATEGORY = + Property.externalText( + PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "category"); - /** - * The status of the content. - */ - Property CONTENT_STATUS = Property.externalText( - PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "contentStatus"); + /** The status of the content. */ + Property CONTENT_STATUS = + Property.externalText( + PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "contentStatus"); - /** - * The user who performed the last modification. The identification is environment-specific. - */ - Property LAST_MODIFIED_BY = Property.externalText( - PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "lastModifiedBy"); + /** The user who performed the last modification. The identification is environment-specific. */ + Property LAST_MODIFIED_BY = + Property.externalText( + PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "lastModifiedBy"); - /** - * The date and time of the last printing. - */ - Property LAST_PRINTED = Property.externalDate( - PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "lastPrinted"); + /** The date and time of the last printing. */ + Property LAST_PRINTED = + Property.externalDate( + PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "lastPrinted"); - /** - * The revision number. 
- */ - Property REVISION = Property.externalText( - PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "revision"); + /** The revision number. */ + Property REVISION = + Property.externalText( + PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "revision"); - /** - * The version number. This value is set by the user or by the application. - */ - Property VERSION = Property.externalText( - PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "version"); + /** The version number. This value is set by the user or by the application. */ + Property VERSION = + Property.externalText( + PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "version"); - /** - * The document's subject. Also populates {@link DublinCore#SUBJECT} - */ + /** The document's subject. Also populates {@link DublinCore#SUBJECT} */ @Deprecated - Property SUBJECT = Property.composite(Property.externalText( - PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "subject"), - new Property[]{DublinCore.SUBJECT,}); + Property SUBJECT = + Property.composite( + Property.externalText( + PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "subject"), + new Property[] { + DublinCore.SUBJECT, + }); } diff --git a/tika-core/src/main/java/org/apache/tika/metadata/OfficeOpenXMLExtended.java b/tika-core/src/main/java/org/apache/tika/metadata/OfficeOpenXMLExtended.java index 6919c216b6..8bd6a00fc9 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/OfficeOpenXMLExtended.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/OfficeOpenXMLExtended.java @@ -17,16 +17,16 @@ package org.apache.tika.metadata; /** - * Extended properties as defined in the Office Open XML specification part Four. - * Those properties are omitted which have equivalent properties defined in the ODF - * namespace like "word count". - * Also not all properties from the specification are defined here, yet. Only those which have - * been in use by the parsers so far. 
+ * Extended properties as defined in the Office Open XML specification part Four. Those properties + * are omitted which have equivalent properties defined in the ODF namespace like "word count". Also + * not all properties from the specification are defined here, yet. Only those which have been in + * use by the parsers so far. * - * @see ISO document of Office Open XML specification - * @see ECMA document of Office Open XML specification + * @see ISO document of Office Open XML specification + * @see ECMA + * document of Office Open XML specification */ public interface OfficeOpenXMLExtended { String NAMESPACE_URI = @@ -42,42 +42,60 @@ public interface OfficeOpenXMLExtended { String SECURITY_LOCKED_FOR_ANNOTATIONS = "LockedForAnnotations"; String SECURITY_UNKNOWN = "Unknown"; - Property TEMPLATE = Property.externalText( - PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Template"); + Property TEMPLATE = + Property.externalText( + PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Template"); - Property MANAGER = Property.externalTextBag( - PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Manager"); + Property MANAGER = + Property.externalTextBag( + PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Manager"); - Property COMPANY = Property.externalText( - PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Company"); + Property COMPANY = + Property.externalText( + PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Company"); - Property PRESENTATION_FORMAT = Property.externalText( - PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "PresentationFormat"); + Property PRESENTATION_FORMAT = + Property.externalText( + PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "PresentationFormat"); - Property NOTES = Property.externalInteger( - PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Notes"); + Property NOTES = + Property.externalInteger( + PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + 
"Notes"); - Property TOTAL_TIME = Property.externalInteger( - PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "TotalTime"); + Property TOTAL_TIME = + Property.externalInteger( + PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "TotalTime"); - Property HIDDEN_SLIDES = Property.externalInteger( - PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "HiddedSlides"); + Property HIDDEN_SLIDES = + Property.externalInteger( + PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "HiddedSlides"); - Property APPLICATION = Property.externalText( - PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Application"); + Property APPLICATION = + Property.externalText( + PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Application"); - Property APP_VERSION = Property.externalText( - PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "AppVersion"); - //Integer flag - Property DOC_SECURITY = Property.externalInteger( - PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "DocSecurity"); + Property APP_VERSION = + Property.externalText( + PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "AppVersion"); + // Integer flag + Property DOC_SECURITY = + Property.externalInteger( + PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "DocSecurity"); - //Human readable string explaining doc security flag - Property DOC_SECURITY_STRING = Property.externalClosedChoise( - PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "DocSecurityString", - SECURITY_NONE, SECURITY_PASSWORD_PROTECTED, SECURITY_READ_ONLY_RECOMMENDED, - SECURITY_READ_ONLY_ENFORCED, SECURITY_LOCKED_FOR_ANNOTATIONS, SECURITY_UNKNOWN); + // Human readable string explaining doc security flag + Property DOC_SECURITY_STRING = + Property.externalClosedChoise( + PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "DocSecurityString", + SECURITY_NONE, + SECURITY_PASSWORD_PROTECTED, + SECURITY_READ_ONLY_RECOMMENDED, + SECURITY_READ_ONLY_ENFORCED, + 
SECURITY_LOCKED_FOR_ANNOTATIONS, + SECURITY_UNKNOWN); - Property COMMENTS = Property.externalTextBag( - WORD_PROCESSING_PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Comments"); + Property COMMENTS = + Property.externalTextBag( + WORD_PROCESSING_PREFIX + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "Comments"); } diff --git a/tika-core/src/main/java/org/apache/tika/metadata/PDF.java b/tika-core/src/main/java/org/apache/tika/metadata/PDF.java index b15c10383c..475af4f341 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/PDF.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/PDF.java @@ -27,20 +27,18 @@ public interface PDF { String PDFA_PREFIX = "pdfa" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER; String PDFAID_PREFIX = "pdfaid" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER; - /** - * Number of %%EOF as extracted by the StartXRefScanner. See - * that class for limitations. + * Number of %%EOF as extracted by the StartXRefScanner. See that class for limitations. * - * This includes the final %%EOF, which may or may not be at the literal - * end of the file. This does not include an %%EOF - * if the startxref=0, as would happen in a dummy %%EOF in a linearized PDF. + *

This includes the final %%EOF, which may or may not be at the literal end of the file. + * This does not include an %%EOF if the startxref=0, as would happen in a dummy %%EOF in a + * linearized PDF. */ Property EOF_OFFSETS = Property.externalRealSeq(PDF_PREFIX + "eofOffsets"); /** - * Prefix to be used for properties that record what was stored - * in the docinfo section (as opposed to XMP) + * Prefix to be used for properties that record what was stored in the docinfo section (as + * opposed to XMP) */ String PDF_DOC_INFO_PREFIX = PDF_PREFIX + "docinfo" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER; @@ -92,17 +90,14 @@ public interface PDF { Property PRODUCER = Property.internalText(PDF_PREFIX + "producer"); /** - * This specifies where an action or destination would be found/triggered - * in the document: on document open, before close, etc. + * This specifies where an action or destination would be found/triggered in the document: on + * document open, before close, etc. * - * This is included in the embedded document (js only for now?), not the container PDF. + *

This is included in the embedded document (js only for now?), not the container PDF. */ Property ACTION_TRIGGER = Property.internalText(PDF_PREFIX + "actionTrigger"); - /** - * This is a list of all action or destination triggers contained - * within a given PDF. - */ + /** This is a list of all action or destination triggers contained within a given PDF. */ Property ACTION_TRIGGERS = Property.internalTextBag(PDF_PREFIX + "actionTriggers"); Property ACTION_TYPES = Property.internalTextBag(PDF_PREFIX + "actionTypes"); @@ -118,101 +113,81 @@ public interface PDF { Property OVERALL_PERCENTAGE_UNMAPPED_UNICODE_CHARS = Property.internalReal(PDF_PREFIX + "overallPercentageUnmappedUnicodeChars"); - /** - * Contains at least one damaged font for at least one character - */ - Property CONTAINS_DAMAGED_FONT = - Property.internalBoolean(PDF_PREFIX + "containsDamagedFont"); + /** Contains at least one damaged font for at least one character */ + Property CONTAINS_DAMAGED_FONT = Property.internalBoolean(PDF_PREFIX + "containsDamagedFont"); - /** - * Contains at least one font that is not embedded - */ + /** Contains at least one font that is not embedded */ Property CONTAINS_NON_EMBEDDED_FONT = Property.internalBoolean(PDF_PREFIX + "containsNonEmbeddedFont"); - /** - * Has XFA - */ + /** Has XFA */ Property HAS_XFA = Property.internalBoolean(PDF_PREFIX + "hasXFA"); - /** - * Has XMP, whether or not it is valid - */ + /** Has XMP, whether or not it is valid */ Property HAS_XMP = Property.internalBoolean(PDF_PREFIX + "hasXMP"); /** - * If xmp is extracted by, e.g. the XMLProfiler, where did it come from? - * The document's document catalog or a specific page...or? + * If xmp is extracted by, e.g. the XMLProfiler, where did it come from? The document's document + * catalog or a specific page...or? 
*/ Property XMP_LOCATION = Property.internalText(PDF_PREFIX + "xmpLocation"); - /** - * Has > 0 AcroForm fields - */ + /** Has > 0 AcroForm fields */ Property HAS_ACROFORM_FIELDS = Property.internalBoolean(PDF_PREFIX + "hasAcroFormFields"); Property HAS_MARKED_CONTENT = Property.internalBoolean(PDF_PREFIX + "hasMarkedContent"); - /** - * Has a collection element in the root. If true, this is likely a PDF Portfolio. - */ + /** Has a collection element in the root. If true, this is likely a PDF Portfolio. */ Property HAS_COLLECTION = Property.internalBoolean(PDF_PREFIX + "hasCollection"); - Property EMBEDDED_FILE_DESCRIPTION = Property.externalText(PDF_PREFIX + - "embeddedFileDescription"); + Property EMBEDDED_FILE_DESCRIPTION = + Property.externalText(PDF_PREFIX + "embeddedFileDescription"); - /** - * If the file came from an annotation and there was a type - */ - Property EMBEDDED_FILE_ANNOTATION_TYPE = Property.internalText(PDF_PREFIX + - "embeddedFileAnnotationType"); + /** If the file came from an annotation and there was a type */ + Property EMBEDDED_FILE_ANNOTATION_TYPE = + Property.internalText(PDF_PREFIX + "embeddedFileAnnotationType"); /** - * literal string from the PDEmbeddedFile#getSubtype(), should be what the PDF - * alleges is the embedded file's mime type - */ - Property EMBEDDED_FILE_SUBTYPE = Property.internalText(PDF_PREFIX + - "embeddedFileSubtype"); - /** - * If the PDF has an annotation of type 3D + * literal string from the PDEmbeddedFile#getSubtype(), should be what the PDF alleges is the + * embedded file's mime type */ + Property EMBEDDED_FILE_SUBTYPE = Property.internalText(PDF_PREFIX + "embeddedFileSubtype"); + + /** If the PDF has an annotation of type 3D */ Property HAS_3D = Property.internalBoolean(PDF_PREFIX + "has3D"); Property ANNOTATION_TYPES = Property.internalTextBag(PDF_PREFIX + "annotationTypes"); Property ANNOTATION_SUBTYPES = Property.internalTextBag(PDF_PREFIX + "annotationSubtypes"); - /** - * Number of 3D annotations a PDF 
contains. This makes {@link PDF#HAS_3D} redundant. - */ + /** Number of 3D annotations a PDF contains. This makes {@link PDF#HAS_3D} redundant. */ Property NUM_3D_ANNOTATIONS = Property.internalInteger(PDF_PREFIX + "num3DAnnotations"); - Property ASSOCIATED_FILE_RELATIONSHIP = Property.internalText(PDF_PREFIX + - "associatedFileRelationship"); + Property ASSOCIATED_FILE_RELATIONSHIP = + Property.internalText(PDF_PREFIX + "associatedFileRelationship"); + /** - * This is a zero-based number for incremental updates within a PDF -- 0 is the first - * update, 1 is the second, etc. The final version of the PDF (e.g. the last update) - * does not have an incremental update number. + * This is a zero-based number for incremental updates within a PDF -- 0 is the first update, 1 + * is the second, etc. The final version of the PDF (e.g. the last update) does not have an + * incremental update number. * - * This value is populated with the parse incremental updates feature is selected - * in the PDFParser. + *

This value is populated with the parse incremental updates feature is selected in the + * PDFParser. */ Property INCREMENTAL_UPDATE_NUMBER = - Property.composite(Property.internalInteger(PDF_PREFIX + "incrementalUpdateNumber"), - new Property[]{ TikaCoreProperties.VERSION_NUMBER }); + Property.composite( + Property.internalInteger(PDF_PREFIX + "incrementalUpdateNumber"), + new Property[] {TikaCoreProperties.VERSION_NUMBER}); - /** - * Incremental updates as extracted by the StartXRefScanner. See - * that class for limitations. - */ + /** Incremental updates as extracted by the StartXRefScanner. See that class for limitations. */ Property PDF_INCREMENTAL_UPDATE_COUNT = - Property.composite( Property.externalInteger(PDF_PREFIX + "incrementalUpdateCount"), - new Property[]{ TikaCoreProperties.VERSION_COUNT }); + Property.composite( + Property.externalInteger(PDF_PREFIX + "incrementalUpdateCount"), + new Property[] {TikaCoreProperties.VERSION_COUNT}); /** - * This counts the number of pages that would have been OCR'd or were OCR'd depending - * on the OCR settings. If NO_OCR is selected, this will + * This counts the number of pages that would have been OCR'd or were OCR'd depending on the OCR + * settings. If NO_OCR is selected, this will */ Property OCR_PAGE_COUNT = Property.externalInteger(PDF_PREFIX + "ocrPageCount"); - } diff --git a/tika-core/src/main/java/org/apache/tika/metadata/PagedText.java b/tika-core/src/main/java/org/apache/tika/metadata/PagedText.java index 4ba79090e5..944ba977a1 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/PagedText.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/PagedText.java @@ -17,22 +17,19 @@ package org.apache.tika.metadata; /** - * XMP Paged-text schema. This is a collection of - * {@link Property property definition} constants for the paged text - * properties defined in the XMP standard. + * XMP Paged-text schema. 
This is a collection of {@link Property property definition} constants for + * the paged text properties defined in the XMP standard. * - * @see XMP Specification, Part 2: Standard Schemas + * @see XMP Specification, Part 2: Standard Schemas * @since Apache Tika 0.8 */ public interface PagedText { - /** - * "The number of pages in the document (including any in contained - * documents)." - */ + /** "The number of pages in the document (including any in contained documents)." */ Property N_PAGES = Property.internalInteger("xmpTPg:NPages"); - //TODO MaxPageSize, Fonts, Colorants, PlateNames + // TODO MaxPageSize, Fonts, Colorants, PlateNames } diff --git a/tika-core/src/main/java/org/apache/tika/metadata/Photoshop.java b/tika-core/src/main/java/org/apache/tika/metadata/Photoshop.java index af4ababb08..1b8036c9d4 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/Photoshop.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/Photoshop.java @@ -22,12 +22,12 @@ /** * XMP Photoshop metadata schema. - *

- * A collection of property constants for the - * Photo Metadata properties defined in the XMP Photoshop - * standard. * - * @see XMP Photoshop + *

A collection of property constants for the Photo Metadata properties defined in the XMP + * Photoshop standard. + * + * @see XMP + * Photoshop * @since Apache Tika 1.2 */ public interface Photoshop { @@ -35,57 +35,92 @@ public interface Photoshop { String NAMESPACE_URI_PHOTOSHOP = "http://ns.adobe.com/photoshop/1.0/"; String PREFIX_PHOTOSHOP = "photoshop"; - Property AUTHORS_POSITION = Property.internalText( - PREFIX_PHOTOSHOP + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "AuthorsPosition"); + Property AUTHORS_POSITION = + Property.internalText( + PREFIX_PHOTOSHOP + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "AuthorsPosition"); // TODO Replace this with proper indexed choices support - String[] _COLOR_MODE_CHOICES_INDEXED = - {"Bitmap", "Greyscale", "Indexed Colour", "RGB Color", "CMYK Colour", "Multi-Channel", - "Duotone", "LAB Colour", "reserved", "reserved", "YCbCr Colour", "YCgCo Colour", - "YCbCrK Colour"}; - Property COLOR_MODE = Property.internalClosedChoise( - PREFIX_PHOTOSHOP + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "ColorMode", - _COLOR_MODE_CHOICES_INDEXED); - - Property CAPTION_WRITER = Property.internalText( - PREFIX_PHOTOSHOP + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "CaptionWriter"); - - Property CATEGORY = Property.internalText( - PREFIX_PHOTOSHOP + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Category"); - - Property CITY = Property.internalText( - PREFIX_PHOTOSHOP + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "City"); - - Property COUNTRY = Property.internalText( - PREFIX_PHOTOSHOP + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Country"); - - Property CREDIT = Property.internalText( - PREFIX_PHOTOSHOP + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Credit"); - - Property DATE_CREATED = Property.internalDate( - PREFIX_PHOTOSHOP + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "DateCreated"); - - Property HEADLINE = Property.internalText( - PREFIX_PHOTOSHOP + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER 
+ "Headline"); - - Property INSTRUCTIONS = Property.internalText( - PREFIX_PHOTOSHOP + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Instructions"); - - Property SOURCE = Property.internalText( - PREFIX_PHOTOSHOP + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Source"); - - Property STATE = Property.internalText( - PREFIX_PHOTOSHOP + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "State"); - - Property SUPPLEMENTAL_CATEGORIES = Property.internalTextBag( - PREFIX_PHOTOSHOP + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + - "SupplementalCategories"); - - Property TRANSMISSION_REFERENCE = Property.internalText( - PREFIX_PHOTOSHOP + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + - "TransmissionReference"); - - Property URGENCY = Property.internalText( - PREFIX_PHOTOSHOP + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Urgency"); - + String[] _COLOR_MODE_CHOICES_INDEXED = { + "Bitmap", + "Greyscale", + "Indexed Colour", + "RGB Color", + "CMYK Colour", + "Multi-Channel", + "Duotone", + "LAB Colour", + "reserved", + "reserved", + "YCbCr Colour", + "YCgCo Colour", + "YCbCrK Colour" + }; + Property COLOR_MODE = + Property.internalClosedChoise( + PREFIX_PHOTOSHOP + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "ColorMode", + _COLOR_MODE_CHOICES_INDEXED); + + Property CAPTION_WRITER = + Property.internalText( + PREFIX_PHOTOSHOP + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "CaptionWriter"); + + Property CATEGORY = + Property.internalText( + PREFIX_PHOTOSHOP + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Category"); + + Property CITY = + Property.internalText( + PREFIX_PHOTOSHOP + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "City"); + + Property COUNTRY = + Property.internalText( + PREFIX_PHOTOSHOP + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Country"); + + Property CREDIT = + Property.internalText( + PREFIX_PHOTOSHOP + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Credit"); + + Property DATE_CREATED = + Property.internalDate( + 
PREFIX_PHOTOSHOP + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "DateCreated"); + + Property HEADLINE = + Property.internalText( + PREFIX_PHOTOSHOP + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Headline"); + + Property INSTRUCTIONS = + Property.internalText( + PREFIX_PHOTOSHOP + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "Instructions"); + + Property SOURCE = + Property.internalText( + PREFIX_PHOTOSHOP + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Source"); + + Property STATE = + Property.internalText( + PREFIX_PHOTOSHOP + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "State"); + + Property SUPPLEMENTAL_CATEGORIES = + Property.internalTextBag( + PREFIX_PHOTOSHOP + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "SupplementalCategories"); + + Property TRANSMISSION_REFERENCE = + Property.internalText( + PREFIX_PHOTOSHOP + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "TransmissionReference"); + + Property URGENCY = + Property.internalText( + PREFIX_PHOTOSHOP + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Urgency"); } diff --git a/tika-core/src/main/java/org/apache/tika/metadata/Property.java b/tika-core/src/main/java/org/apache/tika/metadata/Property.java index 3d67141414..a40f840f27 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/Property.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/Property.java @@ -26,10 +26,9 @@ import java.util.concurrent.ConcurrentHashMap; /** - * XMP property definition. Each instance of this class defines a single - * metadata property like "dc:format". In addition to the property name, - * the {@link ValueType value type} and category (internal or external) - * of the property are included in the property definition. The available + * XMP property definition. Each instance of this class defines a single metadata property like + * "dc:format". 
In addition to the property name, the {@link ValueType value type} and category + * (internal or external) of the property are included in the property definition. The available * choice values are also stored for open and closed choice value types. * * @since Apache Tika 0.7 @@ -43,21 +42,25 @@ public final class Property implements Comparable { private final ValueType valueType; private final Property primaryProperty; private final Property[] secondaryExtractProperties; - /** - * The available choices for the open and closed choice value types. - */ + + /** The available choices for the open and closed choice value types. */ private final Set choices; - private Property(String name, boolean internal, PropertyType propertyType, ValueType valueType, - String[] choices, Property primaryProperty, - Property[] secondaryExtractProperties) { + private Property( + String name, + boolean internal, + PropertyType propertyType, + ValueType valueType, + String[] choices, + Property primaryProperty, + Property[] secondaryExtractProperties) { this.name = name; this.internal = internal; this.propertyType = propertyType; this.valueType = valueType; if (choices != null) { - this.choices = Collections - .unmodifiableSet(new HashSet<>(Arrays.asList(choices.clone()))); + this.choices = + Collections.unmodifiableSet(new HashSet<>(Arrays.asList(choices.clone()))); } else { this.choices = null; } @@ -76,8 +79,12 @@ private Property(String name, boolean internal, PropertyType propertyType, Value } } - private Property(String name, boolean internal, PropertyType propertyType, ValueType valueType, - String[] choices) { + private Property( + String name, + boolean internal, + PropertyType propertyType, + ValueType valueType, + String[] choices) { this(name, internal, propertyType, valueType, choices, null, null); } @@ -89,8 +96,8 @@ private Property(String name, boolean internal, ValueType valueType) { this(name, internal, PropertyType.SIMPLE, valueType, null); } - private Property(String 
name, boolean internal, PropertyType propertyType, - ValueType valueType) { + private Property( + String name, boolean internal, PropertyType propertyType, ValueType valueType) { this(name, internal, propertyType, valueType, null); } @@ -222,16 +229,16 @@ public static Property externalTextBag(String name) { /** * Constructs a new composite property from the given primary and array of secondary properties. - *

- * Note that name of the composite property is taken from its primary property, - * and primary and secondary properties must not be composite properties themselves. + * + *

Note that name of the composite property is taken from its primary property, and primary + * and secondary properties must not be composite properties themselves. * * @param primaryProperty * @param secondaryExtractProperties * @return the composite property */ - public static Property composite(Property primaryProperty, - Property[] secondaryExtractProperties) { + public static Property composite( + Property primaryProperty, Property[] secondaryExtractProperties) { if (primaryProperty == null) { throw new NullPointerException("primaryProperty must not be null"); } @@ -249,8 +256,13 @@ public static Property composite(Property primaryProperty, if (primaryProperty.getChoices() != null) { choices = primaryProperty.getChoices().toArray(new String[0]); } - return new Property(primaryProperty.getName(), primaryProperty.isInternal(), - PropertyType.COMPOSITE, ValueType.PROPERTY, choices, primaryProperty, + return new Property( + primaryProperty.getName(), + primaryProperty.isInternal(), + PropertyType.COMPOSITE, + ValueType.PROPERTY, + choices, + primaryProperty, secondaryExtractProperties); } @@ -266,12 +278,11 @@ public boolean isExternal() { return !internal; } - /** - * Is the PropertyType one which accepts multiple values? - */ + /** Is the PropertyType one which accepts multiple values? */ public boolean isMultiValuePermitted() { - if (propertyType == PropertyType.BAG || propertyType == PropertyType.SEQ || - propertyType == PropertyType.ALT) { + if (propertyType == PropertyType.BAG + || propertyType == PropertyType.SEQ + || propertyType == PropertyType.ALT) { return true; } else if (propertyType == PropertyType.COMPOSITE) { // Base it on the primary property's behaviour @@ -289,9 +300,9 @@ public ValueType getValueType() { } /** - * Returns the (immutable) set of choices for the values of this property. - * Only defined for {@link ValueType#OPEN_CHOICE open} and - * {@link ValueType#CLOSED_CHOICE closed choice} value types. 
+ * Returns the (immutable) set of choices for the values of this property. Only defined for + * {@link ValueType#OPEN_CHOICE open} and {@link ValueType#CLOSED_CHOICE closed choice} value + * types. * * @return available choices, or null */ @@ -325,40 +336,43 @@ public boolean equals(Object o) { return o instanceof Property && name.equals(((Property) o).name); } - //----------------------------------------------------------< Comparable > + // ----------------------------------------------------------< Comparable > public int hashCode() { return name.hashCode(); } - //--------------------------------------------------------------< Object > + // --------------------------------------------------------------< Object > public enum PropertyType { - /** - * A single value - */ - SIMPLE, STRUCTURE, - /** - * An un-ordered array - */ + /** A single value */ + SIMPLE, + STRUCTURE, + /** An un-ordered array */ BAG, - /** - * An ordered array - */ + /** An ordered array */ SEQ, - /** - * An ordered array with some sort of criteria - */ + /** An ordered array with some sort of criteria */ ALT, - /** - * Multiple child properties - */ + /** Multiple child properties */ COMPOSITE } public enum ValueType { - BOOLEAN, OPEN_CHOICE, CLOSED_CHOICE, DATE, INTEGER, LOCALE, MIME_TYPE, PROPER_NAME, - RATIONAL, REAL, TEXT, URI, URL, XPATH, PROPERTY + BOOLEAN, + OPEN_CHOICE, + CLOSED_CHOICE, + DATE, + INTEGER, + LOCALE, + MIME_TYPE, + PROPER_NAME, + RATIONAL, + REAL, + TEXT, + URI, + URL, + XPATH, + PROPERTY } - } diff --git a/tika-core/src/main/java/org/apache/tika/metadata/PropertyTypeException.java b/tika-core/src/main/java/org/apache/tika/metadata/PropertyTypeException.java index ff1f926ba4..29947e680d 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/PropertyTypeException.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/PropertyTypeException.java @@ -19,11 +19,10 @@ import org.apache.tika.metadata.Property.PropertyType; import 
org.apache.tika.metadata.Property.ValueType; - /** - * XMP property definition violation exception. This is thrown when - * you try to set a {@link Property} value with an incorrect type, - * such as storing an Integer when the property is of type Date. + * XMP property definition violation exception. This is thrown when you try to set a {@link + * Property} value with an incorrect type, such as storing an Integer when the property is of type + * Date. * * @since Apache Tika 0.8 */ @@ -42,9 +41,10 @@ public PropertyTypeException(ValueType expected, ValueType found) { } public PropertyTypeException(PropertyType unsupportedPropertyType) { - super((unsupportedPropertyType != PropertyType.COMPOSITE) ? - unsupportedPropertyType + " is not supported" : - "Composite Properties must not include other Composite" + - " Properties as either Primary or Secondary"); + super( + (unsupportedPropertyType != PropertyType.COMPOSITE) + ? unsupportedPropertyType + " is not supported" + : "Composite Properties must not include other Composite" + + " Properties as either Primary or Secondary"); } } diff --git a/tika-core/src/main/java/org/apache/tika/metadata/QuattroPro.java b/tika-core/src/main/java/org/apache/tika/metadata/QuattroPro.java index 066348804f..c056bb61cf 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/QuattroPro.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/QuattroPro.java @@ -26,27 +26,31 @@ public interface QuattroPro { String QUATTROPRO_METADATA_NAME_PREFIX = "wordperfect"; - /** - * ID. - */ - Property ID = Property.internalText( - QUATTROPRO_METADATA_NAME_PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Id"); - /** - * Version. - */ - Property VERSION = Property.internalInteger( - QUATTROPRO_METADATA_NAME_PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + - "Version"); - /** - * Build. 
- */ - Property BUILD = Property.internalInteger( - QUATTROPRO_METADATA_NAME_PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + - "Build"); - /** - * Lowest version. - */ - Property LOWEST_VERSION = Property.internalInteger( - QUATTROPRO_METADATA_NAME_PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + - "LowestVersion"); + /** ID. */ + Property ID = + Property.internalText( + QUATTROPRO_METADATA_NAME_PREFIX + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "Id"); + + /** Version. */ + Property VERSION = + Property.internalInteger( + QUATTROPRO_METADATA_NAME_PREFIX + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "Version"); + + /** Build. */ + Property BUILD = + Property.internalInteger( + QUATTROPRO_METADATA_NAME_PREFIX + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "Build"); + + /** Lowest version. */ + Property LOWEST_VERSION = + Property.internalInteger( + QUATTROPRO_METADATA_NAME_PREFIX + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "LowestVersion"); } diff --git a/tika-core/src/main/java/org/apache/tika/metadata/RTFMetadata.java b/tika-core/src/main/java/org/apache/tika/metadata/RTFMetadata.java index e4572e38f9..de0d45aa8c 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/RTFMetadata.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/RTFMetadata.java @@ -1,47 +1,52 @@ package org.apache.tika.metadata; /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ public interface RTFMetadata { String PREFIX_RTF_META = "rtf_meta"; - String RTF_PICT_META_PREFIX = "rtf_pict:"; /** - * if set to true, this means that an image file is probably a "thumbnail" - * any time a pict/emf/wmf is in an object + * if set to true, this means that an image file is probably a "thumbnail" any time a + * pict/emf/wmf is in an object */ - Property THUMBNAIL = Property.internalBoolean( - PREFIX_RTF_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "thumbnail"); + Property THUMBNAIL = + Property.internalBoolean( + PREFIX_RTF_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "thumbnail"); /** - * if an application and version is given as part of the - * embedded object, this is the literal string + * if an application and version is given as part of the embedded object, this is the literal + * string */ - Property EMB_APP_VERSION = Property.internalText( - PREFIX_RTF_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "emb_app_version"); - - Property EMB_CLASS = Property.internalText( - PREFIX_RTF_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "emb_class"); - - Property EMB_TOPIC = Property.internalText( - PREFIX_RTF_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "emb_topic"); - - Property EMB_ITEM = Property.internalText( - PREFIX_RTF_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "emb_item"); - + Property EMB_APP_VERSION = + Property.internalText( + PREFIX_RTF_META + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "emb_app_version"); + + Property EMB_CLASS = + Property.internalText( + PREFIX_RTF_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "emb_class"); + + Property EMB_TOPIC = + Property.internalText( + PREFIX_RTF_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "emb_topic"); + + Property EMB_ITEM = + Property.internalText( + PREFIX_RTF_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "emb_item"); } diff --git a/tika-core/src/main/java/org/apache/tika/metadata/TIFF.java 
b/tika-core/src/main/java/org/apache/tika/metadata/TIFF.java index fe5fd0ec39..b8d29923b8 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/TIFF.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/TIFF.java @@ -17,112 +17,75 @@ package org.apache.tika.metadata; /** - * XMP Exif TIFF schema. This is a collection of - * {@link Property property definition} constants for the Exif TIFF - * properties defined in the XMP standard. + * XMP Exif TIFF schema. This is a collection of {@link Property property definition} constants for + * the Exif TIFF properties defined in the XMP standard. * - * @see XMP Specification, Part 2: Standard Schemas + * @see XMP Specification, Part 2: Standard Schemas * @since Apache Tika 0.8 */ public interface TIFF { - /** - * "Number of bits per component in each channel." - */ + /** "Number of bits per component in each channel." */ Property BITS_PER_SAMPLE = Property.internalIntegerSequence("tiff:BitsPerSample"); - /** - * "Image height in pixels." - */ + /** "Image height in pixels." */ Property IMAGE_LENGTH = Property.internalInteger("tiff:ImageLength"); - /** - * "Image width in pixels." - */ + /** "Image width in pixels." */ Property IMAGE_WIDTH = Property.internalInteger("tiff:ImageWidth"); - /** - * "Number of components per pixel." - */ + /** "Number of components per pixel." */ Property SAMPLES_PER_PIXEL = Property.internalInteger("tiff:SamplesPerPixel"); - /** - * Did the Flash fire when taking this image? - */ + /** Did the Flash fire when taking this image? */ Property FLASH_FIRED = Property.internalBoolean("exif:Flash"); - /** - * "Exposure time in seconds." - */ + /** "Exposure time in seconds." */ Property EXPOSURE_TIME = Property.internalRational("exif:ExposureTime"); /** - * "F-Number." - * The f-number is the focal length divided by the "effective" aperture - * diameter. It is a dimensionless number that is a measure of lens speed. + * "F-Number." 
The f-number is the focal length divided by the "effective" aperture diameter. It + * is a dimensionless number that is a measure of lens speed. */ Property F_NUMBER = Property.internalRational("exif:FNumber"); - /** - * "Focal length of the lens, in millimeters." - */ + /** "Focal length of the lens, in millimeters." */ Property FOCAL_LENGTH = Property.internalRational("exif:FocalLength"); - /** - * "ISO Speed and ISO Latitude of the input device as specified in ISO 12232" - */ + /** "ISO Speed and ISO Latitude of the input device as specified in ISO 12232" */ Property ISO_SPEED_RATINGS = Property.internalIntegerSequence("exif:IsoSpeedRatings"); - /** - * "Manufacturer of the recording equipment." - */ + /** "Manufacturer of the recording equipment." */ Property EQUIPMENT_MAKE = Property.internalText("tiff:Make"); - /** - * "Model name or number of the recording equipment." - */ + /** "Model name or number of the recording equipment." */ Property EQUIPMENT_MODEL = Property.internalText("tiff:Model"); - /** - * "Software or firmware used to generate the image." - */ + /** "Software or firmware used to generate the image." */ Property SOFTWARE = Property.internalText("tiff:Software"); /** - * "The Orientation of the image." - * 1 = 0th row at top, 0th column at left - * 2 = 0th row at top, 0th column at right - * 3 = 0th row at bottom, 0th column at right - * 4 = 0th row at bottom, 0th column at left - * 5 = 0th row at left, 0th column at top - * 6 = 0th row at right, 0th column at top - * 7 = 0th row at right, 0th column at bottom - * 8 = 0th row at left, 0th column at bottom + * "The Orientation of the image." 
1 = 0th row at top, 0th column at left 2 = 0th row at top, + * 0th column at right 3 = 0th row at bottom, 0th column at right 4 = 0th row at bottom, 0th + * column at left 5 = 0th row at left, 0th column at top 6 = 0th row at right, 0th column at top + * 7 = 0th row at right, 0th column at bottom 8 = 0th row at left, 0th column at bottom */ Property ORIENTATION = - Property.internalClosedChoise("tiff:Orientation", "1", "2", "3", "4", "5", "6", "7", - "8"); + Property.internalClosedChoise( + "tiff:Orientation", "1", "2", "3", "4", "5", "6", "7", "8"); - /** - * "Horizontal resolution in pixels per unit." - */ + /** "Horizontal resolution in pixels per unit." */ Property RESOLUTION_HORIZONTAL = Property.internalRational("tiff:XResolution"); - /** - * "Vertical resolution in pixels per unit." - */ + /** "Vertical resolution in pixels per unit." */ Property RESOLUTION_VERTICAL = Property.internalRational("tiff:YResolution"); - /** - * "Units used for Horizontal and Vertical Resolutions." - * One of "Inch" or "cm" - */ + /** "Units used for Horizontal and Vertical Resolutions." 
One of "Inch" or "cm" */ Property RESOLUTION_UNIT = Property.internalClosedChoise("tiff:ResolutionUnit", "Inch", "cm"); - /** - * "Date and time when original image was generated" - */ + /** "Date and time when original image was generated" */ Property ORIGINAL_DATE = Property.internalDate("exif:DateTimeOriginal"); Property EXIF_PAGE_COUNT = Property.externalInteger("exif:PageCount"); diff --git a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java index effa4a6674..54a76773fc 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java @@ -17,229 +17,232 @@ package org.apache.tika.metadata; /** - * Contains a core set of basic Tika metadata properties, which all parsers - * will attempt to supply (where the file format permits). These are all - * defined in terms of other standard namespaces. - *

- * Users of Tika who wish to have consistent metadata across file formats - * can make use of these Properties, knowing that where present they will - * have consistent semantic meaning between different file formats. (No - * matter if one file format calls it Title, another Long-Title and another - * Long-Name, if they all mean the same thing as defined by - * {@link DublinCore#TITLE} then they will all be present as such) - *

- * For now, most of these properties are composite ones including the deprecated - * non-prefixed String properties from the Metadata class. In Tika 2.0, most - * of these will revert back to simple assignments. + * Contains a core set of basic Tika metadata properties, which all parsers will attempt to supply + * (where the file format permits). These are all defined in terms of other standard namespaces. + * + *

Users of Tika who wish to have consistent metadata across file formats can make use of these + * Properties, knowing that where present they will have consistent semantic meaning between + * different file formats. (No matter if one file format calls it Title, another Long-Title and + * another Long-Name, if they all mean the same thing as defined by {@link DublinCore#TITLE} then + * they will all be present as such) + * + *

For now, most of these properties are composite ones including the deprecated non-prefixed + * String properties from the Metadata class. In Tika 2.0, most of these will revert back to simple + * assignments. * * @since Apache Tika 1.2 */ @SuppressWarnings("deprecation") public interface TikaCoreProperties { - /** - * The common delimiter used between the namespace abbreviation and the property name - */ + /** The common delimiter used between the namespace abbreviation and the property name */ String NAMESPACE_PREFIX_DELIMITER = ":"; /** - * Use this to prefix metadata properties that store information - * about the parsing process. Users should be able to distinguish - * between metadata that was contained within the document and - * metadata about the parsing process. + * Use this to prefix metadata properties that store information about the parsing process. + * Users should be able to distinguish between metadata that was contained within the document + * and metadata about the parsing process. */ String TIKA_META_PREFIX = "X-TIKA" + NAMESPACE_PREFIX_DELIMITER; + Property EMBEDDED_DEPTH = Property.internalInteger(TIKA_META_PREFIX + "embedded_depth"); /** - * This tracks the embedded file paths based on the name of embedded files - * where available. There is a small risk that there may be path collisions - * and that these paths may not be unique within a file. + * This tracks the embedded file paths based on the name of embedded files where available. + * There is a small risk that there may be path collisions and that these paths may not be + * unique within a file. * - * For a more robust path, see {@link TikaCoreProperties#EMBEDDED_ID_PATH}. + *

For a more robust path, see {@link TikaCoreProperties#EMBEDDED_ID_PATH}. */ Property EMBEDDED_RESOURCE_PATH = Property.internalText(TIKA_META_PREFIX + "embedded_resource_path"); /** - * This tracks the embedded file paths based on the embedded file's - * {@link TikaCoreProperties#EMBEDDED_ID}. + * This tracks the embedded file paths based on the embedded file's {@link + * TikaCoreProperties#EMBEDDED_ID}. */ - Property EMBEDDED_ID_PATH = - Property.internalText(TIKA_META_PREFIX + "embedded_id_path"); + Property EMBEDDED_ID_PATH = Property.internalText(TIKA_META_PREFIX + "embedded_id_path"); - /** - * This is a 1-index counter for embedded files, used by the RecursiveParserWrapper - */ - Property EMBEDDED_ID = - Property.internalInteger(TIKA_META_PREFIX + "embedded_id"); + /** This is a 1-index counter for embedded files, used by the RecursiveParserWrapper */ + Property EMBEDDED_ID = Property.internalInteger(TIKA_META_PREFIX + "embedded_id"); Property PARSE_TIME_MILLIS = Property.internalText(TIKA_META_PREFIX + "parse_time_millis"); - /** - * Simple class name of the content handler - */ + + /** Simple class name of the content handler */ Property TIKA_CONTENT_HANDLER = Property.internalText(TIKA_META_PREFIX + "content_handler"); + Property TIKA_CONTENT = Property.internalText(TIKA_META_PREFIX + "content"); - /** - * Use this to store parse exception information in the Metadata object. - */ + + /** Use this to store parse exception information in the Metadata object. */ String TIKA_META_EXCEPTION_PREFIX = TIKA_META_PREFIX + "EXCEPTION" + NAMESPACE_PREFIX_DELIMITER; - /** - * Use this to store warnings that happened during the parse. - */ + /** Use this to store warnings that happened during the parse. 
*/ String TIKA_META_WARN_PREFIX = TIKA_META_PREFIX + "WARN" + NAMESPACE_PREFIX_DELIMITER; - //exception in main file + // exception in main file Property CONTAINER_EXCEPTION = Property.internalText(TIKA_META_EXCEPTION_PREFIX + "container_exception"); - //exception in an embedded file + // exception in an embedded file Property EMBEDDED_EXCEPTION = Property.internalTextBag(TIKA_META_EXCEPTION_PREFIX + "embedded_exception"); - //exception handling the raw bytes of an embedded file by an EmbeddedDocumentByteStore + // exception handling the raw bytes of an embedded file by an EmbeddedDocumentByteStore Property EMBEDDED_BYTES_EXCEPTION = Property.internalTextBag(TIKA_META_EXCEPTION_PREFIX + "embedded_bytes_exception"); - //warning while parsing in an embedded file + // warning while parsing in an embedded file Property EMBEDDED_WARNING = Property.internalTextBag(TIKA_META_EXCEPTION_PREFIX + "embedded_warning"); Property WRITE_LIMIT_REACHED = Property.internalBoolean(TIKA_META_EXCEPTION_PREFIX + "write_limit_reached"); + /** - * Use this to store exceptions caught during a parse that are - * non-fatal, e.g. if a parser is in lenient mode and more - * content can be extracted if we ignore an exception thrown by - * a dependency. + * Use this to store exceptions caught during a parse that are non-fatal, e.g. if a parser is in + * lenient mode and more content can be extracted if we ignore an exception thrown by a + * dependency. */ Property TIKA_META_EXCEPTION_WARNING = Property.internalTextBag(TIKA_META_EXCEPTION_PREFIX + "warn"); /** - * This means that metadata keys or metadata values were truncated. - * If there is an "include" filter, this should not be set if - * a field is not in the "include" set. + * This means that metadata keys or metadata values were truncated. If there is an "include" + * filter, this should not be set if a field is not in the "include" set. 
*/ Property TRUNCATED_METADATA = Property.internalBoolean(TIKA_META_WARN_PREFIX + "truncated_metadata"); /** - * Use this to store exceptions caught while trying to read the - * stream of an embedded resource. Do not use this if there is - * a parse exception on the embedded resource. + * Use this to store exceptions caught while trying to read the stream of an embedded resource. + * Do not use this if there is a parse exception on the embedded resource. */ Property TIKA_META_EXCEPTION_EMBEDDED_STREAM = Property.internalTextBag(TIKA_META_EXCEPTION_PREFIX + "embedded_stream_exception"); + Property TIKA_PARSED_BY = Property.internalTextBag(TIKA_META_PREFIX + "Parsed-By"); /** - * Use this to store a record of all parsers that touched a given file - * in the container file's metadata. + * Use this to store a record of all parsers that touched a given file in the container file's + * metadata. */ - Property TIKA_PARSED_BY_FULL_SET = Property.internalTextBag(TIKA_META_PREFIX + "Parsed-By-Full-Set"); + Property TIKA_PARSED_BY_FULL_SET = + Property.internalTextBag(TIKA_META_PREFIX + "Parsed-By-Full-Set"); - Property TIKA_DETECTED_LANGUAGE = Property.externalTextBag(TIKA_META_PREFIX + - "detected_language"); + Property TIKA_DETECTED_LANGUAGE = + Property.externalTextBag(TIKA_META_PREFIX + "detected_language"); - Property TIKA_DETECTED_LANGUAGE_CONFIDENCE = Property.externalTextBag(TIKA_META_PREFIX + - "detected_language_confidence"); + Property TIKA_DETECTED_LANGUAGE_CONFIDENCE = + Property.externalTextBag(TIKA_META_PREFIX + "detected_language_confidence"); - Property TIKA_DETECTED_LANGUAGE_CONFIDENCE_RAW = Property.externalRealSeq(TIKA_META_PREFIX + - "detected_language_confidence_raw"); + Property TIKA_DETECTED_LANGUAGE_CONFIDENCE_RAW = + Property.externalRealSeq(TIKA_META_PREFIX + "detected_language_confidence_raw"); String RESOURCE_NAME_KEY = "resourceName"; String PROTECTED = "protected"; String EMBEDDED_RELATIONSHIP_ID = "embeddedRelationshipId"; String 
EMBEDDED_STORAGE_CLASS_ID = "embeddedStorageClassId"; String EMBEDDED_RESOURCE_TYPE_KEY = "embeddedResourceType"; + /** - * Some file formats can store information about their original - * file name/location or about their attachment's original file name/location - * within the file. + * Some file formats can store information about their original file name/location or about + * their attachment's original file name/location within the file. */ Property ORIGINAL_RESOURCE_NAME = Property.internalTextBag(TIKA_META_PREFIX + "origResourceName"); + /** - * This should be used to store the path (relative or full) - * of the source file, including the file name, - * e.g. doc/path/to/my_pdf.pdf - *

- * This can also be used for a primary key within a database. + * This should be used to store the path (relative or full) of the source file, including the + * file name, e.g. doc/path/to/my_pdf.pdf + * + *

This can also be used for a primary key within a database. */ Property SOURCE_PATH = Property.internalText(TIKA_META_PREFIX + "sourcePath"); + /** - * This is currently used to identify Content-Type that may be - * included within a document, such as in html documents - * (e.g. ) - * , or the value might come from outside the document. This information - * may be faulty and should be treated only as a hint. + * This is currently used to identify Content-Type that may be included within a document, such + * as in html documents (e.g. ) , or the value might come from outside the document. This information may be + * faulty and should be treated only as a hint. */ Property CONTENT_TYPE_HINT = Property.internalText(HttpHeaders.CONTENT_TYPE + "-Hint"); - /** - * This is used by users to override detection with the override detector. - */ + + /** This is used by users to override detection with the override detector. */ Property CONTENT_TYPE_USER_OVERRIDE = Property.internalText(HttpHeaders.CONTENT_TYPE + "-Override"); + /** - * This is used by parsers to override detection of embedded resources - * with the override detector. + * This is used by parsers to override detection of embedded resources with the override + * detector. 
*/ Property CONTENT_TYPE_PARSER_OVERRIDE = Property.internalText(HttpHeaders.CONTENT_TYPE + "-Parser-Override"); + /** * @see DublinCore#FORMAT */ Property FORMAT = DublinCore.FORMAT; + /** * @see DublinCore#IDENTIFIER */ Property IDENTIFIER = DublinCore.IDENTIFIER; + /** * @see DublinCore#CONTRIBUTOR */ Property CONTRIBUTOR = DublinCore.CONTRIBUTOR; + /** * @see DublinCore#COVERAGE */ Property COVERAGE = DublinCore.COVERAGE; + /** * @see DublinCore#CREATOR */ Property CREATOR = DublinCore.CREATOR; + /** * @see Office#LAST_AUTHOR */ Property MODIFIER = Office.LAST_AUTHOR; + /** * @see XMP#CREATOR_TOOL */ Property CREATOR_TOOL = XMP.CREATOR_TOOL; + /** * @see DublinCore#LANGUAGE */ Property LANGUAGE = DublinCore.LANGUAGE; + /** * @see DublinCore#PUBLISHER */ Property PUBLISHER = DublinCore.PUBLISHER; + /** * @see DublinCore#RELATION */ Property RELATION = DublinCore.RELATION; + /** * @see DublinCore#RIGHTS */ Property RIGHTS = DublinCore.RIGHTS; + /** * @see DublinCore#SOURCE */ Property SOURCE = DublinCore.SOURCE; + /** * @see DublinCore#TYPE */ Property TYPE = DublinCore.TYPE; + /** * @see DublinCore#TITLE */ @@ -250,12 +253,13 @@ public interface TikaCoreProperties { * @see DublinCore#DESCRIPTION */ Property DESCRIPTION = DublinCore.DESCRIPTION; + /** - * {@link DublinCore#SUBJECT}; should include both subject and keywords - * if a document format has both. See also {@link Office#KEYWORDS} - * and {@link OfficeOpenXMLCore#SUBJECT}. + * {@link DublinCore#SUBJECT}; should include both subject and keywords if a document format has + * both. See also {@link Office#KEYWORDS} and {@link OfficeOpenXMLCore#SUBJECT}. 
*/ Property SUBJECT = DublinCore.SUBJECT; + /** * @see DublinCore#DATE */ @@ -267,38 +271,41 @@ public interface TikaCoreProperties { * @see Office#SAVE_DATE */ Property MODIFIED = DublinCore.MODIFIED; + /** * @see Office#PRINT_DATE */ Property PRINT_DATE = Office.PRINT_DATE; + /** * @see XMP#METADATA_DATE */ Property METADATA_DATE = XMP.METADATA_DATE; + /** * @see Geographic#LATITUDE */ Property LATITUDE = Geographic.LATITUDE; - // Geographic related properties /** * @see Geographic#LONGITUDE */ Property LONGITUDE = Geographic.LONGITUDE; + /** * @see Geographic#ALTITUDE */ Property ALTITUDE = Geographic.ALTITUDE; + /** * @see XMP#RATING */ Property RATING = XMP.RATING; /** - * This is the number of images (as in a multi-frame gif) returned by - * Java's {@link javax.imageio.ImageReader#getNumImages(boolean)}. See - * the javadocs for known limitations. + * This is the number of images (as in a multi-frame gif) returned by Java's {@link + * javax.imageio.ImageReader#getNumImages(boolean)}. See the javadocs for known limitations. 
*/ Property NUM_IMAGES = Property.internalInteger("imagereader:NumImages"); @@ -307,13 +314,18 @@ public interface TikaCoreProperties { * @see OfficeOpenXMLExtended#COMMENTS */ Property COMMENTS = OfficeOpenXMLExtended.COMMENTS; - /** - * Embedded resource type property - */ - Property EMBEDDED_RESOURCE_TYPE = Property.internalClosedChoise(EMBEDDED_RESOURCE_TYPE_KEY, - EmbeddedResourceType.ATTACHMENT.toString(), EmbeddedResourceType.INLINE.toString(), - EmbeddedResourceType.METADATA.toString(), EmbeddedResourceType.MACRO.toString(), - EmbeddedResourceType.THUMBNAIL.toString(), EmbeddedResourceType.RENDERING.toString()); + + /** Embedded resource type property */ + Property EMBEDDED_RESOURCE_TYPE = + Property.internalClosedChoise( + EMBEDDED_RESOURCE_TYPE_KEY, + EmbeddedResourceType.ATTACHMENT.toString(), + EmbeddedResourceType.INLINE.toString(), + EmbeddedResourceType.METADATA.toString(), + EmbeddedResourceType.MACRO.toString(), + EmbeddedResourceType.THUMBNAIL.toString(), + EmbeddedResourceType.RENDERING.toString()); + Property HAS_SIGNATURE = Property.internalBoolean("hasSignature"); Property SIGNATURE_NAME = Property.internalTextBag("signature:name"); @@ -323,69 +335,68 @@ public interface TikaCoreProperties { Property SIGNATURE_FILTER = Property.internalTextBag("signature:filter"); Property SIGNATURE_CONTACT_INFO = Property.internalTextBag("signature:contact-info"); - //is the file encrypted + // is the file encrypted Property IS_ENCRYPTED = Property.internalBoolean(TIKA_META_PREFIX + "encrypted"); /** * When an EncodingDetector detects an encoding, the encoding should be stored in this field. * This is different from {@link Metadata#CONTENT_ENCODING} because that is what a parser - * chooses to use for processing a file. If an EncodingDetector returns "null", a parser - * may choose to use a default encoding. We want to differentiate between a parser using a - * default encoding and the output of an EncodingDetector. + * chooses to use for processing a file. 
If an EncodingDetector returns "null", a parser may + * choose to use a default encoding. We want to differentiate between a parser using a default + * encoding and the output of an EncodingDetector. */ Property DETECTED_ENCODING = Property.externalText(TIKA_META_PREFIX + "detectedEncoding"); - /** - * This should be the simple class name for the EncodingDetectors whose detected encoding - * was used in the parse. + * This should be the simple class name for the EncodingDetectors whose detected encoding was + * used in the parse. */ Property ENCODING_DETECTOR = Property.externalText(TIKA_META_PREFIX + "encodingDetector"); /** - * General metadata key for the count of non-final versions available within a file. This - * was added initially to support generalizing incremental updates in PDF. + * General metadata key for the count of non-final versions available within a file. This was + * added initially to support generalizing incremental updates in PDF. */ Property VERSION_COUNT = Property.externalInteger(TIKA_META_PREFIX + "versionCount"); /** - * General metadata key for the version number of a given file that contains - * earlier versions within it. This number is 0-indexed for the earliest version. - * The latest version does not have this metadata value. This was added initially - * to support generalizing incremental updates in PDF. + * General metadata key for the version number of a given file that contains earlier versions + * within it. This number is 0-indexed for the earliest version. The latest version does not + * have this metadata value. This was added initially to support generalizing incremental + * updates in PDF. */ Property VERSION_NUMBER = Property.externalInteger(TIKA_META_PREFIX + "versionNumber"); Property PIPES_RESULT = Property.externalText(TIKA_META_PREFIX + "pipes_result"); + /** - * A file might contain different types of embedded documents. - * The most common is the ATTACHMENT. - *

- * An INLINE embedded resource should be used for embedded image - * files that are used to render the page image (as in PDXObjImages in PDF files). - *

- * A MACRO is code that is embedded in the document and is intended - * to be executable within the application that opens the document. This - * includes traditional macros within Microsoft Office files and - * javascript within PDFActions. This would not include, e.g., an - * .exe file embedded in a .zip file. - *

- * A VERSION is an earlier version of the file as in incremental updates. - * The initial use case for this is incremental updates in PDFs, but - * it could be applied to other file formats as well where earlier versions - * are recoverable. See also {@link PDF#INCREMENTAL_UPDATE_NUMBER} - *

- * Not all parsers have yet implemented this. + * A file might contain different types of embedded documents. The most common is the + * ATTACHMENT. + * + *

An INLINE embedded resource should be used for embedded image files that are used to + * render the page image (as in PDXObjImages in PDF files). + * + *

A MACRO is code that is embedded in the document and is intended to be executable within + * the application that opens the document. This includes traditional macros within Microsoft + * Office files and javascript within PDFActions. This would not include, e.g., an .exe file + * embedded in a .zip file. + * + *

A VERSION is an earlier version of the file as in incremental updates. The initial use + * case for this is incremental updates in PDFs, but it could be applied to other file formats + * as well where earlier versions are recoverable. See also {@link + * PDF#INCREMENTAL_UPDATE_NUMBER} + * + *

Not all parsers have yet implemented this. */ enum EmbeddedResourceType { - INLINE, //image that is intended to be displayed in a rendering of the file - ATTACHMENT,//standard attachment as in email - MACRO, //any code that is intended to be run by the application - METADATA, //e.g. xmp, xfa - FONT,//embedded font files - THUMBNAIL, //TODO: set this in parsers that handle thumbnails - RENDERING, //if a file has been rendered - VERSION, //an earlier version of a file - ALTERNATE_FORMAT_CHUNK //OOXML inline alternate format chunk + INLINE, // image that is intended to be displayed in a rendering of the file + ATTACHMENT, // standard attachment as in email + MACRO, // any code that is intended to be run by the application + METADATA, // e.g. xmp, xfa + FONT, // embedded font files + THUMBNAIL, // TODO: set this in parsers that handle thumbnails + RENDERING, // if a file has been rendered + VERSION, // an earlier version of a file + ALTERNATE_FORMAT_CHUNK // OOXML inline alternate format chunk } } diff --git a/tika-core/src/main/java/org/apache/tika/metadata/TikaMimeKeys.java b/tika-core/src/main/java/org/apache/tika/metadata/TikaMimeKeys.java index 7ae685e05e..775a20d871 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/TikaMimeKeys.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/TikaMimeKeys.java @@ -16,13 +16,10 @@ */ package org.apache.tika.metadata; -/** - * A collection of Tika metadata keys used in Mime Type resolution - */ +/** A collection of Tika metadata keys used in Mime Type resolution */ public interface TikaMimeKeys { String TIKA_MIME_FILE = "tika.mime.file"; String MIME_TYPE_MAGIC = "mime.type.magic"; - } diff --git a/tika-core/src/main/java/org/apache/tika/metadata/TikaPagedText.java b/tika-core/src/main/java/org/apache/tika/metadata/TikaPagedText.java index e4bf1454e2..3587131865 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/TikaPagedText.java +++ 
b/tika-core/src/main/java/org/apache/tika/metadata/TikaPagedText.java @@ -17,19 +17,16 @@ package org.apache.tika.metadata; /** - * Metadata properties for paged text, metadata appropriate - * for an individual page (useful for embedded document handlers - * called on individual pages). + * Metadata properties for paged text, metadata appropriate for an individual page (useful for + * embedded document handlers called on individual pages). * - * Use {@link PagedText} where possible + *

Use {@link PagedText} where possible */ public interface TikaPagedText { String TIKA_PAGED_TEXT_PREFIX = "tika_pg:"; - /** - * 1-based page number for a specific page - */ + + /** 1-based page number for a specific page */ Property PAGE_NUMBER = Property.internalInteger(TIKA_PAGED_TEXT_PREFIX + "page_number"); Property PAGE_ROTATION = Property.internalRational(TIKA_PAGED_TEXT_PREFIX + "page_rotation"); - } diff --git a/tika-core/src/main/java/org/apache/tika/metadata/WARC.java b/tika-core/src/main/java/org/apache/tika/metadata/WARC.java index 359236bdd0..9c017574d4 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/WARC.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/WARC.java @@ -27,5 +27,5 @@ public interface WARC { Property WARC_RECORD_ID = Property.externalText(PREFIX + "WARC-Record-ID"); - //TODO: lots + // TODO: lots } diff --git a/tika-core/src/main/java/org/apache/tika/metadata/WordPerfect.java b/tika-core/src/main/java/org/apache/tika/metadata/WordPerfect.java index 4fd37f07cd..750cc0535a 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/WordPerfect.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/WordPerfect.java @@ -25,46 +25,52 @@ public interface WordPerfect { String WORDPERFECT_METADATA_NAME_PREFIX = "wordperfect"; - /** - * File size as defined in document header. - */ - Property FILE_SIZE = Property.internalText( - WORDPERFECT_METADATA_NAME_PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + - "FileSize"); - /** - * File identifier. - */ - Property FILE_ID = Property.internalText( - WORDPERFECT_METADATA_NAME_PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + - "FileId"); - /** - * Product type. - */ - Property PRODUCT_TYPE = Property.internalInteger( - WORDPERFECT_METADATA_NAME_PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + - "ProductType"); - /** - * File type. 
- */ - Property FILE_TYPE = Property.internalInteger( - WORDPERFECT_METADATA_NAME_PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + - "FileType"); - /** - * Major version. - */ - Property MAJOR_VERSION = Property.internalInteger( - WORDPERFECT_METADATA_NAME_PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + - "MajorVersion"); - /** - * Minor version. - */ - Property MINOR_VERSION = Property.internalInteger( - WORDPERFECT_METADATA_NAME_PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + - "MinorVersion"); - /** - * Is encrypted?. - */ - Property ENCRYPTED = Property.internalBoolean( - WORDPERFECT_METADATA_NAME_PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + - "Encrypted"); + /** File size as defined in document header. */ + Property FILE_SIZE = + Property.internalText( + WORDPERFECT_METADATA_NAME_PREFIX + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "FileSize"); + + /** File identifier. */ + Property FILE_ID = + Property.internalText( + WORDPERFECT_METADATA_NAME_PREFIX + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "FileId"); + + /** Product type. */ + Property PRODUCT_TYPE = + Property.internalInteger( + WORDPERFECT_METADATA_NAME_PREFIX + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "ProductType"); + + /** File type. */ + Property FILE_TYPE = + Property.internalInteger( + WORDPERFECT_METADATA_NAME_PREFIX + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "FileType"); + + /** Major version. */ + Property MAJOR_VERSION = + Property.internalInteger( + WORDPERFECT_METADATA_NAME_PREFIX + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "MajorVersion"); + + /** Minor version. */ + Property MINOR_VERSION = + Property.internalInteger( + WORDPERFECT_METADATA_NAME_PREFIX + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "MinorVersion"); + + /** Is encrypted?. 
*/ + Property ENCRYPTED = + Property.internalBoolean( + WORDPERFECT_METADATA_NAME_PREFIX + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "Encrypted"); } diff --git a/tika-core/src/main/java/org/apache/tika/metadata/XMP.java b/tika-core/src/main/java/org/apache/tika/metadata/XMP.java index bca38d40bb..ba3dcb6169 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/XMP.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/XMP.java @@ -22,39 +22,30 @@ public interface XMP { String PREFIX = "xmp"; - /** - * The xmp prefix followed by the colon delimiter - */ + /** The xmp prefix followed by the colon delimiter */ String PREFIX_ = PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER; - /** - * Unordered text strings of advisories. - */ + /** Unordered text strings of advisories. */ Property ABOUT = Property.externalTextBag(PREFIX_ + "About"); - /** - * Unordered text strings of advisories. - */ + /** Unordered text strings of advisories. */ Property ADVISORY = Property.externalTextBag(PREFIX_ + "Advisory"); /** - * The date and time the resource was created. For a digital file, this need not - * match a file-system creation time. For a freshly created resource, it should - * be close to that time, modulo the time taken to write the file. Later file - * transfer, copying, and so on, can make the file-system time arbitrarily different. + * The date and time the resource was created. For a digital file, this need not match a + * file-system creation time. For a freshly created resource, it should be close to that time, + * modulo the time taken to write the file. Later file transfer, copying, and so on, can make + * the file-system time arbitrarily different. */ Property CREATE_DATE = Property.externalDate(PREFIX_ + "CreateDate"); - /** - * The name of the first known tool used to create the resource. - */ + /** The name of the first known tool used to create the resource. 
*/ Property CREATOR_TOOL = Property.externalText(PREFIX_ + "CreatorTool"); /** - * An unordered array of text strings that unambiguously identify the resource - * within a given context. An array item may be qualified with xmpidq:Scheme - * (see 8.7, “xmpidq namespace”) to denote the formal identification system to - * which that identifier conforms. + * An unordered array of text strings that unambiguously identify the resource within a given + * context. An array item may be qualified with xmpidq:Scheme (see 8.7, “xmpidq namespace”) to + * denote the formal identification system to which that identifier conforms. */ Property IDENTIFIER = Property.externalTextBag(PREFIX_ + "Identifier"); @@ -64,26 +55,21 @@ public interface XMP { Property LABEL = Property.externalText(PREFIX_ + "Label"); /** - * The date and time that any metadata for this resource was last changed. It - * should be the same as or more recent than xmp:ModifyDate + * The date and time that any metadata for this resource was last changed. It should be the same + * as or more recent than xmp:ModifyDate */ Property METADATA_DATE = Property.externalDate(PREFIX_ + "MetadataDate"); - /** - * The date and time the resource was last modified. - */ + /** The date and time the resource was last modified. */ Property MODIFY_DATE = Property.externalDate(PREFIX_ + "ModifyDate"); - /** - * A word or short phrase that represents the nick name fo the file - */ + /** A word or short phrase that represents the nick name fo the file */ Property NICKNAME = Property.externalText(PREFIX_ + "NickName"); /** - * A user-assigned rating for this file. The value shall be -1 or in the range - * [0..5], where -1 indicates “rejected” and 0 indicates “unrated”. If xmp:Rating - * is not present, a value of 0 should be assumed. + * A user-assigned rating for this file. The value shall be -1 or in the range [0..5], where -1 + * indicates “rejected” and 0 indicates “unrated”. 
If xmp:Rating is not present, a value of 0 + * should be assumed. */ Property RATING = Property.externalInteger(PREFIX_ + "Rating"); - } diff --git a/tika-core/src/main/java/org/apache/tika/metadata/XMPDM.java b/tika-core/src/main/java/org/apache/tika/metadata/XMPDM.java index d7faa4483f..e4b43f2819 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/XMPDM.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/XMPDM.java @@ -19,321 +19,279 @@ import java.util.Date; /** - * XMP Dynamic Media schema. This is a collection of - * {@link Property property definition} constants for the dynamic media - * properties defined in the XMP standard. + * XMP Dynamic Media schema. This is a collection of {@link Property property definition} constants + * for the dynamic media properties defined in the XMP standard. * - * @see XMP Specification, Part 2: Standard Schemas + * @see XMP Specification, Part 2: Standard Schemas * @since Apache Tika 0.7 */ public interface XMPDM { - /** - * "The absolute path to the file's peak audio file. If empty, no peak - * file exists." - */ + /** "The absolute path to the file's peak audio file. If empty, no peak file exists." */ Property ABS_PEAK_AUDIO_FILE_PATH = Property.internalURI("xmpDM:absPeakAudioFilePath"); - /** - * "The name of the album." - */ + /** "The name of the album." */ Property ALBUM = Property.externalText("xmpDM:album"); /** - * "An alternative tape name, set via the project window or timecode - * dialog in Premiere. If an alternative name has been set and has not - * been reverted, that name is displayed." + * "An alternative tape name, set via the project window or timecode dialog in Premiere. If an + * alternative name has been set and has not been reverted, that name is displayed." */ Property ALT_TAPE_NAME = Property.externalText("xmpDM:altTapeName"); -// /** -// * "A timecode set by the user. When specified, it is used instead -// * of the startTimecode." 
-// */ -// Property ALT_TIMECODE = "xmpDM:altTimecode"; + // /** + // * "A timecode set by the user. When specified, it is used instead + // * of the startTimecode." + // */ + // Property ALT_TIMECODE = "xmpDM:altTimecode"; - /** - * "The name of the artist or artists." - */ + /** "The name of the artist or artists." */ Property ARTIST = Property.externalText("xmpDM:artist"); - /** - * "The name of the album artist or group for compilation albums." - */ + /** "The name of the album artist or group for compilation albums." */ Property ALBUM_ARTIST = Property.externalText("xmpDM:albumArtist"); - /** - * "The date and time when the audio was last modified." - */ + /** "The date and time when the audio was last modified." */ Property AUDIO_MOD_DATE = Property.internalDate("xmpDM:audioModDate"); - /** - * "The audio sample rate. Can be any value, but commonly 32000, 41100, - * or 48000." - */ + /** "The audio sample rate. Can be any value, but commonly 32000, 41100, or 48000." */ Property AUDIO_SAMPLE_RATE = Property.internalInteger("xmpDM:audioSampleRate"); - /** - * "The audio sample type." - */ + /** "The audio sample type." */ Property AUDIO_SAMPLE_TYPE = - Property.internalClosedChoise("xmpDM:audioSampleType", "8Int", "16Int", "32Int", - "32Float"); + Property.internalClosedChoise( + "xmpDM:audioSampleType", "8Int", "16Int", "32Int", "32Float"); - /** - * "The audio channel type." - */ + /** "The audio channel type." */ Property AUDIO_CHANNEL_TYPE = Property.internalClosedChoise("xmpDM:audioChannelType", "Mono", "Stereo", "5.1", "7.1"); - /** - * "The audio compression used. For example, MP3." - */ + + /** "The audio compression used. For example, MP3." */ Property AUDIO_COMPRESSOR = Property.internalText("xmpDM:audioCompressor"); - /** - * "An album created by various artists." - */ + + /** "An album created by various artists." 
*/ Property COMPILATION = Property.externalInteger("xmpDM:compilation"); -// /** -// * "Additional parameters for Beat Splice stretch mode." -// */ -// Property BEAT_SPLICE_PARAMS = "xmpDM:beatSpliceParams"; - /** - * "The composer's name." - */ + // /** + // * "Additional parameters for Beat Splice stretch mode." + // */ + // Property BEAT_SPLICE_PARAMS = "xmpDM:beatSpliceParams"; + /** "The composer's name." */ Property COMPOSER = Property.externalText("xmpDM:composer"); - /** - * "The copyright information." - */ + + /** "The copyright information." */ Property COPYRIGHT = Property.externalText("xmpDM:copyright"); -// /** -// * "An unordered list of all media used to create this media." -// */ -// Property CONTRIBUTED_MEDIA = "xmpDM:contributedMedia"; - /** - * "The disc number for part of an album set." - */ + // /** + // * "An unordered list of all media used to create this media." + // */ + // Property CONTRIBUTED_MEDIA = "xmpDM:contributedMedia"; + /** "The disc number for part of an album set." */ Property DISC_NUMBER = Property.externalInteger("xmpDM:discNumber"); - /** - * "The duration of the media file." - * Value is in Seconds, unless xmpDM:scale is also set. - */ + + /** "The duration of the media file." Value is in Seconds, unless xmpDM:scale is also set. */ Property DURATION = Property.externalReal("xmpDM:duration"); - /** - * "The engineer's name." - */ + + /** "The engineer's name." */ Property ENGINEER = Property.externalText("xmpDM:engineer"); - /** - * "The file data rate in megabytes per second. For example: - * '36/10' = 3.6 MB/sec" - */ + + /** "The file data rate in megabytes per second. For example: '36/10' = 3.6 MB/sec" */ Property FILE_DATA_RATE = Property.internalRational("xmpDM:fileDataRate"); - /** - * "The name of the genre." - */ + + /** "The name of the genre." */ Property GENRE = Property.externalText("xmpDM:genre"); - /** - * "The musical instrument." - */ + + /** "The musical instrument." 
*/ Property INSTRUMENT = Property.externalText("xmpDM:instrument"); - /** - * "The audio's musical key." - */ + + /** "The audio's musical key." */ Property KEY = - Property.internalClosedChoise("xmpDM:key", "C", "C#", "D", "D#", "E", "F", "F#", "G", - "G#", "A", "A#", "B"); + Property.internalClosedChoise( + "xmpDM:key", "C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"); -// /** -// * "The duration of lead time for queuing music." -// */ -// Property INTRO_TIME = "xmpDM:introTime"; - /** - * "User's log comments." - */ + // /** + // * "The duration of lead time for queuing music." + // */ + // Property INTRO_TIME = "xmpDM:introTime"; + /** "User's log comments." */ Property LOG_COMMENT = Property.externalText("xmpDM:logComment"); - /** - * "When true, the clip can be looped seamlessly." - */ + + /** "When true, the clip can be looped seamlessly." */ Property LOOP = Property.internalBoolean("xmpDM:loop"); - /** - * "The number of beats." - */ + + /** "The number of beats." */ Property NUMBER_OF_BEATS = Property.internalReal("xmpDM:numberOfBeats"); - /** - * "The date and time when the metadata was last modified." - */ + + /** "The date and time when the metadata was last modified." */ Property METADATA_MOD_DATE = Property.internalDate("xmpDM:metadataModDate"); -// /** -// * An ordered list of markers. See also {@link #TRACKS xmpDM:Tracks}. -// */ -// Property MARKERS = "xmpDM:markers"; - /** - * "The sampling phase of film to be converted to video (pull-down)." - */ + // /** + // * An ordered list of markers. See also {@link #TRACKS xmpDM:Tracks}. + // */ + // Property MARKERS = "xmpDM:markers"; + /** "The sampling phase of film to be converted to video (pull-down)." */ Property PULL_DOWN = - Property.internalClosedChoise("xmpDM:pullDown", "WSSWW", "SSWWW", "SWWWS", "WWWSS", - "WWSSW", "WSSWW_24p", "SSWWW_24p", "SWWWS_24p", "WWWSS_24p", "WWSSW_24p"); - -// /** -// * "The time at which to fade out." 
-// */ -// Property OUT_CUE = "xmpDM:outCue"; - -// /** -// * "A reference to the project that created this file." -// */ -// Property PROJECT_REF = "xmpDM:projectRef"; - /** - * "The relative path to the file's peak audio file. If empty, no peak - * file exists." - */ + Property.internalClosedChoise( + "xmpDM:pullDown", + "WSSWW", + "SSWWW", + "SWWWS", + "WWWSS", + "WWSSW", + "WSSWW_24p", + "SSWWW_24p", + "SWWWS_24p", + "WWWSS_24p", + "WWSSW_24p"); + + // /** + // * "The time at which to fade out." + // */ + // Property OUT_CUE = "xmpDM:outCue"; + + // /** + // * "A reference to the project that created this file." + // */ + // Property PROJECT_REF = "xmpDM:projectRef"; + /** "The relative path to the file's peak audio file. If empty, no peak file exists." */ Property RELATIVE_PEAK_AUDIO_FILE_PATH = Property.internalURI("xmpDM:relativePeakAudioFilePath"); - /** - * "The date the title was released." - */ + + /** "The date the title was released." */ Property RELEASE_DATE = Property.externalDate("xmpDM:releaseDate"); -// /** -// * "The start time of the media inside the audio project." -// */ -// Property RELATIVE_TIMESTAMP = "xmpDM:relativeTimestamp"; + // /** + // * "The start time of the media inside the audio project." + // */ + // Property RELATIVE_TIMESTAMP = "xmpDM:relativeTimestamp"; /** - * "The musical scale used in the music. 'Neither' is most often used - * for instruments with no associated scale, such as drums." + * "The musical scale used in the music. 'Neither' is most often used for instruments with no + * associated scale, such as drums." */ Property SCALE_TYPE = Property.internalClosedChoise("xmpDM:scaleType", "Major", "Minor", "Both", "Neither"); -// /** -// * "Additional parameters for Resample stretch mode." -// */ -// Property RESAMPLE_PARAMS = "xmpDM:resampleParams"; - /** - * "The name of the scene." - */ + // /** + // * "Additional parameters for Resample stretch mode." 
+ // */ + // Property RESAMPLE_PARAMS = "xmpDM:resampleParams"; + /** "The name of the scene." */ Property SCENE = Property.externalText("xmpDM:scene"); - /** - * "The date and time when the video was shot." - */ + + /** "The date and time when the video was shot." */ Property SHOT_DATE = Property.externalDate("xmpDM:shotDate"); + /** - * "The name of the location where the video was shot. For example: - * 'Oktoberfest, Munich, Germany'. For more accurate positioning, - * use the EXIF GPS values." + * "The name of the location where the video was shot. For example: 'Oktoberfest, Munich, + * Germany'. For more accurate positioning, use the EXIF GPS values." */ Property SHOT_LOCATION = Property.externalText("xmpDM:shotLocation"); - /** - * "The name of the shot or take." - */ + + /** "The name of the shot or take." */ Property SHOT_NAME = Property.externalText("xmpDM:shotName"); + /** - * "A description of the speaker angles from center front in degrees. - * For example: 'Left = -30, Right = 30, Center = 0, LFE = 45, - * Left Surround = -110, Right Surround = 110'" + * "A description of the speaker angles from center front in degrees. For example: 'Left = -30, + * Right = 30, Center = 0, LFE = 45, Left Surround = -110, Right Surround = 110'" */ Property SPEAKER_PLACEMENT = Property.externalText("xmpDM:speakerPlacement"); - /** - * "The audio stretch mode." - */ + + /** "The audio stretch mode." */ Property STRETCH_MODE = - Property.internalClosedChoise("xmpDM:stretchMode", "Fixed length", "Time-Scale", - "Resample", "Beat Splice", "Hybrid"); - -// /** -// * "The timecode of the first frame of video in the file, as obtained -// * from the device control." -// */ -// Property START_TIMECODE = "xmpDM:startTimecode"; + Property.internalClosedChoise( + "xmpDM:stretchMode", + "Fixed length", + "Time-Scale", + "Resample", + "Beat Splice", + "Hybrid"); + + // /** + // * "The timecode of the first frame of video in the file, as obtained + // * from the device control." 
+ // */ + // Property START_TIMECODE = "xmpDM:startTimecode"; /** - * "The name of the tape from which the clip was captured, as set during - * the capture process." + * "The name of the tape from which the clip was captured, as set during the capture process." */ Property TAPE_NAME = Property.externalText("xmpDM:tapeName"); - /** - * "The audio's tempo." - */ + + /** "The audio's tempo." */ Property TEMPO = Property.internalReal("xmpDM:tempo"); - /** - * "The time signature of the music." - */ + + /** "The time signature of the music." */ Property TIME_SIGNATURE = - Property.internalClosedChoise("xmpDM:timeSignature", "2/4", "3/4", "4/4", "5/4", "7/4", - "6/8", "9/8", "12/8", "other"); + Property.internalClosedChoise( + "xmpDM:timeSignature", + "2/4", + "3/4", + "4/4", + "5/4", + "7/4", + "6/8", + "9/8", + "12/8", + "other"); -// /** -// * "Additional parameters for Time-Scale stretch mode." -// */ -// Property TIME_SCALE_PARAMS = "xmpDM:timeScaleParams"; - /** - * "A numeric value indicating the order of the audio file within its - * original recording." - */ + // /** + // * "Additional parameters for Time-Scale stretch mode." + // */ + // Property TIME_SCALE_PARAMS = "xmpDM:timeScaleParams"; + /** "A numeric value indicating the order of the audio file within its original recording." */ Property TRACK_NUMBER = Property.externalInteger("xmpDM:trackNumber"); - /** - * "The alpha mode." - */ + + /** "The alpha mode." */ Property VIDEO_ALPHA_MODE = Property.externalClosedChoise("xmpDM:videoAlphaMode", "straight", "pre-multiplied"); -// /** -// * "An unordered list of tracks. A track is a named set of markers, -// * which can specify a frame rate for all markers in the set. -// * See also {@link #MARKERS xmpDM:markers}." -// */ -// Property TRACKS = "xmpDM:Tracks"; - /** - * "When true, unity is clear, when false, it is opaque." - */ + // /** + // * "An unordered list of tracks. 
A track is a named set of markers, + // * which can specify a frame rate for all markers in the set. + // * See also {@link #MARKERS xmpDM:markers}." + // */ + // Property TRACKS = "xmpDM:Tracks"; + /** "When true, unity is clear, when false, it is opaque." */ Property VIDEO_ALPHA_UNITY_IS_TRANSPARENT = Property.internalBoolean("xmpDM:videoAlphaUnityIsTransparent"); -// /** -// * "A color in CMYK or RGB to be used as the pre-multiple color when -// * alpha mode is pre-multiplied." -// */ -// Property VIDEO_ALPHA_PREMULTIPLE_COLOR = "xmpDM:videoAlphaPremultipleColor"; - /** - * "The color space." - */ + // /** + // * "A color in CMYK or RGB to be used as the pre-multiple color when + // * alpha mode is pre-multiplied." + // */ + // Property VIDEO_ALPHA_PREMULTIPLE_COLOR = "xmpDM:videoAlphaPremultipleColor"; + /** "The color space." */ Property VIDEO_COLOR_SPACE = Property.internalClosedChoise("xmpDM:videoColorSpace", "sRGB", "CCIR-601", "CCIR-709"); - /** - * "Video compression used. For example, jpeg." - */ + + /** "Video compression used. For example, jpeg." */ Property VIDEO_COMPRESSOR = Property.internalText("xmpDM:videoCompressor"); - /** - * "The field order for video." - */ + + /** "The field order for video." */ Property VIDEO_FIELD_ORDER = Property.internalClosedChoise("xmpDM:videoFieldOrder", "Upper", "Lower", "Progressive"); - /** - * "The video frame rate." - */ + + /** "The video frame rate." */ Property VIDEO_FRAME_RATE = Property.internalOpenChoise("xmpDM:videoFrameRate", "24", "NTSC", "PAL"); - /** - * "The date and time when the video was last modified." - */ + + /** "The date and time when the video was last modified." */ Property VIDEO_MOD_DATE = Property.internalDate("xmpDM:videoModDate"); -// /** -// * "The frame size. For example: w:720, h: 480, unit:pixels" -// */ -// Property VIDEO_FRAME_SIZE = "xmpDM:videoFrameSize"; + // /** + // * "The frame size. 
For example: w:720, h: 480, unit:pixels" + // */ + // Property VIDEO_FRAME_SIZE = "xmpDM:videoFrameSize"; /** - * "The size in bits of each color component of a pixel. Standard - * Windows 32-bit pixels have 8 bits per component." + * "The size in bits of each color component of a pixel. Standard Windows 32-bit pixels have 8 + * bits per component." */ Property VIDEO_PIXEL_DEPTH = - Property.internalClosedChoise("xmpDM:videoPixelDepth", "8Int", "16Int", "32Int", - "32Float"); - /** - * "The aspect ratio, expressed as wd/ht. For example: '648/720' = 0.9" - */ + Property.internalClosedChoise( + "xmpDM:videoPixelDepth", "8Int", "16Int", "32Int", "32Float"); + + /** "The aspect ratio, expressed as wd/ht. For example: '648/720' = 0.9" */ Property VIDEO_PIXEL_ASPECT_RATIO = Property.internalRational("xmpDM:videoPixelAspectRatio"); /** @@ -345,9 +303,7 @@ public interface XMPDM { class ChannelTypePropertyConverter { private static final Property property = AUDIO_CHANNEL_TYPE; - /** - * How a standalone converter might work - */ + /** How a standalone converter might work */ public static String convert(Object value) { if (value instanceof String) { // Assume already done @@ -368,9 +324,7 @@ public static String convert(Object value) { return null; } - /** - * How convert+set might work - */ + /** How convert+set might work */ public static void convertAndSet(Metadata metadata, Object value) { if (value instanceof Integer || value instanceof Long) { metadata.set(property, convert(value)); @@ -386,5 +340,4 @@ public static void convertAndSet(Metadata metadata, Object value) { } } } - } diff --git a/tika-core/src/main/java/org/apache/tika/metadata/XMPIdq.java b/tika-core/src/main/java/org/apache/tika/metadata/XMPIdq.java index 015b0657d8..38131f2cd9 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/XMPIdq.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/XMPIdq.java @@ -22,15 +22,12 @@ public interface XMPIdq { String PREFIX = "xmpidq"; - /** - * The 
xmpidq prefix followed by the colon delimiter - */ + /** The xmpidq prefix followed by the colon delimiter */ String PREFIX_ = PREFIX + ":"; /** - * A qualifier providing the name of the formal identification - * scheme used for an item in the xmp:Identifier array. + * A qualifier providing the name of the formal identification scheme used for an item in the + * xmp:Identifier array. */ Property SCHEME = Property.externalText(PREFIX_ + "Scheme"); - } diff --git a/tika-core/src/main/java/org/apache/tika/metadata/XMPMM.java b/tika-core/src/main/java/org/apache/tika/metadata/XMPMM.java index 2a81fa254f..6901c1be1c 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/XMPMM.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/XMPMM.java @@ -22,86 +22,68 @@ public interface XMPMM { String PREFIX = "xmpMM"; - /** - * The xmpMM prefix followed by the colon delimiter - */ + /** The xmpMM prefix followed by the colon delimiter */ String PREFIX_ = PREFIX + ":"; /** - * A reference to the resource from which this one is derived. - * This should be a minimal reference, in which missing - * components can be assumed to be unchanged. + * A reference to the resource from which this one is derived. This should be a minimal + * reference, in which missing components can be assumed to be unchanged. * - * TODO This property is of type RessourceRef which is a struct + *

TODO This property is of type RessourceRef which is a struct */ -// Property DERIVED_FROM = Property.externalText(PREFIX_ + "DerivedFrom"); + // Property DERIVED_FROM = Property.externalText(PREFIX_ + "DerivedFrom"); - /** - * The common identifier for all versions and renditions of a resource. - */ + /** The common identifier for all versions and renditions of a resource. */ Property DOCUMENTID = Property.externalText(PREFIX_ + "DocumentID"); /** - * An identifier for a specific incarnation of a resource, updated - * each time a file is saved. + * An identifier for a specific incarnation of a resource, updated each time a file is saved. */ Property INSTANCEID = Property.externalText(PREFIX_ + "InstanceID"); /** - * The common identifier for the original resource from which - * the current resource is derived. For example, if you save a - * resource to a different format, then save that one to another - * format, each save operation should generate a new - * xmpMM:DocumentID that uniquely identifies the resource in - * that format, but should retain the ID of the source file here. + * The common identifier for the original resource from which the current resource is derived. + * For example, if you save a resource to a different format, then save that one to another + * format, each save operation should generate a new xmpMM:DocumentID that uniquely identifies + * the resource in that format, but should retain the ID of the source file here. */ Property ORIGINAL_DOCUMENTID = Property.externalText(PREFIX_ + "OriginalDocumentID"); /** - * The rendition class name for this resource. This property - * should be absent or set to default for a resource that is not - * a derived rendition + * The rendition class name for this resource. 
This property should be absent or set to default + * for a resource that is not a derived rendition */ Property RENDITION_CLASS = - Property.externalOpenChoise(PREFIX_ + "RenditionClass", "default", "draft", "low-res", - "proof", "screen", "thumbnail"); + Property.externalOpenChoise( + PREFIX_ + "RenditionClass", + "default", + "draft", + "low-res", + "proof", + "screen", + "thumbnail"); /** - * Can be used to provide additional rendition parameters that - * are too complex or verbose to encode in xmpMM:RenditionClass + * Can be used to provide additional rendition parameters that are too complex or verbose to + * encode in xmpMM:RenditionClass */ Property RENDITION_PARAMS = Property.externalText(PREFIX_ + "RenditionParams"); - /** - * Instance id in the XMPMM's history section - */ + /** Instance id in the XMPMM's history section */ Property HISTORY_EVENT_INSTANCEID = Property.externalTextBag(PREFIX_ + "History:InstanceID"); - /** - * Action in the XMPMM's history section - */ + /** Action in the XMPMM's history section */ Property HISTORY_ACTION = Property.externalTextBag(PREFIX_ + "History:Action"); - /** - * When the action occurred in the XMPMM's history section - */ + + /** When the action occurred in the XMPMM's history section */ Property HISTORY_WHEN = Property.externalTextBag(PREFIX_ + "History:When"); - /** - * Software agent that created the action in the XMPMM's - * history section - */ + /** Software agent that created the action in the XMPMM's history section */ Property HISTORY_SOFTWARE_AGENT = Property.externalTextBag(PREFIX_ + "History:SoftwareAgent"); - /** - * Document id for the document that this document - * was derived from - */ + /** Document id for the document that this document was derived from */ Property DERIVED_FROM_DOCUMENTID = Property.externalText(PREFIX_ + "DerivedFrom:DocumentID"); - /** - * Instance id for the document instance that this - * document was derived from - */ + /** Instance id for the document instance that this 
document was derived from */ Property DERIVED_FROM_INSTANCEID = Property.externalText(PREFIX_ + "DerivedFrom:InstanceID"); - } diff --git a/tika-core/src/main/java/org/apache/tika/metadata/XMPRights.java b/tika-core/src/main/java/org/apache/tika/metadata/XMPRights.java index 6254dbf266..5737c3b605 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/XMPRights.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/XMPRights.java @@ -22,12 +22,12 @@ /** * XMP Rights management schema. - *

- * A collection of property constants for the - * rights management properties defined in the XMP + * + *

A collection of property constants for the rights management properties defined in the XMP * standard. * - * @see XMP Photoshop + * @see XMP + * Photoshop * @since Apache Tika 1.2 */ public interface XMPRights { @@ -35,26 +35,19 @@ public interface XMPRights { String NAMESPACE_URI_XMP_RIGHTS = "http://ns.adobe.com/xap/1.0/rights/"; String PREFIX_XMP_RIGHTS = "xmpRights"; - /** - * The xmpRights prefix followed by the colon delimiter - */ + /** The xmpRights prefix followed by the colon delimiter */ String PREFIX_ = PREFIX_XMP_RIGHTS + ":"; - /** - * A Web URL for a rights management certificate. - */ + /** A Web URL for a rights management certificate. */ Property CERTIFICATE = Property.internalText(PREFIX_ + "Certificate"); /** - * When true, indicates that this is a rights-managed resource. When - * false, indicates that this is a public-domain resource. Omit if the - * state is unknown. + * When true, indicates that this is a rights-managed resource. When false, indicates that this + * is a public-domain resource. Omit if the state is unknown. */ Property MARKED = Property.internalBoolean(PREFIX_ + "Marked"); - /** - * A list of legal owners of the resource. - */ + /** A list of legal owners of the resource. */ Property OWNER = Property.internalTextBag(PREFIX_ + "Owner"); /** @@ -63,9 +56,6 @@ public interface XMPRights { */ Property USAGE_TERMS = Property.internalText(PREFIX_ + "UsageTerms"); - /** - * A Web URL for a statement of the ownership and usage rights for this resource. - */ + /** A Web URL for a statement of the ownership and usage rights for this resource. 
*/ Property WEB_STATEMENT = Property.internalText(PREFIX_ + "WebStatement"); - } diff --git a/tika-core/src/main/java/org/apache/tika/metadata/filter/CaptureGroupMetadataFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/filter/CaptureGroupMetadataFilter.java index ca9b1e6ea5..1159b53dcb 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/filter/CaptureGroupMetadataFilter.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/filter/CaptureGroupMetadataFilter.java @@ -20,7 +20,6 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; - import org.apache.tika.config.Field; import org.apache.tika.config.Initializable; import org.apache.tika.config.InitializableProblemHandler; @@ -30,28 +29,25 @@ import org.apache.tika.metadata.Metadata; import org.apache.tika.utils.StringUtils; - /** - * This filter runs a regex against the first value in the "sourceField". - * If the pattern matches, it extracts the first group of the first match and - * set's the "targetField"'s value to that first group. - *

- * If there is a match, this will overwrite whatever value is in the - * "targetField". - *

- * If there is not a match, this filter will be a no-op. - *

- * If there are multiple matches, this filter will capture only the first. - * Open a ticket if you need different behavior. - *

- * If the source field has multiple values, this will run the regex - * against only the first value. - *

- * If the source field does not exist, this filter will be a no-op. - *

- * If the target field is the same value as the source field, this filter - * will overwrite the value in that field. Again, if there are multiple - * values in that field, those will all be overwritten. + * This filter runs a regex against the first value in the "sourceField". If the pattern matches, it + * extracts the first group of the first match and set's the "targetField"'s value to that first + * group. + * + *

If there is a match, this will overwrite whatever value is in the "targetField". + * + *

If there is not a match, this filter will be a no-op. + * + *

If there are multiple matches, this filter will capture only the first. Open a ticket if you + * need different behavior. + * + *

If the source field has multiple values, this will run the regex against only the first value. + * + *

If the source field does not exist, this filter will be a no-op. + * + *

If the target field is the same value as the source field, this filter will overwrite the + * value in that field. Again, if there are multiple values in that field, those will all be + * overwritten. */ public class CaptureGroupMetadataFilter extends MetadataFilter implements Initializable { @@ -94,7 +90,6 @@ public void initialize(Map params) throws TikaConfigException { } catch (PatternSyntaxException e) { throw new TikaConfigException("Couldn't parse regex", e); } - } @Override diff --git a/tika-core/src/main/java/org/apache/tika/metadata/filter/ClearByMimeMetadataFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/filter/ClearByMimeMetadataFilter.java index f196436ca6..e94fdefb36 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/filter/ClearByMimeMetadataFilter.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/filter/ClearByMimeMetadataFilter.java @@ -19,16 +19,14 @@ import java.util.HashSet; import java.util.List; import java.util.Set; - import org.apache.tika.config.Field; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; /** - * This class clears the entire metadata object if the - * mime matches the mime filter. The idea is that you might not want - * to store/transmit metadata for images or specific file types. + * This class clears the entire metadata object if the mime matches the mime filter. The idea is + * that you might not want to store/transmit metadata for images or specific file types. 
*/ public class ClearByMimeMetadataFilter extends MetadataFilter { private final Set mimes; diff --git a/tika-core/src/main/java/org/apache/tika/metadata/filter/CompositeMetadataFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/filter/CompositeMetadataFilter.java index 2c7d97661c..ed52548024 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/filter/CompositeMetadataFilter.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/filter/CompositeMetadataFilter.java @@ -17,7 +17,6 @@ package org.apache.tika.metadata.filter; import java.util.List; - import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; diff --git a/tika-core/src/main/java/org/apache/tika/metadata/filter/DateNormalizingMetadataFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/filter/DateNormalizingMetadataFilter.java index e093873844..08ca6c2157 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/filter/DateNormalizingMetadataFilter.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/filter/DateNormalizingMetadataFilter.java @@ -22,32 +22,29 @@ import java.util.Date; import java.util.Locale; import java.util.TimeZone; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - import org.apache.tika.config.Field; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.Property; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** - * Some dates in some file formats do not have a timezone. - * Tika correctly stores these without a timezone, e.g. 'yyyy-MM-dd'T'HH:mm:ss' - * This can be a problem if end points expect a 'Z' timezone. 
- * This filter makes the assumption that dates without timezones are UTC - * and always modifies the date to: "yyyy-MM-dd'T'HH:mm:ss'Z'" - * - * Users can specify an alternate defaultTimeZone with - * {@link DateNormalizingMetadataFilter#setDefaultTimeZone(String)} to apply - * if the file format does not specify a timezone. + * Some dates in some file formats do not have a timezone. Tika correctly stores these without a + * timezone, e.g. 'yyyy-MM-dd'T'HH:mm:ss' This can be a problem if end points expect a 'Z' timezone. + * This filter makes the assumption that dates without timezones are UTC and always modifies the + * date to: "yyyy-MM-dd'T'HH:mm:ss'Z'" * + *

Users can specify an alternate defaultTimeZone with {@link + * DateNormalizingMetadataFilter#setDefaultTimeZone(String)} to apply if the file format does not + * specify a timezone. */ public class DateNormalizingMetadataFilter extends MetadataFilter { private static TimeZone UTC = TimeZone.getTimeZone("UTC"); - private static final Logger LOGGER = LoggerFactory.getLogger(DateNormalizingMetadataFilter.class); + private static final Logger LOGGER = + LoggerFactory.getLogger(DateNormalizingMetadataFilter.class); private TimeZone defaultTimeZone = UTC; @@ -75,8 +72,8 @@ public void filter(Metadata metadata) throws TikaException { d = dateFormatter.parse(dateString); metadata.set(property, utcFormatter.format(d)); } catch (ParseException e) { - LOGGER.warn("Couldn't convert date to default time zone: >" - + dateString + "<"); + LOGGER.warn( + "Couldn't convert date to default time zone: >" + dateString + "<"); } } } diff --git a/tika-core/src/main/java/org/apache/tika/metadata/filter/DefaultMetadataFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/filter/DefaultMetadataFilter.java index 64a7d0ad61..c52c2af2f7 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/filter/DefaultMetadataFilter.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/filter/DefaultMetadataFilter.java @@ -17,7 +17,6 @@ package org.apache.tika.metadata.filter; import java.util.List; - import org.apache.tika.config.ServiceLoader; import org.apache.tika.utils.ServiceLoaderUtils; @@ -36,7 +35,8 @@ public DefaultMetadataFilter() { } private static List getDefaultFilters(ServiceLoader loader) { - List metadataFilters = loader.loadStaticServiceProviders(MetadataFilter.class); + List metadataFilters = + loader.loadStaticServiceProviders(MetadataFilter.class); ServiceLoaderUtils.sortLoadedClasses(metadataFilters); return metadataFilters; diff --git a/tika-core/src/main/java/org/apache/tika/metadata/filter/ExcludeFieldMetadataFilter.java 
b/tika-core/src/main/java/org/apache/tika/metadata/filter/ExcludeFieldMetadataFilter.java index 59d10d9898..3405558c75 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/filter/ExcludeFieldMetadataFilter.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/filter/ExcludeFieldMetadataFilter.java @@ -19,7 +19,6 @@ import java.util.HashSet; import java.util.List; import java.util.Set; - import org.apache.tika.config.Field; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; diff --git a/tika-core/src/main/java/org/apache/tika/metadata/filter/FieldNameMappingFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/filter/FieldNameMappingFilter.java index db16f5dff6..83fecfcde2 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/filter/FieldNameMappingFilter.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/filter/FieldNameMappingFilter.java @@ -18,7 +18,6 @@ import java.util.LinkedHashMap; import java.util.Map; - import org.apache.tika.config.Field; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; @@ -57,10 +56,9 @@ public void filter(Metadata metadata) throws TikaException { } /** - * If this is true (default), this means that only the fields that - * have a "from" value in the mapper will be passed through. Otherwise, - * this will pass through all keys/values and mutate the keys - * that exist in the mappings. + * If this is true (default), this means that only the fields that have a "from" + * value in the mapper will be passed through. Otherwise, this will pass through all keys/values + * and mutate the keys that exist in the mappings. 
* * @param excludeUnmapped */ diff --git a/tika-core/src/main/java/org/apache/tika/metadata/filter/GeoPointMetadataFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/filter/GeoPointMetadataFilter.java index 2b65cb15d8..1b5edd174c 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/filter/GeoPointMetadataFilter.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/filter/GeoPointMetadataFilter.java @@ -23,19 +23,19 @@ import org.apache.tika.utils.StringUtils; /** - * If {@link Metadata} contains a {@link TikaCoreProperties#LATITUDE} and - * a {@link TikaCoreProperties#LONGITUDE}, this filter concatenates those with a - * comma in the order LATITUDE,LONGITUDE. + * If {@link Metadata} contains a {@link TikaCoreProperties#LATITUDE} and a {@link + * TikaCoreProperties#LONGITUDE}, this filter concatenates those with a comma in the order + * LATITUDE,LONGITUDE. * - * If you need any other mappings, please open a ticket on our JIRA. + *

If you need any other mappings, please open a ticket on our JIRA. */ public class GeoPointMetadataFilter extends MetadataFilter { String geoPointFieldName = "location"; /** - * Set the field for the concatenated LATITUDE,LONGITUDE string. - * The default if &dquot;location&dquot; + * Set the field for the concatenated LATITUDE,LONGITUDE string. The default if + * &dquot;location&dquot; * * @param geoPointFieldName field name to use for the geopoint field */ diff --git a/tika-core/src/main/java/org/apache/tika/metadata/filter/IncludeFieldMetadataFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/filter/IncludeFieldMetadataFilter.java index b75de6a9c5..0543b43cee 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/filter/IncludeFieldMetadataFilter.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/filter/IncludeFieldMetadataFilter.java @@ -19,7 +19,6 @@ import java.util.HashSet; import java.util.List; import java.util.Set; - import org.apache.tika.config.Field; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; diff --git a/tika-core/src/main/java/org/apache/tika/metadata/filter/MetadataFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/filter/MetadataFilter.java index 21eb3eced1..1ae2491a0a 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/filter/MetadataFilter.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/filter/MetadataFilter.java @@ -19,13 +19,11 @@ import java.io.IOException; import java.io.Serializable; - -import org.w3c.dom.Element; - import org.apache.tika.config.ConfigBase; import org.apache.tika.exception.TikaConfigException; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; +import org.w3c.dom.Element; /** * Filters the metadata in place after the parse @@ -36,16 +34,21 @@ public abstract class MetadataFilter extends ConfigBase implements Serializable /** * Loads the metadata filter from the config file if it 
exists, otherwise returns NoOpFilter + * * @param root * @return * @throws TikaConfigException * @throws IOException */ - public static MetadataFilter load(Element root, boolean allowMissing) throws TikaConfigException, - IOException { + public static MetadataFilter load(Element root, boolean allowMissing) + throws TikaConfigException, IOException { try { - return buildComposite("metadataFilters", CompositeMetadataFilter.class, - "metadataFilter", MetadataFilter.class, root); + return buildComposite( + "metadataFilters", + CompositeMetadataFilter.class, + "metadataFilter", + MetadataFilter.class, + root); } catch (TikaConfigException e) { if (allowMissing && e.getMessage().contains("could not find metadataFilters")) { return new NoOpFilter(); diff --git a/tika-core/src/main/java/org/apache/tika/metadata/filter/NoOpFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/filter/NoOpFilter.java index d95472a1b9..477d9c1b22 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/filter/NoOpFilter.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/filter/NoOpFilter.java @@ -19,16 +19,13 @@ import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; -/** - * This filter performs no operations on the metadata - * and leaves it untouched. - */ +/** This filter performs no operations on the metadata and leaves it untouched. */ public class NoOpFilter extends MetadataFilter { public static final NoOpFilter NOOP_FILTER = new NoOpFilter(); @Override public void filter(Metadata metadata) throws TikaException { - //no op + // no op } } diff --git a/tika-core/src/main/java/org/apache/tika/metadata/package-info.java b/tika-core/src/main/java/org/apache/tika/metadata/package-info.java index 02fcae3ec8..3f2edbb744 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/package-info.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/package-info.java @@ -15,8 +15,6 @@ * limitations under the License. 
*/ -/** - * Multi-valued metadata container, and set of constant metadata fields. - */ +/** Multi-valued metadata container, and set of constant metadata fields. */ @aQute.bnd.annotation.Version("1.0.0") package org.apache.tika.metadata; diff --git a/tika-core/src/main/java/org/apache/tika/metadata/writefilter/MetadataWriteFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/writefilter/MetadataWriteFilter.java index e03367c773..565db7376f 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/writefilter/MetadataWriteFilter.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/writefilter/MetadataWriteFilter.java @@ -24,13 +24,13 @@ public interface MetadataWriteFilter extends Serializable { void filterExisting(Map data); /** - * Based on the field and value, this filter modifies the field - * and/or the value to something that should be added to the Metadata object. + * Based on the field and value, this filter modifies the field and/or the value to something + * that should be added to the Metadata object. * - * If the value is null, no value is set or added. + *

If the value is null, no value is set or added. * - * Status updates (e.g. write limit reached) can be added directly to the - * underlying metadata. + *

Status updates (e.g. write limit reached) can be added directly to the underlying + * metadata. * * @param field * @param value @@ -40,9 +40,8 @@ public interface MetadataWriteFilter extends Serializable { void add(String field, String value, Map data); /** - * Based on the field and the value, this filter modifies - * the field and/or the value to something that should be set in the - * Metadata object. + * Based on the field and the value, this filter modifies the field and/or the value to + * something that should be set in the Metadata object. * * @param field * @param value diff --git a/tika-core/src/main/java/org/apache/tika/metadata/writefilter/StandardWriteFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/writefilter/StandardWriteFilter.java index f0e9f1fe61..78572bf870 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/writefilter/StandardWriteFilter.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/writefilter/StandardWriteFilter.java @@ -26,47 +26,40 @@ import java.util.HashSet; import java.util.Map; import java.util.Set; - import org.apache.tika.metadata.AccessPermissions; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.utils.StringUtils; /** - * This is to be used to limit the amount of metadata that a - * parser can add based on the {@link #maxTotalEstimatedSize}, - * {@link #maxFieldSize}, {@link #maxValuesPerField}, and - * {@link #maxKeySize}. This can also be used to limit which - * fields are stored in the metadata object at write-time - * with {@link #includeFields}. - * - * All sizes are measured in UTF-16 bytes. The size is estimated - * as a rough order of magnitude of what is - * required to store the string in memory in Java. We recognize - * that Java uses more bytes to store length, offset etc. for strings. But - * the extra overhead varies by Java version and implementation, - * and we just need a basic estimate. 
We also recognize actual - * memory usage is affected by interning strings, etc. - * Please forgive us ... or consider writing your own write filter. :) + * This is to be used to limit the amount of metadata that a parser can add based on the {@link + * #maxTotalEstimatedSize}, {@link #maxFieldSize}, {@link #maxValuesPerField}, and {@link + * #maxKeySize}. This can also be used to limit which fields are stored in the metadata object at + * write-time with {@link #includeFields}. * + *

All sizes are measured in UTF-16 bytes. The size is estimated as a rough order of magnitude of + * what is required to store the string in memory in Java. We recognize that Java uses more bytes to + * store length, offset etc. for strings. But the extra overhead varies by Java version and + * implementation, and we just need a basic estimate. We also recognize actual memory usage is + * affected by interning strings, etc. Please forgive us ... or consider writing your own write + * filter. :) * - * NOTE: Fields in {@link #ALWAYS_SET_FIELDS} are - * always set no matter the current state of {@link #maxTotalEstimatedSize}. - * Except for {@link TikaCoreProperties#TIKA_CONTENT}, they are truncated at - * {@link #maxFieldSize}, and their sizes contribute to the {@link #maxTotalEstimatedSize}. + *

NOTE: Fields in {@link #ALWAYS_SET_FIELDS} are always set no matter the current state + * of {@link #maxTotalEstimatedSize}. Except for {@link TikaCoreProperties#TIKA_CONTENT}, they are + * truncated at {@link #maxFieldSize}, and their sizes contribute to the {@link + * #maxTotalEstimatedSize}. * - * NOTE: Fields in {@link #ALWAYS_ADD_FIELDS} are - * always added no matter the current state of {@link #maxTotalEstimatedSize}. - * Except for {@link TikaCoreProperties#TIKA_CONTENT}, each addition is truncated at - * {@link #maxFieldSize}, and their sizes contribute to the {@link #maxTotalEstimatedSize}. + *

NOTE: Fields in {@link #ALWAYS_ADD_FIELDS} are always added no matter the current state + * of {@link #maxTotalEstimatedSize}. Except for {@link TikaCoreProperties#TIKA_CONTENT}, each + * addition is truncated at {@link #maxFieldSize}, and their sizes contribute to the {@link + * #maxTotalEstimatedSize}. * - * This class {@link #minimumMaxFieldSizeInAlwaysFields} to protect the - * {@link #ALWAYS_ADD_FIELDS} and {@link #ALWAYS_SET_FIELDS}. If we didn't - * have this and a user sets the {@link #maxFieldSize} to, say, 10 bytes, - * the internal parser behavior would be broken because parsers rely on - * {@link Metadata#CONTENT_TYPE} to determine which parser to call. + *

This class {@link #minimumMaxFieldSizeInAlwaysFields} to protect the {@link + * #ALWAYS_ADD_FIELDS} and {@link #ALWAYS_SET_FIELDS}. If we didn't have this and a user sets the + * {@link #maxFieldSize} to, say, 10 bytes, the internal parser behavior would be broken because + * parsers rely on {@link Metadata#CONTENT_TYPE} to determine which parser to call. * - * NOTE: as with {@link Metadata}, this object is not thread safe. + *

NOTE: as with {@link Metadata}, this object is not thread safe. */ public class StandardWriteFilter implements MetadataWriteFilter, Serializable { @@ -87,7 +80,7 @@ public class StandardWriteFilter implements MetadataWriteFilter, Serializable { ALWAYS_SET_FIELDS.add(Metadata.CONTENT_DISPOSITION); ALWAYS_SET_FIELDS.add(TikaCoreProperties.CONTAINER_EXCEPTION.getName()); ALWAYS_SET_FIELDS.add(TikaCoreProperties.EMBEDDED_EXCEPTION.getName()); - //Metadata.CONTENT_LOCATION? used by the html parser + // Metadata.CONTENT_LOCATION? used by the html parser } static { @@ -97,41 +90,42 @@ public class StandardWriteFilter implements MetadataWriteFilter, Serializable { private static final String METADATA_TRUNCATED_KEY = TikaCoreProperties.TRUNCATED_METADATA.getName(); private static final String TIKA_CONTENT_KEY = TikaCoreProperties.TIKA_CONTENT.getName(); - private static final String[] TRUE = new String[]{"true"}; + private static final String[] TRUE = new String[] {"true"}; - //allow at least these many bytes in the "always" fields. - //As of 2022-03, the longest mime is 146. Doubling that gives - //us some leeway. If a mime is truncated, bad things will happen. + // allow at least these many bytes in the "always" fields. + // As of 2022-03, the longest mime is 146. Doubling that gives + // us some leeway. If a mime is truncated, bad things will happen. private final int minimumMaxFieldSizeInAlwaysFields = 300; - private final boolean includeEmpty; private final int maxTotalEstimatedSize; private final int maxValuesPerField; private final int maxFieldSize; private final int maxKeySize; - private final Set includeFields; private Map fieldSizes = new HashMap<>(); - //tracks the estimated size in utf16 bytes. Can be > maxEstimated size + // tracks the estimated size in utf16 bytes. 
Can be > maxEstimated size int estimatedSize = 0; /** - * @param maxKeySize maximum key size in UTF-16 bytes-- keys will be truncated to this - * length; if less than 0, keys will not be truncated + * @param maxKeySize maximum key size in UTF-16 bytes-- keys will be truncated to this length; + * if less than 0, keys will not be truncated * @param maxEstimatedSize - * @param includeFields if null or empty, all fields are included; otherwise, which fields - * to add to the metadata object. - * @param includeEmpty if true, this will set or add an empty value to the - * metadata object. + * @param includeFields if null or empty, all fields are included; otherwise, which fields to + * add to the metadata object. + * @param includeEmpty if true, this will set or add an empty value to the metadata + * object. */ - protected StandardWriteFilter(int maxKeySize, int maxFieldSize, int maxEstimatedSize, - int maxValuesPerField, - Set includeFields, - boolean includeEmpty) { + protected StandardWriteFilter( + int maxKeySize, + int maxFieldSize, + int maxEstimatedSize, + int maxValuesPerField, + Set includeFields, + boolean includeEmpty) { this.maxKeySize = maxKeySize; this.maxFieldSize = maxFieldSize; @@ -143,16 +137,16 @@ protected StandardWriteFilter(int maxKeySize, int maxFieldSize, int maxEstimated @Override public void filterExisting(Map data) { - //this is somewhat costly, but it ensures that - //metadata that was placed in the metadata object before this - //filter was applied is removed. - //It should only be called once, and probably not on that - //many fields. + // this is somewhat costly, but it ensures that + // metadata that was placed in the metadata object before this + // filter was applied is removed. + // It should only be called once, and probably not on that + // many fields. Map tmp = new HashMap<>(); for (Map.Entry e : data.entrySet()) { String name = e.getKey(); String[] vals = e.getValue(); - if (! 
includeField(name)) { + if (!includeField(name)) { continue; } for (int i = 0; i < vals.length; i++) { @@ -166,10 +160,9 @@ public void filterExisting(Map data) { data.putAll(tmp); } - @Override public void set(String field, String value, Map data) { - if (! include(field, value)) { + if (!include(field, value)) { return; } if (ALWAYS_SET_FIELDS.contains(field) || ALWAYS_ADD_FIELDS.contains(field)) { @@ -182,12 +175,12 @@ public void set(String field, String value, Map data) { private void setAlwaysInclude(String field, String value, Map data) { if (TIKA_CONTENT_KEY.equals(field)) { - data.put(field, new String[]{ value }); + data.put(field, new String[] {value}); return; } int sizeToAdd = estimateSize(value); - //if the maxFieldSize is < minimumMaxFieldSizeInAlwaysFields, use the minmax - //we do not want to truncate a mime! + // if the maxFieldSize is < minimumMaxFieldSizeInAlwaysFields, use the minmax + // we do not want to truncate a mime! int alwaysMaxFieldLength = Math.max(minimumMaxFieldSizeInAlwaysFields, maxFieldSize); String toSet = value; if (sizeToAdd > alwaysMaxFieldLength) { @@ -198,29 +191,29 @@ private void setAlwaysInclude(String field, String value, Map totalAdded += sizeToAdd; if (data.containsKey(field)) { String[] vals = data.get(field); - //this should only ever be single valued!!! + // this should only ever be single valued!!! if (vals.length > 0) { totalAdded -= estimateSize(vals[0]); } } estimatedSize += totalAdded; - data.put(field, new String[]{toSet}); + data.put(field, new String[] {toSet}); } private void addAlwaysInclude(String field, String value, Map data) { if (TIKA_CONTENT_KEY.equals(field)) { - data.put(field, new String[]{ value }); + data.put(field, new String[] {value}); return; } - if (! data.containsKey(field)) { + if (!data.containsKey(field)) { setAlwaysInclude(field, value, data); return; } - //TODO: should we limit the number of field values? + // TODO: should we limit the number of field values? 
int toAddSize = estimateSize(value); - //if the maxFieldSize is < minimumMaxFieldSizeInAlwaysFields, use the minmax - //we do not want to truncate a mime! + // if the maxFieldSize is < minimumMaxFieldSizeInAlwaysFields, use the minmax + // we do not want to truncate a mime! int alwaysMaxFieldLength = Math.max(minimumMaxFieldSizeInAlwaysFields, maxFieldSize); String toAddValue = value; if (toAddSize > alwaysMaxFieldLength) { @@ -234,29 +227,27 @@ private void addAlwaysInclude(String field, String value, Map data.put(field, appendValue(data.get(field), toAddValue)); } - - //calculate the max field length allowed if we are - //setting a value + // calculate the max field length allowed if we are + // setting a value private int maxAllowedToSet(StringSizePair filterKey) { Integer existingSizeInt = fieldSizes.get(filterKey.string); int existingSize = existingSizeInt == null ? 0 : existingSizeInt; - //this is how much is allowed by the overall total limit + // this is how much is allowed by the overall total limit int allowedByMaxTotal = maxTotalEstimatedSize - estimatedSize; - //if we're overwriting a value, that value's data size is now available + // if we're overwriting a value, that value's data size is now available allowedByMaxTotal += existingSize; - //if we're adding a key, we need to subtract that value + // if we're adding a key, we need to subtract that value allowedByMaxTotal -= existingSizeInt == null ? filterKey.size : 0; return Math.min(maxFieldSize, allowedByMaxTotal); } - @Override public void add(String field, String value, Map data) { - if (! include(field, value)) { + if (!include(field, value)) { return; } if (ALWAYS_SET_FIELDS.contains(field)) { @@ -267,7 +258,7 @@ public void add(String field, String value, Map data) { return; } StringSizePair filterKey = filterKey(field, value, data); - if (! 
data.containsKey(filterKey.string)) { + if (!data.containsKey(filterKey.string)) { setFilterKey(filterKey, value, data); return; } @@ -298,16 +289,16 @@ public void add(String field, String value, Map data) { int addedOverall = valueLength; if (fieldSizeInteger == null) { - //if there was no value before, we're adding - //a key. If there was a value before, do not - //add the key length. + // if there was no value before, we're adding + // a key. If there was a value before, do not + // add the key length. addedOverall += filterKey.size; } estimatedSize += addedOverall; fieldSizes.put(filterKey.string, valueLength + fieldSize); - data.put(filterKey.string, appendValue(data.get(filterKey.string), toAdd )); + data.put(filterKey.string, appendValue(data.get(filterKey.string), toAdd)); } private String[] appendValue(String[] values, final String value) { @@ -320,28 +311,27 @@ private String[] appendValue(String[] values, final String value) { return newValues; } - //calculate the max field length allowed if we are - //adding a value + // calculate the max field length allowed if we are + // adding a value private int maxAllowedToAdd(StringSizePair filterKey) { Integer existingSizeInt = fieldSizes.get(filterKey.string); int existingSize = existingSizeInt == null ? 0 : existingSizeInt; - //how much can we add to this field + // how much can we add to this field int allowedByMaxField = maxFieldSize - existingSize; - //this is how much is allowed by the overall total limit + // this is how much is allowed by the overall total limit int allowedByMaxTotal = maxTotalEstimatedSize - estimatedSize - 1; - //if we're adding a new key, we need to subtract that value + // if we're adding a new key, we need to subtract that value allowedByMaxTotal -= existingSizeInt == null ? 
filterKey.size : 0; return Math.min(allowedByMaxField, allowedByMaxTotal); } - private void setFilterKey(StringSizePair filterKey, String value, - Map data) { - //if you can't even add the key, give up now - if (! data.containsKey(filterKey.string) && - (filterKey.size + estimatedSize > maxTotalEstimatedSize)) { + private void setFilterKey(StringSizePair filterKey, String value, Map data) { + // if you can't even add the key, give up now + if (!data.containsKey(filterKey.string) + && (filterKey.size + estimatedSize > maxTotalEstimatedSize)) { setTruncated(data); return; } @@ -365,9 +355,9 @@ private void setFilterKey(StringSizePair filterKey, String value, int addedOverall = 0; if (fieldSizeInteger == null) { - //if there was no value before, we're adding - //a key. If there was a value before, do not - //add the key length. + // if there was no value before, we're adding + // a key. If there was a value before, do not + // add the key length. addedOverall += filterKey.size; } addedOverall += valueLength - fieldSize; @@ -375,8 +365,7 @@ private void setFilterKey(StringSizePair filterKey, String value, fieldSizes.put(filterKey.string, valueLength); - data.put(filterKey.string, new String[]{ toSet }); - + data.put(filterKey.string, new String[] {toSet}); } private void setTruncated(Map data) { @@ -390,15 +379,13 @@ private StringSizePair filterKey(String field, String value, Map data) { setTruncated(data); - //correctly handle multibyte characters + // correctly handle multibyte characters byte[] bytes = value.getBytes(StandardCharsets.UTF_16BE); ByteBuffer bb = ByteBuffer.wrap(bytes, 0, length); CharBuffer cb = CharBuffer.allocate(length); @@ -416,6 +403,7 @@ private boolean include(String field, String value) { /** * Tests for null or empty. 
Does not check for length + * * @param value * @return */ @@ -433,8 +421,7 @@ private boolean includeField(String name) { if (ALWAYS_SET_FIELDS.contains(name)) { return true; } - if (includeFields == null || - includeFields.contains(name)) { + if (includeFields == null || includeFields.contains(name)) { return true; } return false; @@ -446,7 +433,7 @@ private static int estimateSize(String s) { private static class StringSizePair { final String string; - final int size;//utf-16 bytes -- estimated + final int size; // utf-16 bytes -- estimated final boolean truncated; public StringSizePair(String string, int size, boolean truncated) { diff --git a/tika-core/src/main/java/org/apache/tika/metadata/writefilter/StandardWriteFilterFactory.java b/tika-core/src/main/java/org/apache/tika/metadata/writefilter/StandardWriteFilterFactory.java index b7d60b540c..397c238910 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/writefilter/StandardWriteFilterFactory.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/writefilter/StandardWriteFilterFactory.java @@ -22,12 +22,11 @@ import java.util.concurrent.ConcurrentHashMap; /** - * Factory class for {@link StandardWriteFilter}. See that class - * for how the estimated sizes are calculated on Strings. + * Factory class for {@link StandardWriteFilter}. See that class for how the estimated sizes are + * calculated on Strings. 
*/ public class StandardWriteFilterFactory implements MetadataWriteFilterFactory { - public static int DEFAULT_MAX_KEY_SIZE = 1024; public static int DEFAULT_MAX_FIELD_SIZE = 100 * 1024; public static int DEFAULT_TOTAL_ESTIMATED_BYTES = 10 * 1024 * 1024; @@ -54,8 +53,13 @@ public MetadataWriteFilter newInstance() { throw new IllegalArgumentException("max estimated size must be > 0"); } - return new StandardWriteFilter(maxKeySize, maxFieldSize, - maxTotalEstimatedBytes, maxValuesPerField, includeFields, includeEmpty); + return new StandardWriteFilter( + maxKeySize, + maxFieldSize, + maxTotalEstimatedBytes, + maxValuesPerField, + includeFields, + includeEmpty); } public void setIncludeFields(List includeFields) { @@ -110,9 +114,19 @@ public boolean isIncludeEmpty() { @Override public String toString() { - return "StandardWriteFilterFactory{" + "includeFields=" + includeFields + ", maxKeySize=" + - maxKeySize + ", maxFieldSize=" + maxFieldSize + ", maxTotalEstimatedBytes=" + - maxTotalEstimatedBytes + ", maxValuesPerField=" + maxValuesPerField + - ", includeEmpty=" + includeEmpty + '}'; + return "StandardWriteFilterFactory{" + + "includeFields=" + + includeFields + + ", maxKeySize=" + + maxKeySize + + ", maxFieldSize=" + + maxFieldSize + + ", maxTotalEstimatedBytes=" + + maxTotalEstimatedBytes + + ", maxValuesPerField=" + + maxValuesPerField + + ", includeEmpty=" + + includeEmpty + + '}'; } } diff --git a/tika-core/src/main/java/org/apache/tika/mime/AndClause.java b/tika-core/src/main/java/org/apache/tika/mime/AndClause.java index c7a2184820..9fcf670962 100644 --- a/tika-core/src/main/java/org/apache/tika/mime/AndClause.java +++ b/tika-core/src/main/java/org/apache/tika/mime/AndClause.java @@ -46,5 +46,4 @@ public int size() { public String toString() { return "and" + Arrays.toString(clauses); } - } diff --git a/tika-core/src/main/java/org/apache/tika/mime/Clause.java b/tika-core/src/main/java/org/apache/tika/mime/Clause.java index fc3bcc1e39..cfc7701fb5 100644 --- 
a/tika-core/src/main/java/org/apache/tika/mime/Clause.java +++ b/tika-core/src/main/java/org/apache/tika/mime/Clause.java @@ -18,20 +18,15 @@ import java.io.Serializable; -/** - * Defines a clause to be evaluated. - */ +/** Defines a clause to be evaluated. */ interface Clause extends Serializable { - /** - * Evaluates this clause with the specified chunk of data. - */ + /** Evaluates this clause with the specified chunk of data. */ boolean eval(byte[] data); /** - * Returns the size of this clause. The size of a clause is the number of - * chars it is composed of. + * Returns the size of this clause. The size of a clause is the number of chars it is composed + * of. */ int size(); - } diff --git a/tika-core/src/main/java/org/apache/tika/mime/HexCoDec.java b/tika-core/src/main/java/org/apache/tika/mime/HexCoDec.java index 1ba53fe9bb..961bfd99eb 100644 --- a/tika-core/src/main/java/org/apache/tika/mime/HexCoDec.java +++ b/tika-core/src/main/java/org/apache/tika/mime/HexCoDec.java @@ -16,13 +16,12 @@ */ package org.apache.tika.mime; -/** - * A set of Hex encoding and decoding utility methods. - */ +/** A set of Hex encoding and decoding utility methods. */ public class HexCoDec { - private static final char[] HEX_CHARS = - {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'}; + private static final char[] HEX_CHARS = { + '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f' + }; /** * Decode a hex string @@ -47,9 +46,9 @@ public static byte[] decode(char[] hexChars) { /** * Decode an array of hex chars. * - * @param hexChars an array of hex characters. + * @param hexChars an array of hex characters. * @param startIndex the index of the first character to decode - * @param length the number of characters to decode. + * @param length the number of characters to decode. * @return the decode hex chars as bytes. 
*/ public static byte[] decode(char[] hexChars, int startIndex, int length) { @@ -59,8 +58,10 @@ public static byte[] decode(char[] hexChars, int startIndex, int length) { byte[] result = new byte[length / 2]; for (int j = 0; j < result.length; j++) { - result[j] = (byte) (hexCharToNibble(hexChars[startIndex++]) * 16 + - hexCharToNibble(hexChars[startIndex++])); + result[j] = + (byte) + (hexCharToNibble(hexChars[startIndex++]) * 16 + + hexCharToNibble(hexChars[startIndex++])); } return result; } @@ -78,9 +79,9 @@ public static char[] encode(byte[] bites) { /** * Hex encode an array of bytes * - * @param bites the array of bytes to encode. + * @param bites the array of bytes to encode. * @param startIndex the index of the first character to encode. - * @param length the number of characters to encode. + * @param length the number of characters to encode. * @return the array of hex characters. */ public static char[] encode(byte[] bites, int startIndex, int length) { @@ -93,9 +94,7 @@ public static char[] encode(byte[] bites, int startIndex, int length) { return result; } - /** - * Internal method to turn a hex char into a nibble. - */ + /** Internal method to turn a hex char into a nibble. */ private static int hexCharToNibble(char ch) { if ((ch >= '0') && (ch <= '9')) { return ch - '0'; @@ -107,5 +106,4 @@ private static int hexCharToNibble(char ch) { throw new IllegalArgumentException("Not a hex char - '" + ch + "'"); } } - } diff --git a/tika-core/src/main/java/org/apache/tika/mime/Magic.java b/tika-core/src/main/java/org/apache/tika/mime/Magic.java index 9c3b3adf62..1254bd55be 100644 --- a/tika-core/src/main/java/org/apache/tika/mime/Magic.java +++ b/tika-core/src/main/java/org/apache/tika/mime/Magic.java @@ -16,10 +16,7 @@ */ package org.apache.tika.mime; -/** - * Defines a magic for a MimeType. A magic is made of one or several - * MagicClause. - */ +/** Defines a magic for a MimeType. A magic is made of one or several MagicClause. 
*/ class Magic implements Clause, Comparable { private final MimeType type; @@ -82,5 +79,4 @@ public boolean equals(Object o) { public int hashCode() { return type.hashCode() ^ string.hashCode(); } - } diff --git a/tika-core/src/main/java/org/apache/tika/mime/MagicMatch.java b/tika-core/src/main/java/org/apache/tika/mime/MagicMatch.java index afa0de91f1..f1c5a83af7 100644 --- a/tika-core/src/main/java/org/apache/tika/mime/MagicMatch.java +++ b/tika-core/src/main/java/org/apache/tika/mime/MagicMatch.java @@ -17,15 +17,11 @@ package org.apache.tika.mime; import java.io.IOException; - import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream; - import org.apache.tika.detect.MagicDetector; import org.apache.tika.metadata.Metadata; -/** - * Defines a magic match. - */ +/** Defines a magic match. */ class MagicMatch implements Clause { private final MediaType mediaType; @@ -57,8 +53,9 @@ private synchronized MagicDetector getDetector() { public boolean eval(byte[] data) { try { - return getDetector().detect(new UnsynchronizedByteArrayInputStream(data), new Metadata()) != - MediaType.OCTET_STREAM; + return getDetector() + .detect(new UnsynchronizedByteArrayInputStream(data), new Metadata()) + != MediaType.OCTET_STREAM; } catch (IOException e) { // Should never happen with a ByteArrayInputStream return false; @@ -72,5 +69,4 @@ public int size() { public String toString() { return mediaType.toString() + " " + type + " " + offset + " " + value + " " + mask; } - } diff --git a/tika-core/src/main/java/org/apache/tika/mime/MediaType.java b/tika-core/src/main/java/org/apache/tika/mime/MediaType.java index 13ad6ed9cd..44a03c1a06 100644 --- a/tika-core/src/main/java/org/apache/tika/mime/MediaType.java +++ b/tika-core/src/main/java/org/apache/tika/mime/MediaType.java @@ -29,14 +29,10 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; -/** - * Internet media type. - */ +/** Internet media type. 
*/ public final class MediaType implements Comparable, Serializable { - /** - * Serial version UID. - */ + /** Serial version UID. */ private static final long serialVersionUID = -3831000556189036392L; private static final Pattern SPECIAL = Pattern.compile("[\\(\\)<>@,;:\\\\\"/\\[\\]\\?=]"); @@ -44,23 +40,24 @@ public final class MediaType implements Comparable, Serializable { private static final Pattern SPECIAL_OR_WHITESPACE = Pattern.compile("[\\(\\)<>@,;:\\\\\"/\\[\\]\\?=\\s]"); - /** - * See http://www.ietf.org/rfc/rfc2045.txt for valid mime-type characters. - */ + /** See http://www.ietf.org/rfc/rfc2045.txt for valid mime-type characters. */ private static final String VALID_CHARS = "([^\\c\\(\\)<>@,;:\\\\\"/\\[\\]\\?=\\s]+)"; private static final Pattern TYPE_PATTERN = Pattern.compile("(?s)\\s*" + VALID_CHARS + "\\s*/\\s*" + VALID_CHARS + "\\s*($|;.*)"); // TIKA-350: handle charset as first element in content-type - private static final Pattern CHARSET_FIRST_PATTERN = Pattern.compile( - "(?is)\\s*(charset\\s*=\\s*[^\\c;\\s]+)\\s*;\\s*" + VALID_CHARS + "\\s*/\\s*" + - VALID_CHARS + "\\s*"); + private static final Pattern CHARSET_FIRST_PATTERN = + Pattern.compile( + "(?is)\\s*(charset\\s*=\\s*[^\\c;\\s]+)\\s*;\\s*" + + VALID_CHARS + + "\\s*/\\s*" + + VALID_CHARS + + "\\s*"); /** - * Set of basic types with normalized "type/subtype" names. - * Used to optimize type lookup and to avoid having too many - * {@link MediaType} instances in memory. + * Set of basic types with normalized "type/subtype" names. Used to optimize type lookup and to + * avoid having too many {@link MediaType} instances in memory. */ private static final Map SIMPLE_TYPES = new HashMap<>(); @@ -75,24 +72,22 @@ public final class MediaType implements Comparable, Serializable { public static final MediaType APPLICATION_XML = parse("application/xml"); public static final MediaType APPLICATION_ZIP = parse("application/zip"); - /** - * Canonical string representation of this media type. 
- */ + + /** Canonical string representation of this media type. */ private final String string; + /** - * Location of the "/" character separating the type and the subtype - * tokens in {@link #string}. + * Location of the "/" character separating the type and the subtype tokens in {@link #string}. */ private final int slash; + /** - * Location of the first ";" character separating the type part of - * {@link #string} from possible parameters. Length of {@link #string} - * in case there are no parameters. + * Location of the first ";" character separating the type part of {@link #string} from possible + * parameters. Length of {@link #string} in case there are no parameters. */ private final int semicolon; - /** - * Immutable sorted map of media type parameters. - */ + + /** Immutable sorted map of media type parameters. */ private final Map parameters; public MediaType(String type, String subtype, Map parameters) { @@ -157,8 +152,8 @@ public MediaType(MediaType type, Map parameters) { /** * Creates a media type by adding a parameter to a base type. * - * @param type base type - * @param name parameter name + * @param type base type + * @param name parameter name * @param value parameter value * @since Apache Tika 1.2 */ @@ -169,7 +164,7 @@ public MediaType(MediaType type, String name, String value) { /** * Creates a media type by adding the "charset" parameter to a base type. * - * @param type base type + * @param type base type * @param charset charset value * @since Apache Tika 1.2 */ @@ -198,8 +193,7 @@ public static MediaType video(String type) { } /** - * Convenience method that returns an unmodifiable set that contains - * all the given media types. + * Convenience method that returns an unmodifiable set that contains all the given media types. * * @param types media types * @return unmodifiable set of the given types @@ -216,8 +210,8 @@ public static Set set(MediaType... 
types) { } /** - * Convenience method that parses the given media type strings and - * returns an unmodifiable set that contains all the parsed types. + * Convenience method that parses the given media type strings and returns an unmodifiable set + * that contains all the parsed types. * * @param types media type strings * @return unmodifiable set of the parsed types @@ -235,10 +229,9 @@ public static Set set(String... types) { } /** - * Parses the given string to a media type. The string is expected - * to be of the form "type/subtype(; parameter=...)*" as defined in - * RFC 2045, though we also handle "charset=xxx; type/subtype" for - * broken web servers. + * Parses the given string to a media type. The string is expected to be of the form + * "type/subtype(; parameter=...)*" as defined in RFC 2045, though we also handle "charset=xxx; + * type/subtype" for broken web servers. * * @param string media type string to be parsed * @return parsed media type, or null if parsing fails @@ -255,9 +248,9 @@ public static MediaType parse(String string) { int slash = string.indexOf('/'); if (slash == -1) { return null; - } else if (SIMPLE_TYPES.size() < 10000 && - isSimpleName(string.substring(0, slash)) && - isSimpleName(string.substring(slash + 1))) { + } else if (SIMPLE_TYPES.size() < 10000 + && isSimpleName(string.substring(0, slash)) + && isSimpleName(string.substring(slash + 1))) { type = new MediaType(string, slash); SIMPLE_TYPES.put(string, type); } @@ -270,13 +263,13 @@ public static MediaType parse(String string) { Matcher matcher; matcher = TYPE_PATTERN.matcher(string); if (matcher.matches()) { - return new MediaType(matcher.group(1), matcher.group(2), - parseParameters(matcher.group(3))); + return new MediaType( + matcher.group(1), matcher.group(2), parseParameters(matcher.group(3))); } matcher = CHARSET_FIRST_PATTERN.matcher(string); if (matcher.matches()) { - return new MediaType(matcher.group(2), matcher.group(3), - parseParameters(matcher.group(1))); + return 
new MediaType( + matcher.group(2), matcher.group(3), parseParameters(matcher.group(1))); } return null; @@ -285,8 +278,12 @@ public static MediaType parse(String string) { private static boolean isSimpleName(String name) { for (int i = 0; i < name.length(); i++) { char c = name.charAt(i); - if (c != '-' && c != '+' && c != '.' && c != '_' && !('0' <= c && c <= '9') && - !('a' <= c && c <= 'z')) { + if (c != '-' + && c != '+' + && c != '.' + && c != '_' + && !('0' <= c && c <= '9') + && !('a' <= c && c <= 'z')) { return false; } } @@ -329,8 +326,7 @@ private static Map parseParameters(String string) { } /** - * Fuzzy unquoting mechanism that works also with somewhat malformed - * quotes. + * Fuzzy unquoting mechanism that works also with somewhat malformed quotes. * * @param s string to unquote * @return unquoted string @@ -359,8 +355,7 @@ private static Map union(Map a, Maptrue if this type has one or more parameters, - * false otherwise + * @return true if this type has one or more parameters, false + * otherwise * @since Apache Tika 0.8 */ public boolean hasParameters() { @@ -399,8 +388,8 @@ public boolean hasParameters() { } /** - * Returns an immutable sorted map of the parameters of this media type. - * The parameter names are guaranteed to be trimmed and in lower case. + * Returns an immutable sorted map of the parameters of this media type. The parameter names are + * guaranteed to be trimmed and in lower case. 
* * @return sorted map of parameters */ @@ -428,5 +417,4 @@ public int hashCode() { public int compareTo(MediaType that) { return string.compareTo(that.string); } - } diff --git a/tika-core/src/main/java/org/apache/tika/mime/MediaTypeRegistry.java b/tika-core/src/main/java/org/apache/tika/mime/MediaTypeRegistry.java index ac5b3add87..93a3f54498 100644 --- a/tika-core/src/main/java/org/apache/tika/mime/MediaTypeRegistry.java +++ b/tika-core/src/main/java/org/apache/tika/mime/MediaTypeRegistry.java @@ -23,24 +23,22 @@ import java.util.TreeSet; import java.util.concurrent.ConcurrentHashMap; -/** - * Registry of known Internet media types. - */ +/** Registry of known Internet media types. */ public class MediaTypeRegistry implements Serializable { - /** - * Serial version UID - */ + /** Serial version UID */ private static final long serialVersionUID = 4710974869988895410L; + /** - * Registry of known media types, including type aliases. A canonical - * media type is handled as an identity mapping, while an alias is stored - * as a mapping from the alias to the corresponding canonical type. + * Registry of known media types, including type aliases. A canonical media type is handled as + * an identity mapping, while an alias is stored as a mapping from the alias to the + * corresponding canonical type. */ private final Map registry = new ConcurrentHashMap<>(); + /** - * Known type inheritance relationships. The mapping is from a media type - * to the closest supertype. + * Known type inheritance relationships. The mapping is from a media type to the closest + * supertype. */ private final Map inheritance = new HashMap<>(); @@ -55,8 +53,8 @@ public static MediaTypeRegistry getDefaultRegistry() { } /** - * Returns the set of all known canonical media types. Type aliases are - * not included in the returned set. + * Returns the set of all known canonical media types. Type aliases are not included in the + * returned set. 
* * @return canonical media types * @since Apache Tika 0.8 @@ -126,13 +124,12 @@ public MediaType normalize(MediaType type) { } /** - * Checks whether the given media type a is a specialization of a more - * generic type b. Both types should be already normalised. + * Checks whether the given media type a is a specialization of a more generic type b. Both + * types should be already normalised. * * @param a media type, normalised * @param b suspected supertype, normalised - * @return true if b is a supertype of a, - * false otherwise + * @return true if b is a supertype of a, false otherwise * @since Apache Tika 0.8 */ public boolean isSpecializationOf(MediaType a, MediaType b) { @@ -140,13 +137,13 @@ public boolean isSpecializationOf(MediaType a, MediaType b) { } /** - * Checks whether the given media type equals the given base type or - * is a specialization of it. Both types should be already normalised. + * Checks whether the given media type equals the given base type or is a specialization of it. + * Both types should be already normalised. * * @param a media type, normalised * @param b base type, normalised - * @return true if b equals a or is a specialization of it, - * false otherwise + * @return true if b equals a or is a specialization of it, false + * otherwise * @since Apache Tika 1.2 */ public boolean isInstanceOf(MediaType a, MediaType b) { @@ -154,14 +151,14 @@ public boolean isInstanceOf(MediaType a, MediaType b) { } /** - * Parses and normalises the given media type string and checks whether - * the result equals the given base type or is a specialization of it. - * The given base type should already be normalised. + * Parses and normalises the given media type string and checks whether the result equals the + * given base type or is a specialization of it. The given base type should already be + * normalised. 
* * @param a media type * @param b base type, normalised - * @return true if b equals a or is a specialization of it, - * false otherwise + * @return true if b equals a or is a specialization of it, false + * otherwise * @since Apache Tika 1.2 */ public boolean isInstanceOf(String a, MediaType b) { @@ -169,14 +166,12 @@ public boolean isInstanceOf(String a, MediaType b) { } /** - * Returns the supertype of the given type. If the media type database - * has an explicit inheritance rule for the type, then that is used. - * Next, if the given type has any parameters, then the respective base - * type (parameter-less) is returned. Otherwise built-in heuristics like - * text/... -> text/plain and .../...+xml -> application/xml are used. - * Finally application/octet-stream is returned for all types for which no other - * supertype is known, and the return value for application/octet-stream - * is null. + * Returns the supertype of the given type. If the media type database has an explicit + * inheritance rule for the type, then that is used. Next, if the given type has any parameters, + * then the respective base type (parameter-less) is returned. Otherwise built-in heuristics + * like text/... -> text/plain and .../...+xml -> application/xml are used. Finally + * application/octet-stream is returned for all types for which no other supertype is known, and + * the return value for application/octet-stream is null. 
* * @param type media type * @return supertype, or null for application/octet-stream @@ -203,5 +198,4 @@ public MediaType getSupertype(MediaType type) { return null; } } - } diff --git a/tika-core/src/main/java/org/apache/tika/mime/MimeType.java b/tika-core/src/main/java/org/apache/tika/mime/MimeType.java index 8dc3ddba54..8522dcc048 100644 --- a/tika-core/src/main/java/org/apache/tika/mime/MimeType.java +++ b/tika-core/src/main/java/org/apache/tika/mime/MimeType.java @@ -22,69 +22,49 @@ import java.util.Collections; import java.util.List; -/** - * Internet media type. - */ +/** Internet media type. */ public final class MimeType implements Comparable, Serializable { - /** - * Serial version UID. - */ + /** Serial version UID. */ private static final long serialVersionUID = 4357830439860729201L; - /** - * The normalized media type name. - */ + + /** The normalized media type name. */ private final MediaType type; - /** - * The minimum length of data to provides for magic analyzis - */ + + /** The minimum length of data to provides for magic analyzis */ private final int minLength = 0; - /** - * The MimeType acronym - */ + + /** The MimeType acronym */ private String acronym = ""; - /** - * The http://en.wikipedia.org/wiki/Uniform_Type_Identifier - */ + /** The http://en.wikipedia.org/wiki/Uniform_Type_Identifier */ private String uti = ""; - /** - * Documentation Links - */ + /** Documentation Links */ private List links = Collections.emptyList(); - /** - * Description of this media type. - */ + /** Description of this media type. */ private String description = ""; - /** - * The magics associated to this Mime-Type - */ + /** The magics associated to this Mime-Type */ private List magics = null; - /** - * The root-XML associated to this Mime-Type - */ + /** The root-XML associated to this Mime-Type */ private List rootXML = null; - /** - * All known file extensions of this type, in order of preference - * (best first). 
- */ + + /** All known file extensions of this type, in order of preference (best first). */ private List extensions = null; + /** - * Whether this mime-type is used for server-side scripts, - * and thus cannot reliably be used for filename-based type detection + * Whether this mime-type is used for server-side scripts, and thus cannot reliably be used for + * filename-based type detection */ private boolean isInterpreted = false; /** - * Creates a media type with the give name and containing media type - * registry. The name is expected to be valid and normalized to lower - * case. This constructor should only be called by - * {@link MimeTypes#forName(String)} to keep the media type registry - * up to date. + * Creates a media type with the give name and containing media type registry. The name is + * expected to be valid and normalized to lower case. This constructor should only be called by + * {@link MimeTypes#forName(String)} to keep the media type registry up to date. * * @param type normalized media type name */ @@ -96,9 +76,9 @@ public final class MimeType implements Comparable, Serializable { } /** - * Checks that the given string is a valid Internet media type name - * based on rules from RFC 2054 section 5.3. For validation purposes the - * rules can be simplified to the following: + * Checks that the given string is a valid Internet media type name based on rules from RFC 2054 + * section 5.3. For validation purposes the rules can be simplified to the following: + * *

      * name := token "/" token
      * token := 1*<any (US-ASCII) CHAR except SPACE, CTLs, or tspecials>
@@ -107,8 +87,8 @@ public final class MimeType implements Comparable, Serializable {
      * 
* * @param name name string - * @return true if the string is a valid media type name, - * false otherwise + * @return true if the string is a valid media type name, false + * otherwise */ public static boolean isValid(String name) { if (name == null) { @@ -118,9 +98,22 @@ public static boolean isValid(String name) { boolean slash = false; for (int i = 0; i < name.length(); i++) { char ch = name.charAt(i); - if (ch <= ' ' || ch >= 127 || ch == '(' || ch == ')' || ch == '<' || ch == '>' || - ch == '@' || ch == ',' || ch == ';' || ch == ':' || ch == '\\' || ch == '"' || - ch == '[' || ch == ']' || ch == '?' || ch == '=') { + if (ch <= ' ' + || ch >= 127 + || ch == '(' + || ch == ')' + || ch == '<' + || ch == '>' + || ch == '@' + || ch == ',' + || ch == ';' + || ch == ':' + || ch == '\\' + || ch == '"' + || ch == '[' + || ch == ']' + || ch == '?' + || ch == '=') { return false; } else if (ch == '/') { if (slash || i == 0 || i + 1 == name.length()) { @@ -171,7 +164,6 @@ public void setDescription(String description) { this.description = description; } - /** * Returns an acronym for this mime type. * @@ -197,7 +189,8 @@ void setAcronym(String v) { * Get the UTI for this mime type. 
* * @return The Uniform Type Identifier - * @see http://en.wikipedia.org/wiki/Uniform_Type_Identifier + * @see http://en.wikipedia.org/wiki/Uniform_Type_Identifier */ public String getUniformTypeIdentifier() { return uti; @@ -239,7 +232,6 @@ void addLink(URI link) { links = Collections.unmodifiableList(copy); } - /** * Add some rootXML info to this mime-type * @@ -308,9 +300,7 @@ public boolean matches(byte[] data) { return matchesMagic(data); } - /** - * whether the type is used as a server-side scripting technology - */ + /** whether the type is used as a server-side scripting technology */ boolean isInterpreted() { return isInterpreted; } @@ -323,7 +313,7 @@ public int compareTo(MimeType mime) { return type.compareTo(mime.type); } - //----------------------------------------------------------< Comparable > + // ----------------------------------------------------------< Comparable > public boolean equals(Object o) { if (o instanceof MimeType) { @@ -334,7 +324,7 @@ public boolean equals(Object o) { return false; } - //--------------------------------------------------------------< Object > + // --------------------------------------------------------------< Object > public int hashCode() { return type.hashCode(); @@ -350,9 +340,9 @@ public String toString() { } /** - * Returns the preferred file extension of this type, or an empty string - * if no extensions are known. Use the {@link #getExtensions()} method to - * get the full list of known extensions of this type. + * Returns the preferred file extension of this type, or an empty string if no extensions are + * known. Use the {@link #getExtensions()} method to get the full list of known extensions of + * this type. * * @return preferred file extension or empty string * @since Apache Tika 0.9 @@ -395,15 +385,10 @@ void addExtension(String extension) { } } - /** - * Defines a RootXML description. RootXML is made of a localName and/or a - * namespaceURI. - */ + /** Defines a RootXML description. 
RootXML is made of a localName and/or a namespaceURI. */ static class RootXML implements Serializable { - /** - * Serial version UID. - */ + /** Serial version UID. */ private static final long serialVersionUID = 5140496601491000730L; private MimeType type = null; @@ -423,7 +408,7 @@ static class RootXML implements Serializable { } boolean matches(String namespaceURI, String localName) { - //Compare namespaces + // Compare namespaces if (!isEmpty(this.namespaceURI)) { if (!this.namespaceURI.equals(namespaceURI)) { return false; @@ -436,7 +421,7 @@ boolean matches(String namespaceURI, String localName) { } } - //Compare root element's local name + // Compare root element's local name if (!isEmpty(this.localName)) { return this.localName.equals(localName); } else { @@ -446,9 +431,7 @@ boolean matches(String namespaceURI, String localName) { } } - /** - * Checks if a string is null or empty. - */ + /** Checks if a string is null or empty. */ private boolean isEmpty(String str) { return (str == null) || (str.equals("")); } @@ -469,5 +452,4 @@ public String toString() { return type + ", " + namespaceURI + ", " + localName; } } - } diff --git a/tika-core/src/main/java/org/apache/tika/mime/MimeTypeException.java b/tika-core/src/main/java/org/apache/tika/mime/MimeTypeException.java index 31bc8a1400..1df02808d8 100644 --- a/tika-core/src/main/java/org/apache/tika/mime/MimeTypeException.java +++ b/tika-core/src/main/java/org/apache/tika/mime/MimeTypeException.java @@ -18,9 +18,7 @@ import org.apache.tika.exception.TikaException; -/** - * A class to encapsulate MimeType related exceptions. - */ +/** A class to encapsulate MimeType related exceptions. */ public class MimeTypeException extends TikaException { /** @@ -33,14 +31,12 @@ public MimeTypeException(String message) { } /** - * Constructs a MimeTypeException with the specified detail message - * and root cause. + * Constructs a MimeTypeException with the specified detail message and root cause. 
* * @param message the detail message. - * @param cause root cause + * @param cause root cause */ public MimeTypeException(String message, Throwable cause) { super(message, cause); } - } diff --git a/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java b/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java index 10ef6cbc7c..852be15203 100644 --- a/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java +++ b/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java @@ -31,9 +31,7 @@ import java.util.Locale; import java.util.Map; import javax.xml.namespace.QName; - import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream; - import org.apache.tika.Tika; import org.apache.tika.detect.Detector; import org.apache.tika.detect.TextDetector; @@ -42,76 +40,62 @@ import org.apache.tika.metadata.TikaCoreProperties; /** - * This class is a MimeType repository. It gathers a set of MimeTypes and - * enables to retrieves a content-type from its name, from a file name, or from - * a magic character sequence. - *

- * The MIME type detection methods that take an {@link InputStream} as - * an argument will never reads more than {@link #getMinLength()} bytes - * from the stream. Also the given stream is never - * {@link InputStream#close() closed}, {@link InputStream#mark(int) marked}, - * or {@link InputStream#reset() reset} by the methods. Thus a client can - * use the {@link InputStream#markSupported() mark feature} of the stream - * (if available) to restore the stream back to the state it was before type - * detection if it wants to process the stream based on the detected type. + * This class is a MimeType repository. It gathers a set of MimeTypes and enables to retrieves a + * content-type from its name, from a file name, or from a magic character sequence. + * + *

The MIME type detection methods that take an {@link InputStream} as an argument will never + * reads more than {@link #getMinLength()} bytes from the stream. Also the given stream is never + * {@link InputStream#close() closed}, {@link InputStream#mark(int) marked}, or {@link + * InputStream#reset() reset} by the methods. Thus a client can use the {@link + * InputStream#markSupported() mark feature} of the stream (if available) to restore the stream back + * to the state it was before type detection if it wants to process the stream based on the detected + * type. */ public final class MimeTypes implements Detector, Serializable { - /** - * Name of the {@link #rootMimeType root} type, application/octet-stream. - */ + /** Name of the {@link #rootMimeType root} type, application/octet-stream. */ public static final String OCTET_STREAM = "application/octet-stream"; - /** - * Name of the {@link #textMimeType text} type, text/plain. - */ + + /** Name of the {@link #textMimeType text} type, text/plain. */ public static final String PLAIN_TEXT = "text/plain"; - /** - * Name of the {@link #xmlMimeType xml} type, application/xml. - */ + + /** Name of the {@link #xmlMimeType xml} type, application/xml. */ public static final String XML = "application/xml"; - /** - * Serial version UID. - */ + + /** Serial version UID. */ private static final long serialVersionUID = -1350863170146349036L; + private static final Map CLASSLOADER_SPECIFIC_DEFAULT_TYPES = new HashMap<>(); private static MimeTypes DEFAULT_TYPES = null; - /** - * Root type, application/octet-stream. - */ + + /** Root type, application/octet-stream. */ private final MimeType rootMimeType; + private final List rootMimeTypeL; - /** - * Text type, text/plain. - */ + + /** Text type, text/plain. 
*/ private final MimeType textMimeType; - /** - * html type, text/html - */ + + /** html type, text/html */ private final MimeType htmlMimeType; - /** - * xml type, application/xml - */ + + /** xml type, application/xml */ private final MimeType xmlMimeType; - /** - * Registered media types and their aliases. - */ + + /** Registered media types and their aliases. */ private final MediaTypeRegistry registry = new MediaTypeRegistry(); - /** - * All the registered MimeTypes indexed on their canonical names - */ + + /** All the registered MimeTypes indexed on their canonical names */ private final Map types = new HashMap<>(); - /** - * The patterns matcher - */ + + /** The patterns matcher */ private final Patterns patterns = new Patterns(registry); - /** - * Sorted list of all registered magics - */ + + /** Sorted list of all registered magics */ private final List magics = new ArrayList<>(); - /** - * Sorted list of all registered rootXML - */ + + /** Sorted list of all registered rootXML */ private final List xmls = new ArrayList<>(); public MimeTypes() { @@ -128,8 +112,8 @@ public MimeTypes() { } /** - * Get the default MimeTypes. This includes all the build in - * media types, and any custom override ones present. + * Get the default MimeTypes. This includes all the build in media types, and any custom + * override ones present. * * @return MimeTypes default type registry */ @@ -138,8 +122,8 @@ public static synchronized MimeTypes getDefaultMimeTypes() { } /** - * Get the default MimeTypes. This includes all the built-in - * media types, and any custom override ones present. + * Get the default MimeTypes. This includes all the built-in media types, and any custom + * override ones present. 
* * @param classLoader to use, if not the default * @return MimeTypes default type registry @@ -152,8 +136,9 @@ public static synchronized MimeTypes getDefaultMimeTypes(ClassLoader classLoader if (types == null) { try { - types = MimeTypesFactory - .create("tika-mimetypes.xml", "custom-mimetypes.xml", classLoader); + types = + MimeTypesFactory.create( + "tika-mimetypes.xml", "custom-mimetypes.xml", classLoader); } catch (MimeTypeException e) { throw new RuntimeException("Unable to parse the default media type registry", e); } catch (IOException e) { @@ -170,8 +155,8 @@ public static synchronized MimeTypes getDefaultMimeTypes(ClassLoader classLoader } /** - * Find the Mime Content Type of a document from its name. - * Returns application/octet-stream if no better match is found. + * Find the Mime Content Type of a document from its name. Returns application/octet-stream if + * no better match is found. * * @param name of the document to analyze. * @return the Mime Content Type of the specified document name @@ -192,13 +177,13 @@ public MimeType getMimeType(String name) { } /** - * Find the Mime Content Type of a document stored in the given file. - * Returns application/octet-stream if no better match is found. + * Find the Mime Content Type of a document stored in the given file. Returns + * application/octet-stream if no better match is found. * * @param file file to analyze * @return the Mime Content Type of the specified document * @throws MimeTypeException if the type can't be detected - * @throws IOException if the file can't be read + * @throws IOException if the file can't be read * @deprecated Use {@link Tika#detect(File)} instead */ @Deprecated @@ -207,16 +192,14 @@ public MimeType getMimeType(File file) throws MimeTypeException, IOException { } /** - * Returns the MIME type that best matches the given first few bytes - * of a document stream. Returns application/octet-stream if no better - * match is found. - *

- * If multiple matches are found, the best (highest priority) matching - * type is returned. If multiple matches are found with the same priority, - * then all of these are returned. - *

- * The given byte array is expected to be at least {@link #getMinLength()} - * long, or shorter only if the document stream itself is shorter. + * Returns the MIME type that best matches the given first few bytes of a document stream. + * Returns application/octet-stream if no better match is found. + * + *

If multiple matches are found, the best (highest priority) matching type is returned. If + * multiple matches are found with the same priority, then all of these are returned. + * + *

The given byte array is expected to be at least {@link #getMinLength()} long, or shorter + * only if the document stream itself is shorter. * * @param data first few bytes of a document stream * @return matching MIME type @@ -248,15 +231,15 @@ List getMimeType(byte[] data) { // When detecting generic XML (or possibly XHTML), // extract the root element and match it against known types - if ("application/xml".equals(matched.getName()) || - "text/html".equals(matched.getName())) { + if ("application/xml".equals(matched.getName()) + || "text/html".equals(matched.getName())) { XmlRootExtractor extractor = new XmlRootExtractor(); QName rootElement = extractor.extractRootElement(data); if (rootElement != null) { for (MimeType type : xmls) { - if (type.matchesXML(rootElement.getNamespaceURI(), - rootElement.getLocalPart())) { + if (type.matchesXML( + rootElement.getNamespaceURI(), rootElement.getLocalPart())) { result.set(i, type); break; } @@ -292,7 +275,8 @@ List getMimeType(byte[] data) { // Finally, assume plain text if no control bytes are found try { TextDetector detector = new TextDetector(getMinLength()); - UnsynchronizedByteArrayInputStream stream = new UnsynchronizedByteArrayInputStream(data); + UnsynchronizedByteArrayInputStream stream = + new UnsynchronizedByteArrayInputStream(data); MimeType type = forName(detector.detect(stream, new Metadata()).toString()); return Collections.singletonList(type); } catch (Exception e) { @@ -301,13 +285,11 @@ List getMimeType(byte[] data) { } /** - * Reads the first {@link #getMinLength()} bytes from the given stream. - * If the stream is shorter, then the entire content of the stream is - * returned. - *

- * The given stream is never {@link InputStream#close() closed}, - * {@link InputStream#mark(int) marked}, or - * {@link InputStream#reset() reset} by this method. + * Reads the first {@link #getMinLength()} bytes from the given stream. If the stream is + * shorter, then the entire content of the stream is returned. + * + *

The given stream is never {@link InputStream#close() closed}, {@link InputStream#mark(int) + * marked}, or {@link InputStream#reset() reset} by this method. * * @param stream stream to be read * @return first {@link #getMinLength()} (or fewer) bytes of the stream @@ -336,9 +318,8 @@ byte[] readMagicHeader(InputStream stream) throws IOException { } /** - * Returns the registered media type with the given name (or alias). - * The named media type is automatically registered (and returned) if - * it doesn't already exist. + * Returns the registered media type with the given name (or alias). The named media type is + * automatically registered (and returned) if it doesn't already exist. * * @param name media type name (case-insensitive) * @return the registered media type with the given name or alias @@ -371,18 +352,18 @@ public MimeType forName(String name) throws MimeTypeException { /** * Returns the registered, normalised media type with the given name (or alias). * - *

Unlike {@link #forName(String)}, this function will not create a - * new MimeType and register it. Instead, null will be returned if - * there is no definition available for the given name. + *

Unlike {@link #forName(String)}, this function will not create a new MimeType and + * register it. Instead, null will be returned if there is no definition available + * for the given name. * - *

Also, unlike {@link #forName(String)}, this function may return a - * mime type that has fewer parameters than were included in the supplied name. - * If the registered mime type has parameters (e.g. - * application/dita+xml;format=map), then those will be maintained. - * However, if the supplied name has paramenters that the registered mime - * type does not (e.g. application/xml; charset=UTF-8 as a name, - * compared to just application/xml for the type in the registry), - * then those parameters will not be included in the returned type. + *

Also, unlike {@link #forName(String)}, this function may return a mime type that has fewer + * parameters than were included in the supplied name. If the registered mime type has + * parameters (e.g. application/dita+xml;format=map), then those will be + * maintained. However, if the supplied name has paramenters that the registered mime + * type does not (e.g. + * application/xml; charset=UTF-8 as a name, compared to just application/xml + * for the type in the registry), then those parameters will not be included in the + * returned type. * * @param name media type name (case-insensitive) * @return the registered media type with the given name or alias, or null if not found @@ -410,10 +391,10 @@ public synchronized void setSuperType(MimeType type, MediaType parent) { } /** - * Adds an alias for the given media type. This method should only - * be called from {@link MimeType#addAlias(String)}. + * Adds an alias for the given media type. This method should only be called from {@link + * MimeType#addAlias(String)}. * - * @param type media type + * @param type media type * @param alias media type alias (normalized to lower case) */ synchronized void addAlias(MimeType type, MediaType alias) { @@ -421,10 +402,10 @@ synchronized void addAlias(MimeType type, MediaType alias) { } /** - * Adds a file name pattern for the given media type. Assumes that the - * pattern being added is not a JDK standard regular expression. + * Adds a file name pattern for the given media type. Assumes that the pattern being added is + * not a JDK standard regular expression. * - * @param type media type + * @param type media type * @param pattern file name pattern * @throws MimeTypeException if the pattern conflicts with existing ones */ @@ -433,16 +414,14 @@ public void addPattern(MimeType type, String pattern) throws MimeTypeException { } /** - * Adds a file name pattern for the given media type. 
The caller can specify - * whether the pattern being added is or is not a JDK standard - * regular expression via the isRegex parameter. If the value - * is set to true, then a JDK standard regex is assumed, otherwise the - * freedesktop glob type is assumed. + * Adds a file name pattern for the given media type. The caller can specify whether the pattern + * being added is or is not a JDK standard regular expression via the + * isRegex parameter. If the value is set to true, then a JDK standard regex is assumed, + * otherwise the freedesktop glob type is assumed. * - * @param type media type + * @param type media type * @param pattern file name pattern - * @param isRegex set to true if JDK std regexs are desired, otherwise set to - * false. + * @param isRegex set to true if JDK std regexs are desired, otherwise set to false. * @throws MimeTypeException if the pattern conflicts with existing ones. */ public void addPattern(MimeType type, String pattern, boolean isRegex) @@ -455,8 +434,8 @@ public MediaTypeRegistry getMediaTypeRegistry() { } /** - * Return the minimum length of data to provide to analyzing methods based - * on the document's content in order to check all the known MimeTypes. + * Return the minimum length of data to provide to analyzing methods based on the document's + * content in order to check all the known MimeTypes. * * @return the minimum length of data to provide. * @see #getMimeType(byte[]) @@ -487,10 +466,7 @@ void add(MimeType type) { } } - /** - * Called after all configured types have been loaded. - * Initializes the magics and xmls sets. - */ + /** Called after all configured types have been loaded. Initializes the magics and xmls sets. */ void init() { for (MimeType type : types.values()) { magics.addAll(type.getMagics()); @@ -503,14 +479,13 @@ void init() { } /** - * Automatically detects the MIME type of a document based on magic - * markers in the stream prefix and any given metadata hints. - *

- * The given stream is expected to support marks, so that this method - * can reset the stream to the position it was in before this method - * was called. + * Automatically detects the MIME type of a document based on magic markers in the stream prefix + * and any given metadata hints. + * + *

The given stream is expected to support marks, so that this method can reset the stream to + * the position it was in before this method was called. * - * @param input document stream, or null + * @param input document stream, or null * @param metadata metadata hints * @return MIME type of the document * @throws IOException if the document stream could not be read @@ -585,18 +560,17 @@ public MediaType detect(InputStream input, Metadata metadata) throws IOException } /** - * Use the MimeType hint to try to clarify or specialise the current - * possible types list. - * If the hint is a specialised form, use that instead - * If there are multiple possible types, use the hint to select one + * Use the MimeType hint to try to clarify or specialise the current possible types list. If the + * hint is a specialised form, use that instead If there are multiple possible types, use the + * hint to select one */ private List applyHint(List possibleTypes, MimeType hint) { if (possibleTypes == null || possibleTypes.isEmpty()) { return Collections.singletonList(hint); } else { for (final MimeType type : possibleTypes) { - if (hint.equals(type) || - registry.isSpecializationOf(hint.getType(), type.getType())) { + if (hint.equals(type) + || registry.isSpecializationOf(hint.getType(), type.getType())) { // Use just this type return Collections.singletonList(hint); } diff --git a/tika-core/src/main/java/org/apache/tika/mime/MimeTypesFactory.java b/tika-core/src/main/java/org/apache/tika/mime/MimeTypesFactory.java index afec1f1993..936334d1a8 100644 --- a/tika-core/src/main/java/org/apache/tika/mime/MimeTypesFactory.java +++ b/tika-core/src/main/java/org/apache/tika/mime/MimeTypesFactory.java @@ -23,22 +23,18 @@ import java.util.ArrayList; import java.util.Collections; import java.util.List; - import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.w3c.dom.Document; -/** - * Creates instances of MimeTypes. - */ +/** Creates instances of MimeTypes. 
*/ public class MimeTypesFactory { private static final Logger LOG = LoggerFactory.getLogger(MimeTypesFactory.class); - /** - * System property to set a path to an additional external custom mimetypes - * XML file to be loaded. + * System property to set a path to an additional external custom mimetypes XML file to be + * loaded. */ public static final String CUSTOM_MIMES_SYS_PROP = "tika.custom-mimetypes"; @@ -64,10 +60,10 @@ public static MimeTypes create(Document document) throws MimeTypeException { } /** - * Creates and returns a MimeTypes instance from the specified input stream. - * Does not close the input stream(s). + * Creates and returns a MimeTypes instance from the specified input stream. Does not close the + * input stream(s). * - * @throws IOException if the stream can not be read + * @throws IOException if the stream can not be read * @throws MimeTypeException if the type configuration is invalid */ public static MimeTypes create(InputStream... inputStreams) @@ -85,16 +81,15 @@ public static MimeTypes create(InputStream... inputStreams) * @see #create(InputStream...) */ public static MimeTypes create(InputStream stream) throws IOException, MimeTypeException { - return create(new InputStream[]{stream}); + return create(new InputStream[] {stream}); } /** - * Creates and returns a MimeTypes instance from the resource - * at the location specified by the URL. Opens and closes the - * InputStream from the URL. - * If multiple URLs are supplied, then they are loaded in turn. + * Creates and returns a MimeTypes instance from the resource at the location specified by the + * URL. Opens and closes the InputStream from the URL. If multiple URLs are supplied, then they + * are loaded in turn. * - * @throws IOException if the URL can not be accessed + * @throws IOException if the URL can not be accessed * @throws MimeTypeException if the type configuration is invalid */ public static MimeTypes create(URL... 
urls) throws IOException, MimeTypeException { @@ -116,14 +111,14 @@ public static MimeTypes create(URL... urls) throws IOException, MimeTypeExceptio * @see #create(URL...) */ public static MimeTypes create(URL url) throws IOException, MimeTypeException { - return create(new URL[]{url}); + return create(new URL[] {url}); } /** - * Creates and returns a MimeTypes instance from the specified file path, - * as interpreted by the class loader in getResource(). + * Creates and returns a MimeTypes instance from the specified file path, as interpreted by the + * class loader in getResource(). * - * @throws IOException if the file can not be accessed + * @throws IOException if the file can not be accessed * @throws MimeTypeException if the type configuration is invalid */ public static MimeTypes create(String filePath) throws IOException, MimeTypeException { @@ -131,15 +126,13 @@ public static MimeTypes create(String filePath) throws IOException, MimeTypeExce } /** - * Creates and returns a MimeTypes instance. The core mimetypes - * will be loaded from the specified file path, and any custom - * override mimetypes found will loaded afterwards. - * The file paths will be interpreted by the default class loader in - * getResource(). + * Creates and returns a MimeTypes instance. The core mimetypes will be loaded from the + * specified file path, and any custom override mimetypes found will loaded afterwards. The file + * paths will be interpreted by the default class loader in getResource(). 
* - * @param coreFilePath The main MimeTypes file to load + * @param coreFilePath The main MimeTypes file to load * @param extensionFilePath The name of extension MimeType files to load afterwards - * @throws IOException if the file can not be accessed + * @throws IOException if the file can not be accessed * @throws MimeTypeException if the type configuration is invalid */ public static MimeTypes create(String coreFilePath, String extensionFilePath) @@ -148,21 +141,19 @@ public static MimeTypes create(String coreFilePath, String extensionFilePath) } /** - * Creates and returns a MimeTypes instance. The core mimetypes - * will be loaded from the specified file path, and any custom - * override mimetypes found will loaded afterwards. - * The file paths will be interpreted by the specified class - * loader in getResource(). - * It will also load custom mimetypes from the system property - * {@link #CUSTOM_MIMES_SYS_PROP}, if specified. + * Creates and returns a MimeTypes instance. The core mimetypes will be loaded from the + * specified file path, and any custom override mimetypes found will loaded afterwards. The file + * paths will be interpreted by the specified class loader in getResource(). It will also load + * custom mimetypes from the system property {@link #CUSTOM_MIMES_SYS_PROP}, if specified. 
* - * @param coreFilePath The main MimeTypes file to load + * @param coreFilePath The main MimeTypes file to load * @param extensionFilePath The name of extension MimeType files to load afterwards - * @throws IOException if the file can not be accessed + * @throws IOException if the file can not be accessed * @throws MimeTypeException if the type configuration is invalid */ - public static MimeTypes create(String coreFilePath, String extensionFilePath, - ClassLoader classLoader) throws IOException, MimeTypeException { + public static MimeTypes create( + String coreFilePath, String extensionFilePath, ClassLoader classLoader) + throws IOException, MimeTypeException { // If no specific classloader was requested, use our own class's one if (classLoader == null) { classLoader = MimeTypesReader.class.getClassLoader(); @@ -174,17 +165,14 @@ public static MimeTypes create(String coreFilePath, String extensionFilePath, // Get the core URL, and all the extensions URLs URL coreURL = classLoader.getResource(classPrefix + coreFilePath); - List extensionURLs = - Collections.list(classLoader.getResources(extensionFilePath)); + List extensionURLs = Collections.list(classLoader.getResources(extensionFilePath)); // Swap that into an Array, and process List urls = new ArrayList<>(); urls.add(coreURL); urls.addAll(extensionURLs); if (LOG.isDebugEnabled()) { - urls.stream().forEach( u -> - LOG.debug("Loaded custom mimes file: {}", u) - ); + urls.stream().forEach(u -> LOG.debug("Loaded custom mimes file: {}", u)); } String customMimesPath = System.getProperty(CUSTOM_MIMES_SYS_PROP); @@ -197,7 +185,9 @@ public static MimeTypes create(String coreFilePath, String extensionFilePath, URL externalURL = externalFile.toURI().toURL(); urls.add(externalURL); if (LOG.isDebugEnabled()) { - LOG.debug("Loaded external custom mimetypes file: {}", externalFile.getAbsolutePath()); + LOG.debug( + "Loaded external custom mimetypes file: {}", + externalFile.getAbsolutePath()); } } diff --git 
a/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java b/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java index 76bc5c7525..a8eacdfeac 100644 --- a/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java +++ b/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java @@ -34,8 +34,9 @@ import javax.xml.transform.TransformerException; import javax.xml.transform.dom.DOMSource; import javax.xml.transform.sax.SAXResult; - import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream; +import org.apache.tika.exception.TikaException; +import org.apache.tika.utils.XMLReaderUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.w3c.dom.Document; @@ -44,9 +45,6 @@ import org.xml.sax.SAXException; import org.xml.sax.helpers.DefaultHandler; -import org.apache.tika.exception.TikaException; -import org.apache.tika.utils.XMLReaderUtils; - /** * A reader for XML files compliant with the freedesktop MIME-info DTD. * @@ -103,21 +101,21 @@ * type CDATA #REQUIRED> * ]> *

- *

- * In addition to the standard fields, this will also read two Tika specific fields: - * - link - * - uti * - * @see https://freedesktop.org/wiki/Specifications/shared-mime-info-spec/ + *

In addition to the standard fields, this will also read two Tika specific fields: - link - uti + * + * @see https://freedesktop.org/wiki/Specifications/shared-mime-info-spec/ */ public class MimeTypesReader extends DefaultHandler implements MimeTypesReaderMetKeys { private static final ReentrantReadWriteLock READ_WRITE_LOCK = new ReentrantReadWriteLock(); - /** - * Parser pool size - */ + + /** Parser pool size */ private static int POOL_SIZE = 10; + private static ArrayBlockingQueue SAX_PARSERS = new ArrayBlockingQueue<>(POOL_SIZE); static Logger LOG = LoggerFactory.getLogger(MimeTypesReader.class); + static { try { setPoolSize(POOL_SIZE); @@ -128,9 +126,7 @@ public class MimeTypesReader extends DefaultHandler implements MimeTypesReaderMe protected final MimeTypes types; - /** - * Current type - */ + /** Current type */ protected MimeType type = null; protected int priority; @@ -143,9 +139,8 @@ protected MimeTypesReader(MimeTypes types) { } /** - * Acquire a SAXParser from the pool; create one if it - * doesn't exist. Make sure to {@link #releaseParser(SAXParser)} in - * a finally block every time you call this. + * Acquire a SAXParser from the pool; create one if it doesn't exist. Make sure to {@link + * #releaseParser(SAXParser)} in a finally block every time you call this. * * @return a SAXParser * @throws TikaException @@ -160,7 +155,6 @@ private static SAXParser acquireSAXParser() throws TikaException { throw new TikaException("interrupted while waiting for SAXParser", e); } finally { READ_WRITE_LOCK.readLock().unlock(); - } if (parser != null) { return parser; @@ -177,11 +171,11 @@ private static void releaseParser(SAXParser parser) { try { parser.reset(); } catch (UnsupportedOperationException e) { - //ignore + // ignore } try { READ_WRITE_LOCK.readLock().lock(); - //if there are extra parsers (e.g. after a reset of the pool to a smaller size), + // if there are extra parsers (e.g. 
after a reset of the pool to a smaller size), // this parser will not be added and will then be gc'd SAX_PARSERS.offer(parser); } finally { @@ -196,9 +190,9 @@ private static void releaseParser(SAXParser parser) { */ public static void setPoolSize(int poolSize) throws TikaException { try { - //stop the world with a write lock - //parsers that are currently in use will be offered, but not - //accepted and will be gc'd + // stop the world with a write lock + // parsers that are currently in use will be offered, but not + // accepted and will be gc'd READ_WRITE_LOCK.writeLock().lock(); SAX_PARSERS = new ArrayBlockingQueue<>(poolSize); for (int i = 0; i < poolSize; i++) { @@ -216,8 +210,10 @@ private static SAXParser newSAXParser() throws TikaException { try { factory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true); } catch (ParserConfigurationException | SAXException e) { - LOG.warn("can't set secure processing feature on: " + factory.getClass() + - ". User assumes responsibility for consequences."); + LOG.warn( + "can't set secure processing feature on: " + + factory.getClass() + + ". 
User assumes responsibility for consequences."); } try { return factory.newSAXParser(); @@ -278,8 +274,10 @@ public void startElement(String uri, String localName, String qName, Attributes } else if (SUB_CLASS_OF_TAG.equals(qName)) { String parent = attributes.getValue(SUB_CLASS_TYPE_ATTR); types.setSuperType(type, MediaType.parse(parent)); - } else if (ACRONYM_TAG.equals(qName) || COMMENT_TAG.equals(qName) || - TIKA_LINK_TAG.equals(qName) || TIKA_UTI_TAG.equals(qName)) { + } else if (ACRONYM_TAG.equals(qName) + || COMMENT_TAG.equals(qName) + || TIKA_LINK_TAG.equals(qName) + || TIKA_UTI_TAG.equals(qName)) { characters = new StringBuilder(); } else if (GLOB_TAG.equals(qName)) { String pattern = attributes.getValue(PATTERN_ATTR); @@ -297,8 +295,11 @@ public void startElement(String uri, String localName, String qName, Attributes type.addRootXML(namespace, name); } else if (MATCH_TAG.equals(qName)) { if (attributes.getValue(MATCH_MINSHOULDMATCH_ATTR) != null) { - current = new ClauseRecord(new MinShouldMatchVal( - Integer.parseInt(attributes.getValue(MATCH_MINSHOULDMATCH_ATTR)))); + current = + new ClauseRecord( + new MinShouldMatchVal( + Integer.parseInt( + attributes.getValue(MATCH_MINSHOULDMATCH_ATTR)))); } else { String kind = attributes.getValue(MATCH_TYPE_ATTR); String offset = attributes.getValue(MATCH_OFFSET_ATTR); @@ -360,20 +361,25 @@ public void characters(char[] ch, int start, int length) { } } - protected void handleMimeError(String input, MimeTypeException ex, String qName, - Attributes attributes) throws SAXException { + protected void handleMimeError( + String input, MimeTypeException ex, String qName, Attributes attributes) + throws SAXException { throw new SAXException(ex); } - protected void handleGlobError(MimeType type, String pattern, MimeTypeException ex, - String qName, Attributes attributes) throws SAXException { + protected void handleGlobError( + MimeType type, + String pattern, + MimeTypeException ex, + String qName, + Attributes 
attributes) + throws SAXException { throw new SAXException(ex); } /** - * Shim class used during building of actual classes. - * This temporarily holds the value of the minShouldMatchClause - * so that the actual MinShouldMatchClause can have a cleaner/immutable + * Shim class used during building of actual classes. This temporarily holds the value of the + * minShouldMatchClause so that the actual MinShouldMatchClause can have a cleaner/immutable * initialization. */ private static class MinShouldMatchVal implements Clause { @@ -442,6 +448,5 @@ public void stop() { public List getClauses() { return subclauses; } - } } diff --git a/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReaderMetKeys.java b/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReaderMetKeys.java index df35134571..54fb57f01a 100644 --- a/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReaderMetKeys.java +++ b/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReaderMetKeys.java @@ -16,9 +16,7 @@ */ package org.apache.tika.mime; -/** - * Met Keys used by the {@link MimeTypesReader}. - */ +/** Met Keys used by the {@link MimeTypesReader}. */ public interface MimeTypesReaderMetKeys { String MIME_INFO_TAG = "mime-info"; diff --git a/tika-core/src/main/java/org/apache/tika/mime/MinShouldMatchClause.java b/tika-core/src/main/java/org/apache/tika/mime/MinShouldMatchClause.java index 0a18f4e703..97c642368d 100644 --- a/tika-core/src/main/java/org/apache/tika/mime/MinShouldMatchClause.java +++ b/tika-core/src/main/java/org/apache/tika/mime/MinShouldMatchClause.java @@ -25,9 +25,9 @@ class MinShouldMatchClause implements Clause { /** * Minimum number of clauses that need to match. - *

- * Throws IllegalArgumentException if min <= 0, - * if clauses is null or has size == 0, or if min > clauses.size() + * + *

Throws IllegalArgumentException if min <= 0, if clauses is null or has size == 0, or if + * min > clauses.size() * * @param min * @param clauses @@ -72,5 +72,4 @@ public int size() { public String toString() { return "minShouldMatch (min: " + min + ") " + clauses; } - } diff --git a/tika-core/src/main/java/org/apache/tika/mime/OrClause.java b/tika-core/src/main/java/org/apache/tika/mime/OrClause.java index 6a2f212f5a..8235062013 100644 --- a/tika-core/src/main/java/org/apache/tika/mime/OrClause.java +++ b/tika-core/src/main/java/org/apache/tika/mime/OrClause.java @@ -46,5 +46,4 @@ public int size() { public String toString() { return "or" + clauses; } - } diff --git a/tika-core/src/main/java/org/apache/tika/mime/Patterns.java b/tika-core/src/main/java/org/apache/tika/mime/Patterns.java index 48c0329f06..ea83c1487d 100644 --- a/tika-core/src/main/java/org/apache/tika/mime/Patterns.java +++ b/tika-core/src/main/java/org/apache/tika/mime/Patterns.java @@ -23,32 +23,23 @@ import java.util.SortedMap; import java.util.TreeMap; -/** - * Defines a MimeType pattern. - */ +/** Defines a MimeType pattern. */ class Patterns implements Serializable { - /** - * Serial version UID. - */ + /** Serial version UID. */ private static final long serialVersionUID = -5778015347278111140L; private final MediaTypeRegistry registry; - /** - * Index of exact name patterns. - */ + /** Index of exact name patterns. */ private final Map names = new HashMap<>(); - /** - * Index of extension patterns of the form "*extension". - */ + /** Index of extension patterns of the form "*extension". */ private final Map extensions = new HashMap<>(); - /** - * Index of generic glob patterns, sorted by length. - */ - private final SortedMap globs = - new TreeMap<>(new LengthComparator()); + + /** Index of generic glob patterns, sorted by length. 
*/ + private final SortedMap globs = new TreeMap<>(new LengthComparator()); + private int minExtensionLength = Integer.MAX_VALUE; private int maxExtensionLength = 0; @@ -71,11 +62,14 @@ public void add(String pattern, boolean isJavaRegex, MimeType type) throws MimeT addGlob(pattern, type); } else { - if (pattern.indexOf('*') == -1 && pattern.indexOf('?') == -1 && - pattern.indexOf('[') == -1) { + if (pattern.indexOf('*') == -1 + && pattern.indexOf('?') == -1 + && pattern.indexOf('[') == -1) { addName(pattern, type); - } else if (pattern.startsWith("*") && pattern.indexOf('*', 1) == -1 && - pattern.indexOf('?') == -1 && pattern.indexOf('[') == -1) { + } else if (pattern.startsWith("*") + && pattern.indexOf('*', 1) == -1 + && pattern.indexOf('?') == -1 + && pattern.indexOf('[') == -1) { String extension = pattern.substring(1); addExtension(extension, type); type.addExtension(extension); @@ -89,8 +83,8 @@ private void addName(String name, MimeType type) throws MimeTypeException { MimeType previous = names.get(name); if (previous == null || registry.isSpecializationOf(previous.getType(), type.getType())) { names.put(name, type); - } else if (previous == type || - registry.isSpecializationOf(type.getType(), previous.getType())) { + } else if (previous == type + || registry.isSpecializationOf(type.getType(), previous.getType())) { // do nothing } else { throw new MimeTypeException("Conflicting name pattern: " + name); @@ -104,8 +98,8 @@ private void addExtension(String extension, MimeType type) throws MimeTypeExcept int length = extension.length(); minExtensionLength = Math.min(minExtensionLength, length); maxExtensionLength = Math.max(maxExtensionLength, length); - } else if (previous == type || - registry.isSpecializationOf(type.getType(), previous.getType())) { + } else if (previous == type + || registry.isSpecializationOf(type.getType(), previous.getType())) { // do nothing } else { throw new MimeTypeException("Conflicting extension pattern: " + extension); @@ -116,8 
+110,8 @@ private void addGlob(String glob, MimeType type) throws MimeTypeException { MimeType previous = globs.get(glob); if (previous == null || registry.isSpecializationOf(previous.getType(), type.getType())) { globs.put(glob, type); - } else if (previous == type || - registry.isSpecializationOf(type.getType(), previous.getType())) { + } else if (previous == type + || registry.isSpecializationOf(type.getType(), previous.getType())) { // do nothing } else { throw new MimeTypeException("Conflicting glob pattern: " + glob); @@ -126,17 +120,15 @@ private void addGlob(String glob, MimeType type) throws MimeTypeException { /** * Find the MimeType corresponding to a resource name. - *

- * It applies the recommendations detailed in FreeDesktop Shared MIME-info - * Database for guessing MimeType from a resource name: It first tries a - * case-sensitive match, then try again with the resource name converted to - * lower-case if that fails. If several patterns match then the longest - * pattern is used. In particular, files with multiple extensions (such as - * Data.tar.gz) match the longest sequence of extensions (eg '*.tar.gz' in - * preference to '*.gz'). Literal patterns (eg, 'Makefile') are matched - * before all others. Patterns beginning with `*.' and containing no other - * special characters (`*?[') are matched before other wildcarded patterns - * (since this covers the majority of the patterns). + * + *

It applies the recommendations detailed in FreeDesktop Shared MIME-info Database for + * guessing MimeType from a resource name: It first tries a case-sensitive match, then try again + * with the resource name converted to lower-case if that fails. If several patterns match then + * the longest pattern is used. In particular, files with multiple extensions (such as + * Data.tar.gz) match the longest sequence of extensions (eg '*.tar.gz' in preference to + * '*.gz'). Literal patterns (eg, 'Makefile') are matched before all others. Patterns beginning + * with `*.' and containing no other special characters (`*?[') are matched before other + * wildcarded patterns (since this covers the majority of the patterns). */ public MimeType matches(String name) { if (name == null) { @@ -189,9 +181,7 @@ private String compile(String glob) { private static final class LengthComparator implements Comparator, Serializable { - /** - * Serial version UID. - */ + /** Serial version UID. */ private static final long serialVersionUID = 8468289702915532359L; public int compare(String a, String b) { @@ -201,7 +191,5 @@ public int compare(String a, String b) { } return diff; } - } - } diff --git a/tika-core/src/main/java/org/apache/tika/mime/ProbabilisticMimeDetectionSelector.java b/tika-core/src/main/java/org/apache/tika/mime/ProbabilisticMimeDetectionSelector.java index 5e33b85795..aba6869a00 100644 --- a/tika-core/src/main/java/org/apache/tika/mime/ProbabilisticMimeDetectionSelector.java +++ b/tika-core/src/main/java/org/apache/tika/mime/ProbabilisticMimeDetectionSelector.java @@ -22,21 +22,17 @@ import java.net.URISyntaxException; import java.util.ArrayList; import java.util.List; - import org.apache.tika.detect.Detector; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; -/** - * Selector for combining different mime detection results - * based on probability - */ +/** Selector for combining different mime detection results based on 
probability */ public class ProbabilisticMimeDetectionSelector implements Detector { private static final long serialVersionUID = 224589862960269260L; - /** - * probability parameters default value - */ + + /** probability parameters default value */ private static final float DEFAULT_MAGIC_TRUST = 0.9f; + private static final float DEFAULT_META_TRUST = 0.8f; private static final float DEFAULT_EXTENSION_TRUST = 0.8f; private final MimeTypes mimeTypes; @@ -58,10 +54,7 @@ public class ProbabilisticMimeDetectionSelector implements Detector { */ private float threshold; - /** - * - ***********************/ - + /** ********************* */ public ProbabilisticMimeDetectionSelector() { this(MimeTypes.getDefaultMimeTypes(), null); } @@ -80,10 +73,14 @@ public ProbabilisticMimeDetectionSelector(final MimeTypes mimeTypes, final Build this.initializeDefaultProbabilityParameters(); this.changeRate = 0.1f; if (builder != null) { - priorMagicFileType = builder.priorMagicFileType == 0f ? priorMagicFileType : - builder.priorMagicFileType; - priorExtensionFileType = builder.priorExtensionFileType == 0f ? priorExtensionFileType : - builder.priorExtensionFileType; + priorMagicFileType = + builder.priorMagicFileType == 0f + ? priorMagicFileType + : builder.priorMagicFileType; + priorExtensionFileType = + builder.priorExtensionFileType == 0f + ? priorExtensionFileType + : builder.priorExtensionFileType; priorMetaFileType = builder.priorMetaFileType == 0f ? 
priorMetaFileType : builder.priorMetaFileType; @@ -99,9 +96,7 @@ public ProbabilisticMimeDetectionSelector(final MimeTypes mimeTypes, final Build } } - /** - * Initilize probability parameters with default values; - */ + /** Initilize probability parameters with default values; */ private void initializeDefaultProbabilityParameters() { priorMagicFileType = 0.5f; priorExtensionFileType = 0.5f; @@ -130,7 +125,7 @@ public MediaType detect(InputStream input, Metadata metadata) throws IOException input.mark(mimeTypes.getMinLength()); try { byte[] prefix = mimeTypes.readMagicHeader(input); - //defensive copy + // defensive copy possibleTypes.addAll(mimeTypes.getMimeType(prefix)); } finally { input.reset(); @@ -186,9 +181,10 @@ public MediaType detect(InputStream input, Metadata metadata) throws IOException return applyProbilities(possibleTypes, extHint, metaHint); } - private MediaType applyProbilities(final List possibleTypes, - final MimeType extMimeType, - final MimeType metadataMimeType) { + private MediaType applyProbilities( + final List possibleTypes, + final MimeType extMimeType, + final MimeType metadataMimeType) { /* initialize some probability variables */ MediaType extensionMediaType_ = extMimeType == null ? 
null : extMimeType.getType(); @@ -231,8 +227,8 @@ private MediaType applyProbilities(final List possibleTypes, } else { // check if each identified type belongs to the same class; if (extensionMediaType_ != null) { - if (extensionMediaType_.equals(magictype) || - registry.isSpecializationOf(extensionMediaType_, magictype)) { + if (extensionMediaType_.equals(magictype) + || registry.isSpecializationOf(extensionMediaType_, magictype)) { // Use just this type possibleTypes.set(i, extMimeType); } else if (registry.isSpecializationOf(magictype, extensionMediaType_)) { @@ -240,8 +236,8 @@ private MediaType applyProbilities(final List possibleTypes, } } if (metaMediaType_ != null) { - if (metaMediaType_.equals(magictype) || - registry.isSpecializationOf(metaMediaType_, magictype)) { + if (metaMediaType_.equals(magictype) + || registry.isSpecializationOf(metaMediaType_, magictype)) { // Use just this type possibleTypes.set(i, metadataMimeType); } else if (registry.isSpecializationOf(magictype, metaMediaType_)) { @@ -269,7 +265,6 @@ private MediaType applyProbilities(final List possibleTypes, * grow as our trust goes down */ mag_neg = mag_neg * (1 + changeRate); - } if (magictype != null && mag_trust != 1) { @@ -387,7 +382,6 @@ private MediaType applyProbilities(final List possibleTypes, } pPrime /= (pPrime + deno); results[0] = pPrime; - } if (maxProb < results[0]) { maxProb = results[0]; @@ -405,7 +399,6 @@ private MediaType applyProbilities(final List possibleTypes, } pPrime /= (pPrime + deno); results[1] = pPrime; - } if (maxProb < results[1]) { maxProb = results[1]; @@ -429,19 +422,15 @@ private MediaType applyProbilities(final List possibleTypes, bestEstimate = extensionMediaType_; } } - } return maxProb < threshold ? 
this.rootMediaType : bestEstimate; - } public MediaTypeRegistry getMediaTypeRegistry() { return this.mimeTypes.getMediaTypeRegistry(); } - /** - * build class for probability parameters setting - */ + /** build class for probability parameters setting */ public static class Builder { /* * the following are the prior probabilities for the file type @@ -512,12 +501,9 @@ public synchronized Builder threshold(final float threshold) { return this; } - /** - * Initialize the MimeTypes with this builder instance - */ + /** Initialize the MimeTypes with this builder instance */ public ProbabilisticMimeDetectionSelector build2() { return new ProbabilisticMimeDetectionSelector(this); } } - } diff --git a/tika-core/src/main/java/org/apache/tika/mime/package-info.java b/tika-core/src/main/java/org/apache/tika/mime/package-info.java index 104dc3acf9..2387683e91 100644 --- a/tika-core/src/main/java/org/apache/tika/mime/package-info.java +++ b/tika-core/src/main/java/org/apache/tika/mime/package-info.java @@ -15,8 +15,6 @@ * limitations under the License. */ -/** - * Media type information. - */ +/** Media type information. */ @aQute.bnd.annotation.Version("1.2.0") package org.apache.tika.mime; diff --git a/tika-core/src/main/java/org/apache/tika/package-info.java b/tika-core/src/main/java/org/apache/tika/package-info.java index cf4352ddcb..77ffacce69 100644 --- a/tika-core/src/main/java/org/apache/tika/package-info.java +++ b/tika-core/src/main/java/org/apache/tika/package-info.java @@ -15,8 +15,6 @@ * limitations under the License. */ -/** - * Apache Tika. - */ +/** Apache Tika. 
*/ @aQute.bnd.annotation.Version("1.0.0") package org.apache.tika; diff --git a/tika-core/src/main/java/org/apache/tika/parser/AbstractEncodingDetectorParser.java b/tika-core/src/main/java/org/apache/tika/parser/AbstractEncodingDetectorParser.java index 2e9f3936fe..19152f24cd 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/AbstractEncodingDetectorParser.java +++ b/tika-core/src/main/java/org/apache/tika/parser/AbstractEncodingDetectorParser.java @@ -20,14 +20,12 @@ import org.apache.tika.detect.DefaultEncodingDetector; import org.apache.tika.detect.EncodingDetector; - /** - * Abstract base class for parsers that use the AutoDetectReader and need - * to use the {@link EncodingDetector} configured by {@link TikaConfig} + * Abstract base class for parsers that use the AutoDetectReader and need to use the {@link + * EncodingDetector} configured by {@link TikaConfig} */ public abstract class AbstractEncodingDetectorParser implements Parser { - private EncodingDetector encodingDetector; public AbstractEncodingDetectorParser() { @@ -39,8 +37,8 @@ public AbstractEncodingDetectorParser(EncodingDetector encodingDetector) { } /** - * Look for an EncodingDetetor in the ParseContext. If it hasn't been - * passed in, use the original EncodingDetector from initialization. + * Look for an EncodingDetetor in the ParseContext. If it hasn't been passed in, use the + * original EncodingDetector from initialization. * * @param parseContext * @return diff --git a/tika-core/src/main/java/org/apache/tika/parser/AbstractExternalProcessParser.java b/tika-core/src/main/java/org/apache/tika/parser/AbstractExternalProcessParser.java index c5c3315f92..128580000c 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/AbstractExternalProcessParser.java +++ b/tika-core/src/main/java/org/apache/tika/parser/AbstractExternalProcessParser.java @@ -20,28 +20,27 @@ import java.util.concurrent.ConcurrentHashMap; /** - * Abstract base class for parsers that call external processes. 
This - * adds one more layer of 'hope' that processes won't be orphaned if - * the jvm has to be restarted. This does not guarantee that the - * processes won't be orphaned in case of, e.g. kill -9, but this - * increases the chances that under normal circumstances or if the jvm - * itself exits, that external processes won't be orphaned. + * Abstract base class for parsers that call external processes. This adds one more layer of 'hope' + * that processes won't be orphaned if the jvm has to be restarted. This does not guarantee that the + * processes won't be orphaned in case of, e.g. kill -9, but this increases the chances that under + * normal circumstances or if the jvm itself exits, that external processes won't be orphaned. * * @since Apache Tika 1.27 */ public abstract class AbstractExternalProcessParser implements Parser { - /** - * Serial version UID. - */ + /** Serial version UID. */ private static final long serialVersionUID = 7186985395903074255L; private static final ConcurrentHashMap PROCESS_MAP = new ConcurrentHashMap<>(); static { - Runtime.getRuntime().addShutdownHook(new Thread(() -> { - PROCESS_MAP.forEachValue(1, Process::destroyForcibly); - })); + Runtime.getRuntime() + .addShutdownHook( + new Thread( + () -> { + PROCESS_MAP.forEachValue(1, Process::destroyForcibly); + })); } protected String register(Process p) { @@ -54,4 +53,3 @@ protected Process release(String id) { return PROCESS_MAP.remove(id); } } - diff --git a/tika-core/src/main/java/org/apache/tika/parser/AbstractParser.java b/tika-core/src/main/java/org/apache/tika/parser/AbstractParser.java index f6017d6c0f..7913f52708 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/AbstractParser.java +++ b/tika-core/src/main/java/org/apache/tika/parser/AbstractParser.java @@ -18,16 +18,14 @@ import java.io.IOException; import java.io.InputStream; - -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.exception.TikaException; import 
org.apache.tika.metadata.Metadata; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; /** - * Abstract base class for new parsers. This method implements the old - * deprecated parse method so subclasses won't have to. + * Abstract base class for new parsers. This method implements the old deprecated parse method so + * subclasses won't have to. * * @deprecated for removal in 4.x * @since Apache Tika 0.10 @@ -35,27 +33,21 @@ @Deprecated public abstract class AbstractParser implements Parser { - /** - * Serial version UID. - */ + /** Serial version UID. */ private static final long serialVersionUID = 7186985395903074255L; /** - * Calls the - * {@link Parser#parse(InputStream, ContentHandler, Metadata, ParseContext)} - * method with an empty {@link ParseContext}. This method exists as a - * leftover from Tika 0.x when the three-argument parse() method still - * existed in the {@link Parser} interface. No new code should call this - * method anymore, it's only here for backwards compatibility. + * Calls the {@link Parser#parse(InputStream, ContentHandler, Metadata, ParseContext)} method + * with an empty {@link ParseContext}. This method exists as a leftover from Tika 0.x when the + * three-argument parse() method still existed in the {@link Parser} interface. No new code + * should call this method anymore, it's only here for backwards compatibility. 
* - * @deprecated use the {@link Parser#parse(InputStream, ContentHandler, - * Metadata, ParseContext)} method instead + * @deprecated use the {@link Parser#parse(InputStream, ContentHandler, Metadata, ParseContext)} + * method instead */ @Deprecated public void parse(InputStream stream, ContentHandler handler, Metadata metadata) throws IOException, SAXException, TikaException { parse(stream, handler, metadata, new ParseContext()); } - } - diff --git a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java index 86eae692a0..d8869aacd3 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java +++ b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java @@ -18,10 +18,6 @@ import java.io.IOException; import java.io.InputStream; - -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.config.TikaConfig; import org.apache.tika.detect.DefaultDetector; import org.apache.tika.detect.Detector; @@ -38,31 +34,23 @@ import org.apache.tika.mime.MediaType; import org.apache.tika.mime.MediaTypeRegistry; import org.apache.tika.sax.SecureContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; public class AutoDetectParser extends CompositeParser { - /** - * Serial version UID - */ + /** Serial version UID */ private static final long serialVersionUID = 6110455808615143122L; - //private final TikaConfig config; - /** - * The type detector used by this parser to auto-detect the type - * of a document. - */ + // private final TikaConfig config; + + /** The type detector used by this parser to auto-detect the type of a document. */ private Detector detector; // always set in the constructor - /** - * Configuration used when initializing a SecureContentHandler - * and the TikaInputStream. - */ + /** Configuration used when initializing a SecureContentHandler and the TikaInputStream. 
*/ private AutoDetectParserConfig autoDetectParserConfig; - /** - * Creates an auto-detecting parser instance using the default Tika - * configuration. - */ + /** Creates an auto-detecting parser instance using the default Tika configuration. */ public AutoDetectParser() { this(TikaConfig.getDefaultConfig()); } @@ -73,10 +61,10 @@ public AutoDetectParser(Detector detector) { } /** - * Creates an auto-detecting parser instance using the specified set of parser. - * This allows one to create a Tika configuration where only a subset of the - * available parsers have their 3rd party jars included, as otherwise the - * use of the default TikaConfig will throw various "ClassNotFound" exceptions. + * Creates an auto-detecting parser instance using the specified set of parser. This allows one + * to create a Tika configuration where only a subset of the available parsers have their 3rd + * party jars included, as otherwise the use of the default TikaConfig will throw various + * "ClassNotFound" exceptions. 
* * @param parsers */ @@ -95,14 +83,13 @@ public AutoDetectParser(TikaConfig config) { setFallback(buildFallbackParser(config)); setDetector(config.getDetector()); setAutoDetectParserConfig(config.getAutoDetectParserConfig()); - } private static Parser buildFallbackParser(TikaConfig config) { Parser fallback = null; Parser p = config.getParser(); if (p instanceof DefaultParser) { - fallback = ((DefaultParser)p).getFallback(); + fallback = ((DefaultParser) p).getFallback(); } else { fallback = new EmptyParser(); } @@ -110,25 +97,27 @@ private static Parser buildFallbackParser(TikaConfig config) { if (config.getAutoDetectParserConfig().getDigesterFactory() == null) { return fallback; } else { - return new DigestingParser(fallback, + return new DigestingParser( + fallback, config.getAutoDetectParserConfig().getDigesterFactory().build(), - config.getAutoDetectParserConfig().getDigesterFactory().isSkipContainerDocument()); + config.getAutoDetectParserConfig() + .getDigesterFactory() + .isSkipContainerDocument()); } - } private static Parser getParser(TikaConfig config) { if (config.getAutoDetectParserConfig().getDigesterFactory() == null) { return config.getParser(); } - return new DigestingParser(config.getParser(), + return new DigestingParser( + config.getParser(), config.getAutoDetectParserConfig().getDigesterFactory().build(), config.getAutoDetectParserConfig().getDigesterFactory().isSkipContainerDocument()); } /** - * Returns the type detector used by this parser to auto-detect the type - * of a document. + * Returns the type detector used by this parser to auto-detect the type of a document. * * @return type detector * @since Apache Tika 0.4 @@ -138,8 +127,7 @@ public Detector getDetector() { } /** - * Sets the type detector used by this parser to auto-detect the type - * of a document. + * Sets the type detector used by this parser to auto-detect the type of a document. 
* * @param detector type detector * @since Apache Tika 0.4 @@ -149,8 +137,8 @@ public void setDetector(Detector detector) { } /** - * Sets the configuration that will be used to create SecureContentHandlers - * that will be used for parsing. + * Sets the configuration that will be used to create SecureContentHandlers that will be used + * for parsing. * * @param autoDetectParserConfig type SecureContentHandlerConfig * @since Apache Tika 2.1.1 @@ -163,8 +151,9 @@ public AutoDetectParserConfig getAutoDetectParserConfig() { return this.autoDetectParserConfig; } - public void parse(InputStream stream, ContentHandler handler, Metadata metadata, - ParseContext context) throws IOException, SAXException, TikaException { + public void parse( + InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) + throws IOException, SAXException, TikaException { if (autoDetectParserConfig.getMetadataWriteFilterFactory() != null) { metadata.setMetadataWriteFilter( autoDetectParserConfig.getMetadataWriteFilterFactory().newInstance()); @@ -172,18 +161,18 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, TemporaryResources tmp = new TemporaryResources(); try { TikaInputStream tis = TikaInputStream.get(stream, tmp, metadata); - //figure out if we should spool to disk + // figure out if we should spool to disk maybeSpool(tis, autoDetectParserConfig, metadata); // Automatically detect the MIME type of the document MediaType type = detector.detect(tis, metadata); - //update CONTENT_TYPE as long as it wasn't set by parser override - if (metadata.get(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE) == null || - !metadata.get(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE) + // update CONTENT_TYPE as long as it wasn't set by parser override + if (metadata.get(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE) == null + || !metadata.get(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE) .equals(type.toString())) { 
metadata.set(Metadata.CONTENT_TYPE, type.toString()); } - //check for zero-byte inputstream + // check for zero-byte inputstream if (tis.getOpenContainer() == null) { if (autoDetectParserConfig.getThrowOnZeroBytes()) { tis.mark(1); @@ -195,8 +184,10 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, } handler = decorateHandler(handler, metadata, context, autoDetectParserConfig); // TIKA-216: Zip bomb prevention - SecureContentHandler sch = handler != null ? - createSecureContentHandler(handler, tis, autoDetectParserConfig) : null; + SecureContentHandler sch = + handler != null + ? createSecureContentHandler(handler, tis, autoDetectParserConfig) + : null; initializeEmbeddedDocumentExtractor(metadata, context); try { @@ -212,33 +203,38 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, } } - private ContentHandler decorateHandler(ContentHandler handler, Metadata metadata, - ParseContext context, - AutoDetectParserConfig autoDetectParserConfig) { + private ContentHandler decorateHandler( + ContentHandler handler, + Metadata metadata, + ParseContext context, + AutoDetectParserConfig autoDetectParserConfig) { if (context.get(RecursiveParserWrapper.RecursivelySecureContentHandler.class) != null) { - //using the recursiveparserwrapper. we should decorate this handler - return autoDetectParserConfig.getContentHandlerDecoratorFactory() + // using the recursiveparserwrapper. 
we should decorate this handler + return autoDetectParserConfig + .getContentHandlerDecoratorFactory() .decorate(handler, metadata, context); } ParseRecord parseRecord = context.get(ParseRecord.class); if (parseRecord == null || parseRecord.getDepth() == 0) { - return autoDetectParserConfig.getContentHandlerDecoratorFactory() + return autoDetectParserConfig + .getContentHandlerDecoratorFactory() .decorate(handler, metadata, context); } - //else do not decorate + // else do not decorate return handler; } - private void maybeSpool(TikaInputStream tis, AutoDetectParserConfig autoDetectParserConfig, - Metadata metadata) throws IOException { + private void maybeSpool( + TikaInputStream tis, AutoDetectParserConfig autoDetectParserConfig, Metadata metadata) + throws IOException { if (tis.hasFile()) { return; } if (autoDetectParserConfig.getSpoolToDisk() == null) { return; } - //whether or not a content-length has been sent in, - //if spoolToDisk == 0, spool it + // whether or not a content-length has been sent in, + // if spoolToDisk == 0, spool it if (autoDetectParserConfig.getSpoolToDisk() == 0) { tis.getPath(); metadata.set(HttpHeaders.CONTENT_LENGTH, Long.toString(tis.getLength())); @@ -253,7 +249,7 @@ private void maybeSpool(TikaInputStream tis, AutoDetectParserConfig autoDetectPa metadata.set(HttpHeaders.CONTENT_LENGTH, Long.toString(tis.getLength())); } } catch (NumberFormatException e) { - //swallow...maybe log? + // swallow...maybe log? } } } @@ -262,8 +258,8 @@ private void initializeEmbeddedDocumentExtractor(Metadata metadata, ParseContext if (context.get(EmbeddedDocumentExtractor.class) != null) { return; } - //pass self to handle embedded documents if - //the caller hasn't specified one. + // pass self to handle embedded documents if + // the caller hasn't specified one. 
Parser p = context.get(Parser.class); if (p == null) { context.set(Parser.class, this); @@ -284,9 +280,8 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata) parse(stream, handler, metadata, context); } - private SecureContentHandler createSecureContentHandler(ContentHandler handler, - TikaInputStream tis, - AutoDetectParserConfig config) { + private SecureContentHandler createSecureContentHandler( + ContentHandler handler, TikaInputStream tis, AutoDetectParserConfig config) { SecureContentHandler sch = new SecureContentHandler(handler, tis); if (config == null) { return sch; @@ -309,5 +304,4 @@ private SecureContentHandler createSecureContentHandler(ContentHandler handler, } return sch; } - } diff --git a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java index afe65b07ed..5f295da365 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java +++ b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java @@ -18,33 +18,33 @@ import java.io.IOException; import java.io.Serializable; - -import org.w3c.dom.Element; -import org.xml.sax.ContentHandler; - import org.apache.tika.config.ConfigBase; import org.apache.tika.exception.TikaConfigException; import org.apache.tika.extractor.EmbeddedDocumentExtractorFactory; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.writefilter.MetadataWriteFilterFactory; import org.apache.tika.sax.ContentHandlerDecoratorFactory; +import org.w3c.dom.Element; +import org.xml.sax.ContentHandler; /** - * This config object can be used to tune how conservative we want to be - * when parsing data that is extremely compressible and resembles a ZIP - * bomb. Null values will be ignored and will not affect the default values - * in SecureContentHandler. - *

- * See ModifyingContentWithHandlersAndMetadataFilters - * for documentation and examples for configuring this with a tika-config.xml file. + * This config object can be used to tune how conservative we want to be when parsing data that is + * extremely compressible and resembles a ZIP bomb. Null values will be ignored and will not affect + * the default values in SecureContentHandler. + * + *

See ModifyingContentWithHandlersAndMetadataFilters + * for documentation and examples for configuring this with a tika-config.xml file. */ public class AutoDetectParserConfig extends ConfigBase implements Serializable { private static ContentHandlerDecoratorFactory NOOP_CONTENT_HANDLER_DECORATOR_FACTORY = new ContentHandlerDecoratorFactory() { @Override - public ContentHandler decorate(ContentHandler contentHandler, Metadata metadata, - ParseContext parseContext) { + public ContentHandler decorate( + ContentHandler contentHandler, + Metadata metadata, + ParseContext parseContext) { return contentHandler; } }; @@ -53,35 +53,29 @@ public ContentHandler decorate(ContentHandler contentHandler, Metadata metadata, public static AutoDetectParserConfig load(Element element) throws TikaConfigException, IOException { - return AutoDetectParserConfig.buildSingle("autoDetectParserConfig", - AutoDetectParserConfig.class, element, AutoDetectParserConfig.DEFAULT); + return AutoDetectParserConfig.buildSingle( + "autoDetectParserConfig", + AutoDetectParserConfig.class, + element, + AutoDetectParserConfig.DEFAULT); } /** - * If this is not null and greater than -1, the AutoDetectParser - * will spool the stream to disk if the length of the stream is known - * ahead of time. + * If this is not null and greater than -1, the AutoDetectParser will spool the stream to disk + * if the length of the stream is known ahead of time. */ private Long spoolToDisk = null; - /** - * SecureContentHandler -- Desired output threshold in characters. - */ + /** SecureContentHandler -- Desired output threshold in characters. */ private Long outputThreshold = null; - /** - * SecureContentHandler -- Desired maximum compression ratio. - */ + /** SecureContentHandler -- Desired maximum compression ratio. */ private Long maximumCompressionRatio = null; - /** - * SecureContentHandler -- Desired maximum XML nesting level. - */ + /** SecureContentHandler -- Desired maximum XML nesting level. 
*/ private Integer maximumDepth = null; - /** - * SecureContentHandler -- Desired maximum package entry nesting level. - */ + /** SecureContentHandler -- Desired maximum package entry nesting level. */ private Integer maximumPackageEntryDepth = null; private MetadataWriteFilterFactory metadataWriteFilterFactory = null; @@ -99,14 +93,17 @@ public static AutoDetectParserConfig load(Element element) * Creates a SecureContentHandlerConfig using the passed in parameters. * * @param spoolToDisk - * @param outputThreshold SecureContentHandler - character output threshold. - * @param maximumCompressionRatio SecureContentHandler - max compression ratio allowed. - * @param maximumDepth SecureContentHandler - maximum XML element nesting level. + * @param outputThreshold SecureContentHandler - character output threshold. + * @param maximumCompressionRatio SecureContentHandler - max compression ratio allowed. + * @param maximumDepth SecureContentHandler - maximum XML element nesting level. * @param maximumPackageEntryDepth SecureContentHandler - maximum package entry nesting level. 
*/ - public AutoDetectParserConfig(Long spoolToDisk, Long outputThreshold, - Long maximumCompressionRatio, Integer maximumDepth, - Integer maximumPackageEntryDepth) { + public AutoDetectParserConfig( + Long spoolToDisk, + Long outputThreshold, + Long maximumCompressionRatio, + Integer maximumDepth, + Integer maximumPackageEntryDepth) { this.spoolToDisk = spoolToDisk; this.outputThreshold = outputThreshold; this.maximumCompressionRatio = maximumCompressionRatio; @@ -114,9 +111,7 @@ public AutoDetectParserConfig(Long spoolToDisk, Long outputThreshold, this.maximumPackageEntryDepth = maximumPackageEntryDepth; } - public AutoDetectParserConfig() { - - } + public AutoDetectParserConfig() {} public Long getSpoolToDisk() { return spoolToDisk; @@ -203,14 +198,27 @@ public boolean getThrowOnZeroBytes() { @Override public String toString() { - return "AutoDetectParserConfig{" + "spoolToDisk=" + spoolToDisk + ", outputThreshold=" + - outputThreshold + ", maximumCompressionRatio=" + maximumCompressionRatio + - ", maximumDepth=" + maximumDepth + ", maximumPackageEntryDepth=" + - maximumPackageEntryDepth + ", metadataWriteFilterFactory=" + - metadataWriteFilterFactory + ", embeddedDocumentExtractorFactory=" + - embeddedDocumentExtractorFactory + ", contentHandlerDecoratorFactory=" + - contentHandlerDecoratorFactory + ", digesterFactory=" + digesterFactory + - ", throwOnZeroBytes=" + throwOnZeroBytes + '}'; + return "AutoDetectParserConfig{" + + "spoolToDisk=" + + spoolToDisk + + ", outputThreshold=" + + outputThreshold + + ", maximumCompressionRatio=" + + maximumCompressionRatio + + ", maximumDepth=" + + maximumDepth + + ", maximumPackageEntryDepth=" + + maximumPackageEntryDepth + + ", metadataWriteFilterFactory=" + + metadataWriteFilterFactory + + ", embeddedDocumentExtractorFactory=" + + embeddedDocumentExtractorFactory + + ", contentHandlerDecoratorFactory=" + + contentHandlerDecoratorFactory + + ", digesterFactory=" + + digesterFactory + + ", throwOnZeroBytes=" + + 
throwOnZeroBytes + + '}'; } } - diff --git a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserFactory.java b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserFactory.java index 2365c8943a..e2a8f9f0a2 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserFactory.java +++ b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserFactory.java @@ -22,21 +22,14 @@ import java.nio.file.Files; import java.nio.file.Paths; import java.util.Map; - -import org.xml.sax.SAXException; - import org.apache.tika.config.TikaConfig; import org.apache.tika.exception.TikaException; +import org.xml.sax.SAXException; -/** - * Factory for an AutoDetectParser - */ +/** Factory for an AutoDetectParser */ public class AutoDetectParserFactory extends ParserFactory { - /** - * Path to a tika-config file. This must be a literal - * file or findable on the classpath. - */ + /** Path to a tika-config file. This must be a literal file or findable on the classpath. 
*/ public static final String TIKA_CONFIG_PATH = "tika_config_path"; public AutoDetectParserFactory(Map args) { diff --git a/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java b/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java index 3b50b4da77..415814c03c 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java +++ b/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java @@ -26,10 +26,6 @@ import java.util.List; import java.util.Map; import java.util.Set; - -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.exception.TikaException; import org.apache.tika.exception.WriteLimitReachedException; import org.apache.tika.io.TemporaryResources; @@ -41,37 +37,32 @@ import org.apache.tika.sax.TaggedContentHandler; import org.apache.tika.utils.ExceptionUtils; import org.apache.tika.utils.ParserUtils; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; /** - * Composite parser that delegates parsing tasks to a component parser - * based on the declared content type of the incoming document. A fallback - * parser is defined for cases where a parser for the given content type is - * not available. + * Composite parser that delegates parsing tasks to a component parser based on the declared content + * type of the incoming document. A fallback parser is defined for cases where a parser for the + * given content type is not available. */ public class CompositeParser implements Parser { - /** - * Serial version UID - */ + /** Serial version UID */ private static final long serialVersionUID = 2192845797749627824L; - /** - * Media type registry. - */ + /** Media type registry. */ private MediaTypeRegistry registry; - /** - * List of component parsers. - */ + /** List of component parsers. */ private List parsers; - /** - * The fallback parser, used when no better parser is available. 
- */ + /** The fallback parser, used when no better parser is available. */ private Parser fallback = new EmptyParser(); - public CompositeParser(MediaTypeRegistry registry, List parsers, - Collection> excludeParsers) { + public CompositeParser( + MediaTypeRegistry registry, + List parsers, + Collection> excludeParsers) { if (excludeParsers == null || excludeParsers.isEmpty()) { this.parsers = parsers; } else { @@ -107,13 +98,13 @@ public Map getParsers(ParseContext context) { return map; } - private boolean isExcluded(Collection> excludeParsers, - Class p) { + private boolean isExcluded( + Collection> excludeParsers, Class p) { return excludeParsers.contains(p) || assignableFrom(excludeParsers, p); } - private boolean assignableFrom(Collection> excludeParsers, - Class p) { + private boolean assignableFrom( + Collection> excludeParsers, Class p) { for (Class e : excludeParsers) { if (e.isAssignableFrom(p)) { return true; @@ -123,9 +114,9 @@ private boolean assignableFrom(Collection> excludeParser } /** - * Utility method that goes through all the component parsers and finds - * all media types for which more than one parser declares support. This - * is useful in tracking down conflicting parser definitions. + * Utility method that goes through all the component parsers and finds all media types for + * which more than one parser declares support. This is useful in tracking down conflicting + * parser definitions. * * @param context parsing context * @return media types that are supported by at least two component parsers @@ -175,9 +166,8 @@ public void setMediaTypeRegistry(MediaTypeRegistry registry) { } /** - * Returns all parsers registered with the Composite Parser, - * including ones which may not currently be active. - * This won't include the Fallback Parser, if defined + * Returns all parsers registered with the Composite Parser, including ones which may not + * currently be active. 
This won't include the Fallback Parser, if defined */ public List getAllComponentParsers() { return Collections.unmodifiableList(parsers); @@ -200,8 +190,9 @@ public Map getParsers() { public void setParsers(Map parsers) { this.parsers = new ArrayList<>(parsers.size()); for (Map.Entry entry : parsers.entrySet()) { - this.parsers.add(ParserDecorator - .withTypes(entry.getValue(), Collections.singleton(entry.getKey()))); + this.parsers.add( + ParserDecorator.withTypes( + entry.getValue(), Collections.singleton(entry.getKey()))); } } @@ -224,14 +215,12 @@ public void setFallback(Parser fallback) { } /** - * Returns the parser that best matches the given metadata. By default - * looks for a parser that matches the content type metadata property, - * and uses the fallback parser if a better match is not found. The - * type hierarchy information included in the configured media type - * registry is used when looking for a matching parser instance. - *

- * Subclasses can override this method to provide more accurate - * parser resolution. + * Returns the parser that best matches the given metadata. By default looks for a parser that + * matches the content type metadata property, and uses the fallback parser if a better match is + * not found. The type hierarchy information included in the configured media type registry is + * used when looking for a matching parser instance. + * + *

Subclasses can override this method to provide more accurate parser resolution. * * @param metadata document metadata * @return matching parser @@ -242,7 +231,7 @@ protected Parser getParser(Metadata metadata) { protected Parser getParser(Metadata metadata, ParseContext context) { Map map = getParsers(context); - //check for parser override first + // check for parser override first String contentTypeString = metadata.get(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE); if (contentTypeString == null) { contentTypeString = metadata.get(Metadata.CONTENT_TYPE); @@ -271,14 +260,14 @@ public Set getSupportedTypes(ParseContext context) { /** * Delegates the call to the matching component parser. - *

- * Potential {@link RuntimeException}s, {@link IOException}s and - * {@link SAXException}s unrelated to the given input stream and content - * handler are automatically wrapped into {@link TikaException}s to better - * honor the {@link Parser} contract. + * + *

Potential {@link RuntimeException}s, {@link IOException}s and {@link SAXException}s + * unrelated to the given input stream and content handler are automatically wrapped into {@link + * TikaException}s to better honor the {@link Parser} contract. */ - public void parse(InputStream stream, ContentHandler handler, Metadata metadata, - ParseContext context) throws IOException, SAXException, TikaException { + public void parse( + InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) + throws IOException, SAXException, TikaException { Parser parser = getParser(metadata, context); TemporaryResources tmp = new TemporaryResources(); ParseRecord parserRecord = context.get(ParseRecord.class); @@ -297,7 +286,7 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, try { parser.parse(taggedStream, taggedHandler, metadata, context); } catch (SecurityException e) { - //rethrow security exceptions + // rethrow security exceptions throw e; } catch (IOException e) { taggedStream.throwIfCauseOf(e); @@ -324,7 +313,7 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, private void recordEmbeddedMetadata(Metadata metadata, ParseContext context) { ParseRecord record = context.get(ParseRecord.class); if (record == null) { - //this should never happen + // this should never happen return; } for (Exception e : record.getExceptions()) { diff --git a/tika-core/src/main/java/org/apache/tika/parser/CryptoParser.java b/tika-core/src/main/java/org/apache/tika/parser/CryptoParser.java index 1ffd851db3..16656c284d 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/CryptoParser.java +++ b/tika-core/src/main/java/org/apache/tika/parser/CryptoParser.java @@ -26,27 +26,23 @@ import java.util.Set; import javax.crypto.Cipher; import javax.crypto.CipherInputStream; - -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.exception.EncryptedDocumentException; import 
org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; /** - * Decrypts the incoming document stream and delegates further parsing to - * another parser instance. The decryption key and other settings as well - * as the delegate parser are taken from the parsing context. + * Decrypts the incoming document stream and delegates further parsing to another parser instance. + * The decryption key and other settings as well as the delegate parser are taken from the parsing + * context. * * @since Apache Tika 0.10 */ public abstract class CryptoParser extends DelegatingParser { - /** - * Serial version UID - */ + /** Serial version UID */ private static final long serialVersionUID = -3507995752666557731L; private final String transformation; @@ -69,8 +65,9 @@ public Set getSupportedTypes(ParseContext context) { return types; } - public void parse(InputStream stream, ContentHandler handler, Metadata metadata, - ParseContext context) throws IOException, SAXException, TikaException { + public void parse( + InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) + throws IOException, SAXException, TikaException { try { Cipher cipher; if (provider != null) { @@ -101,5 +98,4 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, throw new TikaException("Unable to decrypt document stream", e); } } - } diff --git a/tika-core/src/main/java/org/apache/tika/parser/DefaultParser.java b/tika-core/src/main/java/org/apache/tika/parser/DefaultParser.java index 3205ea81d1..1a2516feaa 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/DefaultParser.java +++ b/tika-core/src/main/java/org/apache/tika/parser/DefaultParser.java @@ -21,7 +21,6 @@ import java.util.Collections; import java.util.List; import java.util.Map; - import org.apache.tika.config.ServiceLoader; import 
org.apache.tika.detect.DefaultEncodingDetector; import org.apache.tika.detect.EncodingDetector; @@ -32,42 +31,56 @@ import org.apache.tika.utils.ServiceLoaderUtils; /** - * A composite parser based on all the {@link Parser} implementations - * available through the - * {@link javax.imageio.spi.ServiceRegistry service provider mechanism}. + * A composite parser based on all the {@link Parser} implementations available through the {@link + * javax.imageio.spi.ServiceRegistry service provider mechanism}. * * @since Apache Tika 0.8 */ public class DefaultParser extends CompositeParser { - /** - * Serial version UID - */ + /** Serial version UID */ private static final long serialVersionUID = 3612324825403757520L; - private transient final ServiceLoader loader; - public DefaultParser(MediaTypeRegistry registry, ServiceLoader loader, - Collection> excludeParsers, - EncodingDetector encodingDetector, Renderer renderer) { + private final transient ServiceLoader loader; + + public DefaultParser( + MediaTypeRegistry registry, + ServiceLoader loader, + Collection> excludeParsers, + EncodingDetector encodingDetector, + Renderer renderer) { super(registry, getDefaultParsers(loader, encodingDetector, renderer, excludeParsers)); this.loader = loader; } - public DefaultParser(MediaTypeRegistry registry, ServiceLoader loader, - Collection> excludeParsers) { - super(registry, - getDefaultParsers(loader, new DefaultEncodingDetector(loader), - new CompositeRenderer(loader), excludeParsers)); + public DefaultParser( + MediaTypeRegistry registry, + ServiceLoader loader, + Collection> excludeParsers) { + super( + registry, + getDefaultParsers( + loader, + new DefaultEncodingDetector(loader), + new CompositeRenderer(loader), + excludeParsers)); this.loader = loader; } - public DefaultParser(MediaTypeRegistry registry, ServiceLoader loader, - EncodingDetector encodingDetector, Renderer renderer) { + public DefaultParser( + MediaTypeRegistry registry, + ServiceLoader loader, + 
EncodingDetector encodingDetector, + Renderer renderer) { this(registry, loader, Collections.EMPTY_SET, encodingDetector, renderer); } public DefaultParser(MediaTypeRegistry registry, ServiceLoader loader) { - this(registry, loader, Collections.EMPTY_SET, new DefaultEncodingDetector(loader), + this( + registry, + loader, + Collections.EMPTY_SET, + new DefaultEncodingDetector(loader), new CompositeRenderer(loader)); } @@ -88,21 +101,19 @@ public DefaultParser() { } /** - * Finds all statically loadable parsers and sort the list by name, - * rather than discovery order. CompositeParser takes the last - * parser for any given media type, so put the Tika parsers first + * Finds all statically loadable parsers and sort the list by name, rather than discovery order. + * CompositeParser takes the last parser for any given media type, so put the Tika parsers first * so that non-Tika (user supplied) parsers can take precedence. * * @param loader service loader * @return ordered list of statically loadable parsers */ - private static List getDefaultParsers(ServiceLoader loader, - EncodingDetector encodingDetector, - Renderer renderer, - Collection> - excludeParsers) { - List parsers = - loader.loadStaticServiceProviders(Parser.class, excludeParsers); + private static List getDefaultParsers( + ServiceLoader loader, + EncodingDetector encodingDetector, + Renderer renderer, + Collection> excludeParsers) { + List parsers = loader.loadStaticServiceProviders(Parser.class, excludeParsers); if (encodingDetector != null) { for (Parser p : parsers) { @@ -115,14 +126,14 @@ private static List getDefaultParsers(ServiceLoader loader, } } ServiceLoaderUtils.sortLoadedClasses(parsers); - //reverse the order of parsers so that custom ones come last - //this will prevent them from being overwritten in getParsers(ParseContext ..) + // reverse the order of parsers so that custom ones come last + // this will prevent them from being overwritten in getParsers(ParseContext ..) 
Collections.reverse(parsers); return parsers; } - //recursively go through the parsers and set the encoding detector - //as configured in the config file + // recursively go through the parsers and set the encoding detector + // as configured in the config file private static void setEncodingDetector(Parser p, EncodingDetector encodingDetector) { if (p instanceof AbstractEncodingDetectorParser) { ((AbstractEncodingDetectorParser) p).setEncodingDetector(encodingDetector); diff --git a/tika-core/src/main/java/org/apache/tika/parser/DelegatingParser.java b/tika-core/src/main/java/org/apache/tika/parser/DelegatingParser.java index f2e007cfe9..e7ddcc7a19 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/DelegatingParser.java +++ b/tika-core/src/main/java/org/apache/tika/parser/DelegatingParser.java @@ -19,30 +19,26 @@ import java.io.IOException; import java.io.InputStream; import java.util.Set; - -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; /** - * Base class for parser implementations that want to delegate parts of the - * task of parsing an input document to another parser. The delegate parser - * is looked up from the parsing context using the {@link Parser} class as - * the key. + * Base class for parser implementations that want to delegate parts of the task of parsing an input + * document to another parser. The delegate parser is looked up from the parsing context using the + * {@link Parser} class as the key. * * @since Apache Tika 0.4, major changes in Tika 0.5 */ public class DelegatingParser implements Parser { /** - * Returns the parser instance to which parsing tasks should be delegated. 
- * The default implementation looks up the delegate parser from the given - * parse context, and uses an {@link EmptyParser} instance as a fallback. - * Subclasses can override this method to implement alternative delegation - * strategies. + * Returns the parser instance to which parsing tasks should be delegated. The default + * implementation looks up the delegate parser from the given parse context, and uses an {@link + * EmptyParser} instance as a fallback. Subclasses can override this method to implement + * alternative delegation strategies. * * @param context parse context * @return delegate parser @@ -57,18 +53,16 @@ public Set getSupportedTypes(ParseContext context) { } /** - * Looks up the delegate parser from the parsing context and - * delegates the parse operation to it. If a delegate parser is not - * found, then an empty XHTML document is returned. - *

- * Subclasses should override this method to parse the top level - * structure of the given document stream. Parsed sub-streams can - * be passed to this base class method to be parsed by the configured - * delegate parser. + * Looks up the delegate parser from the parsing context and delegates the parse operation to + * it. If a delegate parser is not found, then an empty XHTML document is returned. + * + *

Subclasses should override this method to parse the top level structure of the given + * document stream. Parsed sub-streams can be passed to this base class method to be parsed by + * the configured delegate parser. */ - public void parse(InputStream stream, ContentHandler handler, Metadata metadata, - ParseContext context) throws SAXException, IOException, TikaException { + public void parse( + InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) + throws SAXException, IOException, TikaException { getDelegateParser(context).parse(stream, handler, metadata, context); } - } diff --git a/tika-core/src/main/java/org/apache/tika/parser/DigestingParser.java b/tika-core/src/main/java/org/apache/tika/parser/DigestingParser.java index 8c0358da72..65d0e6e387 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/DigestingParser.java +++ b/tika-core/src/main/java/org/apache/tika/parser/DigestingParser.java @@ -17,23 +17,21 @@ package org.apache.tika.parser; - import java.io.IOException; import java.io.InputStream; - -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.exception.TikaException; import org.apache.tika.io.TemporaryResources; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; public class DigestingParser extends ParserDecorator { private final Digester digester; private final boolean skipContainerDocument; + /** * Creates a decorator for the given parser. 
* @@ -46,8 +44,9 @@ public DigestingParser(Parser parser, Digester digester, boolean skipContainerDo } @Override - public void parse(InputStream stream, ContentHandler handler, Metadata metadata, - ParseContext context) throws IOException, SAXException, TikaException { + public void parse( + InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) + throws IOException, SAXException, TikaException { TemporaryResources tmp = new TemporaryResources(); TikaInputStream tis = TikaInputStream.get(stream, tmp, metadata); try { @@ -64,7 +63,7 @@ private boolean shouldDigest(Metadata metadata) { if (digester == null) { return false; } - if (! skipContainerDocument) { + if (!skipContainerDocument) { return true; } Integer parseDepth = metadata.getInt(TikaCoreProperties.EMBEDDED_DEPTH); @@ -75,43 +74,40 @@ private boolean shouldDigest(Metadata metadata) { } /** - * This is used in {@link AutoDetectParserConfig} to (optionally) - * wrap the parser in a digesting parser. + * This is used in {@link AutoDetectParserConfig} to (optionally) wrap the parser in a digesting + * parser. */ public interface DigesterFactory { Digester build(); + void setSkipContainerDocument(boolean skipContainerDocument); + boolean isSkipContainerDocument(); } - /** - * Interface for digester. See - * org.apache.parser.utils.CommonsDigester in tika-parsers for an + /** + * Interface for digester. See org.apache.parser.utils.CommonsDigester in tika-parsers for an * implementation. */ public interface Digester { /** - * Digests an InputStream and sets the appropriate value(s) in the metadata. - * The Digester is also responsible for marking and resetting the stream. - *

- * The given stream is guaranteed to support the - * {@link InputStream#markSupported() mark feature} and the detector - * is expected to {@link InputStream#mark(int) mark} the stream before - * reading any bytes from it, and to {@link InputStream#reset() reset} - * the stream before returning. The stream must not be closed by the - * detector. + * Digests an InputStream and sets the appropriate value(s) in the metadata. The Digester is + * also responsible for marking and resetting the stream. * - * @param is InputStream to digest - * @param m Metadata to set the values for + *

The given stream is guaranteed to support the {@link InputStream#markSupported() mark + * feature} and the detector is expected to {@link InputStream#mark(int) mark} the stream + * before reading any bytes from it, and to {@link InputStream#reset() reset} the stream + * before returning. The stream must not be closed by the detector. + * + * @param is InputStream to digest + * @param m Metadata to set the values for * @param parseContext ParseContext * @throws IOException */ void digest(InputStream is, Metadata m, ParseContext parseContext) throws IOException; } - /** - * Encodes byte array from a MessageDigest to String - */ + /** Encodes byte array from a MessageDigest to String */ public interface Encoder { String encode(byte[] bytes); } diff --git a/tika-core/src/main/java/org/apache/tika/parser/EmptyParser.java b/tika-core/src/main/java/org/apache/tika/parser/EmptyParser.java index 546d0c2a71..774da24d53 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/EmptyParser.java +++ b/tika-core/src/main/java/org/apache/tika/parser/EmptyParser.java @@ -19,35 +19,30 @@ import java.io.InputStream; import java.util.Collections; import java.util.Set; - -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; /** - * Dummy parser that always produces an empty XHTML document without even - * attempting to parse the given document stream. Useful as a sentinel parser - * for unknown document types. + * Dummy parser that always produces an empty XHTML document without even attempting to parse the + * given document stream. Useful as a sentinel parser for unknown document types. */ public class EmptyParser implements Parser { - /** - * Singleton instance of this class. - */ + /** Singleton instance of this class. 
*/ public static final EmptyParser INSTANCE = new EmptyParser(); - /** - * Serial version UID. - */ + + /** Serial version UID. */ private static final long serialVersionUID = -4218649699095732123L; public Set getSupportedTypes(ParseContext context) { return Collections.emptySet(); } - public void parse(InputStream stream, ContentHandler handler, Metadata metadata, - ParseContext context) throws SAXException { + public void parse( + InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) + throws SAXException { XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); xhtml.startDocument(); xhtml.endDocument(); diff --git a/tika-core/src/main/java/org/apache/tika/parser/ErrorParser.java b/tika-core/src/main/java/org/apache/tika/parser/ErrorParser.java index b8071cb52f..37b946cb5e 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/ErrorParser.java +++ b/tika-core/src/main/java/org/apache/tika/parser/ErrorParser.java @@ -19,31 +19,28 @@ import java.io.InputStream; import java.util.Collections; import java.util.Set; - -import org.xml.sax.ContentHandler; - import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; +import org.xml.sax.ContentHandler; /** - * Dummy parser that always throws a {@link TikaException} without even - * attempting to parse the given document stream. Useful as a sentinel parser - * for unknown document types. + * Dummy parser that always throws a {@link TikaException} without even attempting to parse the + * given document stream. Useful as a sentinel parser for unknown document types. */ public class ErrorParser implements Parser { - /** - * Singleton instance of this class. - */ + /** Singleton instance of this class. 
*/ public static final ErrorParser INSTANCE = new ErrorParser(); + private static final long serialVersionUID = 7727423956957641824L; public Set getSupportedTypes(ParseContext context) { return Collections.emptySet(); } - public void parse(InputStream stream, ContentHandler handler, Metadata metadata, - ParseContext context) throws TikaException { + public void parse( + InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) + throws TikaException { throw new TikaException("Parse error"); } } diff --git a/tika-core/src/main/java/org/apache/tika/parser/NetworkParser.java b/tika-core/src/main/java/org/apache/tika/parser/NetworkParser.java index 822512d7e4..51e1ca6f55 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/NetworkParser.java +++ b/tika-core/src/main/java/org/apache/tika/parser/NetworkParser.java @@ -26,14 +26,8 @@ import java.net.URLConnection; import java.util.Collections; import java.util.Set; - import org.apache.commons.io.IOUtils; import org.apache.commons.io.input.CloseShieldInputStream; -import org.xml.sax.Attributes; -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; -import org.xml.sax.helpers.DefaultHandler; - import org.apache.tika.exception.TikaException; import org.apache.tika.io.TemporaryResources; import org.apache.tika.io.TikaInputStream; @@ -42,7 +36,10 @@ import org.apache.tika.sax.TaggedContentHandler; import org.apache.tika.sax.TeeContentHandler; import org.apache.tika.utils.XMLReaderUtils; - +import org.xml.sax.Attributes; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; public class NetworkParser implements Parser { @@ -63,8 +60,9 @@ public Set getSupportedTypes(ParseContext context) { return supportedTypes; } - public void parse(InputStream stream, ContentHandler handler, Metadata metadata, - ParseContext context) throws IOException, SAXException, TikaException { + public void parse( + InputStream stream, 
ContentHandler handler, Metadata metadata, ParseContext context) + throws IOException, SAXException, TikaException { TemporaryResources tmp = new TemporaryResources(); try { TikaInputStream tis = TikaInputStream.get(stream, tmp, metadata); @@ -74,16 +72,20 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, } } - private void parse(TikaInputStream stream, ContentHandler handler, Metadata metadata, - ParseContext context) throws IOException, SAXException, TikaException { + private void parse( + TikaInputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) + throws IOException, SAXException, TikaException { if ("telnet".equals(uri.getScheme())) { try (Socket socket = new Socket(uri.getHost(), uri.getPort())) { - new ParsingTask(stream, new FilterOutputStream(socket.getOutputStream()) { - @Override - public void close() throws IOException { - socket.shutdownOutput(); - } - }).parse(socket.getInputStream(), handler, metadata, context); + new ParsingTask( + stream, + new FilterOutputStream(socket.getOutputStream()) { + @Override + public void close() throws IOException { + socket.shutdownOutput(); + } + }) + .parse(socket.getInputStream(), handler, metadata, context); } } else { URL url = uri.toURL(); @@ -95,7 +97,6 @@ public void close() throws IOException { .parse(CloseShieldInputStream.wrap(input), handler, metadata, context); } } - } private static class ParsingTask implements Runnable { @@ -111,17 +112,16 @@ public ParsingTask(TikaInputStream input, OutputStream output) { this.output = output; } - public void parse(InputStream stream, ContentHandler handler, Metadata metadata, - ParseContext context) throws IOException, SAXException, TikaException { + public void parse( + InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) + throws IOException, SAXException, TikaException { Thread thread = new Thread(this, "Tika network parser"); thread.start(); - TaggedContentHandler tagged 
= - new TaggedContentHandler(handler); + TaggedContentHandler tagged = new TaggedContentHandler(handler); try { - XMLReaderUtils - .parseSAX(stream, new TeeContentHandler(tagged, new MetaHandler(metadata)), - context); + XMLReaderUtils.parseSAX( + stream, new TeeContentHandler(tagged, new MetaHandler(metadata)), context); } catch (SAXException e) { tagged.throwIfCauseOf(e); throw new TikaException("Invalid network parser output", e); @@ -141,7 +141,7 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, } } - //---------------------------------------------------------- + // ---------------------------------------------------------- public void run() { try { @@ -154,7 +154,6 @@ public void run() { exception = e; } } - } private static class MetaHandler extends DefaultHandler { @@ -176,7 +175,5 @@ public void startElement(String uri, String localName, String qName, Attributes } } } - } - } diff --git a/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java b/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java index 531f1daa04..ba4781bf42 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java +++ b/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java @@ -28,15 +28,13 @@ import javax.xml.parsers.SAXParserFactory; import javax.xml.stream.XMLInputFactory; import javax.xml.transform.Transformer; - +import org.apache.tika.exception.TikaException; +import org.apache.tika.utils.XMLReaderUtils; import org.xml.sax.SAXNotRecognizedException; import org.xml.sax.SAXNotSupportedException; import org.xml.sax.XMLReader; import org.xml.sax.helpers.DefaultHandler; -import org.apache.tika.exception.TikaException; -import org.apache.tika.utils.XMLReaderUtils; - /** * Parse context. Used to pass context information to Tika parsers. * @@ -45,21 +43,16 @@ */ public class ParseContext implements Serializable { - /** - * Serial version UID. - */ + /** Serial version UID. 
*/ private static final long serialVersionUID = -5921436862145826534L; - /** - * Map of objects in this context - */ + /** Map of objects in this context */ private final Map context = new HashMap<>(); /** - * Adds the given value to the context as an implementation of the given - * interface. + * Adds the given value to the context as an implementation of the given interface. * - * @param key the interface implemented by the given value + * @param key the interface implemented by the given value * @param value the value to be added, or null to remove */ public void set(Class key, T value) { @@ -74,8 +67,7 @@ public void set(Class key, T value) { * Returns the object in this context that implements the given interface. * * @param key the interface implemented by the requested object - * @return the object that implements the given interface, - * or null if not found + * @return the object that implements the given interface, or null if not found */ @SuppressWarnings("unchecked") public T get(Class key) { @@ -83,13 +75,13 @@ public T get(Class key) { } /** - * Returns the object in this context that implements the given interface, - * or the given default value if such an object is not found. + * Returns the object in this context that implements the given interface, or the given default + * value if such an object is not found. * - * @param key the interface implemented by the requested object + * @param key the interface implemented by the requested object * @param defaultValue value to return if the requested object is not found - * @return the object that implements the given interface, - * or the given default value if not found + * @return the object that implements the given interface, or the given default value if not + * found */ public T get(Class key, T defaultValue) { T value = get(key); @@ -101,9 +93,8 @@ public T get(Class key, T defaultValue) { } /** - * Returns the XMLReader specified in this parsing context. 
If a reader - * is not explicitly specified, then one is created using the specified - * or the default SAX parser. + * Returns the XMLReader specified in this parsing context. If a reader is not explicitly + * specified, then one is created using the specified or the default SAX parser. * * @return XMLReader * @throws TikaException @@ -119,11 +110,10 @@ public XMLReader getXMLReader() throws TikaException { } /** - * Returns the SAX parser specified in this parsing context. If a parser - * is not explicitly specified, then one is created using the specified - * or the default SAX parser factory. Consider using - * {@link XMLReaderUtils#parseSAX(InputStream, DefaultHandler, ParseContext)} - * for more efficient reuse of SAXParsers. + * Returns the SAX parser specified in this parsing context. If a parser is not explicitly + * specified, then one is created using the specified or the default SAX parser factory. + * Consider using {@link XMLReaderUtils#parseSAX(InputStream, DefaultHandler, ParseContext)} for + * more efficient reuse of SAXParsers. * * @return SAX parser * @throws TikaException if a SAX parser could not be created @@ -140,11 +130,10 @@ public SAXParser getSAXParser() throws TikaException { } /** - * Returns the SAX parser factory specified in this parsing context. - * If a factory is not explicitly specified, then a default factory - * instance is created and returned. The default factory instance is - * configured to be namespace-aware, not validating, and to use - * {@link XMLConstants#FEATURE_SECURE_PROCESSING secure XML processing}. + * Returns the SAX parser factory specified in this parsing context. If a factory is not + * explicitly specified, then a default factory instance is created and returned. The default + * factory instance is configured to be namespace-aware, not validating, and to use {@link + * XMLConstants#FEATURE_SECURE_PROCESSING secure XML processing}. 
* * @return SAX parser factory * @since Apache Tika 0.8 @@ -158,7 +147,7 @@ public SAXParserFactory getSAXParserFactory() { try { factory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true); } catch (ParserConfigurationException | SAXNotSupportedException e) { - //swallow + // swallow } catch (SAXNotRecognizedException e) { // TIKA-271: Some XML parsers do not support the // secure-processing feature, even though it's required by @@ -171,17 +160,16 @@ public SAXParserFactory getSAXParserFactory() { } /** - * Returns the DOM builder factory specified in this parsing context. - * If a factory is not explicitly specified, then a default factory - * instance is created and returned. The default factory instance is - * configured to be namespace-aware and to apply reasonable security + * Returns the DOM builder factory specified in this parsing context. If a factory is not + * explicitly specified, then a default factory instance is created and returned. The default + * factory instance is configured to be namespace-aware and to apply reasonable security * features. * * @return DOM parser factory * @since Apache Tika 1.13 */ private DocumentBuilderFactory getDocumentBuilderFactory() { - //borrowed from Apache POI + // borrowed from Apache POI DocumentBuilderFactory documentBuilderFactory = get(DocumentBuilderFactory.class); if (documentBuilderFactory != null) { return documentBuilderFactory; @@ -191,13 +179,11 @@ private DocumentBuilderFactory getDocumentBuilderFactory() { } /** - * Returns the DOM builder specified in this parsing context. - * If a builder is not explicitly specified, then a builder - * instance is created and returned. The builder instance is - * configured to apply an {@link XMLReaderUtils#IGNORING_SAX_ENTITY_RESOLVER}, - * and it sets the ErrorHandler to null. - * Consider using {@link XMLReaderUtils#buildDOM(InputStream, ParseContext)} - * instead for more efficient reuse of document builders. 
+ * Returns the DOM builder specified in this parsing context. If a builder is not explicitly + * specified, then a builder instance is created and returned. The builder instance is + * configured to apply an {@link XMLReaderUtils#IGNORING_SAX_ENTITY_RESOLVER}, and it sets the + * ErrorHandler to null. Consider using {@link XMLReaderUtils#buildDOM(InputStream, + * ParseContext)} instead for more efficient reuse of document builders. * * @return DOM Builder * @since Apache Tika 1.13 @@ -212,11 +198,10 @@ public DocumentBuilder getDocumentBuilder() throws TikaException { } /** - * Returns the StAX input factory specified in this parsing context. - * If a factory is not explicitly specified, then a default factory - * instance is created and returned. The default factory instance is - * configured to be namespace-aware and to apply reasonable security - * using the {@link XMLReaderUtils#IGNORING_STAX_ENTITY_RESOLVER}. + * Returns the StAX input factory specified in this parsing context. If a factory is not + * explicitly specified, then a default factory instance is created and returned. The default + * factory instance is configured to be namespace-aware and to apply reasonable security using + * the {@link XMLReaderUtils#IGNORING_STAX_ENTITY_RESOLVER}. * * @return StAX input factory * @since Apache Tika 1.13 @@ -229,14 +214,12 @@ public XMLInputFactory getXMLInputFactory() { return XMLReaderUtils.getXMLInputFactory(); } - /** * Returns the transformer specified in this parsing context. - *

- * If a transformer is not explicitly specified, then a default transformer - * instance is created and returned. The default transformer instance is - * configured to to use - * {@link XMLConstants#FEATURE_SECURE_PROCESSING secure XML processing}. + * + *

If a transformer is not explicitly specified, then a default transformer instance is + * created and returned. The default transformer instance is configured to to use {@link + * XMLConstants#FEATURE_SECURE_PROCESSING secure XML processing}. * * @return Transformer * @throws TikaException when the transformer can not be created @@ -251,5 +234,4 @@ public Transformer getTransformer() throws TikaException { return XMLReaderUtils.getTransformer(); } - } diff --git a/tika-core/src/main/java/org/apache/tika/parser/ParseRecord.java b/tika-core/src/main/java/org/apache/tika/parser/ParseRecord.java index ca0edc567c..267f09923d 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/ParseRecord.java +++ b/tika-core/src/main/java/org/apache/tika/parser/ParseRecord.java @@ -20,18 +20,16 @@ import java.util.LinkedHashSet; import java.util.List; import java.util.Set; - import org.apache.tika.metadata.Metadata; /** - * Use this class to store exceptions, warnings and other information - * during the parse. This information is added to the parent's metadata - * after the parse by the {@link CompositeParser}. + * Use this class to store exceptions, warnings and other information during the parse. This + * information is added to the parent's metadata after the parse by the {@link CompositeParser}. 
*/ public class ParseRecord { - //hard limits so that specially crafted files - //don't cause an OOM + // hard limits so that specially crafted files + // don't cause an OOM private static int MAX_PARSERS = 100; private static final int MAX_EXCEPTIONS = 100; @@ -103,7 +101,6 @@ public List getWarnings() { return warnings; } - public boolean isWriteLimitReached() { return writeLimitReached; } diff --git a/tika-core/src/main/java/org/apache/tika/parser/Parser.java b/tika-core/src/main/java/org/apache/tika/parser/Parser.java index 44882883a4..6aa9dfa564 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/Parser.java +++ b/tika-core/src/main/java/org/apache/tika/parser/Parser.java @@ -20,22 +20,18 @@ import java.io.InputStream; import java.io.Serializable; import java.util.Set; - -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; -/** - * Tika parser interface. - */ +/** Tika parser interface. */ public interface Parser extends Serializable { /** - * Returns the set of media types supported by this parser when used - * with the given parse context. + * Returns the set of media types supported by this parser when used with the given parse + * context. * * @param context parse context * @return immutable set of media types @@ -44,26 +40,24 @@ public interface Parser extends Serializable { Set getSupportedTypes(ParseContext context); /** - * Parses a document stream into a sequence of XHTML SAX events. - * Fills in related document metadata in the given metadata object. - *

- * The given document stream is consumed but not closed by this method. - * The responsibility to close the stream remains on the caller. - *

- * Information about the parsing context can be passed in the context - * parameter. See the parser implementations for the kinds of context - * information they expect. + * Parses a document stream into a sequence of XHTML SAX events. Fills in related document + * metadata in the given metadata object. + * + *

The given document stream is consumed but not closed by this method. The responsibility to + * close the stream remains on the caller. + * + *

Information about the parsing context can be passed in the context parameter. See the + * parser implementations for the kinds of context information they expect. * - * @param stream the document stream (input) - * @param handler handler for the XHTML SAX events (output) + * @param stream the document stream (input) + * @param handler handler for the XHTML SAX events (output) * @param metadata document metadata (input and output) - * @param context parse context - * @throws IOException if the document stream could not be read - * @throws SAXException if the SAX events could not be processed + * @param context parse context + * @throws IOException if the document stream could not be read + * @throws SAXException if the SAX events could not be processed * @throws TikaException if the document could not be parsed * @since Apache Tika 0.5 */ void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException; - } diff --git a/tika-core/src/main/java/org/apache/tika/parser/ParserDecorator.java b/tika-core/src/main/java/org/apache/tika/parser/ParserDecorator.java index 32d6661fb3..7e0f655fb5 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/ParserDecorator.java +++ b/tika-core/src/main/java/org/apache/tika/parser/ParserDecorator.java @@ -21,34 +21,30 @@ import java.util.Collection; import java.util.HashSet; import java.util.Set; - -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; import org.apache.tika.mime.MediaTypeRegistry; import org.apache.tika.parser.multiple.AbstractMultipleParser.MetadataPolicy; import org.apache.tika.parser.multiple.FallbackParser; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; /** * Decorator base class for the {@link Parser} interface. - *

This class simply delegates all parsing calls to an underlying decorated - * parser instance. Subclasses can provide extra decoration by overriding the - * parse method. - *

To decorate several different parsers at the same time, wrap them in - * a {@link CompositeParser} instance first. + * + *

This class simply delegates all parsing calls to an underlying decorated parser instance. + * Subclasses can provide extra decoration by overriding the parse method. + * + *

To decorate several different parsers at the same time, wrap them in a {@link CompositeParser} + * instance first. */ public class ParserDecorator implements Parser { - /** - * Serial version UID - */ + /** Serial version UID */ private static final long serialVersionUID = -3861669115439125268L; - /** - * The decorated parser instance. - */ + + /** The decorated parser instance. */ private final Parser parser; /** @@ -61,11 +57,11 @@ public ParserDecorator(Parser parser) { } /** - * Decorates the given parser so that it always claims to support - * parsing of the given media types. + * Decorates the given parser so that it always claims to support parsing of the given media + * types. * * @param parser the parser to be decorated - * @param types supported media types + * @param types supported media types * @return the decorated parser */ public static final Parser withTypes(Parser parser, final Set types) { @@ -85,10 +81,10 @@ public String getDecorationName() { } /** - * Decorates the given parser so that it never claims to support - * parsing of the given media types, but will work for all others. + * Decorates the given parser so that it never claims to support parsing of the given media + * types, but will work for all others. 
* - * @param parser the parser to be decorated + * @param parser the parser to be decorated * @param excludeTypes excluded/ignored media types * @return the decorated parser */ @@ -99,8 +95,7 @@ public static final Parser withoutTypes(Parser parser, final Set excl @Override public Set getSupportedTypes(ParseContext context) { // Get our own, writable copy of the types the parser supports - Set parserTypes = - new HashSet<>(super.getSupportedTypes(context)); + Set parserTypes = new HashSet<>(super.getSupportedTypes(context)); // Remove anything on our excludes list parserTypes.removeAll(excludeTypes); // Return whatever is left @@ -115,14 +110,14 @@ public String getDecorationName() { } /** - * Decorates the given parsers into a virtual parser, where they'll - * be tried in preference order until one works without error. + * Decorates the given parsers into a virtual parser, where they'll be tried in preference order + * until one works without error. * * @deprecated This has been replaced by {@link FallbackParser} */ @Deprecated - public static final Parser withFallbacks(final Collection parsers, - final Set types) { + public static final Parser withFallbacks( + final Collection parsers, final Set types) { // Delegate to the new FallbackParser for now, until people upgrade // Keep old behaviour on metadata, which was to preseve all MediaTypeRegistry registry = MediaTypeRegistry.getDefaultRegistry(); @@ -135,21 +130,22 @@ public static final Parser withFallbacks(final Collection pars } /** - * Delegates the method call to the decorated parser. Subclasses should - * override this method (and use super.getSupportedTypes() - * to invoke the decorated parser) to implement extra decoration. + * Delegates the method call to the decorated parser. Subclasses should override this method + * (and use super.getSupportedTypes() to invoke the decorated parser) to implement + * extra decoration. 
*/ public Set getSupportedTypes(ParseContext context) { return parser.getSupportedTypes(context); } /** - * Delegates the method call to the decorated parser. Subclasses should - * override this method (and use super.parse() to invoke - * the decorated parser) to implement extra decoration. + * Delegates the method call to the decorated parser. Subclasses should override this method + * (and use super.parse() to invoke the decorated parser) to implement extra + * decoration. */ - public void parse(InputStream stream, ContentHandler handler, Metadata metadata, - ParseContext context) throws IOException, SAXException, TikaException { + public void parse( + InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) + throws IOException, SAXException, TikaException { parser.parse(stream, handler, metadata, context); } diff --git a/tika-core/src/main/java/org/apache/tika/parser/ParserFactory.java b/tika-core/src/main/java/org/apache/tika/parser/ParserFactory.java index af541b0582..4df3a13d47 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/ParserFactory.java +++ b/tika-core/src/main/java/org/apache/tika/parser/ParserFactory.java @@ -17,13 +17,10 @@ package org.apache.tika.parser; - import java.io.IOException; import java.util.Map; - -import org.xml.sax.SAXException; - import org.apache.tika.exception.TikaException; +import org.xml.sax.SAXException; public abstract class ParserFactory { @@ -34,5 +31,4 @@ public ParserFactory(Map args) { } public abstract Parser build() throws IOException, SAXException, TikaException; - } diff --git a/tika-core/src/main/java/org/apache/tika/parser/ParserPostProcessor.java b/tika-core/src/main/java/org/apache/tika/parser/ParserPostProcessor.java index 308fa7ebc0..7d3b55d95f 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/ParserPostProcessor.java +++ b/tika-core/src/main/java/org/apache/tika/parser/ParserPostProcessor.java @@ -18,21 +18,18 @@ import java.io.IOException; import 
java.io.InputStream; - -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.sax.BodyContentHandler; import org.apache.tika.sax.TeeContentHandler; import org.apache.tika.utils.RegexUtils; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; /** - * Parser decorator that post-processes the results from a decorated parser. - * The post-processing takes care of filling in the "fulltext", "summary", - * and "outlinks" metadata entries based on the full text content returned by - * the decorated parser. + * Parser decorator that post-processes the results from a decorated parser. The post-processing + * takes care of filling in the "fulltext", "summary", and "outlinks" metadata entries based on the + * full text content returned by the decorated parser. */ public class ParserPostProcessor extends ParserDecorator { @@ -46,11 +43,11 @@ public ParserPostProcessor(Parser parser) { } /** - * Forwards the call to the delegated parser and post-processes the - * results as described above. + * Forwards the call to the delegated parser and post-processes the results as described above. 
*/ - public void parse(InputStream stream, ContentHandler handler, Metadata metadata, - ParseContext context) throws IOException, SAXException, TikaException { + public void parse( + InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) + throws IOException, SAXException, TikaException { ContentHandler body = new BodyContentHandler(); ContentHandler tee = new TeeContentHandler(handler, body); super.parse(stream, tee, metadata, context); @@ -65,5 +62,4 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, metadata.add("outlinks", link); } } - } diff --git a/tika-core/src/main/java/org/apache/tika/parser/ParsingReader.java b/tika-core/src/main/java/org/apache/tika/parser/ParsingReader.java index fe98e746de..b8f560c813 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/ParsingReader.java +++ b/tika-core/src/main/java/org/apache/tika/parser/ParsingReader.java @@ -29,59 +29,42 @@ import java.nio.file.Files; import java.nio.file.Path; import java.util.concurrent.Executor; - -import org.xml.sax.ContentHandler; - import org.apache.tika.exception.ZeroByteFileException; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.sax.BodyContentHandler; +import org.xml.sax.ContentHandler; /** - * Reader for the text content from a given binary stream. This class - * uses a background parsing task with a {@link Parser} - * ({@link AutoDetectParser} by default) to parse the text content from - * a given input stream. The {@link BodyContentHandler} class and a pipe - * is used to convert the push-based SAX event stream to the pull-based - * character stream defined by the {@link Reader} interface. + * Reader for the text content from a given binary stream. This class uses a background parsing task + * with a {@link Parser} ({@link AutoDetectParser} by default) to parse the text content from a + * given input stream. 
The {@link BodyContentHandler} class and a pipe is used to convert the + * push-based SAX event stream to the pull-based character stream defined by the {@link Reader} + * interface. * * @since Apache Tika 0.2 */ public class ParsingReader extends Reader { - /** - * Parser instance used for parsing the given binary stream. - */ + /** Parser instance used for parsing the given binary stream. */ private final Parser parser; - /** - * Buffered read end of the pipe. - */ + /** Buffered read end of the pipe. */ private final Reader reader; - /** - * Write end of the pipe. - */ + /** Write end of the pipe. */ private final Writer writer; - /** - * The binary stream being parsed. - */ + /** The binary stream being parsed. */ private final InputStream stream; - /** - * Metadata associated with the document being parsed. - */ + /** Metadata associated with the document being parsed. */ private final Metadata metadata; - /** - * The parse context. - */ + /** The parse context. */ private final ParseContext context; - /** - * An exception (if any) thrown by the parsing thread. - */ + /** An exception (if any) thrown by the parsing thread. */ private transient Throwable throwable; /** @@ -96,11 +79,10 @@ public ParsingReader(InputStream stream) throws IOException { } /** - * Creates a reader for the text content of the given binary stream - * with the given name. + * Creates a reader for the text content of the given binary stream with the given name. 
* * @param stream binary stream - * @param name document name + * @param name document name * @throws IOException if the document can not be parsed */ public ParsingReader(InputStream stream, String name) throws IOException { @@ -113,7 +95,7 @@ public ParsingReader(InputStream stream, String name) throws IOException { * * @param path path * @throws FileNotFoundException if the given file does not exist - * @throws IOException if the document can not be parsed + * @throws IOException if the document can not be parsed */ public ParsingReader(Path path) throws IOException { this(Files.newInputStream(path), path.getFileName().toString()); @@ -124,7 +106,7 @@ public ParsingReader(Path path) throws IOException { * * @param file file * @throws FileNotFoundException if the given file does not exist - * @throws IOException if the document can not be parsed + * @throws IOException if the document can not be parsed * @see #ParsingReader(Path) */ public ParsingReader(File file) throws FileNotFoundException, IOException { @@ -132,56 +114,66 @@ public ParsingReader(File file) throws FileNotFoundException, IOException { } /** - * Creates a reader for the text content of the given binary stream - * with the given document metadata. The given parser is used for - * parsing. A new background thread is started for the parsing task. - *

- * The created reader will be responsible for closing the given stream. - * The stream and any associated resources will be closed at or before - * the time when the {@link #close()} method is called on this reader. + * Creates a reader for the text content of the given binary stream with the given document + * metadata. The given parser is used for parsing. A new background thread is started for the + * parsing task. + * + *

The created reader will be responsible for closing the given stream. The stream and any + * associated resources will be closed at or before the time when the {@link #close()} method is + * called on this reader. * - * @param parser parser instance - * @param stream binary stream + * @param parser parser instance + * @param stream binary stream * @param metadata document metadata * @throws IOException if the document can not be parsed */ - public ParsingReader(Parser parser, InputStream stream, final Metadata metadata, - ParseContext context) throws IOException { - this(parser, stream, metadata, context, command -> { - String name = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY); - if (name != null) { - name = "Apache Tika: " + name; - } else { - name = "Apache Tika"; - } - Thread thread = new Thread(command, name); - thread.setDaemon(true); - thread.start(); - }); + public ParsingReader( + Parser parser, InputStream stream, final Metadata metadata, ParseContext context) + throws IOException { + this( + parser, + stream, + metadata, + context, + command -> { + String name = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY); + if (name != null) { + name = "Apache Tika: " + name; + } else { + name = "Apache Tika"; + } + Thread thread = new Thread(command, name); + thread.setDaemon(true); + thread.start(); + }); } /** - * Creates a reader for the text content of the given binary stream - * with the given document metadata. The given parser is used for the - * parsing task that is run with the given executor. The given executor - * must run the parsing task asynchronously in a separate thread, - * since the current thread must return to the caller that can then - * consume the parsed text through the {@link Reader} interface. - *

- * The created reader will be responsible for closing the given stream. - * The stream and any associated resources will be closed at or before - * the time when the {@link #close()} method is called on this reader. + * Creates a reader for the text content of the given binary stream with the given document + * metadata. The given parser is used for the parsing task that is run with the given executor. + * The given executor must run the parsing task asynchronously in a separate thread, + * since the current thread must return to the caller that can then consume the parsed text + * through the {@link Reader} interface. + * + *

The created reader will be responsible for closing the given stream. The stream and any + * associated resources will be closed at or before the time when the {@link #close()} method is + * called on this reader. * - * @param parser parser instance - * @param stream binary stream + * @param parser parser instance + * @param stream binary stream * @param metadata document metadata - * @param context parsing context + * @param context parsing context * @param executor executor for the parsing task * @throws IOException if the document can not be parsed * @since Apache Tika 0.4 */ - public ParsingReader(Parser parser, InputStream stream, Metadata metadata, ParseContext context, - Executor executor) throws IOException { + public ParsingReader( + Parser parser, + InputStream stream, + Metadata metadata, + ParseContext context, + Executor executor) + throws IOException { this.parser = parser; PipedReader pipedReader = new PipedReader(); this.reader = new BufferedReader(pipedReader); @@ -203,8 +195,7 @@ public ParsingReader(Parser parser, InputStream stream, Metadata metadata, Parse } /** - * Utility method that returns a {@link Metadata} instance - * for a document with the given name. + * Utility method that returns a {@link Metadata} instance for a document with the given name. * * @param name resource name (or null) * @return metadata instance @@ -218,14 +209,14 @@ private static Metadata getMetadata(String name) { } /** - * Reads parsed text from the pipe connected to the parsing thread. - * Fails if the parsing thread has thrown an exception. + * Reads parsed text from the pipe connected to the parsing thread. Fails if the parsing thread + * has thrown an exception. 
* * @param cbuf character buffer - * @param off start offset within the buffer - * @param len maximum number of characters to read - * @throws IOException if the parsing thread has failed or - * if for some reason the pipe does not work properly + * @param off start offset within the buffer + * @param len maximum number of characters to read + * @throws IOException if the parsing thread has failed or if for some reason the pipe does not + * work properly */ @Override public int read(char[] cbuf, int off, int len) throws IOException { @@ -240,9 +231,9 @@ public int read(char[] cbuf, int off, int len) throws IOException { } /** - * Closes the read end of the pipe. If the parsing thread is still - * running, next write to the pipe will fail and cause the thread - * to stop. Thus there is no need to explicitly terminate the thread. + * Closes the read end of the pipe. If the parsing thread is still running, next write to the + * pipe will fail and cause the thread to stop. Thus there is no need to explicitly terminate + * the thread. * * @throws IOException if the pipe can not be closed */ @@ -251,16 +242,13 @@ public void close() throws IOException { reader.close(); } - /** - * The background parsing task. - */ + /** The background parsing task. */ private class ParsingTask implements Runnable { /** - * Parses the given binary stream and writes the text content - * to the write end of the pipe. Potential exceptions (including - * the one caused if the read end is closed unexpectedly) are - * stored before the input stream is closed and processing is stopped. + * Parses the given binary stream and writes the text content to the write end of the pipe. + * Potential exceptions (including the one caused if the read end is closed unexpectedly) + * are stored before the input stream is closed and processing is stopped. 
*/ public void run() { try { @@ -286,7 +274,5 @@ public void run() { } } } - } - } diff --git a/tika-core/src/main/java/org/apache/tika/parser/PasswordProvider.java b/tika-core/src/main/java/org/apache/tika/parser/PasswordProvider.java index b14baddcae..61dd2b3d11 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/PasswordProvider.java +++ b/tika-core/src/main/java/org/apache/tika/parser/PasswordProvider.java @@ -19,23 +19,19 @@ import org.apache.tika.metadata.Metadata; /** - * Interface for providing a password to a Parser for handling Encrypted - * and Password Protected Documents. - * An implementation of this should be set on the {@link ParseContext} - * supplied to {@link Parser#parse(java.io.InputStream, org.xml.sax.ContentHandler, - * Metadata, ParseContext)} - * to provide a way to get the document password. - * An implementation of this interface defines some specific selection - * or lookup criteria, to be applied against the document metadata passed - * to the {@link #getPassword(Metadata)} method. + * Interface for providing a password to a Parser for handling Encrypted and Password Protected + * Documents. An implementation of this should be set on the {@link ParseContext} supplied to {@link + * Parser#parse(java.io.InputStream, org.xml.sax.ContentHandler, Metadata, ParseContext)} to provide + * a way to get the document password. An implementation of this interface defines some specific + * selection or lookup criteria, to be applied against the document metadata passed to the {@link + * #getPassword(Metadata)} method. * * @since Apache Tika 1.1 */ public interface PasswordProvider { /** - * Looks up the password for a document with the given metadata, - * and returns it for the Parser. If no password is available - * for the document, will return null. + * Looks up the password for a document with the given metadata, and returns it for the Parser. + * If no password is available for the document, will return null. 
* * @param metadata document metadata * @return The document decryption password, or null if not known diff --git a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java index 3cb78d5207..6037893cf6 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java +++ b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java @@ -19,11 +19,6 @@ import java.io.IOException; import java.io.InputStream; import java.util.Set; - -import org.xml.sax.Attributes; -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.exception.CorruptedFileException; import org.apache.tika.exception.EncryptedDocumentException; import org.apache.tika.exception.TikaException; @@ -42,49 +37,44 @@ import org.apache.tika.sax.WriteLimiter; import org.apache.tika.utils.ExceptionUtils; import org.apache.tika.utils.ParserUtils; +import org.xml.sax.Attributes; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; /** - * This is a helper class that wraps a parser in a recursive handler. - * It takes care of setting the embedded parser in the ParseContext - * and handling the embedded path calculations. - *

- * After parsing a document, call getMetadata() to retrieve a list of - * Metadata objects, one for each embedded resource. The first item - * in the list will contain the Metadata for the outer container file. - *

- * Content can also be extracted and stored in the {@link TikaCoreProperties#TIKA_CONTENT} field - * of a Metadata object. Select the type of content to be stored - * at initialization. - *

- * If a WriteLimitReachedException is encountered, the wrapper will stop - * processing the current resource, and it will not process - * any of the child resources for the given resource. However, it will try to - * parse as much as it can. If a WLRE is reached in the parent document, - * no child resources will be parsed. - *

- * The implementation is based on Jukka's RecursiveMetadataParser - * and Nick's additions. See: - * RecursiveMetadataParser. - *

- * Note that this wrapper holds all data in memory and is not appropriate - * for files with content too large to be held in memory. - *

- * The unit tests for this class are in the tika-parsers module. - *

+ * This is a helper class that wraps a parser in a recursive handler. It takes care of setting the + * embedded parser in the ParseContext and handling the embedded path calculations. + * + *

After parsing a document, call getMetadata() to retrieve a list of Metadata objects, one for + * each embedded resource. The first item in the list will contain the Metadata for the outer + * container file. + * + *

Content can also be extracted and stored in the {@link TikaCoreProperties#TIKA_CONTENT} field + * of a Metadata object. Select the type of content to be stored at initialization. + * + *

If a WriteLimitReachedException is encountered, the wrapper will stop processing the current + * resource, and it will not process any of the child resources for the given resource. However, it + * will try to parse as much as it can. If a WLRE is reached in the parent document, no child + * resources will be parsed. + * + *

The implementation is based on Jukka's RecursiveMetadataParser and Nick's additions. See: RecursiveMetadataParser. + * + *

Note that this wrapper holds all data in memory and is not appropriate for files with content + * too large to be held in memory. + * + *

The unit tests for this class are in the tika-parsers module. */ public class RecursiveParserWrapper extends ParserDecorator { - /** - * Generated serial version - */ + /** Generated serial version */ private static final long serialVersionUID = 9086536568120690938L; - private final boolean catchEmbeddedExceptions; /** - * Initialize the wrapper with {@link #catchEmbeddedExceptions} set - * to true as default. + * Initialize the wrapper with {@link #catchEmbeddedExceptions} set to true as + * default. * * @param wrappedParser parser to use for the container documents and the embedded documents */ @@ -93,29 +83,26 @@ public RecursiveParserWrapper(Parser wrappedParser) { } /** - * @param wrappedParser parser to wrap - * @param catchEmbeddedExceptions whether or not to catch+record embedded exceptions. - * If set to false, embedded exceptions will be - * thrown and the rest of the file will not be parsed. The - * following will not be ignored: - * {@link CorruptedFileException}, {@link RuntimeException} + * @param wrappedParser parser to wrap + * @param catchEmbeddedExceptions whether or not to catch+record embedded exceptions. If set to + * false, embedded exceptions will be thrown and the rest of the file will not + * be parsed. 
The following will not be ignored: {@link CorruptedFileException}, {@link + * RuntimeException} */ public RecursiveParserWrapper(Parser wrappedParser, boolean catchEmbeddedExceptions) { super(wrappedParser); this.catchEmbeddedExceptions = catchEmbeddedExceptions; } - @Override public Set getSupportedTypes(ParseContext context) { return getWrappedParser().getSupportedTypes(context); } - /** * @param stream - * @param recursiveParserWrapperHandler -- handler must implement - * {@link RecursiveParserWrapperHandler} + * @param recursiveParserWrapperHandler -- handler must implement {@link + * RecursiveParserWrapperHandler} * @param metadata * @param context * @throws IOException @@ -124,14 +111,18 @@ public Set getSupportedTypes(ParseContext context) { * @throws IllegalStateException if the handler is not a {@link RecursiveParserWrapperHandler} */ @Override - public void parse(InputStream stream, ContentHandler recursiveParserWrapperHandler, - Metadata metadata, ParseContext context) + public void parse( + InputStream stream, + ContentHandler recursiveParserWrapperHandler, + Metadata metadata, + ParseContext context) throws IOException, SAXException, TikaException { - //this tracks the state of the parent parser, per call to #parse + // this tracks the state of the parent parser, per call to #parse ParserState parserState; if (recursiveParserWrapperHandler instanceof AbstractRecursiveParserWrapperHandler) { - parserState = new ParserState( - (AbstractRecursiveParserWrapperHandler) recursiveParserWrapperHandler); + parserState = + new ParserState( + (AbstractRecursiveParserWrapperHandler) recursiveParserWrapperHandler); } else { throw new IllegalStateException( "ContentHandler must implement RecursiveParserWrapperHandler"); @@ -149,17 +140,18 @@ public void parse(InputStream stream, ContentHandler recursiveParserWrapperHandl if (recursiveParserWrapperHandler instanceof AbstractRecursiveParserWrapperHandler) { ContentHandlerFactory factory = - 
((AbstractRecursiveParserWrapperHandler)recursiveParserWrapperHandler).getContentHandlerFactory(); + ((AbstractRecursiveParserWrapperHandler) recursiveParserWrapperHandler) + .getContentHandlerFactory(); if (factory instanceof WriteLimiter) { - writeLimit = ((WriteLimiter)factory).getWriteLimit(); - throwOnWriteLimitReached = ((WriteLimiter)factory).isThrowOnWriteLimitReached(); + writeLimit = ((WriteLimiter) factory).getWriteLimit(); + throwOnWriteLimitReached = ((WriteLimiter) factory).isThrowOnWriteLimitReached(); } } try { TikaInputStream tis = TikaInputStream.get(stream, tmp, metadata); RecursivelySecureContentHandler secureContentHandler = - new RecursivelySecureContentHandler(localHandler, tis, writeLimit, - throwOnWriteLimitReached, context); + new RecursivelySecureContentHandler( + localHandler, tis, writeLimit, throwOnWriteLimitReached, context); context.set(RecursivelySecureContentHandler.class, secureContentHandler); getWrappedParser().parse(tis, secureContentHandler, metadata, context); } catch (Throwable e) { @@ -193,13 +185,12 @@ private String getResourceName(Metadata metadata, ParserState state) { } else { objectName = "embedded-" + (++state.unknownCount); } - //make sure that there isn't any path info in the objectName - //some parsers can return paths, not just file names + // make sure that there isn't any path info in the objectName + // some parsers can return paths, not just file names objectName = FilenameUtils.getName(objectName); return objectName; } - private class EmbeddedParserDecorator extends StatefulParser { private static final long serialVersionUID = 207648200464263337L; @@ -208,9 +199,8 @@ private class EmbeddedParserDecorator extends StatefulParser { private String embeddedIdPath = null; - - private EmbeddedParserDecorator(Parser parser, String location, - String embeddedIdPath, ParserState parseState) { + private EmbeddedParserDecorator( + Parser parser, String location, String embeddedIdPath, ParserState parseState) { 
super(parser); this.location = location; if (!this.location.endsWith("/")) { @@ -221,10 +211,11 @@ private EmbeddedParserDecorator(Parser parser, String location, } @Override - public void parse(InputStream stream, ContentHandler ignore, Metadata metadata, - ParseContext context) throws IOException, SAXException, TikaException { + public void parse( + InputStream stream, ContentHandler ignore, Metadata metadata, ParseContext context) + throws IOException, SAXException, TikaException { - //Test to see if we should avoid parsing + // Test to see if we should avoid parsing if (parserState.recursiveParserWrapperHandler.hasHitMaximumEmbeddedResources()) { return; } @@ -235,25 +226,26 @@ public void parse(InputStream stream, ContentHandler ignore, Metadata metadata, metadata.add(TikaCoreProperties.EMBEDDED_RESOURCE_PATH, objectLocation); String idPath = - this.embeddedIdPath.equals("/") ? - this.embeddedIdPath + ++parserState.embeddedCount : - this.embeddedIdPath + "/" + ++parserState.embeddedCount; + this.embeddedIdPath.equals("/") + ? 
this.embeddedIdPath + ++parserState.embeddedCount + : this.embeddedIdPath + "/" + ++parserState.embeddedCount; metadata.add(TikaCoreProperties.EMBEDDED_ID_PATH, idPath); metadata.set(TikaCoreProperties.EMBEDDED_ID, parserState.embeddedCount); - //get a fresh handler + // get a fresh handler ContentHandler localHandler = parserState.recursiveParserWrapperHandler.getNewContentHandler(); parserState.recursiveParserWrapperHandler.startEmbeddedDocument(localHandler, metadata); Parser preContextParser = context.get(Parser.class); - context.set(Parser.class, - new EmbeddedParserDecorator(getWrappedParser(), objectLocation, - idPath, parserState)); + context.set( + Parser.class, + new EmbeddedParserDecorator( + getWrappedParser(), objectLocation, idPath, parserState)); long started = System.currentTimeMillis(); RecursivelySecureContentHandler secureContentHandler = context.get(RecursivelySecureContentHandler.class); - //store the handler that was used before this parse - //so that you can return it back to its state at the end of this parse + // store the handler that was used before this parse + // so that you can return it back to its state at the end of this parse ContentHandler preContextHandler = secureContentHandler.handler; secureContentHandler.updateContentHandler(localHandler); @@ -276,9 +268,9 @@ public void parse(InputStream stream, ContentHandler ignore, Metadata metadata, if (e instanceof EncryptedDocumentException) { metadata.set(TikaCoreProperties.IS_ENCRYPTED, true); } - if (context.get(ZeroByteFileException.IgnoreZeroByteFileException.class) != null && - e instanceof ZeroByteFileException) { - //do nothing + if (context.get(ZeroByteFileException.IgnoreZeroByteFileException.class) != null + && e instanceof ZeroByteFileException) { + // do nothing } else if (catchEmbeddedExceptions) { ParserUtils.recordParserFailure(this, e, metadata); } else { @@ -289,20 +281,21 @@ public void parse(InputStream stream, ContentHandler ignore, Metadata metadata, 
secureContentHandler.updateContentHandler(preContextHandler); long elapsedMillis = System.currentTimeMillis() - started; metadata.set(TikaCoreProperties.PARSE_TIME_MILLIS, Long.toString(elapsedMillis)); - parserState.recursiveParserWrapperHandler - .endEmbeddedDocument(localHandler, metadata); + parserState.recursiveParserWrapperHandler.endEmbeddedDocument( + localHandler, metadata); } } } /** - * This tracks the state of the parse of a single document. - * In future versions, this will allow the RecursiveParserWrapper to be thread safe. + * This tracks the state of the parse of a single document. In future versions, this will allow + * the RecursiveParserWrapper to be thread safe. */ private static class ParserState { private final AbstractRecursiveParserWrapperHandler recursiveParserWrapperHandler; private int unknownCount = 0; - private int embeddedCount = 0;//this is effectively 1-indexed + private int embeddedCount = 0; // this is effectively 1-indexed + private ParserState(AbstractRecursiveParserWrapperHandler handler) { this.recursiveParserWrapperHandler = handler; } @@ -311,7 +304,7 @@ private ParserState(AbstractRecursiveParserWrapperHandler handler) { static class RecursivelySecureContentHandler extends SecureContentHandler { private ContentHandler handler; - //total allowable chars across all handlers + // total allowable chars across all handlers private final int totalWriteLimit; private final boolean throwOnWriteLimitReached; @@ -320,11 +313,15 @@ static class RecursivelySecureContentHandler extends SecureContentHandler { private boolean writeLimitReached = false; - //total chars written to all handlers + // total chars written to all handlers private int totalChars = 0; - public RecursivelySecureContentHandler(ContentHandler handler, TikaInputStream stream, - int totalWriteLimit, - boolean throwOnWriteLimitReached, ParseContext parseContext) { + + public RecursivelySecureContentHandler( + ContentHandler handler, + TikaInputStream stream, + int 
totalWriteLimit, + boolean throwOnWriteLimitReached, + ParseContext parseContext) { super(handler, stream); this.handler = handler; this.totalWriteLimit = totalWriteLimit; @@ -339,11 +336,10 @@ public void updateContentHandler(ContentHandler handler) { /** * Bypass the SecureContentHandler... - *

- * This handler only looks at zip bomb via zip expansion. - * Users should be protected within entries against nested - * nested xml entities. We don't want to carry - * those stats _across_ entries. + * + *

This handler only looks at zip bomb via zip expansion. Users should be protected + * within entries against nested nested xml entities. We don't want to carry those stats + * _across_ entries. * * @param uri * @param localName diff --git a/tika-core/src/main/java/org/apache/tika/parser/RegexCaptureParser.java b/tika-core/src/main/java/org/apache/tika/parser/RegexCaptureParser.java index 412673b70e..c6bbabf9e1 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/RegexCaptureParser.java +++ b/tika-core/src/main/java/org/apache/tika/parser/RegexCaptureParser.java @@ -28,10 +28,6 @@ import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; - -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.config.Field; import org.apache.tika.config.Initializable; import org.apache.tika.config.InitializableProblemHandler; @@ -40,6 +36,8 @@ import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; public class RegexCaptureParser implements Parser, Initializable { @@ -50,15 +48,11 @@ public class RegexCaptureParser implements Parser, Initializable { private Map matchMap = new HashMap<>(); @Override - public void initialize(Map params) throws TikaConfigException { - - } + public void initialize(Map params) throws TikaConfigException {} @Override public void checkInitialization(InitializableProblemHandler problemHandler) - throws TikaConfigException { - - } + throws TikaConfigException {} @Override public Set getSupportedTypes(ParseContext context) { @@ -68,10 +62,11 @@ public Set getSupportedTypes(ParseContext context) { private boolean writeContent = false; @Override - public void parse(InputStream stream, ContentHandler handler, Metadata metadata, - ParseContext context) throws IOException, SAXException, TikaException { - try (BufferedReader reader = new 
BufferedReader(new InputStreamReader(stream, - StandardCharsets.UTF_8))) { + public void parse( + InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) + throws IOException, SAXException, TikaException { + try (BufferedReader reader = + new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8))) { String line = reader.readLine(); Map localCaptureMap = new HashMap(); for (Map.Entry e : captureMap.entrySet()) { diff --git a/tika-core/src/main/java/org/apache/tika/parser/RenderingParser.java b/tika-core/src/main/java/org/apache/tika/parser/RenderingParser.java index 0daae6be12..7babf8897d 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/RenderingParser.java +++ b/tika-core/src/main/java/org/apache/tika/parser/RenderingParser.java @@ -13,12 +13,11 @@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. - */package org.apache.tika.parser; + */ package org.apache.tika.parser; import org.apache.tika.renderer.Renderer; public interface RenderingParser { void setRenderer(Renderer renderer); - } diff --git a/tika-core/src/main/java/org/apache/tika/parser/StatefulParser.java b/tika-core/src/main/java/org/apache/tika/parser/StatefulParser.java index 0fb657b4dd..242895087b 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/StatefulParser.java +++ b/tika-core/src/main/java/org/apache/tika/parser/StatefulParser.java @@ -17,15 +17,12 @@ package org.apache.tika.parser; /** - * The RecursiveParserWrapper wraps the parser sent - * into the parsecontext and then uses that parser + * The RecursiveParserWrapper wraps the parser sent into the parsecontext and then uses that parser * to store state (among many other things). - *

- * There are some use cases where regular parsers - * want to parse content inline (e.g. OCR), and their - * output should not be treated as coming from an embedded - * object. - **/ + * + *

There are some use cases where regular parsers want to parse content inline (e.g. OCR), and + * their output should not be treated as coming from an embedded object. + */ public class StatefulParser extends ParserDecorator { /** diff --git a/tika-core/src/main/java/org/apache/tika/parser/digest/CompositeDigester.java b/tika-core/src/main/java/org/apache/tika/parser/digest/CompositeDigester.java index ee4dfe233d..b3b6ca5a6e 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/digest/CompositeDigester.java +++ b/tika-core/src/main/java/org/apache/tika/parser/digest/CompositeDigester.java @@ -19,7 +19,6 @@ import java.io.IOException; import java.io.InputStream; - import org.apache.tika.exception.TikaException; import org.apache.tika.io.TemporaryResources; import org.apache.tika.io.TikaInputStream; @@ -27,7 +26,6 @@ import org.apache.tika.parser.DigestingParser; import org.apache.tika.parser.ParseContext; - public class CompositeDigester implements DigestingParser.Digester { private final DigestingParser.Digester[] digesters; diff --git a/tika-core/src/main/java/org/apache/tika/parser/digest/InputStreamDigester.java b/tika-core/src/main/java/org/apache/tika/parser/digest/InputStreamDigester.java index c3e4fde2cb..e1707958f2 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/digest/InputStreamDigester.java +++ b/tika-core/src/main/java/org/apache/tika/parser/digest/InputStreamDigester.java @@ -24,7 +24,6 @@ import java.security.MessageDigest; import java.security.NoSuchAlgorithmException; import java.security.Provider; - import org.apache.tika.exception.TikaException; import org.apache.tika.io.BoundedInputStream; import org.apache.tika.io.TemporaryResources; @@ -47,19 +46,19 @@ public InputStreamDigester(int markLimit, String algorithm, DigestingParser.Enco } /** - * @param markLimit limit in bytes to allow for mark/reset. If the inputstream is longer - * than this limit, the stream will be reset and then spooled to a - * temporary file. 
- * Throws IllegalArgumentException if < 0. - * @param algorithm name of the digest algorithm to retrieve from the Provider - * @param algorithmKeyName name of the algorithm to store - * as part of the key in the metadata - * when {@link #digest(InputStream, Metadata, ParseContext)} is called - * @param encoder encoder to convert the byte array returned from the digester to a - * string + * @param markLimit limit in bytes to allow for mark/reset. If the inputstream is longer than + * this limit, the stream will be reset and then spooled to a temporary file. Throws + * IllegalArgumentException if < 0. + * @param algorithm name of the digest algorithm to retrieve from the Provider + * @param algorithmKeyName name of the algorithm to store as part of the key in the metadata + * when {@link #digest(InputStream, Metadata, ParseContext)} is called + * @param encoder encoder to convert the byte array returned from the digester to a string */ - public InputStreamDigester(int markLimit, String algorithm, String algorithmKeyName, - DigestingParser.Encoder encoder) { + public InputStreamDigester( + int markLimit, + String algorithm, + String algorithmKeyName, + DigestingParser.Encoder encoder) { this.algorithm = algorithm; this.algorithmKeyName = algorithmKeyName; this.encoder = encoder; @@ -70,11 +69,9 @@ public InputStreamDigester(int markLimit, String algorithm, String algorithmKeyN } } - /** - * Copied from commons-codec - */ - private static MessageDigest updateDigest(MessageDigest digest, InputStream data, Metadata metadata) - throws IOException { + /** Copied from commons-codec */ + private static MessageDigest updateDigest( + MessageDigest digest, InputStream data, Metadata metadata) throws IOException { byte[] buffer = new byte[1024]; long total = 0; for (int read = data.read(buffer, 0, 1024); read > -1; read = data.read(buffer, 0, 1024)) { @@ -87,7 +84,7 @@ private static MessageDigest updateDigest(MessageDigest digest, InputStream data private static void 
setContentLength(long length, Metadata metadata) { if (StringUtils.isBlank(metadata.get(Metadata.CONTENT_LENGTH))) { - //only add it if it hasn't been populated already + // only add it if it hasn't been populated already metadata.set(Metadata.CONTENT_LENGTH, Long.toString(length)); } } @@ -106,21 +103,20 @@ private MessageDigest newMessageDigest() { } /** - * When subclassing this, becare to ensure that your provider is - * thread-safe (not likely) or return a new provider with each call. + * When subclassing this, becare to ensure that your provider is thread-safe (not likely) or + * return a new provider with each call. * - * @return provider to use to get the MessageDigest from the algorithm name. - * Default is to return null. + * @return provider to use to get the MessageDigest from the algorithm name. Default is to + * return null. */ protected Provider getProvider() { return null; } /** - * @param is InputStream to digest. Best to use a TikaInputStream because - * of potential need to spool to disk. InputStream must - * support mark/reset. - * @param metadata metadata in which to store the digest information + * @param is InputStream to digest. Best to use a TikaInputStream because of potential need to + * spool to disk. InputStream must support mark/reset. + * @param metadata metadata in which to store the digest information * @param parseContext ParseContext -- not actually used yet, but there for future expansion * @throws IOException on IO problem or IllegalArgumentException if algorithm couldn't be found */ @@ -133,19 +129,18 @@ public void digest(InputStream is, Metadata metadata, ParseContext parseContext) if (tis.hasFile()) { sz = tis.getLength(); } - //if the inputstream has a file, - //and its size is greater than its mark limit, - //just digest the underlying file. + // if the inputstream has a file, + // and its size is greater than its mark limit, + // just digest the underlying file. 
if (sz > markLimit) { digestFile(tis.getFile(), sz, metadata); return; } } - - //try the usual mark/reset stuff. - //however, if you actually hit the bound, - //then stop and spool to file via TikaInputStream + // try the usual mark/reset stuff. + // however, if you actually hit the bound, + // then stop and spool to file via TikaInputStream BoundedInputStream bis = new BoundedInputStream(markLimit, is); boolean finishedStream = false; bis.mark(markLimit + 1); @@ -154,8 +149,8 @@ public void digest(InputStream is, Metadata metadata, ParseContext parseContext) if (finishedStream) { return; } - //if the stream wasn't finished -- if the stream was longer than the mark limit -- - //spool to File and digest that. + // if the stream wasn't finished -- if the stream was longer than the mark limit -- + // spool to File and digest that. if (tis != null) { digestFile(tis.getFile(), -1, metadata); } else { @@ -174,12 +169,14 @@ public void digest(InputStream is, Metadata metadata, ParseContext parseContext) } private String getMetadataKey() { - return TikaCoreProperties.TIKA_META_PREFIX + "digest" + - TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + algorithmKeyName; + return TikaCoreProperties.TIKA_META_PREFIX + + "digest" + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + algorithmKeyName; } private void digestFile(File f, long sz, Metadata m) throws IOException { - //only add it if it hasn't been populated already + // only add it if it hasn't been populated already if (StringUtils.isBlank(m.get(Metadata.CONTENT_LENGTH))) { if (sz < 0) { sz = f.length(); @@ -192,7 +189,7 @@ private void digestFile(File f, long sz, Metadata m) throws IOException { } /** - * @param is input stream to read from + * @param is input stream to read from * @param metadata metadata for reporting the digest * @return whether or not this finished the input stream * @throws IOException @@ -212,5 +209,4 @@ private boolean digestStream(InputStream is, Metadata metadata) throws IOExcepti 
metadata.set(getMetadataKey(), encoder.encode(digestBytes)); return true; } - } diff --git a/tika-core/src/main/java/org/apache/tika/parser/external/CompositeExternalParser.java b/tika-core/src/main/java/org/apache/tika/parser/external/CompositeExternalParser.java index 53cb7b7eac..da4cb6ecb0 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/external/CompositeExternalParser.java +++ b/tika-core/src/main/java/org/apache/tika/parser/external/CompositeExternalParser.java @@ -18,17 +18,15 @@ import java.io.IOException; import java.util.List; - import org.apache.tika.exception.TikaException; import org.apache.tika.mime.MediaTypeRegistry; import org.apache.tika.parser.CompositeParser; import org.apache.tika.parser.Parser; /** - * A Composite Parser that wraps up all the available External Parsers, - * and provides an easy way to access them. - * Parser that uses an external program (like catdoc or pdf2txt) to extract - * text content and metadata from a given document. + * A Composite Parser that wraps up all the available External Parsers, and provides an easy way to + * access them. Parser that uses an external program (like catdoc or pdf2txt) to extract text + * content and metadata from a given document. 
*/ public class CompositeExternalParser extends CompositeParser { private static final long serialVersionUID = 6962436916649024024L; diff --git a/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParser.java b/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParser.java index beeed1fd99..5282f79d98 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParser.java +++ b/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParser.java @@ -36,13 +36,7 @@ import java.util.concurrent.TimeoutException; import java.util.regex.Matcher; import java.util.regex.Pattern; - import org.apache.commons.io.IOUtils; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.exception.TikaException; import org.apache.tika.io.TemporaryResources; import org.apache.tika.io.TikaInputStream; @@ -51,57 +45,56 @@ import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.sax.XHTMLContentHandler; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; /** - * Parser that uses an external program (like catdoc or pdf2txt) to extract - * text content and metadata from a given document. + * Parser that uses an external program (like catdoc or pdf2txt) to extract text content and + * metadata from a given document. */ public class ExternalParser implements Parser { private static final Logger LOG = LoggerFactory.getLogger(ExternalParser.class); /** - * The token, which if present in the Command string, will - * be replaced with the input filename. + * The token, which if present in the Command string, will be replaced with the input filename. * Alternately, the input data can be streamed over STDIN. 
*/ public static final String INPUT_FILE_TOKEN = "${INPUT}"; + /** - * The token, which if present in the Command string, will - * be replaced with the output filename. + * The token, which if present in the Command string, will be replaced with the output filename. * Alternately, the output data can be collected on STDOUT. */ public static final String OUTPUT_FILE_TOKEN = "${OUTPUT}"; + private static final long serialVersionUID = -1079128990650687037L; - //make this parameterizable + // make this parameterizable private final long timeoutMs = 60000; - /** - * Media types supported by the external program. - */ + + /** Media types supported by the external program. */ private Set supportedTypes = Collections.emptySet(); - /** - * Regular Expressions to run over STDOUT to - * extract Metadata. - */ + /** Regular Expressions to run over STDOUT to extract Metadata. */ private Map metadataPatterns = null; + /** * The external command to invoke. * * @see Runtime#exec(String[]) */ - private String[] command = new String[]{"cat"}; - /** - * A consumer for ignored Lines - */ + private String[] command = new String[] {"cat"}; + + /** A consumer for ignored Lines */ private LineConsumer ignoredLineConsumer = LineConsumer.NULL; /** - * Starts a thread that reads and discards the contents of the - * standard stream of the given process. Potential exceptions - * are ignored, and the stream is closed once fully processed. - * Note: calling this starts a new thread and blocks the current(caller) - * thread until the new thread dies + * Starts a thread that reads and discards the contents of the standard stream of the given + * process. Potential exceptions are ignored, and the stream is closed once fully processed. 
+ * Note: calling this starts a new thread and blocks the current(caller) thread until the new + * thread dies * * @param stream stream to be ignored */ @@ -110,25 +103,26 @@ private static void ignoreStream(final InputStream stream) { } /** - * Starts a thread that reads and discards the contents of the - * standard stream of the given process. Potential exceptions - * are ignored, and the stream is closed once fully processed. + * Starts a thread that reads and discards the contents of the standard stream of the given + * process. Potential exceptions are ignored, and the stream is closed once fully processed. * - * @param stream stream to sent to black hole (a k a null) - * @param waitForDeath when {@code true} the caller thread will be - * blocked till the death of new thread. + * @param stream stream to sent to black hole (a k a null) + * @param waitForDeath when {@code true} the caller thread will be blocked till the death of new + * thread. * @return The thread that is created and started */ private static Thread ignoreStream(final InputStream stream, boolean waitForDeath) { - Thread t = new Thread(() -> { - try { - IOUtils.copy(stream, NULL_OUTPUT_STREAM); - } catch (IOException e) { - //swallow - } finally { - IOUtils.closeQuietly(stream); - } - }); + Thread t = + new Thread( + () -> { + try { + IOUtils.copy(stream, NULL_OUTPUT_STREAM); + } catch (IOException e) { + // swallow + } finally { + IOUtils.closeQuietly(stream); + } + }); t.start(); if (waitForDeath) { try { @@ -140,20 +134,19 @@ private static Thread ignoreStream(final InputStream stream, boolean waitForDeat } /** - * Checks to see if the command can be run. Typically used with - * something like "myapp --version" to check to see if "myapp" - * is installed and on the path. + * Checks to see if the command can be run. Typically used with something like "myapp --version" + * to check to see if "myapp" is installed and on the path. 
* - * @param checkCmd The check command to run + * @param checkCmd The check command to run * @param errorValue What is considered an error value? */ public static boolean check(String checkCmd, int... errorValue) { - return check(new String[]{checkCmd}, errorValue); + return check(new String[] {checkCmd}, errorValue); } public static boolean check(String[] checkCmd, int... errorValue) { if (errorValue.length == 0) { - errorValue = new int[]{127}; + errorValue = new int[] {127}; } Process process = null; @@ -163,7 +156,7 @@ public static boolean check(String[] checkCmd, int... errorValue) { Thread stdOutSuckerThread = ignoreStream(process.getInputStream(), false); stdErrSuckerThread.join(); stdOutSuckerThread.join(); - //make the timeout parameterizable + // make the timeout parameterizable boolean finished = process.waitFor(60000, TimeUnit.MILLISECONDS); if (!finished) { throw new TimeoutException(); @@ -184,14 +177,15 @@ public static boolean check(String[] checkCmd, int... errorValue) { // External process execution is banned by the security manager throw se; } catch (Error err) { - if (err.getMessage() != null && (err.getMessage().contains("posix_spawn") || - err.getMessage().contains("UNIXProcess"))) { + if (err.getMessage() != null + && (err.getMessage().contains("posix_spawn") + || err.getMessage().contains("UNIXProcess"))) { LOG.debug("(TIKA-1526): exception trying to run: " + checkCmd[0], err); - //"Error forking command due to JVM locale bug - //(see TIKA-1526 and SOLR-6387)" + // "Error forking command due to JVM locale bug + // (see TIKA-1526 and SOLR-6387)" return false; } - //throw if a different kind of error + // throw if a different kind of error throw err; } finally { if (process != null) { @@ -217,9 +211,8 @@ public String[] getCommand() { } /** - * Sets the command to be run. This can include either of - * {@link #INPUT_FILE_TOKEN} or {@link #OUTPUT_FILE_TOKEN} - * if the command needs filenames. + * Sets the command to be run. 
This can include either of {@link #INPUT_FILE_TOKEN} or {@link + * #OUTPUT_FILE_TOKEN} if the command needs filenames. * * @see Runtime#exec(String[]) */ @@ -250,23 +243,21 @@ public Map getMetadataExtractionPatterns() { } /** - * Sets the map of regular expression patterns and Metadata - * keys. Any matching patterns will have the matching - * metadata entries set. - * Set this to null to disable Metadata extraction. + * Sets the map of regular expression patterns and Metadata keys. Any matching patterns will + * have the matching metadata entries set. Set this to null to disable Metadata extraction. */ public void setMetadataExtractionPatterns(Map patterns) { this.metadataPatterns = patterns; } /** - * Executes the configured external command and passes the given document - * stream as a simple XHTML document to the given SAX content handler. - * Metadata is only extracted if {@link #setMetadataExtractionPatterns(Map)} - * has been called to set patterns. + * Executes the configured external command and passes the given document stream as a simple + * XHTML document to the given SAX content handler. Metadata is only extracted if {@link + * #setMetadataExtractionPatterns(Map)} has been called to set patterns. 
*/ - public void parse(InputStream stream, ContentHandler handler, Metadata metadata, - ParseContext context) throws IOException, SAXException, TikaException { + public void parse( + InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) + throws IOException, SAXException, TikaException { XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); TemporaryResources tmp = new TemporaryResources(); @@ -277,8 +268,12 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, } } - private void parse(TikaInputStream stream, XHTMLContentHandler xhtml, Metadata metadata, - TemporaryResources tmp) throws IOException, SAXException, TikaException { + private void parse( + TikaInputStream stream, + XHTMLContentHandler xhtml, + Metadata metadata, + TemporaryResources tmp) + throws IOException, SAXException, TikaException { boolean inputToStdIn = true; boolean outputFromStdOut = true; boolean hasPatterns = (metadataPatterns != null && !metadataPatterns.isEmpty()); @@ -360,14 +355,14 @@ private void parse(TikaInputStream stream, XHTMLContentHandler xhtml, Metadata m } /** - * Starts a thread that extracts the contents of the standard output - * stream of the given process to the given XHTML content handler. - * The standard output stream is closed once fully processed. + * Starts a thread that extracts the contents of the standard output stream of the given process + * to the given XHTML content handler. The standard output stream is closed once fully + * processed. 
* * @param stream - * @param xhtml XHTML content handler + * @param xhtml XHTML content handler * @throws SAXException if the XHTML SAX events could not be handled - * @throws IOException if an input error occurred + * @throws IOException if an input error occurred */ private void extractOutput(InputStream stream, XHTMLContentHandler xhtml) throws SAXException, IOException { @@ -384,24 +379,25 @@ private void extractOutput(InputStream stream, XHTMLContentHandler xhtml) } /** - * Starts a thread that sends the contents of the given input stream - * to the standard input stream of the given process. Potential - * exceptions are ignored, and the standard input stream is closed - * once fully processed. Note that the given input stream is not - * closed by this method. + * Starts a thread that sends the contents of the given input stream to the standard input + * stream of the given process. Potential exceptions are ignored, and the standard input stream + * is closed once fully processed. Note that the given input stream is not closed by + * this method. 
* * @param process process - * @param stream input stream + * @param stream input stream */ private void sendInput(final Process process, final InputStream stream) { - Thread t = new Thread(() -> { - OutputStream stdin = process.getOutputStream(); - try { - IOUtils.copy(stream, stdin); - } catch (IOException e) { - //swallow - } - }); + Thread t = + new Thread( + () -> { + OutputStream stdin = process.getOutputStream(); + try { + IOUtils.copy(stream, stdin); + } catch (IOException e) { + // swallow + } + }); t.start(); try { t.join(); @@ -410,36 +406,39 @@ private void sendInput(final Process process, final InputStream stream) { } private void extractMetadata(final InputStream stream, final Metadata metadata) { - Thread t = new Thread(() -> { - BufferedReader reader; - reader = new BufferedReader(new InputStreamReader(stream, UTF_8)); - try { - String line; - while ((line = reader.readLine()) != null) { - boolean consumed = false; - for (Map.Entry entry : metadataPatterns.entrySet()) { - Matcher m = entry.getKey().matcher(line); - if (m.find()) { - consumed = true; - if (entry.getValue() != null && - !entry.getValue().equals("")) { - metadata.add(entry.getValue(), m.group(1)); - } else { - metadata.add(m.group(1), m.group(2)); + Thread t = + new Thread( + () -> { + BufferedReader reader; + reader = new BufferedReader(new InputStreamReader(stream, UTF_8)); + try { + String line; + while ((line = reader.readLine()) != null) { + boolean consumed = false; + for (Map.Entry entry : + metadataPatterns.entrySet()) { + Matcher m = entry.getKey().matcher(line); + if (m.find()) { + consumed = true; + if (entry.getValue() != null + && !entry.getValue().equals("")) { + metadata.add(entry.getValue(), m.group(1)); + } else { + metadata.add(m.group(1), m.group(2)); + } + } + } + if (!consumed) { + ignoredLineConsumer.consume(line); + } + } + } catch (IOException e) { + // Ignore + } finally { + IOUtils.closeQuietly(reader); + IOUtils.closeQuietly(stream); } - } - } - if 
(!consumed) { - ignoredLineConsumer.consume(line); - } - } - } catch (IOException e) { - // Ignore - } finally { - IOUtils.closeQuietly(reader); - IOUtils.closeQuietly(stream); - } - }); + }); t.start(); try { t.join(); @@ -453,12 +452,11 @@ private void extractMetadata(final InputStream stream, final Metadata metadata) * @since Apache Tika 1.14 */ public interface LineConsumer extends Serializable { - /** - * A null consumer - */ - LineConsumer NULL = line -> { - // ignores - }; + /** A null consumer */ + LineConsumer NULL = + line -> { + // ignores + }; /** * Consume a line @@ -467,6 +465,4 @@ public interface LineConsumer extends Serializable { */ void consume(String line); } - - } diff --git a/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersConfigReader.java b/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersConfigReader.java index 3c79fd3e06..0d14d2fd5f 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersConfigReader.java +++ b/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersConfigReader.java @@ -27,7 +27,10 @@ import java.util.StringTokenizer; import java.util.regex.Pattern; import javax.xml.parsers.DocumentBuilder; - +import org.apache.tika.exception.TikaException; +import org.apache.tika.mime.MediaType; +import org.apache.tika.mime.MimeTypeException; +import org.apache.tika.utils.XMLReaderUtils; import org.w3c.dom.Document; import org.w3c.dom.Element; import org.w3c.dom.Node; @@ -35,16 +38,9 @@ import org.xml.sax.InputSource; import org.xml.sax.SAXException; -import org.apache.tika.exception.TikaException; -import org.apache.tika.mime.MediaType; -import org.apache.tika.mime.MimeTypeException; -import org.apache.tika.utils.XMLReaderUtils; - /** - * Builds up ExternalParser instances based on XML file(s) - * which define what to run, for what, and how to process - * any output metadata. 
- * Typically used to configure up a series of external programs + * Builds up ExternalParser instances based on XML file(s) which define what to run, for what, and + * how to process any output metadata. Typically used to configure up a series of external programs * (like catdoc or pdf2txt) to extract text content from documents. * *

@@ -86,16 +82,17 @@ public static List read(Element element) throws TikaException, I
             }
         } else {
             throw new MimeTypeException(
-                    "Not a <" + EXTERNAL_PARSERS_TAG + "/> configuration document: " +
-                            (element != null ? element.getTagName() : "n/a"));
+                    "Not a <"
+                            + EXTERNAL_PARSERS_TAG
+                            + "/> configuration document: "
+                            + (element != null ? element.getTagName() : "n/a"));
         }
 
         return parsers;
     }
 
     /**
-     * Builds and Returns an ExternalParser, or null if a check
-     * command was given that didn't match.
+     * Builds and returns an ExternalParser, or null if a check command was given that didn't match.
      */
     private static ExternalParser readParser(Element parserDef) throws TikaException {
         ExternalParser parser = new ExternalParser();
@@ -122,7 +119,8 @@ private static ExternalParser readParser(Element parserDef) throws TikaException
                         parser.setMetadataExtractionPatterns(readMetadataPatterns(child));
                         break;
                     default:
-                        throw new IllegalArgumentException("reaction not defined for " + child.getTagName());
+                        throw new IllegalArgumentException(
+                                "reaction not defined for " + child.getTagName());
                 }
             }
         }
@@ -186,7 +184,7 @@ private static boolean readCheckTagAndCheck(Element checkDef) {
                             String s = st.nextToken();
                             errorVals.add(Integer.parseInt(s));
                         } catch (NumberFormatException e) {
-                            //swallow
+                            // swallow
                         }
                     }
                 }
diff --git a/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersConfigReaderMetKeys.java b/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersConfigReaderMetKeys.java
index 86369c6cd7..dadb5baee1 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersConfigReaderMetKeys.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersConfigReaderMetKeys.java
@@ -16,9 +16,7 @@
  */
 package org.apache.tika.parser.external;
 
-/**
- * Met Keys used by the {@link ExternalParsersConfigReader}.
- */
+/** Met Keys used by the {@link ExternalParsersConfigReader}. */
 public interface ExternalParsersConfigReaderMetKeys {
 
     String EXTERNAL_PARSERS_TAG = "external-parsers";
diff --git a/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersFactory.java b/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersFactory.java
index 561cbe7d00..f4137ac718 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersFactory.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersFactory.java
@@ -24,7 +24,6 @@
 import java.util.Enumeration;
 import java.util.List;
 import java.util.Map;
-
 import org.apache.tika.config.ServiceLoader;
 import org.apache.tika.config.TikaConfig;
 import org.apache.tika.exception.TikaException;
@@ -33,8 +32,7 @@
 import org.apache.tika.parser.Parser;
 
 /**
- * Creates instances of ExternalParser based on XML
- * configuration files.
+ * Creates instances of ExternalParser based on XML configuration files.
  *
  * @see ExternalParsersConfigReader
  */
@@ -52,8 +50,9 @@ public static List create(ServiceLoader loader)
     public static List create(String filename, ServiceLoader loader)
             throws IOException, TikaException {
         String filepath =
-                ExternalParsersFactory.class.getPackage().getName().replace('.', '/') + "/" +
-                        filename;
+                ExternalParsersFactory.class.getPackage().getName().replace('.', '/')
+                        + "/"
+                        + filename;
         Enumeration files = loader.findServiceResources(filepath);
         ArrayList list = Collections.list(files);
         URL[] urls = list.toArray(new URL[0]);
diff --git a/tika-core/src/main/java/org/apache/tika/parser/external/package-info.java b/tika-core/src/main/java/org/apache/tika/parser/external/package-info.java
index 4ee27b9d65..8c9e2ae1bd 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/external/package-info.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/external/package-info.java
@@ -15,8 +15,6 @@
  * limitations under the License.
  */
 
-/**
- * External parser process.
- */
+/** External parser process. */
 @aQute.bnd.annotation.Version("1.0.0")
 package org.apache.tika.parser.external;
diff --git a/tika-core/src/main/java/org/apache/tika/parser/external2/ExternalParser.java b/tika-core/src/main/java/org/apache/tika/parser/external2/ExternalParser.java
index 5dbea57f04..273f6f1af5 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/external2/ExternalParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/external2/ExternalParser.java
@@ -29,12 +29,6 @@
 import java.util.Set;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
 import org.apache.tika.config.Field;
 import org.apache.tika.config.Initializable;
 import org.apache.tika.config.InitializableProblemHandler;
@@ -54,14 +48,17 @@
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.apache.tika.utils.FileProcessResult;
 import org.apache.tika.utils.ProcessUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
 
 /**
- * This is a next generation external parser that uses some of the more
- * recent additions to Tika. This is an experimental alternative to the
- * {@link org.apache.tika.parser.external.ExternalParser}.
- * Specifically, it relies more on configuration than the SPI model.
- * Further, users can specify a parser to handle the output
- * of the external process.
+ * This is a next generation external parser that uses some of the more recent additions to Tika.
+ * This is an experimental alternative to the {@link
+ * org.apache.tika.parser.external.ExternalParser}. Specifically, it relies more on configuration
+ * than the SPI model. Further, users can specify a parser to handle the output of the external
+ * process.
  */
 public class ExternalParser implements Parser, Initializable {
 
@@ -98,9 +95,10 @@ public Set getSupportedTypes(ParseContext context) {
     }
 
     @Override
-    public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
-                      ParseContext context) throws IOException, SAXException, TikaException {
-        //this may remain null, depending on whether the external parser writes to a file
+    public void parse(
+            InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        // this may remain null, depending on whether the external parser writes to a file
         Path outFile = null;
         try (TemporaryResources tmp = new TemporaryResources()) {
             TikaInputStream tis = TikaInputStream.get(stream, tmp, metadata);
@@ -111,13 +109,18 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
             boolean outputFileInCommandline = false;
             for (String c : commandLine) {
                 if (inputMatcher.reset(c).find()) {
-                    String updated = c.replace(INPUT_FILE_TOKEN,
-                            ProcessUtils.escapeCommandLine(p.toAbsolutePath().toString()));
+                    String updated =
+                            c.replace(
+                                    INPUT_FILE_TOKEN,
+                                    ProcessUtils.escapeCommandLine(p.toAbsolutePath().toString()));
                     thisCommandLine.add(updated);
                 } else if (outputMatcher.reset(c).find()) {
                     outFile = Files.createTempFile("tika-external2-", "");
-                    String updated = c.replace(OUTPUT_FILE_TOKEN,
-                            ProcessUtils.escapeCommandLine(outFile.toAbsolutePath().toString()));
+                    String updated =
+                            c.replace(
+                                    OUTPUT_FILE_TOKEN,
+                                    ProcessUtils.escapeCommandLine(
+                                            outFile.toAbsolutePath().toString()));
                     thisCommandLine.add(updated);
                     outputFileInCommandline = true;
                 } else {
@@ -127,21 +130,27 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
             FileProcessResult result = null;
             long localTimeoutMillis = TikaTaskTimeout.getTimeoutMillis(context, timeoutMs);
             if (outputFileInCommandline) {
-                result = ProcessUtils.execute(new ProcessBuilder(thisCommandLine),
-                        localTimeoutMillis, maxStdOut, maxStdErr);
+                result =
+                        ProcessUtils.execute(
+                                new ProcessBuilder(thisCommandLine),
+                                localTimeoutMillis,
+                                maxStdOut,
+                                maxStdErr);
             } else {
                 outFile = Files.createTempFile("tika-external2-", "");
-                result = ProcessUtils.execute(new ProcessBuilder(thisCommandLine),
-                        localTimeoutMillis, outFile, maxStdErr);
+                result =
+                        ProcessUtils.execute(
+                                new ProcessBuilder(thisCommandLine),
+                                localTimeoutMillis,
+                                outFile,
+                                maxStdErr);
             }
             metadata.set(ExternalProcess.IS_TIMEOUT, result.isTimeout());
             metadata.set(ExternalProcess.EXIT_VALUE, result.getExitValue());
             metadata.set(ExternalProcess.STD_OUT_LENGTH, result.getStdoutLength());
-            metadata.set(ExternalProcess.STD_OUT_IS_TRUNCATED,
-                    result.isStdoutTruncated());
+            metadata.set(ExternalProcess.STD_OUT_IS_TRUNCATED, result.isStdoutTruncated());
             metadata.set(ExternalProcess.STD_ERR_LENGTH, result.getStderrLength());
-            metadata.set(ExternalProcess.STD_ERR_IS_TRUNCATED,
-                    result.isStderrTruncated());
+            metadata.set(ExternalProcess.STD_ERR_IS_TRUNCATED, result.isStderrTruncated());
 
             if (returnStdout) {
                 metadata.set(ExternalProcess.STD_OUT, result.getStdout());
@@ -160,23 +169,26 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
         }
     }
 
-    private void handleOutput(FileProcessResult result, Path outFile,
-                              XHTMLContentHandler xhtml, Metadata metadata,
-                              ParseContext parseContext) throws SAXException, TikaException,
-            IOException {
+    private void handleOutput(
+            FileProcessResult result,
+            Path outFile,
+            XHTMLContentHandler xhtml,
+            Metadata metadata,
+            ParseContext parseContext)
+            throws SAXException, TikaException, IOException {
         if (outputParser == EmptyParser.INSTANCE) {
             if (outFile != null) {
                 try (BufferedReader reader = Files.newBufferedReader(outFile)) {
                     String line = reader.readLine();
                     while (line != null) {
-                        //do we want to wrap this in 

elements? + // do we want to wrap this in

elements? xhtml.characters(line); xhtml.newline(); line = reader.readLine(); } } } else { - //read this in line by line and wrap

elements? + // read this in line by line and wrap

elements? xhtml.characters(result.getStdout()); } } else { @@ -185,18 +197,17 @@ private void handleOutput(FileProcessResult result, Path outFile, outputParser.parse(is, new BodyContentHandler(xhtml), metadata, parseContext); } } else { - try (InputStream is = TikaInputStream.get( - result.getStdout().getBytes(StandardCharsets.UTF_8))) { + try (InputStream is = + TikaInputStream.get(result.getStdout().getBytes(StandardCharsets.UTF_8))) { outputParser.parse(is, new BodyContentHandler(xhtml), metadata, parseContext); } } } - } /** - * This is set during initialization from a tika-config. - * Any calls after initialization will result in a {@link IllegalStateException}. + * This is set during initialization from a tika-config. Any calls after initialization will + * result in a {@link IllegalStateException}. * * @param supportedTypes */ @@ -226,9 +237,8 @@ public void setMaxStdOut(int maxStdOut) { } /** - * Use this to specify the full commandLine. The commandline must - * include at least {@link ExternalParser#INPUT_FILE_TOKEN}. - * If the external process writes to an output file, specify + * Use this to specify the full commandLine. The commandline must include at least {@link + * ExternalParser#INPUT_FILE_TOKEN}. If the external process writes to an output file, specify * {@link ExternalParser#OUTPUT_FILE_TOKEN}. * * @param commandLine @@ -238,12 +248,10 @@ public void setCommandLine(List commandLine) { this.commandLine = commandLine; } - /** - * If set to true, this will return the stdout in the metadata - * via {@link org.apache.tika.metadata.ExternalProcess#STD_OUT}. - * Default is false because this should normally - * be handled by the outputParser + * If set to true, this will return the stdout in the metadata via {@link + * org.apache.tika.metadata.ExternalProcess#STD_OUT}. 
Default is false because this + * should normally be handled by the outputParser * * @param returnStdout */ @@ -253,9 +261,9 @@ public void setReturnStdout(boolean returnStdout) { } /** - * If set to true, this will return the stderr in the metadata - * via {@link org.apache.tika.metadata.ExternalProcess#STD_ERR}. - * Default is true + * If set to true, this will return the stderr in the metadata via {@link + * org.apache.tika.metadata.ExternalProcess#STD_ERR}. Default is true + * * @param returnStderr */ @Field @@ -264,10 +272,10 @@ public void setReturnStderr(boolean returnStderr) { } /** - * This parser is called on the output of the process. - * If the process writes to an output file, specified by - * {@link ExternalParser#OUTPUT_FILE_TOKEN}, this parser will parse that file, + * This parser is called on the output of the process. If the process writes to an output file, + * specified by {@link ExternalParser#OUTPUT_FILE_TOKEN}, this parser will parse that file, * otherwise it will parse the UTF-8 encoded bytes from the process' STD_OUT. 
+ * * @param parser */ @Field @@ -281,7 +289,7 @@ public Parser getOutputParser() { @Override public void initialize(Map params) throws TikaConfigException { - //no-op + // no-op } @Override @@ -295,9 +303,9 @@ public void checkInitialization(InitializableProblemHandler problemHandler) } if (outputParser == EmptyParser.INSTANCE) { - LOG.debug("no parser selected for the output; contents will be " + - "written to the content handler"); + LOG.debug( + "no parser selected for the output; contents will be " + + "written to the content handler"); } } - } diff --git a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java index 9f2ea8a3b2..c2a9125191 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java +++ b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java @@ -30,10 +30,6 @@ import java.util.List; import java.util.Map; import java.util.Set; - -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.config.Param; import org.apache.tika.exception.TikaException; import org.apache.tika.io.TemporaryResources; @@ -46,56 +42,55 @@ import org.apache.tika.parser.ParserDecorator; import org.apache.tika.sax.ContentHandlerFactory; import org.apache.tika.utils.ParserUtils; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; /** - * Abstract base class for parser wrappers which may / will - * process a given stream multiple times, merging the results - * of the various parsers used. - * End users should normally use {@link FallbackParser} or - * {@link SupplementingParser} along with a Strategy. - * Note that unless you give a {@link ContentHandlerFactory}, - * you'll get content from every parser tried mushed together! 
+ * Abstract base class for parser wrappers which may / will process a given stream multiple times, + * merging the results of the various parsers used. End users should normally use {@link + * FallbackParser} or {@link SupplementingParser} along with a Strategy. Note that unless you give a + * {@link ContentHandlerFactory}, you'll get content from every parser tried mushed together! * * @since Apache Tika 1.18 */ public abstract class AbstractMultipleParser implements Parser { protected static final String METADATA_POLICY_CONFIG_KEY = "metadataPolicy"; - /** - * Serial version UID. - */ + + /** Serial version UID. */ private static final long serialVersionUID = 5383668090329836559L; - /** - * How we should handle metadata clashes - */ + + /** How we should handle metadata clashes */ private final MetadataPolicy policy; - /** - * List of the multiple parsers to try. - */ + + /** List of the multiple parsers to try. */ private final Collection parsers; + /** - * Computed list of Mime Types to offer, which is all - * those in common between the parsers. - * For explicit mimetypes only, use a {@link ParserDecorator} + * Computed list of Mime Types to offer, which is all those in common between the parsers. For + * explicit mimetypes only, use a {@link ParserDecorator} */ private final Set offeredTypes; - /** - * Media type registry. - */ + + /** Media type registry. */ private MediaTypeRegistry registry; @SuppressWarnings("rawtypes") - public AbstractMultipleParser(MediaTypeRegistry registry, Collection parsers, - Map params) { + public AbstractMultipleParser( + MediaTypeRegistry registry, + Collection parsers, + Map params) { this(registry, getMetadataPolicy(params), parsers); } - public AbstractMultipleParser(MediaTypeRegistry registry, MetadataPolicy policy, - Parser... parsers) { + public AbstractMultipleParser( + MediaTypeRegistry registry, MetadataPolicy policy, Parser... 
parsers) { this(registry, policy, Arrays.asList(parsers)); } - public AbstractMultipleParser(MediaTypeRegistry registry, MetadataPolicy policy, - Collection parsers) { + public AbstractMultipleParser( + MediaTypeRegistry registry, + MetadataPolicy policy, + Collection parsers) { this.policy = policy; this.parsers = parsers; this.registry = registry; @@ -117,8 +112,8 @@ protected static MetadataPolicy getMetadataPolicy(Map params) { "Required parameter '" + METADATA_POLICY_CONFIG_KEY + "' not supplied"); } - protected static Metadata mergeMetadata(Metadata newMetadata, Metadata lastMetadata, - MetadataPolicy policy) { + protected static Metadata mergeMetadata( + Metadata newMetadata, Metadata lastMetadata, MetadataPolicy policy) { if (policy == MetadataPolicy.DISCARD_ALL) { return newMetadata; } @@ -211,56 +206,60 @@ public List getAllParsers() { return Collections.unmodifiableList(new ArrayList<>(parsers)); } - /** - * Used to allow implementations to prepare or change things - * before parsing occurs - */ - protected void parserPrepare(Parser parser, Metadata metadata, ParseContext context) { - } + /** Used to allow implementations to prepare or change things before parsing occurs */ + protected void parserPrepare(Parser parser, Metadata metadata, ParseContext context) {} /** - * Used to notify implementations that a Parser has Finished - * or Failed, and to allow them to decide to continue or - * abort further parsing + * Used to notify implementations that a Parser has Finished or Failed, and to allow them to + * decide to continue or abort further parsing */ - protected abstract boolean parserCompleted(Parser parser, Metadata metadata, - ContentHandler handler, ParseContext context, - Exception exception); + protected abstract boolean parserCompleted( + Parser parser, + Metadata metadata, + ContentHandler handler, + ParseContext context, + Exception exception); /** - * Processes the given Stream through one or more parsers, - * resetting things between parsers 
as requested by policy. - * The actual processing is delegated to one or more {@link Parser}s. - *

- * Note that you'll get text from every parser this way, to have - * control of which content is from which parser you need to - * call the method with a {@link ContentHandlerFactory} instead. + * Processes the given Stream through one or more parsers, resetting things between parsers as + * requested by policy. The actual processing is delegated to one or more {@link Parser}s. + * + *

Note that you'll get text from every parser this way, to have control of which content is + * from which parser you need to call the method with a {@link ContentHandlerFactory} instead. */ @Override - public void parse(InputStream stream, ContentHandler handler, Metadata metadata, - ParseContext context) throws IOException, SAXException, TikaException { + public void parse( + InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) + throws IOException, SAXException, TikaException { parse(stream, handler, null, metadata, context); } /** - * Processes the given Stream through one or more parsers, - * resetting things between parsers as requested by policy. - * The actual processing is delegated to one or more {@link Parser}s. - * You will get one ContentHandler fetched for each Parser used. - * TODO Do we need to return all the ContentHandler instances we created? + * Processes the given Stream through one or more parsers, resetting things between parsers as + * requested by policy. The actual processing is delegated to one or more {@link Parser}s. You + * will get one ContentHandler fetched for each Parser used. TODO Do we need to return all the + * ContentHandler instances we created? 
* - * @deprecated The {@link ContentHandlerFactory} override is still experimental - * and the method signature is subject to change before Tika 2.0 + * @deprecated The {@link ContentHandlerFactory} override is still experimental and the method + * signature is subject to change before Tika 2.0 */ @Deprecated - public void parse(InputStream stream, ContentHandlerFactory handlers, Metadata metadata, - ParseContext context) throws IOException, SAXException, TikaException { + public void parse( + InputStream stream, + ContentHandlerFactory handlers, + Metadata metadata, + ParseContext context) + throws IOException, SAXException, TikaException { parse(stream, null, handlers, metadata, context); } - private void parse(InputStream stream, ContentHandler handler, - ContentHandlerFactory handlerFactory, Metadata originalMetadata, - ParseContext context) throws IOException, SAXException, TikaException { + private void parse( + InputStream stream, + ContentHandler handler, + ContentHandlerFactory handlerFactory, + Metadata originalMetadata, + ParseContext context) + throws IOException, SAXException, TikaException { // Track the metadata between parsers, so we can apply our policy Metadata lastMetadata = cloneMetadata(originalMetadata); Metadata metadata = lastMetadata; @@ -270,7 +269,8 @@ private void parse(InputStream stream, ContentHandler handler, try { // Ensure we'll be able to re-read safely, buffering to disk if so, // to permit Parsers 2+ to be able to read the same data - InputStream taggedStream = ParserUtils.ensureStreamReReadable(stream, tmp, originalMetadata); + InputStream taggedStream = + ParserUtils.ensureStreamReReadable(stream, tmp, originalMetadata); for (Parser p : parsers) { // Get a new handler for this parser, if we can @@ -342,31 +342,20 @@ private void parse(InputStream stream, ContentHandler handler, } /** - * The various strategies for handling metadata emitted by - * multiple parsers. - * Note that not all will be supported by all subclasses. 
+ * The various strategies for handling metadata emitted by multiple parsers. Note that not all + * will be supported by all subclasses. */ public enum MetadataPolicy { - /** - * Before moving onto another parser, throw away - * all previously seen metadata - */ + /** Before moving onto another parser, throw away all previously seen metadata */ DISCARD_ALL, - /** - * The first parser to output a given key wins, - * merge in non-clashing other keys - */ + /** The first parser to output a given key wins, merge in non-clashing other keys */ FIRST_WINS, /** - * The last parser to output a given key wins, - * overriding previous parser values for a + * The last parser to output a given key wins, overriding previous parser values for a * clashing key. */ LAST_WINS, - /** - * Where multiple parsers output a given key, - * store all their different (unique) values - */ + /** Where multiple parsers output a given key, store all their different (unique) values */ KEEP_ALL } } diff --git a/tika-core/src/main/java/org/apache/tika/parser/multiple/FallbackParser.java b/tika-core/src/main/java/org/apache/tika/parser/multiple/FallbackParser.java index e538e596a9..78cd108220 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/multiple/FallbackParser.java +++ b/tika-core/src/main/java/org/apache/tika/parser/multiple/FallbackParser.java @@ -20,43 +20,41 @@ import java.util.Collection; import java.util.List; import java.util.Map; - -import org.xml.sax.ContentHandler; - import org.apache.tika.config.Param; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaTypeRegistry; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; +import org.xml.sax.ContentHandler; /** * Tries multiple parsers in turn, until one succeeds. - *

- * Can optionally keep Metadata from failed parsers when - * trying the next one, depending on the {@link AbstractMultipleParser.MetadataPolicy} - * chosen. + * + *

Can optionally keep Metadata from failed parsers when trying the next one, depending on the + * {@link AbstractMultipleParser.MetadataPolicy} chosen. * * @since Apache Tika 1.18 */ public class FallbackParser extends AbstractMultipleParser { - /** - * The different Metadata Policies we support (all) - */ + /** The different Metadata Policies we support (all) */ public static final List allowedPolicies = Arrays.asList(MetadataPolicy.values()); - /** - * Serial version UID. - */ + + /** Serial version UID. */ private static final long serialVersionUID = 5844409020977206167L; @SuppressWarnings("rawtypes") - public FallbackParser(MediaTypeRegistry registry, Collection parsers, - Map params) { + public FallbackParser( + MediaTypeRegistry registry, + Collection parsers, + Map params) { super(registry, parsers, params); } - public FallbackParser(MediaTypeRegistry registry, MetadataPolicy policy, - Collection parsers) { + public FallbackParser( + MediaTypeRegistry registry, + MetadataPolicy policy, + Collection parsers) { super(registry, policy, parsers); } @@ -65,12 +63,15 @@ public FallbackParser(MediaTypeRegistry registry, MetadataPolicy policy, Parser. 
} @Override - protected boolean parserCompleted(Parser parser, Metadata metadata, ContentHandler handler, - ParseContext context, Exception exception) { + protected boolean parserCompleted( + Parser parser, + Metadata metadata, + ContentHandler handler, + ParseContext context, + Exception exception) { // If there was no exception, abort further parsers return exception != null; // Have the next parser tried } } - diff --git a/tika-core/src/main/java/org/apache/tika/parser/multiple/SupplementingParser.java b/tika-core/src/main/java/org/apache/tika/parser/multiple/SupplementingParser.java index 8cf83c019f..213d8c5357 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/multiple/SupplementingParser.java +++ b/tika-core/src/main/java/org/apache/tika/parser/multiple/SupplementingParser.java @@ -20,52 +20,49 @@ import java.util.Collection; import java.util.List; import java.util.Map; - -import org.xml.sax.ContentHandler; - import org.apache.tika.config.Param; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaTypeRegistry; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; +import org.xml.sax.ContentHandler; /** - * Runs the input stream through all available parsers, - * merging the metadata from them based on the + * Runs the input stream through all available parsers, merging the metadata from them based on the * {@link AbstractMultipleParser.MetadataPolicy} chosen. - *

- * Warning - currently only one Parser should output - * any Content to the {@link ContentHandler}, the rest - * should only output {@link Metadata}. A solution to - * multiple-content is still being worked on... + * + *

Warning - currently only one Parser should output any Content to the {@link ContentHandler}, + * the rest should only output {@link Metadata}. A solution to multiple-content is still being + * worked on... * * @since Apache Tika 1.18 */ public class SupplementingParser extends AbstractMultipleParser { - /** - * The different Metadata Policies we support (not discard) - */ + /** The different Metadata Policies we support (not discard) */ public static final List allowedPolicies = - Arrays.asList(MetadataPolicy.FIRST_WINS, MetadataPolicy.LAST_WINS, - MetadataPolicy.KEEP_ALL); - /** - * Serial version UID. - */ + Arrays.asList( + MetadataPolicy.FIRST_WINS, MetadataPolicy.LAST_WINS, MetadataPolicy.KEEP_ALL); + + /** Serial version UID. */ private static final long serialVersionUID = 313179254565350994L; @SuppressWarnings("rawtypes") - public SupplementingParser(MediaTypeRegistry registry, Collection parsers, - Map params) { + public SupplementingParser( + MediaTypeRegistry registry, + Collection parsers, + Map params) { super(registry, parsers, params); } - public SupplementingParser(MediaTypeRegistry registry, MetadataPolicy policy, - Parser... parsers) { + public SupplementingParser( + MediaTypeRegistry registry, MetadataPolicy policy, Parser... 
parsers) { this(registry, policy, Arrays.asList(parsers)); } - public SupplementingParser(MediaTypeRegistry registry, MetadataPolicy policy, - Collection parsers) { + public SupplementingParser( + MediaTypeRegistry registry, + MetadataPolicy policy, + Collection parsers) { super(registry, policy, parsers); // Ensure it's a supported policy @@ -76,8 +73,12 @@ public SupplementingParser(MediaTypeRegistry registry, MetadataPolicy policy, } @Override - protected boolean parserCompleted(Parser parser, Metadata metadata, ContentHandler handler, - ParseContext context, Exception exception) { + protected boolean parserCompleted( + Parser parser, + Metadata metadata, + ContentHandler handler, + ParseContext context, + Exception exception) { // If there was no exception, just carry on to the next if (exception == null) { return true; @@ -87,4 +88,3 @@ protected boolean parserCompleted(Parser parser, Metadata metadata, ContentHandl return true; } } - diff --git a/tika-core/src/main/java/org/apache/tika/parser/package-info.java b/tika-core/src/main/java/org/apache/tika/parser/package-info.java index 10df69e1e6..f536df40aa 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/package-info.java +++ b/tika-core/src/main/java/org/apache/tika/parser/package-info.java @@ -15,8 +15,6 @@ * limitations under the License. */ -/** - * Tika parsers. - */ +/** Tika parsers. 
*/ @aQute.bnd.annotation.Version("1.0.0") package org.apache.tika.parser; diff --git a/tika-core/src/main/java/org/apache/tika/pipes/CompositePipesReporter.java b/tika-core/src/main/java/org/apache/tika/pipes/CompositePipesReporter.java index f8dcffb641..1e01008192 100644 --- a/tika-core/src/main/java/org/apache/tika/pipes/CompositePipesReporter.java +++ b/tika-core/src/main/java/org/apache/tika/pipes/CompositePipesReporter.java @@ -20,7 +20,6 @@ import java.util.ArrayList; import java.util.List; import java.util.Map; - import org.apache.tika.config.Field; import org.apache.tika.config.Initializable; import org.apache.tika.config.InitializableProblemHandler; @@ -37,7 +36,6 @@ public void report(FetchEmitTuple t, PipesResult result, long elapsed) { for (PipesReporter reporter : pipesReporters) { reporter.report(t, result, elapsed); } - } @Override @@ -82,7 +80,7 @@ public List getPipesReporters() { @Override public void initialize(Map params) throws TikaConfigException { - //no-op + // no-op } @Override @@ -97,8 +95,8 @@ public void checkInitialization(InitializableProblemHandler problemHandler) } /** - * Tries to close all resources. Throws the last encountered IOException - * if any are thrown by the component reporters. + * Tries to close all resources. Throws the last encountered IOException if any are thrown by + * the component reporters. 
* * @throws IOException */ diff --git a/tika-core/src/main/java/org/apache/tika/pipes/FailedToStartClientException.java b/tika-core/src/main/java/org/apache/tika/pipes/FailedToStartClientException.java index fd49927e1f..f34bd0ea85 100644 --- a/tika-core/src/main/java/org/apache/tika/pipes/FailedToStartClientException.java +++ b/tika-core/src/main/java/org/apache/tika/pipes/FailedToStartClientException.java @@ -16,9 +16,7 @@ */ package org.apache.tika.pipes; -/** - * This should be catastrophic - */ +/** This should be catastrophic */ public class FailedToStartClientException extends RuntimeException { public FailedToStartClientException(Throwable t) { diff --git a/tika-core/src/main/java/org/apache/tika/pipes/FetchEmitTuple.java b/tika-core/src/main/java/org/apache/tika/pipes/FetchEmitTuple.java index 0c0334fd46..3d4f79d55d 100644 --- a/tika-core/src/main/java/org/apache/tika/pipes/FetchEmitTuple.java +++ b/tika-core/src/main/java/org/apache/tika/pipes/FetchEmitTuple.java @@ -18,7 +18,6 @@ import java.io.Serializable; import java.util.Objects; - import org.apache.tika.metadata.Metadata; import org.apache.tika.pipes.emitter.EmitKey; import org.apache.tika.pipes.extractor.EmbeddedDocumentBytesConfig; @@ -29,7 +28,8 @@ public class FetchEmitTuple implements Serializable { public static final ON_PARSE_EXCEPTION DEFAULT_ON_PARSE_EXCEPTION = ON_PARSE_EXCEPTION.EMIT; public enum ON_PARSE_EXCEPTION { - SKIP, EMIT + SKIP, + EMIT } private final String id; @@ -42,28 +42,61 @@ public enum ON_PARSE_EXCEPTION { private EmbeddedDocumentBytesConfig embeddedDocumentBytesConfig; public FetchEmitTuple(String id, FetchKey fetchKey, EmitKey emitKey) { - this(id, fetchKey, emitKey, new Metadata(), HandlerConfig.DEFAULT_HANDLER_CONFIG, + this( + id, + fetchKey, + emitKey, + new Metadata(), + HandlerConfig.DEFAULT_HANDLER_CONFIG, DEFAULT_ON_PARSE_EXCEPTION); } - public FetchEmitTuple(String id, FetchKey fetchKey, EmitKey emitKey, ON_PARSE_EXCEPTION onParseException) { - this(id, 
fetchKey, emitKey, new Metadata(), HandlerConfig.DEFAULT_HANDLER_CONFIG, + + public FetchEmitTuple( + String id, FetchKey fetchKey, EmitKey emitKey, ON_PARSE_EXCEPTION onParseException) { + this( + id, + fetchKey, + emitKey, + new Metadata(), + HandlerConfig.DEFAULT_HANDLER_CONFIG, onParseException); } public FetchEmitTuple(String id, FetchKey fetchKey, EmitKey emitKey, Metadata metadata) { - this(id, fetchKey, emitKey, metadata, HandlerConfig.DEFAULT_HANDLER_CONFIG, + this( + id, + fetchKey, + emitKey, + metadata, + HandlerConfig.DEFAULT_HANDLER_CONFIG, DEFAULT_ON_PARSE_EXCEPTION); } - public FetchEmitTuple(String id, FetchKey fetchKey, EmitKey emitKey, Metadata metadata, - HandlerConfig handlerConfig, ON_PARSE_EXCEPTION onParseException) { - this(id, fetchKey, emitKey, metadata, handlerConfig, onParseException, + public FetchEmitTuple( + String id, + FetchKey fetchKey, + EmitKey emitKey, + Metadata metadata, + HandlerConfig handlerConfig, + ON_PARSE_EXCEPTION onParseException) { + this( + id, + fetchKey, + emitKey, + metadata, + handlerConfig, + onParseException, EmbeddedDocumentBytesConfig.SKIP); } - public FetchEmitTuple(String id, FetchKey fetchKey, EmitKey emitKey, Metadata metadata, - HandlerConfig handlerConfig, ON_PARSE_EXCEPTION onParseException, - EmbeddedDocumentBytesConfig embeddedDocumentBytesConfig) { + public FetchEmitTuple( + String id, + FetchKey fetchKey, + EmitKey emitKey, + Metadata metadata, + HandlerConfig handlerConfig, + ON_PARSE_EXCEPTION onParseException, + EmbeddedDocumentBytesConfig embeddedDocumentBytesConfig) { this.id = id; this.fetchKey = fetchKey; this.emitKey = emitKey; @@ -76,6 +109,7 @@ public FetchEmitTuple(String id, FetchKey fetchKey, EmitKey emitKey, Metadata me public String getId() { return id; } + public FetchKey getFetchKey() { return fetchKey; } @@ -148,16 +182,32 @@ public int hashCode() { result = 31 * result + (metadata != null ? metadata.hashCode() : 0); result = 31 * result + (onParseException != null ? 
onParseException.hashCode() : 0); result = 31 * result + (handlerConfig != null ? handlerConfig.hashCode() : 0); - result = 31 * result + - (embeddedDocumentBytesConfig != null ? embeddedDocumentBytesConfig.hashCode() : 0); + result = + 31 * result + + (embeddedDocumentBytesConfig != null + ? embeddedDocumentBytesConfig.hashCode() + : 0); return result; } @Override public String toString() { - return "FetchEmitTuple{" + "id='" + id + '\'' + ", fetchKey=" + fetchKey + ", emitKey=" + - emitKey + ", metadata=" + metadata + ", onParseException=" + onParseException + - ", handlerConfig=" + handlerConfig + ", embeddedDocumentBytesConfig=" + - embeddedDocumentBytesConfig + '}'; + return "FetchEmitTuple{" + + "id='" + + id + + '\'' + + ", fetchKey=" + + fetchKey + + ", emitKey=" + + emitKey + + ", metadata=" + + metadata + + ", onParseException=" + + onParseException + + ", handlerConfig=" + + handlerConfig + + ", embeddedDocumentBytesConfig=" + + embeddedDocumentBytesConfig + + '}'; } } diff --git a/tika-core/src/main/java/org/apache/tika/pipes/HandlerConfig.java b/tika-core/src/main/java/org/apache/tika/pipes/HandlerConfig.java index d128dcb3dd..af77bdcf9e 100644 --- a/tika-core/src/main/java/org/apache/tika/pipes/HandlerConfig.java +++ b/tika-core/src/main/java/org/apache/tika/pipes/HandlerConfig.java @@ -19,31 +19,27 @@ import java.io.Serializable; import java.util.Locale; import java.util.Objects; - import org.apache.tika.sax.BasicContentHandlerFactory; public class HandlerConfig implements Serializable { - /** - * Serial version UID - */ + /** Serial version UID */ private static final long serialVersionUID = -3861669115439125268L; public static final HandlerConfig DEFAULT_HANDLER_CONFIG = - new HandlerConfig(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, PARSE_MODE.RMETA, - -1, -1, true); + new HandlerConfig( + BasicContentHandlerFactory.HANDLER_TYPE.TEXT, PARSE_MODE.RMETA, -1, -1, true); /** - * {@link PARSE_MODE#RMETA} "recursive metadata" is the same as the -J 
option - * in tika-app and the /rmeta endpoint in tika-server. Each embedded file is represented as - * its own metadata object. + * {@link PARSE_MODE#RMETA} "recursive metadata" is the same as the -J option in tika-app and + * the /rmeta endpoint in tika-server. Each embedded file is represented as its own metadata + * object. * - * {@link PARSE_MODE#CONCATENATE} is similar - * to the legacy tika-app behavior and the /tika endpoint (accept: application/json) in - * tika-server. This concatenates the - * contents of embedded files and returns a single metadata object for the file no - * matter how many embedded objects there are; this option throws away metadata from - * embedded objects and silently skips exceptions in embedded objects. + *

{@link PARSE_MODE#CONCATENATE} is similar to the legacy tika-app behavior and the /tika + * endpoint (accept: application/json) in tika-server. This concatenates the contents of + * embedded files and returns a single metadata object for the file no matter how many embedded + * objects there are; this option throws away metadata from embedded objects and silently skips + * exceptions in embedded objects. */ public enum PARSE_MODE { RMETA, @@ -63,8 +59,11 @@ public static PARSE_MODE parseMode(String modeString) { } sb.append(m.name().toLowerCase(Locale.US)); } - throw new IllegalArgumentException("mode must be one of: (" + sb + - "). I regret I do not understand: " + modeString); + throw new IllegalArgumentException( + "mode must be one of: (" + + sb + + "). I regret I do not understand: " + + modeString); } } @@ -77,10 +76,12 @@ public static PARSE_MODE parseMode(String modeString) { boolean throwOnWriteLimitReached = true; PARSE_MODE parseMode = PARSE_MODE.RMETA; - - public HandlerConfig(BasicContentHandlerFactory.HANDLER_TYPE type, PARSE_MODE parseMode, - int writeLimit, - int maxEmbeddedResources, boolean throwOnWriteLimitReached) { + public HandlerConfig( + BasicContentHandlerFactory.HANDLER_TYPE type, + PARSE_MODE parseMode, + int writeLimit, + int maxEmbeddedResources, + boolean throwOnWriteLimitReached) { this.type = type; this.parseMode = parseMode; this.writeLimit = writeLimit; @@ -117,21 +118,32 @@ public boolean equals(Object o) { return false; } HandlerConfig that = (HandlerConfig) o; - return writeLimit == that.writeLimit && maxEmbeddedResources == that.maxEmbeddedResources && - throwOnWriteLimitReached == that.throwOnWriteLimitReached && type == that.type && - parseMode == that.parseMode; + return writeLimit == that.writeLimit + && maxEmbeddedResources == that.maxEmbeddedResources + && throwOnWriteLimitReached == that.throwOnWriteLimitReached + && type == that.type + && parseMode == that.parseMode; } @Override public int hashCode() { - return 
Objects.hash(type, writeLimit, maxEmbeddedResources, throwOnWriteLimitReached, - parseMode); + return Objects.hash( + type, writeLimit, maxEmbeddedResources, throwOnWriteLimitReached, parseMode); } @Override public String toString() { - return "HandlerConfig{" + "type=" + type + ", writeLimit=" + writeLimit + - ", maxEmbeddedResources=" + maxEmbeddedResources + ", throwOnWriteLimitReached=" + - throwOnWriteLimitReached + ", parseMode=" + parseMode + '}'; + return "HandlerConfig{" + + "type=" + + type + + ", writeLimit=" + + writeLimit + + ", maxEmbeddedResources=" + + maxEmbeddedResources + + ", throwOnWriteLimitReached=" + + throwOnWriteLimitReached + + ", parseMode=" + + parseMode + + '}'; } } diff --git a/tika-core/src/main/java/org/apache/tika/pipes/LoggingPipesReporter.java b/tika-core/src/main/java/org/apache/tika/pipes/LoggingPipesReporter.java index 5f00880ba0..5262abbabc 100644 --- a/tika-core/src/main/java/org/apache/tika/pipes/LoggingPipesReporter.java +++ b/tika-core/src/main/java/org/apache/tika/pipes/LoggingPipesReporter.java @@ -16,13 +16,10 @@ */ package org.apache.tika.pipes; - import org.slf4j.Logger; import org.slf4j.LoggerFactory; -/** - * Simple PipesReporter that logs everything at the debug level. - */ +/** Simple PipesReporter that logs everything at the debug level. 
*/ public class LoggingPipesReporter extends PipesReporter { Logger LOGGER = LoggerFactory.getLogger(LoggingPipesReporter.class); diff --git a/tika-core/src/main/java/org/apache/tika/pipes/PipesClient.java b/tika-core/src/main/java/org/apache/tika/pipes/PipesClient.java index 52e72df854..64ea871a60 100644 --- a/tika-core/src/main/java/org/apache/tika/pipes/PipesClient.java +++ b/tika-core/src/main/java/org/apache/tika/pipes/PipesClient.java @@ -40,24 +40,21 @@ import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; import java.util.concurrent.atomic.AtomicInteger; - import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream; import org.apache.commons.io.output.UnsynchronizedByteArrayOutputStream; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.pipes.emitter.EmitData; import org.apache.tika.pipes.emitter.EmitKey; import org.apache.tika.utils.ProcessUtils; import org.apache.tika.utils.StringUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** - * The PipesClient is designed to be single-threaded. It only allots - * a single thread for {@link #process(FetchEmitTuple)} processing. - * See {@link org.apache.tika.pipes.async.AsyncProcessor} for handling - * multiple PipesClients. + * The PipesClient is designed to be single-threaded. It only allots a single thread for {@link + * #process(FetchEmitTuple)} processing. See {@link org.apache.tika.pipes.async.AsyncProcessor} for + * handling multiple PipesClients. 
*/ public class PipesClient implements Closeable { @@ -65,9 +62,9 @@ public class PipesClient implements Closeable { private static final int MAX_BYTES_BEFORE_READY = 20000; private static AtomicInteger CLIENT_COUNTER = new AtomicInteger(0); private static final long WAIT_ON_DESTROY_MS = 10000; - //this synchronizes the creation and/or closing of the executorService - //there are a number of assumptions throughout that PipesClient is run - //single threaded + // this synchronizes the creation and/or closing of the executorService + // there are a number of assumptions throughout that PipesClient is run + // single threaded private final Object[] executorServiceLock = new Object[0]; private final PipesConfigBase pipesConfig; private final int pipesClientId; @@ -110,7 +107,7 @@ public void close() throws IOException { try { destroyForcibly(); } catch (InterruptedException e) { - //swallow + // swallow } } synchronized (executorServiceLock) { @@ -125,10 +122,12 @@ public PipesResult process(FetchEmitTuple t) throws IOException, InterruptedExce boolean restart = false; if (!ping()) { restart = true; - } else if (pipesConfig.getMaxFilesProcessedPerProcess() > 0 && - filesProcessed >= pipesConfig.getMaxFilesProcessedPerProcess()) { - LOG.info("pipesClientId={}: restarting server after hitting max files: {}", - pipesClientId, filesProcessed); + } else if (pipesConfig.getMaxFilesProcessedPerProcess() > 0 + && filesProcessed >= pipesConfig.getMaxFilesProcessedPerProcess()) { + LOG.info( + "pipesClientId={}: restarting server after hitting max files: {}", + pipesClientId, + filesProcessed); restart = true; } if (restart) { @@ -138,8 +137,10 @@ public PipesResult process(FetchEmitTuple t) throws IOException, InterruptedExce restart(); successfulRestart = true; } catch (TimeoutException e) { - LOG.warn("pipesClientId={}: couldn't restart within {} ms (startupTimeoutMillis)", - pipesClientId, pipesConfig.getStartupTimeoutMillis()); + LOG.warn( + "pipesClientId={}: couldn't 
restart within {} ms (startupTimeoutMillis)", + pipesClientId, + pipesConfig.getStartupTimeoutMillis()); Thread.sleep(pipesConfig.getSleepOnStartupTimeoutMillis()); } } @@ -150,52 +151,58 @@ public PipesResult process(FetchEmitTuple t) throws IOException, InterruptedExce private PipesResult actuallyProcess(FetchEmitTuple t) throws InterruptedException { long start = System.currentTimeMillis(); final PipesResult[] intermediateResult = new PipesResult[1]; - FutureTask futureTask = new FutureTask<>(() -> { - - UnsynchronizedByteArrayOutputStream bos = UnsynchronizedByteArrayOutputStream.builder().get(); - try (ObjectOutputStream objectOutputStream = new ObjectOutputStream(bos)) { - objectOutputStream.writeObject(t); - } - - byte[] bytes = bos.toByteArray(); - output.write(CALL.getByte()); - output.writeInt(bytes.length); - output.write(bytes); - output.flush(); - if (LOG.isTraceEnabled()) { - LOG.trace("pipesClientId={}: timer -- write tuple: {} ms", - pipesClientId, - System.currentTimeMillis() - start); - } - long readStart = System.currentTimeMillis(); - if (Thread.currentThread().isInterrupted()) { - throw new InterruptedException("thread interrupt"); - } - PipesResult result = readResults(t, start); - while (result.getStatus().equals(PipesResult.STATUS.INTERMEDIATE_RESULT)) { - intermediateResult[0] = result; - result = readResults(t, start); - } - if (LOG.isDebugEnabled()) { - long elapsed = System.currentTimeMillis() - readStart; - LOG.debug("finished reading result in {} ms", elapsed); - } - - if (LOG.isTraceEnabled()) { - LOG.trace("pipesClientId={}: timer -- read result: {} ms", - pipesClientId, - System.currentTimeMillis() - readStart); - } - if (result.getStatus() == PipesResult.STATUS.OOM) { - return buildFatalResult(result, intermediateResult); - } - return result; - }); + FutureTask futureTask = + new FutureTask<>( + () -> { + UnsynchronizedByteArrayOutputStream bos = + UnsynchronizedByteArrayOutputStream.builder().get(); + try (ObjectOutputStream 
objectOutputStream = + new ObjectOutputStream(bos)) { + objectOutputStream.writeObject(t); + } + + byte[] bytes = bos.toByteArray(); + output.write(CALL.getByte()); + output.writeInt(bytes.length); + output.write(bytes); + output.flush(); + if (LOG.isTraceEnabled()) { + LOG.trace( + "pipesClientId={}: timer -- write tuple: {} ms", + pipesClientId, + System.currentTimeMillis() - start); + } + long readStart = System.currentTimeMillis(); + if (Thread.currentThread().isInterrupted()) { + throw new InterruptedException("thread interrupt"); + } + PipesResult result = readResults(t, start); + while (result.getStatus() + .equals(PipesResult.STATUS.INTERMEDIATE_RESULT)) { + intermediateResult[0] = result; + result = readResults(t, start); + } + if (LOG.isDebugEnabled()) { + long elapsed = System.currentTimeMillis() - readStart; + LOG.debug("finished reading result in {} ms", elapsed); + } + + if (LOG.isTraceEnabled()) { + LOG.trace( + "pipesClientId={}: timer -- read result: {} ms", + pipesClientId, + System.currentTimeMillis() - readStart); + } + if (result.getStatus() == PipesResult.STATUS.OOM) { + return buildFatalResult(result, intermediateResult); + } + return result; + }); try { if (closed) { - throw new IllegalArgumentException("pipesClientId=" + pipesClientId + - ": PipesClient closed"); + throw new IllegalArgumentException( + "pipesClientId=" + pipesClientId + ": PipesClient closed"); } executorService.execute(futureTask); return futureTask.get(pipesConfig.getTimeoutMillis(), TimeUnit.MILLISECONDS); @@ -207,23 +214,36 @@ private PipesResult actuallyProcess(FetchEmitTuple t) throws InterruptedExceptio long elapsed = System.currentTimeMillis() - start; pauseThenDestroy(); if (!process.isAlive() && TIMEOUT_EXIT_CODE == process.exitValue()) { - LOG.warn("pipesClientId={} server timeout: {} in {} ms", pipesClientId, t.getId(), + LOG.warn( + "pipesClientId={} server timeout: {} in {} ms", + pipesClientId, + t.getId(), elapsed); return 
buildFatalResult(PipesResult.TIMEOUT, intermediateResult); } process.waitFor(500, TimeUnit.MILLISECONDS); if (process.isAlive()) { - LOG.warn("pipesClientId={} crash: {} in {} ms with no exit code available", - pipesClientId, t.getId(), elapsed); + LOG.warn( + "pipesClientId={} crash: {} in {} ms with no exit code available", + pipesClientId, + t.getId(), + elapsed); } else { - LOG.warn("pipesClientId={} crash: {} in {} ms with exit code {}", pipesClientId, - t.getId(), elapsed, process.exitValue()); + LOG.warn( + "pipesClientId={} crash: {} in {} ms with exit code {}", + pipesClientId, + t.getId(), + elapsed, + process.exitValue()); } return buildFatalResult(PipesResult.UNSPECIFIED_CRASH, intermediateResult); } catch (TimeoutException e) { long elapsed = System.currentTimeMillis() - start; destroyForcibly(); - LOG.warn("pipesClientId={} client timeout: {} in {} ms", pipesClientId, t.getId(), + LOG.warn( + "pipesClientId={} client timeout: {} in {} ms", + pipesClientId, + t.getId(), elapsed); return buildFatalResult(PipesResult.TIMEOUT, intermediateResult); } finally { @@ -231,8 +251,7 @@ private PipesResult actuallyProcess(FetchEmitTuple t) throws InterruptedExceptio } } - private PipesResult buildFatalResult(PipesResult result, - PipesResult[] intermediateResult) { + private PipesResult buildFatalResult(PipesResult result, PipesResult[] intermediateResult) { if (intermediateResult[0] == null) { return result; @@ -240,16 +259,18 @@ private PipesResult buildFatalResult(PipesResult result, if (LOG.isTraceEnabled()) { LOG.trace("intermediate result: {}", intermediateResult[0].getEmitData()); } - intermediateResult[0].getEmitData().getMetadataList().get(0).set( - TikaCoreProperties.PIPES_RESULT, result.getStatus().toString()); - return new PipesResult(result.getStatus(), - intermediateResult[0].getEmitData(), true); + intermediateResult[0] + .getEmitData() + .getMetadataList() + .get(0) + .set(TikaCoreProperties.PIPES_RESULT, result.getStatus().toString()); + return 
new PipesResult(result.getStatus(), intermediateResult[0].getEmitData(), true); } } private void pauseThenDestroy() throws InterruptedException { - //wait just a little bit to let process end to get exit value - //if there's a timeout on the server side + // wait just a little bit to let process end to get exit value + // if there's a timeout on the server side try { process.waitFor(200, TimeUnit.MILLISECONDS); } finally { @@ -260,19 +281,19 @@ private void pauseThenDestroy() throws InterruptedException { private void destroyForcibly() throws InterruptedException { process.destroyForcibly(); process.waitFor(WAIT_ON_DESTROY_MS, TimeUnit.MILLISECONDS); - //important to close streams so that threads running in this - //process receive notice that they really ought to stop. - //TIKA-3588 showed that we can't trust that forcibly destroying - //the process caused the actuallyProcess thread in this process to stop. + // important to close streams so that threads running in this + // process receive notice that they really ought to stop. + // TIKA-3588 showed that we can't trust that forcibly destroying + // the process caused the actuallyProcess thread in this process to stop. 
try { input.close(); } catch (IOException closeException) { - //swallow + // swallow } try { output.close(); } catch (IOException closeException) { - //swallow + // swallow } if (process.isAlive()) { LOG.error("Process still alive after {}ms", WAIT_ON_DESTROY_MS); @@ -289,7 +310,7 @@ private PipesResult readResults(FetchEmitTuple t, long start) throws IOException } catch (IllegalArgumentException e) { String byteString = "-1"; if (statusByte > -1) { - byteString = String.format(Locale.US, "%02x", (byte)statusByte); + byteString = String.format(Locale.US, "%02x", (byte) statusByte); } throw new IOException("problem reading response from server: " + byteString, e); } @@ -299,49 +320,76 @@ private PipesResult readResults(FetchEmitTuple t, long start) throws IOException LOG.warn("pipesClientId={} oom: {} in {} ms", pipesClientId, t.getId(), millis); return PipesResult.OOM; case TIMEOUT: - LOG.warn("pipesClientId={} server response timeout: {} in {} ms", pipesClientId, - t.getId(), millis); + LOG.warn( + "pipesClientId={} server response timeout: {} in {} ms", + pipesClientId, + t.getId(), + millis); return PipesResult.TIMEOUT; case EMIT_EXCEPTION: - LOG.warn("pipesClientId={} emit exception: {} in {} ms", pipesClientId, t.getId(), + LOG.warn( + "pipesClientId={} emit exception: {} in {} ms", + pipesClientId, + t.getId(), millis); return readMessage(PipesResult.STATUS.EMIT_EXCEPTION); case EMITTER_NOT_FOUND: - LOG.warn("pipesClientId={} emitter not found: {} in {} ms", pipesClientId, - t.getId(), millis); + LOG.warn( + "pipesClientId={} emitter not found: {} in {} ms", + pipesClientId, + t.getId(), + millis); return readMessage(PipesResult.STATUS.NO_EMITTER_FOUND); case FETCHER_NOT_FOUND: - LOG.warn("pipesClientId={} fetcher not found: {} in {} ms", pipesClientId, - t.getId(), millis); + LOG.warn( + "pipesClientId={} fetcher not found: {} in {} ms", + pipesClientId, + t.getId(), + millis); return readMessage(PipesResult.STATUS.NO_FETCHER_FOUND); case 
FETCHER_INITIALIZATION_EXCEPTION: - LOG.warn("pipesClientId={} fetcher initialization exception: {} in {} ms", - pipesClientId, t.getId(), millis); + LOG.warn( + "pipesClientId={} fetcher initialization exception: {} in {} ms", + pipesClientId, + t.getId(), + millis); return readMessage(PipesResult.STATUS.FETCHER_INITIALIZATION_EXCEPTION); case FETCH_EXCEPTION: - LOG.warn("pipesClientId={} fetch exception: {} in {} ms", pipesClientId, t.getId(), + LOG.warn( + "pipesClientId={} fetch exception: {} in {} ms", + pipesClientId, + t.getId(), millis); return readMessage(PipesResult.STATUS.FETCH_EXCEPTION); case INTERMEDIATE_RESULT: - LOG.debug("pipesClientId={} intermediate success: {} in {} ms", pipesClientId, - t.getId(), millis); + LOG.debug( + "pipesClientId={} intermediate success: {} in {} ms", + pipesClientId, + t.getId(), + millis); return deserializeIntermediateResult(t.getEmitKey()); case PARSE_SUCCESS: - //there may have been a parse exception, but the parse didn't crash - LOG.debug("pipesClientId={} parse success: {} in {} ms", pipesClientId, t.getId(), + // there may have been a parse exception, but the parse didn't crash + LOG.debug( + "pipesClientId={} parse success: {} in {} ms", + pipesClientId, + t.getId(), millis); return deserializeEmitData(); case PARSE_EXCEPTION_NO_EMIT: return readMessage(PipesResult.STATUS.PARSE_EXCEPTION_NO_EMIT); case EMIT_SUCCESS: - LOG.debug("pipesClientId={} emit success: {} in {} ms", pipesClientId, t.getId(), + LOG.debug( + "pipesClientId={} emit success: {} in {} ms", + pipesClientId, + t.getId(), millis); return PipesResult.EMIT_SUCCESS; case EMIT_SUCCESS_PARSE_EXCEPTION: return readMessage(PipesResult.STATUS.EMIT_SUCCESS_PARSE_EXCEPTION); case EMPTY_OUTPUT: return PipesResult.EMPTY_OUTPUT; - //fall through + // fall through case READY: case CALL: case PING: @@ -350,11 +398,10 @@ private PipesResult readResults(FetchEmitTuple t, long start) throws IOException default: throw new IOException("Need to handle procesing for: " 
+ status); } - } private PipesResult readMessage(PipesResult.STATUS status) throws IOException { - //readInt checks for EOF + // readInt checks for EOF int length = input.readInt(); byte[] bytes = new byte[length]; input.readFully(bytes); @@ -366,8 +413,8 @@ private PipesResult deserializeEmitData() throws IOException { int length = input.readInt(); byte[] bytes = new byte[length]; input.readFully(bytes); - try (ObjectInputStream objectInputStream = new ObjectInputStream( - new UnsynchronizedByteArrayInputStream(bytes))) { + try (ObjectInputStream objectInputStream = + new ObjectInputStream(new UnsynchronizedByteArrayInputStream(bytes))) { EmitData emitData = (EmitData) objectInputStream.readObject(); String stack = emitData.getContainerStackTrace(); @@ -378,7 +425,7 @@ private PipesResult deserializeEmitData() throws IOException { } } catch (ClassNotFoundException e) { LOG.error("class not found exception deserializing data", e); - //this should be catastrophic + // this should be catastrophic throw new RuntimeException(e); } } @@ -388,14 +435,14 @@ private PipesResult deserializeIntermediateResult(EmitKey emitKey) throws IOExce int length = input.readInt(); byte[] bytes = new byte[length]; input.readFully(bytes); - try (ObjectInputStream objectInputStream = new ObjectInputStream( - new UnsynchronizedByteArrayInputStream(bytes))) { + try (ObjectInputStream objectInputStream = + new ObjectInputStream(new UnsynchronizedByteArrayInputStream(bytes))) { Metadata metadata = (Metadata) objectInputStream.readObject(); EmitData emitData = new EmitData(emitKey, Collections.singletonList(metadata)); return new PipesResult(PipesResult.STATUS.INTERMEDIATE_RESULT, emitData, true); } catch (ClassNotFoundException e) { LOG.error("class not found exception deserializing data", e); - //this should be catastrophic + // this should be catastrophic throw new RuntimeException(e); } } @@ -405,18 +452,18 @@ private void restart() throws IOException, InterruptedException, TimeoutExceptio 
LOG.debug("process still alive; trying to destroy it"); destroyForcibly(); boolean processEnded = process.waitFor(30, TimeUnit.SECONDS); - if (! processEnded) { + if (!processEnded) { LOG.warn("pipesClientId={}: process has not yet ended", pipesClientId); } executorService.shutdownNow(); boolean shutdown = executorService.awaitTermination(30, TimeUnit.SECONDS); - if (! shutdown) { + if (!shutdown) { LOG.warn("pipesClientId={}: executorService has not yet shutdown", pipesClientId); } synchronized (executorServiceLock) { if (closed) { - throw new IllegalArgumentException("pipesClientId=" + pipesClientId + - ": PipesClient closed"); + throw new IllegalArgumentException( + "pipesClientId=" + pipesClientId + ": PipesClient closed"); } executorService = Executors.newFixedThreadPool(1); } @@ -430,41 +477,57 @@ private void restart() throws IOException, InterruptedException, TimeoutExceptio try { process = pb.start(); } catch (Exception e) { - //Do we ever want this to be not fatal?! + // Do we ever want this to be not fatal?! 
LOG.error("failed to start client", e); throw new FailedToStartClientException(e); } input = new DataInputStream(process.getInputStream()); output = new DataOutputStream(process.getOutputStream()); - //wait for ready signal - final UnsynchronizedByteArrayOutputStream bos = UnsynchronizedByteArrayOutputStream.builder().get(); - FutureTask futureTask = new FutureTask<>(() -> { - int b = input.read(); - int read = 1; - while (read < MAX_BYTES_BEFORE_READY && b != READY.getByte()) { - - if (b == -1) { - throw new RuntimeException(getMsg("pipesClientId=" + pipesClientId + ": " + - "Couldn't start server -- read EOF before 'ready' byte.\n" + - " process isAlive=" + process.isAlive(), bos)); - } - bos.write(b); - b = input.read(); - read++; - } - if (read >= MAX_BYTES_BEFORE_READY) { - throw new RuntimeException(getMsg("pipesClientId=" + pipesClientId + ": " + - "Couldn't start server: read too many bytes before 'ready' byte.\n" + - " Make absolutely certain that your logger is not writing to " + - "stdout.\n", bos)); - } - if (bos.size() > 0) { - LOG.warn("pipesClientId={}: From forked process before start byte: {}", - pipesClientId, bos.toString(StandardCharsets.UTF_8)); - } - return 1; - }); + // wait for ready signal + final UnsynchronizedByteArrayOutputStream bos = + UnsynchronizedByteArrayOutputStream.builder().get(); + FutureTask futureTask = + new FutureTask<>( + () -> { + int b = input.read(); + int read = 1; + while (read < MAX_BYTES_BEFORE_READY && b != READY.getByte()) { + + if (b == -1) { + throw new RuntimeException( + getMsg( + "pipesClientId=" + + pipesClientId + + ": " + + "Couldn't start server -- read EOF before 'ready' byte.\n" + + " process isAlive=" + + process.isAlive(), + bos)); + } + bos.write(b); + b = input.read(); + read++; + } + if (read >= MAX_BYTES_BEFORE_READY) { + throw new RuntimeException( + getMsg( + "pipesClientId=" + + pipesClientId + + ": " + + "Couldn't start server: read too many bytes before 'ready' byte.\n" + + " Make absolutely 
certain that your logger is not writing to " + + "stdout.\n", + bos)); + } + if (bos.size() > 0) { + LOG.warn( + "pipesClientId={}: From forked process before start byte: {}", + pipesClientId, + bos.toString(StandardCharsets.UTF_8)); + } + return 1; + }); long start = System.currentTimeMillis(); executorService.submit(futureTask); try { @@ -478,10 +541,13 @@ private void restart() throws IOException, InterruptedException, TimeoutExceptio throw new RuntimeException(e); } catch (TimeoutException e) { long elapsed = System.currentTimeMillis() - start; - LOG.error("pipesClientId={} didn't receive ready byte from server within " + - "StartupTimeoutMillis {}; ms elapsed {}; did read >{}<", - pipesClientId, pipesConfig.getStartupTimeoutMillis(), - elapsed, bos.toString(StandardCharsets.UTF_8)); + LOG.error( + "pipesClientId={} didn't receive ready byte from server within " + + "StartupTimeoutMillis {}; ms elapsed {}; did read >{}<", + pipesClientId, + pipesConfig.getStartupTimeoutMillis(), + elapsed, + bos.toString(StandardCharsets.UTF_8)); destroyForcibly(); throw e; } finally { @@ -513,8 +579,8 @@ private String[] getCommandline() { if (arg.equals("-cp") || arg.equals("--classpath")) { hasClassPath = true; } - if (arg.equals("-XX:+ExitOnOutOfMemoryError") || - arg.equals("-XX:+CrashOnOutOfMemoryError")) { + if (arg.equals("-XX:+ExitOnOutOfMemoryError") + || arg.equals("-XX:+CrashOnOutOfMemoryError")) { hasExitOnOOM = true; } if (arg.startsWith("-Dlog4j.configuration")) { @@ -543,9 +609,9 @@ private String[] getCommandline() { } if (hasExitOnOOM) { LOG.warn( - "I notice that you have an exit/crash on OOM. If you run heavy external processes " + - "like tesseract, this setting may result in orphaned processes which could be disastrous" + - " for performance."); + "I notice that you have an exit/crash on OOM. 
If you run heavy external processes " + + "like tesseract, this setting may result in orphaned processes which could be disastrous" + + " for performance."); } if (!hasLog4j) { commandLine.add( @@ -554,8 +620,9 @@ private String[] getCommandline() { commandLine.add("-DpipesClientId=" + pipesClientId); commandLine.addAll(configArgs); commandLine.add("org.apache.tika.pipes.PipesServer"); - commandLine.add(ProcessUtils.escapeCommandLine( - pipesConfig.getTikaConfig().toAbsolutePath().toString())); + commandLine.add( + ProcessUtils.escapeCommandLine( + pipesConfig.getTikaConfig().toAbsolutePath().toString())); commandLine.add(Long.toString(pipesConfig.getMaxForEmitBatchBytes())); commandLine.add(Long.toString(pipesConfig.getTimeoutMillis())); diff --git a/tika-core/src/main/java/org/apache/tika/pipes/PipesConfig.java b/tika-core/src/main/java/org/apache/tika/pipes/PipesConfig.java index 06783d67c1..4bfcbedb09 100644 --- a/tika-core/src/main/java/org/apache/tika/pipes/PipesConfig.java +++ b/tika-core/src/main/java/org/apache/tika/pipes/PipesConfig.java @@ -21,12 +21,10 @@ import java.nio.file.Files; import java.nio.file.Path; import java.util.Set; - +import org.apache.tika.exception.TikaConfigException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.apache.tika.exception.TikaConfigException; - public class PipesConfig extends PipesConfigBase { private static final Logger LOG = LoggerFactory.getLogger(PipesClient.class); @@ -39,16 +37,16 @@ public static PipesConfig load(Path tikaConfig) throws IOException, TikaConfigEx Set settings = pipesConfig.configure("pipes", is); } if (pipesConfig.getTikaConfig() == null) { - LOG.debug("A separate tikaConfig was not specified in the element in the " + - "config file; will use {} for pipes", tikaConfig); + LOG.debug( + "A separate tikaConfig was not specified in the element in the " + + "config file; will use {} for pipes", + tikaConfig); pipesConfig.setTikaConfig(tikaConfig); } return pipesConfig; } - private 
PipesConfig() { - - } + private PipesConfig() {} public long getMaxWaitForClientMillis() { return maxWaitForClientMillis; diff --git a/tika-core/src/main/java/org/apache/tika/pipes/PipesConfigBase.java b/tika-core/src/main/java/org/apache/tika/pipes/PipesConfigBase.java index bf6a6bb696..2e19c64069 100644 --- a/tika-core/src/main/java/org/apache/tika/pipes/PipesConfigBase.java +++ b/tika-core/src/main/java/org/apache/tika/pipes/PipesConfigBase.java @@ -21,15 +21,13 @@ import java.util.ArrayList; import java.util.Collections; import java.util.List; - import org.apache.tika.config.ConfigBase; public class PipesConfigBase extends ConfigBase { /** - * default size to send back to the PipesClient for batch - * emitting. If an extract is larger than this, it will be emitted - * directly from the forked PipesServer. + * default size to send back to the PipesClient for batch emitting. If an extract is larger than + * this, it will be emitted directly from the forked PipesServer. */ public static final long DEFAULT_MAX_FOR_EMIT_BATCH = 100000; @@ -43,8 +41,8 @@ public class PipesConfigBase extends ConfigBase { public static final int DEFAULT_MAX_FILES_PROCESSED_PER_PROCESS = 10000; - //if an extract is larger than this, the forked PipesServer should - //emit the extract directly and not send the contents back to the PipesClient + // if an extract is larger than this, the forked PipesServer should + // emit the extract directly and not send the contents back to the PipesClient private long maxForEmitBatchBytes = DEFAULT_MAX_FOR_EMIT_BATCH; private long timeoutMillis = DEFAULT_TIMEOUT_MILLIS; private long startupTimeoutMillis = DEFAULT_STARTUP_TIMEOUT_MILLIS; @@ -65,6 +63,7 @@ public long getTimeoutMillis() { /** * How long to wait in milliseconds before timing out the forked process. 
+ * * @param timeoutMillis */ public void setTimeoutMillis(long timeoutMillis) { @@ -76,8 +75,7 @@ public long getShutdownClientAfterMillis() { } /** - * If the client has been inactive after this many milliseconds, - * shut it down. + * If the client has been inactive after this many milliseconds, shut it down. * * @param shutdownClientAfterMillis */ @@ -94,7 +92,7 @@ public void setNumClients(int numClients) { } public List getForkedJvmArgs() { - //defensive copy + // defensive copy List ret = new ArrayList<>(); ret.addAll(forkedJvmArgs); return ret; @@ -109,8 +107,9 @@ public void setForkedJvmArgs(List jvmArgs) { } /** - * Restart the forked PipesServer after it has processed this many files to avoid - * slow-building memory leaks. + * Restart the forked PipesServer after it has processed this many files to avoid slow-building + * memory leaks. + * * @return */ public int getMaxFilesProcessedPerProcess() { @@ -146,13 +145,12 @@ public long getStartupTimeoutMillis() { } /** - * What is the maximum bytes size per extract that - * will be allowed to be shipped back to the emit queue in the forking process. - * If an extract is too big, skip the emit queue and forward it directly from the - * forked PipesServer. - * If set to 0, this will never send an extract back for batch emitting, - * but will always emit the extract directly from the forked PipeServer. - * If set to -1, this will always send the extract back for batch emitting. + * What is the maximum bytes size per extract that will be allowed to be shipped back to the + * emit queue in the forking process. If an extract is too big, skip the emit queue and forward + * it directly from the forked PipesServer. If set to 0, this will never send an + * extract back for batch emitting, but will always emit the extract directly from the forked + * PipeServer. If set to -1, this will always send the extract back for batch + * emitting. 
* * @return the threshold extract size at which to emit directly from the forked PipeServer */ diff --git a/tika-core/src/main/java/org/apache/tika/pipes/PipesException.java b/tika-core/src/main/java/org/apache/tika/pipes/PipesException.java index ee9545f0aa..e6066f5c72 100644 --- a/tika-core/src/main/java/org/apache/tika/pipes/PipesException.java +++ b/tika-core/src/main/java/org/apache/tika/pipes/PipesException.java @@ -16,9 +16,7 @@ */ package org.apache.tika.pipes; -/** - * Fatal exception that means that something went seriously wrong. - */ +/** Fatal exception that means that something went seriously wrong. */ public class PipesException extends Exception { public PipesException(Throwable t) { diff --git a/tika-core/src/main/java/org/apache/tika/pipes/PipesParser.java b/tika-core/src/main/java/org/apache/tika/pipes/PipesParser.java index 8446983aa4..72bfe21c39 100644 --- a/tika-core/src/main/java/org/apache/tika/pipes/PipesParser.java +++ b/tika-core/src/main/java/org/apache/tika/pipes/PipesParser.java @@ -25,11 +25,9 @@ public class PipesParser implements Closeable { - private final PipesConfig pipesConfig; private final List clients = new ArrayList<>(); - private final ArrayBlockingQueue clientQueue ; - + private final ArrayBlockingQueue clientQueue; public PipesParser(PipesConfig pipesConfig) { this.pipesConfig = pipesConfig; @@ -41,12 +39,13 @@ public PipesParser(PipesConfig pipesConfig) { } } - public PipesResult parse(FetchEmitTuple t) throws InterruptedException, - PipesException, IOException { + public PipesResult parse(FetchEmitTuple t) + throws InterruptedException, PipesException, IOException { PipesClient client = null; try { - client = clientQueue.poll(pipesConfig.getMaxWaitForClientMillis(), - TimeUnit.MILLISECONDS); + client = + clientQueue.poll( + pipesConfig.getMaxWaitForClientMillis(), TimeUnit.MILLISECONDS); if (client == null) { return PipesResult.CLIENT_UNAVAILABLE_WITHIN_MS; } diff --git 
a/tika-core/src/main/java/org/apache/tika/pipes/PipesReporter.java b/tika-core/src/main/java/org/apache/tika/pipes/PipesReporter.java index 3978039b40..69f63f20ef 100644 --- a/tika-core/src/main/java/org/apache/tika/pipes/PipesReporter.java +++ b/tika-core/src/main/java/org/apache/tika/pipes/PipesReporter.java @@ -18,81 +18,76 @@ import java.io.Closeable; import java.io.IOException; - import org.apache.tika.pipes.pipesiterator.TotalCountResult; /** - * This is called asynchronously by the AsyncProcessor. This - * is not thread safe, and implementers must be careful to implement - * {@link #report(FetchEmitTuple, PipesResult, long)} in a thread safe + * This is called asynchronously by the AsyncProcessor. This is not thread safe, and implementers + * must be careful to implement {@link #report(FetchEmitTuple, PipesResult, long)} in a thread safe * way. - *

- * Note, however, that this is not called in the forked processes. - * Implementers do not have to worry about synchronizing across processes; - * for example, one could use an in-memory h2 database as a target. + * + *

Note, however, that this is not called in the forked processes. Implementers do not have to + * worry about synchronizing across processes; for example, one could use an in-memory h2 database + * as a target. */ public abstract class PipesReporter implements Closeable { - public static final PipesReporter NO_OP_REPORTER = new PipesReporter() { - - @Override - public void report(FetchEmitTuple t, PipesResult result, long elapsed) { - - } + public static final PipesReporter NO_OP_REPORTER = + new PipesReporter() { - @Override - public void error(Throwable t) { + @Override + public void report(FetchEmitTuple t, PipesResult result, long elapsed) {} - } + @Override + public void error(Throwable t) {} - @Override - public void error(String msg) { + @Override + public void error(String msg) {} + }; - } - }; - - //Implementers are responsible for preventing reporting after - //crashes if that is the desired behavior. + // Implementers are responsible for preventing reporting after + // crashes if that is the desired behavior. public abstract void report(FetchEmitTuple t, PipesResult result, long elapsed); - /** - * No-op implementation. Override for custom behavior - * and make sure to override {@link #supportsTotalCount()} - * to return true + * No-op implementation. Override for custom behavior and make sure to override {@link + * #supportsTotalCount()} to return true + * * @param totalCountResult */ - public void report(TotalCountResult totalCountResult) { - - } + public void report(TotalCountResult totalCountResult) {} /** * Override this if your reporter supports total count. + * * @return false as the baseline implementation */ public boolean supportsTotalCount() { return false; } + /** - * No-op implementation. Override for custom behavior + * No-op implementation. Override for custom behavior + * * @throws IOException */ @Override public void close() throws IOException { - //no-op + // no-op } /** - * This is called if the process has crashed. 
- * Implementers should not rely on close() to be called after this. + * This is called if the process has crashed. Implementers should not rely on close() to be + * called after this. + * * @param t */ public abstract void error(Throwable t); + /** - * This is called if the process has crashed. - * Implementers should not rely on close() to be called after this. + * This is called if the process has crashed. Implementers should not rely on close() to be + * called after this. + * * @param msg */ public abstract void error(String msg); - } diff --git a/tika-core/src/main/java/org/apache/tika/pipes/PipesReporterBase.java b/tika-core/src/main/java/org/apache/tika/pipes/PipesReporterBase.java index 3dcddfa71e..8a52060f90 100644 --- a/tika-core/src/main/java/org/apache/tika/pipes/PipesReporterBase.java +++ b/tika-core/src/main/java/org/apache/tika/pipes/PipesReporterBase.java @@ -20,16 +20,13 @@ import java.util.List; import java.util.Map; import java.util.Set; - import org.apache.tika.config.Field; import org.apache.tika.config.Initializable; import org.apache.tika.config.InitializableProblemHandler; import org.apache.tika.config.Param; import org.apache.tika.exception.TikaConfigException; -/** - * Base class that includes filtering by {@link PipesResult.STATUS} - */ +/** Base class that includes filtering by {@link PipesResult.STATUS} */ public abstract class PipesReporterBase extends PipesReporter implements Initializable { private final Set includes = new HashSet<>(); @@ -42,11 +39,12 @@ public void initialize(Map params) throws TikaConfigException { statusFilter = buildStatusFilter(includes, excludes); } - private StatusFilter buildStatusFilter(Set includes, - Set excludes) throws TikaConfigException { + private StatusFilter buildStatusFilter( + Set includes, Set excludes) + throws TikaConfigException { if (includes.size() > 0 && excludes.size() > 0) { - throw new TikaConfigException("Only one of includes and excludes may have any " + - "contents"); + throw new 
TikaConfigException( + "Only one of includes and excludes may have any " + "contents"); } if (includes.size() > 0) { return new IncludesFilter(includes); @@ -58,12 +56,11 @@ private StatusFilter buildStatusFilter(Set includes, @Override public void checkInitialization(InitializableProblemHandler problemHandler) - throws TikaConfigException { - - } + throws TikaConfigException {} /** * Implementations must call this for the includes/excludes filters to work! + * * @param status * @return */ @@ -150,6 +147,4 @@ boolean accept(PipesResult.STATUS status) { return true; } } - - } diff --git a/tika-core/src/main/java/org/apache/tika/pipes/PipesResult.java b/tika-core/src/main/java/org/apache/tika/pipes/PipesResult.java index 639bfc4378..29fe3ed67b 100644 --- a/tika-core/src/main/java/org/apache/tika/pipes/PipesResult.java +++ b/tika-core/src/main/java/org/apache/tika/pipes/PipesResult.java @@ -27,14 +27,19 @@ public enum STATUS { FETCHER_INITIALIZATION_EXCEPTION, FETCH_EXCEPTION, EMPTY_OUTPUT, - PARSE_EXCEPTION_NO_EMIT, //within the pipes server - PARSE_EXCEPTION_EMIT, //within the pipes server - PARSE_SUCCESS, //when passed back to the async processor for emit - PARSE_SUCCESS_WITH_EXCEPTION,//when passed back to the async processor for emit - OOM, TIMEOUT, UNSPECIFIED_CRASH, + PARSE_EXCEPTION_NO_EMIT, // within the pipes server + PARSE_EXCEPTION_EMIT, // within the pipes server + PARSE_SUCCESS, // when passed back to the async processor for emit + PARSE_SUCCESS_WITH_EXCEPTION, // when passed back to the async processor for emit + OOM, + TIMEOUT, + UNSPECIFIED_CRASH, NO_EMITTER_FOUND, - EMIT_SUCCESS, EMIT_SUCCESS_PARSE_EXCEPTION, EMIT_EXCEPTION, - INTERRUPTED_EXCEPTION, NO_FETCHER_FOUND, + EMIT_SUCCESS, + EMIT_SUCCESS_PARSE_EXCEPTION, + EMIT_EXCEPTION, + INTERRUPTED_EXCEPTION, + NO_FETCHER_FOUND, INTERMEDIATE_RESULT; } @@ -44,9 +49,9 @@ public enum STATUS { public static final PipesResult OOM = new PipesResult(STATUS.OOM); public static final PipesResult 
UNSPECIFIED_CRASH = new PipesResult(STATUS.UNSPECIFIED_CRASH); public static final PipesResult EMIT_SUCCESS = new PipesResult(STATUS.EMIT_SUCCESS); - public static final PipesResult INTERRUPTED_EXCEPTION = new PipesResult(STATUS.INTERRUPTED_EXCEPTION); - public static final PipesResult EMPTY_OUTPUT = - new PipesResult(STATUS.EMPTY_OUTPUT); + public static final PipesResult INTERRUPTED_EXCEPTION = + new PipesResult(STATUS.INTERRUPTED_EXCEPTION); + public static final PipesResult EMPTY_OUTPUT = new PipesResult(STATUS.EMPTY_OUTPUT); private final STATUS status; private final EmitData emitData; private final String message; @@ -80,8 +85,7 @@ public PipesResult(STATUS status, EmitData emitData, boolean intermediate) { } /** - * This assumes that the message is a stack trace (container - * parse exception). + * This assumes that the message is a stack trace (container parse exception). * * @param emitData * @param message @@ -108,7 +112,16 @@ public boolean isIntermediate() { @Override public String toString() { - return "PipesResult{" + "intermediate=" + intermediate + ", status=" + status + - ", emitData=" + emitData + ", message='" + message + '\'' + '}'; + return "PipesResult{" + + "intermediate=" + + intermediate + + ", status=" + + status + + ", emitData=" + + emitData + + ", message='" + + message + + '\'' + + '}'; } } diff --git a/tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java b/tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java index 991694f889..066cdfb118 100644 --- a/tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java +++ b/tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java @@ -31,15 +31,9 @@ import java.util.Collections; import java.util.List; import java.util.Optional; - import org.apache.commons.io.IOUtils; import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream; import org.apache.commons.io.output.UnsynchronizedByteArrayOutputStream; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; 
-import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.config.TikaConfig; import org.apache.tika.detect.Detector; import org.apache.tika.exception.EncryptedDocumentException; @@ -79,30 +73,46 @@ import org.apache.tika.sax.RecursiveParserWrapperHandler; import org.apache.tika.utils.ExceptionUtils; import org.apache.tika.utils.StringUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; /** - * This server is forked from the PipesClient. This class isolates - * parsing from the client to protect the primary JVM. - *

- * When configuring logging for this class, make absolutely certain - * not to write to STDOUT. This class uses STDOUT to communicate with - * the PipesClient. + * This server is forked from the PipesClient. This class isolates parsing from the client to + * protect the primary JVM. + * + *

When configuring logging for this class, make absolutely certain not to write to STDOUT. This + * class uses STDOUT to communicate with the PipesClient. */ public class PipesServer implements Runnable { private static final Logger LOG = LoggerFactory.getLogger(PipesServer.class); - //this has to be some number not close to 0-3 - //it looks like the server crashes with exit value 3 on OOM, for example + // this has to be some number not close to 0-3 + // it looks like the server crashes with exit value 3 on OOM, for example public static final int TIMEOUT_EXIT_CODE = 17; private DigestingParser.Digester digester; private Detector detector; public enum STATUS { - READY, CALL, PING, FAILED_TO_START, FETCHER_NOT_FOUND, EMITTER_NOT_FOUND, - FETCHER_INITIALIZATION_EXCEPTION, FETCH_EXCEPTION, PARSE_SUCCESS, PARSE_EXCEPTION_NO_EMIT, - EMIT_SUCCESS, EMIT_SUCCESS_PARSE_EXCEPTION, EMIT_EXCEPTION, OOM, TIMEOUT, EMPTY_OUTPUT, + READY, + CALL, + PING, + FAILED_TO_START, + FETCHER_NOT_FOUND, + EMITTER_NOT_FOUND, + FETCHER_INITIALIZATION_EXCEPTION, + FETCH_EXCEPTION, + PARSE_SUCCESS, + PARSE_EXCEPTION_NO_EMIT, + EMIT_SUCCESS, + EMIT_SUCCESS_PARSE_EXCEPTION, + EMIT_EXCEPTION, + OOM, + TIMEOUT, + EMPTY_OUTPUT, INTERMEDIATE_RESULT; byte getByte() { @@ -129,9 +139,9 @@ public static STATUS lookup(int val) { private final Path tikaConfigPath; private final DataInputStream input; private final DataOutputStream output; - //if an extract is larger than this value, emit it directly; - //if it is smaller than this value, write it back to the - //PipesClient so that it can cache the extracts and then batch emit. + // if an extract is larger than this value, emit it directly; + // if it is smaller than this value, write it back to the + // PipesClient so that it can cache the extracts and then batch emit. 
private final long maxForEmitBatchBytes; private final long serverParseTimeoutMillis; private final long serverWaitTimeoutMillis; @@ -143,10 +153,13 @@ public static STATUS lookup(int val) { private volatile boolean parsing; private volatile long since; - - public PipesServer(Path tikaConfigPath, InputStream in, PrintStream out, - long maxForEmitBatchBytes, long serverParseTimeoutMillis, - long serverWaitTimeoutMillis) + public PipesServer( + Path tikaConfigPath, + InputStream in, + PrintStream out, + long maxForEmitBatchBytes, + long serverParseTimeoutMillis, + long serverWaitTimeoutMillis) throws IOException, TikaException, SAXException { this.tikaConfigPath = tikaConfigPath; this.input = new DataInputStream(in); @@ -158,7 +171,6 @@ public PipesServer(Path tikaConfigPath, InputStream in, PrintStream out, this.since = System.currentTimeMillis(); } - public static void main(String[] args) throws Exception { try { Path tikaConfig = Paths.get(args[0]); @@ -167,8 +179,13 @@ public static void main(String[] args) throws Exception { long serverWaitTimeoutMillis = Long.parseLong(args[3]); PipesServer server = - new PipesServer(tikaConfig, System.in, System.out, maxForEmitBatchBytes, - serverParseTimeoutMillis, serverWaitTimeoutMillis); + new PipesServer( + tikaConfig, + System.in, + System.out, + maxForEmitBatchBytes, + serverParseTimeoutMillis, + serverWaitTimeoutMillis); System.setIn(new UnsynchronizedByteArrayInputStream(new byte[0])); System.setOut(System.err); Thread watchdog = new Thread(server, "Tika Watchdog"); @@ -188,11 +205,14 @@ public void run() { synchronized (lock) { long elapsed = System.currentTimeMillis() - since; if (parsing && elapsed > serverParseTimeoutMillis) { - LOG.warn("timeout server; elapsed {} with {}", elapsed, + LOG.warn( + "timeout server; elapsed {} with {}", + elapsed, serverParseTimeoutMillis); exit(TIMEOUT_EXIT_CODE); - } else if (!parsing && serverWaitTimeoutMillis > 0 && - elapsed > serverWaitTimeoutMillis) { + } else if (!parsing + 
&& serverWaitTimeoutMillis > 0 + && elapsed > serverWaitTimeoutMillis) { LOG.info("closing down from inactivity"); exit(0); } @@ -206,12 +226,13 @@ public void run() { public void processRequests() { LOG.debug("processing requests {}"); - //initialize + // initialize try { long start = System.currentTimeMillis(); initializeResources(); if (LOG.isTraceEnabled()) { - LOG.trace("timer -- initialize parser and other resources: {} ms", + LOG.trace( + "timer -- initialize parser and other resources: {} ms", System.currentTimeMillis() - start); } LOG.debug("pipes server initialized"); @@ -225,7 +246,7 @@ public void processRequests() { } return; } - //main loop + // main loop try { write(STATUS.READY); long start = System.currentTimeMillis(); @@ -263,8 +284,8 @@ private boolean metadataIsEmpty(List metadataList) { } /** - * returns stack trace if there was a container exception or empty string - * if there was no stacktrace + * returns stack trace if there was a container exception or empty string if there was no + * stacktrace * * @param t * @param metadataList @@ -278,10 +299,12 @@ private String getContainerStacktrace(FetchEmitTuple t, List metadataL return (stack != null) ? 
stack : StringUtils.EMPTY; } - - private void emit(String taskId, EmitKey emitKey, - boolean isExtractEmbeddedBytes, MetadataListAndEmbeddedBytes parseData, - String parseExceptionStack) { + private void emit( + String taskId, + EmitKey emitKey, + boolean isExtractEmbeddedBytes, + MetadataListAndEmbeddedBytes parseData, + String parseExceptionStack) { Emitter emitter = null; try { @@ -293,8 +316,7 @@ private void emit(String taskId, EmitKey emitKey, return; } try { - if (isExtractEmbeddedBytes && - parseData.toBePackagedForStreamEmitter()) { + if (isExtractEmbeddedBytes && parseData.toBePackagedForStreamEmitter()) { emitContentsAndBytes(emitter, emitKey, parseData); } else { emitter.emit(emitKey.getEmitKey(), parseData.getMetadataList()); @@ -303,25 +325,28 @@ private void emit(String taskId, EmitKey emitKey, LOG.warn("emit exception", e); String msg = ExceptionUtils.getStackTrace(e); byte[] bytes = msg.getBytes(StandardCharsets.UTF_8); - //for now, we're hiding the parse exception if there was also an emit exception + // for now, we're hiding the parse exception if there was also an emit exception write(STATUS.EMIT_EXCEPTION, bytes); return; } if (StringUtils.isBlank(parseExceptionStack)) { write(STATUS.EMIT_SUCCESS); } else { - write(STATUS.EMIT_SUCCESS_PARSE_EXCEPTION, + write( + STATUS.EMIT_SUCCESS_PARSE_EXCEPTION, parseExceptionStack.getBytes(StandardCharsets.UTF_8)); } } - private void emitContentsAndBytes(Emitter emitter, EmitKey emitKey, - MetadataListAndEmbeddedBytes parseData) { + private void emitContentsAndBytes( + Emitter emitter, EmitKey emitKey, MetadataListAndEmbeddedBytes parseData) { if (!(emitter instanceof StreamEmitter)) { - throw new IllegalArgumentException("The emitter for embedded document byte store must" + - " be a StreamEmitter. I see: " + emitter.getClass()); + throw new IllegalArgumentException( + "The emitter for embedded document byte store must" + + " be a StreamEmitter. 
I see: " + + emitter.getClass()); } - //TODO: implement this + // TODO: implement this throw new UnsupportedOperationException("this is not yet implemented"); } @@ -335,8 +360,8 @@ private void parseOne() { long start = System.currentTimeMillis(); t = readFetchEmitTuple(); if (LOG.isTraceEnabled()) { - LOG.trace("timer -- read fetchEmitTuple: {} ms", - System.currentTimeMillis() - start); + LOG.trace( + "timer -- read fetchEmitTuple: {} ms", System.currentTimeMillis() - start); } start = System.currentTimeMillis(); actuallyParse(t); @@ -358,7 +383,7 @@ private void actuallyParse(FetchEmitTuple t) { long start = System.currentTimeMillis(); Fetcher fetcher = getFetcher(t); if (fetcher == null) { - //rely on proper logging/exception handling in getFetcher + // rely on proper logging/exception handling in getFetcher return; } @@ -371,7 +396,7 @@ private void actuallyParse(FetchEmitTuple t) { MetadataListAndEmbeddedBytes parseData = null; try { - //this can be null if there is a fetch exception + // this can be null if there is a fetch exception parseData = parseFromTuple(t, fetcher); if (LOG.isTraceEnabled()) { @@ -385,8 +410,9 @@ private void actuallyParse(FetchEmitTuple t) { emitParseData(t, parseData); } finally { - if (parseData != null && parseData.hasEmbeddedDocumentByteStore() && - parseData.getEmbeddedDocumentBytesHandler() instanceof Closeable) { + if (parseData != null + && parseData.hasEmbeddedDocumentByteStore() + && parseData.getEmbeddedDocumentBytesHandler() instanceof Closeable) { try { ((Closeable) parseData.getEmbeddedDocumentBytesHandler()).close(); } catch (IOException e) { @@ -399,10 +425,10 @@ private void actuallyParse(FetchEmitTuple t) { private void emitParseData(FetchEmitTuple t, MetadataListAndEmbeddedBytes parseData) { long start = System.currentTimeMillis(); String stack = getContainerStacktrace(t, parseData.getMetadataList()); - //we need to apply this after we pull out the stacktrace + // we need to apply this after we pull out the 
stacktrace filterMetadata(parseData.getMetadataList()); - if (StringUtils.isBlank(stack) || - t.getOnParseException() == FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT) { + if (StringUtils.isBlank(stack) + || t.getOnParseException() == FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT) { injectUserMetadata(t.getMetadata(), parseData.getMetadataList()); EmitKey emitKey = t.getEmitKey(); if (StringUtils.isBlank(emitKey.getEmitKey())) { @@ -410,16 +436,24 @@ private void emitParseData(FetchEmitTuple t, MetadataListAndEmbeddedBytes parseD t.setEmitKey(emitKey); } EmitData emitData = new EmitData(t.getEmitKey(), parseData.getMetadataList(), stack); - if (t.getEmbeddedDocumentBytesConfig().isExtractEmbeddedDocumentBytes() && - parseData.toBePackagedForStreamEmitter()) { - emit(t.getId(), emitKey, t.getEmbeddedDocumentBytesConfig().isExtractEmbeddedDocumentBytes(), - parseData, stack); - } else if (maxForEmitBatchBytes >= 0 && - emitData.getEstimatedSizeBytes() >= maxForEmitBatchBytes) { - emit(t.getId(), emitKey, t.getEmbeddedDocumentBytesConfig().isExtractEmbeddedDocumentBytes(), - parseData, stack); + if (t.getEmbeddedDocumentBytesConfig().isExtractEmbeddedDocumentBytes() + && parseData.toBePackagedForStreamEmitter()) { + emit( + t.getId(), + emitKey, + t.getEmbeddedDocumentBytesConfig().isExtractEmbeddedDocumentBytes(), + parseData, + stack); + } else if (maxForEmitBatchBytes >= 0 + && emitData.getEstimatedSizeBytes() >= maxForEmitBatchBytes) { + emit( + t.getId(), + emitKey, + t.getEmbeddedDocumentBytesConfig().isExtractEmbeddedDocumentBytes(), + parseData, + stack); } else { - //send back to the client + // send back to the client write(emitData); } if (LOG.isTraceEnabled()) { @@ -463,8 +497,13 @@ protected MetadataListAndEmbeddedBytes parseFromTuple(FetchEmitTuple t, Fetcher "fetch key has a range, but the fetcher is not a range fetcher"); } Metadata metadata = new Metadata(); - try (InputStream stream = ((RangeFetcher) fetcher).fetch(fetchKey.getFetchKey(), - 
fetchKey.getRangeStart(), fetchKey.getRangeEnd(), metadata)) { + try (InputStream stream = + ((RangeFetcher) fetcher) + .fetch( + fetchKey.getFetchKey(), + fetchKey.getRangeStart(), + fetchKey.getRangeEnd(), + metadata)) { return parseWithStream(t, stream, metadata); } catch (SecurityException e) { LOG.error("security exception " + t.getId(), e); @@ -518,87 +557,105 @@ private String getNoEmitterMsg(String emitterName) { return sb.toString(); } - private void handleOOM(String taskId, OutOfMemoryError oom) { write(STATUS.OOM); LOG.error("oom: " + taskId, oom); exit(1); } - private MetadataListAndEmbeddedBytes parseWithStream(FetchEmitTuple fetchEmitTuple, - InputStream stream, Metadata metadata) + private MetadataListAndEmbeddedBytes parseWithStream( + FetchEmitTuple fetchEmitTuple, InputStream stream, Metadata metadata) throws TikaConfigException { HandlerConfig handlerConfig = fetchEmitTuple.getHandlerConfig(); List metadataList; - //this adds the EmbeddedDocumentByteStore to the parsecontext + // this adds the EmbeddedDocumentByteStore to the parsecontext ParseContext parseContext = createParseContext(fetchEmitTuple); if (handlerConfig.getParseMode() == HandlerConfig.PARSE_MODE.RMETA) { metadataList = parseRecursive(fetchEmitTuple, handlerConfig, stream, metadata, parseContext); } else { - metadataList = parseConcatenated(fetchEmitTuple, handlerConfig, stream, metadata, - parseContext); + metadataList = + parseConcatenated( + fetchEmitTuple, handlerConfig, stream, metadata, parseContext); } - return new MetadataListAndEmbeddedBytes(metadataList, - parseContext.get(EmbeddedDocumentBytesHandler.class)); + return new MetadataListAndEmbeddedBytes( + metadataList, parseContext.get(EmbeddedDocumentBytesHandler.class)); } private ParseContext createParseContext(FetchEmitTuple fetchEmitTuple) throws TikaConfigException { ParseContext parseContext = new ParseContext(); - if (! 
fetchEmitTuple.getEmbeddedDocumentBytesConfig().isExtractEmbeddedDocumentBytes()) { + if (!fetchEmitTuple.getEmbeddedDocumentBytesConfig().isExtractEmbeddedDocumentBytes()) { return parseContext; } - EmbeddedDocumentExtractorFactory factory = ((AutoDetectParser)autoDetectParser) - .getAutoDetectParserConfig().getEmbeddedDocumentExtractorFactory(); + EmbeddedDocumentExtractorFactory factory = + ((AutoDetectParser) autoDetectParser) + .getAutoDetectParserConfig() + .getEmbeddedDocumentExtractorFactory(); if (factory == null) { - parseContext.set(EmbeddedDocumentExtractor.class, new RUnpackExtractor(parseContext, - RUnpackExtractorFactory.DEFAULT_MAX_EMBEDDED_BYTES_FOR_EXTRACTION)); + parseContext.set( + EmbeddedDocumentExtractor.class, + new RUnpackExtractor( + parseContext, + RUnpackExtractorFactory.DEFAULT_MAX_EMBEDDED_BYTES_FOR_EXTRACTION)); } else { - if (! (factory instanceof EmbeddedDocumentByteStoreExtractorFactory)) { - throw new TikaConfigException("EmbeddedDocumentExtractorFactory must be an " + - "instance of EmbeddedDocumentByteStoreExtractorFactory if you want" + - "to extract embedded bytes! I see this embedded doc factory: " + - factory.getClass() + "and a request: " + - fetchEmitTuple.getEmbeddedDocumentBytesConfig()); + if (!(factory instanceof EmbeddedDocumentByteStoreExtractorFactory)) { + throw new TikaConfigException( + "EmbeddedDocumentExtractorFactory must be an " + + "instance of EmbeddedDocumentByteStoreExtractorFactory if you want" + + "to extract embedded bytes! I see this embedded doc factory: " + + factory.getClass() + + "and a request: " + + fetchEmitTuple.getEmbeddedDocumentBytesConfig()); } } - //TODO: especially clean this up. + // TODO: especially clean this up. 
if (!StringUtils.isBlank(fetchEmitTuple.getEmbeddedDocumentBytesConfig().getEmitter())) { - parseContext.set(EmbeddedDocumentBytesHandler.class, - new EmittingEmbeddedDocumentBytesHandler(fetchEmitTuple.getEmitKey(), - fetchEmitTuple.getEmbeddedDocumentBytesConfig(), emitterManager)); + parseContext.set( + EmbeddedDocumentBytesHandler.class, + new EmittingEmbeddedDocumentBytesHandler( + fetchEmitTuple.getEmitKey(), + fetchEmitTuple.getEmbeddedDocumentBytesConfig(), + emitterManager)); } else { - parseContext.set(EmbeddedDocumentBytesHandler.class, + parseContext.set( + EmbeddedDocumentBytesHandler.class, new BasicEmbeddedDocumentBytesHandler( - fetchEmitTuple.getEmbeddedDocumentBytesConfig())); + fetchEmitTuple.getEmbeddedDocumentBytesConfig())); } return parseContext; } - private List parseConcatenated(FetchEmitTuple fetchEmitTuple, - HandlerConfig handlerConfig, InputStream stream, - Metadata metadata, ParseContext parseContext) { + private List parseConcatenated( + FetchEmitTuple fetchEmitTuple, + HandlerConfig handlerConfig, + InputStream stream, + Metadata metadata, + ParseContext parseContext) { ContentHandlerFactory contentHandlerFactory = - new BasicContentHandlerFactory(handlerConfig.getType(), - handlerConfig.getWriteLimit(), handlerConfig.isThrowOnWriteLimitReached(), + new BasicContentHandlerFactory( + handlerConfig.getType(), + handlerConfig.getWriteLimit(), + handlerConfig.isThrowOnWriteLimitReached(), parseContext); ContentHandler handler = contentHandlerFactory.getNewContentHandler(); - parseContext.set(DocumentSelector.class, new DocumentSelector() { - final int maxEmbedded = handlerConfig.maxEmbeddedResources; - int embedded = 0; - - @Override - public boolean select(Metadata metadata) { - if (maxEmbedded < 0) { - return true; - } - return embedded++ < maxEmbedded; - } - }); + parseContext.set( + DocumentSelector.class, + new DocumentSelector() { + final int maxEmbedded = handlerConfig.maxEmbeddedResources; + int embedded = 0; + + @Override + 
public boolean select(Metadata metadata) { + if (maxEmbedded < 0) { + return true; + } + return embedded++ < maxEmbedded; + } + }); String containerException = null; long start = System.currentTimeMillis(); @@ -629,15 +686,22 @@ public boolean select(Metadata metadata) { return Collections.singletonList(metadata); } - private List parseRecursive(FetchEmitTuple fetchEmitTuple, - HandlerConfig handlerConfig, InputStream stream, - Metadata metadata, ParseContext parseContext) { - //Intentionally do not add the metadata filter here! - //We need to let stacktraces percolate - RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler( - new BasicContentHandlerFactory(handlerConfig.getType(), - handlerConfig.getWriteLimit(), handlerConfig.isThrowOnWriteLimitReached(), - parseContext), handlerConfig.getMaxEmbeddedResources()); + private List parseRecursive( + FetchEmitTuple fetchEmitTuple, + HandlerConfig handlerConfig, + InputStream stream, + Metadata metadata, + ParseContext parseContext) { + // Intentionally do not add the metadata filter here! 
+ // We need to let stacktraces percolate + RecursiveParserWrapperHandler handler = + new RecursiveParserWrapperHandler( + new BasicContentHandlerFactory( + handlerConfig.getType(), + handlerConfig.getWriteLimit(), + handlerConfig.isThrowOnWriteLimitReached(), + parseContext), + handlerConfig.getMaxEmbeddedResources()); long start = System.currentTimeMillis(); @@ -661,8 +725,8 @@ private List parseRecursive(FetchEmitTuple fetchEmitTuple, return handler.getMetadataList(); } - private void preParse(FetchEmitTuple t, InputStream stream, Metadata metadata, - ParseContext parseContext) { + private void preParse( + FetchEmitTuple t, InputStream stream, Metadata metadata, ParseContext parseContext) { TemporaryResources tmp = null; try { TikaInputStream tis = TikaInputStream.cast(stream); @@ -673,12 +737,12 @@ private void preParse(FetchEmitTuple t, InputStream stream, Metadata metadata, } finally { IOUtils.closeQuietly(tmp); } - //do we want to filter the metadata to digest, length, content-type? + // do we want to filter the metadata to digest, length, content-type? 
writeIntermediate(t.getEmitKey(), metadata); } - private void _preParse(FetchEmitTuple t, TikaInputStream tis, Metadata metadata, - ParseContext parseContext) { + private void _preParse( + FetchEmitTuple t, TikaInputStream tis, Metadata metadata, ParseContext parseContext) { if (digester != null) { try { digester.digest(tis, metadata, parseContext); @@ -694,8 +758,8 @@ private void _preParse(FetchEmitTuple t, TikaInputStream tis, Metadata metadata, LOG.warn("problem detecting: " + t.getId(), e); } - if (t.getEmbeddedDocumentBytesConfig() != null && - t.getEmbeddedDocumentBytesConfig().isIncludeOriginal()) { + if (t.getEmbeddedDocumentBytesConfig() != null + && t.getEmbeddedDocumentBytesConfig().isIncludeOriginal()) { EmbeddedDocumentBytesHandler embeddedDocumentByteStore = parseContext.get(EmbeddedDocumentBytesHandler.class); try (InputStream is = Files.newInputStream(tis.getPath())) { @@ -708,7 +772,7 @@ private void _preParse(FetchEmitTuple t, TikaInputStream tis, Metadata metadata, private void injectUserMetadata(Metadata userMetadata, List metadataList) { for (String n : userMetadata.names()) { - //overwrite whatever was there + // overwrite whatever was there metadataList.get(0).set(n, null); for (String val : userMetadata.getValues(n)) { metadataList.get(0).add(n, val); @@ -725,14 +789,13 @@ private void exit(int exitCode) { System.exit(exitCode); } - private FetchEmitTuple readFetchEmitTuple() { try { int length = input.readInt(); byte[] bytes = new byte[length]; input.readFully(bytes); - try (ObjectInputStream objectInputStream = new ObjectInputStream( - new UnsynchronizedByteArrayInputStream(bytes))) { + try (ObjectInputStream objectInputStream = + new ObjectInputStream(new UnsynchronizedByteArrayInputStream(bytes))) { return (FetchEmitTuple) objectInputStream.readObject(); } } catch (IOException e) { @@ -742,16 +805,16 @@ private FetchEmitTuple readFetchEmitTuple() { LOG.error("can't find class?!", e); exit(1); } - //unreachable, no?! 
+ // unreachable, no?! return null; } protected void initializeResources() throws TikaException, IOException, SAXException { - //TODO allowed named configurations in tika config + // TODO allowed named configurations in tika config this.tikaConfig = new TikaConfig(tikaConfigPath); this.fetcherManager = FetcherManager.load(tikaConfigPath); - //skip initialization of the emitters if emitting - //from the pipesserver is turned off. + // skip initialization of the emitters if emitting + // from the pipesserver is turned off. if (maxForEmitBatchBytes > -1) { this.emitterManager = EmitterManager.load(tikaConfigPath); } else { @@ -759,30 +822,37 @@ protected void initializeResources() throws TikaException, IOException, SAXExcep this.emitterManager = null; } this.autoDetectParser = new AutoDetectParser(this.tikaConfig); - if (((AutoDetectParser) autoDetectParser).getAutoDetectParserConfig() - .getDigesterFactory() != null) { - this.digester = ((AutoDetectParser) autoDetectParser).getAutoDetectParserConfig() - .getDigesterFactory().build(); - //override this value because we'll be digesting before parse - ((AutoDetectParser) autoDetectParser).getAutoDetectParserConfig().getDigesterFactory() + if (((AutoDetectParser) autoDetectParser).getAutoDetectParserConfig().getDigesterFactory() + != null) { + this.digester = + ((AutoDetectParser) autoDetectParser) + .getAutoDetectParserConfig() + .getDigesterFactory() + .build(); + // override this value because we'll be digesting before parse + ((AutoDetectParser) autoDetectParser) + .getAutoDetectParserConfig() + .getDigesterFactory() .setSkipContainerDocument(true); - //if the user hasn't configured an embedded document extractor, set up the + // if the user hasn't configured an embedded document extractor, set up the // RUnpackExtractorFactory - if (((AutoDetectParser) autoDetectParser).getAutoDetectParserConfig() - .getEmbeddedDocumentExtractorFactory() == null) { + if (((AutoDetectParser) autoDetectParser) + 
.getAutoDetectParserConfig() + .getEmbeddedDocumentExtractorFactory() + == null) { ((AutoDetectParser) autoDetectParser) - .getAutoDetectParserConfig().setEmbeddedDocumentExtractorFactory( - new RUnpackExtractorFactory()); + .getAutoDetectParserConfig() + .setEmbeddedDocumentExtractorFactory(new RUnpackExtractorFactory()); } } this.detector = ((AutoDetectParser) this.autoDetectParser).getDetector(); this.rMetaParser = new RecursiveParserWrapper(autoDetectParser); } - private void writeIntermediate(EmitKey emitKey, Metadata metadata) { try { - UnsynchronizedByteArrayOutputStream bos = UnsynchronizedByteArrayOutputStream.builder().get(); + UnsynchronizedByteArrayOutputStream bos = + UnsynchronizedByteArrayOutputStream.builder().get(); try (ObjectOutputStream objectOutputStream = new ObjectOutputStream(bos)) { objectOutputStream.writeObject(metadata); } @@ -795,7 +865,8 @@ private void writeIntermediate(EmitKey emitKey, Metadata metadata) { private void write(EmitData emitData) { try { - UnsynchronizedByteArrayOutputStream bos = UnsynchronizedByteArrayOutputStream.builder().get(); + UnsynchronizedByteArrayOutputStream bos = + UnsynchronizedByteArrayOutputStream.builder().get(); try (ObjectOutputStream objectOutputStream = new ObjectOutputStream(bos)) { objectOutputStream.writeObject(emitData); } @@ -838,8 +909,9 @@ class MetadataListAndEmbeddedBytes { final List metadataList; final Optional embeddedDocumentBytesHandler; - public MetadataListAndEmbeddedBytes(List metadataList, - EmbeddedDocumentBytesHandler embeddedDocumentBytesHandler) { + public MetadataListAndEmbeddedBytes( + List metadataList, + EmbeddedDocumentBytesHandler embeddedDocumentBytesHandler) { this.metadataList = metadataList; this.embeddedDocumentBytesHandler = Optional.ofNullable(embeddedDocumentBytesHandler); } @@ -853,8 +925,8 @@ public EmbeddedDocumentBytesHandler getEmbeddedDocumentBytesHandler() { } /** - * This tests whether there's any type of embedded document store - * ...that, for example, 
may require closing at the end of the parse. + * This tests whether there's any type of embedded document store ...that, for example, may + * require closing at the end of the parse. * * @return */ @@ -863,15 +935,16 @@ public boolean hasEmbeddedDocumentByteStore() { } /** - * If the intent is that the metadata and byte store be packaged in a zip - * or similar and emitted via a single stream emitter. - *

- * This is basically a test that this is not an EmbeddedDocumentEmitterStore. + * If the intent is that the metadata and byte store be packaged in a zip or similar and + * emitted via a single stream emitter. + * + *

This is basically a test that this is not an EmbeddedDocumentEmitterStore. * * @return */ public boolean toBePackagedForStreamEmitter() { - return !(embeddedDocumentBytesHandler.get() instanceof EmittingEmbeddedDocumentBytesHandler); + return !(embeddedDocumentBytesHandler.get() + instanceof EmittingEmbeddedDocumentBytesHandler); } } } diff --git a/tika-core/src/main/java/org/apache/tika/pipes/async/AsyncConfig.java b/tika-core/src/main/java/org/apache/tika/pipes/async/AsyncConfig.java index bc55cca5db..29d8c2ee5c 100644 --- a/tika-core/src/main/java/org/apache/tika/pipes/async/AsyncConfig.java +++ b/tika-core/src/main/java/org/apache/tika/pipes/async/AsyncConfig.java @@ -20,7 +20,6 @@ import java.io.InputStream; import java.nio.file.Files; import java.nio.file.Path; - import org.apache.tika.exception.TikaConfigException; import org.apache.tika.pipes.PipesConfigBase; import org.apache.tika.pipes.PipesReporter; @@ -53,9 +52,8 @@ public long getEmitWithinMillis() { } /** - * If nothing has been emitted in this amount of time - * and the {@link #getEmitMaxEstimatedBytes()} has not been reached yet, - * emit what's in the emit queue. + * If nothing has been emitted in this amount of time and the {@link + * #getEmitMaxEstimatedBytes()} has not been reached yet, emit what's in the emit queue. * * @param emitWithinMillis */ @@ -64,8 +62,9 @@ public void setEmitWithinMillis(long emitWithinMillis) { } /** - * When the emit queue hits this estimated size (sum of - * estimated extract sizes), emit the batch. + * When the emit queue hits this estimated size (sum of estimated extract sizes), emit the + * batch. 
+ * * @return */ public long getEmitMaxEstimatedBytes() { @@ -76,13 +75,13 @@ public void setEmitMaxEstimatedBytes(long emitMaxEstimatedBytes) { this.emitMaxEstimatedBytes = emitMaxEstimatedBytes; } - public void setNumEmitters(int numEmitters) { this.numEmitters = numEmitters; } /** * FetchEmitTuple queue size + * * @return */ public int getQueueSize() { diff --git a/tika-core/src/main/java/org/apache/tika/pipes/async/AsyncEmitter.java b/tika-core/src/main/java/org/apache/tika/pipes/async/AsyncEmitter.java index fce65c5403..25d2c67aeb 100644 --- a/tika-core/src/main/java/org/apache/tika/pipes/async/AsyncEmitter.java +++ b/tika-core/src/main/java/org/apache/tika/pipes/async/AsyncEmitter.java @@ -26,20 +26,15 @@ import java.util.concurrent.ArrayBlockingQueue; import java.util.concurrent.Callable; import java.util.concurrent.TimeUnit; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - import org.apache.tika.pipes.emitter.EmitData; import org.apache.tika.pipes.emitter.Emitter; import org.apache.tika.pipes.emitter.EmitterManager; import org.apache.tika.pipes.emitter.TikaEmitterException; import org.apache.tika.utils.ExceptionUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; -/** - * Worker thread that takes EmitData off the queue, batches it - * and tries to emit it as a batch - */ +/** Worker thread that takes EmitData off the queue, batches it and tries to emit it as a batch */ public class AsyncEmitter implements Callable { static final EmitData EMIT_DATA_STOP_SEMAPHORE = new EmitData(null, null); @@ -53,8 +48,10 @@ public class AsyncEmitter implements Callable { Instant lastEmitted = Instant.now(); - public AsyncEmitter(AsyncConfig asyncConfig, ArrayBlockingQueue emitData, - EmitterManager emitterManager) { + public AsyncEmitter( + AsyncConfig asyncConfig, + ArrayBlockingQueue emitData, + EmitterManager emitterManager) { this.asyncConfig = asyncConfig; this.emitDataQueue = emitData; this.emitterManager = emitterManager; @@ -71,17 +68,22 
@@ public Integer call() throws Exception { return EMITTER_FUTURE_CODE; } if (emitData != null) { - //this can block on emitAll + // this can block on emitAll cache.add(emitData); } else { LOG.trace("Nothing on the async queue"); } - LOG.debug("cache size: ({}) bytes and extract count: {}", cache.estimatedSize, + LOG.debug( + "cache size: ({}) bytes and extract count: {}", + cache.estimatedSize, cache.size); long elapsed = ChronoUnit.MILLIS.between(lastEmitted, Instant.now()); if (elapsed > asyncConfig.getEmitWithinMillis()) { - LOG.debug("{} elapsed > {}, going to emitAll", elapsed, asyncConfig.getEmitWithinMillis()); - //this can block + LOG.debug( + "{} elapsed > {}, going to emitAll", + elapsed, + asyncConfig.getEmitWithinMillis()); + // this can block cache.emitAll(); } } @@ -106,11 +108,14 @@ void add(EmitData data) { size++; long sz = data.getEstimatedSizeBytes(); if (estimatedSize + sz > maxBytes) { - LOG.debug("estimated size ({}) > maxBytes({}), going to emitAll", - (estimatedSize + sz), maxBytes); + LOG.debug( + "estimated size ({}) > maxBytes({}), going to emitAll", + (estimatedSize + sz), + maxBytes); emitAll(); } - List cached = map.computeIfAbsent(data.getEmitKey().getEmitterName(), k -> new ArrayList<>()); + List cached = + map.computeIfAbsent(data.getEmitKey().getEmitterName(), k -> new ArrayList<>()); updateEstimatedSize(sz); cached.add(data); } @@ -136,7 +141,9 @@ private void tryToEmit(Emitter emitter, List cachedEmitData) { try { emitter.emit(cachedEmitData); } catch (IOException | TikaEmitterException e) { - LOG.warn("emitter class ({}): {}", emitter.getClass(), + LOG.warn( + "emitter class ({}): {}", + emitter.getClass(), ExceptionUtils.getStackTrace(e)); } } diff --git a/tika-core/src/main/java/org/apache/tika/pipes/async/AsyncProcessor.java b/tika-core/src/main/java/org/apache/tika/pipes/async/AsyncProcessor.java index 3a6751f4ff..850e7735b0 100644 --- a/tika-core/src/main/java/org/apache/tika/pipes/async/AsyncProcessor.java +++ 
b/tika-core/src/main/java/org/apache/tika/pipes/async/AsyncProcessor.java @@ -29,10 +29,6 @@ import java.util.concurrent.Future; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicLong; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - import org.apache.tika.exception.TikaException; import org.apache.tika.pipes.FetchEmitTuple; import org.apache.tika.pipes.PipesClient; @@ -44,11 +40,11 @@ import org.apache.tika.pipes.pipesiterator.PipesIterator; import org.apache.tika.pipes.pipesiterator.TotalCountResult; import org.apache.tika.pipes.pipesiterator.TotalCounter; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** - * This is the main class for handling async requests. This manages - * AsyncClients and AsyncEmitters. - * + * This is the main class for handling async requests. This manages AsyncClients and AsyncEmitters. */ public class AsyncProcessor implements Closeable { @@ -73,33 +69,39 @@ public AsyncProcessor(Path tikaConfigPath) throws TikaException, IOException { this(tikaConfigPath, null); } - public AsyncProcessor(Path tikaConfigPath, PipesIterator pipesIterator) throws TikaException, IOException { + public AsyncProcessor(Path tikaConfigPath, PipesIterator pipesIterator) + throws TikaException, IOException { this.asyncConfig = AsyncConfig.load(tikaConfigPath); this.fetchEmitTuples = new ArrayBlockingQueue<>(asyncConfig.getQueueSize()); this.emitData = new ArrayBlockingQueue<>(100); - //+1 is the watcher thread - this.executorService = Executors.newFixedThreadPool( - asyncConfig.getNumClients() + asyncConfig.getNumEmitters() + 1); - this.executorCompletionService = - new ExecutorCompletionService<>(executorService); + // +1 is the watcher thread + this.executorService = + Executors.newFixedThreadPool( + asyncConfig.getNumClients() + asyncConfig.getNumEmitters() + 1); + this.executorCompletionService = new ExecutorCompletionService<>(executorService); try { - if 
(!tikaConfigPath.toAbsolutePath().equals(asyncConfig.getTikaConfig().toAbsolutePath())) { - LOG.warn("TikaConfig for AsyncProcessor ({}) is different " + - "from TikaConfig for workers ({}). If this is intended," + - " please ignore this warning.", tikaConfigPath.toAbsolutePath(), + if (!tikaConfigPath + .toAbsolutePath() + .equals(asyncConfig.getTikaConfig().toAbsolutePath())) { + LOG.warn( + "TikaConfig for AsyncProcessor ({}) is different " + + "from TikaConfig for workers ({}). If this is intended," + + " please ignore this warning.", + tikaConfigPath.toAbsolutePath(), asyncConfig.getTikaConfig().toAbsolutePath()); } - this.executorCompletionService.submit(() -> { - while (true) { - try { - Thread.sleep(500); - checkActive(); - } catch (InterruptedException e) { - return WATCHER_FUTURE_CODE; - } - } - }); - //this is run in a daemon thread + this.executorCompletionService.submit( + () -> { + while (true) { + try { + Thread.sleep(500); + checkActive(); + } catch (InterruptedException e) { + return WATCHER_FUTURE_CODE; + } + } + }); + // this is run in a daemon thread if (pipesIterator != null && (pipesIterator instanceof TotalCounter)) { LOG.debug("going to total counts"); startCounter((TotalCounter) pipesIterator); @@ -124,23 +126,28 @@ public AsyncProcessor(Path tikaConfigPath, PipesIterator pipesIterator) throws T } private void startCounter(TotalCounter totalCounter) { - Thread counterThread = new Thread(() -> { - totalCounter.startTotalCount(); - PipesReporter pipesReporter = asyncConfig.getPipesReporter(); - TotalCountResult.STATUS status = totalCounter.getTotalCount().getStatus(); - while (status == TotalCountResult.STATUS.NOT_COMPLETED) { - try { - Thread.sleep(500); - TotalCountResult result = totalCounter.getTotalCount(); - LOG.trace("counter total {} {} ", result.getStatus(), result.getTotalCount()); - pipesReporter.report(result); - status = result.getStatus(); - } catch (InterruptedException e) { - return; - } - } - - }); + Thread counterThread = + 
new Thread( + () -> { + totalCounter.startTotalCount(); + PipesReporter pipesReporter = asyncConfig.getPipesReporter(); + TotalCountResult.STATUS status = + totalCounter.getTotalCount().getStatus(); + while (status == TotalCountResult.STATUS.NOT_COMPLETED) { + try { + Thread.sleep(500); + TotalCountResult result = totalCounter.getTotalCount(); + LOG.trace( + "counter total {} {} ", + result.getStatus(), + result.getTotalCount()); + pipesReporter.report(result); + status = result.getStatus(); + } catch (InterruptedException e) { + return; + } + } + }); counterThread.setDaemon(true); counterThread.start(); } @@ -152,8 +159,8 @@ public synchronized boolean offer(List newFetchEmitTuples, long "Can't call offer after calling close() or " + "shutdownNow()"); } if (newFetchEmitTuples.size() > asyncConfig.getQueueSize()) { - throw new OfferLargerThanQueueSize(newFetchEmitTuples.size(), - asyncConfig.getQueueSize()); + throw new OfferLargerThanQueueSize( + newFetchEmitTuples.size(), asyncConfig.getQueueSize()); } long start = System.currentTimeMillis(); long elapsed = System.currentTimeMillis() - start; @@ -163,8 +170,8 @@ public synchronized boolean offer(List newFetchEmitTuples, long fetchEmitTuples.addAll(newFetchEmitTuples); return true; } catch (IllegalStateException e) { - //this means that the add all failed because the queue couldn't - //take the full list + // this means that the add all failed because the queue couldn't + // take the full list LOG.debug("couldn't add full list", e); } } @@ -192,11 +199,14 @@ public synchronized boolean offer(FetchEmitTuple t, long offerMs) public void finished() throws InterruptedException { for (int i = 0; i < asyncConfig.getNumClients(); i++) { - boolean offered = fetchEmitTuples.offer(PipesIterator.COMPLETED_SEMAPHORE, - MAX_OFFER_WAIT_MS, TimeUnit.MILLISECONDS); - if (! 
offered) { - throw new RuntimeException("Couldn't offer completed semaphore within " + - MAX_OFFER_WAIT_MS + " ms"); + boolean offered = + fetchEmitTuples.offer( + PipesIterator.COMPLETED_SEMAPHORE, + MAX_OFFER_WAIT_MS, + TimeUnit.MILLISECONDS); + if (!offered) { + throw new RuntimeException( + "Couldn't offer completed semaphore within " + MAX_OFFER_WAIT_MS + " ms"); } } } @@ -208,19 +218,20 @@ public synchronized boolean checkActive() throws InterruptedException { try { Integer i = future.get(); switch (i) { - case PARSER_FUTURE_CODE : + case PARSER_FUTURE_CODE: numParserThreadsFinished++; LOG.debug("fetchEmitWorker finished, total {}", numParserThreadsFinished); break; - case AsyncEmitter.EMITTER_FUTURE_CODE : + case AsyncEmitter.EMITTER_FUTURE_CODE: numEmitterThreadsFinished++; LOG.debug("emitter thread finished, total {}", numEmitterThreadsFinished); break; - case WATCHER_FUTURE_CODE : + case WATCHER_FUTURE_CODE: LOG.debug("watcher thread finished"); break; - default : - throw new IllegalArgumentException("Don't recognize this future code: " + i); + default: + throw new IllegalArgumentException( + "Don't recognize this future code: " + i); } } catch (ExecutionException e) { LOG.error("execution exception", e); @@ -228,15 +239,20 @@ public synchronized boolean checkActive() throws InterruptedException { throw new RuntimeException(e); } } - if (numParserThreadsFinished == asyncConfig.getNumClients() && ! addedEmitterSemaphores) { + if (numParserThreadsFinished == asyncConfig.getNumClients() && !addedEmitterSemaphores) { for (int i = 0; i < asyncConfig.getNumEmitters(); i++) { try { - boolean offered = emitData.offer(AsyncEmitter.EMIT_DATA_STOP_SEMAPHORE, - MAX_OFFER_WAIT_MS, - TimeUnit.MILLISECONDS); - if (! 
offered) { - throw new RuntimeException("Couldn't offer emit data stop semaphore " + - "within " + MAX_OFFER_WAIT_MS + " ms"); + boolean offered = + emitData.offer( + AsyncEmitter.EMIT_DATA_STOP_SEMAPHORE, + MAX_OFFER_WAIT_MS, + TimeUnit.MILLISECONDS); + if (!offered) { + throw new RuntimeException( + "Couldn't offer emit data stop semaphore " + + "within " + + MAX_OFFER_WAIT_MS + + " ms"); } } catch (InterruptedException e) { throw new RuntimeException(e); @@ -244,8 +260,8 @@ public synchronized boolean checkActive() throws InterruptedException { } addedEmitterSemaphores = true; } - return !(numParserThreadsFinished == asyncConfig.getNumClients() && - numEmitterThreadsFinished == asyncConfig.getNumEmitters()); + return !(numParserThreadsFinished == asyncConfig.getNumClients() + && numEmitterThreadsFinished == asyncConfig.getNumEmitters()); } @Override @@ -264,9 +280,10 @@ private class FetchEmitWorker implements Callable { private final ArrayBlockingQueue fetchEmitTuples; private final ArrayBlockingQueue emitDataQueue; - private FetchEmitWorker(AsyncConfig asyncConfig, - ArrayBlockingQueue fetchEmitTuples, - ArrayBlockingQueue emitDataQueue) { + private FetchEmitWorker( + AsyncConfig asyncConfig, + ArrayBlockingQueue fetchEmitTuples, + ArrayBlockingQueue emitDataQueue) { this.asyncConfig = asyncConfig; this.fetchEmitTuples = fetchEmitTuples; this.emitDataQueue = emitDataQueue; @@ -279,7 +296,7 @@ public Integer call() throws Exception { while (true) { FetchEmitTuple t = fetchEmitTuples.poll(1, TimeUnit.SECONDS); if (t == null) { - //skip + // skip if (LOG.isTraceEnabled()) { LOG.trace("null fetch emit tuple"); } @@ -298,23 +315,30 @@ public Integer call() throws Exception { result = PipesResult.UNSPECIFIED_CRASH; } if (LOG.isTraceEnabled()) { - LOG.trace("timer -- pipes client process: {} ms", + LOG.trace( + "timer -- pipes client process: {} ms", System.currentTimeMillis() - start); } long offerStart = System.currentTimeMillis(); if (shouldEmit(result)) { 
LOG.trace("adding result to emitter queue: " + result.getEmitData()); - boolean offered = emitDataQueue.offer(result.getEmitData(), - MAX_OFFER_WAIT_MS, - TimeUnit.MILLISECONDS); - if (! offered) { - throw new RuntimeException("Couldn't offer emit data to queue " + - "within " + MAX_OFFER_WAIT_MS + " ms"); + boolean offered = + emitDataQueue.offer( + result.getEmitData(), + MAX_OFFER_WAIT_MS, + TimeUnit.MILLISECONDS); + if (!offered) { + throw new RuntimeException( + "Couldn't offer emit data to queue " + + "within " + + MAX_OFFER_WAIT_MS + + " ms"); } } if (LOG.isTraceEnabled()) { - LOG.trace("timer -- offered: {} ms", + LOG.trace( + "timer -- offered: {} ms", System.currentTimeMillis() - offerStart); } long elapsed = System.currentTimeMillis() - start; @@ -327,8 +351,8 @@ public Integer call() throws Exception { private boolean shouldEmit(PipesResult result) { - if (result.getStatus() == PipesResult.STATUS.PARSE_SUCCESS || - result.getStatus() == PipesResult.STATUS.PARSE_SUCCESS_WITH_EXCEPTION) { + if (result.getStatus() == PipesResult.STATUS.PARSE_SUCCESS + || result.getStatus() == PipesResult.STATUS.PARSE_SUCCESS_WITH_EXCEPTION) { return true; } return result.isIntermediate() && asyncConfig.isEmitIntermediateResults(); diff --git a/tika-core/src/main/java/org/apache/tika/pipes/async/AsyncStatus.java b/tika-core/src/main/java/org/apache/tika/pipes/async/AsyncStatus.java index 46a58ff2ba..ecd779e4ae 100644 --- a/tika-core/src/main/java/org/apache/tika/pipes/async/AsyncStatus.java +++ b/tika-core/src/main/java/org/apache/tika/pipes/async/AsyncStatus.java @@ -19,7 +19,6 @@ import java.time.Instant; import java.util.HashMap; import java.util.Map; - import org.apache.tika.pipes.PipesResult; import org.apache.tika.pipes.pipesiterator.TotalCountResult; import org.apache.tika.utils.StringUtils; @@ -31,10 +30,12 @@ public enum ASYNC_STATUS { COMPLETED, CRASHED } + private final Instant started; private Instant lastUpdate; - private TotalCountResult totalCountResult = new 
TotalCountResult(0, TotalCountResult.STATUS.NOT_COMPLETED); + private TotalCountResult totalCountResult = + new TotalCountResult(0, TotalCountResult.STATUS.NOT_COMPLETED); private Map statusCounts = new HashMap<>(); private ASYNC_STATUS asyncStatus = ASYNC_STATUS.STARTED; @@ -45,8 +46,10 @@ public AsyncStatus() { lastUpdate = started; } - public synchronized void update(Map statusCounts, - TotalCountResult totalCountResult, ASYNC_STATUS status) { + public synchronized void update( + Map statusCounts, + TotalCountResult totalCountResult, + ASYNC_STATUS status) { this.lastUpdate = Instant.now(); this.statusCounts = statusCounts; this.totalCountResult = totalCountResult; @@ -83,8 +86,20 @@ public String getCrashMessage() { @Override public String toString() { - return "AsyncStatus{" + "started=" + started + ", lastUpdate=" + lastUpdate + - ", totalCountResult=" + totalCountResult + ", statusCounts=" + statusCounts + - ", asyncStatus=" + asyncStatus + ", crashMessage='" + crashMessage + '\'' + '}'; + return "AsyncStatus{" + + "started=" + + started + + ", lastUpdate=" + + lastUpdate + + ", totalCountResult=" + + totalCountResult + + ", statusCounts=" + + statusCounts + + ", asyncStatus=" + + asyncStatus + + ", crashMessage='" + + crashMessage + + '\'' + + '}'; } } diff --git a/tika-core/src/main/java/org/apache/tika/pipes/async/OfferLargerThanQueueSize.java b/tika-core/src/main/java/org/apache/tika/pipes/async/OfferLargerThanQueueSize.java index da96c80ba0..0896af6ac8 100644 --- a/tika-core/src/main/java/org/apache/tika/pipes/async/OfferLargerThanQueueSize.java +++ b/tika-core/src/main/java/org/apache/tika/pipes/async/OfferLargerThanQueueSize.java @@ -27,8 +27,7 @@ public OfferLargerThanQueueSize(int sizeOffered, int queueSize) { @Override public String getMessage() { - return "sizeOffered (" + sizeOffered + ") is greater than queue size (" + - queueSize + ")"; + return "sizeOffered (" + sizeOffered + ") is greater than queue size (" + queueSize + ")"; } public int 
getQueueSize() { diff --git a/tika-core/src/main/java/org/apache/tika/pipes/emitter/AbstractEmitter.java b/tika-core/src/main/java/org/apache/tika/pipes/emitter/AbstractEmitter.java index 648e0949dc..13312e07b6 100644 --- a/tika-core/src/main/java/org/apache/tika/pipes/emitter/AbstractEmitter.java +++ b/tika-core/src/main/java/org/apache/tika/pipes/emitter/AbstractEmitter.java @@ -33,9 +33,9 @@ public void setName(String name) { } /** - * The default behavior is to call {@link #emit(String, List)} on each item. - * Some implementations, e.g. Solr/ES/vespa, can benefit from subclassing this and - * emitting a bunch of docs at once. + * The default behavior is to call {@link #emit(String, List)} on each item. Some + * implementations, e.g. Solr/ES/vespa, can benefit from subclassing this and emitting a bunch + * of docs at once. * * @param emitData * @throws IOException diff --git a/tika-core/src/main/java/org/apache/tika/pipes/emitter/EmitData.java b/tika-core/src/main/java/org/apache/tika/pipes/emitter/EmitData.java index 95376a9fac..b56f4e940d 100644 --- a/tika-core/src/main/java/org/apache/tika/pipes/emitter/EmitData.java +++ b/tika-core/src/main/java/org/apache/tika/pipes/emitter/EmitData.java @@ -18,14 +18,11 @@ import java.io.Serializable; import java.util.List; - import org.apache.tika.metadata.Metadata; import org.apache.tika.utils.StringUtils; public class EmitData implements Serializable { - /** - * Serial version UID - */ + /** Serial version UID */ private static final long serialVersionUID = -3861669115439125268L; private final EmitKey emitKey; @@ -40,8 +37,8 @@ public EmitData(EmitKey emitKey, List metadataList) { public EmitData(EmitKey emitKey, List metadataList, String containerStackTrace) { this.emitKey = emitKey; this.metadataList = metadataList; - this.containerStackTrace = (containerStackTrace == null) ? StringUtils.EMPTY : - containerStackTrace; + this.containerStackTrace = + (containerStackTrace == null) ? 
StringUtils.EMPTY : containerStackTrace; } public EmitKey getEmitKey() { @@ -57,11 +54,12 @@ public String getContainerStackTrace() { } public long getEstimatedSizeBytes() { - return estimateSizeInBytes(getEmitKey().getEmitKey(), getMetadataList(), containerStackTrace); + return estimateSizeInBytes( + getEmitKey().getEmitKey(), getMetadataList(), containerStackTrace); } - private static long estimateSizeInBytes(String id, List metadataList, - String containerStackTrace) { + private static long estimateSizeInBytes( + String id, List metadataList, String containerStackTrace) { long sz = 36 + id.length() * 2; sz += 36 + containerStackTrace.length() * 2; for (Metadata m : metadataList) { @@ -77,7 +75,14 @@ private static long estimateSizeInBytes(String id, List metadataList, @Override public String toString() { - return "EmitData{" + "emitKey=" + emitKey + ", metadataList=" + metadataList + - ", containerStackTrace='" + containerStackTrace + '\'' + '}'; + return "EmitData{" + + "emitKey=" + + emitKey + + ", metadataList=" + + metadataList + + ", containerStackTrace='" + + containerStackTrace + + '\'' + + '}'; } } diff --git a/tika-core/src/main/java/org/apache/tika/pipes/emitter/EmitKey.java b/tika-core/src/main/java/org/apache/tika/pipes/emitter/EmitKey.java index e570064805..8ab3b95100 100644 --- a/tika-core/src/main/java/org/apache/tika/pipes/emitter/EmitKey.java +++ b/tika-core/src/main/java/org/apache/tika/pipes/emitter/EmitKey.java @@ -21,18 +21,15 @@ public class EmitKey implements Serializable { - /** - * Serial version UID - */ + /** Serial version UID */ private static final long serialVersionUID = -3861669115439125268L; private String emitterName; private String emitKey; - //for serialization only...yuck. - public EmitKey() { + // for serialization only...yuck. 
+ public EmitKey() {} - } public EmitKey(String emitterName, String emitKey) { this.emitterName = emitterName; this.emitKey = emitKey; @@ -48,8 +45,14 @@ public String getEmitKey() { @Override public String toString() { - return "EmitterKey{" + "emitterName='" + emitterName + '\'' + ", emitterKey='" + emitKey + - '\'' + '}'; + return "EmitterKey{" + + "emitterName='" + + emitterName + + '\'' + + ", emitterKey='" + + emitKey + + '\'' + + '}'; } @Override diff --git a/tika-core/src/main/java/org/apache/tika/pipes/emitter/Emitter.java b/tika-core/src/main/java/org/apache/tika/pipes/emitter/Emitter.java index f60ef3b77e..0c15ec0084 100644 --- a/tika-core/src/main/java/org/apache/tika/pipes/emitter/Emitter.java +++ b/tika-core/src/main/java/org/apache/tika/pipes/emitter/Emitter.java @@ -18,7 +18,6 @@ import java.io.IOException; import java.util.List; - import org.apache.tika.metadata.Metadata; public interface Emitter { @@ -28,7 +27,7 @@ public interface Emitter { void emit(String emitKey, List metadataList) throws IOException, TikaEmitterException; void emit(List emitData) throws IOException, TikaEmitterException; - //TODO -- add this later for xhtml? - //void emit(String txt, Metadata metadata) throws IOException, TikaException; + // TODO -- add this later for xhtml? 
+ // void emit(String txt, Metadata metadata) throws IOException, TikaException; } diff --git a/tika-core/src/main/java/org/apache/tika/pipes/emitter/EmitterManager.java b/tika-core/src/main/java/org/apache/tika/pipes/emitter/EmitterManager.java index 7d1aba1cfa..c1245b8895 100644 --- a/tika-core/src/main/java/org/apache/tika/pipes/emitter/EmitterManager.java +++ b/tika-core/src/main/java/org/apache/tika/pipes/emitter/EmitterManager.java @@ -24,32 +24,26 @@ import java.util.Map; import java.util.Set; import java.util.concurrent.ConcurrentHashMap; - import org.apache.tika.config.ConfigBase; import org.apache.tika.exception.TikaConfigException; /** - * Utility class that will apply the appropriate fetcher - * to the fetcherString based on the prefix. - *

- * This does not allow multiple fetchers supporting the same prefix. + * Utility class that will apply the appropriate fetcher to the fetcherString based on the prefix. + * + *

This does not allow multiple fetchers supporting the same prefix. */ public class EmitterManager extends ConfigBase { private final Map emitterMap = new ConcurrentHashMap<>(); public static EmitterManager load(Path tikaConfigPath) throws IOException, TikaConfigException { - try (InputStream is = Files.newInputStream(tikaConfigPath) ) { + try (InputStream is = Files.newInputStream(tikaConfigPath)) { return EmitterManager.buildComposite( - "emitters", EmitterManager.class, - "emitter", - Emitter.class, is); + "emitters", EmitterManager.class, "emitter", Emitter.class, is); } } - private EmitterManager() { - - } + private EmitterManager() {} public EmitterManager(List emitters) { for (Emitter emitter : emitters) { @@ -58,7 +52,6 @@ public EmitterManager(List emitters) { "Multiple emitters cannot support the same name: " + emitter.getName()); } emitterMap.put(emitter.getName(), emitter); - } } @@ -66,7 +59,6 @@ public Set getSupported() { return emitterMap.keySet(); } - public Emitter getEmitter(String emitterName) { Emitter emitter = emitterMap.get(emitterName); if (emitter == null) { @@ -76,9 +68,10 @@ public Emitter getEmitter(String emitterName) { } /** - * Convenience method that returns an emitter if only one emitter - * is specified in the tika-config file. If 0 or > 1 emitters - * are specified, this throws an IllegalArgumentException. + * Convenience method that returns an emitter if only one emitter is specified in the + * tika-config file. If 0 or > 1 emitters are specified, this throws an + * IllegalArgumentException. 
+ * * @return */ public Emitter getEmitter() { @@ -86,13 +79,13 @@ public Emitter getEmitter() { throw new IllegalArgumentException("emitters size must == 1 for the no arg call"); } if (emitterMap.size() > 1) { - throw new IllegalArgumentException("need to specify 'emitterName' if > 1 emitters are" + - " available"); + throw new IllegalArgumentException( + "need to specify 'emitterName' if > 1 emitters are" + " available"); } for (Emitter emitter : emitterMap.values()) { return emitter; } - //this should be unreachable?! + // this should be unreachable?! throw new IllegalArgumentException("emitters size must == 0"); } } diff --git a/tika-core/src/main/java/org/apache/tika/pipes/emitter/EmptyEmitter.java b/tika-core/src/main/java/org/apache/tika/pipes/emitter/EmptyEmitter.java index b77107ba09..ee9851193a 100644 --- a/tika-core/src/main/java/org/apache/tika/pipes/emitter/EmptyEmitter.java +++ b/tika-core/src/main/java/org/apache/tika/pipes/emitter/EmptyEmitter.java @@ -18,7 +18,6 @@ import java.io.IOException; import java.util.List; - import org.apache.tika.metadata.Metadata; public class EmptyEmitter implements Emitter { @@ -30,12 +29,8 @@ public String getName() { @Override public void emit(String emitKey, List metadataList) - throws IOException, TikaEmitterException { - - } + throws IOException, TikaEmitterException {} @Override - public void emit(List emitData) throws IOException, TikaEmitterException { - - } + public void emit(List emitData) throws IOException, TikaEmitterException {} } diff --git a/tika-core/src/main/java/org/apache/tika/pipes/emitter/StreamEmitter.java b/tika-core/src/main/java/org/apache/tika/pipes/emitter/StreamEmitter.java index 10526eb0ee..4876c80bb7 100644 --- a/tika-core/src/main/java/org/apache/tika/pipes/emitter/StreamEmitter.java +++ b/tika-core/src/main/java/org/apache/tika/pipes/emitter/StreamEmitter.java @@ -18,7 +18,6 @@ import java.io.IOException; import java.io.InputStream; - import org.apache.tika.metadata.Metadata; public 
interface StreamEmitter extends Emitter { diff --git a/tika-core/src/main/java/org/apache/tika/pipes/extractor/EmbeddedDocumentBytesConfig.java b/tika-core/src/main/java/org/apache/tika/pipes/extractor/EmbeddedDocumentBytesConfig.java index 071de05c45..7d6bc8719b 100644 --- a/tika-core/src/main/java/org/apache/tika/pipes/extractor/EmbeddedDocumentBytesConfig.java +++ b/tika-core/src/main/java/org/apache/tika/pipes/extractor/EmbeddedDocumentBytesConfig.java @@ -21,16 +21,15 @@ public class EmbeddedDocumentBytesConfig implements Serializable { - /** - * Serial version UID - */ + /** Serial version UID */ private static final long serialVersionUID = -3861669115439125268L; - public static EmbeddedDocumentBytesConfig SKIP = new EmbeddedDocumentBytesConfig(false); public enum SUFFIX_STRATEGY { - NONE, EXISTING, DETECTED; + NONE, + EXISTING, + DETECTED; public static SUFFIX_STRATEGY parse(String s) { if (s.equalsIgnoreCase("none")) { @@ -43,6 +42,7 @@ public static SUFFIX_STRATEGY parse(String s) { throw new IllegalArgumentException("can't parse " + s); } } + private final boolean extractEmbeddedDocumentBytes; private int zeroPadName = 0; @@ -56,9 +56,8 @@ public static SUFFIX_STRATEGY parse(String s) { private boolean includeOriginal = false; /** - * Create an EmbeddedDocumentBytesConfig with - * {@link EmbeddedDocumentBytesConfig#extractEmbeddedDocumentBytes} - * set to true + * Create an EmbeddedDocumentBytesConfig with {@link + * EmbeddedDocumentBytesConfig#extractEmbeddedDocumentBytes} set to true */ public EmbeddedDocumentBytesConfig() { this.extractEmbeddedDocumentBytes = true; @@ -118,11 +117,22 @@ public void setIncludeOriginal(boolean includeOriginal) { @Override public String toString() { - return "EmbeddedDocumentBytesConfig{" + "extractEmbeddedDocumentBytes=" + - extractEmbeddedDocumentBytes + ", zeroPadName=" + zeroPadName + - ", suffixStrategy=" + suffixStrategy + ", embeddedIdPrefix='" + embeddedIdPrefix + - '\'' + ", emitter='" + emitter + '\'' + ", 
includeOriginal=" + includeOriginal + - '}'; + return "EmbeddedDocumentBytesConfig{" + + "extractEmbeddedDocumentBytes=" + + extractEmbeddedDocumentBytes + + ", zeroPadName=" + + zeroPadName + + ", suffixStrategy=" + + suffixStrategy + + ", embeddedIdPrefix='" + + embeddedIdPrefix + + '\'' + + ", emitter='" + + emitter + + '\'' + + ", includeOriginal=" + + includeOriginal + + '}'; } @Override diff --git a/tika-core/src/main/java/org/apache/tika/pipes/extractor/EmittingEmbeddedDocumentBytesHandler.java b/tika-core/src/main/java/org/apache/tika/pipes/extractor/EmittingEmbeddedDocumentBytesHandler.java index 1132a4bc6b..92a51b5eaf 100644 --- a/tika-core/src/main/java/org/apache/tika/pipes/extractor/EmittingEmbeddedDocumentBytesHandler.java +++ b/tika-core/src/main/java/org/apache/tika/pipes/extractor/EmittingEmbeddedDocumentBytesHandler.java @@ -19,9 +19,7 @@ import java.io.Closeable; import java.io.IOException; import java.io.InputStream; - import org.apache.commons.io.IOExceptionWithCause; - import org.apache.tika.exception.TikaConfigException; import org.apache.tika.extractor.AbstractEmbeddedDocumentBytesHandler; import org.apache.tika.metadata.Metadata; @@ -37,26 +35,30 @@ public class EmittingEmbeddedDocumentBytesHandler extends AbstractEmbeddedDocume private final StreamEmitter emitter; private static final Metadata METADATA = new Metadata(); - public EmittingEmbeddedDocumentBytesHandler(EmitKey containerEmitKey, - EmbeddedDocumentBytesConfig embeddedDocumentBytesConfig, - EmitterManager emitterManager) throws TikaConfigException { + + public EmittingEmbeddedDocumentBytesHandler( + EmitKey containerEmitKey, + EmbeddedDocumentBytesConfig embeddedDocumentBytesConfig, + EmitterManager emitterManager) + throws TikaConfigException { this.containerEmitKey = containerEmitKey; this.embeddedDocumentBytesConfig = embeddedDocumentBytesConfig; - Emitter tmpEmitter = - emitterManager.getEmitter(embeddedDocumentBytesConfig.getEmitter()); - if (! 
(tmpEmitter instanceof StreamEmitter)) { - throw new TikaConfigException("Emitter " + - embeddedDocumentBytesConfig.getEmitter() - + " must implement a StreamEmitter"); + Emitter tmpEmitter = emitterManager.getEmitter(embeddedDocumentBytesConfig.getEmitter()); + if (!(tmpEmitter instanceof StreamEmitter)) { + throw new TikaConfigException( + "Emitter " + + embeddedDocumentBytesConfig.getEmitter() + + " must implement a StreamEmitter"); } this.emitter = (StreamEmitter) tmpEmitter; } @Override public void add(int id, Metadata metadata, InputStream inputStream) throws IOException { - //intentionally do not call super.add, because we want the ids list to be empty - String emitKey = getEmitKey(containerEmitKey.getEmitKey(), - id, embeddedDocumentBytesConfig, metadata); + // intentionally do not call super.add, because we want the ids list to be empty + String emitKey = + getEmitKey( + containerEmitKey.getEmitKey(), id, embeddedDocumentBytesConfig, metadata); try { emitter.emit(emitKey, inputStream, METADATA); } catch (TikaEmitterException e) { diff --git a/tika-core/src/main/java/org/apache/tika/pipes/fetcher/AbstractFetcher.java b/tika-core/src/main/java/org/apache/tika/pipes/fetcher/AbstractFetcher.java index 0b417e3fb1..76fde46aa7 100644 --- a/tika-core/src/main/java/org/apache/tika/pipes/fetcher/AbstractFetcher.java +++ b/tika-core/src/main/java/org/apache/tika/pipes/fetcher/AbstractFetcher.java @@ -18,14 +18,11 @@ import org.apache.tika.config.Field; - public abstract class AbstractFetcher implements Fetcher { private String name; - public AbstractFetcher() { - - } + public AbstractFetcher() {} public AbstractFetcher(String name) { this.name = name; @@ -40,5 +37,4 @@ public String getName() { public void setName(String name) { this.name = name; } - } diff --git a/tika-core/src/main/java/org/apache/tika/pipes/fetcher/EmptyFetcher.java b/tika-core/src/main/java/org/apache/tika/pipes/fetcher/EmptyFetcher.java index 022d00a8cb..bd7241684c 100644 --- 
a/tika-core/src/main/java/org/apache/tika/pipes/fetcher/EmptyFetcher.java +++ b/tika-core/src/main/java/org/apache/tika/pipes/fetcher/EmptyFetcher.java @@ -18,7 +18,6 @@ import java.io.IOException; import java.io.InputStream; - import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; diff --git a/tika-core/src/main/java/org/apache/tika/pipes/fetcher/FetchKey.java b/tika-core/src/main/java/org/apache/tika/pipes/fetcher/FetchKey.java index 148e3532c4..3f1d2042eb 100644 --- a/tika-core/src/main/java/org/apache/tika/pipes/fetcher/FetchKey.java +++ b/tika-core/src/main/java/org/apache/tika/pipes/fetcher/FetchKey.java @@ -20,13 +20,11 @@ import java.util.Objects; /** - * Pair of fetcherName (which fetcher to call) and the key - * to send to that fetcher to retrieve a specific file. + * Pair of fetcherName (which fetcher to call) and the key to send to that fetcher to retrieve a + * specific file. */ public class FetchKey implements Serializable { - /** - * Serial version UID - */ + /** Serial version UID */ private static final long serialVersionUID = -3861669115439125268L; private String fetcherName; @@ -34,10 +32,8 @@ public class FetchKey implements Serializable { private long rangeStart = -1; private long rangeEnd = -1; - //this is for serialization...yuck - public FetchKey() { - - } + // this is for serialization...yuck + public FetchKey() {} public FetchKey(String fetcherName, String fetchKey) { this(fetcherName, fetchKey, -1, -1); @@ -79,9 +75,10 @@ public boolean equals(Object o) { return false; } FetchKey fetchKey1 = (FetchKey) o; - return rangeStart == fetchKey1.rangeStart && rangeEnd == fetchKey1.rangeEnd && - Objects.equals(fetcherName, fetchKey1.fetcherName) && - Objects.equals(fetchKey, fetchKey1.fetchKey); + return rangeStart == fetchKey1.rangeStart + && rangeEnd == fetchKey1.rangeEnd + && Objects.equals(fetcherName, fetchKey1.fetcherName) + && Objects.equals(fetchKey, fetchKey1.fetchKey); } @Override @@ -91,7 +88,17 @@ 
public int hashCode() { @Override public String toString() { - return "FetchKey{" + "fetcherName='" + fetcherName + '\'' + ", fetchKey='" + fetchKey + - '\'' + ", rangeStart=" + rangeStart + ", rangeEnd=" + rangeEnd + '}'; + return "FetchKey{" + + "fetcherName='" + + fetcherName + + '\'' + + ", fetchKey='" + + fetchKey + + '\'' + + ", rangeStart=" + + rangeStart + + ", rangeEnd=" + + rangeEnd + + '}'; } } diff --git a/tika-core/src/main/java/org/apache/tika/pipes/fetcher/Fetcher.java b/tika-core/src/main/java/org/apache/tika/pipes/fetcher/Fetcher.java index 1b3fa2a241..dea2467dc6 100644 --- a/tika-core/src/main/java/org/apache/tika/pipes/fetcher/Fetcher.java +++ b/tika-core/src/main/java/org/apache/tika/pipes/fetcher/Fetcher.java @@ -18,16 +18,14 @@ import java.io.IOException; import java.io.InputStream; - import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; /** - * Interface for an object that will fetch an InputStream given - * a fetch string. This will also update the metadata object - * based on the fetch. - *

- * Implementations of Fetcher must be thread safe. + * Interface for an object that will fetch an InputStream given a fetch string. This will also + * update the metadata object based on the fetch. + * + *

Implementations of Fetcher must be thread safe. */ public interface Fetcher { diff --git a/tika-core/src/main/java/org/apache/tika/pipes/fetcher/FetcherManager.java b/tika-core/src/main/java/org/apache/tika/pipes/fetcher/FetcherManager.java index 40121f9a7e..21fcc419bf 100644 --- a/tika-core/src/main/java/org/apache/tika/pipes/fetcher/FetcherManager.java +++ b/tika-core/src/main/java/org/apache/tika/pipes/fetcher/FetcherManager.java @@ -24,25 +24,24 @@ import java.util.Map; import java.util.Set; import java.util.concurrent.ConcurrentHashMap; - import org.apache.tika.config.ConfigBase; import org.apache.tika.exception.TikaConfigException; import org.apache.tika.exception.TikaException; /** * Utility class to hold multiple fetchers. - *

- * This forbids multiple fetchers supporting the same name. + * + *

This forbids multiple fetchers supporting the same name. */ public class FetcherManager extends ConfigBase { public static FetcherManager load(Path p) throws IOException, TikaConfigException { - try (InputStream is = - Files.newInputStream(p)) { - return FetcherManager.buildComposite("fetchers", FetcherManager.class, - "fetcher", Fetcher.class, is); + try (InputStream is = Files.newInputStream(p)) { + return FetcherManager.buildComposite( + "fetchers", FetcherManager.class, "fetcher", Fetcher.class, is); } } + private final Map fetcherMap = new ConcurrentHashMap<>(); public FetcherManager(List fetchers) throws TikaConfigException { @@ -63,8 +62,10 @@ public Fetcher getFetcher(String fetcherName) throws IOException, TikaException Fetcher fetcher = fetcherMap.get(fetcherName); if (fetcher == null) { throw new IllegalArgumentException( - "Can't find fetcher for fetcherName: " + fetcherName + ". I've loaded: " + - fetcherMap.keySet()); + "Can't find fetcher for fetcherName: " + + fetcherName + + ". I've loaded: " + + fetcherMap.keySet()); } return fetcher; } @@ -74,9 +75,9 @@ public Set getSupported() { } /** - * Convenience method that returns a fetcher if only one fetcher - * is specified in the tika-config file. If 0 or > 1 fetchers - * are specified, this throws an IllegalArgumentException. + * Convenience method that returns a fetcher if only one fetcher is specified in the tika-config + * file. If 0 or > 1 fetchers are specified, this throws an IllegalArgumentException. 
+ * * @return */ public Fetcher getFetcher() { @@ -84,13 +85,13 @@ public Fetcher getFetcher() { throw new IllegalArgumentException("fetchers size must == 1 for the no arg call"); } if (fetcherMap.size() > 1) { - throw new IllegalArgumentException("need to specify 'fetcherName' if > 1 fetchers are" + - " available"); + throw new IllegalArgumentException( + "need to specify 'fetcherName' if > 1 fetchers are" + " available"); } for (Fetcher fetcher : fetcherMap.values()) { return fetcher; } - //this should be unreachable?! + // this should be unreachable?! throw new IllegalArgumentException("fetchers size must == 0"); } } diff --git a/tika-core/src/main/java/org/apache/tika/pipes/fetcher/FetcherStringException.java b/tika-core/src/main/java/org/apache/tika/pipes/fetcher/FetcherStringException.java index a07439a00f..97d3972ff6 100644 --- a/tika-core/src/main/java/org/apache/tika/pipes/fetcher/FetcherStringException.java +++ b/tika-core/src/main/java/org/apache/tika/pipes/fetcher/FetcherStringException.java @@ -18,9 +18,7 @@ import org.apache.tika.exception.TikaException; -/** - * If something goes wrong in parsing the fetcher string - */ +/** If something goes wrong in parsing the fetcher string */ public class FetcherStringException extends TikaException { public FetcherStringException(String msg) { diff --git a/tika-core/src/main/java/org/apache/tika/pipes/fetcher/RangeFetcher.java b/tika-core/src/main/java/org/apache/tika/pipes/fetcher/RangeFetcher.java index 0a3ceae7f6..16798972c9 100644 --- a/tika-core/src/main/java/org/apache/tika/pipes/fetcher/RangeFetcher.java +++ b/tika-core/src/main/java/org/apache/tika/pipes/fetcher/RangeFetcher.java @@ -18,17 +18,13 @@ import java.io.IOException; import java.io.InputStream; - import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; -/** - * This class extracts a range of bytes from a given fetch key. - */ +/** This class extracts a range of bytes from a given fetch key. 
*/ public interface RangeFetcher extends Fetcher { - //At some point, Tika 3.x?, we may want to add optional ranges to the fetchKey? + // At some point, Tika 3.x?, we may want to add optional ranges to the fetchKey? InputStream fetch(String fetchKey, long startOffset, long endOffset, Metadata metadata) throws TikaException, IOException; - } diff --git a/tika-core/src/main/java/org/apache/tika/pipes/fetcher/fs/FileSystemFetcher.java b/tika-core/src/main/java/org/apache/tika/pipes/fetcher/fs/FileSystemFetcher.java index d926e3ca66..8bf2f0be04 100644 --- a/tika-core/src/main/java/org/apache/tika/pipes/fetcher/fs/FileSystemFetcher.java +++ b/tika-core/src/main/java/org/apache/tika/pipes/fetcher/fs/FileSystemFetcher.java @@ -26,10 +26,6 @@ import java.nio.file.attribute.FileTime; import java.util.Date; import java.util.Map; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - import org.apache.tika.config.Field; import org.apache.tika.config.Initializable; import org.apache.tika.config.InitializableProblemHandler; @@ -42,18 +38,22 @@ import org.apache.tika.metadata.Property; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.pipes.fetcher.AbstractFetcher; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; public class FileSystemFetcher extends AbstractFetcher implements Initializable { private static final Logger LOG = LoggerFactory.getLogger(FileSystemFetcher.class); - //Warning! basePath can be null! + // Warning! basePath can be null! 
private Path basePath = null; private boolean extractFileSystemMetadata = false; static boolean isDescendant(Path root, Path descendant) { - return descendant.toAbsolutePath().normalize() + return descendant + .toAbsolutePath() + .normalize() .startsWith(root.toAbsolutePath().normalize()); } @@ -61,9 +61,10 @@ static boolean isDescendant(Path root, Path descendant) { public InputStream fetch(String fetchKey, Metadata metadata) throws IOException, TikaException { if (fetchKey.contains("\u0000")) { - throw new IllegalArgumentException("Path must not contain \u0000. " + - "Please review the life decisions that led you to requesting " + - "a file name with this character in it."); + throw new IllegalArgumentException( + "Path must not contain \u0000. " + + "Please review the life decisions that led you to requesting " + + "a file name with this character in it."); } Path p = null; if (basePath != null) { @@ -91,14 +92,14 @@ public InputStream fetch(String fetchKey, Metadata metadata) throws IOException, } private void updateFileSystemMetadata(Path p, Metadata metadata) throws IOException { - if (! extractFileSystemMetadata) { + if (!extractFileSystemMetadata) { return; } BasicFileAttributes attrs = Files.readAttributes(p, BasicFileAttributes.class); updateFileTime(FileSystem.CREATED, attrs.creationTime(), metadata); updateFileTime(FileSystem.MODIFIED, attrs.lastModifiedTime(), metadata); updateFileTime(FileSystem.ACCESSED, attrs.lastAccessTime(), metadata); - //TODO extract owner or group? + // TODO extract owner or group? 
} private void updateFileTime(Property property, FileTime fileTime, Metadata metadata) { @@ -109,7 +110,6 @@ private void updateFileTime(Property property, FileTime fileTime, Metadata metad } /** - * * @return the basePath or null if no base path was set */ public Path getBasePath() { @@ -117,9 +117,8 @@ public Path getBasePath() { } /** - * Default behavior si that clients will send in relative paths, this - * must be set to allow this fetcher to fetch the - * full path. + * Default behavior si that clients will send in relative paths, this must be set to allow this + * fetcher to fetch the full path. * * @param basePath */ @@ -129,8 +128,8 @@ public void setBasePath(String basePath) { } /** - * Extract file system metadata (created, modified, accessed) when fetching file. - * The default is false. + * Extract file system metadata (created, modified, accessed) when fetching file. The default is + * false. * * @param extractFileSystemMetadata */ @@ -141,29 +140,33 @@ public void setExtractFileSystemMetadata(boolean extractFileSystemMetadata) { @Override public void initialize(Map params) throws TikaConfigException { - //no-op + // no-op } @Override public void checkInitialization(InitializableProblemHandler problemHandler) throws TikaConfigException { if (basePath == null || basePath.toString().trim().length() == 0) { - LOG.warn("'basePath' has not been set. " + - "This means that client code or clients can read from any file that this " + - "process has permissions to read. If you are running tika-server, make " + - "absolutely certain that you've locked down " + - "access to tika-server and file-permissions for the tika-server process."); + LOG.warn( + "'basePath' has not been set. " + + "This means that client code or clients can read from any file that this " + + "process has permissions to read. 
If you are running tika-server, make " + + "absolutely certain that you've locked down " + + "access to tika-server and file-permissions for the tika-server process."); return; } if (basePath.toString().startsWith("http://")) { - throw new TikaConfigException("FileSystemFetcher only works with local file systems. " + - " Please use the tika-fetcher-http module for http calls"); + throw new TikaConfigException( + "FileSystemFetcher only works with local file systems. " + + " Please use the tika-fetcher-http module for http calls"); } else if (basePath.toString().startsWith("ftp://")) { - throw new TikaConfigException("FileSystemFetcher only works with local file systems. " + - " Please consider contributing an ftp fetcher module"); + throw new TikaConfigException( + "FileSystemFetcher only works with local file systems. " + + " Please consider contributing an ftp fetcher module"); } else if (basePath.toString().startsWith("s3://")) { - throw new TikaConfigException("FileSystemFetcher only works with local file systems. " + - " Please use the tika-fetcher-s3 module"); + throw new TikaConfigException( + "FileSystemFetcher only works with local file systems. " + + " Please use the tika-fetcher-s3 module"); } if (basePath.toAbsolutePath().toString().contains("\u0000")) { diff --git a/tika-core/src/main/java/org/apache/tika/pipes/fetcher/url/UrlFetcher.java b/tika-core/src/main/java/org/apache/tika/pipes/fetcher/url/UrlFetcher.java index f415a3560a..d791952531 100644 --- a/tika-core/src/main/java/org/apache/tika/pipes/fetcher/url/UrlFetcher.java +++ b/tika-core/src/main/java/org/apache/tika/pipes/fetcher/url/UrlFetcher.java @@ -20,33 +20,31 @@ import java.io.InputStream; import java.net.URL; import java.util.Locale; - import org.apache.tika.exception.TikaException; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.pipes.fetcher.AbstractFetcher; /** - * Simple fetcher for URLs. 
This simply calls {@link TikaInputStream#get(URL)}. - * This intentionally does not support fetching for files. - * Please use the FileSystemFetcher for that. If you need more advanced control (passwords, - * timeouts, proxies, etc), please use the tika-fetcher-http module. + * Simple fetcher for URLs. This simply calls {@link TikaInputStream#get(URL)}. This intentionally + * does not support fetching for files. Please use the FileSystemFetcher for that. If you need more + * advanced control (passwords, timeouts, proxies, etc), please use the tika-fetcher-http module. */ public class UrlFetcher extends AbstractFetcher { @Override public InputStream fetch(String fetchKey, Metadata metadata) throws IOException, TikaException { if (fetchKey.contains("\u0000")) { - throw new IllegalArgumentException("URL must not contain \u0000. " + - "Please review the life decisions that led you to requesting " + - "a URL with this character in it."); + throw new IllegalArgumentException( + "URL must not contain \u0000. 
" + + "Please review the life decisions that led you to requesting " + + "a URL with this character in it."); } if (fetchKey.toLowerCase(Locale.US).trim().startsWith("file:")) { throw new IllegalArgumentException( - "The UrlFetcher does not fetch from file shares; " + - "please use the FileSystemFetcher"); + "The UrlFetcher does not fetch from file shares; " + + "please use the FileSystemFetcher"); } return TikaInputStream.get(new URL(fetchKey), metadata); } - } diff --git a/tika-core/src/main/java/org/apache/tika/pipes/pipesiterator/CallablePipesIterator.java b/tika-core/src/main/java/org/apache/tika/pipes/pipesiterator/CallablePipesIterator.java index a60784f0c6..b415e6f96d 100644 --- a/tika-core/src/main/java/org/apache/tika/pipes/pipesiterator/CallablePipesIterator.java +++ b/tika-core/src/main/java/org/apache/tika/pipes/pipesiterator/CallablePipesIterator.java @@ -20,12 +20,11 @@ import java.util.concurrent.Callable; import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; - import org.apache.tika.pipes.FetchEmitTuple; /** - * This is a simple wrapper around {@link PipesIterator} - * that allows it to be called in its own thread. + * This is a simple wrapper around {@link PipesIterator} that allows it to be called in its own + * thread. */ public class CallablePipesIterator implements Callable { @@ -37,48 +36,50 @@ public class CallablePipesIterator implements Callable { private final int numConsumers; /** - * This sets timeoutMillis to -1, meaning that - * this will block forever trying to add fetchemittuples to the queue. - * This sets the number of {@link PipesIterator#COMPLETED_SEMAPHORE} to 1. - * This means that your consumers must put the semaphore back in the queue - * after they finish. + * This sets timeoutMillis to -1, meaning that this will block forever trying to add + * fetchemittuples to the queue. This sets the number of {@link + * PipesIterator#COMPLETED_SEMAPHORE} to 1. 
This means that your consumers must put the + * semaphore back in the queue after they finish. * * @param pipesIterator * @param queue */ - public CallablePipesIterator(PipesIterator pipesIterator, - ArrayBlockingQueue queue) { + public CallablePipesIterator( + PipesIterator pipesIterator, ArrayBlockingQueue queue) { this(pipesIterator, queue, -1); } /** - * This sets the number of {@link PipesIterator#COMPLETED_SEMAPHORE} to 1. - * This means that your consumers must put the semaphore back in the queue - * after they finish. + * This sets the number of {@link PipesIterator#COMPLETED_SEMAPHORE} to 1. This means that your + * consumers must put the semaphore back in the queue after they finish. + * * @param pipesIterator underlying pipes iterator to use * @param queue queue to add the fetch emit tuples to - * @param timeoutMillis how long to try to offer the fetch emit tuples to the queue. If -1, - * this will block with {@link ArrayBlockingQueue#put(Object)} forever. - */ - public CallablePipesIterator(PipesIterator pipesIterator, - ArrayBlockingQueue queue, long timeoutMillis) { + * @param timeoutMillis how long to try to offer the fetch emit tuples to the queue. If -1, this + * will block with {@link ArrayBlockingQueue#put(Object)} forever. + */ + public CallablePipesIterator( + PipesIterator pipesIterator, + ArrayBlockingQueue queue, + long timeoutMillis) { this(pipesIterator, queue, timeoutMillis, 1); } /** - * * @param pipesIterator underlying pipes iterator to use * @param queue queue to add the fetch emit tuples to - * @param timeoutMillis how long to try to offer the fetch emit tuples to the queue. If -1, - * this will block with {@link ArrayBlockingQueue#put(Object)} forever. - * @param numConsumers how many {@link PipesIterator#COMPLETED_SEMAPHORE} to add to the - * queue. 
If the consumers are adding this back to the queue when they - * find it, then this should be set to 1, otherwise, for a single semaphore - * for each consumer, set this to the number of consumers + * @param timeoutMillis how long to try to offer the fetch emit tuples to the queue. If -1, this + * will block with {@link ArrayBlockingQueue#put(Object)} forever. + * @param numConsumers how many {@link PipesIterator#COMPLETED_SEMAPHORE} to add to the queue. + * If the consumers are adding this back to the queue when they find it, then this should be + * set to 1, otherwise, for a single semaphore for each consumer, set this to the number of + * consumers */ - public CallablePipesIterator(PipesIterator pipesIterator, - ArrayBlockingQueue queue, long timeoutMillis, - int numConsumers) { + public CallablePipesIterator( + PipesIterator pipesIterator, + ArrayBlockingQueue queue, + long timeoutMillis, + int numConsumers) { this.pipesIterator = pipesIterator; this.queue = queue; this.timeoutMillis = timeoutMillis; @@ -91,21 +92,24 @@ public Long call() throws Exception { if (timeoutMillis > 0) { for (FetchEmitTuple t : pipesIterator) { boolean offered = queue.offer(t, timeoutMillis, TimeUnit.MILLISECONDS); - if (! offered) { + if (!offered) { throw new TimeoutException("timed out trying to offer tuple"); } added++; } for (int i = 0; i < numConsumers; i++) { - boolean offered = queue.offer(PipesIterator.COMPLETED_SEMAPHORE, timeoutMillis, - TimeUnit.MILLISECONDS); + boolean offered = + queue.offer( + PipesIterator.COMPLETED_SEMAPHORE, + timeoutMillis, + TimeUnit.MILLISECONDS); if (!offered) { - throw new TimeoutException("timed out trying to offer the completed " + - "semaphore"); + throw new TimeoutException( + "timed out trying to offer the completed " + "semaphore"); } } } else { - //blocking! + // blocking! 
for (FetchEmitTuple t : pipesIterator) { queue.put(t); added++; diff --git a/tika-core/src/main/java/org/apache/tika/pipes/pipesiterator/PipesIterator.java b/tika-core/src/main/java/org/apache/tika/pipes/pipesiterator/PipesIterator.java index 34706f7e88..a615a45c2b 100644 --- a/tika-core/src/main/java/org/apache/tika/pipes/pipesiterator/PipesIterator.java +++ b/tika-core/src/main/java/org/apache/tika/pipes/pipesiterator/PipesIterator.java @@ -28,10 +28,6 @@ import java.util.concurrent.FutureTask; import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - import org.apache.tika.config.ConfigBase; import org.apache.tika.config.Field; import org.apache.tika.config.Initializable; @@ -42,22 +38,23 @@ import org.apache.tika.pipes.FetchEmitTuple; import org.apache.tika.pipes.HandlerConfig; import org.apache.tika.sax.BasicContentHandlerFactory; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** - * Abstract class that handles the testing for timeouts/thread safety - * issues. Concrete classes implement the blocking {@link #enqueue()}. - * If there's an exception in the enqueuing thread, this will throw - * a RuntimeException. It will throw an IllegalStateException if - * next() is called after hasNext() has returned false. + * Abstract class that handles the testing for timeouts/thread safety issues. Concrete classes + * implement the blocking {@link #enqueue()}. If there's an exception in the enqueuing thread, this + * will throw a RuntimeException. It will throw an IllegalStateException if next() is called after + * hasNext() has returned false. 
*/ public abstract class PipesIterator extends ConfigBase - implements Callable, Iterable, Initializable { + implements Callable, Iterable, Initializable { public static final long DEFAULT_MAX_WAIT_MS = 300_000; public static final int DEFAULT_QUEUE_SIZE = 1000; public static final FetchEmitTuple COMPLETED_SEMAPHORE = - new FetchEmitTuple(null,null, null, null, null, null); + new FetchEmitTuple(null, null, null, null, null, null); private static final Logger LOGGER = LoggerFactory.getLogger(PipesIterator.class); @@ -80,12 +77,9 @@ public abstract class PipesIterator extends ConfigBase private int added = 0; private FutureTask futureTask; - public static PipesIterator build(Path tikaConfigFile) throws IOException, - TikaConfigException { + public static PipesIterator build(Path tikaConfigFile) throws IOException, TikaConfigException { try (InputStream is = Files.newInputStream(tikaConfigFile)) { - return buildSingle( - "pipesIterator", - PipesIterator.class, is); + return buildSingle("pipesIterator", PipesIterator.class, is); } } @@ -138,8 +132,9 @@ public void setOnParseException(FetchEmitTuple.ON_PARSE_EXCEPTION onParseExcepti @Field public void setHandlerType(String handlerType) { - this.handlerType = BasicContentHandlerFactory - .parseHandlerType(handlerType, BasicContentHandlerFactory.HANDLER_TYPE.TEXT); + this.handlerType = + BasicContentHandlerFactory.parseHandlerType( + handlerType, BasicContentHandlerFactory.HANDLER_TYPE.TEXT); } @Field @@ -173,9 +168,9 @@ public Integer call() throws Exception { } protected HandlerConfig getHandlerConfig() { - //TODO: make throwOnWriteLimitReached configurable - return new HandlerConfig(handlerType, parseMode, writeLimit, maxEmbeddedResources, - throwOnWriteLimitReached); + // TODO: make throwOnWriteLimitReached configurable + return new HandlerConfig( + handlerType, parseMode, writeLimit, maxEmbeddedResources, throwOnWriteLimitReached); } protected abstract void enqueue() throws IOException, TimeoutException, 
InterruptedException; @@ -190,13 +185,13 @@ protected void tryToAdd(FetchEmitTuple p) throws InterruptedException, TimeoutEx @Override public void initialize(Map params) throws TikaConfigException { - //no-op + // no-op } @Override public void checkInitialization(InitializableProblemHandler problemHandler) throws TikaConfigException { - //no-op + // no-op } @Override @@ -255,11 +250,10 @@ private FetchEmitTuple pollNext() throws TikaTimeoutException { } /** - * this checks to make sure that the thread hasn't terminated early. - * Will return true if the thread has successfully completed or if - * it has not completed. Will return false if there has been a thread - * interrupt. Will throw a RuntimeException if there's been - * an execution exception in the thread. + * this checks to make sure that the thread hasn't terminated early. Will return true if the + * thread has successfully completed or if it has not completed. Will return false if there + * has been a thread interrupt. Will throw a RuntimeException if there's been an execution + * exception in the thread. */ private void checkThreadOk() throws InterruptedException { if (futureTask.isDone()) { diff --git a/tika-core/src/main/java/org/apache/tika/pipes/pipesiterator/TotalCounter.java b/tika-core/src/main/java/org/apache/tika/pipes/pipesiterator/TotalCounter.java index 8ab7086e96..e230d7696f 100644 --- a/tika-core/src/main/java/org/apache/tika/pipes/pipesiterator/TotalCounter.java +++ b/tika-core/src/main/java/org/apache/tika/pipes/pipesiterator/TotalCounter.java @@ -17,27 +17,26 @@ package org.apache.tika.pipes.pipesiterator; /** - * Interface for pipesiterators that allow counting of total - * documents. This is useful for user-facing frontends where - * the user does not have easy access to the total number of files - * for processing. + * Interface for pipesiterators that allow counting of total documents. 
This is useful for + * user-facing frontends where the user does not have easy access to the total number of files for + * processing. * - * This is run in a daemon thread and is not guaranteed to complete before - * the actual file processing has completed. + *

This is run in a daemon thread and is not guaranteed to complete before the actual file + * processing has completed. * - * This is an ancillary task, and should not throw runtime exceptions. - * - * Implementers should be careful to check for thread interrupts. + *

This is an ancillary task, and should not throw runtime exceptions. * + *

Implementers should be careful to check for thread interrupts. */ public interface TotalCounter { void startTotalCount(); /** - * Returns the total count so far. Check the {@link TotalCountResult#getStatus()} - * to figure out if the count has completed yet, if it is unsupported or if - * there was an exception during the counting. + * Returns the total count so far. Check the {@link TotalCountResult#getStatus()} to figure out + * if the count has completed yet, if it is unsupported or if there was an exception during the + * counting. + * * @return */ TotalCountResult getTotalCount(); diff --git a/tika-core/src/main/java/org/apache/tika/pipes/pipesiterator/filelist/FileListPipesIterator.java b/tika-core/src/main/java/org/apache/tika/pipes/pipesiterator/filelist/FileListPipesIterator.java index 90cabe881f..27ed359309 100644 --- a/tika-core/src/main/java/org/apache/tika/pipes/pipesiterator/filelist/FileListPipesIterator.java +++ b/tika-core/src/main/java/org/apache/tika/pipes/pipesiterator/filelist/FileListPipesIterator.java @@ -23,7 +23,6 @@ import java.nio.file.Path; import java.nio.file.Paths; import java.util.concurrent.TimeoutException; - import org.apache.tika.config.Field; import org.apache.tika.config.Initializable; import org.apache.tika.config.InitializableProblemHandler; @@ -37,45 +36,46 @@ import org.apache.tika.utils.StringUtils; /** - * Reads a list of file names/relative paths from a UTF-8 file. - * One file name/relative path per line. This path is used for the fetch key, - * the id and the emit key. If you need more customized control of the keys/ids, - * consider using the jdbc pipes iterator or the csv pipes iterator. - * - * Skips empty lines and lines starting with '#' - * + * Reads a list of file names/relative paths from a UTF-8 file. One file name/relative path per + * line. This path is used for the fetch key, the id and the emit key. 
If you need more customized + * control of the keys/ids, consider using the jdbc pipes iterator or the csv pipes iterator. * + *

Skips empty lines and lines starting with '#' */ public class FileListPipesIterator extends PipesIterator implements Initializable { - @Field - private String fileList; + @Field private String fileList; - @Field - private boolean hasHeader = false; + @Field private boolean hasHeader = false; private Path fileListPath; @Override protected void enqueue() throws IOException, TimeoutException, InterruptedException { - try (BufferedReader reader = Files.newBufferedReader(fileListPath, StandardCharsets.UTF_8)) { + try (BufferedReader reader = + Files.newBufferedReader(fileListPath, StandardCharsets.UTF_8)) { if (hasHeader) { reader.readLine(); } String line = reader.readLine(); while (line != null) { - if (! line.startsWith("#") && !StringUtils.isBlank(line)) { + if (!line.startsWith("#") && !StringUtils.isBlank(line)) { FetchKey fetchKey = new FetchKey(getFetcherName(), line); EmitKey emitKey = new EmitKey(getEmitterName(), line); - tryToAdd(new FetchEmitTuple(line, fetchKey, emitKey, - new Metadata(), getHandlerConfig(), getOnParseException())); + tryToAdd( + new FetchEmitTuple( + line, + fetchKey, + emitKey, + new Metadata(), + getHandlerConfig(), + getOnParseException())); } line = reader.readLine(); } } } - @Field public void setFileList(String path) { this.fileList = path; @@ -89,15 +89,18 @@ public void setHasHeader(boolean hasHeader) { @Override public void checkInitialization(InitializableProblemHandler problemHandler) throws TikaConfigException { - //these should all be fatal + // these should all be fatal TikaConfig.mustNotBeEmpty("fileList", fileList); TikaConfig.mustNotBeEmpty("fetcherName", getFetcherName()); TikaConfig.mustNotBeEmpty("emitterName", getFetcherName()); fileListPath = Paths.get(fileList); if (!Files.isRegularFile(fileListPath)) { - throw new TikaConfigException("file list " + fileList + " does not exist. " + - "Must specify an existing file"); + throw new TikaConfigException( + "file list " + + fileList + + " does not exist. 
" + + "Must specify an existing file"); } } } diff --git a/tika-core/src/main/java/org/apache/tika/pipes/pipesiterator/fs/FileSystemPipesIterator.java b/tika-core/src/main/java/org/apache/tika/pipes/pipesiterator/fs/FileSystemPipesIterator.java index 9e903fd8b2..509e1b21a5 100644 --- a/tika-core/src/main/java/org/apache/tika/pipes/pipesiterator/fs/FileSystemPipesIterator.java +++ b/tika-core/src/main/java/org/apache/tika/pipes/pipesiterator/fs/FileSystemPipesIterator.java @@ -27,10 +27,6 @@ import java.util.Map; import java.util.concurrent.TimeoutException; import java.util.concurrent.atomic.AtomicLong; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - import org.apache.tika.config.Field; import org.apache.tika.config.Initializable; import org.apache.tika.config.InitializableProblemHandler; @@ -45,20 +41,20 @@ import org.apache.tika.pipes.pipesiterator.PipesIterator; import org.apache.tika.pipes.pipesiterator.TotalCountResult; import org.apache.tika.pipes.pipesiterator.TotalCounter; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; public class FileSystemPipesIterator extends PipesIterator implements TotalCounter, Initializable, Closeable { private static final Logger LOG = LoggerFactory.getLogger(AsyncProcessor.class); - private Path basePath; private boolean countTotal = false; private FileCountWorker fileCountWorker; - public FileSystemPipesIterator() { - } + public FileSystemPipesIterator() {} public FileSystemPipesIterator(Path basePath) { this.basePath = basePath; @@ -87,11 +83,10 @@ protected void enqueue() throws InterruptedException, IOException, TimeoutExcept } } - @Override public void checkInitialization(InitializableProblemHandler problemHandler) throws TikaConfigException { - //these should all be fatal + // these should all be fatal TikaConfig.mustNotBeEmpty("basePath", basePath); TikaConfig.mustNotBeEmpty("fetcherName", getFetcherName()); TikaConfig.mustNotBeEmpty("emitterName", getFetcherName()); @@ -108,9 +103,10 @@ public 
void initialize(Map params) throws TikaConfigException { public void setCountTotal(boolean countTotal) { this.countTotal = countTotal; } + @Override public void startTotalCount() { - if (! countTotal) { + if (!countTotal) { return; } fileCountWorker.startTotalCount(); @@ -118,7 +114,7 @@ public void startTotalCount() { @Override public TotalCountResult getTotalCount() { - if (! countTotal) { + if (!countTotal) { return TotalCountResult.UNSUPPORTED; } return fileCountWorker.getTotalCount(); @@ -152,9 +148,14 @@ public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IO String relPath = basePath.relativize(file).toString(); try { - tryToAdd(new FetchEmitTuple(relPath, new FetchKey(fetcherName, relPath), - new EmitKey(emitterName, relPath), new Metadata(), getHandlerConfig(), - getOnParseException())); + tryToAdd( + new FetchEmitTuple( + relPath, + new FetchKey(fetcherName, relPath), + new EmitKey(emitterName, relPath), + new Metadata(), + getHandlerConfig(), + getOnParseException())); } catch (TimeoutException e) { throw new IOException(e); } catch (InterruptedException e) { @@ -174,7 +175,6 @@ public FileVisitResult postVisitDirectory(Path dir, IOException exc) throws IOEx } } - private static class FileCountWorker implements TotalCounter, Closeable { private Thread totalCounterThread; @@ -191,17 +191,19 @@ public FileCountWorker(Path basePath) { @Override public void startTotalCount() { - totalCounterThread = new Thread(() -> { - try { - Files.walkFileTree(basePath, new FSFileCounter(totalCount)); - status = TotalCountResult.STATUS.COMPLETED; - finalResult = new TotalCountResult(totalCount.get(), status); - } catch (IOException e) { - LOG.warn("problem counting files", e); - status = TotalCountResult.STATUS.EXCEPTION; - finalResult = new TotalCountResult(totalCount.get(), status); - } - }); + totalCounterThread = + new Thread( + () -> { + try { + Files.walkFileTree(basePath, new FSFileCounter(totalCount)); + status = 
TotalCountResult.STATUS.COMPLETED; + finalResult = new TotalCountResult(totalCount.get(), status); + } catch (IOException e) { + LOG.warn("problem counting files", e); + status = TotalCountResult.STATUS.EXCEPTION; + finalResult = new TotalCountResult(totalCount.get(), status); + } + }); totalCounterThread.setDaemon(true); totalCounterThread.start(); } @@ -222,6 +224,7 @@ public void close() throws IOException { private class FSFileCounter implements FileVisitor { private final AtomicLong count; + private FSFileCounter(AtomicLong count) { this.count = count; } @@ -233,7 +236,8 @@ public FileVisitResult preVisitDirectory(Path dir, BasicFileAttributes attrs) } @Override - public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException { + public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) + throws IOException { count.incrementAndGet(); return FileVisitResult.CONTINUE; } @@ -244,7 +248,8 @@ public FileVisitResult visitFileFailed(Path file, IOException exc) throws IOExce } @Override - public FileVisitResult postVisitDirectory(Path dir, IOException exc) throws IOException { + public FileVisitResult postVisitDirectory(Path dir, IOException exc) + throws IOException { return FileVisitResult.CONTINUE; } } diff --git a/tika-core/src/main/java/org/apache/tika/renderer/CompositeRenderer.java b/tika-core/src/main/java/org/apache/tika/renderer/CompositeRenderer.java index a98d39c974..f06d355c37 100644 --- a/tika-core/src/main/java/org/apache/tika/renderer/CompositeRenderer.java +++ b/tika-core/src/main/java/org/apache/tika/renderer/CompositeRenderer.java @@ -13,7 +13,7 @@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
- */package org.apache.tika.renderer; + */ package org.apache.tika.renderer; import java.io.IOException; import java.io.InputStream; @@ -23,7 +23,6 @@ import java.util.Map; import java.util.Set; import java.util.concurrent.ConcurrentHashMap; - import org.apache.tika.config.Initializable; import org.apache.tika.config.InitializableProblemHandler; import org.apache.tika.config.Param; @@ -54,14 +53,16 @@ public CompositeRenderer(List renderers) { } rendererMap = Collections.unmodifiableMap(tmp); } + @Override public Set getSupportedTypes(ParseContext context) { return rendererMap.keySet(); } @Override - public RenderResults render(InputStream is, Metadata metadata, ParseContext parseContext, - RenderRequest... requests) throws IOException, TikaException { + public RenderResults render( + InputStream is, Metadata metadata, ParseContext parseContext, RenderRequest... requests) + throws IOException, TikaException { String mediaTypeString = metadata.get(TikaCoreProperties.TYPE); if (mediaTypeString == null) { @@ -81,20 +82,16 @@ public RenderResults render(InputStream is, Metadata metadata, ParseContext pars public Renderer getLeafRenderer(MediaType mt) { return rendererMap.get(mt); } - @Override - public void initialize(Map params) throws TikaConfigException { - } + @Override + public void initialize(Map params) throws TikaConfigException {} @Override public void checkInitialization(InitializableProblemHandler problemHandler) - throws TikaConfigException { - - } + throws TikaConfigException {} private static List getDefaultRenderers(ServiceLoader loader) { - List staticRenderers = - loader.loadStaticServiceProviders(Renderer.class); + List staticRenderers = loader.loadStaticServiceProviders(Renderer.class); ServiceLoaderUtils.sortLoadedClasses(staticRenderers); return staticRenderers; diff --git a/tika-core/src/main/java/org/apache/tika/renderer/PageBasedRenderResults.java b/tika-core/src/main/java/org/apache/tika/renderer/PageBasedRenderResults.java index 
d80ff7c5c9..a803f0b058 100644 --- a/tika-core/src/main/java/org/apache/tika/renderer/PageBasedRenderResults.java +++ b/tika-core/src/main/java/org/apache/tika/renderer/PageBasedRenderResults.java @@ -20,7 +20,6 @@ import java.util.HashMap; import java.util.List; import java.util.Map; - import org.apache.tika.io.TemporaryResources; import org.apache.tika.metadata.TikaPagedText; @@ -31,6 +30,7 @@ public class PageBasedRenderResults extends RenderResults { public PageBasedRenderResults(TemporaryResources tmp) { super(tmp); } + public void add(RenderResult result) { Integer page = result.getMetadata().getInt(TikaPagedText.PAGE_NUMBER); if (page != null) { diff --git a/tika-core/src/main/java/org/apache/tika/renderer/PageRangeRequest.java b/tika-core/src/main/java/org/apache/tika/renderer/PageRangeRequest.java index 2534d7032f..4cfbf9763d 100644 --- a/tika-core/src/main/java/org/apache/tika/renderer/PageRangeRequest.java +++ b/tika-core/src/main/java/org/apache/tika/renderer/PageRangeRequest.java @@ -18,9 +18,7 @@ import java.util.Objects; -/** - * The range of pages to render. These are 1-based, and "to" is inclusive. - */ +/** The range of pages to render. These are 1-based, and "to" is inclusive. */ public class PageRangeRequest implements RenderRequest { public static PageRangeRequest RENDER_ALL = new PageRangeRequest(1, -1); diff --git a/tika-core/src/main/java/org/apache/tika/renderer/RenderRequest.java b/tika-core/src/main/java/org/apache/tika/renderer/RenderRequest.java index 3277d866af..caaeee9049 100644 --- a/tika-core/src/main/java/org/apache/tika/renderer/RenderRequest.java +++ b/tika-core/src/main/java/org/apache/tika/renderer/RenderRequest.java @@ -17,11 +17,9 @@ package org.apache.tika.renderer; /** - * Empty interface for requests to a renderer. Different - * file formats and different use cases will have different types of requests. 
- * For page based, it could be a page range (render the full pages from 2 to 5); - * or it could be a single page with an x-y bounding box. For video files, - * it could be a temporal offset or a temporal offset with an x-y bounding box. + * Empty interface for requests to a renderer. Different file formats and different use cases will + * have different types of requests. For page based, it could be a page range (render the full pages + * from 2 to 5); or it could be a single page with an x-y bounding box. For video files, it could be + * a temporal offset or a temporal offset with an x-y bounding box. */ -public interface RenderRequest { -} +public interface RenderRequest {} diff --git a/tika-core/src/main/java/org/apache/tika/renderer/RenderResult.java b/tika-core/src/main/java/org/apache/tika/renderer/RenderResult.java index 3fd8d7d2c0..cc2aab6032 100644 --- a/tika-core/src/main/java/org/apache/tika/renderer/RenderResult.java +++ b/tika-core/src/main/java/org/apache/tika/renderer/RenderResult.java @@ -21,7 +21,6 @@ import java.io.InputStream; import java.nio.file.Files; import java.nio.file.Path; - import org.apache.tika.io.TemporaryResources; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; @@ -33,14 +32,15 @@ public enum STATUS { EXCEPTION, TIMEOUT } + private final STATUS status; private final int id; private final Object result; - //TODO: we're relying on metadata to bring in a bunch of info. - //Might be cleaner to add specific parameters for page number, embedded path, etc.? + // TODO: we're relying on metadata to bring in a bunch of info. + // Might be cleaner to add specific parameters for page number, embedded path, etc.? 
private final Metadata metadata; TemporaryResources tmp = new TemporaryResources(); @@ -51,12 +51,13 @@ public RenderResult(STATUS status, int id, Object result, Metadata metadata) { this.result = result; this.metadata = metadata; if (result instanceof Path) { - tmp.addResource(new Closeable() { - @Override - public void close() throws IOException { - Files.delete((Path)result); - } - }); + tmp.addResource( + new Closeable() { + @Override + public void close() throws IOException { + Files.delete((Path) result); + } + }); } else if (result instanceof Closeable) { tmp.addResource((Closeable) result); } @@ -64,7 +65,7 @@ public void close() throws IOException { public InputStream getInputStream() throws IOException { if (result instanceof Path) { - return TikaInputStream.get((Path)result, metadata); + return TikaInputStream.get((Path) result, metadata); } else { TikaInputStream tis = TikaInputStream.get(new byte[0]); tis.setOpenContainer(result); @@ -88,5 +89,4 @@ public int getId() { public void close() throws IOException { tmp.close(); } - } diff --git a/tika-core/src/main/java/org/apache/tika/renderer/RenderResults.java b/tika-core/src/main/java/org/apache/tika/renderer/RenderResults.java index 108c062605..7e1643e4d3 100644 --- a/tika-core/src/main/java/org/apache/tika/renderer/RenderResults.java +++ b/tika-core/src/main/java/org/apache/tika/renderer/RenderResults.java @@ -20,7 +20,6 @@ import java.io.IOException; import java.util.ArrayList; import java.util.List; - import org.apache.tika.io.TemporaryResources; public class RenderResults implements Closeable { @@ -28,9 +27,11 @@ public class RenderResults implements Closeable { private List results = new ArrayList<>(); private final TemporaryResources tmp; + public RenderResults(TemporaryResources tmp) { this.tmp = tmp; } + public void add(RenderResult result) { tmp.addResource(result); results.add(result); diff --git a/tika-core/src/main/java/org/apache/tika/renderer/Renderer.java 
b/tika-core/src/main/java/org/apache/tika/renderer/Renderer.java index bc4261f521..2272ca2a50 100644 --- a/tika-core/src/main/java/org/apache/tika/renderer/Renderer.java +++ b/tika-core/src/main/java/org/apache/tika/renderer/Renderer.java @@ -20,24 +20,20 @@ import java.io.InputStream; import java.io.Serializable; import java.util.Set; - import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.ParseContext; /** - * Interface for a renderer. This should be flexible enough to run on the initial design: PDF pages + * Interface for a renderer. This should be flexible enough to run on the initial design: PDF pages * but also on portions of PDF pages as well as on other document types. - * */ public interface Renderer extends Serializable { - - /** - * Returns the set of media types supported by this renderer when used - * with the given parse context. + * Returns the set of media types supported by this renderer when used with the given parse + * context. * * @param context parse context * @return immutable set of media types @@ -45,9 +41,9 @@ public interface Renderer extends Serializable { */ Set getSupportedTypes(ParseContext context); - RenderResults render(InputStream is, Metadata metadata, ParseContext parseContext, - RenderRequest ... requests) throws IOException, - TikaException; + RenderResults render( + InputStream is, Metadata metadata, ParseContext parseContext, RenderRequest... 
requests) + throws IOException, TikaException; /* At some point, we might need/want to add something like this, where for a given diff --git a/tika-core/src/main/java/org/apache/tika/renderer/RenderingState.java b/tika-core/src/main/java/org/apache/tika/renderer/RenderingState.java index ed82500659..38591a3fe4 100644 --- a/tika-core/src/main/java/org/apache/tika/renderer/RenderingState.java +++ b/tika-core/src/main/java/org/apache/tika/renderer/RenderingState.java @@ -17,10 +17,7 @@ package org.apache.tika.renderer; /** - * This should be to track state for each file (embedded or otherwise). - * This should be reset in the parseContext at the beginning of a parse - * and then replaced at the end of the parse. + * This should be to track state for each file (embedded or otherwise). This should be reset in the + * parseContext at the beginning of a parse and then replaced at the end of the parse. */ -public class RenderingState { - -} +public class RenderingState {} diff --git a/tika-core/src/main/java/org/apache/tika/renderer/RenderingTracker.java b/tika-core/src/main/java/org/apache/tika/renderer/RenderingTracker.java index 2e3143261a..e20c00d784 100644 --- a/tika-core/src/main/java/org/apache/tika/renderer/RenderingTracker.java +++ b/tika-core/src/main/java/org/apache/tika/renderer/RenderingTracker.java @@ -17,12 +17,11 @@ package org.apache.tika.renderer; /** - * Use this in the ParseContext to keep track of unique ids for rendered - * images in embedded docs. This should be used for the full parse of - * a main document and its embedded document. + * Use this in the ParseContext to keep track of unique ids for rendered images in embedded docs. + * This should be used for the full parse of a main document and its embedded document. * - * This is different from RenderingState, which is used to track - * rendering per file/per embedded doc. + *

This is different from RenderingState, which is used to track rendering per file/per embedded + * doc. */ public class RenderingTracker { diff --git a/tika-core/src/main/java/org/apache/tika/sax/AbstractRecursiveParserWrapperHandler.java b/tika-core/src/main/java/org/apache/tika/sax/AbstractRecursiveParserWrapperHandler.java index d423009dc1..8753977464 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/AbstractRecursiveParserWrapperHandler.java +++ b/tika-core/src/main/java/org/apache/tika/sax/AbstractRecursiveParserWrapperHandler.java @@ -19,26 +19,26 @@ import java.io.OutputStream; import java.io.Serializable; import java.nio.charset.Charset; - -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; -import org.xml.sax.helpers.DefaultHandler; - import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.Property; import org.apache.tika.metadata.TikaCoreProperties; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; /** - * This is a special handler to be used only with the - * {@link org.apache.tika.parser.RecursiveParserWrapper}. - * It allows for finer-grained processing of embedded documents than in the legacy handlers. - * Subclasses can choose how to process individual embedded documents. + * This is a special handler to be used only with the {@link + * org.apache.tika.parser.RecursiveParserWrapper}. It allows for finer-grained processing of + * embedded documents than in the legacy handlers. Subclasses can choose how to process individual + * embedded documents. 
*/ public abstract class AbstractRecursiveParserWrapperHandler extends DefaultHandler implements Serializable { - public final static Property EMBEDDED_RESOURCE_LIMIT_REACHED = Property.internalBoolean( - TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX + "embedded_resource_limit_reached"); + public static final Property EMBEDDED_RESOURCE_LIMIT_REACHED = + Property.internalBoolean( + TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX + + "embedded_resource_limit_reached"); private static final int MAX_DEPTH = 100; private final ContentHandlerFactory contentHandlerFactory; private final int maxEmbeddedResources; @@ -49,8 +49,8 @@ public AbstractRecursiveParserWrapperHandler(ContentHandlerFactory contentHandle this(contentHandlerFactory, -1); } - public AbstractRecursiveParserWrapperHandler(ContentHandlerFactory contentHandlerFactory, - int maxEmbeddedResources) { + public AbstractRecursiveParserWrapperHandler( + ContentHandlerFactory contentHandlerFactory, int maxEmbeddedResources) { this.contentHandlerFactory = contentHandlerFactory; this.maxEmbeddedResources = maxEmbeddedResources; } @@ -64,12 +64,12 @@ public ContentHandler getNewContentHandler(OutputStream os, Charset charset) { } /** - * This is called before parsing each embedded document. Override this - * for custom behavior. Make sure to call this in your custom classes - * because this tracks the number of embedded documents. + * This is called before parsing each embedded document. Override this for custom behavior. Make + * sure to call this in your custom classes because this tracks the number of embedded + * documents. 
* * @param contentHandler local handler to be used on this embedded document - * @param metadata embedded document's metadata + * @param metadata embedded document's metadata */ public void startEmbeddedDocument(ContentHandler contentHandler, Metadata metadata) throws SAXException { @@ -82,11 +82,11 @@ public void startEmbeddedDocument(ContentHandler contentHandler, Metadata metada } /** - * This is called after parsing each embedded document. Override this - * for custom behavior. This is currently a no-op. + * This is called after parsing each embedded document. Override this for custom behavior. This + * is currently a no-op. * * @param contentHandler content handler that was used on this embedded document - * @param metadata metadata for this embedded document + * @param metadata metadata for this embedded document * @throws SAXException */ public void endEmbeddedDocument(ContentHandler contentHandler, Metadata metadata) @@ -95,13 +95,12 @@ public void endEmbeddedDocument(ContentHandler contentHandler, Metadata metadata } /** - * This is called after the full parse has completed. Override this - * for custom behavior. Make sure to call this as super.endDocument(...) - * in subclasses because this adds whether or not the embedded resource - * maximum has been hit to the metadata. + * This is called after the full parse has completed. Override this for custom behavior. Make + * sure to call this as super.endDocument(...) in subclasses because this adds + * whether or not the embedded resource maximum has been hit to the metadata. 
* * @param contentHandler content handler that was used on the main document - * @param metadata metadata that was gathered for the main document + * @param metadata metadata that was gathered for the main document * @throws SAXException */ public void endDocument(ContentHandler contentHandler, Metadata metadata) throws SAXException { diff --git a/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java b/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java index 361b7817c7..110e115969 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java +++ b/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java @@ -16,21 +16,16 @@ */ package org.apache.tika.sax; - import java.io.OutputStream; import java.io.OutputStreamWriter; import java.io.UnsupportedEncodingException; import java.nio.charset.Charset; import java.util.Locale; - +import org.apache.tika.parser.ParseContext; import org.xml.sax.ContentHandler; import org.xml.sax.helpers.DefaultHandler; -import org.apache.tika.parser.ParseContext; - -/** - * Basic factory for creating common types of ContentHandlers - */ +/** Basic factory for creating common types of ContentHandlers */ public class BasicContentHandlerFactory implements ContentHandlerFactory, WriteLimiter { private final HANDLER_TYPE type; @@ -42,45 +37,46 @@ public class BasicContentHandlerFactory implements ContentHandlerFactory, WriteL /** * Create a BasicContentHandlerFactory with {@link #throwOnWriteLimitReached} is true - * @param type basic type of handler - * @param writeLimit max number of characters to store; if < 0, - * the handler will store all characters + * + * @param type basic type of handler + * @param writeLimit max number of characters to store; if < 0, the handler will store all + * characters */ public BasicContentHandlerFactory(HANDLER_TYPE type, int writeLimit) { this(type, writeLimit, true, null); } /** - * * @param type basic type of handler 
* @param writeLimit maximum number of characters to store - * @param throwOnWriteLimitReached whether or not to throw a - * {@link org.apache.tika.exception.WriteLimitReachedException} - * when the write limit has been reached - * @param parseContext to store the writelimitreached warning if - * throwOnWriteLimitReached is set to false + * @param throwOnWriteLimitReached whether or not to throw a {@link + * org.apache.tika.exception.WriteLimitReachedException} when the write limit has been + * reached + * @param parseContext to store the writelimitreached warning if throwOnWriteLimitReached is set + * to false */ - public BasicContentHandlerFactory(HANDLER_TYPE type, int writeLimit, - boolean throwOnWriteLimitReached, ParseContext parseContext) { + public BasicContentHandlerFactory( + HANDLER_TYPE type, + int writeLimit, + boolean throwOnWriteLimitReached, + ParseContext parseContext) { this.type = type; this.writeLimit = writeLimit; this.throwOnWriteLimitReached = throwOnWriteLimitReached; this.parseContext = parseContext; if (throwOnWriteLimitReached == false && parseContext == null) { - throw new IllegalArgumentException("parse context must not be null if " + - "throwOnWriteLimitReached is false"); + throw new IllegalArgumentException( + "parse context must not be null if " + "throwOnWriteLimitReached is false"); } - } /** - * Tries to parse string into handler type. Returns default if string is null or - * parse fails. - *

- * Options: xml, html, text, body, ignore (no content) + * Tries to parse string into handler type. Returns default if string is null or parse fails. + * + *

Options: xml, html, text, body, ignore (no content) * * @param handlerTypeName string to parse - * @param defaultType type to return if parse fails + * @param defaultType type to return if parse fails * @return handler type */ public static HANDLER_TYPE parseHandlerType(String handlerTypeName, HANDLER_TYPE defaultType) { @@ -112,8 +108,11 @@ public ContentHandler getNewContentHandler() { if (type == HANDLER_TYPE.BODY) { return new BodyContentHandler( - new WriteOutContentHandler(new ToTextContentHandler(), writeLimit, - throwOnWriteLimitReached, parseContext)); + new WriteOutContentHandler( + new ToTextContentHandler(), + writeLimit, + throwOnWriteLimitReached, + parseContext)); } else if (type == HANDLER_TYPE.IGNORE) { return new DefaultHandler(); } @@ -121,8 +120,8 @@ public ContentHandler getNewContentHandler() { if (writeLimit < 0) { return formatHandler; } - return new WriteOutContentHandler(formatHandler, writeLimit, throwOnWriteLimitReached, - parseContext); + return new WriteOutContentHandler( + formatHandler, writeLimit, throwOnWriteLimitReached, parseContext); } private ContentHandler getFormatHandler() { @@ -176,7 +175,6 @@ public ContentHandler getNewContentHandler(OutputStream os, Charset charset) { return new ToXMLContentHandler(os, charset.name()); default: return new ToTextContentHandler(os, charset.name()); - } } } catch (UnsupportedEncodingException e) { @@ -191,12 +189,13 @@ public HANDLER_TYPE getType() { return type; } - /** - * Common handler types for content. - */ + /** Common handler types for content. 
*/ public enum HANDLER_TYPE { - BODY, IGNORE, //don't store content - TEXT, HTML, XML + BODY, + IGNORE, // don't store content + TEXT, + HTML, + XML } public int getWriteLimit() { diff --git a/tika-core/src/main/java/org/apache/tika/sax/BodyContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/BodyContentHandler.java index dfdecb83c3..de614b8845 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/BodyContentHandler.java +++ b/tika-core/src/main/java/org/apache/tika/sax/BodyContentHandler.java @@ -17,35 +17,28 @@ package org.apache.tika.sax; import java.io.Writer; - -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.sax.xpath.Matcher; import org.apache.tika.sax.xpath.MatchingContentHandler; import org.apache.tika.sax.xpath.XPathParser; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; /** - * Content handler decorator that only passes everything inside - * the XHTML <body/> tag to the underlying handler. Note that - * the <body/> tag itself is not passed on. + * Content handler decorator that only passes everything inside the XHTML <body/> tag to the + * underlying handler. Note that the <body/> tag itself is not passed on. */ public class BodyContentHandler extends ContentHandlerDecorator { - /** - * XHTML XPath parser. - */ + /** XHTML XPath parser. */ private static final XPathParser PARSER = new XPathParser("xhtml", XHTMLContentHandler.XHTML); - /** - * The XPath matcher used to select the XHTML body contents. - */ + /** The XPath matcher used to select the XHTML body contents. */ private static final Matcher MATCHER = PARSER.parse("/xhtml:html/xhtml:body/descendant::node()"); /** - * Creates a content handler that passes all XHTML body events to the - * given underlying content handler. + * Creates a content handler that passes all XHTML body events to the given underlying content + * handler. 
* * @param handler content handler */ @@ -54,8 +47,7 @@ public BodyContentHandler(ContentHandler handler) { } /** - * Creates a content handler that writes XHTML body character events to - * the given writer. + * Creates a content handler that writes XHTML body character events to the given writer. * * @param writer writer */ @@ -64,15 +56,14 @@ public BodyContentHandler(Writer writer) { } /** - * Creates a content handler that writes XHTML body character events to - * an internal string buffer. The contents of the buffer can be retrieved - * using the {@link #toString()} method. - *

- * The internal string buffer is bounded at the given number of characters. - * If this write limit is reached, then a {@link SAXException} is thrown. + * Creates a content handler that writes XHTML body character events to an internal string + * buffer. The contents of the buffer can be retrieved using the {@link #toString()} method. + * + *

The internal string buffer is bounded at the given number of characters. If this write + * limit is reached, then a {@link SAXException} is thrown. * - * @param writeLimit maximum number of characters to include in the string, - * or -1 to disable the write limit + * @param writeLimit maximum number of characters to include in the string, or -1 to disable the + * write limit * @since Apache Tika 0.7 */ public BodyContentHandler(int writeLimit) { @@ -80,15 +71,13 @@ public BodyContentHandler(int writeLimit) { } /** - * Creates a content handler that writes XHTML body character events to - * an internal string buffer. The contents of the buffer can be retrieved - * using the {@link #toString()} method. - *

- * The internal string buffer is bounded at 100k characters. If this write - * limit is reached, then a {@link SAXException} is thrown. + * Creates a content handler that writes XHTML body character events to an internal string + * buffer. The contents of the buffer can be retrieved using the {@link #toString()} method. + * + *

The internal string buffer is bounded at 100k characters. If this write limit is reached, + * then a {@link SAXException} is thrown. */ public BodyContentHandler() { this(new WriteOutContentHandler()); } - } diff --git a/tika-core/src/main/java/org/apache/tika/sax/CleanPhoneText.java b/tika-core/src/main/java/org/apache/tika/sax/CleanPhoneText.java index 6e6ddcde99..6a330737b1 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/CleanPhoneText.java +++ b/tika-core/src/main/java/org/apache/tika/sax/CleanPhoneText.java @@ -22,136 +22,232 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; -/** - * Class to help de-obfuscate phone numbers in text. - */ +/** Class to help de-obfuscate phone numbers in text. */ public class CleanPhoneText { public static final String[][][] cleanSubstitutions = - new String[][][]{{{"&#\\d{1,3};", ""}}, // first simply remove numeric entities - {{"th0usand", "thousand"}, // handle common misspellings - {"th1rteen", "thirteen"}, {"f0urteen", "fourteen"}, - {"e1ghteen", "eighteen"}, {"n1neteen", "nineteen"}, - {"f1fteen", "fifteen"}, {"s1xteen", "sixteen"}, {"th1rty", "thirty"}, - {"e1ghty", "eighty"}, {"n1nety", "ninety"}, {"fourty", "forty"}, - {"f0urty", "forty"}, {"e1ght", "eight"}, {"f0rty", "forty"}, - {"f1fty", "fifty"}, {"s1xty", "sixty"}, {"zer0", "zero"}, - {"f0ur", "four"}, {"f1ve", "five"}, {"n1ne", "nine"}, {"0ne", "one"}, - {"tw0", "two"}, {"s1x", "six"}}, - // mixed compound numeral words - // consider 7teen, etc. 
- {{"twenty[\\W_]{0,3}1", "twenty-one"}, {"twenty[\\W_]{0,3}2", "twenty-two"}, - {"twenty[\\W_]{0,3}3", "twenty-three"}, - {"twenty[\\W_]{0,3}4", "twenty-four"}, - {"twenty[\\W_]{0,3}5", "twenty-five"}, - {"twenty[\\W_]{0,3}6", "twenty-six"}, - {"twenty[\\W_]{0,3}7", "twenty-seven"}, - {"twenty[\\W_]{0,3}8", "twenty-eight"}, - {"twenty[\\W_]{0,3}9", "twenty-nine"}, - {"thirty[\\W_]{0,3}1", "thirty-one"}, - {"thirty[\\W_]{0,3}2", "thirty-two"}, - {"thirty[\\W_]{0,3}3", "thirty-three"}, - {"thirty[\\W_]{0,3}4", "thirty-four"}, - {"thirty[\\W_]{0,3}5", "thirty-five"}, - {"thirty[\\W_]{0,3}6", "thirty-six"}, - {"thirty[\\W_]{0,3}7", "thirty-seven"}, - {"thirty[\\W_]{0,3}8", "thirty-eight"}, - {"thirty[\\W_]{0,3}9", "thirty-nine"}, - {"forty[\\W_]{0,3}1", "forty-one"}, {"forty[\\W_]{0,3}2", "forty-two"}, - {"forty[\\W_]{0,3}3", "forty-three"}, - {"forty[\\W_]{0,3}4", "forty-four"}, - {"forty[\\W_]{0,3}5", "forty-five"}, {"forty[\\W_]{0,3}6", "forty-six"}, - {"forty[\\W_]{0,3}7", "forty-seven"}, - {"forty[\\W_]{0,3}8", "forty-eight"}, - {"forty[\\W_]{0,3}9", "forty-nine"}, {"fifty[\\W_]{0,3}1", "fifty-one"}, - {"fifty[\\W_]{0,3}2", "fifty-two"}, - {"fifty[\\W_]{0,3}3", "fifty-three"}, - {"fifty[\\W_]{0,3}4", "fifty-four"}, - {"fifty[\\W_]{0,3}5", "fifty-five"}, {"fifty[\\W_]{0,3}6", "fifty-six"}, - {"fifty[\\W_]{0,3}7", "fifty-seven"}, - {"fifty[\\W_]{0,3}8", "fifty-eight"}, - {"fifty[\\W_]{0,3}9", "fifty-nine"}, {"sixty[\\W_]{0,3}1", "sixty-one"}, - {"sixty[\\W_]{0,3}2", "sixty-two"}, - {"sixty[\\W_]{0,3}3", "sixty-three"}, - {"sixty[\\W_]{0,3}4", "sixty-four"}, - {"sixty[\\W_]{0,3}5", "sixty-five"}, {"sixty[\\W_]{0,3}6", "sixty-six"}, - {"sixty[\\W_]{0,3}7", "sixty-seven"}, - {"sixty[\\W_]{0,3}8", "sixty-eight"}, - {"sixty[\\W_]{0,3}9", "sixty-nine"}, - {"seventy[\\W_]{0,3}1", "seventy-one"}, - {"seventy[\\W_]{0,3}2", "seventy-two"}, - {"seventy[\\W_]{0,3}3", "seventy-three"}, - {"seventy[\\W_]{0,3}4", "seventy-four"}, - {"seventy[\\W_]{0,3}5", "seventy-five"}, - 
{"seventy[\\W_]{0,3}6", "seventy-six"}, - {"seventy[\\W_]{0,3}7", "seventy-seven"}, - {"seventy[\\W_]{0,3}8", "seventy-eight"}, - {"seventy[\\W_]{0,3}9", "seventy-nine"}, - {"eighty[\\W_]{0,3}1", "eighty-one"}, - {"eighty[\\W_]{0,3}2", "eighty-two"}, - {"eighty[\\W_]{0,3}3", "eighty-three"}, - {"eighty[\\W_]{0,3}4", "eighty-four"}, - {"eighty[\\W_]{0,3}5", "eighty-five"}, - {"eighty[\\W_]{0,3}6", "eighty-six"}, - {"eighty[\\W_]{0,3}7", "eighty-seven"}, - {"eighty[\\W_]{0,3}8", "eighty-eight"}, - {"eighty[\\W_]{0,3}9", "eighty-nine"}, - {"ninety[\\W_]{0,3}1", "ninety-one"}, - {"ninety[\\W_]{0,3}2", "ninety-two"}, - {"ninety[\\W_]{0,3}3", "ninety-three"}, - {"ninety[\\W_]{0,3}4", "ninety-four"}, - {"ninety[\\W_]{0,3}5", "ninety-five"}, - {"ninety[\\W_]{0,3}6", "ninety-six"}, - {"ninety[\\W_]{0,3}7", "ninety-seven"}, - {"ninety[\\W_]{0,3}8", "ninety-eight"}, - {"ninety[\\W_]{0,3}9", "ninety-nine"}}, - // now resolve compound numeral words - {{"twenty-one", "21"}, {"twenty-two", "22"}, {"twenty-three", "23"}, - {"twenty-four", "24"}, {"twenty-five", "25"}, {"twenty-six", "26"}, - {"twenty-seven", "27"}, {"twenty-eight", "28"}, {"twenty-nine", "29"}, - {"thirty-one", "31"}, {"thirty-two", "32"}, {"thirty-three", "33"}, - {"thirty-four", "34"}, {"thirty-five", "35"}, {"thirty-six", "36"}, - {"thirty-seven", "37"}, {"thirty-eight", "38"}, {"thirty-nine", "39"}, - {"forty-one", "41"}, {"forty-two", "42"}, {"forty-three", "43"}, - {"forty-four", "44"}, {"forty-five", "45"}, {"forty-six", "46"}, - {"forty-seven", "47"}, {"forty-eight", "48"}, {"forty-nine", "49"}, - {"fifty-one", "51"}, {"fifty-two", "52"}, {"fifty-three", "53"}, - {"fifty-four", "54"}, {"fifty-five", "55"}, {"fifty-six", "56"}, - {"fifty-seven", "57"}, {"fifty-eight", "58"}, {"fifty-nine", "59"}, - {"sixty-one", "61"}, {"sixty-two", "62"}, {"sixty-three", "63"}, - {"sixty-four", "64"}, {"sixty-five", "65"}, {"sixty-six", "66"}, - {"sixty-seven", "67"}, {"sixty-eight", "68"}, {"sixty-nine", "69"}, - 
{"seventy-one", "71"}, {"seventy-two", "72"}, {"seventy-three", "73"}, - {"seventy-four", "74"}, {"seventy-five", "75"}, {"seventy-six", "76"}, - {"seventy-seven", "77"}, {"seventy-eight", "78"}, - {"seventy-nine", "79"}, {"eighty-one", "81"}, {"eighty-two", "82"}, - {"eighty-three", "83"}, {"eighty-four", "84"}, {"eighty-five", "85"}, - {"eighty-six", "86"}, {"eighty-seven", "87"}, {"eighty-eight", "88"}, - {"eighty-nine", "89"}, {"ninety-one", "91"}, {"ninety-two", "92"}, - {"ninety-three", "93"}, {"ninety-four", "94"}, {"ninety-five", "95"}, - {"ninety-six", "96"}, {"ninety-seven", "97"}, {"ninety-eight", "98"}, - {"ninety-nine", "99"}}, - // larger units function as suffixes now - // assume never have three hundred four, three hundred and four - {{"hundred", "00"}, {"thousand", "000"}}, - // single numeral words now - // some would have been ambiguous - {{"seventeen", "17"}, {"thirteen", "13"}, {"fourteen", "14"}, - {"eighteen", "18"}, {"nineteen", "19"}, {"fifteen", "15"}, - {"sixteen", "16"}, {"seventy", "70"}, {"eleven", "11"}, - {"twelve", "12"}, {"twenty", "20"}, {"thirty", "30"}, {"eighty", "80"}, - {"ninety", "90"}, {"three", "3"}, {"seven", "7"}, {"eight", "8"}, - {"forty", "40"}, {"fifty", "50"}, {"sixty", "60"}, {"zero", "0"}, - {"four", "4"}, {"five", "5"}, {"nine", "9"}, {"one", "1"}, {"two", "2"}, - {"six", "6"}, {"ten", "10"}}, - // now do letter for digit substitutions - {{"oh", "0"}, {"o", "0"}, {"i", "1"}, {"l", "1"}}}; + new String[][][] { + {{"&#\\d{1,3};", ""}}, // first simply remove numeric entities + { + {"th0usand", "thousand"}, // handle common misspellings + {"th1rteen", "thirteen"}, + {"f0urteen", "fourteen"}, + {"e1ghteen", "eighteen"}, + {"n1neteen", "nineteen"}, + {"f1fteen", "fifteen"}, + {"s1xteen", "sixteen"}, + {"th1rty", "thirty"}, + {"e1ghty", "eighty"}, + {"n1nety", "ninety"}, + {"fourty", "forty"}, + {"f0urty", "forty"}, + {"e1ght", "eight"}, + {"f0rty", "forty"}, + {"f1fty", "fifty"}, + {"s1xty", "sixty"}, + {"zer0", 
"zero"}, + {"f0ur", "four"}, + {"f1ve", "five"}, + {"n1ne", "nine"}, + {"0ne", "one"}, + {"tw0", "two"}, + {"s1x", "six"} + }, + // mixed compound numeral words + // consider 7teen, etc. + { + {"twenty[\\W_]{0,3}1", "twenty-one"}, + {"twenty[\\W_]{0,3}2", "twenty-two"}, + {"twenty[\\W_]{0,3}3", "twenty-three"}, + {"twenty[\\W_]{0,3}4", "twenty-four"}, + {"twenty[\\W_]{0,3}5", "twenty-five"}, + {"twenty[\\W_]{0,3}6", "twenty-six"}, + {"twenty[\\W_]{0,3}7", "twenty-seven"}, + {"twenty[\\W_]{0,3}8", "twenty-eight"}, + {"twenty[\\W_]{0,3}9", "twenty-nine"}, + {"thirty[\\W_]{0,3}1", "thirty-one"}, + {"thirty[\\W_]{0,3}2", "thirty-two"}, + {"thirty[\\W_]{0,3}3", "thirty-three"}, + {"thirty[\\W_]{0,3}4", "thirty-four"}, + {"thirty[\\W_]{0,3}5", "thirty-five"}, + {"thirty[\\W_]{0,3}6", "thirty-six"}, + {"thirty[\\W_]{0,3}7", "thirty-seven"}, + {"thirty[\\W_]{0,3}8", "thirty-eight"}, + {"thirty[\\W_]{0,3}9", "thirty-nine"}, + {"forty[\\W_]{0,3}1", "forty-one"}, + {"forty[\\W_]{0,3}2", "forty-two"}, + {"forty[\\W_]{0,3}3", "forty-three"}, + {"forty[\\W_]{0,3}4", "forty-four"}, + {"forty[\\W_]{0,3}5", "forty-five"}, + {"forty[\\W_]{0,3}6", "forty-six"}, + {"forty[\\W_]{0,3}7", "forty-seven"}, + {"forty[\\W_]{0,3}8", "forty-eight"}, + {"forty[\\W_]{0,3}9", "forty-nine"}, + {"fifty[\\W_]{0,3}1", "fifty-one"}, + {"fifty[\\W_]{0,3}2", "fifty-two"}, + {"fifty[\\W_]{0,3}3", "fifty-three"}, + {"fifty[\\W_]{0,3}4", "fifty-four"}, + {"fifty[\\W_]{0,3}5", "fifty-five"}, + {"fifty[\\W_]{0,3}6", "fifty-six"}, + {"fifty[\\W_]{0,3}7", "fifty-seven"}, + {"fifty[\\W_]{0,3}8", "fifty-eight"}, + {"fifty[\\W_]{0,3}9", "fifty-nine"}, + {"sixty[\\W_]{0,3}1", "sixty-one"}, + {"sixty[\\W_]{0,3}2", "sixty-two"}, + {"sixty[\\W_]{0,3}3", "sixty-three"}, + {"sixty[\\W_]{0,3}4", "sixty-four"}, + {"sixty[\\W_]{0,3}5", "sixty-five"}, + {"sixty[\\W_]{0,3}6", "sixty-six"}, + {"sixty[\\W_]{0,3}7", "sixty-seven"}, + {"sixty[\\W_]{0,3}8", "sixty-eight"}, + {"sixty[\\W_]{0,3}9", "sixty-nine"}, + 
{"seventy[\\W_]{0,3}1", "seventy-one"}, + {"seventy[\\W_]{0,3}2", "seventy-two"}, + {"seventy[\\W_]{0,3}3", "seventy-three"}, + {"seventy[\\W_]{0,3}4", "seventy-four"}, + {"seventy[\\W_]{0,3}5", "seventy-five"}, + {"seventy[\\W_]{0,3}6", "seventy-six"}, + {"seventy[\\W_]{0,3}7", "seventy-seven"}, + {"seventy[\\W_]{0,3}8", "seventy-eight"}, + {"seventy[\\W_]{0,3}9", "seventy-nine"}, + {"eighty[\\W_]{0,3}1", "eighty-one"}, + {"eighty[\\W_]{0,3}2", "eighty-two"}, + {"eighty[\\W_]{0,3}3", "eighty-three"}, + {"eighty[\\W_]{0,3}4", "eighty-four"}, + {"eighty[\\W_]{0,3}5", "eighty-five"}, + {"eighty[\\W_]{0,3}6", "eighty-six"}, + {"eighty[\\W_]{0,3}7", "eighty-seven"}, + {"eighty[\\W_]{0,3}8", "eighty-eight"}, + {"eighty[\\W_]{0,3}9", "eighty-nine"}, + {"ninety[\\W_]{0,3}1", "ninety-one"}, + {"ninety[\\W_]{0,3}2", "ninety-two"}, + {"ninety[\\W_]{0,3}3", "ninety-three"}, + {"ninety[\\W_]{0,3}4", "ninety-four"}, + {"ninety[\\W_]{0,3}5", "ninety-five"}, + {"ninety[\\W_]{0,3}6", "ninety-six"}, + {"ninety[\\W_]{0,3}7", "ninety-seven"}, + {"ninety[\\W_]{0,3}8", "ninety-eight"}, + {"ninety[\\W_]{0,3}9", "ninety-nine"} + }, + // now resolve compound numeral words + { + {"twenty-one", "21"}, + {"twenty-two", "22"}, + {"twenty-three", "23"}, + {"twenty-four", "24"}, + {"twenty-five", "25"}, + {"twenty-six", "26"}, + {"twenty-seven", "27"}, + {"twenty-eight", "28"}, + {"twenty-nine", "29"}, + {"thirty-one", "31"}, + {"thirty-two", "32"}, + {"thirty-three", "33"}, + {"thirty-four", "34"}, + {"thirty-five", "35"}, + {"thirty-six", "36"}, + {"thirty-seven", "37"}, + {"thirty-eight", "38"}, + {"thirty-nine", "39"}, + {"forty-one", "41"}, + {"forty-two", "42"}, + {"forty-three", "43"}, + {"forty-four", "44"}, + {"forty-five", "45"}, + {"forty-six", "46"}, + {"forty-seven", "47"}, + {"forty-eight", "48"}, + {"forty-nine", "49"}, + {"fifty-one", "51"}, + {"fifty-two", "52"}, + {"fifty-three", "53"}, + {"fifty-four", "54"}, + {"fifty-five", "55"}, + {"fifty-six", "56"}, + {"fifty-seven", 
"57"}, + {"fifty-eight", "58"}, + {"fifty-nine", "59"}, + {"sixty-one", "61"}, + {"sixty-two", "62"}, + {"sixty-three", "63"}, + {"sixty-four", "64"}, + {"sixty-five", "65"}, + {"sixty-six", "66"}, + {"sixty-seven", "67"}, + {"sixty-eight", "68"}, + {"sixty-nine", "69"}, + {"seventy-one", "71"}, + {"seventy-two", "72"}, + {"seventy-three", "73"}, + {"seventy-four", "74"}, + {"seventy-five", "75"}, + {"seventy-six", "76"}, + {"seventy-seven", "77"}, + {"seventy-eight", "78"}, + {"seventy-nine", "79"}, + {"eighty-one", "81"}, + {"eighty-two", "82"}, + {"eighty-three", "83"}, + {"eighty-four", "84"}, + {"eighty-five", "85"}, + {"eighty-six", "86"}, + {"eighty-seven", "87"}, + {"eighty-eight", "88"}, + {"eighty-nine", "89"}, + {"ninety-one", "91"}, + {"ninety-two", "92"}, + {"ninety-three", "93"}, + {"ninety-four", "94"}, + {"ninety-five", "95"}, + {"ninety-six", "96"}, + {"ninety-seven", "97"}, + {"ninety-eight", "98"}, + {"ninety-nine", "99"} + }, + // larger units function as suffixes now + // assume never have three hundred four, three hundred and four + {{"hundred", "00"}, {"thousand", "000"}}, + // single numeral words now + // some would have been ambiguous + { + {"seventeen", "17"}, + {"thirteen", "13"}, + {"fourteen", "14"}, + {"eighteen", "18"}, + {"nineteen", "19"}, + {"fifteen", "15"}, + {"sixteen", "16"}, + {"seventy", "70"}, + {"eleven", "11"}, + {"twelve", "12"}, + {"twenty", "20"}, + {"thirty", "30"}, + {"eighty", "80"}, + {"ninety", "90"}, + {"three", "3"}, + {"seven", "7"}, + {"eight", "8"}, + {"forty", "40"}, + {"fifty", "50"}, + {"sixty", "60"}, + {"zero", "0"}, + {"four", "4"}, + {"five", "5"}, + {"nine", "9"}, + {"one", "1"}, + {"two", "2"}, + {"six", "6"}, + {"ten", "10"} + }, + // now do letter for digit substitutions + {{"oh", "0"}, {"o", "0"}, {"i", "1"}, {"l", "1"}} + }; // Regex to identify a phone number static final String cleanPhoneRegex = "([2-9]\\d{2}[2-9]\\d{6})"; // Regex which attempts to ignore punctuation and other distractions. 
static final String phoneRegex = - "([{(<]{0,3}[2-9][\\W_]{0,3}\\d[\\W_]{0,3}\\d[\\W_]{0,6}" + - "[2-9][\\W_]{0,3}\\d[\\W_]{0,3}\\d[\\W_]{0,6}\\d" + - "[\\W_]{0,3}\\d[\\W_]{0,3}\\d[\\W_]{0,3}\\d)"; + "([{(<]{0,3}[2-9][\\W_]{0,3}\\d[\\W_]{0,3}\\d[\\W_]{0,6}" + + "[2-9][\\W_]{0,3}\\d[\\W_]{0,3}\\d[\\W_]{0,6}\\d" + + "[\\W_]{0,3}\\d[\\W_]{0,3}\\d[\\W_]{0,3}\\d)"; public static ArrayList extractPhoneNumbers(String text) { text = clean(text); diff --git a/tika-core/src/main/java/org/apache/tika/sax/ContentHandlerDecorator.java b/tika-core/src/main/java/org/apache/tika/sax/ContentHandlerDecorator.java index b7ce5c77a1..49c5110fc0 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/ContentHandlerDecorator.java +++ b/tika-core/src/main/java/org/apache/tika/sax/ContentHandlerDecorator.java @@ -25,16 +25,13 @@ import org.xml.sax.helpers.DefaultHandler; /** - * Decorator base class for the {@link ContentHandler} interface. This class - * simply delegates all SAX events calls to an underlying decorated handler - * instance. Subclasses can provide extra decoration by overriding one or more - * of the SAX event methods. + * Decorator base class for the {@link ContentHandler} interface. This class simply delegates all + * SAX events calls to an underlying decorated handler instance. Subclasses can provide extra + * decoration by overriding one or more of the SAX event methods. */ public class ContentHandlerDecorator extends DefaultHandler { - /** - * Decorated SAX event handler. - */ + /** Decorated SAX event handler. */ private ContentHandler handler; /** @@ -48,18 +45,18 @@ public ContentHandlerDecorator(ContentHandler handler) { } /** - * Creates a decorator that by default forwards incoming SAX events to - * a dummy content handler that simply ignores all the events. Subclasses - * should use the {@link #setContentHandler(ContentHandler)} method to - * switch to a more usable underlying content handler. 
+ * Creates a decorator that by default forwards incoming SAX events to a dummy content handler + * that simply ignores all the events. Subclasses should use the {@link + * #setContentHandler(ContentHandler)} method to switch to a more usable underlying content + * handler. */ protected ContentHandlerDecorator() { this(new DefaultHandler()); } /** - * Sets the underlying content handler. All future SAX events will be - * directed to this handler instead of the one that was previously used. + * Sets the underlying content handler. All future SAX events will be directed to this handler + * instead of the one that was previously used. * * @param handler content handler */ @@ -170,48 +167,46 @@ public String toString() { } /** - * Handle any exceptions thrown by methods in this class. This method - * provides a single place to implement custom exception handling. The - * default behaviour is simply to re-throw the given exception, but - * subclasses can also provide alternative ways of handling the situation. - * - * If the wrapped handler is itself a ContentHandlerDecorator, the call - * is delegated to the wrapped handler's {@link ContentHandlerDecorator#handleException(SAXException)} + * Handle any exceptions thrown by methods in this class. This method provides a single place to + * implement custom exception handling. The default behaviour is simply to re-throw the given + * exception, but subclasses can also provide alternative ways of handling the situation. + * + *

If the wrapped handler is itself a ContentHandlerDecorator, the call is delegated to the + * wrapped handler's {@link ContentHandlerDecorator#handleException(SAXException)} * * @param exception the exception that was thrown * @throws SAXException the exception (if any) thrown to the client */ protected void handleException(SAXException exception) throws SAXException { if (handler instanceof ContentHandlerDecorator) { - ((ContentHandlerDecorator)handler).handleException(exception); + ((ContentHandlerDecorator) handler).handleException(exception); } else { throw exception; } } @Override - public void warning (SAXParseException exception) throws SAXException { + public void warning(SAXParseException exception) throws SAXException { if (handler instanceof ErrorHandler) { - ((ErrorHandler)handler).warning(exception); + ((ErrorHandler) handler).warning(exception); } else { super.warning(exception); } } @Override - public void error (SAXParseException exception) throws SAXException { + public void error(SAXParseException exception) throws SAXException { if (handler instanceof ErrorHandler) { - ((ErrorHandler)handler).error(exception); + ((ErrorHandler) handler).error(exception); } else { super.error(exception); } } @Override - public void fatalError (SAXParseException exception) - throws SAXException { + public void fatalError(SAXParseException exception) throws SAXException { if (handler instanceof ErrorHandler) { - ((ErrorHandler)handler).fatalError(exception); + ((ErrorHandler) handler).fatalError(exception); } else { super.fatalError(exception); } diff --git a/tika-core/src/main/java/org/apache/tika/sax/ContentHandlerDecoratorFactory.java b/tika-core/src/main/java/org/apache/tika/sax/ContentHandlerDecoratorFactory.java index 967e186ebd..df92df0d7d 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/ContentHandlerDecoratorFactory.java +++ b/tika-core/src/main/java/org/apache/tika/sax/ContentHandlerDecoratorFactory.java @@ -17,14 +17,12 @@ package 
org.apache.tika.sax; import java.io.Serializable; - -import org.xml.sax.ContentHandler; - import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; +import org.xml.sax.ContentHandler; public interface ContentHandlerDecoratorFactory extends Serializable { - ContentHandler decorate(ContentHandler contentHandler, Metadata metadata, - ParseContext parseContext); + ContentHandler decorate( + ContentHandler contentHandler, Metadata metadata, ParseContext parseContext); } diff --git a/tika-core/src/main/java/org/apache/tika/sax/ContentHandlerFactory.java b/tika-core/src/main/java/org/apache/tika/sax/ContentHandlerFactory.java index dc2f3384fc..81962b8739 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/ContentHandlerFactory.java +++ b/tika-core/src/main/java/org/apache/tika/sax/ContentHandlerFactory.java @@ -16,19 +16,14 @@ */ package org.apache.tika.sax; - import java.io.OutputStream; import java.io.Serializable; import java.nio.charset.Charset; - import org.xml.sax.ContentHandler; -/** - * Interface to allow easier injection of code for getting a new ContentHandler - */ +/** Interface to allow easier injection of code for getting a new ContentHandler */ public interface ContentHandlerFactory extends Serializable { ContentHandler getNewContentHandler(); ContentHandler getNewContentHandler(OutputStream os, Charset charset); - } diff --git a/tika-core/src/main/java/org/apache/tika/sax/DIFContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/DIFContentHandler.java index b76c4d77be..fb5c645bb8 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/DIFContentHandler.java +++ b/tika-core/src/main/java/org/apache/tika/sax/DIFContentHandler.java @@ -17,19 +17,17 @@ package org.apache.tika.sax; import java.util.Stack; - +import org.apache.tika.metadata.Metadata; import org.xml.sax.Attributes; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; import org.xml.sax.helpers.AttributesImpl; import 
org.xml.sax.helpers.DefaultHandler; -import org.apache.tika.metadata.Metadata; - public class DIFContentHandler extends DefaultHandler { - private static final char[] NEWLINE = new char[]{'\n'}; - private static final char[] TABSPACE = new char[]{'\t'}; + private static final char[] NEWLINE = new char[] {'\n'}; + private static final char[] TABSPACE = new char[] {'\t'}; private static final Attributes EMPTY_ATTRIBUTES = new AttributesImpl(); private final Stack treeStack; @@ -65,10 +63,10 @@ public void characters(char[] ch, int start, int length) throws SAXException { this.delegate.characters(title.toCharArray(), 0, title.length()); this.delegate.endElement("", "h3", "h3"); } - if (this.treeStack.peek().equals("Southernmost_Latitude") || - this.treeStack.peek().equals("Northernmost_Latitude") || - this.treeStack.peek().equals("Westernmost_Longitude") || - this.treeStack.peek().equals("Easternmost_Longitude")) { + if (this.treeStack.peek().equals("Southernmost_Latitude") + || this.treeStack.peek().equals("Northernmost_Latitude") + || this.treeStack.peek().equals("Westernmost_Longitude") + || this.treeStack.peek().equals("Easternmost_Longitude")) { this.delegate.characters(NEWLINE, 0, NEWLINE.length); this.delegate.characters(TABSPACE, 0, TABSPACE.length); this.delegate.characters(TABSPACE, 0, TABSPACE.length); @@ -146,5 +144,4 @@ public void endDocument() throws SAXException { public String toString() { return delegate.toString(); } - } diff --git a/tika-core/src/main/java/org/apache/tika/sax/ElementMappingContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/ElementMappingContentHandler.java index 4f9d30c920..ec28e1b8e2 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/ElementMappingContentHandler.java +++ b/tika-core/src/main/java/org/apache/tika/sax/ElementMappingContentHandler.java @@ -19,26 +19,23 @@ import java.util.Collections; import java.util.Map; import javax.xml.namespace.QName; - import org.xml.sax.Attributes; import 
org.xml.sax.ContentHandler; import org.xml.sax.SAXException; import org.xml.sax.helpers.AttributesImpl; /** - * Content handler decorator that maps element QNames using - * a Map. Not mappable elements are not forwarded. - * Attributes may also be mapped (for each element different using - * a Map for attributes), not mappable attributes are not - * forwarded. The default is to not map any attributes and therefore do - * not forward any of them. + * Content handler decorator that maps element QNames using a Map. Not + * mappable elements are not forwarded. Attributes may also be mapped (for each element different + * using a Map for attributes), not mappable attributes are not forwarded. The default + * is to not map any attributes and therefore do not forward any of them. */ public class ElementMappingContentHandler extends ContentHandlerDecorator { private final Map mappings; - public ElementMappingContentHandler(ContentHandler handler, - Map mappings) { + public ElementMappingContentHandler( + ContentHandler handler, Map mappings) { super(handler); this.mappings = mappings; } @@ -58,7 +55,10 @@ public void startElement(String namespaceURI, String localName, String qName, At TargetElement mapping = mappings.get(new QName(namespaceURI, localName)); if (mapping != null) { QName tag = mapping.getMappedTagName(); - super.startElement(tag.getNamespaceURI(), tag.getLocalPart(), getQNameAsString(tag), + super.startElement( + tag.getNamespaceURI(), + tag.getLocalPart(), + getQNameAsString(tag), mapping.mapAttributes(atts)); } } @@ -78,34 +78,29 @@ public static class TargetElement { private final QName mappedTagName; private final Map attributesMapping; - /** - * Creates an TargetElement, attributes of this element will - * be mapped as specified - */ + /** Creates an TargetElement, attributes of this element will be mapped as specified */ public TargetElement(QName mappedTagName, Map attributesMapping) { this.mappedTagName = mappedTagName; this.attributesMapping = 
attributesMapping; } - /** - * A shortcut that automatically creates the QName object - */ - public TargetElement(String mappedTagURI, String mappedTagLocalName, - Map attributesMapping) { + /** A shortcut that automatically creates the QName object */ + public TargetElement( + String mappedTagURI, + String mappedTagLocalName, + Map attributesMapping) { this(new QName(mappedTagURI, mappedTagLocalName), attributesMapping); } /** - * Creates an TargetElement with no attributes, all attributes - * will be deleted from SAX stream + * Creates an TargetElement with no attributes, all attributes will be deleted from SAX + * stream */ public TargetElement(QName mappedTagName) { this(mappedTagName, Collections.emptyMap()); } - /** - * A shortcut that automatically creates the QName object - */ + /** A shortcut that automatically creates the QName object */ public TargetElement(String mappedTagURI, String mappedTagLocalName) { this(mappedTagURI, mappedTagLocalName, Collections.emptyMap()); } @@ -123,13 +118,15 @@ public Attributes mapAttributes(final Attributes atts) { for (int i = 0; i < atts.getLength(); i++) { QName name = attributesMapping.get(new QName(atts.getURI(i), atts.getLocalName(i))); if (name != null) { - natts.addAttribute(name.getNamespaceURI(), name.getLocalPart(), - getQNameAsString(name), atts.getType(i), atts.getValue(i)); + natts.addAttribute( + name.getNamespaceURI(), + name.getLocalPart(), + getQNameAsString(name), + atts.getType(i), + atts.getValue(i)); } } return natts; } - } - } diff --git a/tika-core/src/main/java/org/apache/tika/sax/EmbeddedContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/EmbeddedContentHandler.java index 38afb0ca73..6bfdb61dfc 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/EmbeddedContentHandler.java +++ b/tika-core/src/main/java/org/apache/tika/sax/EmbeddedContentHandler.java @@ -19,19 +19,16 @@ import org.xml.sax.ContentHandler; /** - * Content handler decorator that prevents the {@link 
#startDocument()} - * and {@link #endDocument()} events from reaching the decorated handler. - * This is useful when you want to direct the results of parsing multiple - * different XML documents into a single target document without worrying - * about the {@link #startDocument()} and {@link #endDocument()} methods - * being called more than once. + * Content handler decorator that prevents the {@link #startDocument()} and {@link #endDocument()} + * events from reaching the decorated handler. This is useful when you want to direct the results of + * parsing multiple different XML documents into a single target document without worrying about the + * {@link #startDocument()} and {@link #endDocument()} methods being called more than once. */ public class EmbeddedContentHandler extends ContentHandlerDecorator { /** - * Created a decorator that prevents the given handler from - * receiving {@link #startDocument()} and {@link #endDocument()} - * events. + * Creates a decorator that prevents the given handler from receiving {@link #startDocument()} + * and {@link #endDocument()} events. * * @param handler the content handler to be decorated */ @@ -39,18 +36,11 @@ public EmbeddedContentHandler(ContentHandler handler) { super(handler); } - /** - * Ignored. - */ @Override - public void startDocument() { - } + public void startDocument() {} - /** - * Ignored. - */ + /** Ignored. 
*/ @Override - public void endDocument() { - } - + public void endDocument() {} } diff --git a/tika-core/src/main/java/org/apache/tika/sax/EndDocumentShieldingContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/EndDocumentShieldingContentHandler.java index 544db0d264..86d42e94a3 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/EndDocumentShieldingContentHandler.java +++ b/tika-core/src/main/java/org/apache/tika/sax/EndDocumentShieldingContentHandler.java @@ -20,10 +20,9 @@ import org.xml.sax.SAXException; /** - * A wrapper around a {@link ContentHandler} which will ignore normal - * SAX calls to {@link #endDocument()}, and only fire them later. - * This is typically used to ensure that we can output the metadata - * before ending the document + * A wrapper around a {@link ContentHandler} which will ignore normal SAX calls to {@link + * #endDocument()}, and only fire them later. This is typically used to ensure that we can output + * the metadata before ending the document */ public class EndDocumentShieldingContentHandler extends ContentHandlerDecorator { private boolean endDocumentCalled; diff --git a/tika-core/src/main/java/org/apache/tika/sax/ExpandedTitleContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/ExpandedTitleContentHandler.java index e1fa733705..41a714d172 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/ExpandedTitleContentHandler.java +++ b/tika-core/src/main/java/org/apache/tika/sax/ExpandedTitleContentHandler.java @@ -17,22 +17,21 @@ package org.apache.tika.sax; import javax.xml.transform.sax.TransformerHandler; - import org.xml.sax.Attributes; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; /** - * Content handler decorator which wraps a {@link TransformerHandler} in order to - * allow the TITLE tag to render as <title></title> - * rather than <title/> which is accomplished - * by calling the {@link TransformerHandler#characters(char[], int, int)} method - * with a length of 1 but a 
zero length char array. - *

- * This workaround is an unfortunate circumstance of the limitations imposed by the - * implementation of the XML serialization code in the JDK brought over from - * the xalan project which no longer allows for the specification of an - * alternate content-handler via xslt templates or other means. + * Content handler decorator which wraps a {@link TransformerHandler} in order to allow the + * TITLE tag to render as <title></title> rather than + * <title/> which is accomplished by calling the {@link + * TransformerHandler#characters(char[], int, int)} method with a length of 1 but a + * zero length char array. + * + *

This workaround is an unfortunate circumstance of the limitations imposed by the + * implementation of the XML serialization code in the JDK brought over from the xalan project which + * no longer allows for the specification of an alternate content-handler via xslt + * templates or other means. * * @see TIKA-725 */ @@ -85,5 +84,4 @@ public void characters(char[] ch, int start, int length) throws SAXException { super.characters(ch, start, length); } } - } diff --git a/tika-core/src/main/java/org/apache/tika/sax/Link.java b/tika-core/src/main/java/org/apache/tika/sax/Link.java index cf3c25d688..150a7919b6 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/Link.java +++ b/tika-core/src/main/java/org/apache/tika/sax/Link.java @@ -119,5 +119,4 @@ public String toString() { } return builder.toString(); } - } diff --git a/tika-core/src/main/java/org/apache/tika/sax/LinkContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/LinkContentHandler.java index 310a183287..89f4262414 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/LinkContentHandler.java +++ b/tika-core/src/main/java/org/apache/tika/sax/LinkContentHandler.java @@ -21,37 +21,27 @@ import java.util.ArrayList; import java.util.LinkedList; import java.util.List; - import org.xml.sax.Attributes; import org.xml.sax.helpers.DefaultHandler; -/** - * Content handler that collects links from an XHTML document. - */ +/** Content handler that collects links from an XHTML document. */ public class LinkContentHandler extends DefaultHandler { /** - * Stack of link builders, one for each level of nested links currently - * being processed. A usual case of a nested link would be a hyperlinked - * image (&a href="..."><img src="..."><>), - * but it's possible (though unlikely) for also other kinds of nesting - * to occur. + * Stack of link builders, one for each level of nested links currently being processed. 
A usual + * case of a nested link would be a hyperlinked image ( + * &a href="..."><img src="..."><>), but it's possible (though unlikely) + * for also other kinds of nesting to occur. */ private final LinkedList builderStack = new LinkedList<>(); - /** - * Collected links - */ + /** Collected links */ private final List links = new ArrayList<>(); - /** - * Whether to collapse whitespace in anchor text - */ + /** Whether to collapse whitespace in anchor text */ private final boolean collapseWhitespaceInAnchor; - /** - * Default constructor - */ + /** Default constructor */ public LinkContentHandler() { this(false); } @@ -76,7 +66,7 @@ public List getLinks() { return links; } - //-------------------------------------------------------< ContentHandler> + // -------------------------------------------------------< ContentHandler> @Override public void startElement(String uri, String local, String name, Attributes attributes) { @@ -133,8 +123,11 @@ public void ignorableWhitespace(char[] ch, int start, int length) { @Override public void endElement(String uri, String local, String name) { if (!builderStack.isEmpty() && XHTML.equals(uri)) { - if ("a".equals(local) || "img".equals(local) || "link".equals(local) || - "script".equals(local) || "iframe".equals(local)) { + if ("a".equals(local) + || "img".equals(local) + || "link".equals(local) + || "script".equals(local) + || "iframe".equals(local)) { // ensure this is the correct builder. not all tags correspond // to a LinkBuilder, e.g. 
for embedded scripts if (builderStack.getFirst().getType().equals(local)) { @@ -144,5 +137,4 @@ public void endElement(String uri, String local, String name) { } } } - } diff --git a/tika-core/src/main/java/org/apache/tika/sax/OfflineContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/OfflineContentHandler.java index 6461e0946a..729d1fd539 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/OfflineContentHandler.java +++ b/tika-core/src/main/java/org/apache/tika/sax/OfflineContentHandler.java @@ -21,9 +21,9 @@ import org.xml.sax.InputSource; /** - * Content handler decorator that always returns an empty stream from the - * {@link #resolveEntity(String, String)} method to prevent potential - * network or other external resources from being accessed by an XML parser. + * Content handler decorator that always returns an empty stream from the {@link + * #resolveEntity(String, String)} method to prevent potential network or other external resources + * from being accessed by an XML parser. * * @see TIKA-185 */ @@ -34,12 +34,10 @@ public OfflineContentHandler(ContentHandler handler) { } /** - * Returns an empty stream. This will make an XML parser silently - * ignore any external entities. + * Returns an empty stream. This will make an XML parser silently ignore any external entities. 
*/ @Override public InputSource resolveEntity(String publicId, String systemId) { return new InputSource(new ClosedInputStream()); } - } diff --git a/tika-core/src/main/java/org/apache/tika/sax/PhoneExtractingContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/PhoneExtractingContentHandler.java index 981174021a..46e87a4ccd 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/PhoneExtractingContentHandler.java +++ b/tika-core/src/main/java/org/apache/tika/sax/PhoneExtractingContentHandler.java @@ -19,34 +19,29 @@ import java.util.Arrays; import java.util.List; - +import org.apache.tika.metadata.Metadata; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; import org.xml.sax.helpers.DefaultHandler; -import org.apache.tika.metadata.Metadata; - /** * Class used to extract phone numbers while parsing. - *

- * Every time a document is parsed in Tika, the content is split into SAX events. - * Those SAX events are handled by a ContentHandler. You can think of these events - * as marking a tag in an HTML file. Once you're finished parsing, you can call - * handler.toString(), for example, to get the text contents of the file. On the other - * hand, any of the metadata of the file will be added to the Metadata object passed - * in during the parse() call. So, the Parser class sends metadata to the Metadata - * object and content to the ContentHandler. - *

- * This class is an example of how to combine a ContentHandler and a Metadata. - * As content is passed to the handler, we first check to see if it matches a - * textual pattern for a phone number. If the extracted content is a phone number, - * we add it to the metadata under the key "phonenumbers". So, if you used this - * ContentHandler when you parsed a document, then called - * metadata.getValues("phonenumbers"), you would get an array of Strings of phone - * numbers found in the document. - *

- * Please see the PhoneExtractingContentHandlerTest for an example of how to use - * this class. + * + *

Every time a document is parsed in Tika, the content is split into SAX events. Those SAX + * events are handled by a ContentHandler. You can think of these events as marking a tag in an HTML + * file. Once you're finished parsing, you can call handler.toString(), for example, to get the text + * contents of the file. On the other hand, any of the metadata of the file will be added to the + * Metadata object passed in during the parse() call. So, the Parser class sends metadata to the + * Metadata object and content to the ContentHandler. + * + *

This class is an example of how to combine a ContentHandler and a Metadata. As content is + * passed to the handler, we first check to see if it matches a textual pattern for a phone number. + * If the extracted content is a phone number, we add it to the metadata under the key + * "phonenumbers". So, if you used this ContentHandler when you parsed a document, then called + * metadata.getValues("phonenumbers"), you would get an array of Strings of phone numbers found in + * the document. + * + *

Please see the PhoneExtractingContentHandlerTest for an example of how to use this class. */ public class PhoneExtractingContentHandler extends ContentHandlerDecorator { private static final String PHONE_NUMBERS = "phonenumbers"; @@ -65,22 +60,20 @@ public PhoneExtractingContentHandler(ContentHandler handler, Metadata metadata) } /** - * Creates a decorator that by default forwards incoming SAX events to - * a dummy content handler that simply ignores all the events. Subclasses - * should use the {@link #setContentHandler(ContentHandler)} method to - * switch to a more usable underlying content handler. - * Also creates a dummy Metadata object to store phone numbers in. + * Creates a decorator that by default forwards incoming SAX events to a dummy content handler + * that simply ignores all the events. Subclasses should use the {@link + * #setContentHandler(ContentHandler)} method to switch to a more usable underlying content + * handler. Also creates a dummy Metadata object to store phone numbers in. */ protected PhoneExtractingContentHandler() { this(new DefaultHandler(), new Metadata()); } /** - * The characters method is called whenever a Parser wants to pass raw... - * characters to the ContentHandler. But, sometimes, phone numbers are split - * accross different calls to characters, depending on the specific Parser - * used. So, we simply add all characters to a StringBuilder and analyze it - * once the document is finished. + * The characters method is called whenever a Parser wants to pass raw... characters to the + * ContentHandler. But, sometimes, phone numbers are split across different calls to + * characters, depending on the specific Parser used. So, we simply add all characters to a + * StringBuilder and analyze it once the document is finished. 
*/ @Override public void characters(char[] ch, int start, int length) throws SAXException { @@ -93,10 +86,9 @@ public void characters(char[] ch, int start, int length) throws SAXException { } } - /** - * This method is called whenever the Parser is done parsing the file. So, - * we check the output for any phone numbers. + * This method is called whenever the Parser is done parsing the file. So, we check the output + * for any phone numbers. */ @Override public void endDocument() throws SAXException { diff --git a/tika-core/src/main/java/org/apache/tika/sax/RecursiveParserWrapperHandler.java b/tika-core/src/main/java/org/apache/tika/sax/RecursiveParserWrapperHandler.java index 7ad6f8b250..e88dd53462 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/RecursiveParserWrapperHandler.java +++ b/tika-core/src/main/java/org/apache/tika/sax/RecursiveParserWrapperHandler.java @@ -18,57 +18,52 @@ import java.util.LinkedList; import java.util.List; - -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; -import org.xml.sax.helpers.DefaultHandler; - import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.metadata.filter.MetadataFilter; import org.apache.tika.metadata.filter.NoOpFilter; import org.apache.tika.utils.ParserUtils; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; /** - * This is the default implementation of {@link AbstractRecursiveParserWrapperHandler}. - * See its documentation for more details. - *

- * This caches the a metadata object for each embedded file and for the container file. - * It places the extracted content in the metadata object, with this key: - * {@link TikaCoreProperties#TIKA_CONTENT} - * If memory is a concern, subclass AbstractRecursiveParserWrapperHandler to handle each - * embedded document. - *

- * NOTE: This handler must only be used with the {@link + * This is the default implementation of {@link AbstractRecursiveParserWrapperHandler}. See its + * documentation for more details. + * + *

This caches a metadata object for each embedded file and for the container file. It places + the extracted content in the metadata object, with this key: {@link + * TikaCoreProperties#TIKA_CONTENT} If memory is a concern, subclass + * AbstractRecursiveParserWrapperHandler to handle each embedded document. + * + *

NOTE: This handler must only be used with the {@link * org.apache.tika.parser.RecursiveParserWrapper} - *

*/ public class RecursiveParserWrapperHandler extends AbstractRecursiveParserWrapperHandler { protected final List metadataList = new LinkedList<>(); private final MetadataFilter metadataFilter; - /** - * Create a handler with no limit on the number of embedded resources - */ + /** Create a handler with no limit on the number of embedded resources */ public RecursiveParserWrapperHandler(ContentHandlerFactory contentHandlerFactory) { this(contentHandlerFactory, -1, NoOpFilter.NOOP_FILTER); } /** - * Create a handler that limits the number of embedded resources that will be - * parsed + * Create a handler that limits the number of embedded resources that will be parsed * * @param maxEmbeddedResources number of embedded resources that will be parsed */ - public RecursiveParserWrapperHandler(ContentHandlerFactory contentHandlerFactory, - int maxEmbeddedResources) { + public RecursiveParserWrapperHandler( + ContentHandlerFactory contentHandlerFactory, int maxEmbeddedResources) { this(contentHandlerFactory, maxEmbeddedResources, NoOpFilter.NOOP_FILTER); } - public RecursiveParserWrapperHandler(ContentHandlerFactory contentHandlerFactory, - int maxEmbeddedResources, MetadataFilter metadataFilter) { + public RecursiveParserWrapperHandler( + ContentHandlerFactory contentHandlerFactory, + int maxEmbeddedResources, + MetadataFilter metadataFilter) { super(contentHandlerFactory, maxEmbeddedResources); this.metadataFilter = metadataFilter; } @@ -77,7 +72,7 @@ public RecursiveParserWrapperHandler(ContentHandlerFactory contentHandlerFactory * This is called before parsing an embedded document * * @param contentHandler - local content handler to use on the embedded document - * @param metadata metadata to use for the embedded document + * @param metadata metadata to use for the embedded document * @throws SAXException */ @Override @@ -90,7 +85,7 @@ public void startEmbeddedDocument(ContentHandler contentHandler, Metadata metada * This is called after parsing an embedded document. 
* * @param contentHandler local contenthandler used on the embedded document - * @param metadata metadata from the embedded document + * @param metadata metadata from the embedded document * @throws SAXException */ @Override @@ -111,7 +106,7 @@ public void endEmbeddedDocument(ContentHandler contentHandler, Metadata metadata /** * @param contentHandler content handler used on the main document - * @param metadata metadata from the main document + * @param metadata metadata from the main document * @throws SAXException */ @Override @@ -131,7 +126,7 @@ public void endDocument(ContentHandler contentHandler, Metadata metadata) throws /** * @return a list of Metadata objects, one for the main document and one for each embedded - * document + * document */ public List getMetadataList() { return metadataList; @@ -140,14 +135,15 @@ public List getMetadataList() { void addContent(ContentHandler handler, Metadata metadata) { if (handler.getClass().equals(DefaultHandler.class)) { - //no-op: we can't rely on just testing for - //empty content because DefaultHandler's toString() - //returns e.g. "org.xml.sax.helpers.DefaultHandler@6c8b1edd" + // no-op: we can't rely on just testing for + // empty content because DefaultHandler's toString() + // returns e.g. 
"org.xml.sax.helpers.DefaultHandler@6c8b1edd" } else { String content = handler.toString(); if (content != null && content.trim().length() > 0) { metadata.add(TikaCoreProperties.TIKA_CONTENT, content); - metadata.add(TikaCoreProperties.TIKA_CONTENT_HANDLER, + metadata.add( + TikaCoreProperties.TIKA_CONTENT_HANDLER, handler.getClass().getSimpleName()); } } diff --git a/tika-core/src/main/java/org/apache/tika/sax/RichTextContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/RichTextContentHandler.java index c250fa28f1..dfcd6ace51 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/RichTextContentHandler.java +++ b/tika-core/src/main/java/org/apache/tika/sax/RichTextContentHandler.java @@ -18,20 +18,17 @@ package org.apache.tika.sax; import java.io.Writer; - import org.xml.sax.Attributes; import org.xml.sax.SAXException; /** - * Content handler for Rich Text, it will extract XHTML <img/> - * tag <alt/> attribute and XHTML <a/> tag <name/> - * attribute into the output. + * Content handler for Rich Text, it will extract XHTML <img/> tag <alt/> attribute and + * XHTML <a/> tag <name/> attribute into the output. */ public class RichTextContentHandler extends WriteOutContentHandler { /** - * Creates a content handler that writes XHTML body character events to - * the given writer. + * Creates a content handler that writes XHTML body character events to the given writer. 
* * @param writer writer */ diff --git a/tika-core/src/main/java/org/apache/tika/sax/SafeContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/SafeContentHandler.java index b04c327683..e03bce17a0 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/SafeContentHandler.java +++ b/tika-core/src/main/java/org/apache/tika/sax/SafeContentHandler.java @@ -27,38 +27,36 @@ import org.xml.sax.helpers.AttributesImpl; /** - * Content handler decorator that makes sure that the character events - * ({@link #characters(char[], int, int)} or - * {@link #ignorableWhitespace(char[], int, int)}) passed to the decorated - * content handler contain only valid XML characters. All invalid characters - * are replaced with the Unicode replacement character U+FFFD (though a - * subclass may change this by overriding the {@link #writeReplacement(Output)} method). - *

- * The XML standard defines the following Unicode character ranges as - * valid XML characters: + * Content handler decorator that makes sure that the character events ({@link #characters(char[], + * int, int)} or {@link #ignorableWhitespace(char[], int, int)}) passed to the decorated content + * handler contain only valid XML characters. All invalid characters are replaced with the Unicode + * replacement character U+FFFD (though a subclass may change this by overriding the {@link + * #writeReplacement(Output)} method). + * + *

The XML standard defines the following Unicode character ranges as valid XML characters: + * *

  * #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
  * 
- *

- * Note that currently this class only detects those invalid characters whose - * UTF-16 representation fits a single char. Also, this class does not ensure - * that the UTF-16 encoding of incoming characters is correct. + * + *

Note that currently this class only detects those invalid characters whose UTF-16 + * representation fits a single char. Also, this class does not ensure that the UTF-16 encoding of + * incoming characters is correct. */ public class SafeContentHandler extends ContentHandlerDecorator { + /** Replacement for invalid characters. */ + private static final char[] REPLACEMENT = new char[] {'\ufffd'}; + /** - * Replacement for invalid characters. - */ - private static final char[] REPLACEMENT = new char[]{'\ufffd'}; - /** - * Output through the {@link ContentHandler#characters(char[], int, int)} - * method of the decorated content handler. + * Output through the {@link ContentHandler#characters(char[], int, int)} method of the + * decorated content handler. */ private final Output charactersOutput = SafeContentHandler.super::characters; + /** - * Output through the - * {@link ContentHandler#ignorableWhitespace(char[], int, int)} - * method of the decorated content handler. + * Output through the {@link ContentHandler#ignorableWhitespace(char[], int, int)} method of the + * decorated content handler. */ private final Output ignorableWhitespaceOutput = SafeContentHandler.super::ignorableWhitespace; @@ -67,13 +65,12 @@ public SafeContentHandler(ContentHandler handler) { } /** - * Filters and outputs the contents of the given input buffer. Any - * invalid characters in the input buffer area handled by sending a - * replacement (a space character) to the given output. Any sequences - * of valid characters are passed as-is to the given output. + * Filters and outputs the contents of the given input buffer. Any invalid characters in the + * input buffer area handled by sending a replacement (a space character) to the given output. + * Any sequences of valid characters are passed as-is to the given output. 
* - * @param ch input buffer - * @param start start offset within the buffer + * @param ch input buffer + * @param start start offset within the buffer * @param length number of characters to read from the buffer * @param output output channel * @throws SAXException if the filtered characters could not be written out @@ -110,8 +107,8 @@ private void filter(char[] ch, int start, int length, Output output) throws SAXE * Checks if the given string contains any invalid XML characters. * * @param value string to be checked - * @return true if the string contains invalid XML characters, - * false otherwise + * @return true if the string contains invalid XML characters, false + * otherwise */ private boolean isInvalid(String value) { char[] ch = value.toCharArray(); @@ -129,17 +126,17 @@ private boolean isInvalid(String value) { } /** - * Checks whether the given Unicode character is an invalid XML character - * and should be replaced for output. Subclasses can override this method - * to use an alternative definition of which characters should be replaced - * in the XML output. The default definition from the XML specification is: + * Checks whether the given Unicode character is an invalid XML character and should be replaced + * for output. Subclasses can override this method to use an alternative definition of which + * characters should be replaced in the XML output. The default definition from the XML + * specification is: + * *

      * Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
      * 
* * @param ch character - * @return true if the character should be replaced, - * false otherwise + * @return true if the character should be replaced, false otherwise */ protected boolean isInvalid(int ch) { if (ch < 0x20) { @@ -154,8 +151,8 @@ protected boolean isInvalid(int ch) { } /** - * Outputs the replacement for an invalid character. Subclasses can - * override this method to use a custom replacement. + * Outputs the replacement for an invalid character. Subclasses can override this method to use + * a custom replacement. * * @param output where the replacement is written to * @throws SAXException if the replacement could not be written @@ -169,7 +166,7 @@ public void startElement(String uri, String localName, String name, Attributes a throws SAXException { // TODO: enable this, but some parsers currently // trip it - //assert verifyStartElement(name); + // assert verifyStartElement(name); // Look for any invalid characters in attribute values. for (int i = 0; i < atts.getLength(); i++) { if (isInvalid(atts.getValue(i))) { @@ -183,8 +180,12 @@ public void startElement(String uri, String localName, String name, Attributes a filter(value.toCharArray(), 0, value.length(), buffer); value = buffer.toString(); } - filtered.addAttribute(atts.getURI(j), atts.getLocalName(j), atts.getQName(j), - atts.getType(j), value); + filtered.addAttribute( + atts.getURI(j), + atts.getLocalName(j), + atts.getQName(j), + atts.getType(j), + value); } atts = filtered; break; @@ -197,11 +198,10 @@ public void startElement(String uri, String localName, String name, Attributes a public void endElement(String uri, String localName, String name) throws SAXException { // TODO: enable this, but some parsers currently // trip it - //assert verifyEndElement(name); + // assert verifyEndElement(name); super.endElement(uri, localName, name); } - /* private final List elements = new ArrayList(); @@ -235,13 +235,13 @@ private boolean verifyEndDocument() { } */ - 
//------------------------------------------------------< ContentHandler > + // ------------------------------------------------------< ContentHandler > @Override public void endDocument() throws SAXException { // TODO: enable this, but some parsers currently // trip it - //assert verifyEndDocument(); + // assert verifyEndDocument(); super.endDocument(); } @@ -256,8 +256,8 @@ public void ignorableWhitespace(char[] ch, int start, int length) throws SAXExce } /** - * Internal interface that allows both character and - * ignorable whitespace content to be filtered the same way. + * Internal interface that allows both character and ignorable whitespace content to be filtered + * the same way. */ protected interface Output { void write(char[] ch, int start, int length) throws SAXException; @@ -274,7 +274,5 @@ public void write(char[] ch, int start, int length) { public String toString() { return builder.toString(); } - } - } diff --git a/tika-core/src/main/java/org/apache/tika/sax/SecureContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/SecureContentHandler.java index 3f9f3c42b6..3be1632683 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/SecureContentHandler.java +++ b/tika-core/src/main/java/org/apache/tika/sax/SecureContentHandler.java @@ -18,72 +18,56 @@ import java.io.IOException; import java.util.LinkedList; - +import org.apache.tika.exception.TikaException; +import org.apache.tika.io.TikaInputStream; import org.xml.sax.Attributes; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; -import org.apache.tika.exception.TikaException; -import org.apache.tika.io.TikaInputStream; - /** - * Content handler decorator that attempts to prevent denial of service - * attacks against Tika parsers. - *

- * Currently this class simply compares the number of output characters - * to to the number of input bytes and keeps track of the XML nesting levels. - * An exception gets thrown if the output seems excessive compared to the - * input document. This is a strong indication of a zip bomb. + * Content handler decorator that attempts to prevent denial of service attacks against Tika + * parsers. + * + *

Currently this class simply compares the number of output characters to to the number of input + * bytes and keeps track of the XML nesting levels. An exception gets thrown if the output seems + * excessive compared to the input document. This is a strong indication of a zip bomb. * * @see TIKA-216 * @since Apache Tika 0.4 */ public class SecureContentHandler extends ContentHandlerDecorator { - /** - * The input stream that Tika is parsing. - */ + /** The input stream that Tika is parsing. */ private final TikaInputStream stream; - /** - * Current number of nested <div class="package-entr"> elements. - */ + + /** Current number of nested <div class="package-entr"> elements. */ private final LinkedList packageEntryDepths = new LinkedList<>(); - /** - * Number of output characters that Tika has produced so far. - */ + + /** Number of output characters that Tika has produced so far. */ private long characterCount = 0; - /** - * The current XML element depth. - */ + + /** The current XML element depth. */ private int currentDepth = 0; - /** - * Output threshold. - */ + + /** Output threshold. */ private long threshold = 1000000; - /** - * Maximum compression ratio. - */ + /** Maximum compression ratio. */ private long ratio = 100; - /** - * Maximum XML element nesting level. - */ + /** Maximum XML element nesting level. */ private int maxDepth = 100; - /** - * Maximum package entry nesting level. - */ + /** Maximum package entry nesting level. */ private int maxPackageEntryDepth = 10; /** - * Decorates the given content handler with zip bomb prevention based - * on the count of bytes read from the given counting input stream. - * The resulting decorator can be passed to a Tika parser along with - * the given counting input stream. + * Decorates the given content handler with zip bomb prevention based on the count of bytes read + * from the given counting input stream. The resulting decorator can be passed to a Tika parser + * along with the given counting input stream. 
* * @param handler the content handler to be decorated - * @param stream the input stream to be parsed + * @param stream the input stream to be parsed */ public SecureContentHandler(ContentHandler handler, TikaInputStream stream) { super(handler); @@ -99,12 +83,10 @@ public long getOutputThreshold() { return threshold; } - /** - * Sets the threshold for output characters before the zip bomb prevention - * is activated. This avoids false positives in cases where an otherwise - * normal document for some reason starts with a highly compressible - * sequence of bytes. + * Sets the threshold for output characters before the zip bomb prevention is activated. This + * avoids false positives in cases where an otherwise normal document for some reason starts + * with a highly compressible sequence of bytes. * * @param threshold new output threshold */ @@ -112,7 +94,6 @@ public void setOutputThreshold(long threshold) { this.threshold = threshold; } - /** * Returns the maximum compression ratio. * @@ -122,11 +103,9 @@ public long getMaximumCompressionRatio() { return ratio; } - /** - * Sets the ratio between output characters and input bytes. If this - * ratio is exceeded (after the output threshold has been reached) then - * an exception gets thrown. + * Sets the ratio between output characters and input bytes. If this ratio is exceeded (after + * the output threshold has been reached) then an exception gets thrown. * * @param ratio new maximum compression ratio */ @@ -144,8 +123,8 @@ public int getMaximumDepth() { } /** - * Sets the maximum XML element nesting level. If this depth level is - * exceeded then an exception gets thrown. + * Sets the maximum XML element nesting level. If this depth level is exceeded then an exception + * gets thrown. * * @param depth maximum XML element nesting level */ @@ -163,8 +142,8 @@ public int getMaximumPackageEntryDepth() { } /** - * Sets the maximum package entry nesting level. 
If this depth level is - * exceeded then an exception gets thrown. + * Sets the maximum package entry nesting level. If this depth level is exceeded then an + * exception gets thrown. * * @param depth maximum package entry nesting level */ @@ -173,9 +152,8 @@ public void setMaximumPackageEntryDepth(int depth) { } /** - * Converts the given {@link SAXException} to a corresponding - * {@link TikaException} if it's caused by this instance detecting - * a zip bomb. + * Converts the given {@link SAXException} to a corresponding {@link TikaException} if it's + * caused by this instance detecting a zip bomb. * * @param e SAX exception * @throws TikaException zip bomb exception @@ -199,9 +177,9 @@ private long getByteCount() throws SAXException { } /** - * Records the given number of output characters (or more accurately - * UTF-16 code units). Throws an exception if the recorded number of - * characters highly exceeds the number of input bytes read. + * Records the given number of output characters (or more accurately UTF-16 code units). Throws + * an exception if the recorded number of characters highly exceeds the number of input bytes + * read. 
* * @param length number of new output characters produced * @throws SAXException if a zip bomb is detected @@ -211,8 +189,11 @@ protected void advance(int length) throws SAXException { long byteCount = getByteCount(); if (characterCount > threshold && characterCount > byteCount * ratio) { throw new SecureSAXException( - "Suspected zip bomb: " + byteCount + " input bytes produced " + characterCount + - " output characters"); + "Suspected zip bomb: " + + byteCount + + " input bytes produced " + + characterCount + + " output characters"); } } @@ -228,8 +209,10 @@ public void startElement(String uri, String localName, String name, Attributes a if ("div".equals(name) && "package-entry".equals(atts.getValue("class"))) { packageEntryDepths.addLast(currentDepth); if (packageEntryDepths.size() >= maxPackageEntryDepth) { - throw new SecureSAXException("Suspected zip bomb: " + packageEntryDepths.size() + - " levels of package entry nesting"); + throw new SecureSAXException( + "Suspected zip bomb: " + + packageEntryDepths.size() + + " levels of package entry nesting"); } } @@ -266,9 +249,7 @@ public void ignorableWhitespace(char[] ch, int start, int length) throws SAXExce */ private class SecureSAXException extends SAXException { - /** - * Serial version UID. - */ + /** Serial version UID. 
*/ private static final long serialVersionUID = 2285245380321771445L; public SecureSAXException(String message) throws SAXException { @@ -278,7 +259,5 @@ public SecureSAXException(String message) throws SAXException { public boolean isCausedBy(SecureContentHandler handler) { return SecureContentHandler.this == handler; } - } - } diff --git a/tika-core/src/main/java/org/apache/tika/sax/StandardOrganizations.java b/tika-core/src/main/java/org/apache/tika/sax/StandardOrganizations.java index c8e89a06ab..01f6fa442f 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/StandardOrganizations.java +++ b/tika-core/src/main/java/org/apache/tika/sax/StandardOrganizations.java @@ -21,12 +21,12 @@ import java.util.TreeMap; /** - * This class provides a collection of the most important technical standard organizations. - * The collection of standard organizations has been obtained from - * Wikipedia. - * Currently, the list is composed of the most important international standard organizations, - * the regional standard organizations (i.e., Africa, Americas, Asia Pacific, Europe, and Middle - * East), and British and American standard organizations among the national-based ones. + * This class provides a collection of the most important technical standard organizations. The + * collection of standard organizations has been obtained from Wikipedia. + * Currently, the list is composed of the most important international standard organizations, the + * regional standard organizations (i.e., Africa, Americas, Asia Pacific, Europe, and Middle East), + * and British and American standard organizations among the national-based ones. 
*/ public class StandardOrganizations { @@ -34,28 +34,31 @@ public class StandardOrganizations { static { organizations = new TreeMap<>(); - //International standard organizations + // International standard organizations organizations.put("3GPP", "3rd Generation Partnership Project"); organizations.put("3GPP2", "3rd Generation Partnership Project 2"); organizations.put("Accellera", "Accellera Organization"); - organizations.put("A4L", - "Access for Learning Community (formerly known as the Schools Interoperability " + - "Framework)"); + organizations.put( + "A4L", + "Access for Learning Community (formerly known as the Schools Interoperability " + + "Framework)"); organizations.put("AES", "Audio Engineering Society"); organizations.put("AIIM", "Association for Information and Image Management"); - organizations.put("ASAM", - "Association for Automation and Measuring Systems - Automotive technology"); - organizations.put("ASHRAE", - "American Society of Heating, Refrigerating and Air-Conditioning Engineers " + - "(ASHRAE is an international organization, despite its name)"); + organizations.put( + "ASAM", "Association for Automation and Measuring Systems - Automotive technology"); + organizations.put( + "ASHRAE", + "American Society of Heating, Refrigerating and Air-Conditioning Engineers " + + "(ASHRAE is an international organization, despite its name)"); organizations.put("ASME", "formerly The American Society of Mechanical Engineers"); - organizations - .put("ASTM", "ASTM (American Society for Testing and Materials) International"); + organizations.put( + "ASTM", "ASTM (American Society for Testing and Materials) International"); organizations.put("ATIS", "Alliance for Telecommunications Industry Solutions"); organizations.put("AUTOSAR", "Automotive technology"); - organizations.put("BIPM, CGPM, and CIPM", - "Bureau International des Poids et Mesures and the related organizations " + - "established under the Metre Convention of 1875."); + organizations.put( + 
"BIPM, CGPM, and CIPM", + "Bureau International des Poids et Mesures and the related organizations " + + "established under the Metre Convention of 1875."); organizations.put("CableLabs", "Cable Television Laboratories"); organizations.put("CCSDS", "Consultative Committee for Space Data Sciences"); organizations.put("CISPR", "International Special Committee on Radio Interference"); @@ -66,9 +69,10 @@ public class StandardOrganizations { organizations.put("EKOenergy", "EKOenergy Network managed by environmental NGOs"); organizations.put("FAI", "Fédération Aéronautique Internationale"); organizations.put("GlobalPlatform", "Secure element and TEE standards"); - organizations.put("GS1", - "Global supply chain standards (identification numbers, barcodes, electronic " + - "commerce transactions, RFID)"); + organizations.put( + "GS1", + "Global supply chain standards (identification numbers, barcodes, electronic " + + "commerce transactions, RFID)"); organizations.put("HGI", "Home Gateway Initiative"); organizations.put("HFSB", "Hedge Fund Standards Board"); organizations.put("IATA", "International Air Transport Association"); @@ -86,28 +90,32 @@ public class StandardOrganizations { organizations.put("IPTC", "International Press Telecommunications Council"); organizations.put("ITU", "The International Telecommunication Union"); organizations.put("ITU-R", "ITU Radiocommunications Sector (formerly known as CCIR)"); - organizations.put("CCIR", + organizations.put( + "CCIR", "Comité Consultatif International pour la Radio, a forerunner of the ITU-R"); organizations.put("ITU-T", "ITU Telecommunications Sector (formerly known as CCITT)"); - organizations.put("CCITT", - "Comité Consultatif International Téléphonique et Télégraphique, renamed ITU-T in" + - " 1993"); + organizations.put( + "CCITT", + "Comité Consultatif International Téléphonique et Télégraphique, renamed ITU-T in" + + " 1993"); organizations.put("ITU-D", "ITU Telecom Development (formerly known as BDT)"); 
organizations.put("BDT", "Bureau de développement des télécommunications, renamed ITU-D"); organizations.put("IUPAC", "International Union of Pure and Applied Chemistry"); organizations.put("Liberty Alliance", "Liberty Alliance"); organizations.put("Media Grid", "Media Grid Standards Organization"); - organizations.put("NACE International", + organizations.put( + "NACE International", "Formerly known as National Association of Corrosion Engineers"); - organizations.put("OASIS", - "Organization for the Advancement of Structured Information Standards"); + organizations.put( + "OASIS", "Organization for the Advancement of Structured Information Standards"); organizations.put("OGC", "Open Geospatial Consortium"); organizations.put("OHICC", "Organization of Hotel Industry Classification & Certification"); organizations.put("OMA", "Open Mobile Alliance"); organizations.put("OMG", "Object Management Group"); - organizations.put("OGF", - "Open Grid Forum (merger of Global Grid Forum (GGF) and Enterprise Grid Alliance " + - "(EGA))"); + organizations.put( + "OGF", + "Open Grid Forum (merger of Global Grid Forum (GGF) and Enterprise Grid Alliance " + + "(EGA))"); organizations.put("GGF", "Global Grid Forum"); organizations.put("EGA", "Enterprise Grid Alliance"); organizations.put("OpenTravel Alliance", "OpenTravel Alliance (previously known as OTA)"); @@ -131,37 +139,38 @@ public class StandardOrganizations { organizations.put("WHO", "World Health Organization"); organizations.put("XSF", "The XMPP Standards Foundation"); organizations.put("FAO", "Food and Agriculture Organization"); - //Regional standards organizations - //Africa + // Regional standards organizations + // Africa organizations.put("ARSO", "African Regional Organization for Standarization"); - organizations.put("SADCSTAN", + organizations.put( + "SADCSTAN", "Southern African Development Community (SADC) Cooperation in Standarization"); - //Americas + // Americas organizations.put("COPANT", "Pan American 
Standards Commission"); organizations.put("AMN", "MERCOSUR Standardization Association"); organizations.put("CROSQ", "CARICOM Regional Organization for Standards and Quality"); organizations.put("AAQG", "America's Aerospace Quality Group"); - //Asia Pacific + // Asia Pacific organizations.put("PASC", "Pacific Area Standards Congress"); organizations.put("ACCSQ", "ASEAN Consultative Committee for Standards and Quality"); - //Europe + // Europe organizations.put("RoyalCert", "RoyalCert International Registrars"); organizations.put("CEN", "European Committee for Standardization"); organizations.put("CENELEC", "European Committee for Electrotechnical Standardization"); organizations.put("URS", "United Registrar of Systems, UK"); organizations.put("ETSI", "European Telecommunications Standards Institute"); - organizations - .put("EASC", "Euro-Asian Council for Standardization, Metrology and Certification"); - organizations - .put("IRMM", "Institute for Reference Materials and Measurements (European Union)"); - //Middle East + organizations.put( + "EASC", "Euro-Asian Council for Standardization, Metrology and Certification"); + organizations.put( + "IRMM", "Institute for Reference Materials and Measurements (European Union)"); + // Middle East organizations.put("AIDMO", "Arab Industrial Development and Mining Organization"); organizations.put("IAU", "International Arabic Union"); - //Nationally-based standards organizations - //United Kingdom + // Nationally-based standards organizations + // United Kingdom organizations.put("BSI", "British Standards Institution aka BSI Group"); organizations.put("DStan", "UK Defence Standardization"); - //United States of America + // United States of America organizations.put("ANSI", "American National Standards Institute"); organizations.put("ACI", "American Concrete Institute"); organizations.put("NIST", "National Institute of Standards and Technology"); @@ -172,7 +181,7 @@ public class StandardOrganizations { * organizations. 
* * @return the map containing the collection of the most important technical standard - * organizations. + * organizations. */ public static Map getOrganizations() { return organizations; @@ -183,7 +192,7 @@ public static Map getOrganizations() { * organizations. * * @return the regular expression containing the most important technical standard - * organizations. + * organizations. */ public static String getOrganzationsRegex() { diff --git a/tika-core/src/main/java/org/apache/tika/sax/StandardReference.java b/tika-core/src/main/java/org/apache/tika/sax/StandardReference.java index 243a031c95..50e7503609 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/StandardReference.java +++ b/tika-core/src/main/java/org/apache/tika/sax/StandardReference.java @@ -17,9 +17,7 @@ package org.apache.tika.sax; -/** - * Class that represents a standard reference. - */ +/** Class that represents a standard reference. */ public class StandardReference { private String mainOrganization; private String separator; @@ -27,8 +25,12 @@ public class StandardReference { private String identifier; private double score; - private StandardReference(String mainOrganizationAcronym, String separator, - String secondOrganizationAcronym, String identifier, double score) { + private StandardReference( + String mainOrganizationAcronym, + String separator, + String secondOrganizationAcronym, + String identifier, + double score) { super(); this.mainOrganization = mainOrganizationAcronym; this.separator = separator; @@ -105,8 +107,8 @@ public StandardReferenceBuilder(String mainOrganization, String identifier) { this.score = 0; } - public StandardReferenceBuilder setSecondOrganization(String separator, - String secondOrganization) { + public StandardReferenceBuilder setSecondOrganization( + String separator, String secondOrganization) { this.separator = separator; this.secondOrganization = secondOrganization; return this; @@ -118,8 +120,8 @@ public StandardReferenceBuilder setScore(double score) 
{ } public StandardReference build() { - return new StandardReference(mainOrganization, separator, secondOrganization, - identifier, score); + return new StandardReference( + mainOrganization, separator, secondOrganization, identifier, score); } } } diff --git a/tika-core/src/main/java/org/apache/tika/sax/StandardsExtractingContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/StandardsExtractingContentHandler.java index 006034a01d..0a2418cd97 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/StandardsExtractingContentHandler.java +++ b/tika-core/src/main/java/org/apache/tika/sax/StandardsExtractingContentHandler.java @@ -19,19 +19,17 @@ import java.util.Arrays; import java.util.List; - +import org.apache.tika.metadata.Metadata; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; import org.xml.sax.helpers.DefaultHandler; -import org.apache.tika.metadata.Metadata; - /** - * StandardsExtractingContentHandler is a Content Handler used to extract - * standard references while parsing. - *

- * This handler relies on complex regular expressions which can be slow on some types of - * input data. + * StandardsExtractingContentHandler is a Content Handler used to extract standard references while + * parsing. + * + *

This handler relies on complex regular expressions which can be slow on some types of input + * data. */ public class StandardsExtractingContentHandler extends ContentHandlerDecorator { public static final String STANDARD_REFERENCES = "standard_references"; @@ -44,7 +42,7 @@ public class StandardsExtractingContentHandler extends ContentHandlerDecorator { /** * Creates a decorator for the given SAX event handler and Metadata object. * - * @param handler SAX event handler to be decorated. + * @param handler SAX event handler to be decorated. * @param metadata {@link Metadata} object. */ public StandardsExtractingContentHandler(ContentHandler handler, Metadata metadata) { @@ -54,22 +52,21 @@ public StandardsExtractingContentHandler(ContentHandler handler, Metadata metada } /** - * Creates a decorator that by default forwards incoming SAX events to a - * dummy content handler that simply ignores all the events. Subclasses - * should use the {@link #setContentHandler(ContentHandler)} method to - * switch to a more usable underlying content handler. Also creates a dummy - * Metadata object to store phone numbers in. + * Creates a decorator that by default forwards incoming SAX events to a dummy content handler + * that simply ignores all the events. Subclasses should use the {@link + * #setContentHandler(ContentHandler)} method to switch to a more usable underlying content + * handler. Also creates a dummy Metadata object to store phone numbers in. */ protected StandardsExtractingContentHandler() { this(new DefaultHandler(), new Metadata()); } /** - * Gets the threshold to be used for selecting the standard references found - * within the text based on their score. + * Gets the threshold to be used for selecting the standard references found within the text + * based on their score. * - * @return the threshold to be used for selecting the standard references - * found within the text based on their score. 
+ * @return the threshold to be used for selecting the standard references found within the text + * based on their score. */ public double getThreshold() { return threshold; @@ -85,11 +82,10 @@ public void setThreshold(double score) { } /** - * The characters method is called whenever a Parser wants to pass raw - * characters to the ContentHandler. However, standard references are often - * split across different calls to characters, depending on the specific - * Parser used. Therefore, we simply add all characters to a StringBuilder - * and analyze it once the document is finished. + * The characters method is called whenever a Parser wants to pass raw characters to the + * ContentHandler. However, standard references are often split across different calls to + * characters, depending on the specific Parser used. Therefore, we simply add all characters to + * a StringBuilder and analyze it once the document is finished. */ @Override public void characters(char[] ch, int start, int length) throws SAXException { @@ -107,8 +103,8 @@ public void characters(char[] ch, int start, int length) throws SAXException { } /** - * This method is called whenever the Parser is done parsing the file. So, - * we check the output for any standard references. + * This method is called whenever the Parser is done parsing the file. So, we check the output + * for any standard references. */ @Override public void endDocument() throws SAXException { @@ -120,12 +116,11 @@ public void endDocument() throws SAXException { } } - /** * The number of characters to store in memory for checking for standards. * - * If this is unbounded, the complex regular expressions can take a long time - * to process some types of data. Only increase this limit with great caution. + *

If this is unbounded, the complex regular expressions can take a long time to process some + * types of data. Only increase this limit with great caution. */ public void setMaxBufferLength(int maxBufferLength) { this.maxBufferLength = maxBufferLength; diff --git a/tika-core/src/main/java/org/apache/tika/sax/StandardsText.java b/tika-core/src/main/java/org/apache/tika/sax/StandardsText.java index 697eedee91..c6ef61fd3f 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/StandardsText.java +++ b/tika-core/src/main/java/org/apache/tika/sax/StandardsText.java @@ -24,37 +24,31 @@ import java.util.TreeMap; import java.util.regex.Matcher; import java.util.regex.Pattern; - import org.apache.tika.sax.StandardReference.StandardReferenceBuilder; /** - * StandardText relies on regular expressions to extract standard references - * from text. + * StandardText relies on regular expressions to extract standard references from text. + * + *

This class helps to find the standard references from text by performing the following steps: * - *

- * This class helps to find the standard references from text by performing the - * following steps: *

    - *
  1. searches for headers;
  2. - *
  3. searches for patterns that are supposed to be standard references - * (basically, every string mostly composed of uppercase letters followed by an - * alphanumeric characters);
  4. - *
  5. each potential standard reference starts with score equal to 0.25;
  6. - *
  7. increases by 0.25 the score of references which include the name of a - * known standard organization ({@link StandardOrganizations});
  8. - *
  9. increases by 0.25 the score of references which include the word - * Publication or Standard;
  10. - *
  11. increases by 0.25 the score of references which have been found within - * "Applicable Documents" and equivalent sections;
  12. - *
  13. returns the standard references along with scores.
  14. + *
  15. searches for headers; + *
  16. searches for patterns that are supposed to be standard references (basically, every string + * mostly composed of uppercase letters followed by an alphanumeric characters); + *
  17. each potential standard reference starts with score equal to 0.25; + *
  18. increases by 0.25 the score of references which include the name of a known standard + * organization ({@link StandardOrganizations}); + *
  19. increases by 0.25 the score of references which include the word Publication or Standard; + *
  20. increases by 0.25 the score of references which have been found within "Applicable + * Documents" and equivalent sections; + *
  21. returns the standard references along with scores. *
- *

*/ public class StandardsText { // Regular expression to match uppercase headers private static final String REGEX_HEADER = - "(\\d{1,10}+\\.(\\d{1,10}+\\.?){0,10}+)\\p{Blank}+([A-Z]{1,64}+(\\s[A-Z]{1,64}+){0," + - "256}+){5,10}+"; + "(\\d{1,10}+\\.(\\d{1,10}+\\.?){0,10}+)\\p{Blank}+([A-Z]{1,64}+(\\s[A-Z]{1,64}+){0," + + "256}+){5,10}+"; // Regular expression to match the "APPLICABLE DOCUMENTS" and equivalent // sections @@ -63,8 +57,8 @@ public class StandardsText { // Regular expression to match the alphanumeric identifier of the standard private static final String REGEX_IDENTIFIER = - "(?([0-9]{3,64}+|([A-Z]{1,64}+(-|_|\\.)?[0-9]{2,64}+))((-|_|\\.)" + - "?[A-Z0-9]{1,64}+){0,64}+)"; + "(?([0-9]{3,64}+|([A-Z]{1,64}+(-|_|\\.)?[0-9]{2,64}+))((-|_|\\.)" + + "?[A-Z0-9]{1,64}+){0,64}+)"; // Regular expression to match the standard organization private static final String REGEX_ORGANIZATION = StandardOrganizations.getOrganzationsRegex(); @@ -75,10 +69,17 @@ public class StandardsText { // Regular expression to match a string that is supposed to be a standard // reference - private static final String REGEX_FALLBACK = "\\(?" + "(?[A-Z]\\w{1,64}+)" + - "\\)?((\\s?(?\\/)\\s?)(\\w{1,64}+\\s)*\\(?" + "(?[A-Z" + - "]\\w{1,64}+)" + - "\\)?)?" + REGEX_STANDARD_TYPE + "?" + "(-|\\s)?" + REGEX_IDENTIFIER; + private static final String REGEX_FALLBACK = + "\\(?" + + "(?[A-Z]\\w{1,64}+)" + + "\\)?((\\s?(?\\/)\\s?)(\\w{1,64}+\\s)*\\(?" + + "(?[A-Z" + + "]\\w{1,64}+)" + + "\\)?)?" + + REGEX_STANDARD_TYPE + + "?" + + "(-|\\s)?" + + REGEX_IDENTIFIER; // Regular expression to match the standard organization within a string // that is supposed to be a standard reference @@ -88,16 +89,15 @@ public class StandardsText { /** * Extracts the standard references found within the given text. * - * @param text the text from which the standard references are extracted. 
- * @param threshold the lower bound limit to be used in order to select only the - * standard references with score greater than or equal to the - * threshold. For instance, using a threshold of 0.75 means that - * only the patterns with score greater than or equal to 0.75 - * will be returned. + * @param text the text from which the standard references are extracted. + * @param threshold the lower bound limit to be used in order to select only the standard + * references with score greater than or equal to the threshold. For instance, using a + * threshold of 0.75 means that only the patterns with score greater than or equal to 0.75 + * will be returned. * @return the list of standard references extracted from the given text. */ - public static ArrayList extractStandardReferences(String text, - double threshold) { + public static ArrayList extractStandardReferences( + String text, double threshold) { Map headers = findHeaders(text); return findStandards(text, headers, threshold); @@ -125,16 +125,14 @@ private static Map findHeaders(String text) { /** * This method helps to find the standard references within the given text. * - * @param text the text from which the standards references are extracted. - * @param headers the list of headers found within the given text. - * @param threshold the lower bound limit to be used in order to select only the - * standard references with score greater than or equal to the - * threshold. + * @param text the text from which the standards references are extracted. + * @param headers the list of headers found within the given text. + * @param threshold the lower bound limit to be used in order to select only the standard + * references with score greater than or equal to the threshold. * @return the list of standard references extracted from the given text. 
*/ - private static ArrayList findStandards(String text, - Map headers, - double threshold) { + private static ArrayList findStandards( + String text, Map headers, double threshold) { ArrayList standards = new ArrayList<>(); double score = 0; @@ -142,10 +140,12 @@ private static ArrayList findStandards(String text, Matcher matcher = pattern.matcher(text); while (matcher.find()) { - StandardReferenceBuilder builder = new StandardReference.StandardReferenceBuilder( - matcher.group("mainOrganization"), matcher.group("identifier")) - .setSecondOrganization(matcher.group("separator"), - matcher.group("secondOrganization")); + StandardReferenceBuilder builder = + new StandardReference.StandardReferenceBuilder( + matcher.group("mainOrganization"), matcher.group("identifier")) + .setSecondOrganization( + matcher.group("separator"), + matcher.group("secondOrganization")); score = 0.25; // increases by 0.25 the score of references which include the name of a known diff --git a/tika-core/src/main/java/org/apache/tika/sax/StoppingEarlyException.java b/tika-core/src/main/java/org/apache/tika/sax/StoppingEarlyException.java index c79dd80a7a..2af01d15b2 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/StoppingEarlyException.java +++ b/tika-core/src/main/java/org/apache/tika/sax/StoppingEarlyException.java @@ -20,9 +20,8 @@ import org.xml.sax.SAXException; /** - * Sentinel exception to stop parsing xml once target is found - * while SAX parsing. This should be used when the parse - * can be stopped and the exception ignored. + * Sentinel exception to stop parsing xml once target is found while SAX parsing. This should be + * used when the parse can be stopped and the exception ignored. 
*/ public class StoppingEarlyException extends SAXException { diff --git a/tika-core/src/main/java/org/apache/tika/sax/TaggedContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/TaggedContentHandler.java index fea0b83890..0a6f71f161 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/TaggedContentHandler.java +++ b/tika-core/src/main/java/org/apache/tika/sax/TaggedContentHandler.java @@ -20,10 +20,10 @@ import org.xml.sax.SAXException; /** - * A content handler decorator that tags potential exceptions so that the - * handler that caused the exception can easily be identified. This is - * done by using the {@link TaggedSAXException} class to wrap all thrown - * {@link SAXException}s. See below for an example of using this class. + * A content handler decorator that tags potential exceptions so that the handler that caused the + * exception can easily be identified. This is done by using the {@link TaggedSAXException} class to + * wrap all thrown {@link SAXException}s. See below for an example of using this class. + * *
  * TaggedContentHandler handler = new TaggedContentHandler(...);
  * try {
@@ -39,11 +39,11 @@
  *     }
  * }
  * 
- *

- * Alternatively, the {@link #throwIfCauseOf(Exception)} method can be - * used to let higher levels of code handle the exception caused by this - * stream while other processing errors are being taken care of at this - * lower level. + * + *

Alternatively, the {@link #throwIfCauseOf(Exception)} method can be used to let higher levels + * of code handle the exception caused by this stream while other processing errors are being taken + * care of at this lower level. + * *

  * TaggedContentHandler handler = new TaggedContentHandler(...);
  * try {
@@ -71,8 +71,8 @@ public TaggedContentHandler(ContentHandler proxy) {
      * Tests if the given exception was caused by this handler.
      *
      * @param exception an exception
-     * @return true if the exception was thrown by this handler,
-     * false otherwise
+     * @return true if the exception was thrown by this handler, false
+     *     otherwise
      */
     public boolean isCauseOf(SAXException exception) {
         if (exception instanceof TaggedSAXException) {
@@ -84,11 +84,10 @@ public boolean isCauseOf(SAXException exception) {
     }
 
     /**
-     * Re-throws the original exception thrown by this handler. This method
-     * first checks whether the given exception is a {@link TaggedSAXException}
-     * wrapper created by this decorator, and then unwraps and throws the
-     * original wrapped exception. Returns normally if the exception was
-     * not thrown by this handler.
+     * Re-throws the original exception thrown by this handler. This method first checks whether the
+     * given exception is a {@link TaggedSAXException} wrapper created by this decorator, and then
+     * unwraps and throws the original wrapped exception. Returns normally if the exception was not
+     * thrown by this handler.
      *
      * @param exception an exception
      * @throws SAXException original exception, if any, thrown by this handler
@@ -112,5 +111,4 @@ public void throwIfCauseOf(Exception exception) throws SAXException {
     protected void handleException(SAXException e) throws SAXException {
         throw new TaggedSAXException(e, this);
     }
-
 }
diff --git a/tika-core/src/main/java/org/apache/tika/sax/TaggedSAXException.java b/tika-core/src/main/java/org/apache/tika/sax/TaggedSAXException.java
index 7697cc6ea0..0a6f4eeefd 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/TaggedSAXException.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/TaggedSAXException.java
@@ -19,22 +19,20 @@
 import org.xml.sax.SAXException;
 
 /**
- * A {@link SAXException} wrapper that tags the wrapped exception with
- * a given object reference. Both the tag and the wrapped original exception
- * can be used to determine further processing when this exception is caught.
+ * A {@link SAXException} wrapper that tags the wrapped exception with a given object reference.
+ * Both the tag and the wrapped original exception can be used to determine further processing when
+ * this exception is caught.
  */
 public class TaggedSAXException extends SAXException {
 
-    /**
-     * The object reference used to tag the exception.
-     */
+    /** The object reference used to tag the exception. */
     private final Object tag;
 
     /**
      * Creates a tagged wrapper for the given exception.
      *
      * @param original the exception to be tagged
-     * @param tag      tag object
+     * @param tag tag object
      */
     public TaggedSAXException(SAXException original, Object tag) {
         super(original.getMessage(), original);
@@ -51,8 +49,8 @@ public Object getTag() {
     }
 
     /**
-     * Returns the wrapped exception. The only difference to the overridden
-     * {@link Throwable#getCause()} method is the narrower return type.
+     * Returns the wrapped exception. The only difference to the overridden {@link
+     * Throwable#getCause()} method is the narrower return type.
      *
      * @return wrapped exception
      */
@@ -60,5 +58,4 @@ public Object getTag() {
     public SAXException getCause() {
         return (SAXException) super.getCause();
     }
-
 }
diff --git a/tika-core/src/main/java/org/apache/tika/sax/TeeContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/TeeContentHandler.java
index c54e04fa5b..cc004ae9fe 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/TeeContentHandler.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/TeeContentHandler.java
@@ -23,8 +23,8 @@
 import org.xml.sax.helpers.DefaultHandler;
 
 /**
- * Content handler proxy that forwards the received SAX events to zero or
- * more underlying content handlers.
+ * Content handler proxy that forwards the received SAX events to zero or more underlying content
+ * handlers.
  */
 public class TeeContentHandler extends DefaultHandler {
 
@@ -111,5 +111,4 @@ public void skippedEntity(String name) throws SAXException {
             handler.skippedEntity(name);
         }
     }
-
 }
diff --git a/tika-core/src/main/java/org/apache/tika/sax/TextAndAttributeContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/TextAndAttributeContentHandler.java
index ff20829dc8..ffd8b44e15 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/TextAndAttributeContentHandler.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/TextAndAttributeContentHandler.java
@@ -29,8 +29,8 @@ public TextAndAttributeContentHandler(ContentHandler delegate) {
         this(delegate, false);
     }
 
-    public TextAndAttributeContentHandler(ContentHandler delegate,
-                                          boolean addSpaceBetweenElements) {
+    public TextAndAttributeContentHandler(
+            ContentHandler delegate, boolean addSpaceBetweenElements) {
         super(delegate, addSpaceBetweenElements);
     }
 
diff --git a/tika-core/src/main/java/org/apache/tika/sax/TextContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/TextContentHandler.java
index a510baf82d..e37450100f 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/TextContentHandler.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/TextContentHandler.java
@@ -22,15 +22,13 @@
 import org.xml.sax.helpers.DefaultHandler;
 
 /**
- * Content handler decorator that only passes the
- * {@link #characters(char[], int, int)} and
- * (@link {@link #ignorableWhitespace(char[], int, int)}
- * (plus {@link #startDocument()} and {@link #endDocument()} events to
- * the decorated content handler.
+ * Content handler decorator that only passes the {@link #characters(char[], int, int)} and
+ * {@link #ignorableWhitespace(char[], int, int)} (plus {@link #startDocument()} and {@link
+ * #endDocument()} events to the decorated content handler.
  */
 public class TextContentHandler extends DefaultHandler {
 
-    private static final char[] SPACE = new char[]{' '};
+    private static final char[] SPACE = new char[] {' '};
 
     private final ContentHandler delegate;
     private final boolean addSpaceBetweenElements;
@@ -81,5 +79,4 @@ public void endDocument() throws SAXException {
     public String toString() {
         return delegate.toString();
     }
-
 }
diff --git a/tika-core/src/main/java/org/apache/tika/sax/ToHTMLContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/ToHTMLContentHandler.java
index 268edb12cf..e14e57ab87 100755
--- a/tika-core/src/main/java/org/apache/tika/sax/ToHTMLContentHandler.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/ToHTMLContentHandler.java
@@ -21,21 +21,32 @@
 import java.util.Arrays;
 import java.util.HashSet;
 import java.util.Set;
-
 import org.xml.sax.SAXException;
 
 /**
- * SAX event handler that serializes the HTML document to a character stream.
- * The incoming SAX events are expected to be well-formed (properly nested,
- * etc.) and valid HTML.
+ * SAX event handler that serializes the HTML document to a character stream. The incoming SAX
+ * events are expected to be well-formed (properly nested, etc.) and valid HTML.
  *
  * @since Apache Tika 0.10
  */
 public class ToHTMLContentHandler extends ToXMLContentHandler {
 
-    private static final Set EMPTY_ELEMENTS = new HashSet<>(
-            Arrays.asList("area", "base", "basefont", "br", "col", "frame", "hr", "img", "input",
-                    "isindex", "link", "meta", "param"));
+    private static final Set EMPTY_ELEMENTS =
+            new HashSet<>(
+                    Arrays.asList(
+                            "area",
+                            "base",
+                            "basefont",
+                            "br",
+                            "col",
+                            "frame",
+                            "hr",
+                            "img",
+                            "input",
+                            "isindex",
+                            "link",
+                            "meta",
+                            "param"));
 
     public ToHTMLContentHandler(OutputStream stream, String encoding)
             throws UnsupportedEncodingException {
@@ -47,8 +58,7 @@ public ToHTMLContentHandler() {
     }
 
     @Override
-    public void startDocument() throws SAXException {
-    }
+    public void startDocument() throws SAXException {}
 
     @Override
     public void endElement(String uri, String localName, String qName) throws SAXException {
@@ -64,5 +74,4 @@ public void endElement(String uri, String localName, String qName) throws SAXExc
 
         super.endElement(uri, localName, qName);
     }
-
 }
diff --git a/tika-core/src/main/java/org/apache/tika/sax/ToTextContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/ToTextContentHandler.java
index 868a3bc369..69eb4194f6 100755
--- a/tika-core/src/main/java/org/apache/tika/sax/ToTextContentHandler.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/ToTextContentHandler.java
@@ -23,19 +23,15 @@
 import java.io.UnsupportedEncodingException;
 import java.io.Writer;
 import java.util.Locale;
-
 import org.xml.sax.Attributes;
 import org.xml.sax.SAXException;
 import org.xml.sax.helpers.DefaultHandler;
 
 /**
- * SAX event handler that writes all character content out to a character
- * stream. No escaping or other transformations are made on the character
- * content.
- * 

- * As of Tika 1.20, this handler ignores content within <script> and - * <style> tags. - *

+ * SAX event handler that writes all character content out to a character stream. No escaping or + * other transformations are made on the character content. + * + *

As of Tika 1.20, this handler ignores content within <script> and <style> tags. * * @since Apache Tika 0.10 */ @@ -43,16 +39,15 @@ public class ToTextContentHandler extends DefaultHandler { private static final String STYLE = "STYLE"; private static final String SCRIPT = "SCRIPT"; - /** - * The character stream. - */ + + /** The character stream. */ private final Writer writer; + private int styleDepth = 0; private int scriptDepth = 0; /** - * Creates a content handler that writes character events to - * the given writer. + * Creates a content handler that writes character events to the given writer. * * @param writer writer */ @@ -61,10 +56,10 @@ public ToTextContentHandler(Writer writer) { } /** - * Creates a content handler that writes character events to - * the given output stream using the given encoding. + * Creates a content handler that writes character events to the given output stream using the + * given encoding. * - * @param stream output stream + * @param stream output stream * @param encoding output encoding * @throws UnsupportedEncodingException if the encoding is unsupported */ @@ -74,17 +69,14 @@ public ToTextContentHandler(OutputStream stream, String encoding) } /** - * Creates a content handler that writes character events - * to an internal string buffer. Use the {@link #toString()} - * method to access the collected character content. + * Creates a content handler that writes character events to an internal string buffer. Use the + * {@link #toString()} method to access the collected character content. */ public ToTextContentHandler() { this(new StringWriter()); } - /** - * Writes the given characters to the given character stream. - */ + /** Writes the given characters to the given character stream. 
*/ @Override public void characters(char[] ch, int start, int length) throws SAXException { @@ -99,11 +91,9 @@ public void characters(char[] ch, int start, int length) throws SAXException { } } - /** - * Writes the given ignorable characters to the given character stream. - * The default implementation simply forwards the call to the - * {@link #characters(char[], int, int)} method. + * Writes the given ignorable characters to the given character stream. The default + * implementation simply forwards the call to the {@link #characters(char[], int, int)} method. */ @Override public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException { @@ -111,8 +101,7 @@ public void ignorableWhitespace(char[] ch, int start, int length) throws SAXExce } /** - * Flushes the character stream so that no characters are forgotten - * in internal buffers. + * Flushes the character stream so that no characters are forgotten in internal buffers. * * @throws SAXException if the stream can not be flushed * @see TIKA-179 @@ -150,15 +139,12 @@ public void endElement(String uri, String localName, String qName) throws SAXExc } /** - * Returns the contents of the internal string buffer where - * all the received characters have been collected. Only works - * when this object was constructed using the empty default - * constructor or by passing a {@link StringWriter} to the - * other constructor. + * Returns the contents of the internal string buffer where all the received characters have + * been collected. Only works when this object was constructed using the empty default + * constructor or by passing a {@link StringWriter} to the other constructor. 
*/ @Override public String toString() { return writer.toString(); } - } diff --git a/tika-core/src/main/java/org/apache/tika/sax/ToXMLContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/ToXMLContentHandler.java index 60ab35f3d5..a586ce1858 100755 --- a/tika-core/src/main/java/org/apache/tika/sax/ToXMLContentHandler.java +++ b/tika-core/src/main/java/org/apache/tika/sax/ToXMLContentHandler.java @@ -21,15 +21,13 @@ import java.util.Collections; import java.util.HashMap; import java.util.Map; - import org.xml.sax.Attributes; import org.xml.sax.SAXException; /** - * SAX event handler that serializes the XML document to a character stream. - * The incoming SAX events are expected to be well-formed (properly nested, - * etc.) and to explicitly include namespace declaration attributes and - * corresponding namespace prefixes in element and attribute names. + * SAX event handler that serializes the XML document to a character stream. The incoming SAX events + * are expected to be well-formed (properly nested, etc.) and to explicitly include namespace + * declaration attributes and corresponding namespace prefixes in element and attribute names. * * @since Apache Tika 0.10 */ @@ -42,10 +40,10 @@ public class ToXMLContentHandler extends ToTextContentHandler { private ElementInfo currentElement; /** - * Creates an XML serializer that writes to the given byte stream - * using the given character encoding. + * Creates an XML serializer that writes to the given byte stream using the given character + * encoding. * - * @param stream output stream + * @param stream output stream * @param encoding output encoding * @throws UnsupportedEncodingException if the encoding is unsupported */ @@ -65,9 +63,7 @@ public ToXMLContentHandler() { this.encoding = null; } - /** - * Writes the XML prefix. - */ + /** Writes the XML prefix. 
*/ @Override public void startDocument() throws SAXException { if (encoding != null) { @@ -168,7 +164,7 @@ private void lazyCloseStartElement() throws SAXException { * @throws SAXException if the character could not be written */ protected void write(char ch) throws SAXException { - super.characters(new char[]{ch}, 0, 1); + super.characters(new char[] {ch}, 0, 1); } /** @@ -184,12 +180,11 @@ protected void write(String string) throws SAXException { /** * Writes the given characters as-is followed by the given entity. * - * @param ch character array - * @param from start position in the array - * @param to end position in the array + * @param ch character array + * @param from start position in the array + * @param to end position in the array * @param entity entity code - * @return next position in the array, - * after the characters plus one entity + * @return next position in the array, after the characters plus one entity * @throws SAXException if the characters could not be written */ private int writeCharsAndEntity(char[] ch, int from, int to, String entity) @@ -204,11 +199,11 @@ private int writeCharsAndEntity(char[] ch, int from, int to, String entity) /** * Writes the given characters with XML meta characters escaped. 
* - * @param ch character array - * @param from start position in the array - * @param to end position in the array - * @param attribute whether the characters should be escaped as - * an attribute value or normal character content + * @param ch character array + * @param from start position in the array + * @param to end position in the array + * @param attribute whether the characters should be escaped as an attribute value or normal + * character content * @throws SAXException if the characters could not be written */ private void writeEscaped(char[] ch, int from, int to, boolean attribute) throws SAXException { @@ -265,7 +260,5 @@ public String getQName(String uri, String localName) throws SAXException { return localName; } } - } - } diff --git a/tika-core/src/main/java/org/apache/tika/sax/WriteLimiter.java b/tika-core/src/main/java/org/apache/tika/sax/WriteLimiter.java index d82895a1ba..bc37138431 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/WriteLimiter.java +++ b/tika-core/src/main/java/org/apache/tika/sax/WriteLimiter.java @@ -18,5 +18,6 @@ public interface WriteLimiter { int getWriteLimit(); + boolean isThrowOnWriteLimitReached(); } diff --git a/tika-core/src/main/java/org/apache/tika/sax/WriteOutContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/WriteOutContentHandler.java index 22d69f0a4d..89154936d6 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/WriteOutContentHandler.java +++ b/tika-core/src/main/java/org/apache/tika/sax/WriteOutContentHandler.java @@ -18,30 +18,24 @@ import java.io.StringWriter; import java.io.Writer; - -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.exception.WriteLimitReachedException; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.ParseRecord; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; /** - * SAX event handler that writes content up to an optional write - * limit out to a character stream or 
other decorated handler. + * SAX event handler that writes content up to an optional write limit out to a character stream or + * other decorated handler. */ public class WriteOutContentHandler extends ContentHandlerDecorator { - /** - * The maximum number of characters to write to the character stream. - * Set to -1 for no limit. + * The maximum number of characters to write to the character stream. Set to -1 for no limit. */ private final int writeLimit; - /** - * Number of characters written so far. - */ + /** Number of characters written so far. */ private int writeCount = 0; private boolean throwOnWriteLimitReached = true; @@ -51,10 +45,10 @@ public class WriteOutContentHandler extends ContentHandlerDecorator { private boolean writeLimitReached; /** - * Creates a content handler that writes content up to the given - * write limit to the given content handler. + * Creates a content handler that writes content up to the given write limit to the given + * content handler. * - * @param handler content handler to be decorated + * @param handler content handler to be decorated * @param writeLimit write limit * @since Apache Tika 0.10 */ @@ -64,10 +58,10 @@ public WriteOutContentHandler(ContentHandler handler, int writeLimit) { } /** - * Creates a content handler that writes content up to the given - * write limit to the given character stream. + * Creates a content handler that writes content up to the given write limit to the given + * character stream. * - * @param writer character stream + * @param writer character stream * @param writeLimit write limit * @since Apache Tika 0.10 */ @@ -76,8 +70,7 @@ public WriteOutContentHandler(Writer writer, int writeLimit) { } /** - * Creates a content handler that writes character events to - * the given writer. + * Creates a content handler that writes character events to the given writer. 
* * @param writer writer */ @@ -86,17 +79,16 @@ public WriteOutContentHandler(Writer writer) { } /** - * Creates a content handler that writes character events - * to an internal string buffer. Use the {@link #toString()} - * method to access the collected character content. - *

- * The internal string buffer is bounded at the given number of characters. - * If this write limit is reached, then a {@link SAXException} is thrown. - * The {@link WriteLimitReachedException#isWriteLimitReached(Throwable)} method can be used to - * detect this case. + * Creates a content handler that writes character events to an internal string buffer. Use the + * {@link #toString()} method to access the collected character content. + * + *

The internal string buffer is bounded at the given number of characters. If this write + * limit is reached, then a {@link SAXException} is thrown. The {@link + * WriteLimitReachedException#isWriteLimitReached(Throwable)} method can be used to detect this + * case. * - * @param writeLimit maximum number of characters to include in the string, - * or -1 to disable the write limit + * @param writeLimit maximum number of characters to include in the string, or -1 to disable the + * write limit * @since Apache Tika 0.7 */ public WriteOutContentHandler(int writeLimit) { @@ -104,14 +96,13 @@ public WriteOutContentHandler(int writeLimit) { } /** - * Creates a content handler that writes character events - * to an internal string buffer. Use the {@link #toString()} - * method to access the collected character content. - *

- * The internal string buffer is bounded at 100k characters. If this - * write limit is reached, then a {@link SAXException} is thrown. The - * {@link WriteLimitReachedException#isWriteLimitReached(Throwable)} method can be used to - * detect this case. + * Creates a content handler that writes character events to an internal string buffer. Use the + * {@link #toString()} method to access the collected character content. + * + *

The internal string buffer is bounded at 100k characters. If this write limit is reached, + * then a {@link SAXException} is thrown. The {@link + * WriteLimitReachedException#isWriteLimitReached(Throwable)} method can be used to detect this + * case. */ public WriteOutContentHandler() { this(100 * 1000); @@ -119,23 +110,24 @@ public WriteOutContentHandler() { /** * The default is to throw a {@link WriteLimitReachedException} + * * @param handler * @param writeLimit * @param throwOnWriteLimitReached * @param parseContext */ - public WriteOutContentHandler(ContentHandler handler, - int writeLimit, boolean throwOnWriteLimitReached, - ParseContext parseContext) { + public WriteOutContentHandler( + ContentHandler handler, + int writeLimit, + boolean throwOnWriteLimitReached, + ParseContext parseContext) { super(handler); this.writeLimit = writeLimit; this.throwOnWriteLimitReached = throwOnWriteLimitReached; this.parseContext = parseContext; } - /** - * Writes the given characters to the given character stream. - */ + /** Writes the given characters to the given character stream. 
*/ @Override public void characters(char[] ch, int start, int length) throws SAXException { if (writeLimitReached) { @@ -176,5 +168,4 @@ private void handleWriteLimitReached() throws WriteLimitReachedException { } } } - } diff --git a/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java index 6ba4232205..a7d4454baa 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java +++ b/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java @@ -20,71 +20,89 @@ import java.util.Collections; import java.util.HashSet; import java.util.Set; - +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; import org.xml.sax.Attributes; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; import org.xml.sax.helpers.AttributesImpl; -import org.apache.tika.metadata.Metadata; -import org.apache.tika.metadata.TikaCoreProperties; - /** - * Content handler decorator that simplifies the task of producing XHTML - * events for Tika content parsers. + * Content handler decorator that simplifies the task of producing XHTML events for Tika content + * parsers. */ public class XHTMLContentHandler extends SafeContentHandler { - /** - * The XHTML namespace URI - */ + /** The XHTML namespace URI */ public static final String XHTML = "http://www.w3.org/1999/xhtml"; - /** - * The elements that get appended with the {@link #NL} character. - */ + + /** The elements that get appended with the {@link #NL} character. */ public static final Set ENDLINE = - unmodifiableSet("p", "h1", "h2", "h3", "h4", "h5", "h6", "div", "ul", "ol", "dl", "pre", - "hr", "blockquote", "address", "fieldset", "table", "form", "noscript", "li", - "dt", "dd", "noframes", "br", "tr", "select", "option", "link", "script"); - /** - * The newline character that gets inserted after block elements. 
- */ - private static final char[] NL = new char[]{'\n'}; - /** - * The tab character gets inserted before table cells and list items. - */ - private static final char[] TAB = new char[]{'\t'}; - /** - * The elements that are in the section. - */ + unmodifiableSet( + "p", + "h1", + "h2", + "h3", + "h4", + "h5", + "h6", + "div", + "ul", + "ol", + "dl", + "pre", + "hr", + "blockquote", + "address", + "fieldset", + "table", + "form", + "noscript", + "li", + "dt", + "dd", + "noframes", + "br", + "tr", + "select", + "option", + "link", + "script"); + + /** The newline character that gets inserted after block elements. */ + private static final char[] NL = new char[] {'\n'}; + + /** The tab character gets inserted before table cells and list items. */ + private static final char[] TAB = new char[] {'\t'}; + + /** The elements that are in the section. */ private static final Set HEAD = unmodifiableSet("title", "link", "base", "meta", "script"); + /** - * The elements that are automatically emitted by lazyStartHead, so - * skip them if they get sent to startElement/endElement by mistake. + * The elements that are automatically emitted by lazyStartHead, so skip them if they get sent + * to startElement/endElement by mistake. */ private static final Set AUTO = unmodifiableSet("head", "frameset"); - /** - * The elements that get prepended with the {@link #TAB} character. - */ + + /** The elements that get prepended with the {@link #TAB} character. */ private static final Set INDENT = unmodifiableSet("li", "dd", "dt", "td", "th", "frame"); + private static final Attributes EMPTY_ATTRIBUTES = new AttributesImpl(); - /** - * Metadata associated with the document. Used to fill in the - * <head/> section. - */ + + /** Metadata associated with the document. Used to fill in the <head/> section. */ private final Metadata metadata; - /** - * Flag to indicate whether the document has been started. - */ + + /** Flag to indicate whether the document has been started. 
*/ private boolean documentStarted = false; - /** - * Flags to indicate whether the document head element has been started/ended. - */ + + /** Flags to indicate whether the document head element has been started/ended. */ private boolean headStarted = false; + private boolean headEnded = false; private boolean useFrameset = false; + public XHTMLContentHandler(ContentHandler handler, Metadata metadata) { super(handler); this.metadata = metadata; @@ -95,10 +113,8 @@ private static Set unmodifiableSet(String... elements) { } /** - * Starts an XHTML document by setting up the namespace mappings - * when called for the first time. - * The standard XHTML prefix is generated lazily when the first - * element is started. + * Starts an XHTML document by setting up the namespace mappings when called for the first time. + * The standard XHTML prefix is generated lazily when the first element is started. */ @Override public void startDocument() throws SAXException { @@ -111,6 +127,7 @@ public void startDocument() throws SAXException { /** * Generates the following XHTML prefix when called for the first time: + * *

      * <html>
      *   <head>
@@ -139,6 +156,7 @@ private void lazyStartHead() throws SAXException {
 
     /**
      * Generates the following XHTML prefix when called for the first time:
+     *
      * 
      * <html>
      *   <head>
@@ -199,8 +217,8 @@ private void lazyEndHead(boolean isFrameset) throws SAXException {
     }
 
     /**
-     * Ends the XHTML document by writing the following footer and
-     * clearing the namespace mappings:
+     * Ends the XHTML document by writing the following footer and clearing the namespace mappings:
+     *
      * 
      *   </body>
      * </html>
@@ -223,8 +241,8 @@ public void endDocument() throws SAXException {
     }
 
     /**
-     * Starts the given element. Table cells and list items are automatically
-     * indented by emitting a tab character as ignorable whitespace.
+     * Starts the given element. Table cells and list items are automatically indented by emitting a
+     * tab character as ignorable whitespace.
      */
     @Override
     public void startElement(String uri, String local, String name, Attributes attributes)
@@ -247,10 +265,7 @@ public void startElement(String uri, String local, String name, Attributes attri
         }
     }
 
-    /**
-     * Ends the given element. Block elements are automatically followed
-     * by a newline character.
-     */
+    /** Ends the given element. Block elements are automatically followed by a newline character. */
     @Override
     public void endElement(String uri, String local, String name) throws SAXException {
         if (!AUTO.contains(name)) {
@@ -270,7 +285,7 @@ public void characters(char[] ch, int start, int length) throws SAXException {
         super.characters(ch, start, length);
     }
 
-    //------------------------------------------< public convenience methods >
+    // ------------------------------------------< public convenience methods >
 
     public void startElement(String name) throws SAXException {
         startElement(XHTML, name, name, EMPTY_ATTRIBUTES);
@@ -301,10 +316,10 @@ public void newline() throws SAXException {
     }
 
     /**
-     * Emits an XHTML element with the given text content. If the given
-     * text value is null or empty, then the element is not written.
+     * Emits an XHTML element with the given text content. If the given text value is null or empty,
+     * then the element is not written.
      *
-     * @param name  XHTML element name
+     * @param name XHTML element name
      * @param value element value, possibly null
      * @throws SAXException if the content element could not be written
      */
@@ -324,5 +339,4 @@ protected boolean isInvalid(int ch) {
         // These control chars are  invalid in XHTML.
         return 0x7F <= ch && ch <= 0x9F;
     }
-
 }
diff --git a/tika-core/src/main/java/org/apache/tika/sax/XMPContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/XMPContentHandler.java
index 953ad6a446..ec0a12800c 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/XMPContentHandler.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/XMPContentHandler.java
@@ -16,14 +16,13 @@
  */
 package org.apache.tika.sax;
 
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
 import org.xml.sax.Attributes;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 import org.xml.sax.helpers.AttributesImpl;
 
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.Property;
-
 /**
  * Content handler decorator that simplifies the task of producing XMP output.
  *
@@ -31,14 +30,10 @@
  */
 public class XMPContentHandler extends SafeContentHandler {
 
-    /**
-     * The RDF namespace URI
-     */
+    /** The RDF namespace URI */
     public static final String RDF = "http://www.w3.org/1999/02/22-rdf-syntax-ns#";
 
-    /**
-     * The XMP namespace URI
-     */
+    /** The XMP namespace URI */
     public static final String XMP = "http://ns.adobe.com/xap/1.0/";
 
     private static final Attributes EMPTY_ATTRIBUTES = new AttributesImpl();
@@ -49,11 +44,12 @@ public XMPContentHandler(ContentHandler handler) {
         super(handler);
     }
 
-    //------------------------------------------< public convenience methods >
+    // ------------------------------------------< public convenience methods >
 
     /**
-     * Starts an XMP document by setting up the namespace mappings and
-     * writing out the following header:
+     * Starts an XMP document by setting up the namespace mappings and writing out the following
+     * header:
+     *
      * 
      * <rdf:RDF>
      * 
@@ -69,8 +65,8 @@ public void startDocument() throws SAXException { } /** - * Ends the XMP document by writing the following footer and - * clearing the namespace mappings: + * Ends the XMP document by writing the following footer and clearing the namespace mappings: + * *
      * </rdf:RDF>
      * 
@@ -143,5 +139,4 @@ private void description(Metadata metadata, String prefix, String uri) throws SA endDescription(); } } - } diff --git a/tika-core/src/main/java/org/apache/tika/sax/package-info.java b/tika-core/src/main/java/org/apache/tika/sax/package-info.java index 3c0b4ba48d..dde70a8ca9 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/package-info.java +++ b/tika-core/src/main/java/org/apache/tika/sax/package-info.java @@ -15,8 +15,6 @@ * limitations under the License. */ -/** - * SAX utilities. - */ +/** SAX utilities. */ @aQute.bnd.annotation.Version("1.0.0") package org.apache.tika.sax; diff --git a/tika-core/src/main/java/org/apache/tika/sax/xpath/AttributeMatcher.java b/tika-core/src/main/java/org/apache/tika/sax/xpath/AttributeMatcher.java index 7b1693d57a..66ef2ec4b1 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/xpath/AttributeMatcher.java +++ b/tika-core/src/main/java/org/apache/tika/sax/xpath/AttributeMatcher.java @@ -17,8 +17,8 @@ package org.apache.tika.sax.xpath; /** - * Final evaluation state of a .../@* XPath expression. - * Matches all attributes of the current element. + * Final evaluation state of a .../@* XPath expression. Matches all attributes of the + * current element. */ public class AttributeMatcher extends Matcher { @@ -27,5 +27,4 @@ public class AttributeMatcher extends Matcher { public boolean matchesAttribute(String namespace, String name) { return true; } - } diff --git a/tika-core/src/main/java/org/apache/tika/sax/xpath/ChildMatcher.java b/tika-core/src/main/java/org/apache/tika/sax/xpath/ChildMatcher.java index b95983c1b6..6cc90688d6 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/xpath/ChildMatcher.java +++ b/tika-core/src/main/java/org/apache/tika/sax/xpath/ChildMatcher.java @@ -17,8 +17,8 @@ package org.apache.tika.sax.xpath; /** - * Intermediate evaluation state of a .../*... XPath expression. - * Matches nothing, but specifies the evaluation state for all child elements. 
+ * Intermediate evaluation state of a .../*... XPath expression. Matches nothing, but + * specifies the evaluation state for all child elements. */ public class ChildMatcher extends Matcher { @@ -31,5 +31,4 @@ public ChildMatcher(Matcher then) { public Matcher descend(String namespace, String name) { return then; } - } diff --git a/tika-core/src/main/java/org/apache/tika/sax/xpath/CompositeMatcher.java b/tika-core/src/main/java/org/apache/tika/sax/xpath/CompositeMatcher.java index b0ef5110a1..7b96a16c54 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/xpath/CompositeMatcher.java +++ b/tika-core/src/main/java/org/apache/tika/sax/xpath/CompositeMatcher.java @@ -17,8 +17,8 @@ package org.apache.tika.sax.xpath; /** - * Composite XPath evaluation state. Used when XPath evaluation results - * in two or more branches of independent evaluation states. + * Composite XPath evaluation state. Used when XPath evaluation results in two or more branches of + * independent evaluation states. */ public class CompositeMatcher extends Matcher { @@ -56,5 +56,4 @@ public boolean matchesAttribute(String namespace, String name) { public boolean matchesText() { return a.matchesText() || b.matchesText(); } - } diff --git a/tika-core/src/main/java/org/apache/tika/sax/xpath/ElementMatcher.java b/tika-core/src/main/java/org/apache/tika/sax/xpath/ElementMatcher.java index 164e08aa29..ee6f5bc020 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/xpath/ElementMatcher.java +++ b/tika-core/src/main/java/org/apache/tika/sax/xpath/ElementMatcher.java @@ -17,8 +17,8 @@ package org.apache.tika.sax.xpath; /** - * Final evaluation state of an XPath expression that targets an element. - * Matches the current element. + * Final evaluation state of an XPath expression that targets an element. Matches the current + * element. 
*/ public class ElementMatcher extends Matcher { @@ -27,5 +27,4 @@ public class ElementMatcher extends Matcher { public boolean matchesElement() { return true; } - } diff --git a/tika-core/src/main/java/org/apache/tika/sax/xpath/Matcher.java b/tika-core/src/main/java/org/apache/tika/sax/xpath/Matcher.java index ab9d21c385..a24a6a64fd 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/xpath/Matcher.java +++ b/tika-core/src/main/java/org/apache/tika/sax/xpath/Matcher.java @@ -16,25 +16,21 @@ */ package org.apache.tika.sax.xpath; -/** - * XPath element matcher. A matcher instance encapsulates a specific - * state in XPath evaluation. - */ +/** XPath element matcher. A matcher instance encapsulates a specific state in XPath evaluation. */ public class Matcher { /** - * State of a failed XPath evaluation, where nothing is matched. - * This matcher instance is used as a sentinel object whenever an - * XPath evaluation branch fails. + * State of a failed XPath evaluation, where nothing is matched. This matcher instance is used + * as a sentinel object whenever an XPath evaluation branch fails. */ public static final Matcher FAIL = new Matcher(); /** - * Returns the XPath evaluation state that results from descending - * to a child element with the given name. + * Returns the XPath evaluation state that results from descending to a child element with the + * given name. * * @param namespace element namespace or null - * @param name element name + * @param name element name * @return next XPath evaluation state */ public Matcher descend(String namespace, String name) { @@ -42,8 +38,8 @@ public Matcher descend(String namespace, String name) { } /** - * Returns true if the XPath expression matches - * the element associated with this evaluation state. + * Returns true if the XPath expression matches the element associated with this + * evaluation state. 
* * @return XPath evaluation state for this element */ @@ -52,11 +48,11 @@ public boolean matchesElement() { } /** - * Returns true if the XPath expression matches the named - * attribute of the element associated with this evaluation state. + * Returns true if the XPath expression matches the named attribute of the element + * associated with this evaluation state. * * @param namespace attribute namespace or null - * @param name attribute name + * @param name attribute name * @return XPath evaluation state for named attribute of this element */ public boolean matchesAttribute(String namespace, String name) { @@ -64,14 +60,12 @@ public boolean matchesAttribute(String namespace, String name) { } /** - * Returns true if the XPath expression matches all text - * nodes whose parent is the element associated with this evaluation - * state. + * Returns true if the XPath expression matches all text nodes whose parent is the + * element associated with this evaluation state. * * @return XPath evaluation state for text children of this element */ public boolean matchesText() { return false; } - } diff --git a/tika-core/src/main/java/org/apache/tika/sax/xpath/MatchingContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/xpath/MatchingContentHandler.java index 9f96186aab..d4e5250a1e 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/xpath/MatchingContentHandler.java +++ b/tika-core/src/main/java/org/apache/tika/sax/xpath/MatchingContentHandler.java @@ -17,17 +17,15 @@ package org.apache.tika.sax.xpath; import java.util.LinkedList; - +import org.apache.tika.sax.ContentHandlerDecorator; import org.xml.sax.Attributes; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; import org.xml.sax.helpers.AttributesImpl; -import org.apache.tika.sax.ContentHandlerDecorator; - /** - * Content handler decorator that only passes the elements, attributes, - * and text nodes that match the given XPath expression. 
+ * Content handler decorator that only passes the elements, attributes, and text nodes that match + * the given XPath expression. */ public class MatchingContentHandler extends ContentHandlerDecorator { @@ -50,8 +48,12 @@ public void startElement(String uri, String localName, String name, Attributes a String attributeURI = attributes.getURI(i); String attributeName = attributes.getLocalName(i); if (matcher.matchesAttribute(attributeURI, attributeName)) { - matches.addAttribute(attributeURI, attributeName, attributes.getQName(i), - attributes.getType(i), attributes.getValue(i)); + matches.addAttribute( + attributeURI, + attributeName, + attributes.getQName(i), + attributes.getType(i), + attributes.getValue(i)); } } @@ -98,5 +100,4 @@ public void skippedEntity(String name) throws SAXException { super.skippedEntity(name); } } - } diff --git a/tika-core/src/main/java/org/apache/tika/sax/xpath/NamedAttributeMatcher.java b/tika-core/src/main/java/org/apache/tika/sax/xpath/NamedAttributeMatcher.java index 46b65a4da0..0ac8567d8f 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/xpath/NamedAttributeMatcher.java +++ b/tika-core/src/main/java/org/apache/tika/sax/xpath/NamedAttributeMatcher.java @@ -19,8 +19,8 @@ import java.util.Objects; /** - * Final evaluation state of a .../@name XPath expression. - * Matches the named attributes of the current element. + * Final evaluation state of a .../@name XPath expression. Matches the named attributes + * of the current element. 
*/ public class NamedAttributeMatcher extends Matcher { diff --git a/tika-core/src/main/java/org/apache/tika/sax/xpath/NamedElementMatcher.java b/tika-core/src/main/java/org/apache/tika/sax/xpath/NamedElementMatcher.java index e304789c09..085935e7cd 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/xpath/NamedElementMatcher.java +++ b/tika-core/src/main/java/org/apache/tika/sax/xpath/NamedElementMatcher.java @@ -19,9 +19,8 @@ import java.util.Objects; /** - * Intermediate evaluation state of a .../name... XPath - * expression. Matches nothing, but specifies the evaluation state - * for the child elements with the given name. + * Intermediate evaluation state of a .../name... XPath expression. Matches nothing, + * but specifies the evaluation state for the child elements with the given name. */ public class NamedElementMatcher extends ChildMatcher { diff --git a/tika-core/src/main/java/org/apache/tika/sax/xpath/NodeMatcher.java b/tika-core/src/main/java/org/apache/tika/sax/xpath/NodeMatcher.java index 8c2e45cadd..f20a1f2073 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/xpath/NodeMatcher.java +++ b/tika-core/src/main/java/org/apache/tika/sax/xpath/NodeMatcher.java @@ -17,8 +17,8 @@ package org.apache.tika.sax.xpath; /** - * Final evaluation state of a .../node() XPath expression. - * Matches all elements, attributes, and text. + * Final evaluation state of a .../node() XPath expression. Matches all elements, + * attributes, and text. 
*/ public class NodeMatcher extends Matcher { @@ -38,5 +38,4 @@ public boolean matchesAttribute(String namespace, String name) { public boolean matchesText() { return true; } - } diff --git a/tika-core/src/main/java/org/apache/tika/sax/xpath/SubtreeMatcher.java b/tika-core/src/main/java/org/apache/tika/sax/xpath/SubtreeMatcher.java index 1915dfc8d8..11f757873f 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/xpath/SubtreeMatcher.java +++ b/tika-core/src/main/java/org/apache/tika/sax/xpath/SubtreeMatcher.java @@ -17,8 +17,8 @@ package org.apache.tika.sax.xpath; /** - * Evaluation state of a ...//... XPath expression. Applies the - * contained evaluation state to the current element and all its descendants. + * Evaluation state of a ...//... XPath expression. Applies the contained evaluation + * state to the current element and all its descendants. */ public class SubtreeMatcher extends Matcher { @@ -52,5 +52,4 @@ public boolean matchesAttribute(String namespace, String name) { public boolean matchesText() { return then.matchesText(); } - } diff --git a/tika-core/src/main/java/org/apache/tika/sax/xpath/TextMatcher.java b/tika-core/src/main/java/org/apache/tika/sax/xpath/TextMatcher.java index caf82f4883..efe46ab322 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/xpath/TextMatcher.java +++ b/tika-core/src/main/java/org/apache/tika/sax/xpath/TextMatcher.java @@ -17,8 +17,8 @@ package org.apache.tika.sax.xpath; /** - * Final evaluation state of a .../text() XPath expression. - * Matches all text children of the current element. + * Final evaluation state of a .../text() XPath expression. Matches all text children + * of the current element. 
*/ public class TextMatcher extends Matcher { @@ -27,5 +27,4 @@ public class TextMatcher extends Matcher { public boolean matchesText() { return true; } - } diff --git a/tika-core/src/main/java/org/apache/tika/sax/xpath/XPathParser.java b/tika-core/src/main/java/org/apache/tika/sax/xpath/XPathParser.java index ffa4ccd719..d04ba4c862 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/xpath/XPathParser.java +++ b/tika-core/src/main/java/org/apache/tika/sax/xpath/XPathParser.java @@ -20,29 +20,29 @@ import java.util.Map; /** - * Parser for a very simple XPath subset. Only the following XPath constructs - * (with namespaces) are supported: + * Parser for a very simple XPath subset. Only the following XPath constructs (with namespaces) are + * supported: + * *
    - *
  • .../node()
  • - *
  • .../text()
  • - *
  • .../@*
  • - *
  • .../@name
  • - *
  • .../*...
  • - *
  • .../name...
  • - *
  • ...//*...
  • - *
  • ...//name...
  • + *
  • .../node() + *
  • .../text() + *
  • .../@* + *
  • .../@name + *
  • .../*... + *
  • .../name... + *
  • ...//*... + *
  • ...//name... *
- *

- * In addition the non-abbreviated .../descendant::node() - * construct can be used for cases where the descendant-or-self axis - * used by the ...//node() construct is not appropriate. + * + *

In addition the non-abbreviated .../descendant::node() construct can be used for + * cases where the descendant-or-self axis used by the ...//node() construct is not + * appropriate. */ public class XPathParser { private final Map prefixes = new HashMap<>(); - public XPathParser() { - } + public XPathParser() {} public XPathParser(String prefix, String namespace) { addPrefix(prefix, namespace); @@ -53,9 +53,9 @@ public void addPrefix(String prefix, String namespace) { } /** - * Parses the given simple XPath expression to an evaluation state - * initialized at the document node. Invalid expressions are not flagged - * as errors, they just result in a failing evaluation state. + * Parses the given simple XPath expression to an evaluation state initialized at the document + * node. Invalid expressions are not flagged as errors, they just result in a failing evaluation + * state. * * @param xpath simple XPath expression * @return XPath evaluation state @@ -65,9 +65,10 @@ public Matcher parse(String xpath) { return TextMatcher.INSTANCE; } else if (xpath.equals("/node()")) { return NodeMatcher.INSTANCE; - } else if (xpath.equals("/descendant::node()") || - xpath.equals("/descendant:node()")) { // for compatibility - return new CompositeMatcher(TextMatcher.INSTANCE, + } else if (xpath.equals("/descendant::node()") + || xpath.equals("/descendant:node()")) { // for compatibility + return new CompositeMatcher( + TextMatcher.INSTANCE, new ChildMatcher(new SubtreeMatcher(NodeMatcher.INSTANCE))); } else if (xpath.equals("/@*")) { return AttributeMatcher.INSTANCE; @@ -105,8 +106,8 @@ public Matcher parse(String xpath) { name = name.substring(colon + 1); } if (prefixes.containsKey(prefix)) { - return new NamedElementMatcher(prefixes.get(prefix), name, - parse(xpath.substring(slash))); + return new NamedElementMatcher( + prefixes.get(prefix), name, parse(xpath.substring(slash))); } else { return Matcher.FAIL; } @@ -114,5 +115,4 @@ public Matcher parse(String xpath) { return 
Matcher.FAIL; } } - } diff --git a/tika-core/src/main/java/org/apache/tika/sax/xpath/package-info.java b/tika-core/src/main/java/org/apache/tika/sax/xpath/package-info.java index f9c1801bd6..2104b97c27 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/xpath/package-info.java +++ b/tika-core/src/main/java/org/apache/tika/sax/xpath/package-info.java @@ -15,8 +15,6 @@ * limitations under the License. */ -/** - * XPath utilities - */ +/** XPath utilities */ @aQute.bnd.annotation.Version("1.0.0") package org.apache.tika.sax.xpath; diff --git a/tika-core/src/main/java/org/apache/tika/utils/AnnotationUtils.java b/tika-core/src/main/java/org/apache/tika/utils/AnnotationUtils.java index b3b8264b41..e2fcc50e4b 100644 --- a/tika-core/src/main/java/org/apache/tika/utils/AnnotationUtils.java +++ b/tika-core/src/main/java/org/apache/tika/utils/AnnotationUtils.java @@ -25,15 +25,13 @@ import java.util.List; import java.util.Locale; import java.util.Map; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - import org.apache.tika.config.Field; import org.apache.tika.config.Param; import org.apache.tika.config.ParamField; import org.apache.tika.config.TikaConfig; import org.apache.tika.exception.TikaConfigException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * This class contains utilities for dealing with tika annotations @@ -43,25 +41,23 @@ public class AnnotationUtils { private static final Logger LOG = LoggerFactory.getLogger(AnnotationUtils.class); - /** - * Cache for annotations for Bean classes which have {@link Field} - */ + /** Cache for annotations for Bean classes which have {@link Field} */ private static final Map, List> PARAM_INFO = new HashMap<>(); /** * Collects all the fields and methods for an annotation * - * @param clazz bean class with annotations + * @param clazz bean class with annotations * @param annotation annotation class * @return list of accessible objects such as fields and methods */ - private static List 
collectInfo(Class clazz, - Class annotation) { + private static List collectInfo( + Class clazz, Class annotation) { Class superClazz = clazz; List members = new ArrayList<>(); List annotatedMembers = new ArrayList<>(); - //walk through the inheritance chain + // walk through the inheritance chain while (superClazz != null && superClazz != Object.class) { members.addAll(Arrays.asList(superClazz.getDeclaredFields())); members.addAll(Arrays.asList(superClazz.getDeclaredMethods())); @@ -109,26 +105,39 @@ public static void assignFieldParams(Object bean, Map params) try { field.assignValue(bean, param.getValue()); } catch (InvocationTargetException e) { - LOG.error("Error assigning value '{}' to '{}'", param.getValue(), param.getName()); + LOG.error( + "Error assigning value '{}' to '{}'", + param.getValue(), + param.getName()); final Throwable cause = e.getCause() == null ? e : e.getCause(); throw new TikaConfigException(cause.getMessage(), cause); } catch (IllegalAccessException e) { - LOG.error("Error assigning value '{}' to '{}'", param.getValue(), param.getName()); + LOG.error( + "Error assigning value '{}' to '{}'", + param.getValue(), + param.getName()); throw new TikaConfigException(e.getMessage(), e); } } else { - String msg = String.format(Locale.ROOT, - "Value '%s' of type '%s' can't be" + - " assigned to field '%s' of defined type '%s'", - param.getValue(), - param.getValue().getClass(), field.getName(), field.getType()); + String msg = + String.format( + Locale.ROOT, + "Value '%s' of type '%s' can't be" + + " assigned to field '%s' of defined type '%s'", + param.getValue(), + param.getValue().getClass(), + field.getName(), + field.getType()); throw new TikaConfigException(msg); } } else if (field.isRequired()) { - //param not supplied but field is declared as required? 
- String msg = String.format(Locale.ROOT, - "Param %s is required for %s," + " but it is not given in config.", - field.getName(), bean.getClass().getName()); + // param not supplied but field is declared as required? + String msg = + String.format( + Locale.ROOT, + "Param %s is required for %s," + " but it is not given in config.", + field.getName(), + bean.getClass().getName()); throw new TikaConfigException(msg); } else { LOG.debug("Param not supplied, field is not mandatory"); diff --git a/tika-core/src/main/java/org/apache/tika/utils/CharsetUtils.java b/tika-core/src/main/java/org/apache/tika/utils/CharsetUtils.java index 5177752100..5f94141a53 100644 --- a/tika-core/src/main/java/org/apache/tika/utils/CharsetUtils.java +++ b/tika-core/src/main/java/org/apache/tika/utils/CharsetUtils.java @@ -44,12 +44,39 @@ public class CharsetUtils { private static Method isSupportedICU = null; static { - initCommonCharsets("Big5", "EUC-JP", "EUC-KR", "x-EUC-TW", "GB18030", "IBM855", "IBM866", - "ISO-2022-CN", "ISO-2022-JP", "ISO-2022-KR", "ISO-8859-1", "ISO-8859-2", - "ISO-8859-3", "ISO-8859-4", "ISO-8859-5", "ISO-8859-6", "ISO-8859-7", "ISO-8859-8", - "ISO-8859-9", "ISO-8859-11", "ISO-8859-13", "ISO-8859-15", "KOI8-R", - "x-MacCyrillic", "SHIFT_JIS", "UTF-8", "UTF-16BE", "UTF-16LE", "windows-1251", - "windows-1252", "windows-1253", "windows-1255"); + initCommonCharsets( + "Big5", + "EUC-JP", + "EUC-KR", + "x-EUC-TW", + "GB18030", + "IBM855", + "IBM866", + "ISO-2022-CN", + "ISO-2022-JP", + "ISO-2022-KR", + "ISO-8859-1", + "ISO-8859-2", + "ISO-8859-3", + "ISO-8859-4", + "ISO-8859-5", + "ISO-8859-6", + "ISO-8859-7", + "ISO-8859-8", + "ISO-8859-9", + "ISO-8859-11", + "ISO-8859-13", + "ISO-8859-15", + "KOI8-R", + "x-MacCyrillic", + "SHIFT_JIS", + "UTF-8", + "UTF-16BE", + "UTF-16LE", + "windows-1251", + "windows-1252", + "windows-1253", + "windows-1255"); // Common aliases/typos not included in standard charset definitions COMMON_CHARSETS.put("iso-8851-1", 
COMMON_CHARSETS.get("iso-8859-1")); @@ -62,7 +89,7 @@ public class CharsetUtils { icuCharset = CharsetUtils.class.getClassLoader().loadClass("com.ibm.icu.charset.CharsetICU"); } catch (ClassNotFoundException e) { - //swallow + // swallow } if (icuCharset != null) { try { @@ -73,7 +100,7 @@ public class CharsetUtils { try { isSupportedICU = icuCharset.getMethod("isSupported", String.class); } catch (Throwable t) { - //swallow + // swallow } // TODO: would be nice to somehow log that we // successfully found ICU @@ -120,8 +147,8 @@ public static boolean isSupported(String charsetName) { } /** - * Handle various common charset name errors, and return something - * that will be considered valid (and is normalized) + * Handle various common charset name errors, and return something that will be considered valid + * (and is normalized) * * @param charsetName name of charset to process * @return potentially remapped/cleaned up version of charset name @@ -135,10 +162,9 @@ public static String clean(String charsetName) { } /** - * Returns Charset impl, if one exists. This method - * optionally uses ICU4J's CharsetICU.forNameICU, - * if it is found on the classpath, else only uses - * JDK's builtin Charset.forName. + * Returns Charset impl, if one exists. This method optionally uses ICU4J's + * CharsetICU.forNameICU, if it is found on the classpath, else only uses JDK's builtin + * Charset.forName. 
*/ public static Charset forName(String name) { if (name == null) { @@ -186,9 +212,10 @@ public static Charset forName(String name) { if (cs != null) { return cs; } - } catch (IllegalArgumentException | IllegalAccessException | - InvocationTargetException e) { - //ignore + } catch (IllegalArgumentException + | IllegalAccessException + | InvocationTargetException e) { + // ignore } } diff --git a/tika-core/src/main/java/org/apache/tika/utils/CompareUtils.java b/tika-core/src/main/java/org/apache/tika/utils/CompareUtils.java index a4da7772a5..514e389615 100644 --- a/tika-core/src/main/java/org/apache/tika/utils/CompareUtils.java +++ b/tika-core/src/main/java/org/apache/tika/utils/CompareUtils.java @@ -19,10 +19,9 @@ public class CompareUtils { /** - * Compare two classes by class names. - * If both classes are Tika's or both are not Tika's class, compare by name String. - * Otherwise one of these two class is Tika's class. - * Then the non-Tika's class comes before Tika's class. + * Compare two classes by class names. If both classes are Tika's or both are not Tika's class, + * compare by name String. Otherwise one of these two class is Tika's class. Then the non-Tika's + * class comes before Tika's class. * * @param o1 the object 1 to be compared * @param o2 the object 2 to be compared diff --git a/tika-core/src/main/java/org/apache/tika/utils/ConcurrentUtils.java b/tika-core/src/main/java/org/apache/tika/utils/ConcurrentUtils.java index 8720e74a9f..a26fef0041 100644 --- a/tika-core/src/main/java/org/apache/tika/utils/ConcurrentUtils.java +++ b/tika-core/src/main/java/org/apache/tika/utils/ConcurrentUtils.java @@ -19,7 +19,6 @@ import java.util.concurrent.ExecutorService; import java.util.concurrent.Future; import java.util.concurrent.FutureTask; - import org.apache.tika.parser.ParseContext; /** @@ -30,8 +29,8 @@ public class ConcurrentUtils { /** - * Execute a runnable using an ExecutorService from the ParseContext if possible. 
- * Otherwise fallback to individual threads. + * Execute a runnable using an ExecutorService from the ParseContext if possible. Otherwise + * fallback to individual threads. * * @param context * @param runnable diff --git a/tika-core/src/main/java/org/apache/tika/utils/DateUtils.java b/tika-core/src/main/java/org/apache/tika/utils/DateUtils.java index a6a68fef6e..ae937e5096 100644 --- a/tika-core/src/main/java/org/apache/tika/utils/DateUtils.java +++ b/tika-core/src/main/java/org/apache/tika/utils/DateUtils.java @@ -28,33 +28,28 @@ import java.util.Locale; import java.util.TimeZone; -/** - * Date related utility methods and constants - */ +/** Date related utility methods and constants */ public class DateUtils { /** - * The UTC time zone. Not sure if {@link TimeZone#getTimeZone(String)} - * understands "UTC" in all environments, but it'll fall back to GMT - * in such cases, which is in practice equivalent to UTC. + * The UTC time zone. Not sure if {@link TimeZone#getTimeZone(String)} understands "UTC" in all + * environments, but it'll fall back to GMT in such cases, which is in practice equivalent to + * UTC. */ public static final TimeZone UTC = TimeZone.getTimeZone("UTC"); /** - * Custom time zone used to interpret date values without a time - * component in a way that most likely falls within the same day - * regardless of in which time zone it is later interpreted. For - * example, the "2012-02-17" date would map to "2012-02-17T12:00:00Z" - * (instead of the default "2012-02-17T00:00:00Z"), which would still - * map to "2012-02-17" if interpreted in say Pacific time (while the - * default mapping would result in "2012-02-16" for UTC-8). + * Custom time zone used to interpret date values without a time component in a way that most + * likely falls within the same day regardless of in which time zone it is later interpreted. 
+ * For example, the "2012-02-17" date would map to "2012-02-17T12:00:00Z" (instead of the + * default "2012-02-17T00:00:00Z"), which would still map to "2012-02-17" if interpreted in say + * Pacific time (while the default mapping would result in "2012-02-16" for UTC-8). */ public static final TimeZone MIDDAY = TimeZone.getTimeZone("GMT-12:00"); + /** - * So we can return Date objects for these, this is the - * list (in preference order) of the various ISO-8601 - * variants that we try when processing a date based - * property. + * So we can return Date objects for these, this is the list (in preference order) of the + * various ISO-8601 variants that we try when processing a date based property. */ private final List iso8601InputFormats = loadDateFormats(); @@ -67,8 +62,8 @@ private static DateFormat createDateFormat(String format, TimeZone timezone) { } /** - * Returns a ISO 8601 representation of the given date in UTC, - * truncated to the seconds unit. This method is thread safe and non-blocking. + * Returns a ISO 8601 representation of the given date in UTC, truncated to the seconds unit. + * This method is thread safe and non-blocking. * * @param date given date * @return ISO 8601 date string in UTC, truncated to the seconds unit @@ -81,8 +76,8 @@ public static String formatDate(Date date) { } /** - * Returns a ISO 8601 representation of the given date in UTC, - * truncated to the seconds unit. This method is thread safe and non-blocking. + * Returns a ISO 8601 representation of the given date in UTC, truncated to the seconds unit. + * This method is thread safe and non-blocking. * * @param date given Calendar * @return ISO 8601 date string in UTC, truncated to the seconds unit @@ -91,15 +86,15 @@ public static String formatDate(Date date) { public static String formatDate(Calendar date) { return doFormatDate(date); } + /** - * Returns a ISO 8601 representation of the given date in UTC, - * truncated to the seconds unit. 
This method is thread safe and non-blocking. + * Returns a ISO 8601 representation of the given date in UTC, truncated to the seconds unit. + * This method is thread safe and non-blocking. * * @param date given date * @return ISO 8601 date string in UTC, truncated to the seconds unit * @see TIKA-495 */ - public static String formatDateUnknownTimezone(Date date) { // Create the Calendar object in the system timezone Calendar calendar = GregorianCalendar.getInstance(TimeZone.getDefault(), Locale.US); @@ -110,9 +105,9 @@ public static String formatDateUnknownTimezone(Date date) { return formatted.substring(0, formatted.length() - 1); } - /** * Returns ISO-8601 formatted time converted to UTC, truncated to the seconds place + * * @param calendar * @return */ @@ -123,26 +118,25 @@ private static String doFormatDate(Calendar calendar) { private List loadDateFormats() { List dateFormats = new ArrayList<>(); // yyyy-mm-ddThh... - dateFormats.add(createDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'", UTC)); // UTC/Zulu - dateFormats.add(createDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", null)); // With timezone - dateFormats.add(createDateFormat("yyyy-MM-dd'T'HH:mm:ss", null)); // Without timezone + dateFormats.add(createDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'", UTC)); // UTC/Zulu + dateFormats.add(createDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", null)); // With timezone + dateFormats.add(createDateFormat("yyyy-MM-dd'T'HH:mm:ss", null)); // Without timezone // yyyy-mm-dd hh... 
- dateFormats.add(createDateFormat("yyyy-MM-dd' 'HH:mm:ss'Z'", UTC)); // UTC/Zulu - dateFormats.add(createDateFormat("yyyy-MM-dd' 'HH:mm:ssZ", null)); // With timezone - dateFormats.add(createDateFormat("yyyy-MM-dd' 'HH:mm:ss", null)); // Without timezone + dateFormats.add(createDateFormat("yyyy-MM-dd' 'HH:mm:ss'Z'", UTC)); // UTC/Zulu + dateFormats.add(createDateFormat("yyyy-MM-dd' 'HH:mm:ssZ", null)); // With timezone + dateFormats.add(createDateFormat("yyyy-MM-dd' 'HH:mm:ss", null)); // Without timezone // Date without time, set to Midday UTC - dateFormats.add(createDateFormat("yyyy-MM-dd", MIDDAY)); // Normal date format - dateFormats.add(createDateFormat("yyyy:MM:dd", - MIDDAY)); // Image (IPTC/EXIF) format + dateFormats.add(createDateFormat("yyyy-MM-dd", MIDDAY)); // Normal date format + dateFormats.add(createDateFormat("yyyy:MM:dd", MIDDAY)); // Image (IPTC/EXIF) format return dateFormats; } /** * Tries to parse the date string; returns null if no parse was possible. - *

- * This is not thread safe! Wrap in synchronized or create new {@link DateUtils} - * for each class. + * + *

This is not thread safe! Wrap in synchronized or create new {@link DateUtils} for each + * class. * * @param dateString * @return @@ -151,8 +145,8 @@ public Date tryToParse(String dateString) { // Java doesn't like timezones in the form ss+hh:mm // It only likes the hhmm form, without the colon int n = dateString.length(); - if (dateString.charAt(n - 3) == ':' && - (dateString.charAt(n - 6) == '+' || dateString.charAt(n - 6) == '-')) { + if (dateString.charAt(n - 3) == ':' + && (dateString.charAt(n - 6) == '+' || dateString.charAt(n - 6) == '-')) { dateString = dateString.substring(0, n - 3) + dateString.substring(n - 2); } @@ -160,7 +154,7 @@ public Date tryToParse(String dateString) { try { return df.parse(dateString); } catch (java.text.ParseException e) { - //swallow + // swallow } } return null; diff --git a/tika-core/src/main/java/org/apache/tika/utils/ExceptionUtils.java b/tika-core/src/main/java/org/apache/tika/utils/ExceptionUtils.java index 8f071e2569..dd76ed1c9d 100644 --- a/tika-core/src/main/java/org/apache/tika/utils/ExceptionUtils.java +++ b/tika-core/src/main/java/org/apache/tika/utils/ExceptionUtils.java @@ -16,27 +16,25 @@ */ package org.apache.tika.utils; - import java.io.IOException; import java.io.PrintWriter; import java.io.StringWriter; import java.io.Writer; import java.util.regex.Matcher; import java.util.regex.Pattern; - import org.apache.tika.exception.TikaException; public class ExceptionUtils { - private final static Pattern MSG_PATTERN = Pattern.compile(":[^\r\n]+"); + private static final Pattern MSG_PATTERN = Pattern.compile(":[^\r\n]+"); /** * Simple util to get stack trace. - *

- * This will unwrap a TikaException and return the cause if not null - *

- * NOTE: If your stacktraces are truncated, make sure to start your jvm - * with: -XX:-OmitStackTraceInFastThrow + * + *

This will unwrap a TikaException and return the cause if not null + * + *

NOTE: If your stacktraces are truncated, make sure to start your jvm with: + * -XX:-OmitStackTraceInFastThrow * * @param t throwable * @return @@ -66,17 +64,16 @@ public static String getStackTrace(Throwable t) { writer.close(); result.close(); } catch (IOException e) { - //swallow + // swallow } return result.toString(); } /** - * Utility method to trim the message from a stack trace - * string. - *

- * E.g. java.lang.IllegalStateException: Potential loop detected - * will be trimmed to java.lang.IllegalStateException + * Utility method to trim the message from a stack trace string. + * + *

E.g. java.lang.IllegalStateException: Potential loop detected will be + * trimmed to java.lang.IllegalStateException * * @param trace string view of stack trace * @return trimmed stack trace diff --git a/tika-core/src/main/java/org/apache/tika/utils/FileProcessResult.java b/tika-core/src/main/java/org/apache/tika/utils/FileProcessResult.java index f08ca472c8..7b4a528e22 100644 --- a/tika-core/src/main/java/org/apache/tika/utils/FileProcessResult.java +++ b/tika-core/src/main/java/org/apache/tika/utils/FileProcessResult.java @@ -102,16 +102,27 @@ public void setStdoutTruncated(boolean stdoutTruncated) { @Override public String toString() { - return "FileProcessResult{" + - "stderr='" + stderr + '\'' + - ", stdout='" + stdout + '\'' + - ", exitValue=" + exitValue + - ", processTimeMillis=" + processTimeMillis + - ", isTimeout=" + isTimeout + - ", stdoutLength=" + stdoutLength + - ", stderrLength=" + stderrLength + - ", stderrTruncated=" + stderrTruncated + - ", stdoutTruncated=" + stdoutTruncated + - '}'; + return "FileProcessResult{" + + "stderr='" + + stderr + + '\'' + + ", stdout='" + + stdout + + '\'' + + ", exitValue=" + + exitValue + + ", processTimeMillis=" + + processTimeMillis + + ", isTimeout=" + + isTimeout + + ", stdoutLength=" + + stdoutLength + + ", stderrLength=" + + stderrLength + + ", stderrTruncated=" + + stderrTruncated + + ", stdoutTruncated=" + + stdoutTruncated + + '}'; } } diff --git a/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java b/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java index 837f762955..3838763e87 100644 --- a/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java +++ b/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java @@ -21,7 +21,6 @@ import java.io.IOException; import java.io.InputStream; import java.util.Arrays; - import org.apache.tika.io.TemporaryResources; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; @@ -30,18 +29,14 @@ import 
org.apache.tika.parser.Parser; import org.apache.tika.parser.ParserDecorator; -/** - * Helper util methods for Parsers themselves. - */ +/** Helper util methods for Parsers themselves. */ public class ParserUtils { - public final static Property EMBEDDED_PARSER = Property.internalText( - TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX + "embedded_parser"); - + public static final Property EMBEDDED_PARSER = + Property.internalText( + TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX + "embedded_parser"); - /** - * Does a deep clone of a Metadata object. - */ + /** Does a deep clone of a Metadata object. */ public static Metadata cloneMetadata(Metadata m) { Metadata clone = new Metadata(); @@ -59,8 +54,8 @@ public static Metadata cloneMetadata(Metadata m) { } /** - * Identifies the real class name of the {@link Parser}, unwrapping - * any {@link ParserDecorator} decorations on top of it. + * Identifies the real class name of the {@link Parser}, unwrapping any {@link ParserDecorator} + * decorations on top of it. */ public static String getParserClassname(Parser parser) { if (parser instanceof ParserDecorator) { @@ -71,9 +66,8 @@ public static String getParserClassname(Parser parser) { } /** - * Records details of the {@link Parser} used to the {@link Metadata}, - * typically wanted where multiple parsers could be picked between - * or used. + * Records details of the {@link Parser} used to the {@link Metadata}, typically wanted where + * multiple parsers could be picked between or used. */ public static void recordParserDetails(Parser parser, Metadata metadata) { String className = getParserClassname(parser); @@ -81,24 +75,22 @@ public static void recordParserDetails(Parser parser, Metadata metadata) { } /** - * Records details of the {@link Parser} used to the {@link Metadata}, - * typically wanted where multiple parsers could be picked between - * or used. 
+ * Records details of the {@link Parser} used to the {@link Metadata}, typically wanted where + * multiple parsers could be picked between or used. */ public static void recordParserDetails(String parserClassName, Metadata metadata) { String[] parsedBys = metadata.getValues(TikaCoreProperties.TIKA_PARSED_BY); if (parsedBys == null || parsedBys.length == 0) { metadata.add(TikaCoreProperties.TIKA_PARSED_BY, parserClassName); } else if (Arrays.stream(parsedBys).noneMatch(parserClassName::equals)) { - //only add parser once + // only add parser once metadata.add(TikaCoreProperties.TIKA_PARSED_BY, parserClassName); } } /** - * Records details of a {@link Parser}'s failure to the - * {@link Metadata}, so you can check what went wrong even if the - * {@link Exception} wasn't immediately thrown (eg when several different + * Records details of a {@link Parser}'s failure to the {@link Metadata}, so you can check what + * went wrong even if the {@link Exception} wasn't immediately thrown (eg when several different * Parsers are used) */ public static void recordParserFailure(Parser parser, Throwable failure, Metadata metadata) { @@ -108,14 +100,12 @@ public static void recordParserFailure(Parser parser, Throwable failure, Metadat } /** - * Ensures that the Stream will be able to be re-read, by buffering to - * a temporary file if required. - * Streams that are automatically OK include {@link TikaInputStream}s - * created from Files or InputStreamFactories, and {@link RereadableInputStream}. + * Ensures that the Stream will be able to be re-read, by buffering to a temporary file if + * required. Streams that are automatically OK include {@link TikaInputStream}s created from + * Files or InputStreamFactories, and {@link RereadableInputStream}. 
*/ - public static InputStream ensureStreamReReadable(InputStream stream, TemporaryResources tmp, - Metadata metadata) - throws IOException { + public static InputStream ensureStreamReReadable( + InputStream stream, TemporaryResources tmp, Metadata metadata) throws IOException { // If it's re-readable, we're done if (stream instanceof RereadableInputStream) { return stream; @@ -140,9 +130,9 @@ public static InputStream ensureStreamReReadable(InputStream stream, TemporaryRe } /** - * Resets the given {@link TikaInputStream} (checked by - * {@link #ensureStreamReReadable(InputStream, TemporaryResources, Metadata)}) - * so that it can be re-read again. + * Resets the given {@link TikaInputStream} (checked by {@link + * #ensureStreamReReadable(InputStream, TemporaryResources, Metadata)}) so that it can be + * re-read again. */ public static InputStream streamResetForReRead(InputStream stream, TemporaryResources tmp) throws IOException { diff --git a/tika-core/src/main/java/org/apache/tika/utils/ProcessUtils.java b/tika-core/src/main/java/org/apache/tika/utils/ProcessUtils.java index 0120cac0ca..f5c287710d 100644 --- a/tika-core/src/main/java/org/apache/tika/utils/ProcessUtils.java +++ b/tika-core/src/main/java/org/apache/tika/utils/ProcessUtils.java @@ -16,7 +16,6 @@ */ package org.apache.tika.utils; - import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; @@ -26,13 +25,15 @@ public class ProcessUtils { - private static final ConcurrentHashMap PROCESS_MAP = new ConcurrentHashMap<>(); static { - Runtime.getRuntime().addShutdownHook(new Thread(() -> { - PROCESS_MAP.forEachValue(1, Process::destroyForcibly); - })); + Runtime.getRuntime() + .addShutdownHook( + new Thread( + () -> { + PROCESS_MAP.forEachValue(1, Process::destroyForcibly); + })); } private static String register(Process p) { @@ -46,9 +47,8 @@ private static Process release(String id) { } /** - * This should correctly put double-quotes around an argument if - * ProcessBuilder 
doesn't seem to work (as it doesn't - * on paths with spaces on Windows) + * This should correctly put double-quotes around an argument if ProcessBuilder doesn't seem to + * work (as it doesn't on paths with spaces on Windows) * * @param arg * @return @@ -57,18 +57,20 @@ public static String escapeCommandLine(String arg) { if (arg == null) { return arg; } - //need to test for " " on windows, can't just add double quotes - //across platforms. - if (arg.contains(" ") && SystemUtils.IS_OS_WINDOWS && - (!arg.startsWith("\"") && !arg.endsWith("\""))) { + // need to test for " " on windows, can't just add double quotes + // across platforms. + if (arg.contains(" ") + && SystemUtils.IS_OS_WINDOWS + && (!arg.startsWith("\"") && !arg.endsWith("\""))) { arg = "\"" + arg + "\""; } return arg; } public static String unescapeCommandLine(String arg) { - if (arg.contains(" ") && SystemUtils.IS_OS_WINDOWS && - (arg.startsWith("\"") && arg.endsWith("\""))) { + if (arg.contains(" ") + && SystemUtils.IS_OS_WINDOWS + && (arg.startsWith("\"") && arg.endsWith("\""))) { arg = arg.substring(1, arg.length() - 1); } return arg; @@ -84,9 +86,8 @@ public static String unescapeCommandLine(String arg) { * @return * @throws IOException */ - public static FileProcessResult execute(ProcessBuilder pb, - long timeoutMillis, - int maxStdoutBuffer, int maxStdErrBuffer) + public static FileProcessResult execute( + ProcessBuilder pb, long timeoutMillis, int maxStdoutBuffer, int maxStdErrBuffer) throws IOException { Process p = null; String id = null; @@ -121,7 +122,7 @@ public static FileProcessResult execute(ProcessBuilder pb, try { exitValue = p.exitValue(); } catch (IllegalThreadStateException e) { - //not finished! + // not finished! } } } @@ -135,7 +136,7 @@ public static FileProcessResult execute(ProcessBuilder pb, result.processTimeMillis = elapsed; result.stderrLength = errGobbler.getStreamLength(); result.stdoutLength = outGobbler.getStreamLength(); - result.isTimeout = ! 
complete; + result.isTimeout = !complete; result.exitValue = exitValue; result.stdout = StringUtils.joinWith("\n", outGobbler.getLines()); result.stderr = StringUtils.joinWith("\n", errGobbler.getLines()); @@ -162,9 +163,9 @@ public static FileProcessResult execute(ProcessBuilder pb, * @return * @throws IOException */ - public static FileProcessResult execute(ProcessBuilder pb, - long timeoutMillis, - Path stdoutRedirect, int maxStdErrBuffer) throws IOException { + public static FileProcessResult execute( + ProcessBuilder pb, long timeoutMillis, Path stdoutRedirect, int maxStdErrBuffer) + throws IOException { if (!Files.isDirectory(stdoutRedirect.getParent())) { Files.createDirectories(stdoutRedirect.getParent()); @@ -214,7 +215,5 @@ public static FileProcessResult execute(ProcessBuilder pb, } release(id); } - } - } diff --git a/tika-core/src/main/java/org/apache/tika/utils/RegexUtils.java b/tika-core/src/main/java/org/apache/tika/utils/RegexUtils.java index 70d04119b3..030b011f0f 100644 --- a/tika-core/src/main/java/org/apache/tika/utils/RegexUtils.java +++ b/tika-core/src/main/java/org/apache/tika/utils/RegexUtils.java @@ -22,22 +22,20 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; -/** - * Inspired from Nutch code class OutlinkExtractor. Apply regex to extract - * content - */ +/** Inspired from Nutch code class OutlinkExtractor. Apply regex to extract content */ public class RegexUtils { /** * Regex pattern to get URLs within a plain text. 
* * @see http://www.truerwords.net/articles/ut/urlactivation.html - * + * href="http://www.truerwords.net/articles/ut/urlactivation.html">http://www.truerwords.net/articles/ut/urlactivation.html + * */ - private static final String LINKS_REGEX = "([A-Za-z][A-Za-z0-9+.-]{1,120}:" + - "[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2}){1,333}" + - "(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]{0,1000}))?)"; + private static final String LINKS_REGEX = + "([A-Za-z][A-Za-z0-9+.-]{1,120}:" + + "[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2}){1,333}" + + "(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]{0,1000}))?)"; private static final Pattern LINKS_PATTERN = Pattern.compile(LINKS_REGEX, Pattern.CASE_INSENSITIVE + Pattern.MULTILINE); @@ -59,6 +57,5 @@ public static List extractLinks(String content) { extractions.add(matcher.group()); } return extractions; - } } diff --git a/tika-core/src/main/java/org/apache/tika/utils/RereadableInputStream.java b/tika-core/src/main/java/org/apache/tika/utils/RereadableInputStream.java index db38977b3a..4f53c7fb16 100644 --- a/tika-core/src/main/java/org/apache/tika/utils/RereadableInputStream.java +++ b/tika-core/src/main/java/org/apache/tika/utils/RereadableInputStream.java @@ -25,96 +25,75 @@ import java.io.InputStream; import java.io.OutputStream; import java.nio.file.Files; - import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream; /** - * Wraps an input stream, reading it only once, but making it available - * for rereading an arbitrary number of times. The stream's bytes are - * stored in memory up to a user specified maximum, and then stored in a - * temporary file which is deleted when this class's close() method is called. + * Wraps an input stream, reading it only once, but making it available for rereading an arbitrary + * number of times. 
The stream's bytes are stored in memory up to a user specified maximum, and then + * stored in a temporary file which is deleted when this class's close() method is called. */ public class RereadableInputStream extends InputStream { - /** - * Default value for buffer size = 500M - */ + /** Default value for buffer size = 500M */ private static final int DEFAULT_MAX_BYTES_IN_MEMORY = 512 * 1024 * 1024; - - /** - * Input stream originally passed to the constructor. - */ + /** Input stream originally passed to the constructor. */ private final InputStream originalInputStream; /** - * The inputStream currently being used by this object to read contents; - * may be the original stream passed in, or a stream that reads - * the saved copy from a memory buffer or file. + * The inputStream currently being used by this object to read contents; may be the original + * stream passed in, or a stream that reads the saved copy from a memory buffer or file. */ private InputStream inputStream; /** - * Maximum number of bytes that can be stored in memory before - * storage will be moved to a temporary file. + * Maximum number of bytes that can be stored in memory before storage will be moved to a + * temporary file. */ private final int maxBytesInMemory; /** - * Whether or not we are currently reading from the byte buffer in memory - * Bytes are read until we've exhausted the buffered bytes and then we proceed to read from - * the original input stream. If the numbers of bytes read from the original stream - * eventually exceed maxBytesInMemory, then we'll switch to reading from a file. + * Whether or not we are currently reading from the byte buffer in memory Bytes are read until + * we've exhausted the buffered bytes and then we proceed to read from the original input + * stream. If the numbers of bytes read from the original stream eventually exceed + * maxBytesInMemory, then we'll switch to reading from a file. 
*/ private boolean readingFromBuffer; - /** - * The buffer used to store the stream's content; this storage is moved - * to a file when the stored data's size exceeds maxBytesInMemory. - * Set to null once we start writing to a file. + * The buffer used to store the stream's content; this storage is moved to a file when the + * stored data's size exceeds maxBytesInMemory. Set to null once we start writing to a file. */ private byte[] byteBuffer; - /** - * The current pointer when reading from memory - */ + /** The current pointer when reading from memory */ private int bufferPointer; - /** - * Maximum size of the buffer that was written in previous pass(s) - */ + /** Maximum size of the buffer that was written in previous pass(s) */ private int bufferHighWaterMark; /** - * File used to store the stream's contents; is null until the stored - * content's size exceeds maxBytesInMemory. + * File used to store the stream's contents; is null until the stored content's size exceeds + * maxBytesInMemory. */ private File storeFile; - /** - * Specifies whether the stream has been closed - */ + /** Specifies whether the stream has been closed */ private boolean closed; - /** - * OutputStream used to save the content of the input stream in a - * temporary file. - */ + /** OutputStream used to save the content of the input stream in a temporary file. */ private OutputStream storeOutputStream; - /** - * Specifies whether or not to close the original input stream - * when close() is called. Defaults to true. + * Specifies whether or not to close the original input stream when close() is called. Defaults + * to true. 
*/ private final boolean closeOriginalStreamOnClose; - /** - * Creates a rereadable input stream with defaults of 512*1024*1024 bytes (500M) for - * maxBytesInMemory and both readToEndOfStreamOnFirstRewind and closeOriginalStreamOnClose - * set to true + * Creates a rereadable input stream with defaults of 512*1024*1024 bytes (500M) for + * maxBytesInMemory and both readToEndOfStreamOnFirstRewind and closeOriginalStreamOnClose set + * to true * * @param inputStream stream containing the source of data */ @@ -133,16 +112,14 @@ public RereadableInputStream(InputStream inputStream, boolean closeOriginalStrea } /** - * Creates a rereadable input stream with closeOriginalStreamOnClose set to true + * Creates a rereadable input stream with closeOriginalStreamOnClose set to true * - * @param inputStream stream containing the source of data - * @param maxBytesInMemory maximum number of bytes to use to store - * the stream's contents in memory before switching to disk; note that - * the instance will preallocate a byte array whose size is - * maxBytesInMemory. This byte array will be made available for - * garbage collection (i.e. its reference set to null) when the - * content size exceeds the array's size, when close() is called, or - * when there are no more references to the instance. + * @param inputStream stream containing the source of data + * @param maxBytesInMemory maximum number of bytes to use to store the stream's contents in + * memory before switching to disk; note that the instance will preallocate a byte array + * whose size is maxBytesInMemory. This byte array will be made available for garbage + * collection (i.e. its reference set to null) when the content size exceeds the array's + * size, when close() is called, or when there are no more references to the instance. 
*/ public RereadableInputStream(InputStream inputStream, int maxBytesInMemory) { this(inputStream, maxBytesInMemory, true); @@ -151,17 +128,15 @@ public RereadableInputStream(InputStream inputStream, int maxBytesInMemory) { /** * Creates a rereadable input stream. * - * @param inputStream stream containing the source of data - * @param maxBytesInMemory maximum number of bytes to use to store - * the stream's contents in memory before switching to disk; note that - * the instance will preallocate a byte array whose size is - * maxBytesInMemory. This byte array will be made available for - * garbage collection (i.e. its reference set to null) when the - * content size exceeds the array's size, when close() is called, or - * when there are no more references to the instance. + * @param inputStream stream containing the source of data + * @param maxBytesInMemory maximum number of bytes to use to store the stream's contents in + * memory before switching to disk; note that the instance will preallocate a byte array + * whose size is maxBytesInMemory. This byte array will be made available for garbage + * collection (i.e. its reference set to null) when the content size exceeds the array's + * size, when close() is called, or when there are no more references to the instance. */ - public RereadableInputStream(InputStream inputStream, int maxBytesInMemory, - boolean closeOriginalStreamOnClose) { + public RereadableInputStream( + InputStream inputStream, int maxBytesInMemory, boolean closeOriginalStreamOnClose) { this.inputStream = inputStream; this.originalInputStream = inputStream; this.maxBytesInMemory = maxBytesInMemory; @@ -170,9 +145,8 @@ public RereadableInputStream(InputStream inputStream, int maxBytesInMemory, } /** - * Reads a byte from the stream, saving it in the store if it is being - * read from the original stream. Implements the abstract - * InputStream.read(). 
+ * Reads a byte from the stream, saving it in the store if it is being read from the original + * stream. Implements the abstract InputStream.read(). * * @return the read byte, or -1 on end of stream. * @throws IOException @@ -188,9 +162,9 @@ public int read() throws IOException { // the next byte from there instead if (readingFromBuffer) { readingFromBuffer = false; - inputStream.close(); // Close the input byte stream + inputStream.close(); // Close the input byte stream } else { - inputStream.close(); // Close the input file stream + inputStream.close(); // Close the input file stream // start appending to the file storeOutputStream = new BufferedOutputStream(new FileOutputStream(storeFile, true)); } @@ -207,9 +181,7 @@ public int read() throws IOException { return inputByte; } - /** - * Saves the bytes read from the original stream to buffer or file - */ + /** Saves the bytes read from the original stream to buffer or file */ private void saveByte(int inputByte) throws IOException { if (byteBuffer != null) { if (bufferPointer == maxBytesInMemory) { @@ -257,7 +229,8 @@ public void rewind() throws IOException { // If we have a buffer, then we'll read from it if (byteBuffer != null) { readingFromBuffer = true; - inputStream = new UnsynchronizedByteArrayInputStream(byteBuffer, 0, bufferHighWaterMark); + inputStream = + new UnsynchronizedByteArrayInputStream(byteBuffer, 0, bufferHighWaterMark); } else { // No buffer, which means we've switched to a file inputStream = new BufferedInputStream(new FileInputStream(storeFile)); @@ -268,8 +241,8 @@ public void rewind() throws IOException { } /** - * Closes the input stream currently used for reading (may either be - * the original stream or a memory or file stream after the first pass). + * Closes the input stream currently used for reading (may either be the original stream or a + * memory or file stream after the first pass). 
* * @throws IOException */ @@ -285,8 +258,7 @@ private void closeStream() throws IOException { } /** - * Closes the input stream and removes the temporary file if one was - * created. + * Closes the input stream and removes the temporary file if one was created. * * @throws IOException */ diff --git a/tika-core/src/main/java/org/apache/tika/utils/ServiceLoaderUtils.java b/tika-core/src/main/java/org/apache/tika/utils/ServiceLoaderUtils.java index 1e61c97ae0..c255b41105 100644 --- a/tika-core/src/main/java/org/apache/tika/utils/ServiceLoaderUtils.java +++ b/tika-core/src/main/java/org/apache/tika/utils/ServiceLoaderUtils.java @@ -19,16 +19,13 @@ import java.lang.reflect.Constructor; import java.lang.reflect.InvocationTargetException; import java.util.List; - import org.apache.tika.config.ServiceLoader; -/** - * Service Loading and Ordering related utils - */ +/** Service Loading and Ordering related utils */ public class ServiceLoaderUtils { /** - * Sorts a list of loaded classes, so that non-Tika ones come - * before Tika ones, and otherwise in reverse alphabetical order + * Sorts a list of loaded classes, so that non-Tika ones come before Tika ones, and otherwise in + * reverse alphabetical order */ public static void sortLoadedClasses(List loaded) { loaded.sort(CompareUtils::compareClassName); @@ -38,7 +35,7 @@ public static void sortLoadedClasses(List loaded) { * Loads a class and instantiates it * * @param className service class name - * @param service type + * @param service type * @return instance of service */ public static T newInstance(String className) { @@ -49,27 +46,31 @@ public static T newInstance(String className) { * Loads a class and instantiates it * * @param className service class name - * @param loader class loader - * @param service type + * @param loader class loader + * @param service type * @return instance of service */ public static T newInstance(String className, ClassLoader loader) { try { - return ((Class) Class.forName(className, 
true, loader)).getDeclaredConstructor().newInstance(); - } catch (ClassNotFoundException | InstantiationException | IllegalAccessException | - NoSuchMethodException | InvocationTargetException e) { + return ((Class) Class.forName(className, true, loader)) + .getDeclaredConstructor() + .newInstance(); + } catch (ClassNotFoundException + | InstantiationException + | IllegalAccessException + | NoSuchMethodException + | InvocationTargetException e) { throw new RuntimeException(e); } } /** - * Loads a class and instantiates it. If the class can be initialized - * with a ServiceLoader, the ServiceLoader constructor is used. - * Otherwise, a zero arg newInstance() is called. + * Loads a class and instantiates it. If the class can be initialized with a ServiceLoader, the + * ServiceLoader constructor is used. Otherwise, a zero arg newInstance() is called. * - * @param klass class to build - * @param loader service loader - * @param service type + * @param klass class to build + * @param loader service loader + * @param service type * @return instance of service */ public static T newInstance(Class klass, ServiceLoader loader) { @@ -78,12 +79,14 @@ public static T newInstance(Class klass, ServiceLoader loader) { Constructor constructor = klass.getDeclaredConstructor(ServiceLoader.class); return constructor.newInstance(loader); } catch (NoSuchMethodException e) { - return (T)klass.getDeclaredConstructor().newInstance(); + return (T) klass.getDeclaredConstructor().newInstance(); } catch (InvocationTargetException e) { throw new RuntimeException(e); } - } catch (InstantiationException | IllegalAccessException | NoSuchMethodException | - InvocationTargetException e) { + } catch (InstantiationException + | IllegalAccessException + | NoSuchMethodException + | InvocationTargetException e) { throw new RuntimeException(e); } } diff --git a/tika-core/src/main/java/org/apache/tika/utils/StreamGobbler.java b/tika-core/src/main/java/org/apache/tika/utils/StreamGobbler.java index 
effbeb2c90..266f312e50 100644 --- a/tika-core/src/main/java/org/apache/tika/utils/StreamGobbler.java +++ b/tika-core/src/main/java/org/apache/tika/utils/StreamGobbler.java @@ -27,7 +27,6 @@ public class StreamGobbler implements Runnable { - private final InputStream is; private final int maxBufferLength; List lines = new ArrayList<>(); @@ -39,12 +38,11 @@ public StreamGobbler(InputStream is, int maxBufferLength) { this.maxBufferLength = maxBufferLength; } - @Override public void run() { - try (BufferedReader r = new BufferedReader( - new InputStreamReader(is, StandardCharsets.UTF_8))) { + try (BufferedReader r = + new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8))) { String line = r.readLine(); while (line != null) { if (maxBufferLength >= 0) { diff --git a/tika-core/src/main/java/org/apache/tika/utils/StringUtils.java b/tika-core/src/main/java/org/apache/tika/utils/StringUtils.java index 462cceb17d..8adb3d2468 100644 --- a/tika-core/src/main/java/org/apache/tika/utils/StringUtils.java +++ b/tika-core/src/main/java/org/apache/tika/utils/StringUtils.java @@ -45,9 +45,9 @@ public static boolean isBlank(final String s) { } /** - *

Left pad a String with a specified String.

+ * Left pad a String with a specified String. * - *

Pad to a size of {@code size}.

+ *

Pad to a size of {@code size}. * *

      * StringUtils.leftPad(null, *, *)      = null
@@ -61,11 +61,11 @@ public static boolean isBlank(final String s) {
      * StringUtils.leftPad("bat", 5, "")    = "  bat"
      * 
* - * @param str the String to pad out, may be null - * @param size the size to pad to + * @param str the String to pad out, may be null + * @param size the size to pad to * @param padStr the String to pad with, null or empty treated as single space - * @return left padded String or original String if no padding is necessary, - * {@code null} if null String input + * @return left padded String or original String if no padding is necessary, {@code null} if + * null String input */ public static String leftPad(final String str, final int size, String padStr) { if (str == null) { @@ -98,7 +98,6 @@ public static String leftPad(final String str, final int size, String padStr) { } } - public static String leftPad(final String str, final int size, final char padChar) { if (str == null) { return null; @@ -114,8 +113,7 @@ public static String leftPad(final String str, final int size, final char padCha } /** - *

Returns padding using the specified delimiter repeated - * to a given length.

+ * Returns padding using the specified delimiter repeated to a given length. * *
      * StringUtils.repeat('e', 0)  = ""
@@ -123,14 +121,13 @@ public static String leftPad(final String str, final int size, final char padCha
      * StringUtils.repeat('e', -2) = ""
      * 
* - *

Note: this method does not support padding with - * Unicode Supplementary Characters - * as they require a pair of {@code char}s to be represented. - * If you are needing to support full I18N of your applications - * consider using {@link #repeat(String, int)} instead. - *

+ *

Note: this method does not support padding with Unicode Supplementary + * Characters as they require a pair of {@code char}s to be represented. If you are needing + * to support full I18N of your applications consider using {@link #repeat(String, int)} + * instead. * - * @param ch character to repeat + * @param ch character to repeat * @param repeat number of times to repeat char, negative treated as zero * @return String with repeated character * @see #repeat(String, int) @@ -147,11 +144,10 @@ public static String repeat(final char ch, final int repeat) { } // Padding - //----------------------------------------------------------------------- + // ----------------------------------------------------------------------- /** - *

Repeat a String {@code repeat} times to form a - * new String.

+ * Repeat a String {@code repeat} times to form a new String. * *
      * StringUtils.repeat(null, 2) = null
@@ -162,10 +158,10 @@ public static String repeat(final char ch, final int repeat) {
      * StringUtils.repeat("a", -2) = ""
      * 
* - * @param str the String to repeat, may be null + * @param str the String to repeat, may be null * @param repeat number of times to repeat str, negative treated as zero - * @return a new String consisting of the original String repeated, - * {@code null} if null String input + * @return a new String consisting of the original String repeated, {@code null} if null String + * input */ public static String repeat(final String str, final int repeat) { // Performance tuned for 2.0 (JDK1.4) @@ -206,7 +202,6 @@ public static String repeat(final String str, final int repeat) { } } - public static String joinWith(String delimiter, List lines) { if (lines.size() == 0) { return EMPTY; diff --git a/tika-core/src/main/java/org/apache/tika/utils/SystemUtils.java b/tika-core/src/main/java/org/apache/tika/utils/SystemUtils.java index 027b677fcc..7e75e586d4 100644 --- a/tika-core/src/main/java/org/apache/tika/utils/SystemUtils.java +++ b/tika-core/src/main/java/org/apache/tika/utils/SystemUtils.java @@ -16,9 +16,7 @@ */ package org.apache.tika.utils; -/** - * Copied from commons-lang to avoid requiring the dependency - */ +/** Copied from commons-lang to avoid requiring the dependency */ public class SystemUtils { public static final String OS_NAME = getSystemProperty("os.name"); @@ -40,8 +38,14 @@ public class SystemUtils { private static final String OS_VERSION_WSL = "WSL"; static { - IS_OS_UNIX = IS_OS_AIX || IS_OS_HP_UX || IS_OS_IRIX || IS_OS_LINUX || IS_OS_MAC_OSX || - IS_OS_SOLARIS || IS_OS_SUN_OS; + IS_OS_UNIX = + IS_OS_AIX + || IS_OS_HP_UX + || IS_OS_IRIX + || IS_OS_LINUX + || IS_OS_MAC_OSX + || IS_OS_SOLARIS + || IS_OS_SUN_OS; IS_OS_WINDOWS = getOSMatchesName(OS_NAME_WINDOWS_PREFIX); IS_OS_VERSION_WSL = getOSContainsVersion(OS_VERSION_WSL); } @@ -69,5 +73,4 @@ private static boolean getOSContainsVersion(String osVersionSearch) { static boolean doesOSVersionContain(String osVersion, String osVersionSearch) { return osVersion != null && 
osVersion.contains(osVersionSearch); } - } diff --git a/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java b/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java index 262ebfef97..4b9bfcad4c 100644 --- a/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java +++ b/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java @@ -42,7 +42,9 @@ import javax.xml.transform.TransformerConfigurationException; import javax.xml.transform.TransformerFactory; import javax.xml.transform.TransformerFactoryConfigurationError; - +import org.apache.tika.exception.TikaException; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.OfflineContentHandler; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.w3c.dom.Document; @@ -57,26 +59,17 @@ import org.xml.sax.XMLReader; import org.xml.sax.helpers.DefaultHandler; -import org.apache.tika.exception.TikaException; -import org.apache.tika.parser.ParseContext; -import org.apache.tika.sax.OfflineContentHandler; - - -/** - * Utility functions for reading XML. - */ +/** Utility functions for reading XML. 
*/ public class XMLReaderUtils implements Serializable { - /** - * Default size for the pool of SAX Parsers - * and the pool of DOM builders - */ + /** Default size for the pool of SAX Parsers and the pool of DOM builders */ public static final int DEFAULT_POOL_SIZE = 10; + public static final int DEFAULT_MAX_ENTITY_EXPANSIONS = 20; - /** - * Serial version UID - */ + + /** Serial version UID */ private static final long serialVersionUID = 6110455808615143122L; + private static final Logger LOG = LoggerFactory.getLogger(XMLReaderUtils.class); private static final String XERCES_SECURITY_MANAGER = "org.apache.xerces.util.SecurityManager"; private static final String XERCES_SECURITY_MANAGER_PROPERTY = @@ -84,37 +77,30 @@ public class XMLReaderUtils implements Serializable { private static final AtomicBoolean HAS_WARNED_STAX = new AtomicBoolean(false); private static final ContentHandler IGNORING_CONTENT_HANDLER = new DefaultHandler(); - private static final DTDHandler IGNORING_DTD_HANDLER = new DTDHandler() { - @Override - public void notationDecl(String name, String publicId, String systemId) - throws SAXException { - - } - - @Override - public void unparsedEntityDecl(String name, String publicId, String systemId, - String notationName) throws SAXException { - - } - }; - private static final ErrorHandler IGNORING_ERROR_HANDLER = new ErrorHandler() { - @Override - public void warning(SAXParseException exception) throws SAXException { - - } - - @Override - public void error(SAXParseException exception) throws SAXException { - - } - - @Override - public void fatalError(SAXParseException exception) throws SAXException { - - } - }; + private static final DTDHandler IGNORING_DTD_HANDLER = + new DTDHandler() { + @Override + public void notationDecl(String name, String publicId, String systemId) + throws SAXException {} + + @Override + public void unparsedEntityDecl( + String name, String publicId, String systemId, String notationName) + throws SAXException {} + }; + private 
static final ErrorHandler IGNORING_ERROR_HANDLER = + new ErrorHandler() { + @Override + public void warning(SAXParseException exception) throws SAXException {} + + @Override + public void error(SAXParseException exception) throws SAXException {} + + @Override + public void fatalError(SAXParseException exception) throws SAXException {} + }; private static final String JAXP_ENTITY_EXPANSION_LIMIT_KEY = "jdk.xml.entityExpansionLimit"; - //TODO: figure out if the rw lock is any better than a simple lock + // TODO: figure out if the rw lock is any better than a simple lock private static final ReentrantReadWriteLock SAX_READ_WRITE_LOCK = new ReentrantReadWriteLock(); private static final ReentrantReadWriteLock DOM_READ_WRITE_LOCK = new ReentrantReadWriteLock(); private static final AtomicInteger POOL_GENERATION = new AtomicInteger(); @@ -122,10 +108,10 @@ public void fatalError(SAXParseException exception) throws SAXException { (publicId, systemId) -> new InputSource(new StringReader("")); private static final XMLResolver IGNORING_STAX_ENTITY_RESOLVER = (publicID, systemID, baseURI, namespace) -> ""; - /** - * Parser pool size - */ + + /** Parser pool size */ private static int POOL_SIZE = DEFAULT_POOL_SIZE; + private static long LAST_LOG = -1; private static volatile int MAX_ENTITY_EXPANSIONS = determineMaxEntityExpansions(); private static ArrayBlockingQueue SAX_PARSERS = @@ -148,18 +134,18 @@ private static int determineMaxEntityExpansions() { return Integer.parseInt(expansionLimit); } catch (NumberFormatException e) { LOG.warn( - "Couldn't parse an integer for the entity expansion limit: {}; " + - "backing off to default: {}", - expansionLimit, DEFAULT_MAX_ENTITY_EXPANSIONS); + "Couldn't parse an integer for the entity expansion limit: {}; " + + "backing off to default: {}", + expansionLimit, + DEFAULT_MAX_ENTITY_EXPANSIONS); } } return DEFAULT_MAX_ENTITY_EXPANSIONS; } /** - * Returns the XMLReader specified in this parsing context. 
If a reader - * is not explicitly specified, then one is created using the specified - * or the default SAX parser. + * Returns the XMLReader specified in this parsing context. If a reader is not explicitly + * specified, then one is created using the specified or the default SAX parser. * * @return XMLReader * @throws TikaException @@ -178,13 +164,11 @@ public static XMLReader getXMLReader() throws TikaException { } /** - * Returns the SAX parser specified in this parsing context. If a parser - * is not explicitly specified, then one is created using the specified - * or the default SAX parser factory. - *

- * If you call reset() on the parser, make sure to replace the - * SecurityManager which will be cleared by xerces2 on reset(). - *

+ * Returns the SAX parser specified in this parsing context. If a parser is not explicitly + * specified, then one is created using the specified or the default SAX parser factory. + * + *

If you call reset() on the parser, make sure to replace the SecurityManager which will be + * cleared by xerces2 on reset(). * * @return SAX parser * @throws TikaException if a SAX parser could not be created @@ -204,11 +188,10 @@ public static SAXParser getSAXParser() throws TikaException { } /** - * Returns the SAX parser factory specified in this parsing context. - * If a factory is not explicitly specified, then a default factory - * instance is created and returned. The default factory instance is - * configured to be namespace-aware, not validating, and to use - * {@link XMLConstants#FEATURE_SECURE_PROCESSING secure XML processing}. + * Returns the SAX parser factory specified in this parsing context. If a factory is not + * explicitly specified, then a default factory instance is created and returned. The default + * factory instance is configured to be namespace-aware, not validating, and to use {@link + * XMLConstants#FEATURE_SECURE_PROCESSING secure XML processing}. * * @return SAX parser factory * @since Apache Tika 0.8 @@ -223,26 +206,25 @@ public static SAXParserFactory getSAXParserFactory() { trySetSAXFeature(factory, XMLConstants.FEATURE_SECURE_PROCESSING, true); trySetSAXFeature(factory, "http://xml.org/sax/features/external-general-entities", false); trySetSAXFeature(factory, "http://xml.org/sax/features/external-parameter-entities", false); - trySetSAXFeature(factory, "http://apache.org/xml/features/nonvalidating/load-external-dtd", - false); - trySetSAXFeature(factory, "http://apache.org/xml/features/nonvalidating/load-dtd-grammar", - false); + trySetSAXFeature( + factory, "http://apache.org/xml/features/nonvalidating/load-external-dtd", false); + trySetSAXFeature( + factory, "http://apache.org/xml/features/nonvalidating/load-dtd-grammar", false); return factory; } /** - * Returns the DOM builder factory specified in this parsing context. - * If a factory is not explicitly specified, then a default factory - * instance is created and returned. 
The default factory instance is - * configured to be namespace-aware and to apply reasonable security + * Returns the DOM builder factory specified in this parsing context. If a factory is not + * explicitly specified, then a default factory instance is created and returned. The default + * factory instance is configured to be namespace-aware and to apply reasonable security * features. * * @return DOM parser factory * @since Apache Tika 1.13 */ public static DocumentBuilderFactory getDocumentBuilderFactory() { - //borrowed from Apache POI + // borrowed from Apache POI DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); if (LOG.isDebugEnabled()) { LOG.debug("DocumentBuilderFactory class {}", factory.getClass()); @@ -255,20 +237,20 @@ public static DocumentBuilderFactory getDocumentBuilderFactory() { trySetSAXFeature(factory, XMLConstants.FEATURE_SECURE_PROCESSING, true); trySetSAXFeature(factory, "http://xml.org/sax/features/external-general-entities", false); trySetSAXFeature(factory, "http://xml.org/sax/features/external-parameter-entities", false); - trySetSAXFeature(factory, "http://apache.org/xml/features/nonvalidating/load-external-dtd", - false); - trySetSAXFeature(factory, "http://apache.org/xml/features/nonvalidating/load-dtd-grammar", - false); + trySetSAXFeature( + factory, "http://apache.org/xml/features/nonvalidating/load-external-dtd", false); + trySetSAXFeature( + factory, "http://apache.org/xml/features/nonvalidating/load-dtd-grammar", false); trySetXercesSecurityManager(factory); return factory; } /** - * Returns the DOM builder specified in this parsing context. - * If a builder is not explicitly specified, then a builder - * instance is created and returned. The builder instance is - * configured to apply an {@link #IGNORING_SAX_ENTITY_RESOLVER}, - * and it sets the ErrorHandler to null. + * Returns the DOM builder specified in this parsing context. 
If a builder is not explicitly + * specified, then a builder instance is created and returned. The builder instance is + * configured to apply an {@link #IGNORING_SAX_ENTITY_RESOLVER}, and it sets the ErrorHandler to + * null + * . * * @return DOM Builder * @since Apache Tika 1.13 @@ -286,11 +268,10 @@ public static DocumentBuilder getDocumentBuilder() throws TikaException { } /** - * Returns the StAX input factory specified in this parsing context. - * If a factory is not explicitly specified, then a default factory - * instance is created and returned. The default factory instance is - * configured to be namespace-aware and to apply reasonable security - * using the {@link #IGNORING_STAX_ENTITY_RESOLVER}. + * Returns the StAX input factory specified in this parsing context. If a factory is not + * explicitly specified, then a default factory instance is created and returned. The default + * factory instance is configured to be namespace-aware and to apply reasonable security using + * the {@link #IGNORING_STAX_ENTITY_RESOLVER}. 
* * @return StAX input factory * @since Apache Tika 1.13 @@ -309,8 +290,8 @@ public static XMLInputFactory getXMLInputFactory() { return factory; } - private static void trySetTransformerAttribute(TransformerFactory transformerFactory, - String attribute, String value) { + private static void trySetTransformerAttribute( + TransformerFactory transformerFactory, String attribute, String value) { try { transformerFactory.setAttribute(attribute, value); } catch (SecurityException e) { @@ -320,12 +301,13 @@ private static void trySetTransformerAttribute(TransformerFactory transformerFac } catch (AbstractMethodError ame) { LOG.warn( "Cannot set Transformer attribute because outdated XML parser in classpath: {}", - attribute, ame); + attribute, + ame); } } - private static void trySetSAXFeature(SAXParserFactory saxParserFactory, String feature, - boolean enabled) { + private static void trySetSAXFeature( + SAXParserFactory saxParserFactory, String feature, boolean enabled) { try { saxParserFactory.setFeature(feature, enabled); } catch (SecurityException e) { @@ -333,19 +315,23 @@ private static void trySetSAXFeature(SAXParserFactory saxParserFactory, String f } catch (Exception e) { LOG.warn("SAX Feature unsupported: {}", feature, e); } catch (AbstractMethodError ame) { - LOG.warn("Cannot set SAX feature because outdated XML parser in classpath: {}", feature, + LOG.warn( + "Cannot set SAX feature because outdated XML parser in classpath: {}", + feature, ame); } } - private static void trySetSAXFeature(DocumentBuilderFactory documentBuilderFactory, - String feature, boolean enabled) { + private static void trySetSAXFeature( + DocumentBuilderFactory documentBuilderFactory, String feature, boolean enabled) { try { documentBuilderFactory.setFeature(feature, enabled); } catch (Exception e) { LOG.warn("SAX Feature unsupported: {}", feature, e); } catch (AbstractMethodError ame) { - LOG.warn("Cannot set SAX feature because outdated XML parser in classpath: {}", feature, + 
LOG.warn( + "Cannot set SAX feature because outdated XML parser in classpath: {}", + feature, ame); } } @@ -360,9 +346,9 @@ private static void tryToSetStaxProperty(XMLInputFactory factory, String key, bo /** * Returns a new transformer - *

- * The transformer instance is configured to to use - * {@link XMLConstants#FEATURE_SECURE_PROCESSING secure XML processing}. + * + *

The transformer instance is configured to to use {@link + * XMLConstants#FEATURE_SECURE_PROCESSING secure XML processing}. * * @return Transformer * @throws TikaException when the transformer can not be created @@ -373,8 +359,8 @@ public static Transformer getTransformer() throws TikaException { TransformerFactory transformerFactory = TransformerFactory.newInstance(); transformerFactory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true); trySetTransformerAttribute(transformerFactory, XMLConstants.ACCESS_EXTERNAL_DTD, ""); - trySetTransformerAttribute(transformerFactory, XMLConstants.ACCESS_EXTERNAL_STYLESHEET, - ""); + trySetTransformerAttribute( + transformerFactory, XMLConstants.ACCESS_EXTERNAL_STYLESHEET, ""); return transformerFactory.newTransformer(); } catch (TransformerConfigurationException | TransformerFactoryConfigurationError e) { throw new TikaException("Transformer not available", e); @@ -382,10 +368,10 @@ public static Transformer getTransformer() throws TikaException { } /** - * This checks context for a user specified {@link DocumentBuilder}. - * If one is not found, this reuses a DocumentBuilder from the pool. + * This checks context for a user specified {@link DocumentBuilder}. If one is not found, this + * reuses a DocumentBuilder from the pool. * - * @param is InputStream to parse + * @param is InputStream to parse * @param context context to use * @return a document * @throws TikaException @@ -412,10 +398,10 @@ public static Document buildDOM(InputStream is, ParseContext context) } /** - * This checks context for a user specified {@link DocumentBuilder}. - * If one is not found, this reuses a DocumentBuilder from the pool. + * This checks context for a user specified {@link DocumentBuilder}. If one is not found, this + * reuses a DocumentBuilder from the pool. 
* - * @param reader reader (character stream) to parse + * @param reader reader (character stream) to parse * @param context context to use * @return a document * @throws TikaException @@ -497,14 +483,13 @@ public static Document buildDOM(InputStream is) } /** - * This checks context for a user specified {@link SAXParser}. - * If one is not found, this reuses a SAXParser from the pool. + * This checks context for a user specified {@link SAXParser}. If one is not found, this reuses + * a SAXParser from the pool. * - * @param is InputStream to parse - * @param contentHandler handler to use; this wraps a {@link OfflineContentHandler} - * to the content handler as an extra layer of defense against - * external entity vulnerabilities - * @param context context to use + * @param is InputStream to parse + * @param contentHandler handler to use; this wraps a {@link OfflineContentHandler} to the + * content handler as an extra layer of defense against external entity vulnerabilities + * @param context context to use * @return * @throws TikaException * @throws IOException @@ -529,14 +514,13 @@ public static void parseSAX(InputStream is, ContentHandler contentHandler, Parse } /** - * This checks context for a user specified {@link SAXParser}. - * If one is not found, this reuses a SAXParser from the pool. + * This checks context for a user specified {@link SAXParser}. If one is not found, this reuses + * a SAXParser from the pool. 
* - * @param reader reader (character stream) to parse - * @param contentHandler handler to use; this wraps a {@link OfflineContentHandler} - * to the content handler as an extra layer of defense against - * external entity vulnerabilities - * @param context context to use + * @param reader reader (character stream) to parse + * @param contentHandler handler to use; this wraps a {@link OfflineContentHandler} to the + * content handler as an extra layer of defense against external entity vulnerabilities + * @param context context to use * @return * @throws TikaException * @throws IOException @@ -561,8 +545,7 @@ public static void parseSAX(Reader reader, ContentHandler contentHandler, ParseC } /** - * Acquire a SAXParser from the pool. Make sure to - * {@link #releaseDOMBuilder(PoolDOMBuilder)} in + * Acquire a SAXParser from the pool. Make sure to {@link #releaseDOMBuilder(PoolDOMBuilder)} in * a finally block every time you call this. * * @return a DocumentBuilder @@ -585,22 +568,23 @@ private static PoolDOMBuilder acquireDOMBuilder() throws TikaException { return builder; } if (lastWarn < 0 || System.currentTimeMillis() - lastWarn > 1000) { - //avoid spamming logs - LOG.warn("Contention waiting for a DOMParser. " + - "Consider increasing the XMLReaderUtils.POOL_SIZE"); + // avoid spamming logs + LOG.warn( + "Contention waiting for a DOMParser. " + + "Consider increasing the XMLReaderUtils.POOL_SIZE"); lastWarn = System.currentTimeMillis(); } waiting++; if (waiting > 3000) { - //freshen the pool. Something went very wrong... + // freshen the pool. Something went very wrong... setPoolSize(POOL_SIZE); - //better to get an exception than have permahang by a bug in one of our parsers - throw new TikaException("Waited more than 5 minutes for a DocumentBuilder; " + - "This could indicate that a parser has not correctly released its " + - "DocumentBuilder. 
" + - "Please report this to the Tika team: dev@tika.apache.org"); - + // better to get an exception than have permahang by a bug in one of our parsers + throw new TikaException( + "Waited more than 5 minutes for a DocumentBuilder; " + + "This could indicate that a parser has not correctly released its " + + "DocumentBuilder. " + + "Please report this to the Tika team: dev@tika.apache.org"); } } } @@ -617,18 +601,18 @@ private static void releaseDOMBuilder(PoolDOMBuilder builder) { try { builder.reset(); } catch (UnsupportedOperationException e) { - //ignore + // ignore } DOM_READ_WRITE_LOCK.readLock().lock(); try { - //if there are extra parsers (e.g. after a reset of the pool to a smaller size), + // if there are extra parsers (e.g. after a reset of the pool to a smaller size), // this parser will not be added and will then be gc'd boolean success = DOM_BUILDERS.offer(builder); if (!success) { LOG.warn( - "DocumentBuilder not taken back into pool. If you haven't resized the " + - "pool, this could be a sign that there are more calls to " + - "'acquire' than to 'release'"); + "DocumentBuilder not taken back into pool. If you haven't resized the " + + "pool, this could be a sign that there are more calls to " + + "'acquire' than to 'release'"); } } finally { DOM_READ_WRITE_LOCK.readLock().unlock(); @@ -636,9 +620,8 @@ private static void releaseDOMBuilder(PoolDOMBuilder builder) { } /** - * Acquire a SAXParser from the pool. Make sure to - * {@link #releaseParser(PoolSAXParser)} in - * a finally block every time you call this. + * Acquire a SAXParser from the pool. Make sure to {@link #releaseParser(PoolSAXParser)} in a + * finally block every time you call this. * * @return a SAXParser * @throws TikaException @@ -660,20 +643,21 @@ private static PoolSAXParser acquireSAXParser() throws TikaException { return parser; } if (lastWarn < 0 || System.currentTimeMillis() - lastWarn > 1000) { - //avoid spamming logs - LOG.warn("Contention waiting for a SAXParser. 
" + - "Consider increasing the XMLReaderUtils.POOL_SIZE"); + // avoid spamming logs + LOG.warn( + "Contention waiting for a SAXParser. " + + "Consider increasing the XMLReaderUtils.POOL_SIZE"); lastWarn = System.currentTimeMillis(); } waiting++; if (waiting > 3000) { - //freshen the pool. Something went very wrong... + // freshen the pool. Something went very wrong... setPoolSize(POOL_SIZE); - //better to get an exception than have permahang by a bug in one of our parsers - throw new TikaException("Waited more than 5 minutes for a SAXParser; " + - "This could indicate that a parser has not correctly released its " + - "SAXParser. Please report this to the Tika team: dev@tika.apache.org"); - + // better to get an exception than have permahang by a bug in one of our parsers + throw new TikaException( + "Waited more than 5 minutes for a SAXParser; " + + "This could indicate that a parser has not correctly released its " + + "SAXParser. Please report this to the Tika team: dev@tika.apache.org"); } } } @@ -687,23 +671,23 @@ private static void releaseParser(PoolSAXParser parser) { try { parser.reset(); } catch (UnsupportedOperationException e) { - //TIKA-3009 -- we really shouldn't have to do this... :( + // TIKA-3009 -- we really shouldn't have to do this... :( } - //if this is a different generation, don't put it back - //in the pool + // if this is a different generation, don't put it back + // in the pool if (parser.getGeneration() != POOL_GENERATION.get()) { return; } SAX_READ_WRITE_LOCK.readLock().lock(); try { - //if there are extra parsers (e.g. after a reset of the pool to a smaller size), + // if there are extra parsers (e.g. after a reset of the pool to a smaller size), // this parser will not be added and will then be gc'd boolean success = SAX_PARSERS.offer(parser); if (!success) { LOG.warn( - "SAXParser not taken back into pool. 
If you haven't resized the pool " + - "this could be a sign that there are more calls to 'acquire' " + - "than to 'release'"); + "SAXParser not taken back into pool. If you haven't resized the pool " + + "this could be a sign that there are more calls to 'acquire' " + + "than to 'release'"); } } finally { SAX_READ_WRITE_LOCK.readLock().unlock(); @@ -711,28 +695,31 @@ private static void releaseParser(PoolSAXParser parser) { } private static void trySetXercesSecurityManager(DocumentBuilderFactory factory) { - //from POI + // from POI // Try built-in JVM one first, standalone if not - for (String securityManagerClassName : new String[]{ - //"com.sun.org.apache.xerces.internal.util.SecurityManager", - XERCES_SECURITY_MANAGER}) { + for (String securityManagerClassName : + new String[] { + // "com.sun.org.apache.xerces.internal.util.SecurityManager", + XERCES_SECURITY_MANAGER + }) { try { Object mgr = - Class.forName(securityManagerClassName).getDeclaredConstructor().newInstance(); - Method setLimit = mgr.getClass().getMethod("setEntityExpansionLimit", - Integer.TYPE); + Class.forName(securityManagerClassName) + .getDeclaredConstructor() + .newInstance(); + Method setLimit = mgr.getClass().getMethod("setEntityExpansionLimit", Integer.TYPE); setLimit.invoke(mgr, MAX_ENTITY_EXPANSIONS); factory.setAttribute(XERCES_SECURITY_MANAGER_PROPERTY, mgr); // Stop once one can be setup without error return; } catch (ClassNotFoundException e) { // continue without log, this is expected in some setups - } catch (Throwable e) { // NOSONAR - also catch things like NoClassDefError here + } catch (Throwable e) { // NOSONAR - also catch things like NoClassDefError here // throttle the log somewhat as it can spam the log otherwise if (System.currentTimeMillis() > LAST_LOG + TimeUnit.MINUTES.toMillis(5)) { LOG.warn( - "SAX Security Manager could not be setup [log suppressed for 5 " + - "minutes]", + "SAX Security Manager could not be setup [log suppressed for 5 " + + "minutes]", e); 
LAST_LOG = System.currentTimeMillis(); } @@ -741,13 +728,15 @@ private static void trySetXercesSecurityManager(DocumentBuilderFactory factory) // separate old version of Xerces not found => use the builtin way of setting the property try { - factory.setAttribute("http://www.oracle.com/xml/jaxp/properties/entityExpansionLimit", + factory.setAttribute( + "http://www.oracle.com/xml/jaxp/properties/entityExpansionLimit", MAX_ENTITY_EXPANSIONS); } catch (IllegalArgumentException e) { // NOSONAR - also catch things like NoClassDefError here // throttle the log somewhat as it can spam the log otherwise if (System.currentTimeMillis() > LAST_LOG + TimeUnit.MINUTES.toMillis(5)) { - LOG.warn("SAX Security Manager could not be setup [log suppressed for 5 minutes]", + LOG.warn( + "SAX Security Manager could not be setup [log suppressed for 5 minutes]", e); LAST_LOG = System.currentTimeMillis(); } @@ -755,14 +744,18 @@ private static void trySetXercesSecurityManager(DocumentBuilderFactory factory) } private static void trySetXercesSecurityManager(SAXParser parser) { - //from POI + // from POI // Try built-in JVM one first, standalone if not - for (String securityManagerClassName : new String[]{ - //"com.sun.org.apache.xerces.internal.util.SecurityManager", - XERCES_SECURITY_MANAGER}) { + for (String securityManagerClassName : + new String[] { + // "com.sun.org.apache.xerces.internal.util.SecurityManager", + XERCES_SECURITY_MANAGER + }) { try { Object mgr = - Class.forName(securityManagerClassName).getDeclaredConstructor().newInstance(); + Class.forName(securityManagerClassName) + .getDeclaredConstructor() + .newInstance(); Method setLimit = mgr.getClass().getMethod("setEntityExpansionLimit", Integer.TYPE); setLimit.invoke(mgr, MAX_ENTITY_EXPANSIONS); @@ -776,8 +769,8 @@ private static void trySetXercesSecurityManager(SAXParser parser) { // throttle the log somewhat as it can spam the log otherwise if (System.currentTimeMillis() > LAST_LOG + TimeUnit.MINUTES.toMillis(5)) { 
LOG.warn( - "SAX Security Manager could not be setup [log suppressed for 5 " + - "minutes]", + "SAX Security Manager could not be setup [log suppressed for 5 " + + "minutes]", e); LAST_LOG = System.currentTimeMillis(); } @@ -786,12 +779,14 @@ private static void trySetXercesSecurityManager(SAXParser parser) { // separate old version of Xerces not found => use the builtin way of setting the property try { - parser.setProperty("http://www.oracle.com/xml/jaxp/properties/entityExpansionLimit", + parser.setProperty( + "http://www.oracle.com/xml/jaxp/properties/entityExpansionLimit", MAX_ENTITY_EXPANSIONS); - } catch (SAXException e) { // NOSONAR - also catch things like NoClassDefError here + } catch (SAXException e) { // NOSONAR - also catch things like NoClassDefError here // throttle the log somewhat as it can spam the log otherwise if (System.currentTimeMillis() > LAST_LOG + TimeUnit.MINUTES.toMillis(5)) { - LOG.warn("SAX Security Manager could not be setup [log suppressed for 5 minutes]", + LOG.warn( + "SAX Security Manager could not be setup [log suppressed for 5 minutes]", e); LAST_LOG = System.currentTimeMillis(); } @@ -799,19 +794,21 @@ private static void trySetXercesSecurityManager(SAXParser parser) { } private static void trySetStaxSecurityManager(XMLInputFactory inputFactory) { - //try default java entity expansion, then fallback to woodstox, then warn...once. + // try default java entity expansion, then fallback to woodstox, then warn...once. 
try { - inputFactory.setProperty("http://www.oracle.com/xml/jaxp/properties/entityExpansionLimit", + inputFactory.setProperty( + "http://www.oracle.com/xml/jaxp/properties/entityExpansionLimit", MAX_ENTITY_EXPANSIONS); } catch (IllegalArgumentException e) { try { inputFactory.setProperty("com.ctc.wstx.maxEntityCount", MAX_ENTITY_EXPANSIONS); } catch (IllegalArgumentException e2) { if (HAS_WARNED_STAX.getAndSet(true) == false) { - LOG.warn("Could not set limit on maximum entity expansions for: " + inputFactory.getClass()); + LOG.warn( + "Could not set limit on maximum entity expansions for: " + + inputFactory.getClass()); } } - } } @@ -820,23 +817,23 @@ public static int getPoolSize() { } /** - * Set the pool size for cached XML parsers. This has a side - * effect of locking the pool, and rebuilding the pool from - * scratch with the most recent settings, such as {@link #MAX_ENTITY_EXPANSIONS} + * Set the pool size for cached XML parsers. This has a side effect of locking the pool, and + * rebuilding the pool from scratch with the most recent settings, such as {@link + * #MAX_ENTITY_EXPANSIONS} * * @param poolSize * @since Apache Tika 1.19 */ public static void setPoolSize(int poolSize) throws TikaException { - //stop the world with a write lock. - //parsers that are currently in use will be offered later (once the lock is released), - //but not accepted and will be gc'd. We have to do this locking and - //the read locking in case one thread resizes the pool when the - //parsers have already started. We could have an NPE on SAX_PARSERS - //if we didn't lock. + // stop the world with a write lock. + // parsers that are currently in use will be offered later (once the lock is released), + // but not accepted and will be gc'd. We have to do this locking and + // the read locking in case one thread resizes the pool when the + // parsers have already started. We could have an NPE on SAX_PARSERS + // if we didn't lock. 
SAX_READ_WRITE_LOCK.writeLock().lock(); try { - //free up any resources before emptying SAX_PARSERS + // free up any resources before emptying SAX_PARSERS for (PoolSAXParser parser : SAX_PARSERS) { parser.reset(); } @@ -845,8 +842,8 @@ public static void setPoolSize(int poolSize) throws TikaException { int generation = POOL_GENERATION.incrementAndGet(); for (int i = 0; i < poolSize; i++) { try { - SAX_PARSERS.offer(buildPoolParser(generation, - getSAXParserFactory().newSAXParser())); + SAX_PARSERS.offer( + buildPoolParser(generation, getSAXParserFactory().newSAXParser())); } catch (SAXException | ParserConfigurationException e) { throw new TikaException("problem creating sax parser", e); } @@ -873,15 +870,13 @@ public static int getMaxEntityExpansions() { } /** - * Set the maximum number of entity expansions allowable in SAX/DOM/StAX parsing. - * NOTE:A value less than or equal to zero indicates no limit. - * This will override the system property {@link #JAXP_ENTITY_EXPANSION_LIMIT_KEY} - * and the {@link #DEFAULT_MAX_ENTITY_EXPANSIONS} value for allowable entity expansions - *

- * NOTE: To trigger a rebuild of the pool of parsers with this setting, - * the client must call {@link #setPoolSize(int)} to rebuild the SAX and DOM parsers - * with this setting. - *

+ * Set the maximum number of entity expansions allowable in SAX/DOM/StAX parsing. NOTE:A + * value less than or equal to zero indicates no limit. This will override the system property + * {@link #JAXP_ENTITY_EXPANSION_LIMIT_KEY} and the {@link #DEFAULT_MAX_ENTITY_EXPANSIONS} value + * for allowable entity expansions + * + *

NOTE: To trigger a rebuild of the pool of parsers with this setting, the client + * must call {@link #setPoolSize(int)} to rebuild the SAX and DOM parsers with this setting. * * @param maxEntityExpansions -- maximum number of allowable entity expansions * @since Apache Tika 1.19 @@ -922,7 +917,7 @@ private static PoolSAXParser buildPoolParser(int generation, SAXParser parser) { parser.setProperty(XERCES_SECURITY_MANAGER_PROPERTY, mgr); hasSecurityManager = true; } catch (SecurityException e) { - //don't swallow security exceptions + // don't swallow security exceptions throw e; } catch (ClassNotFoundException e) { // continue without log, this is expected in some setups @@ -930,7 +925,8 @@ private static PoolSAXParser buildPoolParser(int generation, SAXParser parser) { // NOSONAR - also catch things like NoClassDefError here // throttle the log somewhat as it can spam the log otherwise if (System.currentTimeMillis() > LAST_LOG + TimeUnit.MINUTES.toMillis(5)) { - LOG.warn("SAX Security Manager could not be setup [log suppressed for 5 minutes]", + LOG.warn( + "SAX Security Manager could not be setup [log suppressed for 5 minutes]", e); LAST_LOG = System.currentTimeMillis(); } @@ -940,15 +936,16 @@ private static PoolSAXParser buildPoolParser(int generation, SAXParser parser) { if (!hasSecurityManager) { // use the builtin way of setting the property try { - parser.setProperty("http://www.oracle.com/xml/jaxp/properties/entityExpansionLimit", + parser.setProperty( + "http://www.oracle.com/xml/jaxp/properties/entityExpansionLimit", MAX_ENTITY_EXPANSIONS); canSetJaxPEntity = true; - } catch (SAXException e) { // NOSONAR - also catch things like NoClassDefError here + } catch (SAXException e) { // NOSONAR - also catch things like NoClassDefError here // throttle the log somewhat as it can spam the log otherwise if (System.currentTimeMillis() > LAST_LOG + TimeUnit.MINUTES.toMillis(5)) { LOG.warn( - "SAX Security Manager could not be setup [log suppressed for 5 " + - 
"minutes]", + "SAX Security Manager could not be setup [log suppressed for 5 " + + "minutes]", e); LAST_LOG = System.currentTimeMillis(); } @@ -964,7 +961,6 @@ private static PoolSAXParser buildPoolParser(int generation, SAXParser parser) { } else { return new UnrecognizedPoolSAXParser(generation, parser); } - } private static void clearReader(XMLReader reader) { @@ -1028,12 +1024,12 @@ public XercesPoolSAXParser(int generation, SAXParser parser) { @Override public void reset() { - //don't do anything + // don't do anything try { XMLReader reader = saxParser.getXMLReader(); clearReader(reader); } catch (SAXException e) { - //swallow + // swallow } } } @@ -1079,8 +1075,8 @@ void reset() { } private static class UnrecognizedPoolSAXParser extends PoolSAXParser { - //if unrecognized, try to set all protections - //and try to reset every time + // if unrecognized, try to set all protections + // and try to reset every time public UnrecognizedPoolSAXParser(int generation, SAXParser parser) { super(generation, parser); } diff --git a/tika-core/src/main/java/org/apache/tika/utils/package-info.java b/tika-core/src/main/java/org/apache/tika/utils/package-info.java index 04ea52e5cf..833c117717 100644 --- a/tika-core/src/main/java/org/apache/tika/utils/package-info.java +++ b/tika-core/src/main/java/org/apache/tika/utils/package-info.java @@ -15,8 +15,6 @@ * limitations under the License. */ -/** - * Utilities. - */ +/** Utilities. 
*/ @aQute.bnd.annotation.Version("1.0.0") package org.apache.tika.utils; diff --git a/tika-core/src/test/java/org/apache/custom/detect/MyCustomDetector.java b/tika-core/src/test/java/org/apache/custom/detect/MyCustomDetector.java index 7237d11766..2e5efd5324 100644 --- a/tika-core/src/test/java/org/apache/custom/detect/MyCustomDetector.java +++ b/tika-core/src/test/java/org/apache/custom/detect/MyCustomDetector.java @@ -18,7 +18,6 @@ import java.io.IOException; import java.io.InputStream; - import org.apache.tika.detect.Detector; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; diff --git a/tika-core/src/test/java/org/apache/tika/MultiThreadedTikaTest.java b/tika-core/src/test/java/org/apache/tika/MultiThreadedTikaTest.java index fd3f381d48..4bcbab6416 100644 --- a/tika-core/src/test/java/org/apache/tika/MultiThreadedTikaTest.java +++ b/tika-core/src/test/java/org/apache/tika/MultiThreadedTikaTest.java @@ -42,7 +42,6 @@ import java.util.concurrent.Future; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; - import org.apache.tika.detect.Detector; import org.apache.tika.detect.XmlRootExtractor; import org.apache.tika.exception.TikaException; @@ -57,32 +56,34 @@ import org.apache.tika.utils.XMLReaderUtils; public class MultiThreadedTikaTest extends TikaTest { - //TODO: figure out how to make failures reproducible a la Lucene/Solr with a seed - //TODO: Consider randomizing the Locale and timezone, like Lucene/Solr... + // TODO: figure out how to make failures reproducible a la Lucene/Solr with a seed + // TODO: Consider randomizing the Locale and timezone, like Lucene/Solr... 
XmlRootExtractor ex = new XmlRootExtractor(); public static Path[] getTestFiles(final FileFilter fileFilter) throws URISyntaxException, IOException { Path root = Paths.get(MultiThreadedTikaTest.class.getResource("/test-documents").toURI()); final List files = new ArrayList<>(); - Files.walkFileTree(root, new SimpleFileVisitor() { - @Override - public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) - throws IOException { - if (fileFilter != null && !fileFilter.accept(file.toFile())) { - return FileVisitResult.CONTINUE; - } - if (!attrs.isDirectory()) { - files.add(file); - } - return FileVisitResult.CONTINUE; - } - }); + Files.walkFileTree( + root, + new SimpleFileVisitor() { + @Override + public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) + throws IOException { + if (fileFilter != null && !fileFilter.accept(file.toFile())) { + return FileVisitResult.CONTINUE; + } + if (!attrs.isDirectory()) { + files.add(file); + } + return FileVisitResult.CONTINUE; + } + }); return files.toArray(new Path[0]); } - private static ConcurrentHashMap getBaselineDetection(Detector detector, - Path[] files) { + private static ConcurrentHashMap getBaselineDetection( + Detector detector, Path[] files) { ConcurrentHashMap baseline = new ConcurrentHashMap<>(); XmlRootExtractor extractor = new XmlRootExtractor(); @@ -98,8 +99,8 @@ private static ConcurrentHashMap getBaselineDetection(Detector return baseline; } - private static ConcurrentHashMap getBaseline(Parser parser, Path[] files, - ParseContext parseContext) { + private static ConcurrentHashMap getBaseline( + Parser parser, Path[] files, ParseContext parseContext) { ConcurrentHashMap baseline = new ConcurrentHashMap<>(); for (Path f : files) { @@ -110,69 +111,89 @@ private static ConcurrentHashMap getBaseline(Parser parser, Path[ } catch (Exception e) { e.printStackTrace(); - //swallow + // swallow } } return baseline; } - private static List getRecursiveMetadata(InputStream is, Parser parser, - 
ParseContext parseContext) throws Exception { - //different from parent TikaTest in that this extracts text. - //can't extract xhtml because "tmp" file names wind up in - //content's metadata and they'll differ by file. + private static List getRecursiveMetadata( + InputStream is, Parser parser, ParseContext parseContext) throws Exception { + // different from parent TikaTest in that this extracts text. + // can't extract xhtml because "tmp" file names wind up in + // content's metadata and they'll differ by file. parseContext = new ParseContext(); - RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler( - new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1), - -1); + RecursiveParserWrapperHandler handler = + new RecursiveParserWrapperHandler( + new BasicContentHandlerFactory( + BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1), + -1); parser.parse(is, handler, new Metadata(), parseContext); return handler.getMetadataList(); } private static void assertExtractEquals(Extract extractA, Extract extractB) { - //this currently only checks the basics - //might want to add more checks + // this currently only checks the basics + // might want to add more checks - assertEquals(extractA.metadataList.size(), extractB.metadataList.size(), + assertEquals( + extractA.metadataList.size(), + extractB.metadataList.size(), "number of embedded files"); for (int i = 0; i < extractA.metadataList.size(); i++) { - assertEquals(extractA.metadataList.get(i).size(), extractB.metadataList.get(i).size(), + assertEquals( + extractA.metadataList.get(i).size(), + extractB.metadataList.get(i).size(), "number of metadata elements in attachment: " + i); - assertEquals(extractA.metadataList.get(i).get(TikaCoreProperties.TIKA_CONTENT), + assertEquals( + extractA.metadataList.get(i).get(TikaCoreProperties.TIKA_CONTENT), extractB.metadataList.get(i).get(TikaCoreProperties.TIKA_CONTENT), "content in attachment: " + i); } } /** - * This calls {@link 
#testEach(Parser parser, Path[], ParseContext[], int, int)} and - * then {@link #testAll(Parser parser, Path[], ParseContext[], int, int)} + * This calls {@link #testEach(Parser parser, Path[], ParseContext[], int, int)} and then {@link + * #testAll(Parser parser, Path[], ParseContext[], int, int)} * - * @param numThreads number of threads to use + * @param numThreads number of threads to use * @param numIterations number of iterations per thread - * @param filter file filter to select files from "/test-documents"; if - * null, - * all files will be used + * @param filter file filter to select files from "/test-documents"; if null, all + * files will be used * @throws Exception */ - protected void testMultiThreaded(Parser parser, ParseContext[] parseContext, int numThreads, - int numIterations, FileFilter filter) throws Exception { + protected void testMultiThreaded( + Parser parser, + ParseContext[] parseContext, + int numThreads, + int numIterations, + FileFilter filter) + throws Exception { Path[] allFiles = getTestFiles(filter); testEach(parser, allFiles, parseContext, numThreads, numIterations); testAll(parser, allFiles, parseContext, numThreads, numIterations); } - public void testDetector(Detector detector, int numThreads, int numIterations, - FileFilter filter, int randomlyResizeSAXPool) throws Exception { + public void testDetector( + Detector detector, + int numThreads, + int numIterations, + FileFilter filter, + int randomlyResizeSAXPool) + throws Exception { Path[] files = getTestFiles(filter); testDetectorEach(detector, files, numThreads, numIterations, randomlyResizeSAXPool); testDetectorOnAll(detector, files, numThreads, numIterations, randomlyResizeSAXPool); } - void testDetectorEach(Detector detector, Path[] files, int numThreads, int numIterations, - int randomlyResizeSAXPool) { + void testDetectorEach( + Detector detector, + Path[] files, + int numThreads, + int numIterations, + int randomlyResizeSAXPool) { for (Path p : files) { Path[] toTest = 
new Path[1]; toTest[0] = p; @@ -180,14 +201,18 @@ void testDetectorEach(Detector detector, Path[] files, int numThreads, int numIt } } - private void testDetectorOnAll(Detector detector, Path[] toTest, int numThreads, - int numIterations, int randomlyResizeSAXPool) { + private void testDetectorOnAll( + Detector detector, + Path[] toTest, + int numThreads, + int numIterations, + int randomlyResizeSAXPool) { Map truth = getBaselineDetection(detector, toTest); - //if all files caused an exception + // if all files caused an exception if (truth.size() == 0) { return; } - //only those that parsed without exception + // only those that parsed without exception Path[] testFiles = new Path[truth.size()]; int j = 0; for (Path testFile : truth.keySet()) { @@ -196,7 +221,13 @@ private void testDetectorOnAll(Detector detector, Path[] toTest, int numThreads, int actualThreadCount = numThreads + Math.max(randomlyResizeSAXPool, 0); ExecutorService ex = Executors.newFixedThreadPool(actualThreadCount); try { - _testDetectorOnAll(detector, testFiles, numThreads, numIterations, truth, ex, + _testDetectorOnAll( + detector, + testFiles, + numThreads, + numIterations, + truth, + ex, randomlyResizeSAXPool); } finally { ex.shutdown(); @@ -204,27 +235,32 @@ private void testDetectorOnAll(Detector detector, Path[] toTest, int numThreads, } } - private void _testDetectorOnAll(Detector detector, Path[] testFiles, int numThreads, - int numIterations, Map truth, - ExecutorService ex, int randomlyResizeSAXPool) { + private void _testDetectorOnAll( + Detector detector, + Path[] testFiles, + int numThreads, + int numIterations, + Map truth, + ExecutorService ex, + int randomlyResizeSAXPool) { ExecutorCompletionService executorCompletionService = new ExecutorCompletionService<>(ex); executorCompletionService.submit(new SAXPoolResizer(randomlyResizeSAXPool)); for (int i = 0; i < numThreads; i++) { - executorCompletionService - .submit(new TikaDetectorRunner(detector, numIterations, testFiles, 
truth)); + executorCompletionService.submit( + new TikaDetectorRunner(detector, numIterations, testFiles, truth)); } int completed = 0; while (completed < numThreads) { - //TODO: add a maximum timeout threshold + // TODO: add a maximum timeout threshold Future future = null; try { future = executorCompletionService.poll(1000, TimeUnit.MILLISECONDS); if (future != null) { - future.get();//trigger exceptions from thread + future.get(); // trigger exceptions from thread completed++; } } catch (InterruptedException | ExecutionException e) { @@ -236,21 +272,23 @@ private void _testDetectorOnAll(Detector detector, Path[] testFiles, int numThre } /** - * Test each file, one at a time in multiple threads. - * This was required to test TIKA-2519 in a reasonable - * amount of time. This forced the parser to use the - * same underlying memory structures because it was the same file. - * This is stricter than I think our agreement with clients is - * because this run tests on literally the same file and - * not a copy of the file per thread. Let's leave this as is - * unless there's a good reason to create a separate copy per thread. + * Test each file, one at a time in multiple threads. This was required to test TIKA-2519 in a + * reasonable amount of time. This forced the parser to use the same underlying memory + * structures because it was the same file. This is stricter than I think our agreement with + * clients is because this run tests on literally the same file and not a copy of the file per + * thread. Let's leave this as is unless there's a good reason to create a separate copy per + * thread. 
* - * @param files files to test, one at a time - * @param numThreads number of threads to use + * @param files files to test, one at a time + * @param numThreads number of threads to use * @param numIterations number of iterations per thread */ - protected void testEach(Parser parser, Path[] files, ParseContext[] parseContext, - int numThreads, int numIterations) { + protected void testEach( + Parser parser, + Path[] files, + ParseContext[] parseContext, + int numThreads, + int numIterations) { for (Path p : files) { Path[] toTest = new Path[1]; toTest[0] = p; @@ -259,27 +297,29 @@ protected void testEach(Parser parser, Path[] files, ParseContext[] parseContext } /** - * This tests all files together. Each parser randomly selects - * a file from the array. Two parsers could wind up parsing the - * same file at the same time. Good. - *

- * In the current implementation, this gets ground truth only - * from files that do not throw exceptions. This will ignore - * files that cause exceptions. + * This tests all files together. Each parser randomly selects a file from the array. Two + * parsers could wind up parsing the same file at the same time. Good. * - * @param files files to parse - * @param numThreads number of parser threads + *

In the current implementation, this gets ground truth only from files that do not throw + * exceptions. This will ignore files that cause exceptions. + * + * @param files files to parse + * @param numThreads number of parser threads * @param numIterations number of iterations per parser */ - protected void testAll(Parser parser, Path[] files, ParseContext[] parseContext, int numThreads, - int numIterations) { + protected void testAll( + Parser parser, + Path[] files, + ParseContext[] parseContext, + int numThreads, + int numIterations) { Map truth = getBaseline(parser, files, parseContext[0]); - //if all files caused an exception + // if all files caused an exception if (truth.size() == 0) { - //return; + // return; } - //only those that parsed without exception + // only those that parsed without exception Path[] testFiles = new Path[truth.size()]; int j = 0; for (Path testFile : truth.keySet()) { @@ -295,29 +335,33 @@ protected void testAll(Parser parser, Path[] files, ParseContext[] parseContext, } } - private void _testAll(Parser parser, Path[] testFiles, ParseContext[] parseContext, - int numThreads, int numIterations, Map truth, - ExecutorService ex) { + private void _testAll( + Parser parser, + Path[] testFiles, + ParseContext[] parseContext, + int numThreads, + int numIterations, + Map truth, + ExecutorService ex) { ExecutorCompletionService executorCompletionService = new ExecutorCompletionService<>(ex); - //use the same parser in all threads + // use the same parser in all threads for (int i = 0; i < numThreads; i++) { - executorCompletionService - .submit(new TikaRunner(parser, parseContext[i], numIterations, testFiles, - truth)); + executorCompletionService.submit( + new TikaRunner(parser, parseContext[i], numIterations, testFiles, truth)); } int completed = 0; while (completed < numThreads) { - //TODO: add a maximum timeout threshold + // TODO: add a maximum timeout threshold Future future = null; try { future = executorCompletionService.poll(1000, 
TimeUnit.MILLISECONDS); if (future != null) { - future.get();//trigger exceptions from thread + future.get(); // trigger exceptions from thread completed++; } } catch (InterruptedException | ExecutionException e) { @@ -326,7 +370,7 @@ private void _testAll(Parser parser, Path[] testFiles, ParseContext[] parseConte } } - //TODO: make this return something useful besides an integer + // TODO: make this return something useful besides an integer private static class TikaRunner implements Callable { private static final AtomicInteger threadCount = new AtomicInteger(0); private final Parser parser; @@ -337,8 +381,12 @@ private static class TikaRunner implements Callable { private final Random random = new Random(); private final int threadNumber; - private TikaRunner(Parser parser, ParseContext parseContext, int iterations, Path[] files, - Map truth) { + private TikaRunner( + Parser parser, + ParseContext parseContext, + int iterations, + Path[] files, + Map truth) { this.parser = parser; this.iterations = iterations; this.files = files; @@ -358,8 +406,8 @@ public Integer call() throws Exception { metadataList = getRecursiveMetadata(is, parser, new ParseContext()); success = true; } catch (Exception e) { - //swallow - //throw new RuntimeException(testFile + " triggered this exception", e); + // swallow + // throw new RuntimeException(testFile + " triggered this exception", e); } if (success) { assertExtractEquals(truth.get(testFile), new Extract(metadataList)); @@ -367,7 +415,6 @@ public Integer call() throws Exception { } return 1; } - } private static class Extract { @@ -411,8 +458,8 @@ private static class TikaDetectorRunner implements Callable { private final Map truth; private final Random random = new Random(); - private TikaDetectorRunner(Detector detector, int iterations, Path[] files, - Map truth) { + private TikaDetectorRunner( + Detector detector, int iterations, Path[] files, Map truth) { this.detector = detector; this.iterations = iterations; this.files = 
files; @@ -427,12 +474,11 @@ public Integer call() throws Exception { Metadata metadata = new Metadata(); try (TikaInputStream tis = TikaInputStream.get(testFile, metadata)) { MediaType mediaType = detector.detect(tis, metadata); - assertEquals(truth.get(testFile), mediaType, - "failed on: " + testFile.getFileName()); + assertEquals( + truth.get(testFile), mediaType, "failed on: " + testFile.getFileName()); } } return 1; } - } } diff --git a/tika-core/src/test/java/org/apache/tika/ResourceLoggingClassLoader.java b/tika-core/src/test/java/org/apache/tika/ResourceLoggingClassLoader.java index 1a6d454d32..e7d88f11cb 100644 --- a/tika-core/src/test/java/org/apache/tika/ResourceLoggingClassLoader.java +++ b/tika-core/src/test/java/org/apache/tika/ResourceLoggingClassLoader.java @@ -26,10 +26,8 @@ import java.util.Map; /** - * A wrapper around a {@link ClassLoader} that logs all - * the Resources loaded through it. - * Used to check that a specific ClassLoader was used - * when unit testing + * A wrapper around a {@link ClassLoader} that logs all the Resources loaded through it. 
Used to + * check that a specific ClassLoader was used when unit testing */ public class ResourceLoggingClassLoader extends ClassLoader { private final Map> loadedResources = new HashMap<>(); diff --git a/tika-core/src/test/java/org/apache/tika/TestRereadableInputStream.java b/tika-core/src/test/java/org/apache/tika/TestRereadableInputStream.java index 05fdb53f4d..c31c5cd835 100644 --- a/tika-core/src/test/java/org/apache/tika/TestRereadableInputStream.java +++ b/tika-core/src/test/java/org/apache/tika/TestRereadableInputStream.java @@ -25,12 +25,10 @@ import java.io.OutputStream; import java.nio.file.Files; import java.nio.file.Path; - +import org.apache.tika.utils.RereadableInputStream; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.io.TempDir; -import org.apache.tika.utils.RereadableInputStream; - public class TestRereadableInputStream { private final int DEFAULT_TEST_SIZE = 3; @@ -48,54 +46,60 @@ public class TestRereadableInputStream { // This size of data exactly equals memory threshold private final int TEST_SIZE_MAX = MEMORY_THRESHOLD; - @TempDir - private Path tempDir; + @TempDir private Path tempDir; @Test public void testInMemory() throws IOException { readEntireStream((TEST_SIZE_MEMORY)); } -// @Test -// public void testInFile() throws IOException { -// readData(TEST_SIZE_FILE); -// } -// -// @Test -// public void testMemoryThreshold() throws IOException { -// readData(TEST_SIZE_MAX); -// } -// -// @Test -// public void testInMemory2() throws IOException { -// readData2((TEST_SIZE_MEMORY)); -// } -// -// @Test -// public void testInFile2() throws IOException { -// readData2(TEST_SIZE_FILE); -// } + // @Test + // public void testInFile() throws IOException { + // readData(TEST_SIZE_FILE); + // } + // + // @Test + // public void testMemoryThreshold() throws IOException { + // readData(TEST_SIZE_MAX); + // } + // + // @Test + // public void testInMemory2() throws IOException { + // readData2((TEST_SIZE_MEMORY)); + // } + // + // @Test + // 
public void testInFile2() throws IOException { + // readData2(TEST_SIZE_FILE); + // } @Test public void testMemoryThreshold2() throws IOException { readPartialStream(TEST_SIZE_MAX); } - /** - * Read entire stream of various sizes - */ + /** Read entire stream of various sizes */ private void readEntireStream(int testSize) throws IOException { InputStream is = createTestInputStream(testSize); try (RereadableInputStream ris = new RereadableInputStream(is, MEMORY_THRESHOLD, true)) { for (int pass = 0; pass < NUM_PASSES; pass++) { for (int byteNum = 0; byteNum < testSize; byteNum++) { int byteRead = ris.read(); - assertEquals(byteNum, byteRead, - "Pass = " + pass + ", byte num should be " + byteNum + " but is " + - byteRead + "."); + assertEquals( + byteNum, + byteRead, + "Pass = " + + pass + + ", byte num should be " + + byteNum + + " but is " + + byteRead + + "."); } int eof = ris.read(); - assertEquals(-1, eof, + assertEquals( + -1, + eof, "Pass = " + pass + ", byte num should be " + -1 + " but is " + eof + "."); ris.rewind(); } @@ -103,8 +107,8 @@ private void readEntireStream(int testSize) throws IOException { } /** - * Read increasingly more of the stream, but not all, with each pass before rewinding to - * make sure we pick up at the correct point + * Read increasingly more of the stream, but not all, with each pass before rewinding to make + * sure we pick up at the correct point */ private void readPartialStream(int testSize) throws IOException { InputStream is = createTestInputStream(20); @@ -114,8 +118,16 @@ private void readPartialStream(int testSize) throws IOException { for (int pass = 0; pass < NUM_PASSES; pass++) { for (int byteNum = 0; byteNum < iterations; byteNum++) { int byteRead = ris.read(); - assertEquals(byteNum, byteRead, - "Pass = " + pass + ", byte num should be " + byteNum + " but is " + byteRead + "."); + assertEquals( + byteNum, + byteRead, + "Pass = " + + pass + + ", byte num should be " + + byteNum + + " but is " + + byteRead + + "."); } 
ris.rewind(); iterations++; @@ -123,20 +135,21 @@ private void readPartialStream(int testSize) throws IOException { } } - @Test public void testRewind() throws IOException { InputStream is = createTestInputStream(DEFAULT_TEST_SIZE); try (RereadableInputStream ris = new RereadableInputStream(is, MEMORY_THRESHOLD, true)) { - ris.rewind(); // rewind before we've done anything + ris.rewind(); // rewind before we've done anything for (int byteNum = 0; byteNum < 1; byteNum++) { int byteRead = ris.read(); - assertEquals(byteNum, byteRead, "Byte num should be " + byteNum + " but is " + byteRead + "."); + assertEquals( + byteNum, + byteRead, + "Byte num should be " + byteNum + " but is " + byteRead + "."); } } } - private TestInputStream createTestInputStream(int testSize) throws IOException { return new TestInputStream( new BufferedInputStream(Files.newInputStream(createTestFile(testSize)))); @@ -175,15 +188,14 @@ public void doReadAfterCloseTest() throws IOException { TestInputStream tis = createTestInputStream(DEFAULT_TEST_SIZE); RereadableInputStream ris = new RereadableInputStream(tis, DEFAULT_TEST_SIZE); ris.close(); - assertThrows(IOException.class, () -> { - ris.read(); - }); + assertThrows( + IOException.class, + () -> { + ris.read(); + }); } - - /** - * Adds isClosed() to a BufferedInputStream. - */ + /** Adds isClosed() to a BufferedInputStream. 
*/ static class TestInputStream extends BufferedInputStream { private boolean closed; diff --git a/tika-core/src/test/java/org/apache/tika/TikaDetectionTest.java b/tika-core/src/test/java/org/apache/tika/TikaDetectionTest.java index f52482c8d7..6924029ae3 100644 --- a/tika-core/src/test/java/org/apache/tika/TikaDetectionTest.java +++ b/tika-core/src/test/java/org/apache/tika/TikaDetectionTest.java @@ -16,7 +16,6 @@ */ package org.apache.tika; - import static org.junit.jupiter.api.Assertions.assertEquals; import org.junit.jupiter.api.Test; @@ -26,9 +25,10 @@ public class TikaDetectionTest { private final Tika tika = new Tika(); /** - * This test checks that Tika correctly detects all the file extensions - * defined in the mime.types file (revision 819245) of the Apache HTTP - * Server project. The tests were created with: + * This test checks that Tika correctly detects all the file extensions defined in the + * mime.types file (revision 819245) of the Apache HTTP Server project. The tests were created + * with: + * *

      * cat docs/conf/mime.types | grep -v '#' | perl -lne '/\S\s+\S/ and do {
      *     my ($type, @ext) = split /\s+/;
@@ -92,11 +92,11 @@ public void testHttpServerFileExtensions() {
         assertEquals("application/oebps-package+xml", tika.detect("x.opf"));
         assertEquals("application/ogg", tika.detect("x.ogx"));
         // Differ from httpd - We have subtypes they lack
-        //assertEquals("application/onenote", tika.detect("x.one"));
-        //assertEquals("application/onenote", tika.detect("x.onetoc"));
-        //assertEquals("application/onenote", tika.detect("x.onetoc2"));
-        //assertEquals("application/onenote", tika.detect("x.onetmp"));
-        //assertEquals("application/onenote", tika.detect("x.onepkg"));
+        // assertEquals("application/onenote", tika.detect("x.one"));
+        // assertEquals("application/onenote", tika.detect("x.onetoc"));
+        // assertEquals("application/onenote", tika.detect("x.onetoc2"));
+        // assertEquals("application/onenote", tika.detect("x.onetmp"));
+        // assertEquals("application/onenote", tika.detect("x.onepkg"));
         assertEquals("application/patch-ops-error+xml", tika.detect("x.xer"));
         assertEquals("application/pdf", tika.detect("x.pdf"));
         assertEquals("application/pgp-encrypted", tika.detect("x.pgp"));
@@ -154,7 +154,8 @@ public void testHttpServerFileExtensions() {
         assertEquals("application/vnd.acucobol", tika.detect("x.acu"));
         assertEquals("application/vnd.acucorp", tika.detect("x.atc"));
         assertEquals("application/vnd.acucorp", tika.detect("x.acutc"));
-        assertEquals("application/vnd.adobe.air-application-installer-package+zip",
+        assertEquals(
+                "application/vnd.adobe.air-application-installer-package+zip",
                 tika.detect("x.air"));
         assertEquals("application/vnd.adobe.xdp+xml", tika.detect("x.xdp"));
         assertEquals("application/vnd.adobe.xfdf", tika.detect("x.xfdf"));
@@ -164,14 +165,14 @@ public void testHttpServerFileExtensions() {
         assertEquals("application/vnd.americandynamics.acc", tika.detect("x.acc"));
         assertEquals("application/vnd.amiga.ami", tika.detect("x.ami"));
         assertEquals("application/vnd.android.package-archive", tika.detect("x.apk"));
-        assertEquals("application/vnd.anser-web-certificate-issue-initiation",
-                tika.detect("x.cii"));
+        assertEquals(
+                "application/vnd.anser-web-certificate-issue-initiation", tika.detect("x.cii"));
         assertEquals("application/vnd.anser-web-funds-transfer-initiation", tika.detect("x.fti"));
         assertEquals("application/vnd.antix.game-component", tika.detect("x.atx"));
         assertEquals("application/vnd.apple.installer+xml", tika.detect("x.mpkg"));
         assertEquals("application/vnd.arastra.swi", tika.detect("x.swi"));
         // Differ from httpd - Adobe After Effects is a much more common user of .AEP these days
-        //assertEquals("application/vnd.audiograph", tika.detect("x.aep"));
+        // assertEquals("application/vnd.audiograph", tika.detect("x.aep"));
         assertEquals("application/vnd.blueice.multipass", tika.detect("x.mpm"));
         assertEquals("application/vnd.bmi", tika.detect("x.bmi"));
         assertEquals("application/vnd.businessobjects", tika.detect("x.rep"));
@@ -309,8 +310,8 @@ public void testHttpServerFileExtensions() {
         assertEquals("application/vnd.koan", tika.detect("x.skm"));
         assertEquals("application/vnd.kodak-descriptor", tika.detect("x.sse"));
         assertEquals("application/vnd.llamagraphics.life-balance.desktop", tika.detect("x.lbd"));
-        assertEquals("application/vnd.llamagraphics.life-balance.exchange+xml",
-                tika.detect("x.lbe"));
+        assertEquals(
+                "application/vnd.llamagraphics.life-balance.exchange+xml", tika.detect("x.lbe"));
         assertEquals("application/vnd.lotus-1-2-3", tika.detect("x.123"));
         assertEquals("application/vnd.lotus-approach", tika.detect("x.apr"));
         assertEquals("application/vnd.lotus-freelance", tika.detect("x.pre"));
@@ -346,8 +347,8 @@ public void testHttpServerFileExtensions() {
         assertEquals("application/vnd.ms-excel", tika.detect("x.xlt"));
         assertEquals("application/vnd.ms-excel", tika.detect("x.xlw"));
         assertEquals("application/vnd.ms-excel.addin.macroenabled.12", tika.detect("x.xlam"));
-        assertEquals("application/vnd.ms-excel.sheet.binary.macroenabled.12",
-                tika.detect("x.xlsb"));
+        assertEquals(
+                "application/vnd.ms-excel.sheet.binary.macroenabled.12", tika.detect("x.xlsb"));
         assertEquals("application/vnd.ms-excel.sheet.macroenabled.12", tika.detect("x.xlsm"));
         assertEquals("application/vnd.ms-excel.template.macroenabled.12", tika.detect("x.xltm"));
         assertEquals("application/vnd.ms-fontobject", tika.detect("x.eot"));
@@ -360,13 +361,14 @@ public void testHttpServerFileExtensions() {
         assertEquals("application/vnd.ms-powerpoint", tika.detect("x.pps"));
         assertEquals("application/vnd.ms-powerpoint", tika.detect("x.pot"));
         assertEquals("application/vnd.ms-powerpoint.addin.macroenabled.12", tika.detect("x.ppam"));
-        assertEquals("application/vnd.ms-powerpoint.presentation.macroenabled.12",
+        assertEquals(
+                "application/vnd.ms-powerpoint.presentation.macroenabled.12",
                 tika.detect("x.pptm"));
         assertEquals("application/vnd.ms-powerpoint.slide.macroenabled.12", tika.detect("x.sldm"));
-        assertEquals("application/vnd.ms-powerpoint.slideshow.macroenabled.12",
-                tika.detect("x.ppsm"));
-        assertEquals("application/vnd.ms-powerpoint.template.macroenabled.12",
-                tika.detect("x.potm"));
+        assertEquals(
+                "application/vnd.ms-powerpoint.slideshow.macroenabled.12", tika.detect("x.ppsm"));
+        assertEquals(
+                "application/vnd.ms-powerpoint.template.macroenabled.12", tika.detect("x.potm"));
         assertEquals("application/vnd.ms-project", tika.detect("x.mpp"));
         assertEquals("application/vnd.ms-project", tika.detect("x.mpt"));
         assertEquals("application/vnd.ms-word.document.macroenabled.12", tika.detect("x.docm"));
@@ -394,7 +396,7 @@ public void testHttpServerFileExtensions() {
         assertEquals("application/vnd.oasis.opendocument.chart", tika.detect("x.odc"));
         assertEquals("application/vnd.oasis.opendocument.chart-template", tika.detect("x.otc"));
         // Differ from httpd - Mimetype embedded in file is .base not .database
-        //assertEquals("application/vnd.oasis.opendocument.database", tika.detect("x.odb"));
+        // assertEquals("application/vnd.oasis.opendocument.database", tika.detect("x.odb"));
         assertEquals("application/vnd.oasis.opendocument.formula", tika.detect("x.odf"));
         assertEquals("application/vnd.oasis.opendocument.formula-template", tika.detect("x.odft"));
         assertEquals("application/vnd.oasis.opendocument.graphics", tika.detect("x.odg"));
@@ -402,11 +404,11 @@ public void testHttpServerFileExtensions() {
         assertEquals("application/vnd.oasis.opendocument.image", tika.detect("x.odi"));
         assertEquals("application/vnd.oasis.opendocument.image-template", tika.detect("x.oti"));
         assertEquals("application/vnd.oasis.opendocument.presentation", tika.detect("x.odp"));
-        assertEquals("application/vnd.oasis.opendocument.presentation-template",
-                tika.detect("x.otp"));
+        assertEquals(
+                "application/vnd.oasis.opendocument.presentation-template", tika.detect("x.otp"));
         assertEquals("application/vnd.oasis.opendocument.spreadsheet", tika.detect("x.ods"));
-        assertEquals("application/vnd.oasis.opendocument.spreadsheet-template",
-                tika.detect("x.ots"));
+        assertEquals(
+                "application/vnd.oasis.opendocument.spreadsheet-template", tika.detect("x.ots"));
         assertEquals("application/vnd.oasis.opendocument.text", tika.detect("x.odt"));
         assertEquals("application/vnd.oasis.opendocument.text-master", tika.detect("x.otm"));
         assertEquals("application/vnd.oasis.opendocument.text-template", tika.detect("x.ott"));
@@ -414,21 +416,29 @@ public void testHttpServerFileExtensions() {
         assertEquals("application/vnd.olpc-sugar", tika.detect("x.xo"));
         assertEquals("application/vnd.oma.dd2+xml", tika.detect("x.dd2"));
         assertEquals("application/vnd.openofficeorg.extension", tika.detect("x.oxt"));
-        assertEquals("application/vnd.openxmlformats-officedocument.presentationml.presentation",
+        assertEquals(
+                "application/vnd.openxmlformats-officedocument.presentationml.presentation",
                 tika.detect("x.pptx"));
-        assertEquals("application/vnd.openxmlformats-officedocument.presentationml.slide",
+        assertEquals(
+                "application/vnd.openxmlformats-officedocument.presentationml.slide",
                 tika.detect("x.sldx"));
-        assertEquals("application/vnd.openxmlformats-officedocument.presentationml.slideshow",
+        assertEquals(
+                "application/vnd.openxmlformats-officedocument.presentationml.slideshow",
                 tika.detect("x.ppsx"));
-        assertEquals("application/vnd.openxmlformats-officedocument.presentationml.template",
+        assertEquals(
+                "application/vnd.openxmlformats-officedocument.presentationml.template",
                 tika.detect("x.potx"));
-        assertEquals("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+        assertEquals(
+                "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                 tika.detect("x.xlsx"));
-        assertEquals("application/vnd.openxmlformats-officedocument.spreadsheetml.template",
+        assertEquals(
+                "application/vnd.openxmlformats-officedocument.spreadsheetml.template",
                 tika.detect("x.xltx"));
-        assertEquals("application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+        assertEquals(
+                "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
                 tika.detect("x.docx"));
-        assertEquals("application/vnd.openxmlformats-officedocument.wordprocessingml.template",
+        assertEquals(
+                "application/vnd.openxmlformats-officedocument.wordprocessingml.template",
                 tika.detect("x.dotx"));
         assertEquals("application/vnd.osgi.dp", tika.detect("x.dp"));
         assertEquals("chemical/x-pdb", tika.detect("x.pdb"));
@@ -582,8 +592,8 @@ public void testHttpServerFileExtensions() {
         assertEquals("application/x-font-type1", tika.detect("x.pfa"));
         assertEquals("application/x-font-type1", tika.detect("x.pfb"));
         // TODO Get these fixed upstream too
-        //assertEquals("application/x-font-type1", tika.detect("x.pfm"));
-        //assertEquals("application/x-font-type1", tika.detect("x.afm"));
+        // assertEquals("application/x-font-type1", tika.detect("x.pfm"));
+        // assertEquals("application/x-font-type1", tika.detect("x.afm"));
         assertEquals("application/x-font-printer-metric", tika.detect("x.pfm"));
         assertEquals("application/x-font-adobe-metric", tika.detect("x.afm"));
         assertEquals("application/x-futuresplash", tika.detect("x.spl"));
@@ -606,14 +616,14 @@ public void testHttpServerFileExtensions() {
         assertEquals("application/x-msdownload", tika.detect("x.dll"));
         assertEquals("application/x-msdownload", tika.detect("x.com"));
         // Differ from httpd - BAT is different from normal windows executables
-        //assertEquals("application/x-msdownload", tika.detect("x.bat"));
+        // assertEquals("application/x-msdownload", tika.detect("x.bat"));
         // Differ from httpd - MSI is different from normal windows executables
-        //assertEquals("application/x-msdownload", tika.detect("x.msi"));
+        // assertEquals("application/x-msdownload", tika.detect("x.msi"));
         assertEquals("application/x-msmediaview", tika.detect("x.mvb"));
         assertEquals("application/x-msmediaview", tika.detect("x.m13"));
         assertEquals("application/x-msmediaview", tika.detect("x.m14"));
         // Differ from httpd - wmf was properly registered in RFC 7903
-        //assertEquals("application/x-msmetafile", tika.detect("x.wmf"));
+        // assertEquals("application/x-msmetafile", tika.detect("x.wmf"));
         assertEquals("application/x-msmoney", tika.detect("x.mny"));
         assertEquals("application/x-mspublisher", tika.detect("x.pub"));
         assertEquals("application/x-msschedule", tika.detect("x.scd"));
@@ -644,8 +654,8 @@ public void testHttpServerFileExtensions() {
         assertEquals("application/x-ustar", tika.detect("x.ustar"));
         assertEquals("application/x-wais-source", tika.detect("x.src"));
         // Differ from httpd - use a common parent for CA and User certs
-        //assertEquals("application/x-x509-ca-cert", tika.detect("x.der"));
-        //assertEquals("application/x-x509-ca-cert", tika.detect("x.crt"));
+        // assertEquals("application/x-x509-ca-cert", tika.detect("x.der"));
+        // assertEquals("application/x-x509-ca-cert", tika.detect("x.crt"));
         assertEquals("application/x-xfig", tika.detect("x.fig"));
         assertEquals("application/x-xpinstall", tika.detect("x.xpi"));
         assertEquals("application/xenc+xml", tika.detect("x.xenc"));
@@ -678,9 +688,9 @@ public void testHttpServerFileExtensions() {
         assertEquals("audio/mpeg", tika.detect("x.m3a"));
         assertEquals("audio/ogg", tika.detect("x.oga"));
         // Differ from httpd - Use a dedicated mimetype of Vorbis
-        //assertEquals("audio/ogg", tika.detect("x.ogg"));
+        // assertEquals("audio/ogg", tika.detect("x.ogg"));
         // Differ from httpd - Speex more commonly uses its own mimetype
-        //assertEquals("audio/ogg", tika.detect("x.spx"));
+        // assertEquals("audio/ogg", tika.detect("x.spx"));
         assertEquals("audio/vnd.digital-winds", tika.detect("x.eol"));
         assertEquals("audio/vnd.dts", tika.detect("x.dts"));
         assertEquals("audio/vnd.dts.hd", tika.detect("x.dtshd"));
@@ -700,7 +710,7 @@ public void testHttpServerFileExtensions() {
         assertEquals("audio/x-pn-realaudio", tika.detect("x.ra"));
         assertEquals("audio/x-pn-realaudio-plugin", tika.detect("x.rmp"));
         // Differ from httpd - wav was properly registered in RFC 2361
-        //assertEquals("audio/x-wav", tika.detect("x.wav"));
+        // assertEquals("audio/x-wav", tika.detect("x.wav"));
         assertEquals("chemical/x-cdx", tika.detect("x.cdx"));
         assertEquals("chemical/x-cif", tika.detect("x.cif"));
         assertEquals("chemical/x-cmdf", tika.detect("x.cmdf"));
@@ -708,7 +718,7 @@ public void testHttpServerFileExtensions() {
         assertEquals("chemical/x-csml", tika.detect("x.csml"));
         assertEquals("chemical/x-xyz", tika.detect("x.xyz"));
         // Differ from httpd - bmp was properly registered in RFC 7903
-        //assertEquals("image/x-ms-bmp", tika.detect("x.bmp"));
+        // assertEquals("image/x-ms-bmp", tika.detect("x.bmp"));
         assertEquals("image/cgm", tika.detect("x.cgm"));
         assertEquals("image/g3fax", tika.detect("x.g3"));
         assertEquals("image/gif", tika.detect("x.gif"));
@@ -747,10 +757,10 @@ public void testHttpServerFileExtensions() {
         assertEquals("image/x-freehand", tika.detect("x.fh7"));
         // Differ from httpd - An official mimetype has subsequently been issued
         //  favicon.ico +friends should now be image/vnd.microsoft.icon
-        //assertEquals("image/x-icon", tika.detect("x.ico"));
+        // assertEquals("image/x-icon", tika.detect("x.ico"));
         // Differ from httpd - An official mimetype has subsequently been issued
         //  pcx PiCture eXchange files should now be image/vnd.zbrush.pcx
-        //assertEquals("image/x-pcx", tika.detect("x.pcx"));
+        // assertEquals("image/x-pcx", tika.detect("x.pcx"));
         assertEquals("image/x-pict", tika.detect("x.pic"));
         assertEquals("image/x-pict", tika.detect("x.pct"));
         assertEquals("image/x-portable-anymap", tika.detect("x.pnm"));
@@ -784,7 +794,7 @@ public void testHttpServerFileExtensions() {
         assertEquals("text/plain", tika.detect("x.txt"));
         assertEquals("text/plain", tika.detect("x.text"));
         // Differ from httpd - Use a dedicated mimetype for Config files
-        //assertEquals("text/plain", tika.detect("x.conf"));
+        // assertEquals("text/plain", tika.detect("x.conf"));
         assertEquals("text/plain", tika.detect("x.def"));
         assertEquals("text/plain", tika.detect("x.list"));
         assertEquals("text/x-log", tika.detect("x.log"));
@@ -877,5 +887,4 @@ public void testHttpServerFileExtensions() {
         assertEquals("application/x-grib", tika.detect("x.grb2"));
         assertEquals("application/dif+xml", tika.detect("x.dif"));
     }
-
 }
diff --git a/tika-core/src/test/java/org/apache/tika/TikaIT.java b/tika-core/src/test/java/org/apache/tika/TikaIT.java
index 1604818512..db6f99cab6 100644
--- a/tika-core/src/test/java/org/apache/tika/TikaIT.java
+++ b/tika-core/src/test/java/org/apache/tika/TikaIT.java
@@ -30,5 +30,4 @@ public void testToString() {
         assertTrue(
                 version.matches("Apache Tika \\d+\\.\\d+\\.\\d+(-(?:ALPHA|BETA))?(?:-SNAPSHOT)?"));
     }
-
 }
diff --git a/tika-core/src/test/java/org/apache/tika/TikaTest.java b/tika-core/src/test/java/org/apache/tika/TikaTest.java
index a0a6377b89..275ed8df22 100644
--- a/tika-core/src/test/java/org/apache/tika/TikaTest.java
+++ b/tika-core/src/test/java/org/apache/tika/TikaTest.java
@@ -39,10 +39,7 @@
 import java.util.HashSet;
 import java.util.List;
 import java.util.Set;
-
 import org.apache.commons.io.IOUtils;
-import org.xml.sax.ContentHandler;
-
 import org.apache.tika.config.TikaConfig;
 import org.apache.tika.extractor.EmbeddedResourceHandler;
 import org.apache.tika.io.FilenameUtils;
@@ -59,10 +56,9 @@
 import org.apache.tika.sax.BodyContentHandler;
 import org.apache.tika.sax.RecursiveParserWrapperHandler;
 import org.apache.tika.sax.ToXMLContentHandler;
+import org.xml.sax.ContentHandler;
 
-/**
- * Parent class of Tika tests
- */
+/** Parent class of Tika tests */
 public abstract class TikaTest {
 
     protected static TikaConfig DEFAULT_TIKA_CONFIG;
@@ -75,6 +71,7 @@ public abstract class TikaTest {
             throw new RuntimeException(e);
         }
     }
+
     public static void assertContainsCount(String needle, String haystack, int targetCount) {
         int i = haystack.indexOf(needle);
         int count = 0;
@@ -82,8 +79,8 @@ public static void assertContainsCount(String needle, String haystack, int targe
             count++;
             i = haystack.indexOf(needle, i + 1);
         }
-        assertEquals(targetCount, count,
-                "found " + count + " but should have found: " + targetCount);
+        assertEquals(
+                targetCount, count, "found " + count + " but should have found: " + targetCount);
     }
 
     public static void assertContains(String needle, String haystack) {
@@ -102,9 +99,10 @@ public static  void assertNotContained(T needle, Collection hays
         assertFalse(haystack.contains(needle), needle + " unexpectedly found in:\n" + haystack);
     }
 
-    public static void assertMetadataListEquals(List metadataListA,
-                                          List metadataListB,
-                                    Set fieldsToIgnore) {
+    public static void assertMetadataListEquals(
+            List metadataListA,
+            List metadataListB,
+            Set fieldsToIgnore) {
         assertEquals(metadataListA.size(), metadataListB.size(), "different sizes");
         for (int i = 0; i < metadataListA.size(); i++) {
             Metadata mA = metadataListA.get(i);
@@ -115,8 +113,10 @@ public static void assertMetadataListEquals(List metadataListA,
                     continue;
                 }
                 mAFields.add(n);
-                assertArrayEquals(mA.getValues(n), mB.getValues(n), "problem with " + n +
-                        " in metadata index=" + i);
+                assertArrayEquals(
+                        mA.getValues(n),
+                        mB.getValues(n),
+                        "problem with " + n + " in metadata index=" + i);
             }
             Set mBFields = new HashSet<>();
             for (String n : mB.names()) {
@@ -130,14 +130,13 @@ public static void assertMetadataListEquals(List metadataListA,
     }
 
     /**
-     * Test that in at least one item in metadataList, all keys and values
-     * in minExpected are contained.
-     * 

- * The values in minExpected are tested for whether they are contained - * within a value in the target. If minExpected=&dquot;text/vbasic&dquot; and - * what was actually found in the target within metadatalist is - * &dquot;text/vbasic; charset=windows-1252&dquot;, - * that is counted as a hit. + * Test that in at least one item in metadataList, all keys and values in minExpected are + * contained. + * + *

The values in minExpected are tested for whether they are contained within a value in the + * target. If minExpected=&dquot;text/vbasic&dquot; and what was actually found in the target + * within metadatalist is &dquot;text/vbasic; charset=windows-1252&dquot;, that is counted as a + * hit. * * @param minExpected * @param metadataList @@ -160,11 +159,11 @@ public static void assertContainsAtLeast(Metadata minExpected, List me } } if (foundPropertyCount == minExpected.names().length) { - //found everything! + // found everything! return; } } - //TODO: figure out how to have more informative error message + // TODO: figure out how to have more informative error message fail("Couldn't find everything within a single metadata item"); } @@ -221,8 +220,8 @@ public URL getResourceAsUrl(String name) { * * @param name name of the desired resource * @return A {@link java.net.URI} object or null - * @throws URISyntaxException if this URL is not formatted strictly according to - * RFC2396 and cannot be converted to a URI. + * @throws URISyntaxException if this URL is not formatted strictly according to RFC2396 and + * cannot be converted to a URI. */ public URI getResourceAsUri(String name) throws URISyntaxException { URL url = getResourceAsUrl(name); @@ -233,13 +232,12 @@ public URI getResourceAsUri(String name) throws URISyntaxException { } /** - * This method will give you back the filename incl. the absolute path name - * to the resource. If the resource does not exist it will give you back the - * resource name incl. the path. + * This method will give you back the filename incl. the absolute path name to the resource. If + * the resource does not exist it will give you back the resource name incl. the path. * * @param name The named resource to search for. - * @return an absolute path incl. the name which is in the same directory as - * the the class you've called it from. + * @return an absolute path incl. 
the name which is in the same directory as the the class + * you've called it from. */ public File getResourceAsFile(String name) throws URISyntaxException { URI uri = getResourceAsUri(name); @@ -267,7 +265,10 @@ public InputStream getResourceAsStream(String name) { protected XMLResult getXML(String filePath, Parser parser, ParseContext context) throws Exception { - return getXML(getResourceAsStream("/test-documents/" + filePath), parser, new Metadata(), + return getXML( + getResourceAsStream("/test-documents/" + filePath), + parser, + new Metadata(), context); } @@ -279,22 +280,28 @@ protected XMLResult getXML(String filePath, ParseContext parseContext) throws Ex return getXML(filePath, AUTO_DETECT_PARSER, parseContext); } - protected XMLResult getXML(String filePath, Parser parser, Metadata metadata, - ParseContext parseContext) + protected XMLResult getXML( + String filePath, Parser parser, Metadata metadata, ParseContext parseContext) throws Exception { - return getXML(getResourceAsStream("/test-documents/" + filePath), parser, - metadata, parseContext); + return getXML( + getResourceAsStream("/test-documents/" + filePath), parser, metadata, parseContext); } protected XMLResult getXML(String filePath, Metadata metadata, ParseContext parseContext) throws Exception { - return getXML(getResourceAsStream("/test-documents/" + filePath), AUTO_DETECT_PARSER, - metadata, parseContext); + return getXML( + getResourceAsStream("/test-documents/" + filePath), + AUTO_DETECT_PARSER, + metadata, + parseContext); } protected XMLResult getXML(String filePath, Metadata metadata) throws Exception { - return getXML(getResourceAsStream("/test-documents/" + filePath), AUTO_DETECT_PARSER, - metadata, null); + return getXML( + getResourceAsStream("/test-documents/" + filePath), + AUTO_DETECT_PARSER, + metadata, + null); } protected XMLResult getXML(String filePath, Parser parser) throws Exception { @@ -304,8 +311,11 @@ protected XMLResult getXML(String filePath, Parser parser) throws 
Exception { } protected XMLResult getXML(String filePath) throws Exception { - return getXML(getResourceAsStream("/test-documents/" + filePath), AUTO_DETECT_PARSER, - new Metadata(), null); + return getXML( + getResourceAsStream("/test-documents/" + filePath), + AUTO_DETECT_PARSER, + new Metadata(), + null); } protected XMLResult getXML(InputStream input, Parser parser, Metadata metadata) @@ -313,8 +323,9 @@ protected XMLResult getXML(InputStream input, Parser parser, Metadata metadata) return getXML(input, parser, metadata, null); } - protected XMLResult getXML(InputStream input, Parser parser, Metadata metadata, - ParseContext context) throws Exception { + protected XMLResult getXML( + InputStream input, Parser parser, Metadata metadata, ParseContext context) + throws Exception { if (context == null) { context = new ParseContext(); } @@ -334,12 +345,13 @@ protected List getRecursiveMetadataFromFullPath(String path) throws Ex protected List getRecursiveMetadata(String filePath, boolean suppressException) throws Exception { - return getRecursiveMetadata(filePath, new Metadata(), new ParseContext(), - suppressException); + return getRecursiveMetadata( + filePath, new Metadata(), new ParseContext(), suppressException); } - protected List getRecursiveMetadata(String filePath, ParseContext parseContext, - boolean suppressException) throws Exception { + protected List getRecursiveMetadata( + String filePath, ParseContext parseContext, boolean suppressException) + throws Exception { return getRecursiveMetadata(filePath, new Metadata(), parseContext, suppressException); } @@ -347,11 +359,14 @@ protected List getRecursiveMetadata(String filePath) throws Exception return getRecursiveMetadata(filePath, new ParseContext()); } - protected List getRecursiveMetadata(String filePath, - BasicContentHandlerFactory.HANDLER_TYPE handlerType) - throws Exception { - return getRecursiveMetadata(filePath, TikaTest.AUTO_DETECT_PARSER, new Metadata(), - new ParseContext(), true, + 
protected List getRecursiveMetadata( + String filePath, BasicContentHandlerFactory.HANDLER_TYPE handlerType) throws Exception { + return getRecursiveMetadata( + filePath, + TikaTest.AUTO_DETECT_PARSER, + new Metadata(), + new ParseContext(), + true, handlerType); } @@ -360,51 +375,59 @@ protected List getRecursiveMetadata(String filePath, Metadata metadata return getRecursiveMetadata(filePath, metadata, new ParseContext()); } - protected List getRecursiveMetadata(String filePath, Metadata metadata, - ParseContext context) throws Exception { + protected List getRecursiveMetadata( + String filePath, Metadata metadata, ParseContext context) throws Exception { return getRecursiveMetadata(filePath, metadata, context, false); } - protected List getRecursiveMetadata(String filePath, Metadata metadata, - ParseContext context, boolean suppressException) + protected List getRecursiveMetadata( + String filePath, Metadata metadata, ParseContext context, boolean suppressException) throws Exception { - return getRecursiveMetadata(filePath, AUTO_DETECT_PARSER, metadata, context, - suppressException); + return getRecursiveMetadata( + filePath, AUTO_DETECT_PARSER, metadata, context, suppressException); } - protected List getRecursiveMetadata(String filePath, Parser wrapped, - Metadata metadata, ParseContext context, - boolean suppressException) throws Exception { + protected List getRecursiveMetadata( + String filePath, + Parser wrapped, + Metadata metadata, + ParseContext context, + boolean suppressException) + throws Exception { try (InputStream is = getResourceAsStream("/test-documents/" + filePath)) { return getRecursiveMetadata(is, wrapped, metadata, context, suppressException); } } - protected List getRecursiveMetadata(String filePath, Parser wrapped, - Metadata metadata, ParseContext context, - boolean suppressException, - BasicContentHandlerFactory.HANDLER_TYPE handlerType) + protected List getRecursiveMetadata( + String filePath, + Parser wrapped, + Metadata metadata, + 
ParseContext context, + boolean suppressException, + BasicContentHandlerFactory.HANDLER_TYPE handlerType) throws Exception { try (InputStream is = getResourceAsStream("/test-documents/" + filePath)) { - return getRecursiveMetadata(is, wrapped, metadata, context, suppressException, handlerType); + return getRecursiveMetadata( + is, wrapped, metadata, context, suppressException, handlerType); } } - protected List getRecursiveMetadata(Path path, ParseContext context, - boolean suppressException) throws Exception { + protected List getRecursiveMetadata( + Path path, ParseContext context, boolean suppressException) throws Exception { Metadata metadata = new Metadata(); try (TikaInputStream tis = TikaInputStream.get(path, metadata)) { - return getRecursiveMetadata(tis, AUTO_DETECT_PARSER, metadata, context, - suppressException); + return getRecursiveMetadata( + tis, AUTO_DETECT_PARSER, metadata, context, suppressException); } } - protected List getRecursiveMetadata(Path path, Parser parser, - boolean suppressException) throws Exception { + protected List getRecursiveMetadata( + Path path, Parser parser, boolean suppressException) throws Exception { Metadata metadata = new Metadata(); try (TikaInputStream tis = TikaInputStream.get(path, metadata)) { - return getRecursiveMetadata(tis, parser, metadata, new ParseContext(), - suppressException); + return getRecursiveMetadata( + tis, parser, metadata, new ParseContext(), suppressException); } } @@ -428,24 +451,30 @@ protected List getRecursiveMetadata(InputStream is, boolean suppressEx return getRecursiveMetadata(is, new Metadata(), new ParseContext(), suppressException); } - protected List getRecursiveMetadata(InputStream is, Parser parser, - boolean suppressException) throws Exception { - return getRecursiveMetadata(is, parser, new Metadata(), new ParseContext(), - suppressException); + protected List getRecursiveMetadata( + InputStream is, Parser parser, boolean suppressException) throws Exception { + return 
getRecursiveMetadata( + is, parser, new Metadata(), new ParseContext(), suppressException); } - protected List getRecursiveMetadata(InputStream is, Metadata metadata, - ParseContext context, boolean suppressException) + protected List getRecursiveMetadata( + InputStream is, Metadata metadata, ParseContext context, boolean suppressException) throws Exception { return getRecursiveMetadata(is, AUTO_DETECT_PARSER, metadata, context, suppressException); } - protected List getRecursiveMetadata(InputStream is, Parser p, Metadata metadata, - ParseContext context, boolean suppressException) + protected List getRecursiveMetadata( + InputStream is, + Parser p, + Metadata metadata, + ParseContext context, + boolean suppressException) throws Exception { RecursiveParserWrapper wrapper = new RecursiveParserWrapper(p); - RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler( - new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1)); + RecursiveParserWrapperHandler handler = + new RecursiveParserWrapperHandler( + new BasicContentHandlerFactory( + BasicContentHandlerFactory.HANDLER_TYPE.XML, -1)); try { wrapper.parse(is, handler, metadata, context); } catch (Exception e) { @@ -456,13 +485,17 @@ protected List getRecursiveMetadata(InputStream is, Parser p, Metadata return handler.getMetadataList(); } - protected List getRecursiveMetadata(InputStream is, Parser p, Metadata metadata, - ParseContext context, boolean suppressException, - BasicContentHandlerFactory.HANDLER_TYPE handlerType) + protected List getRecursiveMetadata( + InputStream is, + Parser p, + Metadata metadata, + ParseContext context, + boolean suppressException, + BasicContentHandlerFactory.HANDLER_TYPE handlerType) throws Exception { RecursiveParserWrapper wrapper = new RecursiveParserWrapper(p); - RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler( - new BasicContentHandlerFactory(handlerType, -1)); + RecursiveParserWrapperHandler handler = + new 
RecursiveParserWrapperHandler(new BasicContentHandlerFactory(handlerType, -1)); try { wrapper.parse(is, handler, metadata, context); } catch (Exception e) { @@ -477,8 +510,10 @@ protected List getRecursiveMetadata(String filePath, ParseContext cont throws Exception { RecursiveParserWrapper wrapper = new RecursiveParserWrapper(AUTO_DETECT_PARSER); - RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler( - new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1)); + RecursiveParserWrapperHandler handler = + new RecursiveParserWrapperHandler( + new BasicContentHandlerFactory( + BasicContentHandlerFactory.HANDLER_TYPE.XML, -1)); try (InputStream is = getResourceAsStream("/test-documents/" + filePath)) { wrapper.parse(is, handler, new Metadata(), context); } @@ -487,21 +522,24 @@ protected List getRecursiveMetadata(String filePath, ParseContext cont protected List getRecursiveMetadata(String filePath, Parser parserToWrap) throws Exception { - return getRecursiveMetadata(filePath, parserToWrap, - BasicContentHandlerFactory.HANDLER_TYPE.XML); + return getRecursiveMetadata( + filePath, parserToWrap, BasicContentHandlerFactory.HANDLER_TYPE.XML); } - protected List getRecursiveMetadata(String filePath, Parser parserToWrap, - BasicContentHandlerFactory.HANDLER_TYPE - handlerType) + protected List getRecursiveMetadata( + String filePath, + Parser parserToWrap, + BasicContentHandlerFactory.HANDLER_TYPE handlerType) throws Exception { return getRecursiveMetadata(filePath, parserToWrap, handlerType, new ParseContext()); } - protected List getRecursiveMetadata(String filePath, Parser parserToWrap, - BasicContentHandlerFactory.HANDLER_TYPE - handlerType, - ParseContext context) throws Exception { + protected List getRecursiveMetadata( + String filePath, + Parser parserToWrap, + BasicContentHandlerFactory.HANDLER_TYPE handlerType, + ParseContext context) + throws Exception { RecursiveParserWrapper wrapper = new 
RecursiveParserWrapper(parserToWrap); RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(new BasicContentHandlerFactory(handlerType, -1)); @@ -513,11 +551,13 @@ protected List getRecursiveMetadata(String filePath, Parser parserToWr return handler.getMetadataList(); } - protected List getRecursiveMetadata(String filePath, Parser parserToWrap, - ParseContext parseContext) throws Exception { + protected List getRecursiveMetadata( + String filePath, Parser parserToWrap, ParseContext parseContext) throws Exception { RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parserToWrap); - RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler( - new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1)); + RecursiveParserWrapperHandler handler = + new RecursiveParserWrapperHandler( + new BasicContentHandlerFactory( + BasicContentHandlerFactory.HANDLER_TYPE.XML, -1)); try (InputStream is = getResourceAsStream("/test-documents/" + filePath)) { wrapper.parse(is, handler, new Metadata(), parseContext); @@ -546,16 +586,17 @@ protected String getText(String filePath, Metadata metadata, ParseContext parseC return getText(filePath, AUTO_DETECT_PARSER, metadata, parseContext); } - protected String getText(String filePath, Parser parser, Metadata metadata, - ParseContext parseContext) throws Exception { - return getText(getResourceAsStream("/test-documents/" + filePath), parser, parseContext, - metadata); + protected String getText( + String filePath, Parser parser, Metadata metadata, ParseContext parseContext) + throws Exception { + return getText( + getResourceAsStream("/test-documents/" + filePath), parser, parseContext, metadata); } /** * Basic text extraction. - *

- * Tries to close input stream after processing. + * + *

Tries to close input stream after processing. */ public String getText(InputStream is, Parser parser, ParseContext context, Metadata metadata) throws Exception { @@ -595,8 +636,8 @@ public InputStream truncate(String testFileName, int truncatedLength) throws IOE } public List getAllTestFiles() { - //for now, just get main files - //TODO: fix this to be recursive + // for now, just get main files + // TODO: fix this to be recursive try { File[] pathArray = Paths.get(getResourceAsUri("/test-documents")).toFile().listFiles(); List paths = new ArrayList<>(); @@ -619,9 +660,7 @@ public XMLResult(String xml, Metadata metadata) { } } - /** - * Keeps track of media types and file names recursively. - */ + /** Keeps track of media types and file names recursively. */ public static class TrackingHandler implements EmbeddedResourceHandler { private final Set skipTypes; public List filenames = new ArrayList<>(); @@ -645,9 +684,7 @@ public void handle(String filename, MediaType mediaType, InputStream stream) { } } - /** - * Copies byte[] of embedded documents into a List. - */ + /** Copies byte[] of embedded documents into a List. 
*/ public static class ByteCopyingHandler implements EmbeddedResourceHandler { public List bytes = new ArrayList<>(); @@ -664,7 +701,7 @@ public void handle(String filename, MediaType mediaType, InputStream stream) { bytes.add(os.toByteArray()); stream.reset(); } catch (IOException e) { - //swallow + // swallow } } } diff --git a/tika-core/src/test/java/org/apache/tika/TypeDetectionBenchmark.java b/tika-core/src/test/java/org/apache/tika/TypeDetectionBenchmark.java index 9ff104c708..f44b7c8c56 100644 --- a/tika-core/src/test/java/org/apache/tika/TypeDetectionBenchmark.java +++ b/tika-core/src/test/java/org/apache/tika/TypeDetectionBenchmark.java @@ -21,7 +21,6 @@ import java.io.FileInputStream; import java.io.InputStream; import java.util.Locale; - import org.apache.commons.io.IOUtils; public class TypeDetectionBenchmark { @@ -51,8 +50,12 @@ private static void benchmark(File file) throws Exception { for (int i = 0; i < 1000; i++) { tika.detect(new ByteArrayInputStream(content)); } - System.out.printf(Locale.ROOT, "%6dns per Tika.detect(%s) = %s%n", - System.currentTimeMillis() - start, file, type); + System.out.printf( + Locale.ROOT, + "%6dns per Tika.detect(%s) = %s%n", + System.currentTimeMillis() - start, + file, + type); } } else if (file.isDirectory()) { for (File child : file.listFiles()) { @@ -60,5 +63,4 @@ private static void benchmark(File file) throws Exception { } } } - } diff --git a/tika-core/src/test/java/org/apache/tika/config/AbstractTikaConfigTest.java b/tika-core/src/test/java/org/apache/tika/config/AbstractTikaConfigTest.java index 763cb43fdd..459a1cd8ae 100644 --- a/tika-core/src/test/java/org/apache/tika/config/AbstractTikaConfigTest.java +++ b/tika-core/src/test/java/org/apache/tika/config/AbstractTikaConfigTest.java @@ -21,17 +21,14 @@ import java.net.URL; import java.nio.file.Path; import java.nio.file.Paths; - -import org.junit.jupiter.api.AfterEach; - import org.apache.tika.TikaTest; import org.apache.tika.parser.ParseContext; +import 
org.junit.jupiter.api.AfterEach; /** - * Parent of Junit test classes for {@link TikaConfig}, including - * Tika Core based ones, and ones in Tika Parsers that do things - * that {@link TikaConfigTest} can't, do due to a need for the - * full set of "real" classes of parsers / detectors + * Parent of Junit test classes for {@link TikaConfig}, including Tika Core based ones, and ones in + * Tika Parsers that do things that {@link TikaConfigTest} can't, do due to a need for the full set + * of "real" classes of parsers / detectors */ public abstract class AbstractTikaConfigTest extends TikaTest { protected static ParseContext context = new ParseContext(); @@ -42,7 +39,6 @@ protected static Path getConfigFilePath(String config) throws Exception { return Paths.get(url.toURI()); } - protected static String getConfigPath(String config) throws Exception { URL url = TikaConfig.class.getResource(config); assertNotNull(url, "Test Tika Config not found: " + config); diff --git a/tika-core/src/test/java/org/apache/tika/config/DummyExecutor.java b/tika-core/src/test/java/org/apache/tika/config/DummyExecutor.java index 185387cffe..72e4aeef02 100644 --- a/tika-core/src/test/java/org/apache/tika/config/DummyExecutor.java +++ b/tika-core/src/test/java/org/apache/tika/config/DummyExecutor.java @@ -19,7 +19,6 @@ import java.util.concurrent.LinkedBlockingQueue; import java.util.concurrent.ThreadPoolExecutor; import java.util.concurrent.TimeUnit; - import org.apache.tika.concurrent.ConfigurableThreadPoolExecutor; public class DummyExecutor extends ThreadPoolExecutor implements ConfigurableThreadPoolExecutor { diff --git a/tika-core/src/test/java/org/apache/tika/config/DummyParser.java b/tika-core/src/test/java/org/apache/tika/config/DummyParser.java index cea6c2f498..3557de46c0 100644 --- a/tika-core/src/test/java/org/apache/tika/config/DummyParser.java +++ b/tika-core/src/test/java/org/apache/tika/config/DummyParser.java @@ -17,7 +17,6 @@ package org.apache.tika.config; import 
java.util.Collection; - import org.apache.tika.mime.MediaTypeRegistry; import org.apache.tika.parser.CompositeParser; import org.apache.tika.parser.Parser; @@ -27,8 +26,10 @@ public class DummyParser extends CompositeParser implements Parser { private final ServiceLoader loader; - public DummyParser(MediaTypeRegistry registry, ServiceLoader loader, - Collection> excludeParsers) { + public DummyParser( + MediaTypeRegistry registry, + ServiceLoader loader, + Collection> excludeParsers) { this.loader = loader; } diff --git a/tika-core/src/test/java/org/apache/tika/config/MockConfigTest.java b/tika-core/src/test/java/org/apache/tika/config/MockConfigTest.java index 9bbe8ede99..56ae304f5f 100644 --- a/tika-core/src/test/java/org/apache/tika/config/MockConfigTest.java +++ b/tika-core/src/test/java/org/apache/tika/config/MockConfigTest.java @@ -1,17 +1,15 @@ /** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - *

- * http://www.apache.org/licenses/LICENSE-2.0 - *

- * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at + * + *

http://www.apache.org/licenses/LICENSE-2.0 + * + *

Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either + * express or implied. See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.tika.config; @@ -22,10 +20,8 @@ import java.io.InputStream; import java.util.List; import java.util.Map; - -import org.junit.jupiter.api.Test; - import org.apache.tika.exception.TikaException; +import org.junit.jupiter.api.Test; public class MockConfigTest { @@ -44,7 +40,6 @@ public void testBasic() throws Exception { assertEquals("two", config.getMyStrings().get(1)); } - public static class MockConfig extends ConfigBase { private Map mappings; diff --git a/tika-core/src/test/java/org/apache/tika/config/ParamTest.java b/tika-core/src/test/java/org/apache/tika/config/ParamTest.java index 5f5321d5d4..0393540b26 100644 --- a/tika-core/src/test/java/org/apache/tika/config/ParamTest.java +++ b/tika-core/src/test/java/org/apache/tika/config/ParamTest.java @@ -1,17 +1,15 @@ /** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - *

- * http://www.apache.org/licenses/LICENSE-2.0 - *

- * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at + * + *

http://www.apache.org/licenses/LICENSE-2.0 + * + *

Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either + * express or implied. See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.tika.config; @@ -26,7 +24,6 @@ import java.net.URL; import java.util.ArrayList; import java.util.List; - import org.junit.jupiter.api.Test; public class ParamTest { @@ -39,11 +36,20 @@ public void testSaveAndLoad() throws Exception { list.add("brown"); list.add("fox"); Object[] objects = - new Object[]{list, Integer.MAX_VALUE, 2.5f, 4000.57576, true, false, Long.MAX_VALUE, - "Hello this is a boring string", new URL("http://apache.org"), - new URI("tika://org.apache.tika.ner.parser?impl=xyz"), - new BigInteger(Long.MAX_VALUE + "").add( - new BigInteger(Long.MAX_VALUE + "")), new File("."),}; + new Object[] { + list, + Integer.MAX_VALUE, + 2.5f, + 4000.57576, + true, + false, + Long.MAX_VALUE, + "Hello this is a boring string", + new URL("http://apache.org"), + new URI("tika://org.apache.tika.ner.parser?impl=xyz"), + new BigInteger(Long.MAX_VALUE + "").add(new BigInteger(Long.MAX_VALUE + "")), + new File("."), + }; for (Object object : objects) { String name = "name" + System.currentTimeMillis(); @@ -64,5 +70,4 @@ public void testSaveAndLoad() throws Exception { assertEquals(loaded.getType(), object.getClass()); } } - } diff --git a/tika-core/src/test/java/org/apache/tika/config/TikaConfigSerializerTest.java b/tika-core/src/test/java/org/apache/tika/config/TikaConfigSerializerTest.java index dafdd641f0..81db088b9d 100644 --- a/tika-core/src/test/java/org/apache/tika/config/TikaConfigSerializerTest.java +++ b/tika-core/src/test/java/org/apache/tika/config/TikaConfigSerializerTest.java @@ -17,7 +17,6 @@ package org.apache.tika.config; - import static org.junit.jupiter.api.Assertions.assertEquals; import static 
org.junit.jupiter.api.Assertions.assertTrue; @@ -25,49 +24,51 @@ import java.io.InputStream; import java.io.StringWriter; import java.nio.charset.StandardCharsets; - -import org.junit.jupiter.api.Disabled; -import org.junit.jupiter.api.Test; - import org.apache.tika.parser.CompositeParser; import org.apache.tika.parser.Parser; import org.apache.tika.parser.ParserDecorator; import org.apache.tika.parser.mock.MockParser; import org.apache.tika.parser.multiple.FallbackParser; +import org.junit.jupiter.api.Disabled; +import org.junit.jupiter.api.Test; public class TikaConfigSerializerTest extends TikaConfigTest { /** - * TIKA-1445 It should be possible to exclude DefaultParser from - * certain types, so another parser explicitly listed will take them + * TIKA-1445 It should be possible to exclude DefaultParser from certain types, so another + * parser explicitly listed will take them */ @Test public void defaultParserWithExcludes() throws Exception { String xml = loadAndSerialize("TIKA-1445-default-except.xml", TikaConfigSerializer.Mode.STATIC); assertContains( - "" + " fail/world" + - " " + - "", xml); + "" + + " fail/world" + + " " + + "", + xml); } @Test public void testEncodingDetectors() throws Exception { String xml = loadAndSerialize("TIKA-1762-executors.xml", TikaConfigSerializer.Mode.STATIC); - assertContains(" " + - " " + - "", xml); + assertContains( + " " + + " " + + "", + xml); } @Test public void testMultipleWithFallback() throws Exception { TikaConfig config = getConfig("TIKA-1509-multiple-fallback.xml"); StringWriter writer = new StringWriter(); - TikaConfigSerializer.serialize(config, - TikaConfigSerializer.Mode.STATIC_FULL, writer, StandardCharsets.UTF_8); + TikaConfigSerializer.serialize( + config, TikaConfigSerializer.Mode.STATIC_FULL, writer, StandardCharsets.UTF_8); try (InputStream is = - new ByteArrayInputStream(writer.toString().getBytes(StandardCharsets.UTF_8))) { + new ByteArrayInputStream(writer.toString().getBytes(StandardCharsets.UTF_8))) 
{ config = new TikaConfig(is); } @@ -90,9 +91,12 @@ public void testMultipleWithFallback() throws Exception { @Disabled("TODO: executor-service info needs to be stored in TikaConfig for serialization") public void testExecutors() throws Exception { String xml = loadAndSerialize("TIKA-1762-executors.xml", TikaConfigSerializer.Mode.STATIC); - assertContains("" + - " 3" + " 10" + - "", xml); + assertContains( + "" + + " 3" + + " 10" + + "", + xml); } String loadAndSerialize(String configFile, TikaConfigSerializer.Mode mode) throws Exception { diff --git a/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java b/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java index 7fa0217294..9178dd702d 100644 --- a/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java +++ b/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java @@ -31,9 +31,6 @@ import java.util.List; import java.util.Map; import java.util.concurrent.ThreadPoolExecutor; - -import org.junit.jupiter.api.Test; - import org.apache.tika.ResourceLoggingClassLoader; import org.apache.tika.exception.TikaConfigException; import org.apache.tika.exception.TikaException; @@ -51,18 +48,17 @@ import org.apache.tika.parser.mock.MockParser; import org.apache.tika.parser.multiple.FallbackParser; import org.apache.tika.utils.XMLReaderUtils; +import org.junit.jupiter.api.Test; /** - * Tests for the Tika Config, which don't require real parsers / - * detectors / etc. - * There's also {@link TikaParserConfigTest} and {@link TikaDetectorConfigTest} - * over in the Tika Parsers project, which do further Tika Config - * testing using real parsers and detectors. + * Tests for the Tika Config, which don't require real parsers / detectors / etc. There's also + * {@link TikaParserConfigTest} and {@link TikaDetectorConfigTest} over in the Tika Parsers project, + * which do further Tika Config testing using real parsers and detectors. 
*/ public class TikaConfigTest extends AbstractTikaConfigTest { /** - * Make sure that a configuration file can't reference the - * {@link AutoDetectParser} class a <parser> configuration element. + * Make sure that a configuration file can't reference the {@link AutoDetectParser} class a + * <parser> configuration element. * * @see TIKA-866 */ @@ -76,9 +72,8 @@ public void withInvalidParser() throws Exception { } /** - * Make sure that with a service loader given, we can - * get different configurable behaviour on parser classes - * which can't be found. + * Make sure that with a service loader given, we can get different configurable behaviour on + * parser classes which can't be found. */ @Test public void testUnknownParser() throws Exception { @@ -108,9 +103,8 @@ public void testUnknownParser() throws Exception { } /** - * Make sure that a configuration file can reference also a composite - * parser class like {@link DefaultParser} in a <parser> - * configuration element. + * Make sure that a configuration file can reference also a composite parser class like {@link + * DefaultParser} in a <parser> configuration element. * * @see TIKA-866 */ @@ -124,8 +118,8 @@ public void asCompositeParser() throws Exception { } /** - * Make sure that a valid configuration file without mimetypes or - * detector entries can be loaded without problems. + * Make sure that a valid configuration file without mimetypes or detector entries can be loaded + * without problems. 
* * @see TIKA-866 */ @@ -139,9 +133,8 @@ public void onlyValidParser() throws Exception { } /** - * TIKA-1145 If the TikaConfig has a ClassLoader set on it, - * that should be used when loading the mimetypes and when - * discovering services + * TIKA-1145 If the TikaConfig has a ClassLoader set on it, that should be used when loading the + * mimetypes and when discovering services */ @Test public void ensureClassLoaderUsedEverywhere() throws Exception { @@ -162,14 +155,15 @@ public void ensureClassLoaderUsedEverywhere() throws Exception { Map> resources = customLoader.getLoadedResources(); int resourcesCount = resources.size(); - assertTrue(resourcesCount > 3, + assertTrue( + resourcesCount > 3, "Not enough things used the classloader, found only " + resourcesCount); // Ensure everything that should do, did use it // - Parsers assertNotNull(resources.get("META-INF/services/org.apache.tika.parser.Parser")); // - Detectors - //assertNotNull(resources.get("META-INF/services/org.apache.tika.detect.Detector")); + // assertNotNull(resources.get("META-INF/services/org.apache.tika.detect.Detector")); // - Built-In Mimetypes assertNotNull(resources.get("org/apache/tika/mime/tika-mimetypes.xml")); // - Custom Mimetypes @@ -177,8 +171,8 @@ public void ensureClassLoaderUsedEverywhere() throws Exception { } /** - * TIKA-1445 It should be possible to exclude DefaultParser from - * certain types, so another parser explicitly listed will take them + * TIKA-1445 It should be possible to exclude DefaultParser from certain types, so another + * parser explicitly listed will take them */ @Test public void defaultParserWithExcludes() throws Exception { @@ -215,8 +209,8 @@ public void defaultParserWithExcludes() throws Exception { } /** - * TIKA-1653 If one parser has child parsers, those child parsers shouldn't - * show up at the top level as well + * TIKA-1653 If one parser has child parsers, those child parsers shouldn't show up at the top + * level as well */ @Test public void 
parserWithChildParsers() throws Exception { @@ -268,41 +262,46 @@ public void testTikaExecutorServiceFromConfig() throws Exception { assertTrue((executorService instanceof DummyExecutor), "Should use Dummy Executor"); assertEquals(3, executorService.getCorePoolSize(), "Should have configured Core Threads"); - assertEquals(10, executorService.getMaximumPoolSize(), - "Should have configured Max Threads"); + assertEquals( + 10, executorService.getMaximumPoolSize(), "Should have configured Max Threads"); } @Test public void testInitializerBadValue() throws Exception { - assertThrows(TikaConfigException.class, () -> { - TikaConfig config = getConfig("TIKA-2389-illegal.xml"); - }); + assertThrows( + TikaConfigException.class, + () -> { + TikaConfig config = getConfig("TIKA-2389-illegal.xml"); + }); } - @Test public void testInitializerPerParserThrow() throws Exception { - assertThrows(TikaConfigException.class, () -> { - TikaConfig config = getConfig("TIKA-2389-throw-per-parser.xml"); - }); + assertThrows( + TikaConfigException.class, + () -> { + TikaConfig config = getConfig("TIKA-2389-throw-per-parser.xml"); + }); } @Test public void testInitializerServiceLoaderThrow() throws Exception { - assertThrows(TikaConfigException.class, () -> { - TikaConfig config = getConfig("TIKA-2389-throw-default.xml"); - }); + assertThrows( + TikaConfigException.class, + () -> { + TikaConfig config = getConfig("TIKA-2389-throw-default.xml"); + }); } @Test public void testInitializerServiceLoaderThrowButOverridden() throws Exception { - //TODO: test that this was logged at INFO level + // TODO: test that this was logged at INFO level TikaConfig config = getConfig("TIKA-2389-throw-default-overridden.xml"); } @Test public void testInitializerPerParserWarn() throws Exception { - //TODO: test that this was logged at WARN level + // TODO: test that this was logged at WARN level TikaConfig config = getConfig("TIKA-2389-warn-per-parser.xml"); } @@ -327,20 +326,22 @@ public void 
testMultipleWithFallback() throws Exception { @Test public void testXMLReaderUtils() throws Exception { - //pool size may have been reset already by an - //earlier test. Can't test for default here. - assertEquals(XMLReaderUtils.DEFAULT_MAX_ENTITY_EXPANSIONS, + // pool size may have been reset already by an + // earlier test. Can't test for default here. + assertEquals( + XMLReaderUtils.DEFAULT_MAX_ENTITY_EXPANSIONS, XMLReaderUtils.getMaxEntityExpansions()); - //make sure that detection on this file actually works with - //default expansions - assertEquals("application/rdf+xml", + // make sure that detection on this file actually works with + // default expansions + assertEquals( + "application/rdf+xml", detect("test-difficult-rdf1.xml", TikaConfig.getDefaultConfig()).toString()); TikaConfig tikaConfig = getConfig("TIKA-2732-xmlreaderutils.xml"); try { assertEquals(33, XMLReaderUtils.getPoolSize()); assertEquals(5, XMLReaderUtils.getMaxEntityExpansions()); - //make sure that there's actually a change in behavior + // make sure that there's actually a change in behavior assertEquals("text/plain", detect("test-difficult-rdf1.xml", tikaConfig).toString()); } finally { XMLReaderUtils.setMaxEntityExpansions(XMLReaderUtils.DEFAULT_MAX_ENTITY_EXPANSIONS); @@ -356,27 +357,33 @@ private MediaType detect(String testFileName, TikaConfig tikaConfig) throws Exce @Test public void testXMLReaderUtilsException() throws Exception { - assertThrows(NumberFormatException.class, () -> { - getConfig("TIKA-2732-xmlreaderutils-exc.xml"); - }); + assertThrows( + NumberFormatException.class, + () -> { + getConfig("TIKA-2732-xmlreaderutils-exc.xml"); + }); } @Test public void testXMLReaderUtilsUnspecifiedAttribute() throws Exception { TikaConfig tikaConfig = getConfig("TIKA-3551-xmlreaderutils.xml"); - assertEquals(XMLReaderUtils.DEFAULT_MAX_ENTITY_EXPANSIONS, XMLReaderUtils.getMaxEntityExpansions()); + assertEquals( + XMLReaderUtils.DEFAULT_MAX_ENTITY_EXPANSIONS, + 
XMLReaderUtils.getMaxEntityExpansions()); } @Test public void testBadExclude() throws Exception { - assertThrows(TikaConfigException.class, () -> { - getConfig("TIKA-3268-bad-parser-exclude.xml"); - }); + assertThrows( + TikaConfigException.class, + () -> { + getConfig("TIKA-3268-bad-parser-exclude.xml"); + }); } @Test public void testTimesInitiated() throws Exception { - //this prevents multi-threading tests, but we aren't doing that now... + // this prevents multi-threading tests, but we aren't doing that now... MockParser.resetTimesInitiated(); TikaConfig tikaConfig = new TikaConfig(TikaConfigTest.class.getResourceAsStream("mock-exclude.xml")); diff --git a/tika-core/src/test/java/org/apache/tika/config/TikaPipesConfigTest.java b/tika-core/src/test/java/org/apache/tika/config/TikaPipesConfigTest.java index 3ea1e538ce..2cae67ac8c 100644 --- a/tika-core/src/test/java/org/apache/tika/config/TikaPipesConfigTest.java +++ b/tika-core/src/test/java/org/apache/tika/config/TikaPipesConfigTest.java @@ -1,17 +1,15 @@ /** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - *

- * http://www.apache.org/licenses/LICENSE-2.0 - *

- * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at + * + *

http://www.apache.org/licenses/LICENSE-2.0 + * + *

Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either + * express or implied. See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.tika.config; @@ -24,9 +22,6 @@ import java.nio.file.Path; import java.nio.file.Paths; import java.util.List; - -import org.junit.jupiter.api.Test; - import org.apache.tika.exception.TikaConfigException; import org.apache.tika.pipes.CompositePipesReporter; import org.apache.tika.pipes.PipesReporter; @@ -38,9 +33,10 @@ import org.apache.tika.pipes.fetcher.FetcherManager; import org.apache.tika.pipes.fetcher.fs.FileSystemFetcher; import org.apache.tika.pipes.pipesiterator.PipesIterator; +import org.junit.jupiter.api.Test; public class TikaPipesConfigTest extends AbstractTikaConfigTest { - //this handles tests for the newer pipes type configs. + // this handles tests for the newer pipes type configs. 
@Test public void testFetchers() throws Exception { @@ -54,27 +50,31 @@ public void testFetchers() throws Exception { @Test public void testDuplicateFetchers() throws Exception { - //can't have two fetchers with the same name - assertThrows(TikaConfigException.class, () -> { - FetcherManager.load(getConfigFilePath("fetchers-duplicate-config.xml")); - }); + // can't have two fetchers with the same name + assertThrows( + TikaConfigException.class, + () -> { + FetcherManager.load(getConfigFilePath("fetchers-duplicate-config.xml")); + }); } @Test public void testNoNameFetchers() throws Exception { - //can't have two fetchers with an empty name - assertThrows(TikaConfigException.class, () -> { - FetcherManager.load(getConfigFilePath("fetchers-noname-config.xml")); - }); + // can't have two fetchers with an empty name + assertThrows( + TikaConfigException.class, + () -> { + FetcherManager.load(getConfigFilePath("fetchers-noname-config.xml")); + }); } @Test public void testNoBasePathFetchers() throws Exception { - //no basepath is allowed as of > 2.3.0 - //test that this does not throw an exception. + // no basepath is allowed as of > 2.3.0 + // test that this does not throw an exception. 
- FetcherManager fetcherManager = FetcherManager.load( - getConfigFilePath("fetchers-nobasepath-config.xml")); + FetcherManager fetcherManager = + FetcherManager.load(getConfigFilePath("fetchers-nobasepath-config.xml")); } @Test @@ -89,36 +89,41 @@ public void testEmitters() throws Exception { @Test public void testDuplicateEmitters() throws Exception { - assertThrows(TikaConfigException.class, () -> { - EmitterManager.load(getConfigFilePath("emitters-duplicate-config.xml")); - }); + assertThrows( + TikaConfigException.class, + () -> { + EmitterManager.load(getConfigFilePath("emitters-duplicate-config.xml")); + }); } @Test public void testPipesIterator() throws Exception { - PipesIterator it = - PipesIterator.build(getConfigFilePath("pipes-iterator-config.xml")); + PipesIterator it = PipesIterator.build(getConfigFilePath("pipes-iterator-config.xml")); assertEquals("fs1", it.getFetcherName()); } @Test public void testMultiplePipesIterators() throws Exception { - assertThrows(TikaConfigException.class, () -> { - PipesIterator it = - PipesIterator.build(getConfigFilePath("pipes-iterator-multiple-config.xml")); - assertEquals("fs1", it.getFetcherName()); - }); + assertThrows( + TikaConfigException.class, + () -> { + PipesIterator it = + PipesIterator.build( + getConfigFilePath("pipes-iterator-multiple-config.xml")); + assertEquals("fs1", it.getFetcherName()); + }); } + @Test public void testParams() throws Exception { - //This test makes sure that pre 2.7.x configs that still contain element - //in ConfigBase derived objects still work. + // This test makes sure that pre 2.7.x configs that still contain element + // in ConfigBase derived objects still work. 
Path configPath = getConfigFilePath("TIKA-3865-params.xml"); AsyncConfig asyncConfig = AsyncConfig.load(configPath); PipesReporter reporter = asyncConfig.getPipesReporter(); assertTrue(reporter instanceof CompositePipesReporter); - List reporters = ((CompositePipesReporter)reporter).getPipesReporters(); - assertEquals("somethingOrOther1", ((MockReporter)reporters.get(0)).getEndpoint()); - assertEquals("somethingOrOther2", ((MockReporter)reporters.get(1)).getEndpoint()); + List reporters = ((CompositePipesReporter) reporter).getPipesReporters(); + assertEquals("somethingOrOther1", ((MockReporter) reporters.get(0)).getEndpoint()); + assertEquals("somethingOrOther2", ((MockReporter) reporters.get(1)).getEndpoint()); } } diff --git a/tika-core/src/test/java/org/apache/tika/detect/FileCommandDetectorTest.java b/tika-core/src/test/java/org/apache/tika/detect/FileCommandDetectorTest.java index dbb8220e74..8b8912ca20 100644 --- a/tika-core/src/test/java/org/apache/tika/detect/FileCommandDetectorTest.java +++ b/tika-core/src/test/java/org/apache/tika/detect/FileCommandDetectorTest.java @@ -20,14 +20,12 @@ import static org.junit.jupiter.api.Assumptions.assumeTrue; import java.io.InputStream; - -import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.api.Test; - import org.apache.tika.config.TikaConfig; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; public class FileCommandDetectorTest { @@ -44,28 +42,30 @@ public static void setUp() throws Exception { public void testBasic() throws Exception { assumeTrue(FileCommandDetector.checkHasFile()); - try (InputStream is = getClass() - .getResourceAsStream("/test-documents/basic_embedded.xml")) { - //run more than once to ensure that the input stream is reset + try (InputStream is = + getClass().getResourceAsStream("/test-documents/basic_embedded.xml")) { + // run more 
than once to ensure that the input stream is reset for (int i = 0; i < 2; i++) { Metadata metadata = new Metadata(); MediaType answer = DETECTOR.detect(is, metadata); String fileMime = metadata.get(FileCommandDetector.FILE_MIME); - assertTrue(MediaType.text("xml").equals(answer) || - MediaType.application("xml").equals(answer)); - assertTrue("application/xml".equals(fileMime) || - "text/xml".equals(fileMime)); + assertTrue( + MediaType.text("xml").equals(answer) + || MediaType.application("xml").equals(answer)); + assertTrue("application/xml".equals(fileMime) || "text/xml".equals(fileMime)); } } - //now try with TikaInputStream - try (InputStream is = TikaInputStream - .get(getClass().getResourceAsStream("/test-documents/basic_embedded.xml"))) { - //run more than once to ensure that the input stream is reset + // now try with TikaInputStream + try (InputStream is = + TikaInputStream.get( + getClass().getResourceAsStream("/test-documents/basic_embedded.xml"))) { + // run more than once to ensure that the input stream is reset for (int i = 0; i < 2; i++) { MediaType answer = DETECTOR.detect(is, new Metadata()); - assertTrue(MediaType.text("xml").equals(answer) || - MediaType.application("xml").equals(answer)); + assertTrue( + MediaType.text("xml").equals(answer) + || MediaType.application("xml").equals(answer)); } } } diff --git a/tika-core/src/test/java/org/apache/tika/detect/MagicDetectorTest.java b/tika-core/src/test/java/org/apache/tika/detect/MagicDetectorTest.java index 3a86a53b36..dc9cfed5ee 100644 --- a/tika-core/src/test/java/org/apache/tika/detect/MagicDetectorTest.java +++ b/tika-core/src/test/java/org/apache/tika/detect/MagicDetectorTest.java @@ -26,17 +26,13 @@ import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; - import org.apache.commons.io.IOUtils; -import org.junit.jupiter.api.Test; - import org.apache.tika.config.TikaConfig; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; 
+import org.junit.jupiter.api.Test; -/** - * Test cases for the {@link MagicDetector} class. - */ +/** Test cases for the {@link MagicDetector} class. */ public class MagicDetectorTest { @Test @@ -73,9 +69,13 @@ public void testDetectOffsetRange() throws Exception { assertDetect(detector, MediaType.OCTET_STREAM, " html"); assertDetect(detector, MediaType.OCTET_STREAM, ""); @@ -96,9 +101,13 @@ public void testDetectMask() throws Exception { assertDetect(detector, MediaType.OCTET_STREAM, "" + - "XHTML test document"); + assertDetect( + detector, + xhtml, + "" + + "XHTML test document"); } @Test public void testDetectRegExOptions() throws Exception { - String pattern = "(?s)\\A.{0,1024}\\x3c\\!(?:DOCTYPE|doctype) (?:HTML|html) " + - "(?:PUBLIC|public) \"-//.{1,16}//(?:DTD|dtd) .{0,64}" + "(?:HTML|html) 4\\.01"; - - String data = "" + - "HTML document" + "

Hello world!"; - - String data1 = "" + - "HTML document" + "

Hello world!"; - - String data2 = "" + - "HTML document" + "

Hello world!"; + String pattern = + "(?s)\\A.{0,1024}\\x3c\\!(?:DOCTYPE|doctype) (?:HTML|html) " + + "(?:PUBLIC|public) \"-//.{1,16}//(?:DTD|dtd) .{0,64}" + + "(?:HTML|html) 4\\.01"; + + String data = + "" + + "HTML document" + + "

Hello world!"; + + String data1 = + "" + + "HTML document" + + "

Hello world!"; + + String data2 = + "" + + "HTML document" + + "

Hello world!"; MediaType html = new MediaType("text", "html"); Detector detector = new MagicDetector(html, pattern.getBytes(US_ASCII), null, true, 0, 0); @@ -171,8 +199,9 @@ public void testDetectStreamReadProblems() throws Exception { @Test public void testDetectApplicationEnviHdr() throws Exception { - InputStream iStream = MagicDetectorTest.class - .getResourceAsStream("/test-documents/ang20150420t182050_corr_v1e_img.hdr"); + InputStream iStream = + MagicDetectorTest.class.getResourceAsStream( + "/test-documents/ang20150420t182050_corr_v1e_img.hdr"); byte[] data = IOUtils.toByteArray(iStream); MediaType testMT = new MediaType("application", "envi.hdr"); Detector detector = new MagicDetector(testMT, data, null, false, 0, 0); @@ -225,19 +254,13 @@ private void assertDetect(Detector detector, MediaType type, byte[] bytes) { } } - /** - * InputStream class that does not read in all available bytes in - * one go. - */ + /** InputStream class that does not read in all available bytes in one go. */ private static class RestrictiveInputStream extends ByteArrayInputStream { public RestrictiveInputStream(byte[] buf) { super(buf); } - /** - * Prevent reading the entire len of bytes if requesting more - * than 10 bytes. - */ + /** Prevent reading the entire len of bytes if requesting more than 10 bytes. 
*/ public int read(byte[] b, int off, int len) { if (len > 10) { return super.read(b, off, len - 10); @@ -250,18 +273,23 @@ public int read(byte[] b, int off, int len) { @Test public void testBZ2Detection() throws Exception { Detector detector = new TikaConfig().getDetector(); - for (String bz2 : new String[]{"bzip2-8-file.txt.bz2", - "empty-file.txt.bz2", "lbzip2-8-file.txt.bz2", - "small-file.txt.bz2", "test-file-1.csv.bz2", - "test-file-2.csv.bz2"}) { + for (String bz2 : + new String[] { + "bzip2-8-file.txt.bz2", + "empty-file.txt.bz2", + "lbzip2-8-file.txt.bz2", + "small-file.txt.bz2", + "test-file-1.csv.bz2", + "test-file-2.csv.bz2" + }) { assertEquals("application/x-bzip2", detect(detector, bz2)); } } - private String detect(Detector detector, String bz2Name) throws IOException { - try (InputStream is = new BufferedInputStream( - this.getClass().getResourceAsStream( - "/test-documents/bz2/" + bz2Name))) { + private String detect(Detector detector, String bz2Name) throws IOException { + try (InputStream is = + new BufferedInputStream( + this.getClass().getResourceAsStream("/test-documents/bz2/" + bz2Name))) { return detector.detect(is, new Metadata()).toString(); } } diff --git a/tika-core/src/test/java/org/apache/tika/detect/MimeDetectionWithNNTest.java b/tika-core/src/test/java/org/apache/tika/detect/MimeDetectionWithNNTest.java index 293f423d2c..30d23193f7 100644 --- a/tika-core/src/test/java/org/apache/tika/detect/MimeDetectionWithNNTest.java +++ b/tika-core/src/test/java/org/apache/tika/detect/MimeDetectionWithNNTest.java @@ -22,13 +22,11 @@ import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; - -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; - import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; import org.apache.tika.mime.MimeDetectionTest; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; public class MimeDetectionWithNNTest { 
@@ -43,10 +41,10 @@ public void setUp() { } /** - * The test case only works on the detector that only has grb model as - * currently the grb model is used as an example; if more models are added - * in the TrainedModelDetector, the following tests will need to modified to reflect - * the corresponding type instead of test-equal with the "OCTET_STREAM"; + * The test case only works on the detector that only has grb model as currently the grb model + * is used as an example; if more models are added in the TrainedModelDetector, the following + * tests will need to modified to reflect the corresponding type instead of test-equal with the + * "OCTET_STREAM"; * * @throws Exception */ @@ -69,7 +67,9 @@ public void testDetection() throws Exception { testFile(octetStream_str, "test-utf16be.xml"); testFile(octetStream_str, "test-long-comment.xml"); testFile(octetStream_str, "stylesheet.xsl"); - testUrl(octetStream_str, "http://www.ai.sri.com/daml/services/owl-s/1.2/Process.owl", + testUrl( + octetStream_str, + "http://www.ai.sri.com/daml/services/owl-s/1.2/Process.owl", "test-difficult-rdf1.xml"); testUrl(octetStream_str, "http://www.w3.org/2002/07/owl#", "test-difficult-rdf2.xml"); // add evil test from TIKA-327 @@ -108,27 +108,25 @@ private void testStream(String expected, String urlOrFileName, InputStream in) try { Metadata metadata = new Metadata(); String mime = this.detector.detect(in, metadata).toString(); - assertEquals(expected, mime, - urlOrFileName + " is not properly detected: detected."); + assertEquals(expected, mime, urlOrFileName + " is not properly detected: detected."); // Add resource name and test again // metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, urlOrFileName); mime = this.detector.detect(in, metadata).toString(); - assertEquals(expected, mime, + assertEquals( + expected, + mime, urlOrFileName + " is not properly detected after adding resource name."); } finally { in.close(); } } - /** - * Test for type detection of empty documents. 
- */ + /** Test for type detection of empty documents. */ @Test public void testEmptyDocument() throws IOException { - assertEquals(MediaType.OCTET_STREAM, + assertEquals( + MediaType.OCTET_STREAM, detector.detect(new ByteArrayInputStream(new byte[0]), new Metadata())); - } - } diff --git a/tika-core/src/test/java/org/apache/tika/detect/NameDetectorTest.java b/tika-core/src/test/java/org/apache/tika/detect/NameDetectorTest.java index dc15299afc..f232025ce0 100644 --- a/tika-core/src/test/java/org/apache/tika/detect/NameDetectorTest.java +++ b/tika-core/src/test/java/org/apache/tika/detect/NameDetectorTest.java @@ -23,17 +23,13 @@ import java.util.HashMap; import java.util.Map; import java.util.regex.Pattern; - -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; - import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.mime.MediaType; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; -/** - * Test cases for the {@link NameDetector} class. - */ +/** Test cases for the {@link NameDetector} class. 
*/ public class NameDetectorTest { private Detector detector; @@ -50,27 +46,27 @@ public void setUp() { @Test public void testDetect() { assertDetect(MediaType.TEXT_PLAIN, "text.txt"); - assertDetect(MediaType.TEXT_PLAIN, "text.txt "); // trailing space - assertDetect(MediaType.TEXT_PLAIN, "text.txt\n"); // trailing newline + assertDetect(MediaType.TEXT_PLAIN, "text.txt "); // trailing space + assertDetect(MediaType.TEXT_PLAIN, "text.txt\n"); // trailing newline assertDetect(MediaType.TEXT_PLAIN, "text.txt?a=b"); // URL query assertDetect(MediaType.TEXT_PLAIN, "text.txt#abc"); // URL fragment - assertDetect(MediaType.TEXT_PLAIN, "text%2Etxt"); // URL encoded - assertDetect(MediaType.TEXT_PLAIN, "text.TXT"); // case insensitive + assertDetect(MediaType.TEXT_PLAIN, "text%2Etxt"); // URL encoded + assertDetect(MediaType.TEXT_PLAIN, "text.TXT"); // case insensitive assertDetect(MediaType.OCTET_STREAM, "text.txt.gz"); assertDetect(MediaType.TEXT_PLAIN, "README"); - assertDetect(MediaType.TEXT_PLAIN, " README "); // space around - assertDetect(MediaType.TEXT_PLAIN, "\tREADME\n"); // other whitespace - assertDetect(MediaType.TEXT_PLAIN, "/a/README"); // leading path - assertDetect(MediaType.TEXT_PLAIN, "\\b\\README"); // windows path - assertDetect(MediaType.OCTET_STREAM, "ReadMe"); // case sensitive + assertDetect(MediaType.TEXT_PLAIN, " README "); // space around + assertDetect(MediaType.TEXT_PLAIN, "\tREADME\n"); // other whitespace + assertDetect(MediaType.TEXT_PLAIN, "/a/README"); // leading path + assertDetect(MediaType.TEXT_PLAIN, "\\b\\README"); // windows path + assertDetect(MediaType.OCTET_STREAM, "ReadMe"); // case sensitive assertDetect(MediaType.OCTET_STREAM, "README.NOW"); // TIKA-1928 # in the filename assertDetect(MediaType.TEXT_PLAIN, "text.txt"); - assertDetect(MediaType.TEXT_PLAIN, "text#.txt"); // # before extension - assertDetect(MediaType.TEXT_PLAIN, "text#123.txt");// # before extension - assertDetect(MediaType.TEXT_PLAIN, "text.txt#pdf");// # after 
extension + assertDetect(MediaType.TEXT_PLAIN, "text#.txt"); // # before extension + assertDetect(MediaType.TEXT_PLAIN, "text#123.txt"); // # before extension + assertDetect(MediaType.TEXT_PLAIN, "text.txt#pdf"); // # after extension // TIKA-3783 # before the final . assertDetect(MediaType.TEXT_PLAIN, "ABC#192.168.0.1#2.txt"); @@ -82,7 +78,7 @@ public void testDetect() { // tough one assertDetect(MediaType.TEXT_PLAIN, " See http://www.example.com:1234/README.txt?a=b#c \n"); assertDetect(MediaType.TEXT_PLAIN, "See README.txt"); // even this! - assertDetect(MediaType.OCTET_STREAM, "See README"); // but not this + assertDetect(MediaType.OCTET_STREAM, "See README"); // but not this assertDetect(MediaType.application("envi.hdr"), "ang20150420t182050_corr_v1e_img.hdr"); diff --git a/tika-core/src/test/java/org/apache/tika/detect/TextDetectorTest.java b/tika-core/src/test/java/org/apache/tika/detect/TextDetectorTest.java index 1870033d9e..377adc970f 100644 --- a/tika-core/src/test/java/org/apache/tika/detect/TextDetectorTest.java +++ b/tika-core/src/test/java/org/apache/tika/detect/TextDetectorTest.java @@ -24,15 +24,11 @@ import java.io.IOException; import java.io.InputStream; import java.util.Arrays; - -import org.junit.jupiter.api.Test; - import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; +import org.junit.jupiter.api.Test; -/** - * Test cases for the {@link TextDetector} class. - */ +/** Test cases for the {@link TextDetector} class. 
*/ public class TextDetectorTest { private final Detector detector = new TextDetector(); @@ -56,9 +52,9 @@ public void testDetectEmpty() throws Exception { public void testDetectText() throws Exception { assertText("Hello, World!".getBytes(UTF_8)); assertText(" \t\r\n".getBytes(UTF_8)); - assertNotText(new byte[]{-1, -2, -3, 0x09, 0x0A, 0x0C, 0x0D, 0x1B}); - assertNotText(new byte[]{0}); - assertNotText(new byte[]{'H', 'e', 'l', 'l', 'o', 0}); + assertNotText(new byte[] {-1, -2, -3, 0x09, 0x0A, 0x0C, 0x0D, 0x1B}); + assertNotText(new byte[] {0}); + assertNotText(new byte[] {'H', 'e', 'l', 'l', 'o', 0}); byte[] data = new byte[512]; Arrays.fill(data, (byte) '.'); @@ -99,11 +95,11 @@ private void assertText(byte[] data) { private void assertNotText(byte[] data) { try { - assertEquals(MediaType.OCTET_STREAM, + assertEquals( + MediaType.OCTET_STREAM, detector.detect(new ByteArrayInputStream(data), new Metadata())); } catch (IOException e) { fail("Unexpected exception from TextDetector"); } } - } diff --git a/tika-core/src/test/java/org/apache/tika/detect/TypeDetectorTest.java b/tika-core/src/test/java/org/apache/tika/detect/TypeDetectorTest.java index d79e9b7e50..5ad556fd3c 100644 --- a/tika-core/src/test/java/org/apache/tika/detect/TypeDetectorTest.java +++ b/tika-core/src/test/java/org/apache/tika/detect/TypeDetectorTest.java @@ -22,15 +22,11 @@ import java.io.IOException; import java.util.Map; import java.util.TreeMap; - -import org.junit.jupiter.api.Test; - import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; +import org.junit.jupiter.api.Test; -/** - * Test cases for the {@link TypeDetector} class. - */ +/** Test cases for the {@link TypeDetector} class. 
*/ public class TypeDetectorTest { private static Map params = new TreeMap<>(); @@ -41,7 +37,6 @@ public class TypeDetectorTest { private static MediaType TEXT_PLAIN_A_EQ_B = new MediaType("text", "plain", params); - private final Detector detector = new TypeDetector(); @Test @@ -73,5 +68,4 @@ private void assertDetect(MediaType type, String name) { fail("TypeDetector should never throw an IOException"); } } - } diff --git a/tika-core/src/test/java/org/apache/tika/detect/ZeroSizeFileDetectorTest.java b/tika-core/src/test/java/org/apache/tika/detect/ZeroSizeFileDetectorTest.java index 852711671d..68bbb29b21 100644 --- a/tika-core/src/test/java/org/apache/tika/detect/ZeroSizeFileDetectorTest.java +++ b/tika-core/src/test/java/org/apache/tika/detect/ZeroSizeFileDetectorTest.java @@ -23,12 +23,10 @@ import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; - -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; - import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; public class ZeroSizeFileDetectorTest { @@ -59,5 +57,4 @@ private void detect(byte[] data, MediaType type) { fail("Unexpected exception from ZeroSizeFileDetector"); } } - } diff --git a/tika-core/src/test/java/org/apache/tika/fork/ForkParserTest.java b/tika-core/src/test/java/org/apache/tika/fork/ForkParserTest.java index b233b44e16..ab24f151c5 100644 --- a/tika-core/src/test/java/org/apache/tika/fork/ForkParserTest.java +++ b/tika-core/src/test/java/org/apache/tika/fork/ForkParserTest.java @@ -45,14 +45,7 @@ import java.util.concurrent.Executors; import java.util.concurrent.Semaphore; import java.util.concurrent.TimeUnit; - import org.apache.commons.io.IOUtils; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.io.TempDir; -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; -import 
org.xml.sax.helpers.DefaultHandler; - import org.apache.tika.TikaTest; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; @@ -67,16 +60,20 @@ import org.apache.tika.sax.BodyContentHandler; import org.apache.tika.sax.ContentHandlerFactory; import org.apache.tika.sax.RecursiveParserWrapperHandler; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; public class ForkParserTest extends TikaTest { - @TempDir - Path tempDir; + @TempDir Path tempDir; @Test public void testHelloWorld() throws Exception { - try (ForkParser parser = new ForkParser(ForkParserTest.class.getClassLoader(), - new ForkTestParser())) { + try (ForkParser parser = + new ForkParser(ForkParserTest.class.getClassLoader(), new ForkTestParser())) { Metadata metadata = new Metadata(); ContentHandler output = new BodyContentHandler(); InputStream stream = new ByteArrayInputStream(new byte[0]); @@ -89,8 +86,8 @@ public void testHelloWorld() throws Exception { @Test public void testSerialParsing() throws Exception { - try (ForkParser parser = new ForkParser(ForkParserTest.class.getClassLoader(), - new ForkTestParser())) { + try (ForkParser parser = + new ForkParser(ForkParserTest.class.getClassLoader(), new ForkTestParser())) { ParseContext context = new ParseContext(); for (int i = 0; i < 10; i++) { ContentHandler output = new BodyContentHandler(); @@ -103,8 +100,8 @@ public void testSerialParsing() throws Exception { @Test public void testParallelParsing() throws Exception { - try (ForkParser parser = new ForkParser(ForkParserTest.class.getClassLoader(), - new ForkTestParser())) { + try (ForkParser parser = + new ForkParser(ForkParserTest.class.getClassLoader(), new ForkTestParser())) { final ParseContext context = new ParseContext(); Thread[] threads = new Thread[10]; @@ -112,14 +109,16 @@ public void testParallelParsing() throws 
Exception { for (int i = 0; i < threads.length; i++) { final ContentHandler o = new BodyContentHandler(); output[i] = o; - threads[i] = new Thread(() -> { - try { - InputStream stream = new ByteArrayInputStream(new byte[0]); - parser.parse(stream, o, new Metadata(), context); - } catch (Exception e) { - e.printStackTrace(); - } - }); + threads[i] = + new Thread( + () -> { + try { + InputStream stream = new ByteArrayInputStream(new byte[0]); + parser.parse(stream, o, new Metadata(), context); + } catch (Exception e) { + e.printStackTrace(); + } + }); threads[i].start(); } @@ -132,30 +131,33 @@ public void testParallelParsing() throws Exception { @Test public void testPoolSizeReached() throws Exception { - try (ForkParser parser = new ForkParser(ForkParserTest.class.getClassLoader(), - new ForkTestParser())) { + try (ForkParser parser = + new ForkParser(ForkParserTest.class.getClassLoader(), new ForkTestParser())) { final Semaphore barrier = new Semaphore(0); Thread[] threads = new Thread[parser.getPoolSize()]; PipedOutputStream[] pipes = new PipedOutputStream[threads.length]; final ParseContext context = new ParseContext(); for (int i = 0; i < threads.length; i++) { - final PipedInputStream input = new PipedInputStream() { - @Override - public synchronized int read() throws IOException { - barrier.release(); - return super.read(); - } - }; + final PipedInputStream input = + new PipedInputStream() { + @Override + public synchronized int read() throws IOException { + barrier.release(); + return super.read(); + } + }; pipes[i] = new PipedOutputStream(input); - threads[i] = new Thread(() -> { - try { - ContentHandler o = new DefaultHandler(); - parser.parse(input, o, new Metadata(), context); - } catch (Exception e) { - e.printStackTrace(); - } - }); + threads[i] = + new Thread( + () -> { + try { + ContentHandler o = new DefaultHandler(); + parser.parse(input, o, new Metadata(), context); + } catch (Exception e) { + e.printStackTrace(); + } + }); threads[i].start(); } 
@@ -163,15 +165,17 @@ public synchronized int read() throws IOException { barrier.acquire(parser.getPoolSize()); final ContentHandler o = new BodyContentHandler(); - Thread blocked = new Thread(() -> { - try { - barrier.release(); - InputStream stream = new ByteArrayInputStream(new byte[0]); - parser.parse(stream, o, new Metadata(), context); - } catch (Exception e) { - e.printStackTrace(); - } - }); + Thread blocked = + new Thread( + () -> { + try { + barrier.release(); + InputStream stream = new ByteArrayInputStream(new byte[0]); + parser.parse(stream, o, new Metadata(), context); + } catch (Exception e) { + e.printStackTrace(); + } + }); blocked.start(); // Wait until the last thread is started, and then some to @@ -200,40 +204,48 @@ public void testPulseAndTimeouts() throws Exception { forkParser.setServerPulseMillis(500); forkParser.setServerParseTimeoutMillis(5000); forkParser.setServerWaitTimeoutMillis(60000); - String sleepCommand = "\n" + " Hello, World!\n" + - " \n" + - ""; + String sleepCommand = + "\n" + + " Hello, World!\n" + + " \n" + + ""; ContentHandler o = new BodyContentHandler(-1); Metadata m = new Metadata(); ParseContext c = new ParseContext(); try { - forkParser - .parse(new ByteArrayInputStream(sleepCommand.getBytes(StandardCharsets.UTF_8)), - o, m, c); + forkParser.parse( + new ByteArrayInputStream(sleepCommand.getBytes(StandardCharsets.UTF_8)), + o, + m, + c); fail("should have thrown IOException"); } catch (TikaException e) { - //failed to communicate with forked parser process" + // failed to communicate with forked parser process" } finally { forkParser.close(); } - //test setting very short pulse (10 ms) and a parser that takes at least 1000 ms + // test setting very short pulse (10 ms) and a parser that takes at least 1000 ms forkParser = new ForkParser(ForkParserTest.class.getClassLoader(), new MockParser()); forkParser.setServerPulseMillis(10); forkParser.setServerParseTimeoutMillis(100); - sleepCommand = "\n" + " Hello, World!\n" + 
- " \n" + - ""; + sleepCommand = + "\n" + + " Hello, World!\n" + + " \n" + + ""; o = new BodyContentHandler(-1); m = new Metadata(); c = new ParseContext(); try { - forkParser - .parse(new ByteArrayInputStream(sleepCommand.getBytes(StandardCharsets.UTF_8)), - o, m, c); + forkParser.parse( + new ByteArrayInputStream(sleepCommand.getBytes(StandardCharsets.UTF_8)), + o, + m, + c); fail("Should have thrown exception"); } catch (IOException | TikaException e) { - //"should have thrown IOException lost connection" + // "should have thrown IOException lost connection" } finally { forkParser.close(); } @@ -241,8 +253,10 @@ public void testPulseAndTimeouts() throws Exception { @Test public void testPackageCanBeAccessed() throws Exception { - try (ForkParser parser = new ForkParser(ForkParserTest.class.getClassLoader(), - new ForkTestParser.ForkTestParserAccessingPackage())) { + try (ForkParser parser = + new ForkParser( + ForkParserTest.class.getClassLoader(), + new ForkTestParser.ForkTestParserAccessingPackage())) { Metadata metadata = new Metadata(); ContentHandler output = new BodyContentHandler(); InputStream stream = new ByteArrayInputStream(new byte[0]); @@ -257,9 +271,10 @@ public void testPackageCanBeAccessed() throws Exception { public void testRecursiveParserWrapper() throws Exception { Parser parser = new AutoDetectParser(); RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parser); - RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler( - new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, - 20000)); + RecursiveParserWrapperHandler handler = + new RecursiveParserWrapperHandler( + new BasicContentHandlerFactory( + BasicContentHandlerFactory.HANDLER_TYPE.TEXT, 20000)); try (ForkParser fork = new ForkParser(ForkParserTest.class.getClassLoader(), wrapper); InputStream is = getResourceAsStream("/test-documents/basic_embedded.xml")) { Metadata metadata = new Metadata(); @@ -282,9 +297,10 @@ public void 
testRecursiveParserWrapper() throws Exception { public void testRPWWithEmbeddedNPE() throws Exception { Parser parser = new AutoDetectParser(); RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parser); - RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler( - new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, - 20000)); + RecursiveParserWrapperHandler handler = + new RecursiveParserWrapperHandler( + new BasicContentHandlerFactory( + BasicContentHandlerFactory.HANDLER_TYPE.TEXT, 20000)); try (ForkParser fork = new ForkParser(ForkParserTest.class.getClassLoader(), wrapper); InputStream is = getResourceAsStream("/test-documents/embedded_with_npe.xml")) { Metadata metadata = new Metadata(); @@ -301,17 +317,18 @@ public void testRPWWithEmbeddedNPE() throws Exception { assertEquals("embeddedAuthor", m1.get(TikaCoreProperties.CREATOR)); assertContains("some_embedded_content", m1.get(TikaCoreProperties.TIKA_CONTENT)); assertEquals("/embed1.xml", m1.get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH)); - assertContains("another null pointer exception", - m1.get(TikaCoreProperties.EMBEDDED_EXCEPTION)); + assertContains( + "another null pointer exception", m1.get(TikaCoreProperties.EMBEDDED_EXCEPTION)); } @Test public void testRPWWithMainDocNPE() throws Exception { Parser parser = new AutoDetectParser(); RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parser); - RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler( - new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, - 20000)); + RecursiveParserWrapperHandler handler = + new RecursiveParserWrapperHandler( + new BasicContentHandlerFactory( + BasicContentHandlerFactory.HANDLER_TYPE.TEXT, 20000)); try (ForkParser fork = new ForkParser(ForkParserTest.class.getClassLoader(), wrapper); InputStream is = getResourceAsStream("/test-documents/embedded_then_npe.xml")) { Metadata metadata = new Metadata(); @@ -336,15 +353,15 @@ 
public void testRPWWithMainDocNPE() throws Exception { @Test public void testToFileHandler() throws Exception { - //test that a server-side write-to-file works without proxying back the - //AbstractContentHandlerFactory + // test that a server-side write-to-file works without proxying back the + // AbstractContentHandlerFactory Path target = Files.createTempFile(tempDir, "fork-to-file-handler-", ".txt"); try (InputStream is = getResourceAsStream("/test-documents/basic_embedded.xml")) { RecursiveParserWrapper wrapper = new RecursiveParserWrapper(new AutoDetectParser()); ToFileHandler toFileHandler = new ToFileHandler(new SBContentHandlerFactory(), target.toFile()); - try (ForkParser forkParser = new ForkParser(ForkParserTest.class.getClassLoader(), - wrapper)) { + try (ForkParser forkParser = + new ForkParser(ForkParserTest.class.getClassLoader(), wrapper)) { Metadata m = new Metadata(); ParseContext context = new ParseContext(); forkParser.parse(is, toFileHandler, m, context); @@ -355,16 +372,21 @@ public void testToFileHandler() throws Exception { try (Reader reader = Files.newBufferedReader(target, StandardCharsets.UTF_8)) { contents = IOUtils.toString(reader); } - assertContainsCount(TikaCoreProperties.TIKA_PARSED_BY.getName() + - " : org.apache.tika.parser.DefaultParser", contents, 2); - assertContainsCount(TikaCoreProperties.TIKA_PARSED_BY.getName() + - " : org.apache.tika.parser.mock.MockParser", contents, 2); + assertContainsCount( + TikaCoreProperties.TIKA_PARSED_BY.getName() + + " : org.apache.tika.parser.DefaultParser", + contents, + 2); + assertContainsCount( + TikaCoreProperties.TIKA_PARSED_BY.getName() + + " : org.apache.tika.parser.mock.MockParser", + contents, + 2); assertContains("Nikolai Lobachevsky", contents); assertContains("embeddedAuthor", contents); assertContains("main_content", contents); assertContains("some_embedded_content", contents); assertContains("X-TIKA:embedded_resource_path : /embed1.xml", contents); - } @Test @@ -394,7 +416,6 @@ 
public void testRecursiveParserWrapperWithProxyingContentHandlersAndMetadata() assertEquals("/embed1.xml", m1.get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH)); } - @Test public void testRPWWithNonSerializableContentHandler() throws Exception { Parser parser = new AutoDetectParser(); @@ -439,20 +460,23 @@ public void testNoUTFDataFormatException() throws Exception { public void testForkParserDoesntPreventShutdown() throws Exception { ExecutorService service = Executors.newFixedThreadPool(1); CountDownLatch cdl = new CountDownLatch(1); - service.submit(() -> { - try (ForkParser parser = new ForkParser(ForkParserTest.class.getClassLoader(), - new ForkTestParser.ForkTestParserWaiting())) { - Metadata metadata = new Metadata(); - ContentHandler output = new BodyContentHandler(); - InputStream stream = new ByteArrayInputStream(new byte[0]); - ParseContext context = new ParseContext(); - cdl.countDown(); - parser.parse(stream, output, metadata, context); - // Don't care about output not planning to get this far - } catch (IOException | SAXException | TikaException e) { - throw new RuntimeException(e); - } - }); + service.submit( + () -> { + try (ForkParser parser = + new ForkParser( + ForkParserTest.class.getClassLoader(), + new ForkTestParser.ForkTestParserWaiting())) { + Metadata metadata = new Metadata(); + ContentHandler output = new BodyContentHandler(); + InputStream stream = new ByteArrayInputStream(new byte[0]); + ParseContext context = new ParseContext(); + cdl.countDown(); + parser.parse(stream, output, metadata, context); + // Don't care about output not planning to get this far + } catch (IOException | SAXException | TikaException e) { + throw new RuntimeException(e); + } + }); // Wait to make sure submitted runnable is actually running boolean await = cdl.await(1, TimeUnit.SECONDS); if (!await) { @@ -464,14 +488,15 @@ public void testForkParserDoesntPreventShutdown() throws Exception { service.shutdownNow(); service.awaitTermination(15, TimeUnit.SECONDS); long 
secondsSinceShutdown = ChronoUnit.SECONDS.between(requestShutdown, Instant.now()); - assertTrue(secondsSinceShutdown < 5, "Should have shutdown the service in less than 5 seconds"); + assertTrue( + secondsSinceShutdown < 5, + "Should have shutdown the service in less than 5 seconds"); } - - //use this to test that the wrapper handler is acted upon by the server but not proxied back + // use this to test that the wrapper handler is acted upon by the server but not proxied back private static class ToFileHandler extends AbstractRecursiveParserWrapperHandler { - //this needs to be a file because a File is serializable + // this needs to be a file because a File is serializable private final File file; private OutputStream os; @@ -579,8 +604,8 @@ public ContentHandler getNewContentHandler(OutputStream os, Charset charset) { private static class LyingNonSerializableContentHandler extends DefaultHandler implements Serializable { - //StringWriter makes this class not actually Serializable - //as is. + // StringWriter makes this class not actually Serializable + // as is. StringWriter writer = new StringWriter(); @Override @@ -594,9 +619,9 @@ public String toString() { } } - //use this to test that a handler that extends RecursiveParserWrapperHandler - //does have both contenthandlers and metadata objects proxied back from the - //server. + // use this to test that a handler that extends RecursiveParserWrapperHandler + // does have both contenthandlers and metadata objects proxied back from the + // server. 
private static class BufferingHandler extends RecursiveParserWrapperHandler { List contentHandlers = new ArrayList<>(); @@ -604,7 +629,6 @@ public BufferingHandler(ContentHandlerFactory contentHandlerFactory) { super(contentHandlerFactory); } - @Override public void endEmbeddedDocument(ContentHandler contentHandler, Metadata metadata) throws SAXException { @@ -627,6 +651,5 @@ public List getContentHandlers() { public List getMetadataList() { return metadataList; } - } } diff --git a/tika-core/src/test/java/org/apache/tika/fork/ForkParserTikaBinTest.java b/tika-core/src/test/java/org/apache/tika/fork/ForkParserTikaBinTest.java index 4756f00abe..7e768572ab 100644 --- a/tika-core/src/test/java/org/apache/tika/fork/ForkParserTikaBinTest.java +++ b/tika-core/src/test/java/org/apache/tika/fork/ForkParserTikaBinTest.java @@ -18,6 +18,7 @@ import static org.junit.jupiter.api.Assertions.assertEquals; +import com.google.common.reflect.ClassPath; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; @@ -32,15 +33,7 @@ import java.util.function.Predicate; import java.util.jar.JarEntry; import java.util.jar.JarOutputStream; - -import com.google.common.reflect.ClassPath; import org.apache.commons.io.IOUtils; -import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.io.TempDir; -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.TikaTest; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; @@ -48,13 +41,17 @@ import org.apache.tika.parser.AutoDetectParserFactory; import org.apache.tika.parser.ParseContext; import org.apache.tika.sax.ToXMLContentHandler; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; public class ForkParserTikaBinTest extends TikaTest { private static final String 
JAR_FILE_NAME = "mock-tika-app.jar"; private static final Map EMPTY_MAP = Collections.emptyMap(); - @TempDir - private static Path JAR_DIR; + @TempDir private static Path JAR_DIR; private static Path JAR_FILE; @BeforeAll @@ -65,29 +62,38 @@ public static void bootstrapJar() throws Exception { ClassLoader loader = ForkServer.class.getClassLoader(); ClassPath classPath = ClassPath.from(loader); addClasses(jarOs, classPath, ci -> ci.getPackageName().startsWith("org.slf4j")); - addClasses(jarOs, classPath, ci -> ci.getPackageName().startsWith("org.apache.logging")); - addClasses(jarOs, classPath, + addClasses( + jarOs, classPath, ci -> ci.getPackageName().startsWith("org.apache.logging")); + addClasses( + jarOs, + classPath, ci -> ci.getPackageName().startsWith("org.apache.commons.io")); - //exclude TypeDetectionBenchmark because it is not serializable - //exclude UpperCasingContentHandler because we want to test that - //we can serialize it from the parent process into the forked process - addClasses(jarOs, classPath, ci -> ci.getPackageName().startsWith("org.apache.tika") && - (!ci.getName().contains("TypeDetectionBenchmark")) && - (!ci.getName().contains("UpperCasingContentHandler"))); - - try (InputStream input = ForkParserTikaBinTest.class - .getResourceAsStream("/org/apache/tika/config/TIKA-2653-vowel-parser-ae.xml")) { + // exclude TypeDetectionBenchmark because it is not serializable + // exclude UpperCasingContentHandler because we want to test that + // we can serialize it from the parent process into the forked process + addClasses( + jarOs, + classPath, + ci -> + ci.getPackageName().startsWith("org.apache.tika") + && (!ci.getName().contains("TypeDetectionBenchmark")) + && (!ci.getName().contains("UpperCasingContentHandler"))); + + try (InputStream input = + ForkParserTikaBinTest.class.getResourceAsStream( + "/org/apache/tika/config/TIKA-2653-vowel-parser-ae.xml")) { jarOs.putNextEntry( new JarEntry("org/apache/tika/parser/TIKA-2653-vowel-parser-ae.xml")); 
IOUtils.copy(input, jarOs); } - try (InputStream input = ForkParserTikaBinTest.class - .getResourceAsStream("/org/apache/tika/mime/tika-mimetypes.xml")) { + try (InputStream input = + ForkParserTikaBinTest.class.getResourceAsStream( + "/org/apache/tika/mime/tika-mimetypes.xml")) { jarOs.putNextEntry(new JarEntry("org/apache/tika/mime/tika-mimetypes.xml")); IOUtils.copy(input, jarOs); } - try (InputStream input = ForkParserTikaBinTest.class - .getResourceAsStream("/custom-mimetypes.xml")) { + try (InputStream input = + ForkParserTikaBinTest.class.getResourceAsStream("/custom-mimetypes.xml")) { jarOs.putNextEntry(new JarEntry("custom-mimetypes.xml")); IOUtils.copy(input, jarOs); } @@ -98,15 +104,17 @@ public static void bootstrapJar() throws Exception { } Path tikaConfigVowelParser = JAR_DIR.resolve("TIKA_2653-iou.xml"); - try (InputStream is = ForkServer.class - .getResourceAsStream("/org/apache/tika/config/TIKA-2653-vowel-parser-iou.xml"); + try (InputStream is = + ForkServer.class.getResourceAsStream( + "/org/apache/tika/config/TIKA-2653-vowel-parser-iou.xml"); OutputStream os = Files.newOutputStream(tikaConfigVowelParser)) { IOUtils.copy(is, os); } } - private static void addClasses(JarOutputStream jarOs, ClassPath classPath, - Predicate predicate) throws IOException { + private static void addClasses( + JarOutputStream jarOs, ClassPath classPath, Predicate predicate) + throws IOException { for (ClassPath.ClassInfo classInfo : classPath.getAllClasses()) { if (predicate.test(classInfo)) { jarOs.putNextEntry(new JarEntry(classInfo.getResourceName())); @@ -118,8 +126,9 @@ private static void addClasses(JarOutputStream jarOs, ClassPath classPath, @Test public void testExplicitParserFactory() throws Exception { XMLResult xmlResult = - getXML(new ParserFactoryFactory("org.apache.tika.parser.mock.MockParserFactory", - EMPTY_MAP)); + getXML( + new ParserFactoryFactory( + "org.apache.tika.parser.mock.MockParserFactory", EMPTY_MAP)); assertContains("hello world!", 
xmlResult.xml); assertEquals("Nikolai Lobachevsky", xmlResult.metadata.get(TikaCoreProperties.CREATOR)); } @@ -127,8 +136,8 @@ public void testExplicitParserFactory() throws Exception { @Test public void testVowelParserAsDefault() throws Exception { ParserFactoryFactory pff = - new ParserFactoryFactory("org.apache.tika.parser.AutoDetectParserFactory", - EMPTY_MAP); + new ParserFactoryFactory( + "org.apache.tika.parser.AutoDetectParserFactory", EMPTY_MAP); XMLResult xmlResult = getXML(pff); assertContains("eooeuiooueoeeao", xmlResult.xml); assertEquals("Nikolai Lobachevsky", xmlResult.metadata.get(TikaCoreProperties.CREATOR)); @@ -138,19 +147,18 @@ public void testVowelParserAsDefault() throws Exception { public void testVowelParserInClassPath() throws Exception { Map args = new HashMap<>(); args.put(AutoDetectParserFactory.TIKA_CONFIG_PATH, "TIKA-2653-vowel-parser-ae.xml"); - ParserFactoryFactory pff = new ParserFactoryFactory( - "org.apache.tika.parser.AutoDetectParserFactory", - args); + ParserFactoryFactory pff = + new ParserFactoryFactory("org.apache.tika.parser.AutoDetectParserFactory", args); XMLResult xmlResult = getXML(pff); assertContains("eeeeea", xmlResult.xml); - assertEquals("Nikolai Lobachevsky", - xmlResult.metadata.get(TikaCoreProperties.CREATOR)); + assertEquals("Nikolai Lobachevsky", xmlResult.metadata.get(TikaCoreProperties.CREATOR)); } @Test public void testVowelParserFromDirectory() throws Exception { Map args = new HashMap<>(); - args.put(AutoDetectParserFactory.TIKA_CONFIG_PATH, + args.put( + AutoDetectParserFactory.TIKA_CONFIG_PATH, JAR_DIR.resolve("TIKA_2653-iou.xml").toAbsolutePath().toString()); ParserFactoryFactory pff = new ParserFactoryFactory("org.apache.tika.parser.AutoDetectParserFactory", args); @@ -161,17 +169,16 @@ public void testVowelParserFromDirectory() throws Exception { @Test public void testPFFWithClassLoaderFromParentProcess() throws Exception { - //The UpperCasingContentHandler is not sent to the bootstrap test jar file 
in @BeforeClass. - //this tests that the content handler was loaded from the parent process. + // The UpperCasingContentHandler is not sent to the bootstrap test jar file in @BeforeClass. + // this tests that the content handler was loaded from the parent process. ParserFactoryFactory pff = - new ParserFactoryFactory("org.apache.tika.parser.AutoDetectParserFactory", - EMPTY_MAP); + new ParserFactoryFactory( + "org.apache.tika.parser.AutoDetectParserFactory", EMPTY_MAP); XMLResult xmlResult = getXML(pff, this.getClass().getClassLoader(), new UpperCasingContentHandler()); assertContains("EOOEUIOOUEOEEAO", xmlResult.xml); assertEquals("Nikolai Lobachevsky", xmlResult.metadata.get(TikaCoreProperties.CREATOR)); - } private XMLResult getXML(ParserFactoryFactory pff) @@ -179,8 +186,8 @@ private XMLResult getXML(ParserFactoryFactory pff) return getXML(pff, null, null); } - private XMLResult getXML(ParserFactoryFactory pff, ClassLoader classloader, - ContentHandler contentHandler) + private XMLResult getXML( + ParserFactoryFactory pff, ClassLoader classloader, ContentHandler contentHandler) throws TikaException, SAXException, IOException { List java = new ArrayList<>(); diff --git a/tika-core/src/test/java/org/apache/tika/fork/ForkTestParser.java b/tika-core/src/test/java/org/apache/tika/fork/ForkTestParser.java index e9c6949fa2..6ae7ff96e3 100644 --- a/tika-core/src/test/java/org/apache/tika/fork/ForkTestParser.java +++ b/tika-core/src/test/java/org/apache/tika/fork/ForkTestParser.java @@ -22,10 +22,6 @@ import java.io.InputStream; import java.util.Collections; import java.util.Set; - -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.exception.TikaException; import org.apache.tika.fork.unusedpackage.ClassInUnusedPackage; import org.apache.tika.metadata.Metadata; @@ -33,20 +29,21 @@ import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.sax.XHTMLContentHandler; +import 
org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; class ForkTestParser implements Parser { - /** - * Serial version UID - */ + /** Serial version UID */ private static final long serialVersionUID = -5492269783593452319L; public Set getSupportedTypes(ParseContext context) { return Collections.singleton(MediaType.TEXT_PLAIN); } - public void parse(InputStream stream, ContentHandler handler, Metadata metadata, - ParseContext context) throws IOException, SAXException, TikaException { + public void parse( + InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) + throws IOException, SAXException, TikaException { stream.read(); metadata.set(Metadata.CONTENT_TYPE, "text/plain"); @@ -60,8 +57,9 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, static class ForkTestParserAccessingPackage extends ForkTestParser { @Override - public void parse(InputStream stream, ContentHandler handler, Metadata metadata, - ParseContext context) throws IOException, SAXException, TikaException { + public void parse( + InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) + throws IOException, SAXException, TikaException { assertNotNull(ClassInUnusedPackage.class.getPackage()); super.parse(stream, handler, metadata, context); } @@ -69,8 +67,9 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, static class ForkTestParserWaiting extends ForkTestParser { @Override - public void parse(InputStream stream, ContentHandler handler, Metadata metadata, - ParseContext context) throws IOException, SAXException, TikaException { + public void parse( + InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) + throws IOException, SAXException, TikaException { try { Thread.sleep(10_000); } catch (InterruptedException e) { diff --git a/tika-core/src/test/java/org/apache/tika/fork/UpperCasingContentHandler.java 
b/tika-core/src/test/java/org/apache/tika/fork/UpperCasingContentHandler.java index 3ca513f1a4..e02a9e9943 100644 --- a/tika-core/src/test/java/org/apache/tika/fork/UpperCasingContentHandler.java +++ b/tika-core/src/test/java/org/apache/tika/fork/UpperCasingContentHandler.java @@ -17,7 +17,6 @@ package org.apache.tika.fork; import java.util.Locale; - import org.xml.sax.SAXException; import org.xml.sax.helpers.DefaultHandler; @@ -34,5 +33,4 @@ public void characters(char[] ch, int start, int length) throws SAXException { public String toString() { return sb.toString(); } - } diff --git a/tika-core/src/test/java/org/apache/tika/fork/unusedpackage/ClassInUnusedPackage.java b/tika-core/src/test/java/org/apache/tika/fork/unusedpackage/ClassInUnusedPackage.java index 1de4c45496..996bfa2b67 100644 --- a/tika-core/src/test/java/org/apache/tika/fork/unusedpackage/ClassInUnusedPackage.java +++ b/tika-core/src/test/java/org/apache/tika/fork/unusedpackage/ClassInUnusedPackage.java @@ -16,5 +16,4 @@ */ package org.apache.tika.fork.unusedpackage; -public class ClassInUnusedPackage { -} +public class ClassInUnusedPackage {} diff --git a/tika-core/src/test/java/org/apache/tika/io/EndianUtilsTest.java b/tika-core/src/test/java/org/apache/tika/io/EndianUtilsTest.java index 906870e730..e607c08b1f 100644 --- a/tika-core/src/test/java/org/apache/tika/io/EndianUtilsTest.java +++ b/tika-core/src/test/java/org/apache/tika/io/EndianUtilsTest.java @@ -21,7 +21,6 @@ import static org.junit.jupiter.api.Assertions.fail; import java.io.ByteArrayInputStream; - import org.junit.jupiter.api.Test; public class EndianUtilsTest { @@ -29,65 +28,65 @@ public class EndianUtilsTest { public void testReadUE7() throws Exception { byte[] data; - data = new byte[]{0x08}; + data = new byte[] {0x08}; assertEquals(8, EndianUtils.readUE7(new ByteArrayInputStream(data))); - data = new byte[]{(byte) 0x84, 0x1e}; + data = new byte[] {(byte) 0x84, 0x1e}; assertEquals(542, EndianUtils.readUE7(new 
ByteArrayInputStream(data))); - data = new byte[]{(byte) 0xac, (byte) 0xbe, 0x17}; + data = new byte[] {(byte) 0xac, (byte) 0xbe, 0x17}; assertEquals(728855, EndianUtils.readUE7(new ByteArrayInputStream(data))); } @Test public void testReadUIntLE() throws Exception { - byte[] data = new byte[]{(byte) 0x08, (byte) 0x00, (byte) 0x00, (byte) 0x00}; + byte[] data = new byte[] {(byte) 0x08, (byte) 0x00, (byte) 0x00, (byte) 0x00}; assertEquals(8, EndianUtils.readUIntLE(new ByteArrayInputStream(data))); - data = new byte[]{(byte) 0xF0, (byte) 0xFF, (byte) 0xFF, (byte) 0xFF}; + data = new byte[] {(byte) 0xF0, (byte) 0xFF, (byte) 0xFF, (byte) 0xFF}; assertEquals(4294967280L, EndianUtils.readUIntLE(new ByteArrayInputStream(data))); - data = new byte[]{(byte) 0xFF, (byte) 0xFF, (byte) 0xFF}; + data = new byte[] {(byte) 0xFF, (byte) 0xFF, (byte) 0xFF}; try { EndianUtils.readUIntLE(new ByteArrayInputStream(data)); fail("Should have thrown exception"); } catch (EndianUtils.BufferUnderrunException e) { - //swallow + // swallow } } @Test public void testReadUIntBE() throws Exception { - byte[] data = new byte[]{(byte) 0x00, (byte) 0x00, (byte) 0x00, (byte) 0x08}; + byte[] data = new byte[] {(byte) 0x00, (byte) 0x00, (byte) 0x00, (byte) 0x08}; assertEquals(8, EndianUtils.readUIntBE(new ByteArrayInputStream(data))); - data = new byte[]{(byte) 0xFF, (byte) 0xFF, (byte) 0xFF, (byte) 0xF0}; + data = new byte[] {(byte) 0xFF, (byte) 0xFF, (byte) 0xFF, (byte) 0xF0}; assertEquals(4294967280L, EndianUtils.readUIntBE(new ByteArrayInputStream(data))); - data = new byte[]{(byte) 0xFF, (byte) 0xFF, (byte) 0xFF}; + data = new byte[] {(byte) 0xFF, (byte) 0xFF, (byte) 0xFF}; try { EndianUtils.readUIntLE(new ByteArrayInputStream(data)); fail("Should have thrown exception"); } catch (EndianUtils.BufferUnderrunException e) { - //swallow + // swallow } } @Test public void testReadIntME() throws Exception { - // Example from https://yamm.finance/wiki/Endianness.html#mwAiw - byte[] data = new 
byte[]{(byte) 0x0b, (byte) 0x0a, (byte) 0x0d, (byte) 0x0c}; + // Example from https://yamm.finance/wiki/Endianness.html#mwAiw + byte[] data = new byte[] {(byte) 0x0b, (byte) 0x0a, (byte) 0x0d, (byte) 0x0c}; assertEquals(0x0a0b0c0d, EndianUtils.readIntME(new ByteArrayInputStream(data))); - data = new byte[]{(byte) 0xFE, (byte) 0xFF, (byte) 0xFC, (byte) 0xFD}; + data = new byte[] {(byte) 0xFE, (byte) 0xFF, (byte) 0xFC, (byte) 0xFD}; assertEquals(0xfffefdfc, EndianUtils.readIntME(new ByteArrayInputStream(data))); - data = new byte[]{(byte) 0xFF, (byte) 0xFF, (byte) 0xFF}; + data = new byte[] {(byte) 0xFF, (byte) 0xFF, (byte) 0xFF}; try { EndianUtils.readIntME(new ByteArrayInputStream(data)); fail("Should have thrown exception"); } catch (EndianUtils.BufferUnderrunException e) { - //swallow + // swallow } } } diff --git a/tika-core/src/test/java/org/apache/tika/io/FilenameUtilsTest.java b/tika-core/src/test/java/org/apache/tika/io/FilenameUtilsTest.java index 0cc869aa1f..64a65bb408 100644 --- a/tika-core/src/test/java/org/apache/tika/io/FilenameUtilsTest.java +++ b/tika-core/src/test/java/org/apache/tika/io/FilenameUtilsTest.java @@ -57,11 +57,12 @@ public void normalizeWithNull() throws Exception { @Test public void normalizeWithReservedChar() throws Exception { final String[] TEST_NAMES = {"test?.txt", "?test.txt", "test.txt?", "?test?txt?"}; - final String[] EXPECTED_NAMES = - {"test%3F.txt", "%3Ftest.txt", "test.txt%3F", "%3Ftest%3Ftxt%3F"}; + final String[] EXPECTED_NAMES = { + "test%3F.txt", "%3Ftest.txt", "test.txt%3F", "%3Ftest%3Ftxt%3F" + }; for (int i = 0; i < TEST_NAMES.length; ++i) { - //System.out.println("checking " + TEST_NAMES[i]); + // System.out.println("checking " + TEST_NAMES[i]); assertEquals(EXPECTED_NAMES[i], FilenameUtils.normalize(TEST_NAMES[i])); } } @@ -76,11 +77,16 @@ public void normalizeWithReservedChars() throws Exception { @Test public void normalizeWithNotPrintableChars() throws Exception { - final String TEST_NAME = new String( - new 
char[]{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, '.', 16, 17, 18, - 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}); - final String EXPECTED_NAME = "%00%01%02%03%04%05%06%07%08%09%0A%0B%0C%0D%0E%0F" + "." + - "%10%11%12%13%14%15%16%17%18%19%1A%1B%1C%1D%1E%1F"; + final String TEST_NAME = + new String( + new char[] { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, '.', 16, 17, 18, + 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + }); + final String EXPECTED_NAME = + "%00%01%02%03%04%05%06%07%08%09%0A%0B%0C%0D%0E%0F" + + "." + + "%10%11%12%13%14%15%16%17%18%19%1A%1B%1C%1D%1E%1F"; assertEquals(EXPECTED_NAME, FilenameUtils.normalize(TEST_NAME)); } @@ -104,5 +110,4 @@ public void testGetName() throws Exception { private void testFilenameEquality(String expected, String path) { assertEquals(expected, FilenameUtils.getName(path)); } - } diff --git a/tika-core/src/test/java/org/apache/tika/io/LookaheadInputStreamTest.java b/tika-core/src/test/java/org/apache/tika/io/LookaheadInputStreamTest.java index b6237b3600..9dfd2674ee 100644 --- a/tika-core/src/test/java/org/apache/tika/io/LookaheadInputStreamTest.java +++ b/tika-core/src/test/java/org/apache/tika/io/LookaheadInputStreamTest.java @@ -21,12 +21,9 @@ import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; - import org.junit.jupiter.api.Test; -/** - * Test cases for the {@link LookaheadInputStream} class. - */ +/** Test cases for the {@link LookaheadInputStream} class. 
*/ public class LookaheadInputStreamTest { @Test @@ -46,7 +43,7 @@ public void testEmptyStream() throws IOException { @Test public void testBasicLookahead() throws IOException { - InputStream stream = new ByteArrayInputStream(new byte[]{'a', 'b', 'c'}); + InputStream stream = new ByteArrayInputStream(new byte[] {'a', 'b', 'c'}); InputStream lookahead = new LookaheadInputStream(stream, 2); assertEquals('a', lookahead.read()); assertEquals('b', lookahead.read()); @@ -60,7 +57,7 @@ public void testBasicLookahead() throws IOException { @Test public void testZeroLookahead() throws IOException { - InputStream stream = new ByteArrayInputStream(new byte[]{'a', 'b', 'c'}); + InputStream stream = new ByteArrayInputStream(new byte[] {'a', 'b', 'c'}); InputStream lookahead = new LookaheadInputStream(stream, 0); assertEquals(-1, lookahead.read()); lookahead.close(); @@ -72,7 +69,7 @@ public void testZeroLookahead() throws IOException { @Test public void testMarkLookahead() throws IOException { - InputStream stream = new ByteArrayInputStream(new byte[]{'a', 'b', 'c'}); + InputStream stream = new ByteArrayInputStream(new byte[] {'a', 'b', 'c'}); InputStream lookahead = new LookaheadInputStream(stream, 2); lookahead.mark(1); assertEquals('a', lookahead.read()); @@ -93,7 +90,7 @@ public void testMarkLookahead() throws IOException { @Test public void testSkipLookahead() throws IOException { - InputStream stream = new ByteArrayInputStream(new byte[]{'a', 'b', 'c'}); + InputStream stream = new ByteArrayInputStream(new byte[] {'a', 'b', 'c'}); InputStream lookahead = new LookaheadInputStream(stream, 2); assertEquals(1, lookahead.skip(1)); assertEquals('b', lookahead.read()); @@ -105,5 +102,4 @@ public void testSkipLookahead() throws IOException { assertEquals('c', stream.read()); assertEquals(-1, stream.read()); } - } diff --git a/tika-core/src/test/java/org/apache/tika/io/TailStreamTest.java b/tika-core/src/test/java/org/apache/tika/io/TailStreamTest.java index cfe0c15f05..62a5e0283f 
100644 --- a/tika-core/src/test/java/org/apache/tika/io/TailStreamTest.java +++ b/tika-core/src/test/java/org/apache/tika/io/TailStreamTest.java @@ -24,26 +24,23 @@ import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStream; - import org.junit.jupiter.api.Test; -/** - * Test class for {@code TailStream}. - */ +/** Test class for {@code TailStream}. */ public class TailStreamTest { - /** - * Constant for generating test text. - */ - private static final String TEXT = "Lorem ipsum dolor sit amet, consetetur " + - "sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut " + - "labore et dolore magna aliquyam erat, sed diam voluptua. At vero" + - " eos et accusam et justo duo dolores et ea rebum. Stet clita " + - "kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor " + "sit amet."; + /** Constant for generating test text. */ + private static final String TEXT = + "Lorem ipsum dolor sit amet, consetetur " + + "sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut " + + "labore et dolore magna aliquyam erat, sed diam voluptua. At vero" + + " eos et accusam et justo duo dolores et ea rebum. Stet clita " + + "kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor " + + "sit amet."; /** * Generates a test text using the specified parameters. * - * @param from the start index of the text + * @param from the start index of the text * @param length the length of the text * @return the generated test text */ @@ -59,7 +56,7 @@ private static String generateText(int from, int length) { /** * Generates a stream which contains a test text. * - * @param from the start index of the text + * @param from the start index of the text * @param length the length of the generated stream * @return the stream with the test text */ @@ -83,9 +80,7 @@ private static byte[] readStream(InputStream in) throws IOException { return bos.toByteArray(); } - /** - * Tests whether the tail buffer can be obtained before data was read. 
- */ + /** Tests whether the tail buffer can be obtained before data was read. */ @Test public void testTailBeforeRead() throws IOException { TailStream stream = new TailStream(generateStream(0, 100), 50); @@ -93,9 +88,7 @@ public void testTailBeforeRead() throws IOException { stream.close(); } - /** - * Tests the content of the tail buffer if it is only partly filled. - */ + /** Tests the content of the tail buffer if it is only partly filled. */ @Test public void testTailBufferPartlyRead() throws IOException { final int count = 64; @@ -105,21 +98,17 @@ public void testTailBufferPartlyRead() throws IOException { stream.close(); } - /** - * Tests the content of the tail buffer if only single bytes were read. - */ + /** Tests the content of the tail buffer if only single bytes were read. */ @Test public void testTailSingleByteReads() throws IOException { final int count = 128; TailStream stream = new TailStream(generateStream(0, 2 * count), count); readStream(stream); - assertEquals(generateText(count, count), new String(stream.getTail(), UTF_8), - "Wrong buffer"); + assertEquals( + generateText(count, count), new String(stream.getTail(), UTF_8), "Wrong buffer"); } - /** - * Tests the content of the tail buffer if larger chunks are read. - */ + /** Tests the content of the tail buffer if larger chunks are read. */ @Test public void testTailChunkReads() throws IOException { final int count = 16384; @@ -132,14 +121,14 @@ public void testTailChunkReads() throws IOException { while (read != -1) { read = stream.read(buf); } - assertEquals(generateText(count - tailSize, tailSize), - new String(stream.getTail(), UTF_8), "Wrong buffer"); + assertEquals( + generateText(count - tailSize, tailSize), + new String(stream.getTail(), UTF_8), + "Wrong buffer"); stream.close(); } - /** - * Tests whether mark() and reset() work as expected. - */ + /** Tests whether mark() and reset() work as expected. 
*/ @Test public void testReadWithMarkAndReset() throws IOException { final int tailSize = 64; @@ -150,13 +139,13 @@ public void testReadWithMarkAndReset() throws IOException { stream.read(buf); stream.reset(); readStream(stream); - assertEquals(generateText(tailSize, tailSize), - new String(stream.getTail(), UTF_8), "Wrong buffer"); + assertEquals( + generateText(tailSize, tailSize), + new String(stream.getTail(), UTF_8), + "Wrong buffer"); } - /** - * Tests whether a reset() operation without a mark is simply ignored. - */ + /** Tests whether a reset() operation without a mark is simply ignored. */ @Test public void testResetWithoutMark() throws IOException { final int tailSize = 75; @@ -165,14 +154,14 @@ public void testResetWithoutMark() throws IOException { stream.reset(); byte[] buf = new byte[count]; stream.read(buf); - assertEquals(generateText(count - tailSize, tailSize), - new String(stream.getTail(), UTF_8), "Wrong buffer"); + assertEquals( + generateText(count - tailSize, tailSize), + new String(stream.getTail(), UTF_8), + "Wrong buffer"); stream.close(); } - /** - * Tests whether skip() also fills the tail buffer. - */ + /** Tests whether skip() also fills the tail buffer. */ @Test public void testSkip() throws IOException { final int tailSize = 128; @@ -180,27 +169,24 @@ public void testSkip() throws IOException { final int skipCount = 512; TailStream stream = new TailStream(generateStream(0, count), tailSize); assertEquals(skipCount, stream.skip(skipCount), "Wrong skip result"); - assertEquals(generateText(skipCount - tailSize, tailSize), - new String(stream.getTail(), UTF_8), "Wrong buffer"); + assertEquals( + generateText(skipCount - tailSize, tailSize), + new String(stream.getTail(), UTF_8), + "Wrong buffer"); stream.close(); } - /** - * Tests a skip operation at the end of the stream. - */ + /** Tests a skip operation at the end of the stream. 
*/ @Test public void testSkipEOS() throws IOException { final int count = 128; TailStream stream = new TailStream(generateStream(0, count), 2 * count); assertEquals(count, stream.skip(2 * count), "Wrong skip result"); - assertEquals(generateText(0, count), new String(stream.getTail(), UTF_8), - "Wrong buffer"); + assertEquals(generateText(0, count), new String(stream.getTail(), UTF_8), "Wrong buffer"); stream.close(); } - /** - * Tests skip() if read reaches the end of the stream and returns -1. - */ + /** Tests skip() if read reaches the end of the stream and returns -1. */ @Test public void testSkipReadEnd() throws IOException { final int count = 128; diff --git a/tika-core/src/test/java/org/apache/tika/io/TemporaryResourcesTest.java b/tika-core/src/test/java/org/apache/tika/io/TemporaryResourcesTest.java index fffb3f3778..06784f553d 100644 --- a/tika-core/src/test/java/org/apache/tika/io/TemporaryResourcesTest.java +++ b/tika-core/src/test/java/org/apache/tika/io/TemporaryResourcesTest.java @@ -21,7 +21,6 @@ import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; - import org.junit.jupiter.api.Test; public class TemporaryResourcesTest { @@ -31,10 +30,11 @@ public void testFileDeletion() throws IOException { Path tempFile; try (TemporaryResources tempResources = new TemporaryResources()) { tempFile = tempResources.createTempFile(); - assertTrue(Files.exists(tempFile), "Temp file should exist while TempResources is used"); + assertTrue( + Files.exists(tempFile), "Temp file should exist while TempResources is used"); } - assertTrue(Files.notExists(tempFile), + assertTrue( + Files.notExists(tempFile), "Temp file should not exist after TempResources is closed"); } - } diff --git a/tika-core/src/test/java/org/apache/tika/io/TikaInputStreamTest.java b/tika-core/src/test/java/org/apache/tika/io/TikaInputStreamTest.java index 1f8943e688..51e93a4454 100644 --- a/tika-core/src/test/java/org/apache/tika/io/TikaInputStreamTest.java +++ 
b/tika-core/src/test/java/org/apache/tika/io/TikaInputStreamTest.java @@ -29,18 +29,15 @@ import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; - import org.apache.commons.io.IOUtils; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.io.TempDir; - import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; public class TikaInputStreamTest { - @TempDir - Path tempDir; + @TempDir Path tempDir; @Test public void testFileBased() throws IOException { @@ -50,19 +47,23 @@ public void testFileBased() throws IOException { assertNull(stream.getOpenContainer()); assertNull(stream.getInputStreamFactory()); - assertEquals(path, TikaInputStream.get(stream).getPath(), - "The file returned by the getFile() method should" + - " be the file used to instantiate a TikaInputStream"); + assertEquals( + path, + TikaInputStream.get(stream).getPath(), + "The file returned by the getFile() method should" + + " be the file used to instantiate a TikaInputStream"); - assertEquals("Hello, World!", readStream(stream), - "The contents of the TikaInputStream should equal the" + - " contents of the underlying file"); + assertEquals( + "Hello, World!", + readStream(stream), + "The contents of the TikaInputStream should equal the" + + " contents of the underlying file"); stream.close(); - assertTrue(Files.exists(path), - "The close() method must not remove the file used to" + - " instantiate a TikaInputStream"); - + assertTrue( + Files.exists(path), + "The close() method must not remove the file used to" + + " instantiate a TikaInputStream"); } @Test @@ -79,29 +80,37 @@ public void testStreamBased() throws IOException { assertNull(stream.getOpenContainer()); assertNull(stream.getInputStreamFactory()); - assertEquals("Hello, World!", readFile(file), - "The contents of the file returned by the getFile method" + - " should equal the contents of the 
TikaInputStream"); + assertEquals( + "Hello, World!", + readFile(file), + "The contents of the file returned by the getFile method" + + " should equal the contents of the TikaInputStream"); - assertEquals("Hello, World!", readStream(stream), - "The contents of the TikaInputStream should not get modified" + - " by reading the file first"); + assertEquals( + "Hello, World!", + readStream(stream), + "The contents of the TikaInputStream should not get modified" + + " by reading the file first"); stream.close(); - assertFalse(Files.exists(file), + assertFalse( + Files.exists(file), "The close() method must remove the temporary file created by a TikaInputStream"); } @Test public void testInputStreamFactoryBased() throws IOException { - TikaInputStream stream = TikaInputStream.get(() -> IOUtils.toInputStream("Hello, World!", UTF_8)); + TikaInputStream stream = + TikaInputStream.get(() -> IOUtils.toInputStream("Hello, World!", UTF_8)); assertFalse(stream.hasFile()); assertNull(stream.getOpenContainer()); assertNotNull(stream.getInputStreamFactory()); - assertEquals("Hello, World!", readStream(stream), - "The contents of the TikaInputStream should not get modified" + - " by reading the file first"); + assertEquals( + "Hello, World!", + readStream(stream), + "The contents of the TikaInputStream should not get modified" + + " by reading the file first"); stream.close(); } @@ -125,8 +134,8 @@ public void testGetMetadata() throws Exception { Metadata metadata = new Metadata(); TikaInputStream.get(url, metadata).close(); assertEquals("test.txt", metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY)); - assertEquals(Long.toString(Files.size(Paths.get(url.toURI()))), + assertEquals( + Long.toString(Files.size(Paths.get(url.toURI()))), metadata.get(Metadata.CONTENT_LENGTH)); } - } diff --git a/tika-core/src/test/java/org/apache/tika/language/detect/LanguageNamesTest.java b/tika-core/src/test/java/org/apache/tika/language/detect/LanguageNamesTest.java index a035574281..c7a9ccf633 
100644 --- a/tika-core/src/test/java/org/apache/tika/language/detect/LanguageNamesTest.java +++ b/tika-core/src/test/java/org/apache/tika/language/detect/LanguageNamesTest.java @@ -34,5 +34,4 @@ public void test() { // TODO verify that "en-GB" == "en"??? } - } diff --git a/tika-core/src/test/java/org/apache/tika/metadata/TestMetadata.java b/tika-core/src/test/java/org/apache/tika/metadata/TestMetadata.java index 2ce1b8b6c6..3ecf3382c0 100644 --- a/tika-core/src/test/java/org/apache/tika/metadata/TestMetadata.java +++ b/tika-core/src/test/java/org/apache/tika/metadata/TestMetadata.java @@ -35,24 +35,18 @@ import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.Future; - -import org.junit.jupiter.api.Test; - import org.apache.tika.TikaTest; import org.apache.tika.utils.DateUtils; +import org.junit.jupiter.api.Test; -//Junit imports +// Junit imports -/** - * JUnit based tests of class {@link org.apache.tika.metadata.Metadata}. - */ +/** JUnit based tests of class {@link org.apache.tika.metadata.Metadata}. */ public class TestMetadata extends TikaTest { private static final String CONTENTTYPE = "contenttype"; - /** - * Test for the add(String, String) method. - */ + /** Test for the add(String, String) method. */ @Test public void testAdd() { String[] values = null; @@ -87,13 +81,11 @@ public void testAdd() { meta.add(nonMultiValued, "value2"); fail("add should fail on the second call of a non-multi valued item"); } catch (PropertyTypeException e) { - //swallow + // swallow } } - /** - * Test for the set(String, String) method. - */ + /** Test for the set(String, String) method. */ @Test public void testSet() { String[] values = null; @@ -120,9 +112,7 @@ public void testSet() { assertEquals("new value 2", values[1]); } - /** - * Test for setAll(Properties) method. - */ + /** Test for setAll(Properties) method. 
*/ @Test public void testSetProperties() { String[] values = null; @@ -150,9 +140,7 @@ public void testSetProperties() { assertEquals("value2.1", values[0]); } - /** - * Test for get(String) method. - */ + /** Test for get(String) method. */ @Test public void testGet() { Metadata meta = new Metadata(); @@ -163,9 +151,7 @@ public void testGet() { assertEquals("value-1", meta.get("a-name")); } - /** - * Test for isMultiValued() method. - */ + /** Test for isMultiValued() method. */ @Test public void testIsMultiValued() { Metadata meta = new Metadata(); @@ -176,9 +162,7 @@ public void testIsMultiValued() { assertTrue(meta.isMultiValued("key")); } - /** - * Test for names method. - */ + /** Test for names method. */ @Test public void testNames() { String[] names = null; @@ -195,9 +179,7 @@ public void testNames() { assertEquals(2, names.length); } - /** - * Test for remove(String) method. - */ + /** Test for remove(String) method. */ @Test public void testRemove() { Metadata meta = new Metadata(); @@ -219,9 +201,7 @@ public void testRemove() { assertNull(meta.get("name-two")); } - /** - * Test for equals(Object) method. - */ + /** Test for equals(Object) method. 
*/ @Test public void testObject() { Metadata meta1 = new Metadata(); @@ -247,10 +227,7 @@ public void testObject() { assertFalse(meta1.equals(meta2)); } - /** - * Tests for getting and setting integer - * based properties - */ + /** Tests for getting and setting integer based properties */ @Test public void testGetSetInt() { Metadata meta = new Metadata(); @@ -264,13 +241,13 @@ public void testGetSetInt() { meta.set(Metadata.BITS_PER_SAMPLE, 1); fail("Shouldn't be able to set a multi valued property as an int"); } catch (PropertyTypeException e) { - //swallow + // swallow } try { meta.set(TikaCoreProperties.CREATED, 1); fail("Shouldn't be able to set a date property as an int"); } catch (PropertyTypeException e) { - //swallow + // swallow } // Can set it and retrieve it @@ -290,10 +267,7 @@ public void testGetSetInt() { assertEquals(null, meta.getInt(TikaCoreProperties.CREATED)); } - /** - * Tests for getting and setting date - * based properties - */ + /** Tests for getting and setting date based properties */ @Test public void testGetSetDate() { Metadata meta = new Metadata(); @@ -308,13 +282,13 @@ public void testGetSetDate() { meta.set(Metadata.BITS_PER_SAMPLE, new Date(1000)); fail("Shouldn't be able to set a multi valued property as a date"); } catch (PropertyTypeException e) { - //swallow + // swallow } try { meta.set(Metadata.IMAGE_WIDTH, new Date(1000)); fail("Shouldn't be able to set an int property as an date"); } catch (PropertyTypeException e) { - //swallow + // swallow } // Can set it and retrieve it @@ -334,7 +308,7 @@ public void testGetSetDate() { assertEquals(null, meta.getInt(TikaCoreProperties.CREATED)); // Our format doesn't include milliseconds - // This means things get rounded + // This means things get rounded meta.set(TikaCoreProperties.CREATED, new Date(1050)); assertEquals("1970-01-01T00:00:01Z", meta.get(TikaCoreProperties.CREATED)); assertEquals(1000, meta.getDate(TikaCoreProperties.CREATED).getTime()); @@ -367,8 +341,8 @@ public void 
testGetSetDate() { } /** - * Some documents, like jpegs, might have date in unspecified time zone - * which should be handled like strings but verified to have parseable ISO 8601 format + * Some documents, like jpegs, might have date in unspecified time zone which should be handled + * like strings but verified to have parseable ISO 8601 format */ @Test public void testGetSetDateUnspecifiedTimezone() { @@ -376,26 +350,34 @@ public void testGetSetDateUnspecifiedTimezone() { // Set explictly without a timezone meta.set(TikaCoreProperties.CREATED, "1970-01-01T00:00:01"); - assertEquals("1970-01-01T00:00:01", meta.get(TikaCoreProperties.CREATED), + assertEquals( + "1970-01-01T00:00:01", + meta.get(TikaCoreProperties.CREATED), "should return string without time zone specifier because zone is not known"); // Now ask DateUtils to format for us without one meta.set(TikaCoreProperties.CREATED, DateUtils.formatDateUnknownTimezone(new Date(1000))); - assertEquals("1970-01-01T00:00:01", meta.get(TikaCoreProperties.CREATED), + assertEquals( + "1970-01-01T00:00:01", + meta.get(TikaCoreProperties.CREATED), "should return string without time zone specifier because zone is not known"); } /** - * Defines a composite property, then checks that when set as the - * composite the value can be retrieved with the property or the aliases + * Defines a composite property, then checks that when set as the composite the value can be + * retrieved with the property or the aliases */ @SuppressWarnings("deprecation") @Test public void testCompositeProperty() { Metadata meta = new Metadata(); - Property compositeProperty = Property.composite(DublinCore.DESCRIPTION, - new Property[]{TikaCoreProperties.DESCRIPTION, - Property.internalText("testDescriptionAlt")}); + Property compositeProperty = + Property.composite( + DublinCore.DESCRIPTION, + new Property[] { + TikaCoreProperties.DESCRIPTION, + Property.internalText("testDescriptionAlt") + }); String message = "composite description"; 
meta.set(compositeProperty, message); @@ -424,7 +406,6 @@ public void testMultithreadedDates() throws Exception { finished++; } } - } @Test @@ -506,9 +487,8 @@ public Integer call() throws Exception { df.setTimeZone(TimeZone.getTimeZone("UTC")); m.set(TikaCoreProperties.CREATED, df.format(now)); assertTrue( - Math.abs(now.getTime() - m.getDate(TikaCoreProperties.CREATED).getTime()) < - 2000); - + Math.abs(now.getTime() - m.getDate(TikaCoreProperties.CREATED).getTime()) + < 2000); } return 1; } diff --git a/tika-core/src/test/java/org/apache/tika/metadata/filter/MockUpperCaseFilter.java b/tika-core/src/test/java/org/apache/tika/metadata/filter/MockUpperCaseFilter.java index ac64734c20..d55bd722a8 100644 --- a/tika-core/src/test/java/org/apache/tika/metadata/filter/MockUpperCaseFilter.java +++ b/tika-core/src/test/java/org/apache/tika/metadata/filter/MockUpperCaseFilter.java @@ -17,13 +17,10 @@ package org.apache.tika.metadata.filter; import java.util.Locale; - import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; -/** - * Mock Filter for testing uppercasing of all values - */ +/** Mock Filter for testing uppercasing of all values */ public class MockUpperCaseFilter extends MetadataFilter { @Override diff --git a/tika-core/src/test/java/org/apache/tika/metadata/filter/TestMetadataFilter.java b/tika-core/src/test/java/org/apache/tika/metadata/filter/TestMetadataFilter.java index 0b071d0be6..9aec84c541 100644 --- a/tika-core/src/test/java/org/apache/tika/metadata/filter/TestMetadataFilter.java +++ b/tika-core/src/test/java/org/apache/tika/metadata/filter/TestMetadataFilter.java @@ -24,14 +24,12 @@ import java.util.Arrays; import java.util.HashSet; import java.util.Set; - -import org.junit.jupiter.api.Test; - import org.apache.tika.config.AbstractTikaConfigTest; import org.apache.tika.config.TikaConfig; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.mime.MediaType; 
+import org.junit.jupiter.api.Test; public class TestMetadataFilter extends AbstractTikaConfigTest { @@ -111,7 +109,7 @@ public void testConfigExcludeFilter() throws Exception { @Test public void testConfigIncludeAndUCFilter() throws Exception { TikaConfig config = getConfig("TIKA-3137-include-uc.xml"); - String[] expectedTitles = new String[]{"TITLE1", "TITLE2", "TITLE3"}; + String[] expectedTitles = new String[] {"TITLE1", "TITLE2", "TITLE3"}; Metadata metadata = new Metadata(); metadata.add("title", "title1"); metadata.add("title", "title2"); @@ -141,7 +139,6 @@ public void testMimeClearingFilter() throws Exception { filter.filter(metadata); assertEquals(2, metadata.size()); assertEquals("author", metadata.get("author")); - } @Test @@ -182,8 +179,8 @@ public void testFieldNameMapping() throws Exception { @Test public void testDateNormalizingFilter() throws Exception { - //test that a Date lacking a timezone, if interpreted as Los Angeles, for example, - //yields a UTC string that is properly +7 hours. + // test that a Date lacking a timezone, if interpreted as Los Angeles, for example, + // yields a UTC string that is properly +7 hours. 
Metadata m = new Metadata(); m.set(TikaCoreProperties.CREATED, "2021-07-23T01:02:24"); DateNormalizingMetadataFilter filter = new DateNormalizingMetadataFilter(); @@ -243,5 +240,4 @@ public void testCaptureGroupOverwrite() throws Exception { assertEquals(1, metadata.getValues(Metadata.CONTENT_TYPE).length); assertEquals("text/html", metadata.get(Metadata.CONTENT_TYPE)); } - } diff --git a/tika-core/src/test/java/org/apache/tika/metadata/writefilter/StandardWriteFilterTest.java b/tika-core/src/test/java/org/apache/tika/metadata/writefilter/StandardWriteFilterTest.java index 7b7e8710db..f5f64ab9d1 100644 --- a/tika-core/src/test/java/org/apache/tika/metadata/writefilter/StandardWriteFilterTest.java +++ b/tika-core/src/test/java/org/apache/tika/metadata/writefilter/StandardWriteFilterTest.java @@ -23,9 +23,6 @@ import java.nio.charset.StandardCharsets; import java.util.List; import java.util.Set; - -import org.junit.jupiter.api.Test; - import org.apache.tika.TikaTest; import org.apache.tika.config.TikaConfig; import org.apache.tika.config.TikaConfigTest; @@ -38,10 +35,10 @@ import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.AutoDetectParserConfig; import org.apache.tika.parser.ParseContext; +import org.junit.jupiter.api.Test; public class StandardWriteFilterTest extends TikaTest { - @Test public void testMetadataFactoryConfig() throws Exception { TikaConfig tikaConfig = @@ -50,8 +47,7 @@ public void testMetadataFactoryConfig() throws Exception { MetadataWriteFilterFactory factory = config.getMetadataWriteFilterFactory(); assertEquals(350, ((StandardWriteFilterFactory) factory).getMaxTotalEstimatedBytes()); AutoDetectParser parser = new AutoDetectParser(tikaConfig); - String mock = "" + - ""; + String mock = "" + ""; for (int i = 0; i < 20; i++) { mock += "01234567890123456789"; } @@ -59,8 +55,12 @@ public void testMetadataFactoryConfig() throws Exception { mock += ""; Metadata metadata = new Metadata(); List metadataList = - 
getRecursiveMetadata(new ByteArrayInputStream(mock.getBytes(StandardCharsets.UTF_8)), - parser, metadata, new ParseContext(), true); + getRecursiveMetadata( + new ByteArrayInputStream(mock.getBytes(StandardCharsets.UTF_8)), + parser, + metadata, + new ParseContext(), + true); assertEquals(1, metadataList.size()); metadata = metadataList.get(0); @@ -81,8 +81,7 @@ public void testMetadataFactoryFieldsConfig() throws Exception { assertEquals(999, ((StandardWriteFilterFactory) factory).getMaxKeySize()); assertEquals(10001, ((StandardWriteFilterFactory) factory).getMaxFieldSize()); AutoDetectParser parser = new AutoDetectParser(tikaConfig); - String mock = "" + - ""; + String mock = "" + ""; mock += "this is not a title"; mock += "this is a title"; for (int i = 0; i < 20; i++) { @@ -94,19 +93,23 @@ public void testMetadataFactoryFieldsConfig() throws Exception { metadata.add("dc:creator", "abcdefghijabcdefghij"); metadata.add("not-allowed", "not-allowed"); List metadataList = - getRecursiveMetadata(new ByteArrayInputStream(mock.getBytes(StandardCharsets.UTF_8)), - parser, metadata, new ParseContext(), true); + getRecursiveMetadata( + new ByteArrayInputStream(mock.getBytes(StandardCharsets.UTF_8)), + parser, + metadata, + new ParseContext(), + true); assertEquals(1, metadataList.size()); metadata = metadataList.get(0); - //test that this was removed during the filter existing stage + // test that this was removed during the filter existing stage assertNull(metadata.get("not-allowed")); - //test that this was not allowed because it isn't in the "include" list + // test that this was not allowed because it isn't in the "include" list assertNull(metadata.get("dc:subject")); String[] creators = metadata.getValues("dc:creator"); assertEquals("abcdefghijabcdefghij", creators[0]); - //this gets more than the other test because this is filtering out some fields + // this gets more than the other test because this is filtering out some fields assertEquals(3, creators.length); 
assertEquals("012345678901234", creators[2]); assertContainsCount(" hello ", metadata.get(TikaCoreProperties.TIKA_CONTENT), 30); @@ -115,16 +118,15 @@ public void testMetadataFactoryFieldsConfig() throws Exception { @Test public void testKeySizeFilter() throws Exception { - Metadata metadata = filter(10, 1000, 10000, 100, - null, true); - //test that must add keys are not truncated + Metadata metadata = filter(10, 1000, 10000, 100, null, true); + // test that must add keys are not truncated metadata.add(TikaCoreProperties.TIKA_PARSED_BY, "some-long-parser1"); metadata.add(TikaCoreProperties.TIKA_PARSED_BY, "some-long-parser2"); metadata.add(TikaCoreProperties.TIKA_PARSED_BY, "some-long-parser3"); assertEquals(3, metadata.getValues(TikaCoreProperties.TIKA_PARSED_BY).length); metadata.add(OfficeOpenXMLExtended.DOC_SECURITY_STRING, "some doc-security-string"); - //truncated to 10 bytes in UTF-16 = 5 characters + // truncated to 10 bytes in UTF-16 = 5 characters assertEquals("some doc-security-string", metadata.getValues("exten")[0]); assertTruncated(metadata); @@ -135,16 +137,14 @@ public void testKeySizeFilter() throws Exception { @Test public void testAfterMaxHit() throws Exception { - String k = "dc:creator";//20 bytes - //key is > maxTotalBytes, so the value isn't even added - Metadata metadata = filter(100, 10000, 10, - 100, null, false); + String k = "dc:creator"; // 20 bytes + // key is > maxTotalBytes, so the value isn't even added + Metadata metadata = filter(100, 10000, 10, 100, null, false); metadata.set(k, "ab"); assertEquals(1, metadata.names().length); assertEquals("true", metadata.get(TikaCoreProperties.TRUNCATED_METADATA)); - metadata = filter(100, 10000, 50, 100, - null, false); + metadata = filter(100, 10000, 50, 100, null, false); for (int i = 0; i < 10; i++) { metadata.set(k, "abcde"); } @@ -153,10 +153,10 @@ public void testAfterMaxHit() throws Exception { assertEquals("abcde", metadata.getValues(k)[0]); 
assertNull(metadata.get(TikaCoreProperties.TRUNCATED_METADATA)); - metadata.add(k, "abcde");//40 - metadata.add(k, "abc");//46 - metadata.add(k, "abcde");//only the first character is taken from this - metadata.add(k, "abcde");//this shouldn't even be countenanced + metadata.add(k, "abcde"); // 40 + metadata.add(k, "abc"); // 46 + metadata.add(k, "abcde"); // only the first character is taken from this + metadata.add(k, "abcde"); // this shouldn't even be countenanced assertEquals(2, metadata.names().length); assertEquals(4, metadata.getValues(k).length); @@ -166,8 +166,8 @@ public void testAfterMaxHit() throws Exception { assertEquals("a", metadata.getValues(k)[3]); assertEquals("true", metadata.get(TikaCoreProperties.TRUNCATED_METADATA)); - //this will force a reset of the total max bytes because - //this is a set, not an add. This should get truncated at 15 chars = 30 bytes + // this will force a reset of the total max bytes because + // this is a set, not an add. This should get truncated at 15 chars = 30 bytes metadata.set(k, "abcdefghijklmnopqrstuvwx"); assertEquals(2, metadata.names().length); assertEquals(1, metadata.getValues(k).length); @@ -177,14 +177,14 @@ public void testAfterMaxHit() throws Exception { @Test public void testMinSizeForAlwaysInclude() throws Exception { - //test that mimes don't get truncated + // test that mimes don't get truncated Metadata metadata = filter(100, 10, 10000, 100, null, true); String mime = getLongestMime().toString(); metadata.set(Metadata.CONTENT_TYPE, mime); assertEquals(mime, metadata.get(Metadata.CONTENT_TYPE)); - //test that other fields are truncated + // test that other fields are truncated metadata.set("dc:title", "abcdefghij"); assertEquals("abcde", metadata.get("dc:title")); assertTruncated(metadata); @@ -202,11 +202,22 @@ public void testMaxFieldValues() throws Exception { private void assertTruncated(Metadata metadata) { assertEquals("true", metadata.get(TikaCoreProperties.TRUNCATED_METADATA)); } - private 
Metadata filter(int maxKeySize, int maxFieldSize, int maxTotalBytes, - int maxValuesPerField, - Set includeFields, boolean includeEmpty) { - MetadataWriteFilter filter = new StandardWriteFilter(maxKeySize, maxFieldSize, - maxTotalBytes, maxValuesPerField, includeFields, includeEmpty); + + private Metadata filter( + int maxKeySize, + int maxFieldSize, + int maxTotalBytes, + int maxValuesPerField, + Set includeFields, + boolean includeEmpty) { + MetadataWriteFilter filter = + new StandardWriteFilter( + maxKeySize, + maxFieldSize, + maxTotalBytes, + maxValuesPerField, + includeFields, + includeEmpty); Metadata metadata = new Metadata(); metadata.setMetadataWriteFilter(filter); return metadata; @@ -226,5 +237,4 @@ public MediaType getLongestMime() throws Exception { } return longest; } - } diff --git a/tika-core/src/test/java/org/apache/tika/mime/CustomReaderTest.java b/tika-core/src/test/java/org/apache/tika/mime/CustomReaderTest.java index 6c57740873..8858031f22 100644 --- a/tika-core/src/test/java/org/apache/tika/mime/CustomReaderTest.java +++ b/tika-core/src/test/java/org/apache/tika/mime/CustomReaderTest.java @@ -24,12 +24,10 @@ import java.util.HashMap; import java.util.List; import java.util.Map; - import org.junit.jupiter.api.Test; import org.xml.sax.Attributes; import org.xml.sax.SAXException; - public class CustomReaderTest { @Test @@ -54,8 +52,8 @@ public void testCustomReader() throws Exception { MimeType another = mimeTypes.forName(key); assertEquals("kittens", reader.values.get(key)); assertEquals(1, reader.ignorePatterns.size()); - assertEquals(another.toString() + ">>*" + hello.getExtension(), - reader.ignorePatterns.get(0)); + assertEquals( + another.toString() + ">>*" + hello.getExtension(), reader.ignorePatterns.get(0)); assertTrue(another.isInterpreted(), "Server-side script type not detected"); } @@ -67,7 +65,6 @@ static class CustomMimeTypesReader extends MimeTypesReader { super(types); } - @Override public void startElement(String uri, String 
localName, String qName, Attributes attributes) throws SAXException { @@ -89,8 +86,13 @@ public void endElement(String uri, String localName, String qName) { } @Override - protected void handleGlobError(MimeType type, String pattern, MimeTypeException ex, - String qName, Attributes attributes) throws SAXException { + protected void handleGlobError( + MimeType type, + String pattern, + MimeTypeException ex, + String qName, + Attributes attributes) + throws SAXException { ignorePatterns.add(type.toString() + ">>" + pattern); } } diff --git a/tika-core/src/test/java/org/apache/tika/mime/MediaTypeTest.java b/tika-core/src/test/java/org/apache/tika/mime/MediaTypeTest.java index 64a2bebf82..5ac9b47c19 100644 --- a/tika-core/src/test/java/org/apache/tika/mime/MediaTypeTest.java +++ b/tika-core/src/test/java/org/apache/tika/mime/MediaTypeTest.java @@ -23,14 +23,14 @@ import java.util.HashMap; import java.util.Map; - import org.junit.jupiter.api.Test; public class MediaTypeTest { @Test public void testBasics() { - assertEquals("application/octet-stream", + assertEquals( + "application/octet-stream", new MediaType("application", "octet-stream").toString()); assertEquals("text/plain", new MediaType("text", "plain").toString()); @@ -39,11 +39,12 @@ public void testBasics() { assertEquals("text/plain", new MediaType("text", "plain", parameters).toString()); parameters.put("charset", "UTF-8"); - assertEquals("text/plain; charset=UTF-8", - new MediaType("text", "plain", parameters).toString()); + assertEquals( + "text/plain; charset=UTF-8", new MediaType("text", "plain", parameters).toString()); parameters.put("x-eol-style", "crlf"); - assertEquals("text/plain; charset=UTF-8; x-eol-style=crlf", + assertEquals( + "text/plain; charset=UTF-8; x-eol-style=crlf", new MediaType("text", "plain", parameters).toString()); } @@ -56,11 +57,12 @@ public void testLowerCase() { assertEquals("text/plain", new MediaType("text", "PLAIN", parameters).toString()); parameters.put("CHARSET", 
"UTF-8"); - assertEquals("text/plain; charset=UTF-8", - new MediaType("TEXT", "plain", parameters).toString()); + assertEquals( + "text/plain; charset=UTF-8", new MediaType("TEXT", "plain", parameters).toString()); parameters.put("X-Eol-Style", "crlf"); - assertEquals("text/plain; charset=UTF-8; x-eol-style=crlf", + assertEquals( + "text/plain; charset=UTF-8; x-eol-style=crlf", new MediaType("TeXt", "PlAiN", parameters).toString()); } @@ -73,11 +75,13 @@ public void testTrim() { assertEquals("text/plain", new MediaType("text\r\n", " \tplain", parameters).toString()); parameters.put(" charset", "UTF-8"); - assertEquals("text/plain; charset=UTF-8", + assertEquals( + "text/plain; charset=UTF-8", new MediaType("\n\ntext", "plain \r", parameters).toString()); parameters.put("\r\n\tx-eol-style \t", "crlf"); - assertEquals("text/plain; charset=UTF-8; x-eol-style=crlf", + assertEquals( + "text/plain; charset=UTF-8; x-eol-style=crlf", new MediaType(" text", "\tplain ", parameters).toString()); } @@ -87,8 +91,9 @@ public void testQuote() { parameters.put("a", " value with spaces "); parameters.put("b", "text/plain"); parameters.put("c", "()<>@,;:\\\"/[]?="); - assertEquals("text/plain; a=\" value with spaces \"; b=\"text\\/plain\"" + - "; c=\"\\(\\)\\<\\>\\@\\,\\;\\:\\\\\\\"\\/\\[\\]\\?\\=\"", + assertEquals( + "text/plain; a=\" value with spaces \"; b=\"text\\/plain\"" + + "; c=\"\\(\\)\\<\\>\\@\\,\\;\\:\\\\\\\"\\/\\[\\]\\?\\=\"", new MediaType("text", "plain", parameters).toString()); } @@ -121,9 +126,7 @@ public void testParseWithParams() { assertTrue(gotCharset && gotFoo && gotFoo2); } - /** - * Per http://tools.ietf.org/html/rfc2045#section-5.1, charset can be in quotes - */ + /** Per http://tools.ietf.org/html/rfc2045#section-5.1, charset can be in quotes */ @Test public void testParseWithParamsAndQuotedCharset() { // Typical case, with a quoted charset @@ -171,19 +174,20 @@ public void testParseNoParamsWithSemi() { assertEquals(0, 
type.getParameters().keySet().size()); } - /** - * TIKA-349 - */ + /** TIKA-349 */ @Test public void testOddParameters() { - assertEquals("text/html; charset=UTF-8", + assertEquals( + "text/html; charset=UTF-8", MediaType.parse("text/html;; charset=UTF-8").toString()); - assertEquals("text/html; charset=UTF-8", + assertEquals( + "text/html; charset=UTF-8", MediaType.parse("text/html;; charset=UTF-8").toString()); - assertEquals("text/html; charset=UTF-8", + assertEquals( + "text/html; charset=UTF-8", MediaType.parse("text/html;; charset=\"UTF-8\"").toString()); - assertEquals("text/html; charset=UTF-8", + assertEquals( + "text/html; charset=UTF-8", MediaType.parse("text/html;; charset=\"UTF-8").toString()); } - } diff --git a/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java b/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java index 84820ac943..84d3064916 100644 --- a/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java +++ b/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java @@ -27,15 +27,13 @@ import java.io.IOException; import java.io.InputStream; import java.net.URL; - import org.apache.commons.io.ByteOrderMark; import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream; -import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.api.Test; - import org.apache.tika.config.TikaConfig; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; public class MimeDetectionTest { @@ -65,7 +63,9 @@ public void testDetection() throws Exception { testFile("application/xml", "test-utf16be.xml"); testFile("application/xml", "test-long-comment.xml"); testFile("application/xslt+xml", "stylesheet.xsl"); - testUrl("application/rdf+xml", "http://www.ai.sri.com/daml/services/owl-s/1.2/Process.owl", + testUrl( + "application/rdf+xml", + 
"http://www.ai.sri.com/daml/services/owl-s/1.2/Process.owl", "test-difficult-rdf1.xml"); testUrl("application/rdf+xml", "http://www.w3.org/2002/07/owl#", "test-difficult-rdf2.xml"); // add evil test from TIKA-327 @@ -79,7 +79,7 @@ public void testDetection() throws Exception { // test HTML detection of malformed file, previously identified as image/cgm (TIKA-1170) testFile("text/html", "test-malformed-header.html.bin"); - //test GCMD Directory Interchange Format (.dif) TIKA-1561 + // test GCMD Directory Interchange Format (.dif) TIKA-1561 testFile("application/dif+xml", "brwNIMS_2014.dif"); // truncated xml should still be detected as xml, See TIKA-3596 @@ -103,59 +103,80 @@ public void testDetectionWithoutContent() throws IOException { @Test public void testByteOrderMark() throws Exception { - assertEquals(MediaType.TEXT_PLAIN, MIME_TYPES - .detect(new ByteArrayInputStream("\ufefftest".getBytes(UTF_16LE)), new Metadata())); - assertEquals(MediaType.TEXT_PLAIN, MIME_TYPES - .detect(new ByteArrayInputStream("\ufefftest".getBytes(UTF_16BE)), new Metadata())); - assertEquals(MediaType.TEXT_PLAIN, MIME_TYPES - .detect(new ByteArrayInputStream("\ufefftest".getBytes(UTF_8)), new Metadata())); + assertEquals( + MediaType.TEXT_PLAIN, + MIME_TYPES.detect( + new ByteArrayInputStream("\ufefftest".getBytes(UTF_16LE)), new Metadata())); + assertEquals( + MediaType.TEXT_PLAIN, + MIME_TYPES.detect( + new ByteArrayInputStream("\ufefftest".getBytes(UTF_16BE)), new Metadata())); + assertEquals( + MediaType.TEXT_PLAIN, + MIME_TYPES.detect( + new ByteArrayInputStream("\ufefftest".getBytes(UTF_8)), new Metadata())); } @Test public void testRFC822WithBOM() throws Exception { - String header = "From: blah \r\n" + "Received: Friday, January 24, 2020 3:24 PM\r\n" + - "To: someone@somewhere.com\r\n" + "Cc: someone-else@other.com\r\n" + - "Subject: Received\r\n"; + String header = + "From: blah \r\n" + + "Received: Friday, January 24, 2020 3:24 PM\r\n" + + "To: someone@somewhere.com\r\n" + + 
"Cc: someone-else@other.com\r\n" + + "Subject: Received\r\n"; MediaType rfc822 = MediaType.parse("message/rfc822"); - assertEquals(rfc822, MIME_TYPES.detect(UnsynchronizedByteArrayInputStream - .builder() - .setByteArray(header.getBytes(UTF_8)) - .get(), new Metadata())); + assertEquals( + rfc822, + MIME_TYPES.detect( + UnsynchronizedByteArrayInputStream.builder() + .setByteArray(header.getBytes(UTF_8)) + .get(), + new Metadata())); int utfLength = ByteOrderMark.UTF_8.length(); byte[] bytes = new byte[header.getBytes(UTF_8).length + utfLength]; System.arraycopy(ByteOrderMark.UTF_8.getBytes(), 0, bytes, 0, utfLength); System.arraycopy(header.getBytes(UTF_8), 0, bytes, 3, header.getBytes(UTF_8).length); - assertEquals(rfc822, MIME_TYPES.detect(UnsynchronizedByteArrayInputStream - .builder() - .setByteArray(bytes) - .get(), new Metadata())); + assertEquals( + rfc822, + MIME_TYPES.detect( + UnsynchronizedByteArrayInputStream.builder().setByteArray(bytes).get(), + new Metadata())); } @Test public void testSuperTypes() { - assertTrue(REGISTRY.isSpecializationOf(MediaType.parse("text/something; charset=UTF-8"), - MediaType.parse("text/something"))); + assertTrue( + REGISTRY.isSpecializationOf( + MediaType.parse("text/something; charset=UTF-8"), + MediaType.parse("text/something"))); - assertTrue(REGISTRY.isSpecializationOf(MediaType.parse("text/something; charset=UTF-8"), - MediaType.TEXT_PLAIN)); + assertTrue( + REGISTRY.isSpecializationOf( + MediaType.parse("text/something; charset=UTF-8"), MediaType.TEXT_PLAIN)); - assertTrue(REGISTRY.isSpecializationOf(MediaType.parse("text/something; charset=UTF-8"), - MediaType.OCTET_STREAM)); + assertTrue( + REGISTRY.isSpecializationOf( + MediaType.parse("text/something; charset=UTF-8"), MediaType.OCTET_STREAM)); - assertTrue(REGISTRY.isSpecializationOf(MediaType.parse("text/something"), - MediaType.TEXT_PLAIN)); + assertTrue( + REGISTRY.isSpecializationOf( + MediaType.parse("text/something"), MediaType.TEXT_PLAIN)); - 
assertTrue(REGISTRY.isSpecializationOf(MediaType.parse("application/something+xml"), - MediaType.APPLICATION_XML)); + assertTrue( + REGISTRY.isSpecializationOf( + MediaType.parse("application/something+xml"), MediaType.APPLICATION_XML)); - assertTrue(REGISTRY.isSpecializationOf(MediaType.parse("application/something+zip"), - MediaType.APPLICATION_ZIP)); + assertTrue( + REGISTRY.isSpecializationOf( + MediaType.parse("application/something+zip"), MediaType.APPLICATION_ZIP)); assertTrue(REGISTRY.isSpecializationOf(MediaType.APPLICATION_XML, MediaType.TEXT_PLAIN)); - assertTrue(REGISTRY.isSpecializationOf(MediaType.parse("application/vnd.apple.iwork"), - MediaType.APPLICATION_ZIP)); + assertTrue( + REGISTRY.isSpecializationOf( + MediaType.parse("application/vnd.apple.iwork"), MediaType.APPLICATION_ZIP)); } @SuppressWarnings("unused") @@ -168,8 +189,7 @@ private void testUrlWithoutContent(String expected, String url) throws IOExcepti Metadata metadata = new Metadata(); metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, url); String mime = this.MIME_TYPES.detect(null, metadata).toString(); - assertEquals(expected, mime, - url + " is not properly detected using only resource name"); + assertEquals(expected, mime, url + " is not properly detected using only resource name"); } private void testUrl(String expected, String url, String file) throws IOException { @@ -193,13 +213,14 @@ private void testStream(String expected, String urlOrFileName, InputStream in) try { Metadata metadata = new Metadata(); String mime = this.MIME_TYPES.detect(in, metadata).toString(); - assertEquals(expected, mime, - urlOrFileName + " is not properly detected: detected."); + assertEquals(expected, mime, urlOrFileName + " is not properly detected: detected."); - //Add resource name and test again + // Add resource name and test again metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, urlOrFileName); mime = this.MIME_TYPES.detect(in, metadata).toString(); - assertEquals(expected, mime, + 
assertEquals( + expected, + mime, urlOrFileName + " is not properly detected after adding resource name."); } finally { in.close(); @@ -213,37 +234,40 @@ private void testStream(String expected, String urlOrFileName, InputStream in) */ @Test public void testEmptyDocument() throws IOException { - assertEquals(MediaType.OCTET_STREAM, + assertEquals( + MediaType.OCTET_STREAM, MIME_TYPES.detect(new ByteArrayInputStream(new byte[0]), new Metadata())); Metadata namehint = new Metadata(); namehint.set(TikaCoreProperties.RESOURCE_NAME_KEY, "test.txt"); - assertEquals(MediaType.TEXT_PLAIN, + assertEquals( + MediaType.TEXT_PLAIN, MIME_TYPES.detect(new ByteArrayInputStream(new byte[0]), namehint)); Metadata typehint = new Metadata(); typehint.set(Metadata.CONTENT_TYPE, "text/plain"); - assertEquals(MediaType.TEXT_PLAIN, + assertEquals( + MediaType.TEXT_PLAIN, MIME_TYPES.detect(new ByteArrayInputStream(new byte[0]), typehint)); - } /** - * Test for things like javascript files whose content is enclosed in XML - * comment delimiters, but that aren't actually XML. + * Test for things like javascript files whose content is enclosed in XML comment delimiters, + * but that aren't actually XML. * * @see TIKA-426 */ @Test public void testNotXML() throws IOException { - assertEquals(MediaType.TEXT_PLAIN, MIME_TYPES - .detect(new ByteArrayInputStream("".getBytes(UTF_8)), new Metadata())); + assertEquals( + MediaType.TEXT_PLAIN, + MIME_TYPES.detect( + new ByteArrayInputStream("".getBytes(UTF_8)), new Metadata())); } /** - * Tests that when we repeatedly test the detection of a document - * that can be detected with Mime Magic, that we consistently - * detect it correctly. See TIKA-391 for more details. + * Tests that when we repeatedly test the detection of a document that can be detected with Mime + * Magic, that we consistently detect it correctly. See TIKA-391 for more details. 
*/ @Test public void testMimeMagicStability() throws IOException { @@ -253,10 +277,9 @@ public void testMimeMagicStability() throws IOException { } /** - * Tests that when two magic matches both apply, and both - * have the same priority, we use the name to pick the - * right one based on the glob, or the first one we - * come across if not. See TIKA-1292 for more details. + * Tests that when two magic matches both apply, and both have the same priority, we use the + * name to pick the right one based on the glob, or the first one we come across if not. See + * TIKA-1292 for more details. */ @Test public void testMimeMagicClashSamePriority() throws IOException { @@ -280,9 +303,7 @@ public void testMimeMagicClashSamePriority() throws IOException { assertEquals(helloXType, MIME_TYPES.detect(new ByteArrayInputStream(helloWorld), metadata)); } - /** - * Test for TIKA-3771. - */ + /** Test for TIKA-3771. */ @Test public void testPNGWithSomeEmlHeaders() throws IOException { testFile("image/png", "test-pngNotEml.bin"); diff --git a/tika-core/src/test/java/org/apache/tika/mime/MimeTypesReaderTest.java b/tika-core/src/test/java/org/apache/tika/mime/MimeTypesReaderTest.java index 0d904f6dfb..92f542a36c 100644 --- a/tika-core/src/test/java/org/apache/tika/mime/MimeTypesReaderTest.java +++ b/tika-core/src/test/java/org/apache/tika/mime/MimeTypesReaderTest.java @@ -32,26 +32,20 @@ import java.util.Set; import java.util.concurrent.Executors; import java.util.stream.Collectors; - -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; - import org.apache.tika.config.TikaConfig; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; /** - * These tests try to ensure that the MimeTypesReader - * has correctly processed the mime-types.xml file. 
- * To do this, it tests that various aspects of the - * mime-types.xml file have ended up correctly as - * globs, matches, magics etc. - *

- * If you make updates to mime-types.xml, then the - * checks in this test may no longer hold true. - * As such, if tests here start failing after your - * changes, please review the test details, and + * These tests try to ensure that the MimeTypesReader has correctly processed the mime-types.xml + * file. To do this, it tests that various aspects of the mime-types.xml file have ended up + * correctly as globs, matches, magics etc. + * + *

If you make updates to mime-types.xml, then the checks in this test may no longer hold true. + * As such, if tests here start failing after your changes, please review the test details, and * update it to match the new state of the file! */ public class MimeTypesReaderTest { @@ -68,19 +62,21 @@ private static String getTypeAsString(MimeTypes mimeTypes, String text, Metadata return mimeTypes .detect(new ByteArrayInputStream(text.getBytes(StandardCharsets.UTF_8)), metadata) .toString(); - } @SuppressWarnings("unchecked") @BeforeEach - public void setUp() throws NoSuchFieldException, SecurityException, IllegalArgumentException, - IllegalAccessException { + public void setUp() + throws NoSuchFieldException, + SecurityException, + IllegalArgumentException, + IllegalAccessException { this.mimeTypes = TikaConfig.getDefaultConfig().getMimeRepository(); Field magicsField = mimeTypes.getClass().getDeclaredField("magics"); magicsField.setAccessible(true); magics = (List) magicsField.get(mimeTypes); - //ensure reset of custom mimes path + // ensure reset of custom mimes path customMimeTypes = System.getProperty(MimeTypesFactory.CUSTOM_MIMES_SYS_PROP); } @@ -100,9 +96,12 @@ public void testHtmlMatches() throws Exception { // Check on the type MimeType html = mimeTypes.forName("text/html"); assertTrue(html.hasMagic()); - assertTrue(html.getMagics().size() >= minMatches, - "There should be at least " + minMatches + " HTML matches, found " + - html.getMagics().size()); + assertTrue( + html.getMagics().size() >= minMatches, + "There should be at least " + + minMatches + + " HTML matches, found " + + html.getMagics().size()); // Check on the overall magics List htmlMagics = new ArrayList<>(); @@ -112,8 +111,12 @@ public void testHtmlMatches() throws Exception { } } - assertTrue(htmlMagics.size() >= minMatches, - "There should be at least " + minMatches + " HTML matches, found " + htmlMagics.size()); + assertTrue( + htmlMagics.size() >= minMatches, + "There should be at least " + + 
minMatches + + " HTML matches, found " + + htmlMagics.size()); } @Test @@ -123,9 +126,12 @@ public void testExcelMatches() throws Exception { // Check on the type MimeType excel = mimeTypes.forName("application/vnd.ms-excel"); assertTrue(excel.hasMagic()); - assertTrue(excel.getMagics().size() >= minMatches, - "There should be at least " + minMatches + " Excel matches, found " + - excel.getMagics().size()); + assertTrue( + excel.getMagics().size() >= minMatches, + "There should be at least " + + minMatches + + " Excel matches, found " + + excel.getMagics().size()); // Check on the overall magics List excelMagics = new ArrayList<>(); @@ -135,9 +141,12 @@ public void testExcelMatches() throws Exception { } } - assertTrue(excel.getMagics().size() >= minMatches, - "There should be at least " + minMatches + " Excel matches, found " + - excelMagics.size()); + assertTrue( + excel.getMagics().size() >= minMatches, + "There should be at least " + + minMatches + + " Excel matches, found " + + excelMagics.size()); } /** @@ -160,8 +169,8 @@ public void testReadExtendedMetadata() throws Exception { MimeType mime = this.mimeTypes.forName("image/bmp"); assertEquals("BMP", mime.getAcronym()); assertEquals("com.microsoft.bmp", mime.getUniformTypeIdentifier()); - assertEquals("http://en.wikipedia.org/wiki/BMP_file_format", - mime.getLinks().get(0).toString()); + assertEquals( + "http://en.wikipedia.org/wiki/BMP_file_format", mime.getLinks().get(0).toString()); mime = this.mimeTypes.forName("application/xml"); assertEquals("XML", mime.getAcronym()); @@ -200,10 +209,7 @@ public void testReadParameterHierarchy() throws Exception { assertEquals("application/x-berkeley-db", mtAltP.toString()); } - /** - * TIKA-746 Ensures that the custom mimetype maps were also - * loaded and used - */ + /** TIKA-746 Ensures that the custom mimetype maps were also loaded and used */ @Test public void testCustomMimeTypes() { // Check that it knows about our three special ones @@ -259,12 +265,11 @@ public 
void testCustomMimeTypes() { } } - /** - * TIKA-2460 Test loading of custom-mimetypes.xml from sys prop. - */ + /** TIKA-2460 Test loading of custom-mimetypes.xml from sys prop. */ @Test public void testExternalMimeTypes() throws Exception { - System.setProperty(MimeTypesFactory.CUSTOM_MIMES_SYS_PROP, + System.setProperty( + MimeTypesFactory.CUSTOM_MIMES_SYS_PROP, "src/test/resources/org/apache/tika/mime/external-mimetypes.xml"); MimeTypes mimeTypes = MimeTypes.getDefaultMimeTypes(new CustomClassLoader()); Metadata m = new Metadata(); @@ -290,17 +295,16 @@ public void testGetExtensionForJavaScript() throws Exception { @Test public void testGetAliasForJavaScript() throws Exception { MimeType mt = this.mimeTypes.forName("text/javascript"); - Set aliases = mimeTypes.getMediaTypeRegistry() - .getAliases(mt.getType()) - .stream() - .map(MediaType::toString) - .collect(Collectors.toSet()); + Set aliases = + mimeTypes.getMediaTypeRegistry().getAliases(mt.getType()).stream() + .map(MediaType::toString) + .collect(Collectors.toSet()); assertEquals(Set.of("application/javascript", "application/x-javascript"), aliases); } @Test public void testGetRegisteredMimesWithParameters() throws Exception { - //TIKA-1692 + // TIKA-1692 // Media Type always keeps details / parameters String name = "application/xml; charset=UTF-8"; @@ -324,15 +328,17 @@ public void testGetRegisteredMimesWithParameters() throws Exception { @Test public void testMultiThreaded() throws Exception { MimeTypes mimeTypes = MimeTypes.getDefaultMimeTypes(); - Executors.newFixedThreadPool(1).execute(() -> { - try { - for (int i = 0; i < 500 && !stop; i++) { - mimeTypes.forName("abc" + i + "/abc"); - } - } catch (MimeTypeException e) { - e.printStackTrace(); - } - }); + Executors.newFixedThreadPool(1) + .execute( + () -> { + try { + for (int i = 0; i < 500 && !stop; i++) { + mimeTypes.forName("abc" + i + "/abc"); + } + } catch (MimeTypeException e) { + e.printStackTrace(); + } + }); for (int i = 0; i < 500 & !stop; 
i++) { mimeTypes.getMediaTypeRegistry().getAliases(MediaType.APPLICATION_ZIP); @@ -341,63 +347,75 @@ public void testMultiThreaded() throws Exception { @Test public void testMinShouldMatch() throws Exception { - System.setProperty(MimeTypesFactory.CUSTOM_MIMES_SYS_PROP, + System.setProperty( + MimeTypesFactory.CUSTOM_MIMES_SYS_PROP, "src/test/resources/org/apache/tika/mime/custom-mimetypes-minShouldMatch.xml"); MimeTypes mimeTypes = MimeTypes.getDefaultMimeTypes(new CustomClassLoader()); - //matches one - assertEquals("hello/world-min-file", - getTypeAsString(mimeTypes, "Hello World!", new Metadata())); + // matches one + assertEquals( + "hello/world-min-file", getTypeAsString(mimeTypes, "Hello World!", new Metadata())); - //matches two - assertEquals("hello/world-min-file", - getTypeAsString(mimeTypes, "Hello Welt!", new Metadata())); + // matches two + assertEquals( + "hello/world-min-file", getTypeAsString(mimeTypes, "Hello Welt!", new Metadata())); - //matches two - assertEquals("hello/world-min-file", - getTypeAsString(mimeTypes, "Hallo Welt!", new Metadata())); + // matches two + assertEquals( + "hello/world-min-file", getTypeAsString(mimeTypes, "Hallo Welt!", new Metadata())); - //missing ! + // missing ! 
assertEquals("text/plain", getTypeAsString(mimeTypes, "Hello World", new Metadata())); - //Hello requires world, welt or hallo; monde requires bonjour le + // Hello requires world, welt or hallo; monde requires bonjour le assertEquals("text/plain", getTypeAsString(mimeTypes, "Hello Monde", new Metadata())); - //this matcher is treated as "or" with minshouldmatch clause - assertEquals("hello/world-min-file", + // this matcher is treated as "or" with minshouldmatch clause + assertEquals( + "hello/world-min-file", getTypeAsString(mimeTypes, "Bonjour le Monde!", new Metadata())); - } @Test public void testBadMinShouldMatch1() { - System.setProperty(MimeTypesFactory.CUSTOM_MIMES_SYS_PROP, + System.setProperty( + MimeTypesFactory.CUSTOM_MIMES_SYS_PROP, "src/test/resources/org/apache/tika/mime/custom-mimetypes-badMinShouldMatch1.xml"); - assertThrows(IllegalArgumentException.class, () -> MimeTypes.getDefaultMimeTypes(new CustomClassLoader())); + assertThrows( + IllegalArgumentException.class, + () -> MimeTypes.getDefaultMimeTypes(new CustomClassLoader())); } @Test public void testBadMinShouldMatch2() { - System.setProperty(MimeTypesFactory.CUSTOM_MIMES_SYS_PROP, + System.setProperty( + MimeTypesFactory.CUSTOM_MIMES_SYS_PROP, "src/test/resources/org/apache/tika/mime/custom-mimetypes-badMinShouldMatch2.xml"); - assertThrows(IllegalArgumentException.class, () -> MimeTypes.getDefaultMimeTypes(new CustomClassLoader())); + assertThrows( + IllegalArgumentException.class, + () -> MimeTypes.getDefaultMimeTypes(new CustomClassLoader())); } @Test public void testBadMinShouldMatch3() { - System.setProperty(MimeTypesFactory.CUSTOM_MIMES_SYS_PROP, + System.setProperty( + MimeTypesFactory.CUSTOM_MIMES_SYS_PROP, "src/test/resources/org/apache/tika/mime/custom-mimetypes-badMinShouldMatch3.xml"); - assertThrows(IllegalArgumentException.class, () -> MimeTypes.getDefaultMimeTypes(new CustomClassLoader())); + assertThrows( + IllegalArgumentException.class, + () -> 
MimeTypes.getDefaultMimeTypes(new CustomClassLoader())); } @Test public void testBadMinShouldMatch4() { - System.setProperty(MimeTypesFactory.CUSTOM_MIMES_SYS_PROP, + System.setProperty( + MimeTypesFactory.CUSTOM_MIMES_SYS_PROP, "src/test/resources/org/apache/tika/mime/custom-mimetypes-badMinShouldMatch4.xml"); - assertThrows(IllegalArgumentException.class, () -> MimeTypes.getDefaultMimeTypes(new CustomClassLoader())); + assertThrows( + IllegalArgumentException.class, + () -> MimeTypes.getDefaultMimeTypes(new CustomClassLoader())); } - private static class CustomClassLoader extends ClassLoader { - } + private static class CustomClassLoader extends ClassLoader {} } diff --git a/tika-core/src/test/java/org/apache/tika/mime/PatternsTest.java b/tika-core/src/test/java/org/apache/tika/mime/PatternsTest.java index 25721b15fd..6ddc6db33d 100644 --- a/tika-core/src/test/java/org/apache/tika/mime/PatternsTest.java +++ b/tika-core/src/test/java/org/apache/tika/mime/PatternsTest.java @@ -21,7 +21,6 @@ import static org.junit.jupiter.api.Assertions.fail; import java.util.List; - import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; @@ -39,9 +38,7 @@ public void setUp() throws MimeTypeException { text = types.forName("text/plain"); } - /** - * Test add() - */ + /** Test add() */ @Test public void testAdd() throws MimeTypeException { try { @@ -64,9 +61,7 @@ public void testAdd() throws MimeTypeException { } } - /** - * Test matches() - */ + /** Test matches() */ @Test public void testMatches() { try { @@ -96,5 +91,4 @@ public void testExtensions() throws Exception { assertTrue(extensions.contains(".jpg")); assertTrue(extensions.contains(".jpeg")); } - } diff --git a/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTest.java b/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTest.java index c9d0073c21..5220f81c25 100644 --- a/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTest.java +++ 
b/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTest.java @@ -28,12 +28,10 @@ import java.io.InputStream; import java.net.URL; import java.nio.charset.StandardCharsets; - -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; - import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; public class ProbabilisticMimeDetectionTest { @@ -63,7 +61,9 @@ public void testDetection() throws Exception { testFile("application/xml", "test-utf16be.xml"); testFile("application/xml", "test-long-comment.xml"); testFile("application/xslt+xml", "stylesheet.xsl"); - testUrl("application/rdf+xml", "http://www.ai.sri.com/daml/services/owl-s/1.2/Process.owl", + testUrl( + "application/rdf+xml", + "http://www.ai.sri.com/daml/services/owl-s/1.2/Process.owl", "test-difficult-rdf1.xml"); testUrl("application/rdf+xml", "http://www.w3.org/2002/07/owl#", "test-difficult-rdf2.xml"); // add evil test from TIKA-327 @@ -81,39 +81,53 @@ public void testDetection() throws Exception { @Test public void testByteOrderMark() throws Exception { - assertEquals(MediaType.TEXT_PLAIN, proDetector - .detect(new ByteArrayInputStream("\ufefftest".getBytes(UTF_16LE)), new Metadata())); - assertEquals(MediaType.TEXT_PLAIN, proDetector - .detect(new ByteArrayInputStream("\ufefftest".getBytes(UTF_16BE)), new Metadata())); - - assertEquals(MediaType.TEXT_PLAIN, proDetector - .detect(new ByteArrayInputStream("\ufefftest".getBytes(UTF_8)), new Metadata())); + assertEquals( + MediaType.TEXT_PLAIN, + proDetector.detect( + new ByteArrayInputStream("\ufefftest".getBytes(UTF_16LE)), new Metadata())); + assertEquals( + MediaType.TEXT_PLAIN, + proDetector.detect( + new ByteArrayInputStream("\ufefftest".getBytes(UTF_16BE)), new Metadata())); + + assertEquals( + MediaType.TEXT_PLAIN, + proDetector.detect( + new ByteArrayInputStream("\ufefftest".getBytes(UTF_8)), new 
Metadata())); } @Test public void testSuperTypes() { - assertTrue(registry.isSpecializationOf(MediaType.parse("text/something; charset=UTF-8"), - MediaType.parse("text/something"))); + assertTrue( + registry.isSpecializationOf( + MediaType.parse("text/something; charset=UTF-8"), + MediaType.parse("text/something"))); - assertTrue(registry.isSpecializationOf(MediaType.parse("text/something; charset=UTF-8"), - MediaType.TEXT_PLAIN)); + assertTrue( + registry.isSpecializationOf( + MediaType.parse("text/something; charset=UTF-8"), MediaType.TEXT_PLAIN)); - assertTrue(registry.isSpecializationOf(MediaType.parse("text/something; charset=UTF-8"), - MediaType.OCTET_STREAM)); + assertTrue( + registry.isSpecializationOf( + MediaType.parse("text/something; charset=UTF-8"), MediaType.OCTET_STREAM)); - assertTrue(registry.isSpecializationOf(MediaType.parse("text/something"), - MediaType.TEXT_PLAIN)); + assertTrue( + registry.isSpecializationOf( + MediaType.parse("text/something"), MediaType.TEXT_PLAIN)); - assertTrue(registry.isSpecializationOf(MediaType.parse("application/something+xml"), - MediaType.APPLICATION_XML)); + assertTrue( + registry.isSpecializationOf( + MediaType.parse("application/something+xml"), MediaType.APPLICATION_XML)); - assertTrue(registry.isSpecializationOf(MediaType.parse("application/something+zip"), - MediaType.APPLICATION_ZIP)); + assertTrue( + registry.isSpecializationOf( + MediaType.parse("application/something+zip"), MediaType.APPLICATION_ZIP)); assertTrue(registry.isSpecializationOf(MediaType.APPLICATION_XML, MediaType.TEXT_PLAIN)); - assertTrue(registry.isSpecializationOf(MediaType.parse("application/vnd.apple.iwork"), - MediaType.APPLICATION_ZIP)); + assertTrue( + registry.isSpecializationOf( + MediaType.parse("application/vnd.apple.iwork"), MediaType.APPLICATION_ZIP)); } @SuppressWarnings("unused") @@ -143,13 +157,14 @@ private void testStream(String expected, String urlOrFileName, InputStream in) try { Metadata metadata = new Metadata(); 
String mime = this.proDetector.detect(in, metadata).toString(); - assertEquals(expected, mime, - urlOrFileName + " is not properly detected: detected."); + assertEquals(expected, mime, urlOrFileName + " is not properly detected: detected."); // Add resource name and test again metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, urlOrFileName); mime = this.proDetector.detect(in, metadata).toString(); - assertEquals(expected, mime, + assertEquals( + expected, + mime, urlOrFileName + " is not properly detected after adding resource name."); } finally { in.close(); @@ -159,43 +174,44 @@ private void testStream(String expected, String urlOrFileName, InputStream in) /** * Test for type detection of empty documents. * - * @see TIKA-483 + * @see TIKA-483 */ @Test public void testEmptyDocument() throws IOException { - assertEquals(MediaType.OCTET_STREAM, + assertEquals( + MediaType.OCTET_STREAM, proDetector.detect(new ByteArrayInputStream(new byte[0]), new Metadata())); Metadata namehint = new Metadata(); namehint.set(TikaCoreProperties.RESOURCE_NAME_KEY, "test.txt"); - assertEquals(MediaType.TEXT_PLAIN, + assertEquals( + MediaType.TEXT_PLAIN, proDetector.detect(new ByteArrayInputStream(new byte[0]), namehint)); Metadata typehint = new Metadata(); typehint.set(Metadata.CONTENT_TYPE, "text/plain"); - assertEquals(MediaType.TEXT_PLAIN, + assertEquals( + MediaType.TEXT_PLAIN, proDetector.detect(new ByteArrayInputStream(new byte[0]), typehint)); - } /** - * Test for things like javascript files whose content is enclosed in XML - * comment delimiters, but that aren't actually XML. + * Test for things like javascript files whose content is enclosed in XML comment delimiters, + * but that aren't actually XML. 
* - * @see TIKA-426 + * @see TIKA-426 */ @Test public void testNotXML() throws IOException { - assertEquals(MediaType.TEXT_PLAIN, proDetector - .detect(new ByteArrayInputStream("".getBytes(UTF_8)), new Metadata())); + assertEquals( + MediaType.TEXT_PLAIN, + proDetector.detect( + new ByteArrayInputStream("".getBytes(UTF_8)), new Metadata())); } /** - * Tests that when we repeatedly test the detection of a document that can - * be detected with Mime Magic, that we consistently detect it correctly. - * See TIKA-391 for more details. + * Tests that when we repeatedly test the detection of a document that can be detected with Mime + * Magic, that we consistently detect it correctly. See TIKA-391 for more details. */ @Test public void testMimeMagicStability() throws IOException { @@ -205,9 +221,9 @@ public void testMimeMagicStability() throws IOException { } /** - * Tests that when two magic matches both apply, and both have the same - * priority, we use the name to pick the right one based on the glob, or the - * first one we come across if not. See TIKA-1292 for more details. + * Tests that when two magic matches both apply, and both have the same priority, we use the + * name to pick the right one based on the glob, or the first one we come across if not. See + * TIKA-1292 for more details. 
*/ @Test public void testMimeMagicClashSamePriority() throws IOException { @@ -223,24 +239,29 @@ public void testMimeMagicClashSamePriority() throws IOException { metadata = new Metadata(); metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, "test.x-hello-world"); - assertEquals(helloXType, - proDetector.detect(new ByteArrayInputStream(helloWorld), metadata)); + assertEquals( + helloXType, proDetector.detect(new ByteArrayInputStream(helloWorld), metadata)); // Without, goes for the one that sorts last metadata = new Metadata(); metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, "testingTESTINGtesting"); - assertEquals(helloXType, - proDetector.detect(new ByteArrayInputStream(helloWorld), metadata)); + assertEquals( + helloXType, proDetector.detect(new ByteArrayInputStream(helloWorld), metadata)); } @Test public void testTIKA2237() throws IOException { Metadata metadata = new Metadata(); metadata.add(Metadata.CONTENT_TYPE, MediaType.text("javascript").toString()); - InputStream input = new ByteArrayInputStream( - ("function() {};\n" + "try {\n" + " window.location = 'index.html';\n" + - "} catch (e) {\n" + " console.log(e);\n" + "}") - .getBytes(StandardCharsets.UTF_8)); + InputStream input = + new ByteArrayInputStream( + ("function() {};\n" + + "try {\n" + + " window.location = 'index.html';\n" + + "} catch (e) {\n" + + " console.log(e);\n" + + "}") + .getBytes(StandardCharsets.UTF_8)); MediaType detect = new ProbabilisticMimeDetectionSelector().detect(input, metadata); assertEquals(MediaType.text("javascript"), detect); } diff --git a/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTestWithTika.java b/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTestWithTika.java index 69ef03ad51..4f28fd0206 100644 --- a/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTestWithTika.java +++ b/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTestWithTika.java @@ -27,16 +27,14 @@ import 
java.io.IOException; import java.io.InputStream; import java.net.URL; - -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; - import org.apache.tika.Tika; import org.apache.tika.config.ServiceLoader; import org.apache.tika.detect.DefaultProbDetector; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.mime.ProbabilisticMimeDetectionSelector.Builder; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; public class ProbabilisticMimeDetectionTestWithTika { @@ -58,9 +56,12 @@ public void setUp() { * instantiate the object. */ Builder builder = new ProbabilisticMimeDetectionSelector.Builder(); - proSelector = new ProbabilisticMimeDetectionSelector(types, - builder.priorMagicFileType(0.5f).priorExtensionFileType(0.5f) - .priorMetaFileType(0.5f)); + proSelector = + new ProbabilisticMimeDetectionSelector( + types, + builder.priorMagicFileType(0.5f) + .priorExtensionFileType(0.5f) + .priorMetaFileType(0.5f)); DefaultProbDetector detector = new DefaultProbDetector(proSelector, loader); // Use a default Tika, except for our different detector @@ -80,7 +81,9 @@ public void testDetection() throws Exception { testFile("application/xml", "test-utf16be.xml"); testFile("application/xml", "test-long-comment.xml"); testFile("application/xslt+xml", "stylesheet.xsl"); - testUrl("application/rdf+xml", "http://www.ai.sri.com/daml/services/owl-s/1.2/Process.owl", + testUrl( + "application/rdf+xml", + "http://www.ai.sri.com/daml/services/owl-s/1.2/Process.owl", "test-difficult-rdf1.xml"); testUrl("application/rdf+xml", "http://www.w3.org/2002/07/owl#", "test-difficult-rdf2.xml"); // add evil test from TIKA-327 @@ -98,43 +101,54 @@ public void testDetection() throws Exception { @Test public void testByteOrderMark() throws Exception { - assertEquals(MediaType.TEXT_PLAIN.toString(), - tika.detect(new ByteArrayInputStream("\ufefftest".getBytes(UTF_16LE)), - new Metadata())); - 
assertEquals(MediaType.TEXT_PLAIN.toString(), - tika.detect(new ByteArrayInputStream("\ufefftest".getBytes(UTF_16BE)), - new Metadata())); - - assertEquals(MediaType.TEXT_PLAIN.toString(), - tika.detect(new ByteArrayInputStream("\ufefftest".getBytes(UTF_8)), - new Metadata())); + assertEquals( + MediaType.TEXT_PLAIN.toString(), + tika.detect( + new ByteArrayInputStream("\ufefftest".getBytes(UTF_16LE)), new Metadata())); + assertEquals( + MediaType.TEXT_PLAIN.toString(), + tika.detect( + new ByteArrayInputStream("\ufefftest".getBytes(UTF_16BE)), new Metadata())); + + assertEquals( + MediaType.TEXT_PLAIN.toString(), + tika.detect( + new ByteArrayInputStream("\ufefftest".getBytes(UTF_8)), new Metadata())); } @Test public void testSuperTypes() { - assertTrue(registry.isSpecializationOf(MediaType.parse("text/something; charset=UTF-8"), - MediaType.parse("text/something"))); + assertTrue( + registry.isSpecializationOf( + MediaType.parse("text/something; charset=UTF-8"), + MediaType.parse("text/something"))); - assertTrue(registry.isSpecializationOf(MediaType.parse("text/something; charset=UTF-8"), - MediaType.TEXT_PLAIN)); + assertTrue( + registry.isSpecializationOf( + MediaType.parse("text/something; charset=UTF-8"), MediaType.TEXT_PLAIN)); - assertTrue(registry.isSpecializationOf(MediaType.parse("text/something; charset=UTF-8"), - MediaType.OCTET_STREAM)); + assertTrue( + registry.isSpecializationOf( + MediaType.parse("text/something; charset=UTF-8"), MediaType.OCTET_STREAM)); - assertTrue(registry.isSpecializationOf(MediaType.parse("text/something"), - MediaType.TEXT_PLAIN)); + assertTrue( + registry.isSpecializationOf( + MediaType.parse("text/something"), MediaType.TEXT_PLAIN)); - assertTrue(registry.isSpecializationOf(MediaType.parse("application/something+xml"), - MediaType.APPLICATION_XML)); + assertTrue( + registry.isSpecializationOf( + MediaType.parse("application/something+xml"), MediaType.APPLICATION_XML)); - 
assertTrue(registry.isSpecializationOf(MediaType.parse("application/something+zip"), - MediaType.APPLICATION_ZIP)); + assertTrue( + registry.isSpecializationOf( + MediaType.parse("application/something+zip"), MediaType.APPLICATION_ZIP)); assertTrue(registry.isSpecializationOf(MediaType.APPLICATION_XML, MediaType.TEXT_PLAIN)); - assertTrue(registry.isSpecializationOf(MediaType.parse("application/vnd.apple.iwork"), - MediaType.APPLICATION_ZIP)); + assertTrue( + registry.isSpecializationOf( + MediaType.parse("application/vnd.apple.iwork"), MediaType.APPLICATION_ZIP)); } @SuppressWarnings("unused") @@ -165,14 +179,15 @@ private void testStream(String expected, String urlOrFileName, InputStream in) Metadata metadata = new Metadata(); // String mime = this.proDetector.detect(in, metadata).toString(); String mime = tika.detect(in, metadata); - assertEquals(expected, mime, - urlOrFileName + " is not properly detected: detected."); + assertEquals(expected, mime, urlOrFileName + " is not properly detected: detected."); // Add resource name and test again metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, urlOrFileName); // mime = this.proDetector.detect(in, metadata).toString(); mime = tika.detect(in, metadata); - assertEquals(expected, mime, + assertEquals( + expected, + mime, urlOrFileName + " is not properly detected after adding resource name."); } finally { in.close(); @@ -182,44 +197,44 @@ private void testStream(String expected, String urlOrFileName, InputStream in) /** * Test for type detection of empty documents. 
* - * @see TIKA-483 + * @see TIKA-483 */ @Test public void testEmptyDocument() throws IOException { - assertEquals(MediaType.OCTET_STREAM.toString(), + assertEquals( + MediaType.OCTET_STREAM.toString(), tika.detect(new ByteArrayInputStream(new byte[0]), new Metadata())); Metadata namehint = new Metadata(); namehint.set(TikaCoreProperties.RESOURCE_NAME_KEY, "test.txt"); - assertEquals(MediaType.TEXT_PLAIN.toString(), + assertEquals( + MediaType.TEXT_PLAIN.toString(), tika.detect(new ByteArrayInputStream(new byte[0]), namehint)); Metadata typehint = new Metadata(); typehint.set(Metadata.CONTENT_TYPE, "text/plain"); - assertEquals(MediaType.TEXT_PLAIN.toString(), + assertEquals( + MediaType.TEXT_PLAIN.toString(), tika.detect(new ByteArrayInputStream(new byte[0]), typehint)); - } /** - * Test for things like javascript files whose content is enclosed in XML - * comment delimiters, but that aren't actually XML. + * Test for things like javascript files whose content is enclosed in XML comment delimiters, + * but that aren't actually XML. * - * @see TIKA-426 + * @see TIKA-426 */ @Test public void testNotXML() throws IOException { - assertEquals(MediaType.TEXT_PLAIN.toString(), - tika.detect(new ByteArrayInputStream("".getBytes(UTF_8)), - new Metadata())); + assertEquals( + MediaType.TEXT_PLAIN.toString(), + tika.detect( + new ByteArrayInputStream("".getBytes(UTF_8)), new Metadata())); } /** - * Tests that when we repeatedly test the detection of a document that can - * be detected with Mime Magic, that we consistently detect it correctly. - * See TIKA-391 for more details. + * Tests that when we repeatedly test the detection of a document that can be detected with Mime + * Magic, that we consistently detect it correctly. See TIKA-391 for more details. 
*/ @Test public void testMimeMagicStability() throws IOException { @@ -229,9 +244,9 @@ public void testMimeMagicStability() throws IOException { } /** - * Tests that when two magic matches both apply, and both have the same - * priority, we use the name to pick the right one based on the glob, or the - * first one we come across if not. See TIKA-1292 for more details. + * Tests that when two magic matches both apply, and both have the same priority, we use the + * name to pick the right one based on the glob, or the first one we come across if not. See + * TIKA-1292 for more details. */ @Test public void testMimeMagicClashSamePriority() throws IOException { @@ -243,18 +258,18 @@ public void testMimeMagicClashSamePriority() throws IOException { // With a filename, picks the right one metadata = new Metadata(); metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, "test.hello.world"); - assertEquals(helloType.toString(), - tika.detect(new ByteArrayInputStream(helloWorld), metadata)); + assertEquals( + helloType.toString(), tika.detect(new ByteArrayInputStream(helloWorld), metadata)); metadata = new Metadata(); metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, "test.x-hello-world"); - assertEquals(helloXType.toString(), - tika.detect(new ByteArrayInputStream(helloWorld), metadata)); + assertEquals( + helloXType.toString(), tika.detect(new ByteArrayInputStream(helloWorld), metadata)); // Without, goes for the one that sorts last metadata = new Metadata(); metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, "testingTESTINGtesting"); - assertEquals(helloXType.toString(), - tika.detect(new ByteArrayInputStream(helloWorld), metadata)); + assertEquals( + helloXType.toString(), tika.detect(new ByteArrayInputStream(helloWorld), metadata)); } } diff --git a/tika-core/src/test/java/org/apache/tika/mime/RFC822DetectionTest.java b/tika-core/src/test/java/org/apache/tika/mime/RFC822DetectionTest.java index 7340e06a28..ac395c2e6a 100644 --- 
a/tika-core/src/test/java/org/apache/tika/mime/RFC822DetectionTest.java +++ b/tika-core/src/test/java/org/apache/tika/mime/RFC822DetectionTest.java @@ -20,12 +20,10 @@ import java.io.IOException; import java.nio.charset.StandardCharsets; - import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream; -import org.junit.jupiter.api.Test; - import org.apache.tika.config.TikaConfig; import org.apache.tika.metadata.Metadata; +import org.junit.jupiter.api.Test; public class RFC822DetectionTest { @@ -33,47 +31,59 @@ public class RFC822DetectionTest { @Test public void testBasic() throws Exception { - for (String txt : new String[]{ - "Date: blah\nSent: someone\r\nthis is a test", - "date: blah\nSent: someone\r\nthis is a test", - "date: blah\nDelivered-To: someone\r\nthis is a test" - }) { + for (String txt : + new String[] { + "Date: blah\nSent: someone\r\nthis is a test", + "date: blah\nSent: someone\r\nthis is a test", + "date: blah\nDelivered-To: someone\r\nthis is a test" + }) { assertMime("message/rfc822", txt); } - for (String txt : new String[]{ - //test missing colon - "Date blah\nSent: someone\r\nthis is a test", - //test precursor junk - "some precursor junk Date: blah\nSent: someone\r\nthis is a test", - "some precursor junk\nDate: blah\nSent: someone\r\nthis is a test", - "some precursor junk:\nDate: blah\nSent: someone\r\nthis is a test", - //confirm that date is case-insensitive, but delivered-to is case-sensitive - "date: blah\ndelivered-To: someone\r\nthis is a test", - //test that a file that starts only with "Subject:" and no other header is - //detected as text/plain - "Subject: this is a subject\nand there's some other text", - "To: someone\nand there's some other text", - "To: someone or other" - }) { + for (String txt : + new String[] { + // test missing colon + "Date blah\nSent: someone\r\nthis is a test", + // test precursor junk + "some precursor junk Date: blah\nSent: someone\r\nthis is a test", + "some precursor junk\nDate: blah\nSent: 
someone\r\nthis is a test", + "some precursor junk:\nDate: blah\nSent: someone\r\nthis is a test", + // confirm that date is case-insensitive, but delivered-to is case-sensitive + "date: blah\ndelivered-To: someone\r\nthis is a test", + // test that a file that starts only with "Subject:" and no other header is + // detected as text/plain + "Subject: this is a subject\nand there's some other text", + "To: someone\nand there's some other text", + "To: someone or other" + }) { assertMime("text/plain", txt); } - //TIKA-4153, specifically - String txt = "Some text here 1.\n" + "Some text here 2.\n" + "Some text here 3.\n" + - "Original Message-----\n" + "From: some_mail@abc.com\n" + - "Sent: Thursday, October 31, 2019 9:52 AM\n" + - "To: Some person, (The XYZ group)\n" + - "Subject: RE: Mr. Random person phone call: MESSAGE\n" + "Hi,\n" + - "I am available now to receive the call.\n" + "Some text here 4.\n" + - "Some text here 5.\n" + "Some text here 6."; + // TIKA-4153, specifically + String txt = + "Some text here 1.\n" + + "Some text here 2.\n" + + "Some text here 3.\n" + + "Original Message-----\n" + + "From: some_mail@abc.com\n" + + "Sent: Thursday, October 31, 2019 9:52 AM\n" + + "To: Some person, (The XYZ group)\n" + + "Subject: RE: Mr. 
Random person phone call: MESSAGE\n" + + "Hi,\n" + + "I am available now to receive the call.\n" + + "Some text here 4.\n" + + "Some text here 5.\n" + + "Some text here 6."; assertMime("text/plain", txt); } private void assertMime(String expected, String txt) throws IOException { MediaType mediaType = - MIME_TYPES.detect(UnsynchronizedByteArrayInputStream.builder() - .setByteArray(txt.getBytes(StandardCharsets.UTF_8)).get(), new Metadata()); + MIME_TYPES.detect( + UnsynchronizedByteArrayInputStream.builder() + .setByteArray(txt.getBytes(StandardCharsets.UTF_8)) + .get(), + new Metadata()); assertEquals(expected, mediaType.toString(), txt); } } diff --git a/tika-core/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java b/tika-core/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java index 62b061d98e..026f6fd3ee 100644 --- a/tika-core/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java +++ b/tika-core/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java @@ -20,9 +20,6 @@ import static org.junit.jupiter.api.Assertions.assertTrue; import java.io.InputStream; - -import org.junit.jupiter.api.Test; - import org.apache.tika.config.TikaConfig; import org.apache.tika.extractor.EmbeddedBytesSelector; import org.apache.tika.extractor.RUnpackExtractor; @@ -30,14 +27,15 @@ import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.utils.StringUtils; +import org.junit.jupiter.api.Test; public class AutoDetectParserConfigTest { @Test public void testEmbeddedBytesSelector() throws Exception { TikaConfig config; - try (InputStream is = TikaConfig.class.getResourceAsStream( - "TIKA-4207-embedded-bytes-config.xml")) { + try (InputStream is = + TikaConfig.class.getResourceAsStream("TIKA-4207-embedded-bytes-config.xml")) { config = new TikaConfig(is); } AutoDetectParserConfig c = config.getAutoDetectParserConfig(); @@ -56,7 +54,6 @@ public void 
testEmbeddedBytesSelector() throws Exception { assertFalse(selector.select(getMetadata("application/pdf", "MACRO"))); assertFalse(selector.select(getMetadata("application/docx", ""))); - } private Metadata getMetadata(String mime, String embeddedResourceType) { diff --git a/tika-core/src/test/java/org/apache/tika/parser/CompositeParserTest.java b/tika-core/src/test/java/org/apache/tika/parser/CompositeParserTest.java index 5519dce675..e99756cabf 100644 --- a/tika-core/src/test/java/org/apache/tika/parser/CompositeParserTest.java +++ b/tika-core/src/test/java/org/apache/tika/parser/CompositeParserTest.java @@ -26,36 +26,37 @@ import java.util.List; import java.util.Map; import java.util.Set; - -import org.junit.jupiter.api.Test; -import org.xml.sax.ContentHandler; - import org.apache.tika.config.TikaConfig; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; import org.apache.tika.mime.MediaTypeRegistry; import org.apache.tika.sax.BodyContentHandler; +import org.junit.jupiter.api.Test; +import org.xml.sax.ContentHandler; public class CompositeParserTest { @Test @SuppressWarnings("serial") public void testFindDuplicateParsers() { - Parser a = new EmptyParser() { - public Set getSupportedTypes(ParseContext context) { - return Collections.singleton(MediaType.TEXT_PLAIN); - } - }; - Parser b = new EmptyParser() { - public Set getSupportedTypes(ParseContext context) { - return Collections.singleton(MediaType.TEXT_PLAIN); - } - }; - Parser c = new EmptyParser() { - public Set getSupportedTypes(ParseContext context) { - return Collections.singleton(MediaType.OCTET_STREAM); - } - }; + Parser a = + new EmptyParser() { + public Set getSupportedTypes(ParseContext context) { + return Collections.singleton(MediaType.TEXT_PLAIN); + } + }; + Parser b = + new EmptyParser() { + public Set getSupportedTypes(ParseContext context) { + return Collections.singleton(MediaType.TEXT_PLAIN); + } + }; + Parser c = + new EmptyParser() { + public Set 
getSupportedTypes(ParseContext context) { + return Collections.singleton(MediaType.OCTET_STREAM); + } + }; CompositeParser composite = new CompositeParser(MediaTypeRegistry.getDefaultRegistry(), a, b, c); @@ -86,24 +87,26 @@ public void testMimeTypeAliases() throws Exception { bmpCanonicalMetadata.put("BMP", "True"); bmpCanonicalMetadata.put("Canonical", "True"); Parser bmpCanonicalParser = - new DummyParser(new HashSet<>(Collections.singletonList(bmpCanonical)), - bmpCanonicalMetadata, null); + new DummyParser( + new HashSet<>(Collections.singletonList(bmpCanonical)), + bmpCanonicalMetadata, + null); MediaType bmpAlias = MediaType.image("x-ms-bmp"); Map bmpAliasMetadata = new HashMap<>(); bmpAliasMetadata.put("BMP", "True"); bmpAliasMetadata.put("Alias", "True"); Parser bmpAliasParser = - new DummyParser(new HashSet<>(Collections.singletonList(bmpAlias)), bmpAliasMetadata, - null); + new DummyParser( + new HashSet<>(Collections.singletonList(bmpAlias)), bmpAliasMetadata, null); TikaConfig config = TikaConfig.getDefaultConfig(); CompositeParser canonical = new CompositeParser(config.getMediaTypeRegistry(), bmpCanonicalParser); CompositeParser alias = new CompositeParser(config.getMediaTypeRegistry(), bmpAliasParser); CompositeParser both = - new CompositeParser(config.getMediaTypeRegistry(), bmpCanonicalParser, - bmpAliasParser); + new CompositeParser( + config.getMediaTypeRegistry(), bmpCanonicalParser, bmpAliasParser); ContentHandler handler = new BodyContentHandler(); Metadata metadata; @@ -111,12 +114,11 @@ public void testMimeTypeAliases() throws Exception { // Canonical and Canonical metadata = new Metadata(); metadata.add(Metadata.CONTENT_TYPE, bmpCanonical.toString()); - canonical.parse(new ByteArrayInputStream(new byte[0]), handler, metadata, - new ParseContext()); + canonical.parse( + new ByteArrayInputStream(new byte[0]), handler, metadata, new ParseContext()); assertEquals("True", metadata.get("BMP")); assertEquals("True", metadata.get("Canonical")); - 
// Alias and Alias metadata = new Metadata(); metadata.add(Metadata.CONTENT_TYPE, bmpAlias.toString()); @@ -124,16 +126,14 @@ public void testMimeTypeAliases() throws Exception { assertEquals("True", metadata.get("BMP")); assertEquals("True", metadata.get("Alias")); - // Alias type and Canonical parser metadata = new Metadata(); metadata.add(Metadata.CONTENT_TYPE, bmpAlias.toString()); - canonical.parse(new ByteArrayInputStream(new byte[0]), handler, metadata, - new ParseContext()); + canonical.parse( + new ByteArrayInputStream(new byte[0]), handler, metadata, new ParseContext()); assertEquals("True", metadata.get("BMP")); assertEquals("True", metadata.get("Canonical")); - // Canonical type and Alias parser metadata = new Metadata(); metadata.add(Metadata.CONTENT_TYPE, bmpCanonical.toString()); @@ -141,7 +141,6 @@ public void testMimeTypeAliases() throws Exception { assertEquals("True", metadata.get("BMP")); assertEquals("True", metadata.get("Alias")); - // And when both are there, will go for the last one // to be registered (which is the alias one) metadata = new Metadata(); diff --git a/tika-core/src/test/java/org/apache/tika/parser/DummyInitializableParser.java b/tika-core/src/test/java/org/apache/tika/parser/DummyInitializableParser.java index 7b329faeef..384c060cc1 100644 --- a/tika-core/src/test/java/org/apache/tika/parser/DummyInitializableParser.java +++ b/tika-core/src/test/java/org/apache/tika/parser/DummyInitializableParser.java @@ -22,10 +22,6 @@ import java.util.HashSet; import java.util.Map; import java.util.Set; - -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.config.Field; import org.apache.tika.config.Initializable; import org.apache.tika.config.InitializableProblemHandler; @@ -34,11 +30,10 @@ import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; -/** - * 
This tests that initialize() is called after adding the parameters - * configured via TikaConfig - */ +/** This tests that initialize() is called after adding the parameters configured via TikaConfig */ public class DummyInitializableParser implements Parser, Initializable { public static String SUM_FIELD = "SUM"; @@ -48,10 +43,8 @@ public class DummyInitializableParser implements Parser, Initializable { MIMES.add(MediaType.TEXT_PLAIN); } - @Field - private short shortA = -2; - @Field - private short shortB = -3; + @Field private short shortA = -2; + @Field private short shortB = -3; private int sum = 0; @Override @@ -60,8 +53,9 @@ public Set getSupportedTypes(ParseContext context) { } @Override - public void parse(InputStream stream, ContentHandler handler, Metadata metadata, - ParseContext context) throws IOException, SAXException, TikaException { + public void parse( + InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) + throws IOException, SAXException, TikaException { metadata.set(SUM_FIELD, Integer.toString(sum)); } @@ -75,10 +69,10 @@ public void initialize(Map params) throws TikaConfigException { @Override public void checkInitialization(InitializableProblemHandler handler) throws TikaConfigException { - //completely arbitrary + // completely arbitrary if (sum > 1000) { - handler.handleInitializableProblem("DummyInitializableParser", - "sum cannot be > 1000: " + sum); + handler.handleInitializableProblem( + "DummyInitializableParser", "sum cannot be > 1000: " + sum); } } } diff --git a/tika-core/src/test/java/org/apache/tika/parser/DummyParameterizedParser.java b/tika-core/src/test/java/org/apache/tika/parser/DummyParameterizedParser.java index 5483474923..da759326e0 100644 --- a/tika-core/src/test/java/org/apache/tika/parser/DummyParameterizedParser.java +++ b/tika-core/src/test/java/org/apache/tika/parser/DummyParameterizedParser.java @@ -26,14 +26,12 @@ import java.net.URL; import java.util.HashSet; import java.util.Set; 
- -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.config.Field; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; /** * A test Parsers to test {@link Field} @@ -54,36 +52,26 @@ public class DummyParameterizedParser implements Parser { @Field(name = "testparam") private String testParam = "init_string"; - @Field - private short xshort = -2; + @Field private short xshort = -2; - @Field - private int xint = -3; + @Field private int xint = -3; - @Field - private long xlong = -4; + @Field private long xlong = -4; @Field(name = "xbigint") private BigInteger xbigInt; - @Field - private float xfloat = -5.0f; - - @Field - private double xdouble = -6.0d; + @Field private float xfloat = -5.0f; - @Field - private boolean xbool = true; + @Field private double xdouble = -6.0d; - @Field - private URL xurl; + @Field private boolean xbool = true; - @Field - private URI xuri; + @Field private URL xurl; - @Field - private String missing = "default"; + @Field private URI xuri; + @Field private String missing = "default"; private final String inner = "inner"; private File xfile; @@ -113,8 +101,9 @@ public Set getSupportedTypes(ParseContext context) { } @Override - public void parse(InputStream stream, ContentHandler handler, Metadata metadata, - ParseContext context) throws IOException, SAXException, TikaException { + public void parse( + InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) + throws IOException, SAXException, TikaException { metadata.add("testparam", testParam); metadata.add("xshort", xshort + ""); diff --git a/tika-core/src/test/java/org/apache/tika/parser/DummyParser.java b/tika-core/src/test/java/org/apache/tika/parser/DummyParser.java index 9b1ffcc4e1..240265f241 100644 --- a/tika-core/src/test/java/org/apache/tika/parser/DummyParser.java 
+++ b/tika-core/src/test/java/org/apache/tika/parser/DummyParser.java @@ -21,19 +21,17 @@ import java.util.Map; import java.util.Map.Entry; import java.util.Set; - -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; /** * A Dummy Parser for use with unit tests. - *

- * See also {@link org.apache.tika.parser.mock.MockParser}. + * + *

See also {@link org.apache.tika.parser.mock.MockParser}. */ public class DummyParser implements Parser { private final Set types; @@ -50,8 +48,9 @@ public Set getSupportedTypes(ParseContext context) { return types; } - public void parse(InputStream stream, ContentHandler handler, Metadata metadata, - ParseContext context) throws IOException, SAXException, TikaException { + public void parse( + InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) + throws IOException, SAXException, TikaException { for (Entry m : this.metadata.entrySet()) { metadata.add(m.getKey(), m.getValue()); } @@ -63,5 +62,4 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, } xhtml.endDocument(); } - } diff --git a/tika-core/src/test/java/org/apache/tika/parser/InitializableParserTest.java b/tika-core/src/test/java/org/apache/tika/parser/InitializableParserTest.java index 9571ab2527..c42bbee2bc 100644 --- a/tika-core/src/test/java/org/apache/tika/parser/InitializableParserTest.java +++ b/tika-core/src/test/java/org/apache/tika/parser/InitializableParserTest.java @@ -20,13 +20,11 @@ import java.net.URL; import java.nio.charset.StandardCharsets; - -import org.junit.jupiter.api.Test; - import org.apache.tika.Tika; import org.apache.tika.config.TikaConfig; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; +import org.junit.jupiter.api.Test; public class InitializableParserTest { diff --git a/tika-core/src/test/java/org/apache/tika/parser/ParameterizedParserTest.java b/tika-core/src/test/java/org/apache/tika/parser/ParameterizedParserTest.java index 7ad198f521..9550fd34fe 100644 --- a/tika-core/src/test/java/org/apache/tika/parser/ParameterizedParserTest.java +++ b/tika-core/src/test/java/org/apache/tika/parser/ParameterizedParserTest.java @@ -24,44 +24,42 @@ import java.net.URL; import java.util.HashMap; import java.util.Map; - -import org.junit.jupiter.api.Test; -import org.xml.sax.SAXException; - 
import org.apache.tika.Tika; import org.apache.tika.config.TikaConfig; import org.apache.tika.exception.TikaConfigException; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; +import org.junit.jupiter.api.Test; +import org.xml.sax.SAXException; public class ParameterizedParserTest { - private static final Map expcted = new HashMap() { - { - put("testparam", "testparamval"); - put("xshort", "1000"); - put("xint", "999999999"); - put("xlong", "9999999999999"); - put("xbigint", "99999999999999999999999999999999999999999999999"); - put("xfloat", "10.2"); - put("xbool", "true"); - put("xdouble", "4.6"); - put("xurl", "http://apache.org"); - put("xfile", "somefile"); - put("xuri", "tika://customuri?param=value"); - - put("inner", "inner"); - put("missing", "default"); - } - }; - + private static final Map expcted = + new HashMap() { + { + put("testparam", "testparamval"); + put("xshort", "1000"); + put("xint", "999999999"); + put("xlong", "9999999999999"); + put("xbigint", "99999999999999999999999999999999999999999999999"); + put("xfloat", "10.2"); + put("xbool", "true"); + put("xdouble", "4.6"); + put("xurl", "http://apache.org"); + put("xfile", "somefile"); + put("xuri", "tika://customuri?param=value"); + + put("inner", "inner"); + put("missing", "default"); + } + }; @Test public void testConfigurableParserTypes() throws Exception { Metadata md = getMetadata("TIKA-1986-parameterized.xml"); for (Map.Entry entry : expcted.entrySet()) { - assertEquals(entry.getValue(), - md.get(entry.getKey()), "mismatch for " + entry.getKey()); + assertEquals( + entry.getValue(), md.get(entry.getKey()), "mismatch for " + entry.getKey()); } } @@ -69,16 +67,15 @@ public void testConfigurableParserTypes() throws Exception { public void testConfigurableParserTypesDecorated() throws Exception { Metadata md = getMetadata("TIKA-1986-parameterized-decorated.xml"); for (Map.Entry entry : expcted.entrySet()) { - assertEquals(entry.getValue(), - 
md.get(entry.getKey()), "mismatch for " + entry.getKey()); + assertEquals( + entry.getValue(), md.get(entry.getKey()), "mismatch for " + entry.getKey()); } } - @Test public void testSomeParams() throws Exception { - //test that a parameterized parser can read a config file - //with only some changes to the initial values + // test that a parameterized parser can read a config file + // with only some changes to the initial values Metadata md = getMetadata("TIKA-1986-some-parameters.xml"); assertEquals("-6.0", md.get("xdouble")); assertEquals("testparamval", md.get("testparam")); @@ -87,20 +84,24 @@ public void testSomeParams() throws Exception { @Test public void testBadValue() throws Exception { - assertThrows(TikaConfigException.class, () -> { - getMetadata("TIKA-1986-bad-values.xml"); - }); + assertThrows( + TikaConfigException.class, + () -> { + getMetadata("TIKA-1986-bad-values.xml"); + }); } @Test public void testBadType() throws Exception { - assertThrows(TikaConfigException.class, () -> { - getMetadata("TIKA-1986-bad-types.xml"); - }); + assertThrows( + TikaConfigException.class, + () -> { + getMetadata("TIKA-1986-bad-types.xml"); + }); } - //TODO later -- add a test for a parser that isn't configurable - //but that has params in the config file + // TODO later -- add a test for a parser that isn't configurable + // but that has params in the config file private Metadata getMetadata(String name) throws TikaException, IOException, SAXException { URL url = this.getClass().getResource("/org/apache/tika/config/" + name); diff --git a/tika-core/src/test/java/org/apache/tika/parser/ParserDecoratorTest.java b/tika-core/src/test/java/org/apache/tika/parser/ParserDecoratorTest.java index 141c058af8..30ed57b98f 100644 --- a/tika-core/src/test/java/org/apache/tika/parser/ParserDecoratorTest.java +++ b/tika-core/src/test/java/org/apache/tika/parser/ParserDecoratorTest.java @@ -25,12 +25,10 @@ import java.util.HashMap; import java.util.HashSet; import java.util.Set; - 
-import org.junit.jupiter.api.Test; - import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; import org.apache.tika.sax.BodyContentHandler; +import org.junit.jupiter.api.Test; public class ParserDecoratorTest { @@ -46,7 +44,6 @@ public void withAndWithoutTypes() { Set types; ParseContext context = new ParseContext(); - // With a parser of no types, get the decorated type p = ParserDecorator.withTypes(EmptyParser.INSTANCE, onlyTxt); types = p.getSupportedTypes(context); @@ -54,34 +51,28 @@ public void withAndWithoutTypes() { assertTrue(types.contains(MediaType.TEXT_PLAIN), types.toString()); // With a parser with other types, still just the decorated type - p = ParserDecorator - .withTypes(new DummyParser(onlyOct, new HashMap<>(), ""), onlyTxt); + p = ParserDecorator.withTypes(new DummyParser(onlyOct, new HashMap<>(), ""), onlyTxt); types = p.getSupportedTypes(context); assertEquals(1, types.size()); assertTrue(types.contains(MediaType.TEXT_PLAIN), types.toString()); - // Exclude will remove if there p = ParserDecorator.withoutTypes(EmptyParser.INSTANCE, onlyTxt); types = p.getSupportedTypes(context); assertEquals(0, types.size()); - p = ParserDecorator - .withoutTypes(new DummyParser(onlyOct, new HashMap<>(), ""), onlyTxt); + p = ParserDecorator.withoutTypes(new DummyParser(onlyOct, new HashMap<>(), ""), onlyTxt); types = p.getSupportedTypes(context); assertEquals(1, types.size()); assertTrue(types.contains(MediaType.OCTET_STREAM), types.toString()); - p = ParserDecorator - .withoutTypes(new DummyParser(both, new HashMap<>(), ""), onlyTxt); + p = ParserDecorator.withoutTypes(new DummyParser(both, new HashMap<>(), ""), onlyTxt); types = p.getSupportedTypes(context); assertEquals(1, types.size()); assertTrue(types.contains(MediaType.OCTET_STREAM), types.toString()); } - /** - * Testing one proposed implementation for TIKA-1509 - */ + /** Testing one proposed implementation for TIKA-1509 */ @Test public void withFallback() throws Exception { 
Set onlyOct = Collections.singleton(MediaType.OCTET_STREAM); @@ -97,8 +88,8 @@ public void withFallback() throws Exception { EmptyParser pNothing = new EmptyParser(); // Create a combination which will fail first - @SuppressWarnings("deprecation") Parser p = - ParserDecorator.withFallbacks(Arrays.asList(pFail, pWork), octAndText); + @SuppressWarnings("deprecation") + Parser p = ParserDecorator.withFallbacks(Arrays.asList(pFail, pWork), octAndText); // Will claim to support the types given, not those on the child parsers Set types = p.getSupportedTypes(context); @@ -109,15 +100,14 @@ public void withFallback() throws Exception { // Parsing will make it to the second one metadata = new Metadata(); handler = new BodyContentHandler(); - p.parse(new ByteArrayInputStream(new byte[]{0, 1, 2, 3, 4}), handler, metadata, context); + p.parse(new ByteArrayInputStream(new byte[] {0, 1, 2, 3, 4}), handler, metadata, context); assertEquals("Fell back!", handler.toString()); - // With a parser that will work with no output, will get nothing p = ParserDecorator.withFallbacks(Arrays.asList(pNothing, pWork), octAndText); metadata = new Metadata(); handler = new BodyContentHandler(); - p.parse(new ByteArrayInputStream(new byte[]{0, 1, 2, 3, 4}), handler, metadata, context); + p.parse(new ByteArrayInputStream(new byte[] {0, 1, 2, 3, 4}), handler, metadata, context); assertEquals("", handler.toString()); } } diff --git a/tika-core/src/test/java/org/apache/tika/parser/RegexCaptureParserTest.java b/tika-core/src/test/java/org/apache/tika/parser/RegexCaptureParserTest.java index 13c5eada73..c65a202fa9 100644 --- a/tika-core/src/test/java/org/apache/tika/parser/RegexCaptureParserTest.java +++ b/tika-core/src/test/java/org/apache/tika/parser/RegexCaptureParserTest.java @@ -22,31 +22,29 @@ import java.nio.charset.StandardCharsets; import java.util.HashMap; import java.util.Map; - +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; import 
org.junit.jupiter.api.Test; import org.xml.sax.ContentHandler; import org.xml.sax.helpers.DefaultHandler; -import org.apache.tika.io.TikaInputStream; -import org.apache.tika.metadata.Metadata; - public class RegexCaptureParserTest { @Test public void testBasic() throws Exception { Metadata m = new Metadata(); ContentHandler contentHandler = new DefaultHandler(); - String output = "Something\n" + - "Title: the quick brown fox\n" + - "Author: jumped over\n" + - "Created: 10/20/2024"; + String output = + "Something\n" + + "Title: the quick brown fox\n" + + "Author: jumped over\n" + + "Created: 10/20/2024"; RegexCaptureParser parser = new RegexCaptureParser(); Map regexes = new HashMap<>(); regexes.put("title", "^Title: ([^\r\n]+)"); parser.setCaptureMap(regexes); - try (InputStream stream = - TikaInputStream.get(output.getBytes(StandardCharsets.UTF_8))) { + try (InputStream stream = TikaInputStream.get(output.getBytes(StandardCharsets.UTF_8))) { parser.parse(stream, contentHandler, m, new ParseContext()); } assertEquals("the quick brown fox", m.get("title")); diff --git a/tika-core/src/test/java/org/apache/tika/parser/external2/ExternalParserTest.java b/tika-core/src/test/java/org/apache/tika/parser/external2/ExternalParserTest.java index d4c3899550..eb74cd34d6 100644 --- a/tika-core/src/test/java/org/apache/tika/parser/external2/ExternalParserTest.java +++ b/tika-core/src/test/java/org/apache/tika/parser/external2/ExternalParserTest.java @@ -22,11 +22,6 @@ import java.io.InputStream; import java.nio.charset.StandardCharsets; import java.util.List; - -import org.junit.jupiter.api.Test; -import org.xml.sax.ContentHandler; -import org.xml.sax.helpers.DefaultHandler; - import org.apache.tika.TikaTest; import org.apache.tika.config.TikaConfig; import org.apache.tika.io.TikaInputStream; @@ -36,14 +31,17 @@ import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.parser.RegexCaptureParser; +import org.junit.jupiter.api.Test; 
+import org.xml.sax.ContentHandler; +import org.xml.sax.helpers.DefaultHandler; public class ExternalParserTest extends TikaTest { @Test public void testConfigRegexCaptureParser() throws Exception { - assumeTrue(org.apache.tika.parser.external.ExternalParser.check(new String[]{ - "file", "--version" - })); + assumeTrue( + org.apache.tika.parser.external.ExternalParser.check( + new String[] {"file", "--version"})); try (InputStream is = TikaConfig.class.getResourceAsStream("TIKA-3557.xml")) { TikaConfig config = new TikaConfig(is); @@ -56,12 +54,13 @@ public void testConfigRegexCaptureParser() throws Exception { Metadata m = new Metadata(); ContentHandler contentHandler = new DefaultHandler(); - String output = "Something\n" + - "Title: the quick brown fox\n" + - "Author: jumped over\n" + - "Created: 10/20/2024"; + String output = + "Something\n" + + "Title: the quick brown fox\n" + + "Author: jumped over\n" + + "Created: 10/20/2024"; try (InputStream stream = - TikaInputStream.get(output.getBytes(StandardCharsets.UTF_8))) { + TikaInputStream.get(output.getBytes(StandardCharsets.UTF_8))) { outputParser.parse(stream, contentHandler, m, new ParseContext()); } assertEquals("the quick brown fox", m.get("title")); @@ -70,8 +69,11 @@ public void testConfigRegexCaptureParser() throws Exception { @Test public void testConfigBasic() throws Exception { - assumeTrue(org.apache.tika.parser.external.ExternalParser.check(new String[]{"file", "--version"})); - try (InputStream is = TikaConfig.class.getResourceAsStream("TIKA-3557-no-output-parser.xml")) { + assumeTrue( + org.apache.tika.parser.external.ExternalParser.check( + new String[] {"file", "--version"})); + try (InputStream is = + TikaConfig.class.getResourceAsStream("TIKA-3557-no-output-parser.xml")) { TikaConfig config = new TikaConfig(is); CompositeParser p = (CompositeParser) config.getParser(); assertEquals(1, p.getAllComponentParsers().size()); @@ -84,13 +86,14 @@ public void testConfigBasic() throws Exception { @Test 
public void testExifTool() throws Exception { - assumeTrue(org.apache.tika.parser.external.ExternalParser.check(new String[]{"exiftool", - "-ver"})); + assumeTrue( + org.apache.tika.parser.external.ExternalParser.check( + new String[] {"exiftool", "-ver"})); try (InputStream is = - TikaConfig.class.getResourceAsStream("TIKA-3557-exiftool-example.xml")) { + TikaConfig.class.getResourceAsStream("TIKA-3557-exiftool-example.xml")) { TikaConfig config = new TikaConfig(is); Parser p = new AutoDetectParser(config); - //this was the smallest pdf we had + // this was the smallest pdf we had List metadataList = getRecursiveMetadata("testOverlappingText.pdf", p); assertEquals(1, metadataList.size()); Metadata m = metadataList.get(0); diff --git a/tika-core/src/test/java/org/apache/tika/parser/mock/MockParser.java b/tika-core/src/test/java/org/apache/tika/parser/mock/MockParser.java index de464bca51..22011c054f 100644 --- a/tika-core/src/test/java/org/apache/tika/parser/mock/MockParser.java +++ b/tika-core/src/test/java/org/apache/tika/parser/mock/MockParser.java @@ -16,9 +16,13 @@ */ package org.apache.tika.parser.mock; - import static java.nio.charset.StandardCharsets.UTF_8; +import com.martensigwart.fakeload.FakeLoad; +import com.martensigwart.fakeload.FakeLoadBuilder; +import com.martensigwart.fakeload.FakeLoadExecutor; +import com.martensigwart.fakeload.FakeLoadExecutors; +import com.martensigwart.fakeload.MemoryUnit; import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; @@ -39,20 +43,7 @@ import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; import javax.xml.parsers.DocumentBuilder; - -import com.martensigwart.fakeload.FakeLoad; -import com.martensigwart.fakeload.FakeLoadBuilder; -import com.martensigwart.fakeload.FakeLoadExecutor; -import com.martensigwart.fakeload.FakeLoadExecutors; -import com.martensigwart.fakeload.MemoryUnit; import org.apache.commons.io.input.CloseShieldInputStream; -import 
org.w3c.dom.Document; -import org.w3c.dom.NamedNodeMap; -import org.w3c.dom.Node; -import org.w3c.dom.NodeList; -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.exception.TikaException; import org.apache.tika.extractor.EmbeddedDocumentExtractor; import org.apache.tika.extractor.EmbeddedDocumentUtil; @@ -64,22 +55,25 @@ import org.apache.tika.parser.Parser; import org.apache.tika.sax.EmbeddedContentHandler; import org.apache.tika.sax.XHTMLContentHandler; +import org.w3c.dom.Document; +import org.w3c.dom.NamedNodeMap; +import org.w3c.dom.Node; +import org.w3c.dom.NodeList; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; /** - * This class enables mocking of parser behavior for use in testing - * wrappers and drivers of parsers. - *

- * See resources/test-documents/mock/example.xml in tika-parsers/test for the documentation - * of all the options for this MockParser. - *

- * Tests for this class are in tika-parsers. - *

- * See also {@link org.apache.tika.parser.DummyParser} for another option. + * This class enables mocking of parser behavior for use in testing wrappers and drivers of parsers. + * + *

See resources/test-documents/mock/example.xml in tika-parsers/test for the documentation of + * all the options for this MockParser. + * + *

Tests for this class are in tika-parsers. + * + *

See also {@link org.apache.tika.parser.DummyParser} for another option. */ - public class MockParser implements Parser { - private static final long serialVersionUID = 1L; private static final PrintStream ORIG_STDERR; private static final PrintStream ORIG_STDOUT; @@ -113,8 +107,9 @@ public Set getSupportedTypes(ParseContext context) { } @Override - public void parse(InputStream stream, ContentHandler handler, Metadata metadata, - ParseContext context) throws IOException, SAXException, TikaException { + public void parse( + InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) + throws IOException, SAXException, TikaException { if (Thread.currentThread().isInterrupted()) { throw new TikaException("interrupted", new InterruptedException()); } @@ -123,7 +118,7 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, DocumentBuilder docBuilder = context.getDocumentBuilder(); doc = docBuilder.parse(new CloseShieldInputStream(stream)); } catch (SAXException e) { - //to distinguish between SAX on read vs SAX while writing + // to distinguish between SAX on read vs SAX while writing throw new IOException(e); } Node root = doc.getDocumentElement(); @@ -136,8 +131,8 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, xhtml.endDocument(); } - private void executeAction(Node action, Metadata metadata, ParseContext context, - XHTMLContentHandler xhtml) + private void executeAction( + Node action, Metadata metadata, ParseContext context, XHTMLContentHandler xhtml) throws SAXException, IOException, TikaException { if (action.getNodeType() != 1) { @@ -182,21 +177,23 @@ private void parentMetadata(Node action, ParseContext context) { } private void fakeload(Node action) { - //https://github.com/msigwart/fakeload - //with this version of fakeload, you should only need one thread to hit - //the cpu targets; on Linux with Java 8 at least, two or more threads did - //not increase the overall CPU 
over a single thread + // https://github.com/msigwart/fakeload + // with this version of fakeload, you should only need one thread to hit + // the cpu targets; on Linux with Java 8 at least, two or more threads did + // not increase the overall CPU over a single thread int numThreads = 1; NamedNodeMap attrs = action.getAttributes(); if (attrs == null) { - throw new IllegalArgumentException("Must specify details...no attributes for " + - "fakeload?!"); + throw new IllegalArgumentException( + "Must specify details...no attributes for " + "fakeload?!"); } - if (attrs.getNamedItem("millis") == null || attrs.getNamedItem("cpu") == null || - attrs.getNamedItem("mb") == null) { - throw new IllegalArgumentException("must specify 'millis' (time to process), " + - "'cpu' (% cpu as an integer, e.g. 50% would be '50'), " + - "and 'mb' (megabytes as an integer)"); + if (attrs.getNamedItem("millis") == null + || attrs.getNamedItem("cpu") == null + || attrs.getNamedItem("mb") == null) { + throw new IllegalArgumentException( + "must specify 'millis' (time to process), " + + "'cpu' (% cpu as an integer, e.g. 
50% would be '50'), " + + "and 'mb' (megabytes as an integer)"); } Node n = attrs.getNamedItem("numThreads"); if (n != null) { @@ -211,13 +208,18 @@ private void fakeload(Node action) { new ExecutorCompletionService<>(executorService); for (int i = 0; i < numThreads; i++) { - executorCompletionService.submit(() -> { - FakeLoad fakeload = - new FakeLoadBuilder().lasting(millis, TimeUnit.MILLISECONDS) - .withCpu(cpu).withMemory(mb, MemoryUnit.MB).build(); - FakeLoadExecutor executor = FakeLoadExecutors.newDefaultExecutor(); - executor.execute(fakeload); - }, 1); + executorCompletionService.submit( + () -> { + FakeLoad fakeload = + new FakeLoadBuilder() + .lasting(millis, TimeUnit.MILLISECONDS) + .withCpu(cpu) + .withMemory(mb, MemoryUnit.MB) + .build(); + FakeLoadExecutor executor = FakeLoadExecutors.newDefaultExecutor(); + executor.execute(fakeload); + }, + 1); int finished = 0; try { @@ -233,9 +235,7 @@ private void fakeload(Node action) { } finally { executorService.shutdownNow(); } - } - } private void throwIllegalChars() throws IOException { @@ -259,7 +259,8 @@ private void handleEmbedded(Node action, XHTMLContentHandler handler, ParseConte } String embeddedText = action.getTextContent(); - EmbeddedDocumentExtractor extractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context); + EmbeddedDocumentExtractor extractor = + EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context); Metadata m = new Metadata(); m.set(TikaCoreProperties.RESOURCE_NAME_KEY, fileName); @@ -363,7 +364,7 @@ private void throwIt(Node action) throws IOException, SAXException, TikaExceptio private void metadata(Node action, Metadata metadata) { NamedNodeMap attrs = action.getAttributes(); - //throws npe unless there is a name + // throws npe unless there is a name String name = attrs.getNamedItem("name").getNodeValue(); String value = action.getTextContent(); Node actionType = attrs.getNamedItem("action"); @@ -398,7 +399,6 @@ protected void write(Node action, XHTMLContentHandler 
xhtml) throws SAXException } } - private void throwIt(String className, String msg) throws IOException, SAXException, TikaException { Throwable t = null; @@ -428,7 +428,7 @@ private void throwIt(String className, String msg) } else if (t instanceof RuntimeException) { throw (RuntimeException) t; } else { - //wrap the throwable in a RuntimeException + // wrap the throwable in a RuntimeException throw new RuntimeException(t); } } @@ -443,11 +443,11 @@ private void kabOOM() { } private void hangHeavy(long maxMillis, long pulseCheckMillis, boolean interruptible) { - //do some heavy computation and occasionally check for - //whether time has exceeded maxMillis (see TIKA-1132 for inspiration) - //or whether the thread was interrupted. - //By creating a new Date in the inner loop, we're also intentionally - //triggering the gc most likely. + // do some heavy computation and occasionally check for + // whether time has exceeded maxMillis (see TIKA-1132 for inspiration) + // or whether the thread was interrupted. + // By creating a new Date in the inner loop, we're also intentionally + // triggering the gc most likely. 
long start = new Date().getTime(); long lastChecked = start; while (true) { @@ -489,5 +489,4 @@ private void sleep(long maxMillis, boolean isInterruptible) { } } } - } diff --git a/tika-core/src/test/java/org/apache/tika/parser/mock/MockParserFactory.java b/tika-core/src/test/java/org/apache/tika/parser/mock/MockParserFactory.java index c7716946f5..28ea72d2a6 100644 --- a/tika-core/src/test/java/org/apache/tika/parser/mock/MockParserFactory.java +++ b/tika-core/src/test/java/org/apache/tika/parser/mock/MockParserFactory.java @@ -18,12 +18,10 @@ import java.io.IOException; import java.util.Map; - import org.apache.tika.exception.TikaException; import org.apache.tika.parser.Parser; import org.apache.tika.parser.ParserFactory; - public class MockParserFactory extends ParserFactory { public MockParserFactory(Map args) { diff --git a/tika-core/src/test/java/org/apache/tika/parser/mock/MockParserTest.java b/tika-core/src/test/java/org/apache/tika/parser/mock/MockParserTest.java index 1902b08d8c..661b7da00c 100644 --- a/tika-core/src/test/java/org/apache/tika/parser/mock/MockParserTest.java +++ b/tika-core/src/test/java/org/apache/tika/parser/mock/MockParserTest.java @@ -17,25 +17,22 @@ package org.apache.tika.parser.mock; import java.util.List; - -import org.junit.jupiter.api.Test; - import org.apache.tika.TikaTest; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; +import org.junit.jupiter.api.Test; public class MockParserTest extends TikaTest { @Test public void testFakeload() throws Exception { - //just make sure there aren't any exceptions + // just make sure there aren't any exceptions getRecursiveMetadata("mock_fakeload.xml"); } @Test public void testTimes() throws Exception { List metadataList = getRecursiveMetadata("mock_times.xml"); - assertContainsCount("hello", - metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT), 30); + assertContainsCount("hello", metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT), 30); 
} } diff --git a/tika-core/src/test/java/org/apache/tika/parser/mock/VowelParser.java b/tika-core/src/test/java/org/apache/tika/parser/mock/VowelParser.java index 61a0473608..ca25b5367c 100644 --- a/tika-core/src/test/java/org/apache/tika/parser/mock/VowelParser.java +++ b/tika-core/src/test/java/org/apache/tika/parser/mock/VowelParser.java @@ -17,26 +17,20 @@ package org.apache.tika.parser.mock; - import java.util.regex.Matcher; import java.util.regex.Pattern; - +import org.apache.tika.config.Field; +import org.apache.tika.sax.XHTMLContentHandler; import org.w3c.dom.NamedNodeMap; import org.w3c.dom.Node; import org.xml.sax.SAXException; -import org.apache.tika.config.Field; -import org.apache.tika.sax.XHTMLContentHandler; - -/** - * only parses vowels as specified in "vowel" field. - */ +/** only parses vowels as specified in "vowel" field. */ public class VowelParser extends MockParser { private static final long serialVersionUID = 1L; - @Field - private String vowel = "aeiou"; + @Field private String vowel = "aeiou"; protected void write(Node action, XHTMLContentHandler xhtml) throws SAXException { NamedNodeMap attrs = action.getAttributes(); @@ -55,5 +49,4 @@ protected void write(Node action, XHTMLContentHandler xhtml) throws SAXException xhtml.characters(sb.toString()); xhtml.endElement(elementType); } - } diff --git a/tika-core/src/test/java/org/apache/tika/parser/multiple/MultipleParserTest.java b/tika-core/src/test/java/org/apache/tika/parser/multiple/MultipleParserTest.java index 9462d0cb2e..62ea4926c8 100644 --- a/tika-core/src/test/java/org/apache/tika/parser/multiple/MultipleParserTest.java +++ b/tika-core/src/test/java/org/apache/tika/parser/multiple/MultipleParserTest.java @@ -27,9 +27,6 @@ import java.util.HashSet; import java.util.Map; import java.util.Set; - -import org.junit.jupiter.api.Test; - import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.mime.MediaType; @@ -41,11 +38,12 @@ 
import org.apache.tika.parser.multiple.AbstractMultipleParser.MetadataPolicy; import org.apache.tika.sax.BodyContentHandler; import org.apache.tika.utils.ParserUtils; +import org.junit.jupiter.api.Test; public class MultipleParserTest { /** - * Tests how {@link AbstractMultipleParser} works out which - * mime types to offer, based on the types of the parsers + * Tests how {@link AbstractMultipleParser} works out which mime types to offer, based on the + * types of the parsers */ @Test public void testMimeTypeSupported() { @@ -57,9 +55,7 @@ public void testMimeTypeSupported() { // TODO One with a subtype } - /** - * Test {@link FallbackParser} - */ + /** Test {@link FallbackParser} */ @Test public void testFallback() throws Exception { ParseContext context = new ParseContext(); @@ -73,30 +69,27 @@ public void testFallback() throws Exception { // Some parsers ErrorParser pFail = new ErrorParser(); - DummyParser pContent = - new DummyParser(onlyOct, new HashMap<>(), "Fell back!"); + DummyParser pContent = new DummyParser(onlyOct, new HashMap<>(), "Fell back!"); EmptyParser pNothing = new EmptyParser(); - // With only one parser defined, works as normal p = new FallbackParser(null, MetadataPolicy.DISCARD_ALL, pContent); metadata = new Metadata(); handler = new BodyContentHandler(); - p.parse(new ByteArrayInputStream(new byte[]{0, 1, 2, 3, 4}), handler, metadata, context); + p.parse(new ByteArrayInputStream(new byte[] {0, 1, 2, 3, 4}), handler, metadata, context); assertEquals("Fell back!", handler.toString()); usedParsers = metadata.getValues(TikaCoreProperties.TIKA_PARSED_BY); assertEquals(1, usedParsers.length); assertEquals(DummyParser.class.getName(), usedParsers[0]); - // With a failing parser, will go to the working one p = new FallbackParser(null, MetadataPolicy.DISCARD_ALL, pFail, pContent); metadata = new Metadata(); handler = new BodyContentHandler(); - p.parse(new ByteArrayInputStream(new byte[]{0, 1, 2, 3, 4}), handler, metadata, context); + p.parse(new 
ByteArrayInputStream(new byte[] {0, 1, 2, 3, 4}), handler, metadata, context); assertEquals("Fell back!", handler.toString()); usedParsers = metadata.getValues(TikaCoreProperties.TIKA_PARSED_BY); @@ -109,13 +102,12 @@ public void testFallback() throws Exception { assertNotNull(metadata.get(ParserUtils.EMBEDDED_PARSER)); assertEquals(ErrorParser.class.getName(), metadata.get(ParserUtils.EMBEDDED_PARSER)); - // Won't go past a working parser to a second one, stops after one works p = new FallbackParser(null, MetadataPolicy.DISCARD_ALL, pFail, pContent, pNothing); metadata = new Metadata(); handler = new BodyContentHandler(); - p.parse(new ByteArrayInputStream(new byte[]{0, 1, 2, 3, 4}), handler, metadata, context); + p.parse(new ByteArrayInputStream(new byte[] {0, 1, 2, 3, 4}), handler, metadata, context); assertEquals("Fell back!", handler.toString()); usedParsers = metadata.getValues(TikaCoreProperties.TIKA_PARSED_BY); @@ -123,13 +115,10 @@ public void testFallback() throws Exception { assertEquals(ErrorParser.class.getName(), usedParsers[0]); assertEquals(DummyParser.class.getName(), usedParsers[1]); - // TODO Check merge policies - First vs Discard } - /** - * Test for {@link SupplementingParser} - */ + /** Test for {@link SupplementingParser} */ @Test public void testSupplemental() throws Exception { ParseContext context = new ParseContext(); @@ -155,22 +144,20 @@ public void testSupplemental() throws Exception { DummyParser pContent2 = new DummyParser(onlyOct, m2, "Fell back 2!"); EmptyParser pNothing = new EmptyParser(); - // Supplemental doesn't support DISCARD try { new SupplementingParser(null, MetadataPolicy.DISCARD_ALL); fail("Discard shouldn't be supported"); } catch (IllegalArgumentException e) { - //swallow + // swallow } - // With only one parser defined, works as normal p = new SupplementingParser(null, MetadataPolicy.FIRST_WINS, pContent1); metadata = new Metadata(); handler = new BodyContentHandler(); - p.parse(new ByteArrayInputStream(new 
byte[]{0, 1, 2, 3, 4}), handler, metadata, context); + p.parse(new ByteArrayInputStream(new byte[] {0, 1, 2, 3, 4}), handler, metadata, context); assertEquals("Fell back 1!", handler.toString()); assertEquals("Test1", metadata.get("T1")); @@ -180,15 +167,15 @@ public void testSupplemental() throws Exception { assertEquals(1, usedParsers.length); assertEquals(DummyParser.class.getName(), usedParsers[0]); - // Check the First, Last and All policies: // First Wins - p = new SupplementingParser(null, MetadataPolicy.FIRST_WINS, pFail, pContent1, pContent2, - pNothing); + p = + new SupplementingParser( + null, MetadataPolicy.FIRST_WINS, pFail, pContent1, pContent2, pNothing); metadata = new Metadata(); handler = new BodyContentHandler(); - p.parse(new ByteArrayInputStream(new byte[]{0, 1, 2, 3, 4}), handler, metadata, context); + p.parse(new ByteArrayInputStream(new byte[] {0, 1, 2, 3, 4}), handler, metadata, context); assertEquals("Fell back 1!Fell back 2!", handler.toString()); assertEquals("Test1", metadata.get("T1")); @@ -201,14 +188,14 @@ public void testSupplemental() throws Exception { assertEquals(DummyParser.class.getName(), usedParsers[1]); assertEquals(EmptyParser.class.getName(), usedParsers[2]); - // Last Wins - p = new SupplementingParser(null, MetadataPolicy.LAST_WINS, pFail, pContent1, pContent2, - pNothing); + p = + new SupplementingParser( + null, MetadataPolicy.LAST_WINS, pFail, pContent1, pContent2, pNothing); metadata = new Metadata(); handler = new BodyContentHandler(); - p.parse(new ByteArrayInputStream(new byte[]{0, 1, 2, 3, 4}), handler, metadata, context); + p.parse(new ByteArrayInputStream(new byte[] {0, 1, 2, 3, 4}), handler, metadata, context); assertEquals("Fell back 1!Fell back 2!", handler.toString()); assertEquals("Test1", metadata.get("T1")); @@ -221,14 +208,14 @@ public void testSupplemental() throws Exception { assertEquals(DummyParser.class.getName(), usedParsers[1]); assertEquals(EmptyParser.class.getName(), usedParsers[2]); - // 
Merge - p = new SupplementingParser(null, MetadataPolicy.KEEP_ALL, pFail, pContent1, pContent2, - pNothing); + p = + new SupplementingParser( + null, MetadataPolicy.KEEP_ALL, pFail, pContent1, pContent2, pNothing); metadata = new Metadata(); handler = new BodyContentHandler(); - p.parse(new ByteArrayInputStream(new byte[]{0, 1, 2, 3, 4}), handler, metadata, context); + p.parse(new ByteArrayInputStream(new byte[] {0, 1, 2, 3, 4}), handler, metadata, context); assertEquals("Fell back 1!Fell back 2!", handler.toString()); assertEquals("Test1", metadata.get("T1")); @@ -243,11 +230,9 @@ public void testSupplemental() throws Exception { assertEquals(DummyParser.class.getName(), usedParsers[1]); assertEquals(EmptyParser.class.getName(), usedParsers[2]); - // Check the error details always come through, no matter the policy // TODO - // Check that each parser gets its own ContentHandler if a factory was given // TODO } diff --git a/tika-core/src/test/java/org/apache/tika/pipes/PipesServerTest.java b/tika-core/src/test/java/org/apache/tika/pipes/PipesServerTest.java index 4aca5207e5..efa1ac4fc0 100644 --- a/tika-core/src/test/java/org/apache/tika/pipes/PipesServerTest.java +++ b/tika-core/src/test/java/org/apache/tika/pipes/PipesServerTest.java @@ -22,14 +22,10 @@ import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; - import org.apache.commons.io.FileUtils; import org.apache.commons.io.IOUtils; import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream; import org.apache.commons.io.output.UnsynchronizedByteArrayOutputStream; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.io.TempDir; - import org.apache.tika.TikaTest; import org.apache.tika.extractor.BasicEmbeddedDocumentBytesHandler; import org.apache.tika.metadata.Metadata; @@ -38,12 +34,14 @@ import org.apache.tika.pipes.fetcher.FetchKey; import org.apache.tika.pipes.fetcher.Fetcher; import org.apache.tika.pipes.fetcher.FetcherManager; +import 
org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; public class PipesServerTest extends TikaTest { /** - * This test is useful for stepping through the debugger on PipesServer - * without having to attach the debugger to the forked process. + * This test is useful for stepping through the debugger on PipesServer without having to attach + * the debugger to the forked process. * * @param tmp * @throws Exception @@ -51,29 +49,38 @@ public class PipesServerTest extends TikaTest { @Test public void testBasic(@TempDir Path tmp) throws Exception { Path tikaConfig = tmp.resolve("tika-config.xml"); - String xml = IOUtils.toString( - PipesServerTest.class.getResourceAsStream("TIKA-3941.xml"), StandardCharsets.UTF_8); + String xml = + IOUtils.toString( + PipesServerTest.class.getResourceAsStream("TIKA-3941.xml"), + StandardCharsets.UTF_8); xml = xml.replace("BASE_PATH", tmp.toAbsolutePath().toString()); Files.write(tikaConfig, xml.getBytes(StandardCharsets.UTF_8)); - Files.copy(PipesServerTest.class.getResourceAsStream("/test-documents/mock_times.xml"), + Files.copy( + PipesServerTest.class.getResourceAsStream("/test-documents/mock_times.xml"), tmp.resolve("mock.xml")); - PipesServer pipesServer = new PipesServer(tikaConfig, - new UnsynchronizedByteArrayInputStream(new byte[0]), - new PrintStream(UnsynchronizedByteArrayOutputStream.builder().get(), true, - StandardCharsets.UTF_8.name()), - -1, 30000, 30000); + PipesServer pipesServer = + new PipesServer( + tikaConfig, + new UnsynchronizedByteArrayInputStream(new byte[0]), + new PrintStream( + UnsynchronizedByteArrayOutputStream.builder().get(), + true, + StandardCharsets.UTF_8.name()), + -1, + 30000, + 30000); pipesServer.initializeResources(); - FetchEmitTuple fetchEmitTuple = new FetchEmitTuple("id", - new FetchKey("fs", "mock.xml"), - new EmitKey("", "")); + FetchEmitTuple fetchEmitTuple = + new FetchEmitTuple("id", new FetchKey("fs", "mock.xml"), new EmitKey("", "")); Fetcher fetcher = 
FetcherManager.load(tikaConfig).getFetcher(); - PipesServer.MetadataListAndEmbeddedBytes - parseData = pipesServer.parseFromTuple(fetchEmitTuple, fetcher); - assertEquals("5f3b924303e960ce35d7f705e91d3018dd110a9c3cef0546a91fe013d6dad6fd", + PipesServer.MetadataListAndEmbeddedBytes parseData = + pipesServer.parseFromTuple(fetchEmitTuple, fetcher); + assertEquals( + "5f3b924303e960ce35d7f705e91d3018dd110a9c3cef0546a91fe013d6dad6fd", parseData.metadataList.get(0).get("X-TIKA:digest:SHA-256")); } @@ -85,51 +92,65 @@ public void testEmbeddedStreamEmitter(@TempDir Path tmp) throws Exception { Files.createDirectories(tmp); Path tikaConfig = tmp.resolve("tika-config.xml"); - String xml = IOUtils.toString( - PipesServerTest.class.getResourceAsStream("TIKA-4207.xml"), - StandardCharsets.UTF_8); + String xml = + IOUtils.toString( + PipesServerTest.class.getResourceAsStream("TIKA-4207.xml"), + StandardCharsets.UTF_8); xml = xml.replace("BASE_PATH", tmp.toAbsolutePath().toString()); Files.write(tikaConfig, xml.getBytes(StandardCharsets.UTF_8)); - Files.copy(PipesServerTest.class.getResourceAsStream("/test-documents/basic_embedded.xml"), + Files.copy( + PipesServerTest.class.getResourceAsStream("/test-documents/basic_embedded.xml"), tmp.resolve("mock.xml")); - PipesServer pipesServer = new PipesServer(tikaConfig, - new UnsynchronizedByteArrayInputStream(new byte[0]), - new PrintStream(UnsynchronizedByteArrayOutputStream.builder().get(), true, - StandardCharsets.UTF_8.name()), - -1, 30000, 30000); + PipesServer pipesServer = + new PipesServer( + tikaConfig, + new UnsynchronizedByteArrayInputStream(new byte[0]), + new PrintStream( + UnsynchronizedByteArrayOutputStream.builder().get(), + true, + StandardCharsets.UTF_8.name()), + -1, + 30000, + 30000); pipesServer.initializeResources(); EmbeddedDocumentBytesConfig embeddedDocumentBytesConfig = new EmbeddedDocumentBytesConfig(true); embeddedDocumentBytesConfig.setIncludeOriginal(true); - FetchEmitTuple fetchEmitTuple = new 
FetchEmitTuple("id", - new FetchKey("fs", "mock.xml"), - new EmitKey("", ""), new Metadata(), - HandlerConfig.DEFAULT_HANDLER_CONFIG, FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT, - embeddedDocumentBytesConfig); + FetchEmitTuple fetchEmitTuple = + new FetchEmitTuple( + "id", + new FetchKey("fs", "mock.xml"), + new EmitKey("", ""), + new Metadata(), + HandlerConfig.DEFAULT_HANDLER_CONFIG, + FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT, + embeddedDocumentBytesConfig); Fetcher fetcher = FetcherManager.load(tikaConfig).getFetcher(); - PipesServer.MetadataListAndEmbeddedBytes - parseData = pipesServer.parseFromTuple(fetchEmitTuple, fetcher); + PipesServer.MetadataListAndEmbeddedBytes parseData = + pipesServer.parseFromTuple(fetchEmitTuple, fetcher); assertEquals(2, parseData.metadataList.size()); byte[] bytes0 = IOUtils.toByteArray( - ((BasicEmbeddedDocumentBytesHandler)parseData.getEmbeddedDocumentBytesHandler()) - .getDocument(0)); + ((BasicEmbeddedDocumentBytesHandler) + parseData.getEmbeddedDocumentBytesHandler()) + .getDocument(0)); byte[] bytes1 = IOUtils.toByteArray( - ((BasicEmbeddedDocumentBytesHandler)parseData.getEmbeddedDocumentBytesHandler()) + ((BasicEmbeddedDocumentBytesHandler) + parseData.getEmbeddedDocumentBytesHandler()) .getDocument(1)); - assertContains("is to trigger mock on the embedded", - new String(bytes0, StandardCharsets.UTF_8)); + assertContains( + "is to trigger mock on the embedded", new String(bytes0, StandardCharsets.UTF_8)); - assertContains("embeddedAuthor", - new String(bytes1, StandardCharsets.UTF_8)); - assertEquals("fdaa937c96d1ed010b8d307ccddf9d11c3b48db732a8771eaafe99d59e076d0a", + assertContains("embeddedAuthor", new String(bytes1, StandardCharsets.UTF_8)); + assertEquals( + "fdaa937c96d1ed010b8d307ccddf9d11c3b48db732a8771eaafe99d59e076d0a", parseData.metadataList.get(0).get("X-TIKA:digest:SHA-256")); } @@ -141,50 +162,65 @@ public void testEmbeddedStreamEmitterLimitBytes(@TempDir Path tmp) throws Except Files.createDirectories(tmp); Path 
tikaConfig = tmp.resolve("tika-config.xml"); - String xml = IOUtils.toString( - PipesServerTest.class.getResourceAsStream("TIKA-4207-limit-bytes.xml"), - StandardCharsets.UTF_8); + String xml = + IOUtils.toString( + PipesServerTest.class.getResourceAsStream("TIKA-4207-limit-bytes.xml"), + StandardCharsets.UTF_8); xml = xml.replace("BASE_PATH", tmp.toAbsolutePath().toString()); Files.write(tikaConfig, xml.getBytes(StandardCharsets.UTF_8)); - Files.copy(PipesServerTest.class.getResourceAsStream("/test-documents/basic_embedded.xml"), + Files.copy( + PipesServerTest.class.getResourceAsStream("/test-documents/basic_embedded.xml"), tmp.resolve("mock.xml")); - PipesServer pipesServer = new PipesServer(tikaConfig, - new UnsynchronizedByteArrayInputStream(new byte[0]), - new PrintStream(UnsynchronizedByteArrayOutputStream.builder().get(), true, - StandardCharsets.UTF_8.name()), - -1, 30000, 30000); + PipesServer pipesServer = + new PipesServer( + tikaConfig, + new UnsynchronizedByteArrayInputStream(new byte[0]), + new PrintStream( + UnsynchronizedByteArrayOutputStream.builder().get(), + true, + StandardCharsets.UTF_8.name()), + -1, + 30000, + 30000); pipesServer.initializeResources(); EmbeddedDocumentBytesConfig embeddedDocumentBytesConfig = new EmbeddedDocumentBytesConfig(true); embeddedDocumentBytesConfig.setIncludeOriginal(true); - FetchEmitTuple fetchEmitTuple = new FetchEmitTuple("id", - new FetchKey("fs", "mock.xml"), - new EmitKey("", ""), new Metadata(), - HandlerConfig.DEFAULT_HANDLER_CONFIG, FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT, - embeddedDocumentBytesConfig); + FetchEmitTuple fetchEmitTuple = + new FetchEmitTuple( + "id", + new FetchKey("fs", "mock.xml"), + new EmitKey("", ""), + new Metadata(), + HandlerConfig.DEFAULT_HANDLER_CONFIG, + FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT, + embeddedDocumentBytesConfig); Fetcher fetcher = FetcherManager.load(tikaConfig).getFetcher(); - PipesServer.MetadataListAndEmbeddedBytes - parseData = 
pipesServer.parseFromTuple(fetchEmitTuple, fetcher); + PipesServer.MetadataListAndEmbeddedBytes parseData = + pipesServer.parseFromTuple(fetchEmitTuple, fetcher); assertEquals(2, parseData.metadataList.size()); byte[] bytes0 = IOUtils.toByteArray( - ((BasicEmbeddedDocumentBytesHandler)parseData.getEmbeddedDocumentBytesHandler()) + ((BasicEmbeddedDocumentBytesHandler) + parseData.getEmbeddedDocumentBytesHandler()) .getDocument(0)); byte[] bytes1 = IOUtils.toByteArray( - ((BasicEmbeddedDocumentBytesHandler)parseData.getEmbeddedDocumentBytesHandler()) + ((BasicEmbeddedDocumentBytesHandler) + parseData.getEmbeddedDocumentBytesHandler()) .getDocument(1)); - assertContains("is to trigger mock on the embedded", - new String(bytes0, StandardCharsets.UTF_8)); + assertContains( + "is to trigger mock on the embedded", new String(bytes0, StandardCharsets.UTF_8)); assertEquals(10, bytes1.length); - assertEquals("fdaa937c96d1ed010b8d307ccddf9d11c3b48db732a8771eaafe99d59e076d0a", + assertEquals( + "fdaa937c96d1ed010b8d307ccddf9d11c3b48db732a8771eaafe99d59e076d0a", parseData.metadataList.get(0).get("X-TIKA:digest:SHA-256")); } } diff --git a/tika-core/src/test/java/org/apache/tika/pipes/async/AsyncChaosMonkeyTest.java b/tika-core/src/test/java/org/apache/tika/pipes/async/AsyncChaosMonkeyTest.java index 4522a2ea12..af67e78a66 100644 --- a/tika-core/src/test/java/org/apache/tika/pipes/async/AsyncChaosMonkeyTest.java +++ b/tika-core/src/test/java/org/apache/tika/pipes/async/AsyncChaosMonkeyTest.java @@ -26,10 +26,6 @@ import java.util.HashSet; import java.util.Random; import java.util.Set; - -import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.io.TempDir; - import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.pipes.FetchEmitTuple; @@ -39,40 +35,49 @@ import org.apache.tika.pipes.fetcher.FetchKey; import org.apache.tika.pipes.pipesiterator.PipesIterator; import org.apache.tika.utils.ProcessUtils; +import 
org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; public class AsyncChaosMonkeyTest { - private final String OOM = "" + "" + - "oom message\n"; - private final String OK = "" + "" + - "Nikolai Lobachevsky" + - "main_content" + - ""; - - private final String TIMEOUT = "" + "" + - "Nikolai Lobachevsky" + - "main_content" + - "" + ""; - - private final String SYSTEM_EXIT = "" + "" + - "Nikolai Lobachevsky" + - "main_content" + - "" + ""; + private final String OOM = + "" + + "" + + "oom message\n"; + private final String OK = + "" + + "" + + "Nikolai Lobachevsky" + + "main_content" + + ""; + + private final String TIMEOUT = + "" + + "" + + "Nikolai Lobachevsky" + + "main_content" + + "" + + ""; + + private final String SYSTEM_EXIT = + "" + + "" + + "Nikolai Lobachevsky" + + "main_content" + + "" + + ""; private final int totalFiles = 100; - @TempDir - private Path inputDir; + @TempDir private Path inputDir; - @TempDir - private Path configDir; + @TempDir private Path configDir; private int ok = 0; private int oom = 0; private int timeouts = 0; private int crash = 0; - public Path setUp(boolean emitIntermediateResults) throws SQLException, IOException { ok = 0; oom = 0; @@ -80,28 +85,36 @@ public Path setUp(boolean emitIntermediateResults) throws SQLException, IOExcept crash = 0; Path tikaConfigPath = Files.createTempFile(configDir, "tika-config-", ".xml"); String xml = - "" + "" + " " + - " \n" + - " mock\n" + " " + - " " + " " + - " " + - " mock\n" + " " + - ProcessUtils.escapeCommandLine(inputDir.toAbsolutePath().toString()) + - "\n" + " " + " " + - " \n" + - " \n" + - "" + - "" + - "" + emitIntermediateResults + - "" + - "" + - ProcessUtils.escapeCommandLine(tikaConfigPath.toAbsolutePath().toString()) + - "-Xmx512m1000000" + - "5000" + - "4" + - ""; + "" + + "" + + " " + + " \n" + + " mock\n" + + " " + + " " + + " " + + " " + + " mock\n" + + " " + + ProcessUtils.escapeCommandLine(inputDir.toAbsolutePath().toString()) + + "\n" + + " " + + " " + + 
" \n" + + " \n" + + "" + + "" + + "" + + emitIntermediateResults + + "" + + "" + + ProcessUtils.escapeCommandLine(tikaConfigPath.toAbsolutePath().toString()) + + "-Xmx512m1000000" + + "5000" + + "4" + + ""; Files.write(tikaConfigPath, xml.getBytes(StandardCharsets.UTF_8)); Random r = new Random(); for (int i = 0; i < totalFiles; i++) { @@ -110,7 +123,8 @@ public Path setUp(boolean emitIntermediateResults) throws SQLException, IOExcept Files.write(inputDir.resolve(i + ".xml"), OOM.getBytes(StandardCharsets.UTF_8)); oom++; } else if (f < 0.10) { - Files.write(inputDir.resolve(i + ".xml"), SYSTEM_EXIT.getBytes(StandardCharsets.UTF_8)); + Files.write( + inputDir.resolve(i + ".xml"), SYSTEM_EXIT.getBytes(StandardCharsets.UTF_8)); crash++; } else if (f < 0.13) { Files.write(inputDir.resolve(i + ".xml"), TIMEOUT.getBytes(StandardCharsets.UTF_8)); @@ -125,32 +139,35 @@ public Path setUp(boolean emitIntermediateResults) throws SQLException, IOExcept return tikaConfigPath; } -/* - private void writeLarge(Path resolve) throws IOException { - try (BufferedWriter writer = Files.newBufferedWriter(resolve, StandardCharsets.UTF_8)) { - writer.write(""); - writer.write(""); - for (int i = 0; i < 10000000; i++) { - writer.write("hello hello hello hello hello"); + /* + private void writeLarge(Path resolve) throws IOException { + try (BufferedWriter writer = Files.newBufferedWriter(resolve, StandardCharsets.UTF_8)) { + writer.write(""); + writer.write(""); + for (int i = 0; i < 10000000; i++) { + writer.write("hello hello hello hello hello"); + } + writer.write(""); } - writer.write(""); } - } -*/ + */ @Test public void testBasic() throws Exception { AsyncProcessor processor = new AsyncProcessor(setUp(false)); for (int i = 0; i < totalFiles; i++) { - FetchEmitTuple t = new FetchEmitTuple("myId-" + i, - new FetchKey("mock", i + ".xml"), - new EmitKey("mock", "emit-" + i), new Metadata()); + FetchEmitTuple t = + new FetchEmitTuple( + "myId-" + i, + new FetchKey("mock", i + ".xml"), + 
new EmitKey("mock", "emit-" + i), + new Metadata()); processor.offer(t, 1000); } for (int i = 0; i < 10; i++) { processor.offer(PipesIterator.COMPLETED_SEMAPHORE, 1000); } - //TODO clean this up + // TODO clean this up while (processor.checkActive()) { Thread.sleep(100); } @@ -162,7 +179,8 @@ public void testBasic() throws Exception { assertEquals(ok, emitKeys.size()); assertEquals(100, MockReporter.RESULTS.size()); for (PipesResult r : MockReporter.RESULTS) { - assertEquals("application/mock+xml", + assertEquals( + "application/mock+xml", r.getEmitData().getMetadataList().get(0).get(Metadata.CONTENT_TYPE)); } } @@ -171,14 +189,18 @@ public void testBasic() throws Exception { public void testEmitIntermediate() throws Exception { AsyncProcessor processor = new AsyncProcessor(setUp(true)); for (int i = 0; i < totalFiles; i++) { - FetchEmitTuple t = new FetchEmitTuple("myId-" + i, new FetchKey("mock", i + ".xml"), - new EmitKey("mock", "emit-" + i), new Metadata()); + FetchEmitTuple t = + new FetchEmitTuple( + "myId-" + i, + new FetchKey("mock", i + ".xml"), + new EmitKey("mock", "emit-" + i), + new Metadata()); processor.offer(t, 1000); } for (int i = 0; i < 10; i++) { processor.offer(PipesIterator.COMPLETED_SEMAPHORE, 1000); } - //TODO clean this up + // TODO clean this up while (processor.checkActive()) { Thread.sleep(100); } @@ -187,10 +209,10 @@ public void testEmitIntermediate() throws Exception { int observedOOM = 0; for (EmitData d : MockEmitter.EMIT_DATA) { emitKeys.add(d.getEmitKey().getEmitKey()); - assertEquals(64, - d.getMetadataList().get(0).get("X-TIKA:digest:SHA-256").trim().length()); - assertEquals("application/mock+xml", - d.getMetadataList().get(0).get(Metadata.CONTENT_TYPE)); + assertEquals( + 64, d.getMetadataList().get(0).get("X-TIKA:digest:SHA-256").trim().length()); + assertEquals( + "application/mock+xml", d.getMetadataList().get(0).get(Metadata.CONTENT_TYPE)); String val = d.getMetadataList().get(0).get(TikaCoreProperties.PIPES_RESULT); if 
("OOM".equals(val)) { observedOOM++; diff --git a/tika-core/src/test/java/org/apache/tika/pipes/async/MockEmitter.java b/tika-core/src/test/java/org/apache/tika/pipes/async/MockEmitter.java index 2374c14743..33f6a98bec 100644 --- a/tika-core/src/test/java/org/apache/tika/pipes/async/MockEmitter.java +++ b/tika-core/src/test/java/org/apache/tika/pipes/async/MockEmitter.java @@ -21,7 +21,6 @@ import java.util.Collections; import java.util.List; import java.util.concurrent.ArrayBlockingQueue; - import org.apache.tika.metadata.Metadata; import org.apache.tika.pipes.emitter.AbstractEmitter; import org.apache.tika.pipes.emitter.EmitData; @@ -32,8 +31,7 @@ public class MockEmitter extends AbstractEmitter { static ArrayBlockingQueue EMIT_DATA = new ArrayBlockingQueue<>(10000); - public MockEmitter() { - } + public MockEmitter() {} public static List getData() { return new ArrayList<>(EMIT_DATA); @@ -43,8 +41,8 @@ public static List getData() { public void emit(String emitKey, List metadataList) throws IOException, TikaEmitterException { emit( - Collections.singletonList(new EmitData(new EmitKey(getName(), emitKey), - metadataList))); + Collections.singletonList( + new EmitData(new EmitKey(getName(), emitKey), metadataList))); } @Override @@ -54,5 +52,4 @@ public void emit(List emitData) throws IOException, TikaEmit EMIT_DATA.offer(d); } } - } diff --git a/tika-core/src/test/java/org/apache/tika/pipes/async/MockFetcher.java b/tika-core/src/test/java/org/apache/tika/pipes/async/MockFetcher.java index 10af275e36..d234ac5225 100644 --- a/tika-core/src/test/java/org/apache/tika/pipes/async/MockFetcher.java +++ b/tika-core/src/test/java/org/apache/tika/pipes/async/MockFetcher.java @@ -20,16 +20,19 @@ import java.io.IOException; import java.io.InputStream; import java.nio.charset.StandardCharsets; - import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.pipes.fetcher.Fetcher; public class MockFetcher implements Fetcher { - 
private static final byte[] BYTES = ("" + "" + - "Nikolai Lobachevsky" + - "main_content" + "").getBytes(StandardCharsets.UTF_8); + private static final byte[] BYTES = + ("" + + "" + + "Nikolai Lobachevsky" + + "main_content" + + "") + .getBytes(StandardCharsets.UTF_8); @Override public String getName() { diff --git a/tika-core/src/test/java/org/apache/tika/pipes/async/MockReporter.java b/tika-core/src/test/java/org/apache/tika/pipes/async/MockReporter.java index 6e8308c895..0c05648040 100644 --- a/tika-core/src/test/java/org/apache/tika/pipes/async/MockReporter.java +++ b/tika-core/src/test/java/org/apache/tika/pipes/async/MockReporter.java @@ -17,7 +17,6 @@ package org.apache.tika.pipes.async; import java.util.concurrent.ArrayBlockingQueue; - import org.apache.tika.config.Field; import org.apache.tika.pipes.FetchEmitTuple; import org.apache.tika.pipes.PipesReporter; @@ -35,14 +34,10 @@ public void report(FetchEmitTuple t, PipesResult result, long elapsed) { } @Override - public void error(Throwable t) { - - } + public void error(Throwable t) {} @Override - public void error(String msg) { - - } + public void error(String msg) {} @Field public void setEndpoint(String endpoint) { diff --git a/tika-core/src/test/java/org/apache/tika/pipes/async/MockReporterTest.java b/tika-core/src/test/java/org/apache/tika/pipes/async/MockReporterTest.java index 9bfcd55918..cbd4c0a2c1 100644 --- a/tika-core/src/test/java/org/apache/tika/pipes/async/MockReporterTest.java +++ b/tika-core/src/test/java/org/apache/tika/pipes/async/MockReporterTest.java @@ -22,11 +22,9 @@ import java.nio.file.Path; import java.nio.file.Paths; import java.util.List; - -import org.junit.jupiter.api.Test; - import org.apache.tika.pipes.CompositePipesReporter; import org.apache.tika.pipes.PipesReporter; +import org.junit.jupiter.api.Test; public class MockReporterTest { @@ -36,7 +34,7 @@ public void testBasic() throws Exception { AsyncConfig asyncConfig = AsyncConfig.load(configPath); PipesReporter reporter 
= asyncConfig.getPipesReporter(); assertTrue(reporter instanceof MockReporter); - assertEquals("somethingOrOther", ((MockReporter)reporter).getEndpoint()); + assertEquals("somethingOrOther", ((MockReporter) reporter).getEndpoint()); } @Test @@ -45,8 +43,8 @@ public void testCompositePipesReporter() throws Exception { AsyncConfig asyncConfig = AsyncConfig.load(configPath); PipesReporter reporter = asyncConfig.getPipesReporter(); assertTrue(reporter instanceof CompositePipesReporter); - List reporters = ((CompositePipesReporter)reporter).getPipesReporters(); - assertEquals("somethingOrOther1", ((MockReporter)reporters.get(0)).getEndpoint()); - assertEquals("somethingOrOther2", ((MockReporter)reporters.get(1)).getEndpoint()); + List reporters = ((CompositePipesReporter) reporter).getPipesReporters(); + assertEquals("somethingOrOther1", ((MockReporter) reporters.get(0)).getEndpoint()); + assertEquals("somethingOrOther2", ((MockReporter) reporters.get(1)).getEndpoint()); } } diff --git a/tika-core/src/test/java/org/apache/tika/pipes/emitter/MockEmitter.java b/tika-core/src/test/java/org/apache/tika/pipes/emitter/MockEmitter.java index 036a95965b..f37f0eefdf 100644 --- a/tika-core/src/test/java/org/apache/tika/pipes/emitter/MockEmitter.java +++ b/tika-core/src/test/java/org/apache/tika/pipes/emitter/MockEmitter.java @@ -19,7 +19,6 @@ import java.io.IOException; import java.util.List; import java.util.Map; - import org.apache.tika.config.Field; import org.apache.tika.config.Initializable; import org.apache.tika.config.InitializableProblemHandler; @@ -29,13 +28,10 @@ public class MockEmitter extends AbstractEmitter implements Initializable { - @Field - private boolean throwOnCheck = false; + @Field private boolean throwOnCheck = false; @Override - public void initialize(Map params) throws TikaConfigException { - - } + public void initialize(Map params) throws TikaConfigException {} public void setThrowOnCheck(boolean throwOnCheck) { this.throwOnCheck = throwOnCheck; @@ 
-48,12 +44,9 @@ public void checkInitialization(InitializableProblemHandler problemHandler) if (throwOnCheck) { throw new TikaConfigException("throw on check"); } - } @Override public void emit(String emitKey, List metadataList) - throws IOException, TikaEmitterException { - - } + throws IOException, TikaEmitterException {} } diff --git a/tika-core/src/test/java/org/apache/tika/pipes/fetcher/MockFetcher.java b/tika-core/src/test/java/org/apache/tika/pipes/fetcher/MockFetcher.java index 060432724e..78621dd6d5 100644 --- a/tika-core/src/test/java/org/apache/tika/pipes/fetcher/MockFetcher.java +++ b/tika-core/src/test/java/org/apache/tika/pipes/fetcher/MockFetcher.java @@ -21,7 +21,6 @@ import java.io.InputStream; import java.nio.charset.StandardCharsets; import java.util.Map; - import org.apache.tika.config.Field; import org.apache.tika.config.Initializable; import org.apache.tika.config.InitializableProblemHandler; @@ -34,12 +33,9 @@ public class MockFetcher extends AbstractFetcher implements Initializable { private Map params; - @Field - private String byteString = null; - - @Field - private boolean throwOnCheck = false; + @Field private String byteString = null; + @Field private boolean throwOnCheck = false; public void setThrowOnCheck(boolean throwOnCheck) { this.throwOnCheck = throwOnCheck; @@ -62,10 +58,10 @@ public void checkInitialization(InitializableProblemHandler problemHandler) } } - @Override public InputStream fetch(String fetchKey, Metadata metadata) throws TikaException, IOException { - return byteString == null ? new ByteArrayInputStream(new byte[0]) : - new ByteArrayInputStream(byteString.getBytes(StandardCharsets.UTF_8)); + return byteString == null + ? 
new ByteArrayInputStream(new byte[0]) + : new ByteArrayInputStream(byteString.getBytes(StandardCharsets.UTF_8)); } } diff --git a/tika-core/src/test/java/org/apache/tika/pipes/fetcher/fs/FileSystemFetcherTest.java b/tika-core/src/test/java/org/apache/tika/pipes/fetcher/fs/FileSystemFetcherTest.java index 7e29ac20ad..f74fc549f1 100644 --- a/tika-core/src/test/java/org/apache/tika/pipes/fetcher/fs/FileSystemFetcherTest.java +++ b/tika-core/src/test/java/org/apache/tika/pipes/fetcher/fs/FileSystemFetcherTest.java @@ -23,11 +23,8 @@ import java.nio.file.InvalidPathException; import java.nio.file.Path; import java.nio.file.Paths; - -import org.junit.jupiter.api.Test; - import org.apache.tika.config.InitializableProblemHandler; - +import org.junit.jupiter.api.Test; public class FileSystemFetcherTest { @@ -48,10 +45,12 @@ public void testDescendant() throws Exception { @Test public void testNullByte() throws Exception { FileSystemFetcher f = new FileSystemFetcher(); - assertThrows(InvalidPathException.class, () -> { - f.setBasePath("bad\u0000path"); - f.setName("fs"); - f.checkInitialization(InitializableProblemHandler.IGNORE); - }); + assertThrows( + InvalidPathException.class, + () -> { + f.setBasePath("bad\u0000path"); + f.setName("fs"); + f.checkInitialization(InitializableProblemHandler.IGNORE); + }); } } diff --git a/tika-core/src/test/java/org/apache/tika/pipes/pipesiterator/FileSystemPipesIteratorTest.java b/tika-core/src/test/java/org/apache/tika/pipes/pipesiterator/FileSystemPipesIteratorTest.java index 7b37ec5517..303f508f3f 100644 --- a/tika-core/src/test/java/org/apache/tika/pipes/pipesiterator/FileSystemPipesIteratorTest.java +++ b/tika-core/src/test/java/org/apache/tika/pipes/pipesiterator/FileSystemPipesIteratorTest.java @@ -28,13 +28,10 @@ import java.util.Set; import java.util.stream.Collectors; import java.util.stream.Stream; - -import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.Timeout; - import org.apache.tika.pipes.FetchEmitTuple; 
import org.apache.tika.pipes.pipesiterator.fs.FileSystemPipesIterator; - +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.Timeout; public class FileSystemPipesIteratorTest { @@ -45,14 +42,12 @@ public static List listFiles(Path path) throws IOException { result = walk.filter(Files::isRegularFile).collect(Collectors.toList()); } return result; - } @Test @Timeout(30000) public void testBasic() throws Exception { - URL url = - FileSystemPipesIteratorTest.class.getResource("/test-documents"); + URL url = FileSystemPipesIteratorTest.class.getResource("/test-documents"); Path root = Paths.get(url.toURI()); List files = listFiles(root); Set truthSet = new HashSet<>(); diff --git a/tika-core/src/test/java/org/apache/tika/pipes/pipesiterator/filelist/FileListPipesIteratorTest.java b/tika-core/src/test/java/org/apache/tika/pipes/pipesiterator/filelist/FileListPipesIteratorTest.java index fec827d0e2..3c24b9dc3c 100644 --- a/tika-core/src/test/java/org/apache/tika/pipes/pipesiterator/filelist/FileListPipesIteratorTest.java +++ b/tika-core/src/test/java/org/apache/tika/pipes/pipesiterator/filelist/FileListPipesIteratorTest.java @@ -23,11 +23,9 @@ import java.nio.file.Paths; import java.util.ArrayList; import java.util.List; - -import org.junit.jupiter.api.Test; - import org.apache.tika.config.InitializableProblemHandler; import org.apache.tika.pipes.FetchEmitTuple; +import org.junit.jupiter.api.Test; public class FileListPipesIteratorTest { diff --git a/tika-core/src/test/java/org/apache/tika/sax/BasicContentHandlerFactoryTest.java b/tika-core/src/test/java/org/apache/tika/sax/BasicContentHandlerFactoryTest.java index 111d2ea3cf..86ffcc5a10 100644 --- a/tika-core/src/test/java/org/apache/tika/sax/BasicContentHandlerFactoryTest.java +++ b/tika-core/src/test/java/org/apache/tika/sax/BasicContentHandlerFactoryTest.java @@ -26,30 +26,26 @@ import java.io.InputStream; import java.io.UnsupportedEncodingException; import java.util.Set; - -import 
org.junit.jupiter.api.Test; -import org.xml.sax.Attributes; -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; -import org.xml.sax.helpers.AttributesImpl; -import org.xml.sax.helpers.DefaultHandler; - import org.apache.tika.exception.TikaException; import org.apache.tika.exception.WriteLimitReachedException; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; +import org.junit.jupiter.api.Test; +import org.xml.sax.Attributes; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.AttributesImpl; +import org.xml.sax.helpers.DefaultHandler; -/** - * Test cases for the {@link org.apache.tika.sax.BodyContentHandler} class. - */ +/** Test cases for the {@link org.apache.tika.sax.BodyContentHandler} class. */ public class BasicContentHandlerFactoryTest { - //default max char len (at least in WriteOutContentHandler is 100k) + // default max char len (at least in WriteOutContentHandler is 100k) private static final int OVER_DEFAULT = 120000; - //copied from TikaTest in tika-parsers package + // copied from TikaTest in tika-parsers package public static void assertNotContains(String needle, String haystack) { assertFalse(haystack.contains(needle), needle + " found in:\n" + haystack); } @@ -76,13 +72,14 @@ public void testIgnore() throws Exception { .getNewContentHandler(); assertTrue(handler instanceof DefaultHandler); p.parse(null, handler, null, null); - //unfortunatley, the DefaultHandler does not return "", + // unfortunatley, the DefaultHandler does not return "", assertContains("org.xml.sax.helpers.DefaultHandler", handler.toString()); - //tests that no write limit exception is thrown + // tests that no write limit exception is thrown p = new MockParser(100); - handler = new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, 5) - .getNewContentHandler(); + handler = + new 
BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, 5) + .getNewContentHandler(); assertTrue(handler instanceof DefaultHandler); p.parse(null, handler, null, null); assertContains("org.xml.sax.helpers.DefaultHandler", handler.toString()); @@ -102,7 +99,7 @@ public void testText() throws Exception { assertNotContains(" 110000); - //now test write limit + // now test write limit p = new MockParser(10); handler = new BasicContentHandlerFactory(type, 5).getNewContentHandler(); assertTrue(handler instanceof WriteOutContentHandler); @@ -111,7 +108,7 @@ public void testText() throws Exception { assertContains("This ", extracted); assertNotContains("aaaa", extracted); - //now test outputstream call + // now test outputstream call p = new MockParser(OVER_DEFAULT); ByteArrayOutputStream os = new ByteArrayOutputStream(); handler = new BasicContentHandlerFactory(type, -1).getNewContentHandler(os, UTF_8); @@ -128,8 +125,8 @@ public void testText() throws Exception { handler = new BasicContentHandlerFactory(type, 5).getNewContentHandler(os, UTF_8); assertTrue(handler instanceof WriteOutContentHandler); assertWriteLimitReached(p, (WriteOutContentHandler) handler); - //When writing to an OutputStream and a write limit is reached, - //currently, nothing is written. + // When writing to an OutputStream and a write limit is reached, + // currently, nothing is written. 
assertEquals(0, os.toByteArray().length); } @@ -146,7 +143,7 @@ public void testHTML() throws Exception { assertContains("aaaaaaaaaa", extracted); assertTrue(extracted.length() > 110000); - //now test write limit + // now test write limit p = new MockParser(10); handler = new BasicContentHandlerFactory(type, 5).getNewContentHandler(); assertTrue(handler instanceof WriteOutContentHandler); @@ -155,7 +152,7 @@ public void testHTML() throws Exception { assertContains("This ", extracted); assertNotContains("aaaa", extracted); - //now test outputstream call + // now test outputstream call p = new MockParser(OVER_DEFAULT); ByteArrayOutputStream os = new ByteArrayOutputStream(); handler = new BasicContentHandlerFactory(type, -1).getNewContentHandler(os, UTF_8); @@ -167,7 +164,6 @@ public void testHTML() throws Exception { assertContains(" 110000); - p = new MockParser(10); os = new ByteArrayOutputStream(); handler = new BasicContentHandlerFactory(type, 5).getNewContentHandler(os, UTF_8); @@ -189,7 +185,7 @@ public void testXML() throws Exception { assertContains("aaaaaaaaaa", extracted); assertTrue(handler.toString().length() > 110000); - //now test write limit + // now test write limit p = new MockParser(10); handler = new BasicContentHandlerFactory(type, 5).getNewContentHandler(); assertTrue(handler instanceof WriteOutContentHandler); @@ -198,7 +194,7 @@ public void testXML() throws Exception { assertContains("This ", extracted); assertNotContains("aaaa", extracted); - //now test outputstream call + // now test outputstream call p = new MockParser(OVER_DEFAULT); ByteArrayOutputStream os = new ByteArrayOutputStream(); handler = new BasicContentHandlerFactory(type, -1).getNewContentHandler(os, UTF_8); @@ -211,7 +207,6 @@ public void testXML() throws Exception { assertContains(" 110000); - p = new MockParser(10); os = new ByteArrayOutputStream(); handler = new BasicContentHandlerFactory(type, 5).getNewContentHandler(os, UTF_8); @@ -234,7 +229,7 @@ public void testBody() 
throws Exception { assertContains("aaaaaaaaaa", extracted); assertTrue(extracted.length() > 110000); - //now test write limit + // now test write limit p = new MockParser(10); handler = new BasicContentHandlerFactory(type, 5).getNewContentHandler(); assertTrue(handler instanceof BodyContentHandler); @@ -243,7 +238,7 @@ public void testBody() throws Exception { assertNotContains("This ", extracted); assertContains("aaaa", extracted); - //now test outputstream call + // now test outputstream call p = new MockParser(OVER_DEFAULT); ByteArrayOutputStream os = new ByteArrayOutputStream(); handler = new BasicContentHandlerFactory(type, -1).getNewContentHandler(os, UTF_8); @@ -277,13 +272,13 @@ private void assertWriteLimitReached(Parser p, WriteOutContentHandler handler) assertTrue(wlr, "WriteLimitReached"); } - //TODO: is there a better way than to repeat this with diff signature? + // TODO: is there a better way than to repeat this with diff signature? private void assertWriteLimitReached(Parser p, BodyContentHandler handler) throws Exception { boolean wlr = false; try { p.parse(null, handler, null, null); } catch (SAXException e) { - if (! 
WriteLimitReachedException.isWriteLimitReached(e)) { + if (!WriteLimitReachedException.isWriteLimitReached(e)) { throw e; } @@ -292,8 +287,8 @@ private void assertWriteLimitReached(Parser p, BodyContentHandler handler) throw assertTrue(wlr, "WriteLimitReached"); } - //Simple mockparser that writes a title - //and charsToWrite number of 'a' + // Simple mockparser that writes a title + // and charsToWrite number of 'a' private static class MockParser implements Parser { private final String XHTML = "http://www.w3.org/1999/xhtml"; private final Attributes EMPTY_ATTRIBUTES = new AttributesImpl(); @@ -311,8 +306,9 @@ public Set getSupportedTypes(ParseContext context) { } @Override - public void parse(InputStream stream, ContentHandler handler, Metadata metadata, - ParseContext context) throws IOException, SAXException, TikaException { + public void parse( + InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) + throws IOException, SAXException, TikaException { handler.startDocument(); handler.startPrefixMapping("", XHTML); handler.startElement(XHTML, "html", "html", EMPTY_ATTRIBUTES); diff --git a/tika-core/src/test/java/org/apache/tika/sax/BodyContentHandlerTest.java b/tika-core/src/test/java/org/apache/tika/sax/BodyContentHandlerTest.java index 19bf853783..1937ea1023 100644 --- a/tika-core/src/test/java/org/apache/tika/sax/BodyContentHandlerTest.java +++ b/tika-core/src/test/java/org/apache/tika/sax/BodyContentHandlerTest.java @@ -23,24 +23,20 @@ import java.io.InputStream; import java.io.OutputStream; import java.io.OutputStreamWriter; - -import org.junit.jupiter.api.Test; - import org.apache.tika.TikaTest; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.parser.mock.MockParser; +import org.junit.jupiter.api.Test; -/** - * Test cases for the {@link BodyContentHandler} class. 
- */ +/** Test cases for the {@link BodyContentHandler} class. */ public class BodyContentHandlerTest extends TikaTest { /** - * Test that the conversion to an {@link OutputStream} doesn't leave - * characters unflushed in an internal buffer. + * Test that the conversion to an {@link OutputStream} doesn't leave characters unflushed in an + * internal buffer. * * @see TIKA-179 */ @@ -49,8 +45,8 @@ public void testOutputStream() throws Exception { ByteArrayOutputStream buffer = new ByteArrayOutputStream(); XHTMLContentHandler xhtml = - new XHTMLContentHandler(new BodyContentHandler( - new OutputStreamWriter(buffer, UTF_8)), + new XHTMLContentHandler( + new BodyContentHandler(new OutputStreamWriter(buffer, UTF_8)), new Metadata()); xhtml.startDocument(); xhtml.element("p", "Test text"); @@ -61,7 +57,7 @@ public void testOutputStream() throws Exception { @Test public void testLimit() throws Exception { - //TIKA-2668 - java 11-ea + // TIKA-2668 - java 11-ea Parser p = new MockParser(); WriteOutContentHandler handler = new WriteOutContentHandler(15); Metadata metadata = new Metadata(); diff --git a/tika-core/src/test/java/org/apache/tika/sax/CustomErrorHandlerTest.java b/tika-core/src/test/java/org/apache/tika/sax/CustomErrorHandlerTest.java index 88643147f5..5c252a6751 100644 --- a/tika-core/src/test/java/org/apache/tika/sax/CustomErrorHandlerTest.java +++ b/tika-core/src/test/java/org/apache/tika/sax/CustomErrorHandlerTest.java @@ -23,19 +23,17 @@ import java.io.OutputStream; import java.nio.charset.StandardCharsets; import javax.xml.parsers.ParserConfigurationException; - import org.apache.commons.io.output.ByteArrayOutputStream; +import org.apache.tika.TikaTest; +import org.apache.tika.exception.TikaException; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.utils.XMLReaderUtils; import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; import 
org.xml.sax.SAXException; -import org.apache.tika.TikaTest; -import org.apache.tika.exception.TikaException; -import org.apache.tika.parser.ParseContext; -import org.apache.tika.utils.XMLReaderUtils; - /** * Test that validates a custom {@link ContentHandlerDecorator} can handle errors during XML parsing * @@ -46,12 +44,13 @@ public class CustomErrorHandlerTest extends TikaTest { private static String DEFAULT_SAX_PARSER_FACTORY; private static String SAX_PARSER_FACTORY_KEY = "javax.xml.parsers.SAXParserFactory"; + @BeforeAll public static void setUp() throws TikaException { DEFAULT_SAX_PARSER_FACTORY = System.getProperty(SAX_PARSER_FACTORY_KEY); - System.setProperty(SAX_PARSER_FACTORY_KEY, - "org.apache.tika.sax.ErrorResistantSAXParserFactory"); - //forces re-initialization + System.setProperty( + SAX_PARSER_FACTORY_KEY, "org.apache.tika.sax.ErrorResistantSAXParserFactory"); + // forces re-initialization XMLReaderUtils.setPoolSize(10); } @@ -62,9 +61,10 @@ public static void tearDown() throws TikaException { } else { System.setProperty(SAX_PARSER_FACTORY_KEY, DEFAULT_SAX_PARSER_FACTORY); } - //forces re-initialization + // forces re-initialization XMLReaderUtils.setPoolSize(10); } + private void extractXml(InputStream blobStream, OutputStream textStream) throws IOException, SAXException, TikaException, ParserConfigurationException { @@ -92,7 +92,8 @@ void testUndeclaredEntityXML() throws Exception { try { String content = extractTestData("undeclared_entity.xml"); assertContains("START", content); - //This assertion passes only if custom error handler is called to handle fatal exception + // This assertion passes only if custom error handler is called to handle fatal + // exception assertContains("END", content); } catch (SAXException e) { fail("Exception returned from parser and not handled in error handler " + e); diff --git a/tika-core/src/test/java/org/apache/tika/sax/LinkContentHandlerTest.java 
b/tika-core/src/test/java/org/apache/tika/sax/LinkContentHandlerTest.java index 3ad297811b..a8388f935e 100644 --- a/tika-core/src/test/java/org/apache/tika/sax/LinkContentHandlerTest.java +++ b/tika-core/src/test/java/org/apache/tika/sax/LinkContentHandlerTest.java @@ -21,9 +21,7 @@ import org.junit.jupiter.api.Test; import org.xml.sax.helpers.AttributesImpl; -/** - * Test cases for the {@link LinkContentHandler} class. - */ +/** Test cases for the {@link LinkContentHandler} class. */ public class LinkContentHandlerTest { /** @@ -34,9 +32,9 @@ public void testWhitespaceCollapsing() throws Exception { LinkContentHandler linkContentHandler = new LinkContentHandler(true); linkContentHandler.startElement(XHTMLContentHandler.XHTML, "a", "", new AttributesImpl()); - char[] anchorText = - {'\n', 'N', 'o', ' ', 'w', 'h', 'i', 't', 'e', '\n', '\t', '\t', 's', 'p', 'a', 'c', - 'e'}; + char[] anchorText = { + '\n', 'N', 'o', ' ', 'w', 'h', 'i', 't', 'e', '\n', '\t', '\t', 's', 'p', 'a', 'c', 'e' + }; linkContentHandler.characters(anchorText, 1, anchorText.length - 1); linkContentHandler.endElement(XHTMLContentHandler.XHTML, "a", ""); @@ -72,7 +70,8 @@ public void testLinkTag() throws Exception { linkContentHandler.startElement(XHTMLContentHandler.XHTML, "link", "", atts); linkContentHandler.endElement(XHTMLContentHandler.XHTML, "link", ""); - assertEquals("http://tika.apache.org/stylesheet.css", + assertEquals( + "http://tika.apache.org/stylesheet.css", linkContentHandler.getLinks().get(0).getUri()); assertEquals("stylesheet", linkContentHandler.getLinks().get(0).getRel()); } @@ -90,7 +89,8 @@ public void testIframeTag() throws Exception { linkContentHandler.startElement(XHTMLContentHandler.XHTML, "iframe", "", atts); linkContentHandler.endElement(XHTMLContentHandler.XHTML, "iframe", ""); - assertEquals("http://tika.apache.org/iframe.html", + assertEquals( + "http://tika.apache.org/iframe.html", linkContentHandler.getLinks().get(0).getUri()); } @@ -107,8 +107,8 @@ public void 
testScriptTag() throws Exception { linkContentHandler.startElement(XHTMLContentHandler.XHTML, "script", "", atts); linkContentHandler.endElement(XHTMLContentHandler.XHTML, "script", ""); - assertEquals("http://tika.apache.org/script.js", - linkContentHandler.getLinks().get(0).getUri()); + assertEquals( + "http://tika.apache.org/script.js", linkContentHandler.getLinks().get(0).getUri()); } /** diff --git a/tika-core/src/test/java/org/apache/tika/sax/NonValidatingContentHandler.java b/tika-core/src/test/java/org/apache/tika/sax/NonValidatingContentHandler.java index d903a4d632..acd3399eb5 100644 --- a/tika-core/src/test/java/org/apache/tika/sax/NonValidatingContentHandler.java +++ b/tika-core/src/test/java/org/apache/tika/sax/NonValidatingContentHandler.java @@ -18,13 +18,11 @@ import java.io.IOException; import java.io.InputStream; - import org.xml.sax.ContentHandler; import org.xml.sax.InputSource; import org.xml.sax.SAXException; import org.xml.sax.SAXParseException; - public class NonValidatingContentHandler extends ContentHandlerDecorator { class ClosedInputStream extends InputStream { @@ -37,7 +35,6 @@ class ClosedInputStream extends InputStream { public int read() { return -1; } - } public NonValidatingContentHandler(ContentHandler handler) { @@ -46,17 +43,17 @@ public NonValidatingContentHandler(ContentHandler handler) { @Override public void warning(SAXParseException e) throws SAXException { - //NO-OP + // NO-OP } @Override public void error(SAXParseException e) throws SAXException { - //NO-OP + // NO-OP } @Override public void fatalError(SAXParseException e) throws SAXException { - //NO-OP + // NO-OP } @Override @@ -64,5 +61,4 @@ public InputSource resolveEntity(String publicId, String systemId) throws SAXException, IOException { return new InputSource(new ClosedInputStream()); } - } diff --git a/tika-core/src/test/java/org/apache/tika/sax/OfflineContentHandlerTest.java b/tika-core/src/test/java/org/apache/tika/sax/OfflineContentHandlerTest.java index 
6c7e94513e..dca1220ac3 100644 --- a/tika-core/src/test/java/org/apache/tika/sax/OfflineContentHandlerTest.java +++ b/tika-core/src/test/java/org/apache/tika/sax/OfflineContentHandlerTest.java @@ -22,15 +22,12 @@ import java.net.ConnectException; import javax.xml.parsers.SAXParser; import javax.xml.parsers.SAXParserFactory; - import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.xml.sax.InputSource; import org.xml.sax.helpers.DefaultHandler; -/** - * Unit tests for the {@link OfflineContentHandler} class. - */ +/** Unit tests for the {@link OfflineContentHandler} class. */ public class OfflineContentHandlerTest { private SAXParser parser; @@ -56,14 +53,13 @@ public void testExternalDTD() throws Exception { @Test public void testExternalEntity() throws Exception { String xml = - "" + - " ]>&bar;"; + "" + + " ]>&bar;"; try { parser.parse(new InputSource(new StringReader(xml)), offline); } catch (ConnectException e) { fail("Parser tried to access the external DTD:" + e); } } - - } diff --git a/tika-core/src/test/java/org/apache/tika/sax/RichTextContentHandlerTest.java b/tika-core/src/test/java/org/apache/tika/sax/RichTextContentHandlerTest.java index 47918a9b56..d85aab9a70 100644 --- a/tika-core/src/test/java/org/apache/tika/sax/RichTextContentHandlerTest.java +++ b/tika-core/src/test/java/org/apache/tika/sax/RichTextContentHandlerTest.java @@ -21,26 +21,22 @@ import java.io.ByteArrayOutputStream; import java.io.OutputStreamWriter; - +import org.apache.tika.metadata.Metadata; import org.junit.jupiter.api.Test; import org.xml.sax.helpers.AttributesImpl; -import org.apache.tika.metadata.Metadata; - -/** - * Test cases for the {@link RichTextContentHandler} class. - */ +/** Test cases for the {@link RichTextContentHandler} class. */ public class RichTextContentHandlerTest { - /** - * Test to check img tags are detected and rich text version used. - */ + /** Test to check img tags are detected and rich text version used. 
*/ @Test public void aTagTest() throws Exception { ByteArrayOutputStream buffer = new ByteArrayOutputStream(); - XHTMLContentHandler xhtml = new XHTMLContentHandler(new RichTextContentHandler( - new OutputStreamWriter(buffer, UTF_8)), new Metadata()); + XHTMLContentHandler xhtml = + new XHTMLContentHandler( + new RichTextContentHandler(new OutputStreamWriter(buffer, UTF_8)), + new Metadata()); xhtml.startDocument(); AttributesImpl attributes = new AttributesImpl(); attributes.addAttribute("", "", "name", "", "value"); @@ -50,15 +46,15 @@ public void aTagTest() throws Exception { assertEquals("\n\n\n\n[bookmark: value]", buffer.toString(UTF_8.name())); } - /** - * Test to check a tags are detected and rich text version used. - */ + /** Test to check a tags are detected and rich text version used. */ @Test public void imgTagTest() throws Exception { ByteArrayOutputStream buffer = new ByteArrayOutputStream(); - XHTMLContentHandler xhtml = new XHTMLContentHandler(new RichTextContentHandler( - new OutputStreamWriter(buffer, UTF_8)), new Metadata()); + XHTMLContentHandler xhtml = + new XHTMLContentHandler( + new RichTextContentHandler(new OutputStreamWriter(buffer, UTF_8)), + new Metadata()); xhtml.startDocument(); AttributesImpl attributes = new AttributesImpl(); attributes.addAttribute("", "", "alt", "", "value"); @@ -67,5 +63,4 @@ public void imgTagTest() throws Exception { assertEquals("\n\n\n\n[image: value]", buffer.toString(UTF_8.name())); } - } diff --git a/tika-core/src/test/java/org/apache/tika/sax/SafeContentHandlerTest.java b/tika-core/src/test/java/org/apache/tika/sax/SafeContentHandlerTest.java index 80d1bfd9a1..a5bd65a34b 100644 --- a/tika-core/src/test/java/org/apache/tika/sax/SafeContentHandlerTest.java +++ b/tika-core/src/test/java/org/apache/tika/sax/SafeContentHandlerTest.java @@ -23,9 +23,7 @@ import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; -/** - * Unit tests for the {@link SafeContentHandler} class. 
- */ +/** Unit tests for the {@link SafeContentHandler} class. */ public class SafeContentHandlerTest { private ContentHandler output; @@ -78,5 +76,4 @@ public void testInvalidSurrogates() throws SAXException { safe.ignorableWhitespace("\udb00\ubfff".toCharArray(), 0, 2); assertEquals("\ufffd\ubfff", output.toString()); } - } diff --git a/tika-core/src/test/java/org/apache/tika/sax/SecureContentHandlerTest.java b/tika-core/src/test/java/org/apache/tika/sax/SecureContentHandlerTest.java index 421e6c2ecb..f6ad542a85 100644 --- a/tika-core/src/test/java/org/apache/tika/sax/SecureContentHandlerTest.java +++ b/tika-core/src/test/java/org/apache/tika/sax/SecureContentHandlerTest.java @@ -19,20 +19,16 @@ import static org.junit.jupiter.api.Assertions.fail; import java.io.IOException; - import org.apache.commons.io.input.NullInputStream; +import org.apache.tika.exception.TikaException; +import org.apache.tika.io.TikaInputStream; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.xml.sax.SAXException; import org.xml.sax.helpers.AttributesImpl; import org.xml.sax.helpers.DefaultHandler; -import org.apache.tika.exception.TikaException; -import org.apache.tika.io.TikaInputStream; - -/** - * Tests for the {@link SecureContentHandler} class. - */ +/** Tests for the {@link SecureContentHandler} class. 
*/ public class SecureContentHandlerTest { private static final int MANY_BYTES = 2000000; @@ -50,7 +46,7 @@ public void setUp() { @Test public void testZeroCharactersPerByte() throws IOException { try { - char[] ch = new char[]{'x'}; + char[] ch = new char[] {'x'}; for (int i = 0; i < MANY_BYTES; i++) { stream.read(); } @@ -160,5 +156,4 @@ public void testNestedEntries() throws SAXException { } } } - } diff --git a/tika-core/src/test/java/org/apache/tika/sax/SerializerTest.java b/tika-core/src/test/java/org/apache/tika/sax/SerializerTest.java index 361b88dc76..7ae52bbd3d 100755 --- a/tika-core/src/test/java/org/apache/tika/sax/SerializerTest.java +++ b/tika-core/src/test/java/org/apache/tika/sax/SerializerTest.java @@ -40,16 +40,16 @@ public void testToTextContentHandler() throws Exception { @Test public void testToXMLContentHandler() throws Exception { assertStartDocument("", new ToXMLContentHandler()); - assertStartDocument("\n", - new ToXMLContentHandler("UTF-8")); + assertStartDocument( + "\n", new ToXMLContentHandler("UTF-8")); assertCharacters("content", new ToXMLContentHandler()); assertCharacterEscaping("<&\">", new ToXMLContentHandler()); assertIgnorableWhitespace(" \t\r\n", new ToXMLContentHandler()); assertEmptyElement("
", new ToXMLContentHandler()); - assertEmptyElementWithAttributes("", - new ToXMLContentHandler()); - assertEmptyElementWithAttributeEscaping("

", - new ToXMLContentHandler()); + assertEmptyElementWithAttributes( + "", new ToXMLContentHandler()); + assertEmptyElementWithAttributeEscaping( + "

", new ToXMLContentHandler()); assertElement("

content

", new ToXMLContentHandler()); assertElementWithAttributes("

content

", new ToXMLContentHandler()); } @@ -61,10 +61,10 @@ public void testToHTMLContentHandler() throws Exception { assertCharacterEscaping("<&\">", new ToHTMLContentHandler()); assertIgnorableWhitespace(" \t\r\n", new ToHTMLContentHandler()); assertEmptyElement("
", new ToHTMLContentHandler()); - assertEmptyElementWithAttributes("", - new ToHTMLContentHandler()); - assertEmptyElementWithAttributeEscaping("

", - new ToHTMLContentHandler()); + assertEmptyElementWithAttributes( + "", new ToHTMLContentHandler()); + assertEmptyElementWithAttributeEscaping( + "

", new ToHTMLContentHandler()); assertElement("

content

", new ToHTMLContentHandler()); assertElementWithAttributes("

content

", new ToHTMLContentHandler()); } @@ -133,5 +133,4 @@ private void assertElementWithAttributes(String expected, ContentHandler handler handler.endElement("", "p", "p"); assertEquals(expected, handler.toString()); } - } diff --git a/tika-core/src/test/java/org/apache/tika/sax/XHTMLContentHandlerTest.java b/tika-core/src/test/java/org/apache/tika/sax/XHTMLContentHandlerTest.java index 136c62b0c0..f0e993bb33 100644 --- a/tika-core/src/test/java/org/apache/tika/sax/XHTMLContentHandlerTest.java +++ b/tika-core/src/test/java/org/apache/tika/sax/XHTMLContentHandlerTest.java @@ -21,18 +21,14 @@ import java.util.ArrayList; import java.util.List; - +import org.apache.tika.metadata.Metadata; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; import org.xml.sax.helpers.AttributesImpl; -import org.apache.tika.metadata.Metadata; - -/** - * Unit tests for the {@link XHTMLContentHandler} class. - */ +/** Unit tests for the {@link XHTMLContentHandler} class. */ public class XHTMLContentHandlerTest { private ContentHandler output; @@ -40,8 +36,8 @@ public class XHTMLContentHandlerTest { private XHTMLContentHandler xhtml; /** - * Return array of non-zerolength words. Splitting on whitespace will get us - * empty words for emptylines. + * Return array of non-zerolength words. Splitting on whitespace will get us empty words for + * emptylines. * * @param string some mix of newlines and real words * @return array of real words. @@ -65,8 +61,7 @@ public void setUp() { } /** - * Test that content in block elements are properly separated in text - * output. + * Test that content in block elements are properly separated in text output. * * @see TIKA-188 */ @@ -104,8 +99,7 @@ public void testExtraWhitespace() throws SAXException { } /** - * Test that content in option elements are properly separated in text - * output. + * Test that content in option elements are properly separated in text output. 
* * @see TIKA-394 */ @@ -149,8 +143,8 @@ public void testAttributesOnBody() throws Exception { AttributesImpl attributes = new AttributesImpl(); attributes.addAttribute(XHTMLContentHandler.XHTML, "itemscope", "itemscope", "", ""); - attributes.addAttribute(XHTMLContentHandler.XHTML, "itemtype", "itemtype", "", - "http://schema.org/Event"); + attributes.addAttribute( + XHTMLContentHandler.XHTML, "itemtype", "itemtype", "", "http://schema.org/Event"); xhtmlContentHandler.startDocument(); xhtmlContentHandler.startElement(XHTMLContentHandler.XHTML, "body", "body", attributes); @@ -168,8 +162,8 @@ public void testAttributesOnHtml() throws Exception { AttributesImpl attributes = new AttributesImpl(); attributes.addAttribute(XHTMLContentHandler.XHTML, "itemscope", "itemscope", "", ""); - attributes.addAttribute(XHTMLContentHandler.XHTML, "itemtype", "itemtype", "", - "http://schema.org/Event"); + attributes.addAttribute( + XHTMLContentHandler.XHTML, "itemtype", "itemtype", "", "http://schema.org/Event"); xhtmlContentHandler.startDocument(); xhtmlContentHandler.startElement(XHTMLContentHandler.XHTML, "html", "html", attributes); @@ -217,5 +211,4 @@ public void testInvalidControlCharacter0x93() throws Exception { assertEquals(1, words.length); assertEquals("a\ufffdz", words[0]); } - } diff --git a/tika-core/src/test/java/org/apache/tika/sax/xpath/XPathParserTest.java b/tika-core/src/test/java/org/apache/tika/sax/xpath/XPathParserTest.java index 2a3f1d4be0..324e90e377 100644 --- a/tika-core/src/test/java/org/apache/tika/sax/xpath/XPathParserTest.java +++ b/tika-core/src/test/java/org/apache/tika/sax/xpath/XPathParserTest.java @@ -130,5 +130,4 @@ public void testPrefixedElement() { assertFalse(matcher.matchesAttribute(NS, "name")); assertFalse(matcher.matchesAttribute(NS, "eman")); } - } diff --git a/tika-core/src/test/java/org/apache/tika/utils/AnnotationUtilsTest.java b/tika-core/src/test/java/org/apache/tika/utils/AnnotationUtilsTest.java index 1f05631e5d..702e7f9f36 
100644 --- a/tika-core/src/test/java/org/apache/tika/utils/AnnotationUtilsTest.java +++ b/tika-core/src/test/java/org/apache/tika/utils/AnnotationUtilsTest.java @@ -20,17 +20,14 @@ import static org.junit.jupiter.api.Assertions.assertTrue; import static org.junit.jupiter.api.Assertions.fail; +import aQute.bnd.annotation.metatype.Configurable; import java.util.Date; import java.util.HashMap; import java.util.Map; - -import aQute.bnd.annotation.metatype.Configurable; -import org.junit.jupiter.api.Test; - import org.apache.tika.config.Field; import org.apache.tika.config.Param; import org.apache.tika.exception.TikaConfigException; - +import org.junit.jupiter.api.Test; /** * @since 6/1/16 @@ -63,7 +60,7 @@ class MyParser extends Configurable { AnnotationUtils.assignFieldParams(new MyParser(), params); fail("Exception expected"); } catch (TikaConfigException e) { - //expected + // expected } } @@ -73,6 +70,7 @@ public void testPrimitiveAndBoxedTypes() { class MyParser extends Configurable { @Field(required = true) int config; + @Field(required = true, name = "config") Integer config2; } @@ -89,7 +87,6 @@ class MyParser extends Configurable { e.printStackTrace(); fail("Exception Not expected"); } - } @Test @@ -117,25 +114,25 @@ class MyParser extends Configurable { AnnotationUtils.assignFieldParams(new MyParser(), params); fail("Exception expected"); } catch (TikaConfigException e) { - //expected + // expected } } - @Test public void testParserInheritance() { class Parent { @Field(required = true) int overridden; + @Field(required = true) int parentField; - } class Child extends Parent { @Field(required = true) int overridden; + @Field(required = true) int childField; } @@ -162,11 +159,10 @@ class Child extends Parent { AnnotationUtils.assignFieldParams(new Child(), params); fail("Exception expected, parent class field not set"); } catch (TikaConfigException e) { - //expected + // expected } } - @Test public void testParamValueInheritance() { @@ -193,10 +189,8 @@ class 
Bean { AnnotationUtils.assignFieldParams(parser, params); fail("Exception expected, Date is not assignable to CharSequence."); } catch (TikaConfigException e) { - //expected + // expected } - } - } diff --git a/tika-core/src/test/java/org/apache/tika/utils/CharsetUtilsTest.java b/tika-core/src/test/java/org/apache/tika/utils/CharsetUtilsTest.java index 8a6574ae0c..55d3d14093 100644 --- a/tika-core/src/test/java/org/apache/tika/utils/CharsetUtilsTest.java +++ b/tika-core/src/test/java/org/apache/tika/utils/CharsetUtilsTest.java @@ -16,7 +16,6 @@ */ package org.apache.tika.utils; - import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertTrue; @@ -68,5 +67,4 @@ public void testFunkyNames() { assertEquals("KOI8-R", CharsetUtils.clean("koi8r")); } - } diff --git a/tika-core/src/test/java/org/apache/tika/utils/ConcurrentUtilsTest.java b/tika-core/src/test/java/org/apache/tika/utils/ConcurrentUtilsTest.java index 6ac9d72c7a..57bdddea03 100644 --- a/tika-core/src/test/java/org/apache/tika/utils/ConcurrentUtilsTest.java +++ b/tika-core/src/test/java/org/apache/tika/utils/ConcurrentUtilsTest.java @@ -20,25 +20,26 @@ import java.util.concurrent.ExecutorService; import java.util.concurrent.Future; - -import org.junit.jupiter.api.Test; - import org.apache.tika.config.TikaConfig; import org.apache.tika.parser.ParseContext; +import org.junit.jupiter.api.Test; public class ConcurrentUtilsTest { @Test public void testExecuteThread() throws Exception { ParseContext context = new ParseContext(); - Future result = ConcurrentUtils.execute(context, new Runnable() { + Future result = + ConcurrentUtils.execute( + context, + new Runnable() { - @Override - public void run() { - //Do nothing + @Override + public void run() { + // Do nothing - } - }); + } + }); assertNull(result.get()); } @@ -48,16 +49,18 @@ public void testExecuteExecutor() throws Exception { TikaConfig 
config = TikaConfig.getDefaultConfig(); ParseContext context = new ParseContext(); context.set(ExecutorService.class, config.getExecutorService()); - Future result = ConcurrentUtils.execute(context, new Runnable() { + Future result = + ConcurrentUtils.execute( + context, + new Runnable() { - @Override - public void run() { - //Do nothing + @Override + public void run() { + // Do nothing - } - }); + } + }); assertNull(result.get()); } - } diff --git a/tika-core/src/test/java/org/apache/tika/utils/RegexUtilsTest.java b/tika-core/src/test/java/org/apache/tika/utils/RegexUtilsTest.java index 030836ed5e..0ff44f210d 100644 --- a/tika-core/src/test/java/org/apache/tika/utils/RegexUtilsTest.java +++ b/tika-core/src/test/java/org/apache/tika/utils/RegexUtilsTest.java @@ -21,7 +21,6 @@ import static org.junit.jupiter.api.Assertions.assertTrue; import java.util.List; - import org.junit.jupiter.api.Test; /** @@ -31,10 +30,7 @@ */ public class RegexUtilsTest { - /** - * Test {@link RegexUtils#extractLinks(String)} with no links. - */ - + /** Test {@link RegexUtils#extractLinks(String)} with no links. */ @Test public void testExtractLinksNone() { List links = null; @@ -52,31 +48,28 @@ public void testExtractLinksNone() { assertEquals(0, links.size()); } - - /** - * Test {@link RegexUtils#extractLinks(String)} for http. - */ + /** Test {@link RegexUtils#extractLinks(String)} for http. */ @Test public void testExtractLinksHttp() { - List links = RegexUtils.extractLinks( - "Test with http://www.nutch.org/index.html is it found? " + - "What about www.google.com at http://www.google.de " + - "A longer URL could be http://www.sybit.com/solutions/portals.html"); + List links = + RegexUtils.extractLinks( + "Test with http://www.nutch.org/index.html is it found? 
" + + "What about www.google.com at http://www.google.de " + + "A longer URL could be http://www.sybit.com/solutions/portals.html"); assertTrue(links.size() == 3, "Url not found!"); assertEquals("http://www.nutch.org/index.html", links.get(0), "Wrong URL"); assertEquals("http://www.google.de", links.get(1), "Wrong URL"); - assertEquals("http://www.sybit.com/solutions/portals.html", links.get(2), - "Wrong URL"); + assertEquals("http://www.sybit.com/solutions/portals.html", links.get(2), "Wrong URL"); } - /** - * Test {@link RegexUtils#extractLinks(String)} for ftp. - */ + /** Test {@link RegexUtils#extractLinks(String)} for ftp. */ @Test public void testExtractLinksFtp() { - List links = RegexUtils.extractLinks("Test with ftp://www.nutch.org is it found? " + - "What about www.google.com at ftp://www.google.de"); + List links = + RegexUtils.extractLinks( + "Test with ftp://www.nutch.org is it found? " + + "What about www.google.com at ftp://www.google.de"); assertTrue(links.size() == 2, "Url not found!"); assertEquals("ftp://www.nutch.org", links.get(0), "Wrong URL"); diff --git a/tika-core/src/test/java/org/apache/tika/utils/ServiceLoaderUtilsTest.java b/tika-core/src/test/java/org/apache/tika/utils/ServiceLoaderUtilsTest.java index 199c0031fa..712cffc638 100644 --- a/tika-core/src/test/java/org/apache/tika/utils/ServiceLoaderUtilsTest.java +++ b/tika-core/src/test/java/org/apache/tika/utils/ServiceLoaderUtilsTest.java @@ -23,26 +23,30 @@ import java.util.Collections; import java.util.List; import java.util.Random; - import org.apache.custom.detect.MyCustomDetector; -import org.junit.jupiter.api.Test; - import org.apache.tika.detect.Detector; import org.apache.tika.detect.EmptyDetector; import org.apache.tika.detect.FileCommandDetector; import org.apache.tika.detect.OverrideDetector; import org.apache.tika.detect.ZeroSizeFileDetector; +import org.junit.jupiter.api.Test; public class ServiceLoaderUtilsTest { @Test public void testSort() throws Exception { - 
//OverrideDetector is moved to index 0 - //by the private service loading in DefaultDetector. - //This tests that a custom detector always comes first - //and then reverse alphabetical order - Detector[] detectors = new Detector[]{new MyCustomDetector(), new EmptyDetector(), - new FileCommandDetector(), new OverrideDetector(), new ZeroSizeFileDetector()}; + // OverrideDetector is moved to index 0 + // by the private service loading in DefaultDetector. + // This tests that a custom detector always comes first + // and then reverse alphabetical order + Detector[] detectors = + new Detector[] { + new MyCustomDetector(), + new EmptyDetector(), + new FileCommandDetector(), + new OverrideDetector(), + new ZeroSizeFileDetector() + }; List expected = Arrays.asList(detectors); List shuffled = new ArrayList<>(expected); Random random = new Random(42); @@ -52,6 +56,4 @@ public void testSort() throws Exception { assertEquals(expected, shuffled, "failed on iteration " + i); } } - - } diff --git a/tika-core/src/test/java/org/apache/tika/utils/XMLReaderUtilsTest.java b/tika-core/src/test/java/org/apache/tika/utils/XMLReaderUtilsTest.java index 18e2535873..828ca250d7 100644 --- a/tika-core/src/test/java/org/apache/tika/utils/XMLReaderUtilsTest.java +++ b/tika-core/src/test/java/org/apache/tika/utils/XMLReaderUtilsTest.java @@ -21,20 +21,20 @@ import java.io.ByteArrayInputStream; import java.net.ConnectException; import java.nio.charset.StandardCharsets; - -import org.junit.jupiter.api.Test; - import org.apache.tika.parser.ParseContext; import org.apache.tika.sax.ToTextContentHandler; +import org.junit.jupiter.api.Test; public class XMLReaderUtilsTest { - //make sure that parseSAX actually defends against external entities + // make sure that parseSAX actually defends against external entities @Test public void testExternalDTD() throws Exception { String xml = ""; try { - XMLReaderUtils.parseSAX(new ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)), - new 
ToTextContentHandler(), new ParseContext()); + XMLReaderUtils.parseSAX( + new ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)), + new ToTextContentHandler(), + new ParseContext()); } catch (ConnectException e) { fail("Parser tried to access the external DTD:" + e); } @@ -43,11 +43,14 @@ public void testExternalDTD() throws Exception { @Test public void testExternalEntity() throws Exception { String xml = - "" + - " ]>&bar;"; + "" + + " ]>&bar;"; try { - XMLReaderUtils.parseSAX(new ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)), - new ToTextContentHandler(), new ParseContext()); + XMLReaderUtils.parseSAX( + new ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)), + new ToTextContentHandler(), + new ParseContext()); } catch (ConnectException e) { fail("Parser tried to access the external DTD:" + e); } diff --git a/tika-parent/checkstyle.xml b/tika-parent/checkstyle.xml index 55dc19fe74..7c15591471 100644 --- a/tika-parent/checkstyle.xml +++ b/tika-parent/checkstyle.xml @@ -36,109 +36,7 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml index 3c331ee380..e286f97478 100644 --- a/tika-parent/pom.xml +++ b/tika-parent/pom.xml @@ -1273,8 +1273,8 @@ - - +