From 3fd128b3fc0eb679a5acd23efaf4f0b2836983fa Mon Sep 17 00:00:00 2001 From: tallison Date: Wed, 19 Nov 2025 16:02:50 -0500 Subject: [PATCH 1/2] spotless -- proof of concept --- .../java/org/apache/tika/cli/AsyncHelper.java | 24 +- .../java/org/apache/tika/cli/TikaCLI.java | 293 +-- .../tika/gui/ParsingTransferHandler.java | 35 +- .../java/org/apache/tika/gui/TikaGUI.java | 172 +- .../org/apache/tika/cli/AsyncHelperTest.java | 27 +- .../org/apache/tika/cli/TikaCLIAsyncTest.java | 75 +- .../java/org/apache/tika/cli/TikaCLITest.java | 288 +-- .../java/org/apache/tika/bundle/BundleIT.java | 107 +- .../src/main/java/org/apache/tika/Tika.java | 323 ++-- .../ConfigurableThreadPoolExecutor.java | 26 +- .../concurrent/SimpleThreadPoolExecutor.java | 28 +- .../org/apache/tika/config/ConfigBase.java | 151 +- .../java/org/apache/tika/config/Field.java | 27 +- .../org/apache/tika/config/Initializable.java | 40 +- .../config/InitializableProblemHandler.java | 42 +- .../apache/tika/config/LoadErrorHandler.java | 48 +- .../java/org/apache/tika/config/Param.java | 83 +- .../org/apache/tika/config/ParamField.java | 68 +- .../org/apache/tika/config/ServiceLoader.java | 126 +- .../org/apache/tika/config/TikaActivator.java | 43 +- .../org/apache/tika/config/TikaConfig.java | 406 ++-- .../tika/config/TikaConfigSerializer.java | 160 +- .../apache/tika/config/TikaTaskTimeout.java | 23 +- .../org/apache/tika/config/package-info.java | 22 +- .../apache/tika/detect/AutoDetectReader.java | 62 +- .../apache/tika/detect/CompositeDetector.java | 34 +- .../detect/CompositeEncodingDetector.java | 46 +- .../apache/tika/detect/DefaultDetector.java | 55 +- .../tika/detect/DefaultEncodingDetector.java | 39 +- .../tika/detect/DefaultProbDetector.java | 33 +- .../java/org/apache/tika/detect/Detector.java | 50 +- .../org/apache/tika/detect/EmptyDetector.java | 23 +- .../apache/tika/detect/EncodingDetector.java | 51 +- .../tika/detect/FileCommandDetector.java | 71 +- 
.../org/apache/tika/detect/MagicDetector.java | 128 +- .../tika/detect/NNExampleModelDetector.java | 43 +- .../apache/tika/detect/NNTrainedModel.java | 27 +- .../tika/detect/NNTrainedModelBuilder.java | 22 +- .../org/apache/tika/detect/NameDetector.java | 79 +- .../detect/NonDetectingEncodingDetector.java | 27 +- .../apache/tika/detect/OverrideDetector.java | 27 +- .../org/apache/tika/detect/TextDetector.java | 71 +- .../apache/tika/detect/TextStatistics.java | 51 +- .../org/apache/tika/detect/TrainedModel.java | 22 +- .../tika/detect/TrainedModelDetector.java | 26 +- .../org/apache/tika/detect/TypeDetector.java | 40 +- .../apache/tika/detect/XmlRootExtractor.java | 41 +- .../tika/detect/ZeroSizeFileDetector.java | 23 +- .../org/apache/tika/detect/package-info.java | 22 +- .../org/apache/tika/embedder/Embedder.java | 88 +- .../tika/embedder/ExternalEmbedder.java | 175 +- .../exception/AccessPermissionException.java | 27 +- .../exception/CorruptedFileException.java | 26 +- .../exception/EncryptedDocumentException.java | 22 +- .../tika/exception/FileTooLongException.java | 31 +- .../tika/exception/RuntimeSAXException.java | 22 +- .../tika/exception/TikaConfigException.java | 27 +- .../apache/tika/exception/TikaException.java | 22 +- .../exception/TikaMemoryLimitException.java | 34 +- .../tika/exception/TikaTimeoutException.java | 22 +- .../exception/UnsupportedFormatException.java | 41 +- .../exception/WriteLimitReachedException.java | 40 +- .../tika/exception/ZeroByteFileException.java | 34 +- .../apache/tika/exception/package-info.java | 22 +- .../extractor/BasicEmbeddedBytesSelector.java | 40 +- .../tika/extractor/ContainerExtractor.java | 56 +- .../DefaultEmbeddedStreamTranslator.java | 43 +- .../tika/extractor/DocumentSelector.java | 36 +- .../tika/extractor/EmbeddedBytesSelector.java | 23 +- ...ddedDocumentByteStoreExtractorFactory.java | 34 +- .../EmbeddedDocumentBytesHandler.java | 25 +- .../extractor/EmbeddedDocumentExtractor.java | 38 +- 
.../EmbeddedDocumentExtractorFactory.java | 23 +- .../tika/extractor/EmbeddedDocumentUtil.java | 92 +- .../extractor/EmbeddedResourceHandler.java | 37 +- .../extractor/EmbeddedStreamTranslator.java | 31 +- .../tika/extractor/ParentContentHandler.java | 26 +- .../extractor/ParserContainerExtractor.java | 61 +- .../ParsingEmbeddedDocumentExtractor.java | 48 +- ...rsingEmbeddedDocumentExtractorFactory.java | 28 +- .../tika/extractor/RUnpackExtractor.java | 96 +- .../extractor/RUnpackExtractorFactory.java | 50 +- .../apache/tika/extractor/package-info.java | 22 +- .../apache/tika/fork/ClassLoaderProxy.java | 26 +- .../apache/tika/fork/ClassLoaderResource.java | 45 +- .../apache/tika/fork/ContentHandlerProxy.java | 29 +- .../tika/fork/ContentHandlerResource.java | 25 +- .../java/org/apache/tika/fork/ForkClient.java | 94 +- .../tika/fork/ForkObjectInputStream.java | 60 +- .../java/org/apache/tika/fork/ForkParser.java | 185 +- .../java/org/apache/tika/fork/ForkProxy.java | 22 +- .../org/apache/tika/fork/ForkResource.java | 22 +- .../java/org/apache/tika/fork/ForkServer.java | 88 +- .../apache/tika/fork/InputStreamProxy.java | 22 +- .../apache/tika/fork/InputStreamResource.java | 22 +- .../apache/tika/fork/MemoryURLConnection.java | 25 +- .../tika/fork/MemoryURLStreamHandler.java | 25 +- .../fork/MemoryURLStreamHandlerFactory.java | 22 +- .../tika/fork/MemoryURLStreamRecord.java | 22 +- .../tika/fork/MetadataContentHandler.java | 27 +- .../tika/fork/ParserFactoryFactory.java | 27 +- .../RecursiveMetadataContentHandlerProxy.java | 61 +- ...cursiveMetadataContentHandlerResource.java | 42 +- .../org/apache/tika/fork/TimeoutLimits.java | 22 +- .../org/apache/tika/fork/package-info.java | 22 +- .../apache/tika/io/BoundedInputStream.java | 43 +- .../java/org/apache/tika/io/EndianUtils.java | 90 +- .../org/apache/tika/io/FilenameUtils.java | 141 +- .../main/java/org/apache/tika/io/IOUtils.java | 38 +- .../apache/tika/io/InputStreamFactory.java | 41 +- 
.../apache/tika/io/LookaheadInputStream.java | 58 +- .../java/org/apache/tika/io/TailStream.java | 99 +- .../apache/tika/io/TemporaryResources.java | 92 +- .../org/apache/tika/io/TikaInputStream.java | 421 ++--- .../java/org/apache/tika/io/package-info.java | 22 +- .../language/detect/LanguageConfidence.java | 24 +- .../language/detect/LanguageDetector.java | 90 +- .../tika/language/detect/LanguageHandler.java | 31 +- .../tika/language/detect/LanguageNames.java | 36 +- .../tika/language/detect/LanguageResult.java | 35 +- .../tika/language/detect/LanguageWriter.java | 33 +- .../language/translate/DefaultTranslator.java | 34 +- .../language/translate/EmptyTranslator.java | 27 +- .../tika/language/translate/Translator.java | 37 +- .../tika/metadata/AccessPermissions.java | 36 +- .../apache/tika/metadata/ClimateForcast.java | 26 +- .../apache/tika/metadata/CreativeCommons.java | 22 +- .../org/apache/tika/metadata/Database.java | 22 +- .../org/apache/tika/metadata/DublinCore.java | 198 +- .../java/org/apache/tika/metadata/Epub.java | 28 +- .../apache/tika/metadata/ExternalProcess.java | 67 +- .../org/apache/tika/metadata/FileSystem.java | 22 +- .../java/org/apache/tika/metadata/Font.java | 25 +- .../org/apache/tika/metadata/Geographic.java | 30 +- .../java/org/apache/tika/metadata/HTML.java | 30 +- .../org/apache/tika/metadata/HttpHeaders.java | 26 +- .../java/org/apache/tika/metadata/IPTC.java | 1209 ++++++------ .../java/org/apache/tika/metadata/MAPI.java | 58 +- .../apache/tika/metadata/MachineMetadata.java | 49 +- .../org/apache/tika/metadata/Message.java | 70 +- .../org/apache/tika/metadata/Metadata.java | 160 +- .../java/org/apache/tika/metadata/Office.java | 113 +- .../tika/metadata/OfficeOpenXMLCore.java | 58 +- .../tika/metadata/OfficeOpenXMLExtended.java | 78 +- .../java/org/apache/tika/metadata/PDF.java | 119 +- .../java/org/apache/tika/metadata/PST.java | 22 +- .../org/apache/tika/metadata/PagedText.java | 37 +- .../org/apache/tika/metadata/Photoshop.java | 
86 +- .../org/apache/tika/metadata/Property.java | 66 +- .../tika/metadata/PropertyTypeException.java | 36 +- .../org/apache/tika/metadata/QuattroPro.java | 41 +- .../org/apache/tika/metadata/RTFMetadata.java | 47 +- .../org/apache/tika/metadata/Rendering.java | 22 +- .../java/org/apache/tika/metadata/TIFF.java | 58 +- .../tika/metadata/TikaCoreProperties.java | 277 ++- .../apache/tika/metadata/TikaMimeKeys.java | 22 +- .../apache/tika/metadata/TikaPagedText.java | 27 +- .../java/org/apache/tika/metadata/WARC.java | 24 +- .../org/apache/tika/metadata/WordPerfect.java | 58 +- .../java/org/apache/tika/metadata/XMP.java | 51 +- .../java/org/apache/tika/metadata/XMPDC.java | 190 +- .../java/org/apache/tika/metadata/XMPDM.java | 257 ++- .../java/org/apache/tika/metadata/XMPIdq.java | 26 +- .../java/org/apache/tika/metadata/XMPMM.java | 65 +- .../java/org/apache/tika/metadata/XMPPDF.java | 24 +- .../org/apache/tika/metadata/XMPRights.java | 38 +- .../filter/CaptureGroupMetadataFilter.java | 46 +- .../ClearByAttachmentTypeMetadataFilter.java | 38 +- .../filter/ClearByMimeMetadataFilter.java | 28 +- .../filter/CompositeMetadataFilter.java | 26 +- .../filter/DateNormalizingMetadataFilter.java | 48 +- .../filter/DefaultMetadataFilter.java | 26 +- .../filter/ExcludeFieldMetadataFilter.java | 23 +- .../filter/FieldNameMappingFilter.java | 33 +- .../filter/GeoPointMetadataFilter.java | 32 +- .../filter/IncludeFieldMetadataFilter.java | 23 +- .../tika/metadata/filter/MetadataFilter.java | 33 +- .../tika/metadata/filter/NoOpFilter.java | 27 +- .../CompositeMetadataListFilter.java | 26 +- .../listfilter/MetadataListFilter.java | 37 +- .../metadata/listfilter/NoOpListFilter.java | 23 +- .../apache/tika/metadata/package-info.java | 22 +- .../writefilter/MetadataWriteFilter.java | 34 +- .../MetadataWriteFilterFactory.java | 22 +- .../writefilter/StandardWriteFilter.java | 199 +- .../StandardWriteFilterFactory.java | 40 +- .../java/org/apache/tika/mime/AndClause.java | 22 +- 
.../java/org/apache/tika/mime/Clause.java | 26 +- .../java/org/apache/tika/mime/HexCoDec.java | 38 +- .../main/java/org/apache/tika/mime/Magic.java | 25 +- .../java/org/apache/tika/mime/MagicMatch.java | 29 +- .../java/org/apache/tika/mime/MediaType.java | 102 +- .../apache/tika/mime/MediaTypeRegistry.java | 75 +- .../java/org/apache/tika/mime/MimeType.java | 75 +- .../apache/tika/mime/MimeTypeException.java | 27 +- .../java/org/apache/tika/mime/MimeTypes.java | 195 +- .../apache/tika/mime/MimeTypesFactory.java | 99 +- .../org/apache/tika/mime/MimeTypesReader.java | 81 +- .../tika/mime/MimeTypesReaderMetKeys.java | 22 +- .../tika/mime/MinShouldMatchClause.java | 28 +- .../java/org/apache/tika/mime/OrClause.java | 22 +- .../java/org/apache/tika/mime/Patterns.java | 61 +- .../ProbabilisticMimeDetectionSelector.java | 85 +- .../org/apache/tika/mime/package-info.java | 22 +- .../java/org/apache/tika/package-info.java | 22 +- .../AbstractEncodingDetectorParser.java | 30 +- .../parser/AbstractExternalProcessParser.java | 33 +- .../apache/tika/parser/AbstractParser.java | 49 +- .../apache/tika/parser/AutoDetectParser.java | 124 +- .../tika/parser/AutoDetectParserConfig.java | 98 +- .../tika/parser/AutoDetectParserFactory.java | 29 +- .../apache/tika/parser/CompositeParser.java | 85 +- .../org/apache/tika/parser/CryptoParser.java | 36 +- .../org/apache/tika/parser/DefaultParser.java | 61 +- .../apache/tika/parser/DelegatingParser.java | 58 +- .../apache/tika/parser/DigestingParser.java | 67 +- .../org/apache/tika/parser/EmptyParser.java | 35 +- .../org/apache/tika/parser/ErrorParser.java | 33 +- .../org/apache/tika/parser/NetworkParser.java | 54 +- .../org/apache/tika/parser/ParseContext.java | 45 +- .../org/apache/tika/parser/ParseRecord.java | 32 +- .../java/org/apache/tika/parser/Parser.java | 57 +- .../apache/tika/parser/ParserDecorator.java | 74 +- .../org/apache/tika/parser/ParserFactory.java | 26 +- .../tika/parser/ParserPostProcessor.java | 40 +- 
.../org/apache/tika/parser/ParsingReader.java | 117 +- .../apache/tika/parser/PasswordProvider.java | 42 +- .../tika/parser/RecursiveParserWrapper.java | 185 +- .../tika/parser/RegexCaptureParser.java | 38 +- .../apache/tika/parser/RenderingParser.java | 25 +- .../apache/tika/parser/StatefulParser.java | 31 +- .../tika/parser/digest/CompositeDigester.java | 23 +- .../parser/digest/InputStreamDigester.java | 89 +- .../external/CompositeExternalParser.java | 30 +- .../tika/parser/external/ExternalParser.java | 144 +- .../external/ExternalParsersConfigReader.java | 50 +- .../ExternalParsersConfigReaderMetKeys.java | 22 +- .../external/ExternalParsersFactory.java | 35 +- .../tika/parser/external/package-info.java | 22 +- .../tika/parser/external2/ExternalParser.java | 116 +- .../multiple/AbstractMultipleParser.java | 125 +- .../tika/parser/multiple/FallbackParser.java | 40 +- .../parser/multiple/SupplementingParser.java | 52 +- .../org/apache/tika/parser/package-info.java | 22 +- .../tika/renderer/CompositeRenderer.java | 35 +- .../tika/renderer/PageBasedRenderResults.java | 24 +- .../tika/renderer/PageRangeRequest.java | 24 +- .../apache/tika/renderer/RenderRequest.java | 31 +- .../apache/tika/renderer/RenderResult.java | 36 +- .../apache/tika/renderer/RenderResults.java | 25 +- .../org/apache/tika/renderer/Renderer.java | 46 +- .../apache/tika/renderer/RenderingState.java | 27 +- .../tika/renderer/RenderingTracker.java | 31 +- ...AbstractRecursiveParserWrapperHandler.java | 78 +- .../tika/sax/BasicContentHandlerFactory.java | 73 +- .../apache/tika/sax/BodyContentHandler.java | 64 +- .../org/apache/tika/sax/CleanPhoneText.java | 261 +-- .../tika/sax/ContentHandlerDecorator.java | 69 +- .../sax/ContentHandlerDecoratorFactory.java | 28 +- .../tika/sax/ContentHandlerFactory.java | 23 +- .../apache/tika/sax/DIFContentHandler.java | 40 +- .../sax/ElementMappingContentHandler.java | 52 +- .../tika/sax/EmbeddedContentHandler.java | 43 +- 
.../EndDocumentShieldingContentHandler.java | 29 +- .../tika/sax/ExpandedTitleContentHandler.java | 43 +- .../main/java/org/apache/tika/sax/Link.java | 22 +- .../java/org/apache/tika/sax/LinkBuilder.java | 22 +- .../apache/tika/sax/LinkContentHandler.java | 38 +- .../tika/sax/OfflineContentHandler.java | 29 +- .../sax/PhoneExtractingContentHandler.java | 77 +- .../sax/RecursiveParserWrapperHandler.java | 80 +- .../tika/sax/RichTextContentHandler.java | 33 +- .../apache/tika/sax/SafeContentHandler.java | 152 +- .../apache/tika/sax/SecureContentHandler.java | 92 +- .../tika/sax/StandardOrganizations.java | 102 +- .../apache/tika/sax/StandardReference.java | 28 +- .../StandardsExtractingContentHandler.java | 73 +- .../org/apache/tika/sax/StandardsText.java | 97 +- .../tika/sax/StoppingEarlyException.java | 27 +- .../apache/tika/sax/TaggedContentHandler.java | 51 +- .../apache/tika/sax/TaggedSAXException.java | 30 +- .../apache/tika/sax/TeeContentHandler.java | 28 +- .../sax/TextAndAttributeContentHandler.java | 26 +- .../apache/tika/sax/TextContentHandler.java | 34 +- .../apache/tika/sax/ToHTMLContentHandler.java | 39 +- .../apache/tika/sax/ToTextContentHandler.java | 65 +- .../apache/tika/sax/ToXMLContentHandler.java | 63 +- .../org/apache/tika/sax/WriteLimiter.java | 23 +- .../tika/sax/WriteOutContentHandler.java | 80 +- .../apache/tika/sax/XHTMLContentHandler.java | 87 +- .../apache/tika/sax/XMPContentHandler.java | 38 +- .../org/apache/tika/sax/package-info.java | 22 +- .../tika/sax/xpath/AttributeMatcher.java | 26 +- .../apache/tika/sax/xpath/ChildMatcher.java | 26 +- .../tika/sax/xpath/CompositeMatcher.java | 26 +- .../apache/tika/sax/xpath/ElementMatcher.java | 26 +- .../org/apache/tika/sax/xpath/Matcher.java | 51 +- .../sax/xpath/MatchingContentHandler.java | 34 +- .../tika/sax/xpath/NamedAttributeMatcher.java | 26 +- .../tika/sax/xpath/NamedElementMatcher.java | 27 +- .../apache/tika/sax/xpath/NodeMatcher.java | 26 +- 
.../apache/tika/sax/xpath/SubtreeMatcher.java | 26 +- .../apache/tika/sax/xpath/TextMatcher.java | 26 +- .../apache/tika/sax/xpath/XPathParser.java | 65 +- .../apache/tika/sax/xpath/package-info.java | 22 +- .../apache/tika/utils/AnnotationUtils.java | 59 +- .../org/apache/tika/utils/CharsetUtils.java | 57 +- .../org/apache/tika/utils/CompareUtils.java | 29 +- .../apache/tika/utils/ConcurrentUtils.java | 27 +- .../java/org/apache/tika/utils/DateUtils.java | 86 +- .../tika/utils/DurationFormatUtils.java | 30 +- .../org/apache/tika/utils/ExceptionUtils.java | 36 +- .../apache/tika/utils/FileProcessResult.java | 38 +- .../org/apache/tika/utils/ParserUtils.java | 62 +- .../org/apache/tika/utils/ProcessUtils.java | 54 +- .../org/apache/tika/utils/RegexUtils.java | 39 +- .../tika/utils/RereadableInputStream.java | 121 +- .../apache/tika/utils/ServiceLoaderUtils.java | 57 +- .../org/apache/tika/utils/StreamGobbler.java | 26 +- .../org/apache/tika/utils/StringUtils.java | 69 +- .../org/apache/tika/utils/SystemUtils.java | 28 +- .../org/apache/tika/utils/XMLReaderUtils.java | 452 +++-- .../org/apache/tika/utils/package-info.java | 22 +- .../custom/detect/MyCustomDetector.java | 23 +- .../apache/tika/MultiThreadedTikaTest.java | 173 +- .../tika/ResourceLoggingClassLoader.java | 28 +- .../tika/TestRereadableInputStream.java | 88 +- .../org/apache/tika/TikaDetectionTest.java | 109 +- .../src/test/java/org/apache/tika/TikaIT.java | 26 +- .../test/java/org/apache/tika/TikaTest.java | 213 +-- .../apache/tika/TypeDetectionBenchmark.java | 25 +- .../tika/config/AbstractTikaConfigTest.java | 33 +- .../org/apache/tika/config/DummyExecutor.java | 23 +- .../org/apache/tika/config/DummyParser.java | 25 +- .../apache/tika/config/MockConfigTest.java | 24 +- .../org/apache/tika/config/ParamTest.java | 33 +- .../tika/config/TikaConfigSerializerTest.java | 62 +- .../apache/tika/config/TikaConfigTest.java | 148 +- .../tika/detect/FileCommandDetectorTest.java | 53 +- 
.../apache/tika/detect/MagicDetectorTest.java | 116 +- .../tika/detect/MimeDetectionWithNNTest.java | 49 +- .../apache/tika/detect/NameDetectorTest.java | 50 +- .../apache/tika/detect/TextDetectorTest.java | 34 +- .../apache/tika/detect/TypeDetectorTest.java | 26 +- .../tika/detect/ZeroSizeFileDetectorTest.java | 28 +- .../org/apache/tika/fork/ForkParserTest.java | 167 +- .../tika/fork/ForkParserTikaBinTest.java | 123 +- .../org/apache/tika/fork/ForkTestParser.java | 34 +- .../tika/fork/UpperCasingContentHandler.java | 23 +- .../unusedpackage/ClassInUnusedPackage.java | 22 +- .../org/apache/tika/io/EndianUtilsTest.java | 55 +- .../org/apache/tika/io/FilenameUtilsTest.java | 89 +- .../tika/io/LookaheadInputStreamTest.java | 31 +- .../org/apache/tika/io/TailStreamTest.java | 57 +- .../tika/io/TemporaryResourcesTest.java | 28 +- .../apache/tika/io/TikaInputStreamTest.java | 60 +- .../language/detect/LanguageNamesTest.java | 22 +- .../apache/tika/metadata/TestMetadata.java | 69 +- .../metadata/filter/MockUpperCaseFilter.java | 23 +- .../metadata/filter/TestMetadataFilter.java | 39 +- .../AttachmentCountingListFilter.java | 26 +- .../listfilter/MetadataListFilterTest.java | 36 +- .../writefilter/StandardWriteFilterTest.java | 142 +- .../apache/tika/mime/CustomReaderTest.java | 29 +- .../org/apache/tika/mime/MediaTypeTest.java | 51 +- .../apache/tika/mime/MimeDetectionTest.java | 125 +- .../apache/tika/mime/MimeTypesReaderTest.java | 152 +- .../org/apache/tika/mime/PatternsTest.java | 23 +- .../mime/ProbabilisticMimeDetectionTest.java | 106 +- ...robabilisticMimeDetectionTestWithTika.java | 114 +- .../apache/tika/mime/RFC822DetectionTest.java | 85 +- .../parser/AutoDetectParserConfigTest.java | 32 +- .../tika/parser/CompositeParserTest.java | 54 +- .../tika/parser/DummyInitializableParser.java | 39 +- .../tika/parser/DummyParameterizedParser.java | 30 +- .../org/apache/tika/parser/DummyParser.java | 30 +- .../tika/parser/InitializableParserTest.java | 26 +- 
.../tika/parser/ParameterizedParserTest.java | 44 +- .../tika/parser/ParserDecoratorTest.java | 45 +- .../tika/parser/RegexCaptureParserTest.java | 37 +- .../parser/external2/ExternalParserTest.java | 57 +- .../apache/tika/parser/mock/MockParser.java | 110 +- .../tika/parser/mock/MockParserFactory.java | 23 +- .../tika/parser/mock/MockParserTest.java | 31 +- .../apache/tika/parser/mock/VowelParser.java | 28 +- .../parser/multiple/MultipleParserTest.java | 57 +- .../sax/BasicContentHandlerFactoryTest.java | 87 +- .../tika/sax/BodyContentHandlerTest.java | 37 +- .../tika/sax/CustomErrorHandlerTest.java | 49 +- .../tika/sax/LinkContentHandlerTest.java | 33 +- .../tika/sax/NonValidatingContentHandler.java | 31 +- .../tika/sax/OfflineContentHandlerTest.java | 28 +- .../tika/sax/RichTextContentHandlerTest.java | 36 +- .../tika/sax/SafeContentHandlerTest.java | 22 +- .../tika/sax/SecureContentHandlerTest.java | 30 +- .../org/apache/tika/sax/SerializerTest.java | 40 +- .../tika/sax/XHTMLContentHandlerTest.java | 44 +- .../tika/sax/xpath/XPathParserTest.java | 22 +- .../tika/utils/AnnotationUtilsTest.java | 36 +- .../apache/tika/utils/CharsetUtilsTest.java | 22 +- .../tika/utils/ConcurrentUtilsTest.java | 36 +- .../org/apache/tika/utils/RegexUtilsTest.java | 38 +- .../tika/utils/ServiceLoaderUtilsTest.java | 39 +- .../apache/tika/utils/XMLReaderUtilsTest.java | 153 +- .../tika/detect/magika/MagikaDetector.java | 167 +- .../detect/magika/TestMagikaIntegration.java | 31 +- .../detect/magika/TestMagikaJsonParsing.java | 104 +- .../detect/siegfried/SiegfriedDetector.java | 96 +- .../siegfried/TestSiegfriedIntegration.java | 31 +- .../siegfried/TestSiegfriedJsonParsing.java | 48 +- .../org/apache/tika/eval/app/EvalConfig.java | 31 +- .../apache/tika/eval/app/EvalFilePaths.java | 38 +- .../apache/tika/eval/app/ExtractComparer.java | 247 +-- .../tika/eval/app/ExtractComparerRunner.java | 136 +- .../tika/eval/app/ExtractProfileRunner.java | 131 +- 
.../apache/tika/eval/app/ExtractProfiler.java | 152 +- .../apache/tika/eval/app/ProfilerBase.java | 286 ++- .../apache/tika/eval/app/StatusReporter.java | 41 +- .../org/apache/tika/eval/app/TikaEvalCLI.java | 29 +- .../tika/eval/app/db/AbstractDBBuffer.java | 26 +- .../org/apache/tika/eval/app/db/ColInfo.java | 24 +- .../org/apache/tika/eval/app/db/Cols.java | 26 +- .../org/apache/tika/eval/app/db/DBBuffer.java | 28 +- .../org/apache/tika/eval/app/db/H2Util.java | 26 +- .../org/apache/tika/eval/app/db/JDBCUtil.java | 68 +- .../apache/tika/eval/app/db/MimeBuffer.java | 46 +- .../apache/tika/eval/app/db/TableInfo.java | 23 +- .../org/apache/tika/eval/app/io/DBWriter.java | 61 +- .../tika/eval/app/io/ExtractReader.java | 82 +- .../eval/app/io/ExtractReaderException.java | 35 +- .../apache/tika/eval/app/io/IDBWriter.java | 22 +- .../apache/tika/eval/app/reports/Report.java | 38 +- .../eval/app/reports/ResultsReporter.java | 115 +- .../eval/app/reports/XLSXHREFFormatter.java | 40 +- .../eval/app/reports/XLSXNumFormatter.java | 31 +- .../eval/app/reports/XSLXCellFormatter.java | 25 +- .../app/tools/BatchTopCommonTokenCounter.java | 47 +- .../app/tools/CommonTokenOverlapCounter.java | 26 +- .../tika/eval/app/tools/LeipzigHelper.java | 43 +- .../tika/eval/app/tools/LeipzigSampler.java | 25 +- .../app/tools/SlowCompositeReaderWrapper.java | 129 +- .../eval/app/tools/TopCommonTokenCounter.java | 117 +- .../tika/eval/app/tools/TrainTestSplit.java | 47 +- .../tika/eval/app/AnalyzerManagerTest.java | 32 +- .../apache/tika/eval/app/EvalConfigTest.java | 22 +- .../apache/tika/eval/app/MockDBWriter.java | 33 +- .../tika/eval/app/ProfilerBatchTest.java | 94 +- .../tika/eval/app/SimpleComparerTest.java | 172 +- .../apache/tika/eval/app/TikaEvalCLITest.java | 124 +- .../tika/eval/app/db/AbstractBufferTest.java | 44 +- .../tika/eval/app/io/ExtractReaderTest.java | 80 +- .../eval/app/reports/ResultsReporterTest.java | 30 +- .../app/tools/TopCommonTokenCounterTest.java | 62 +- 
.../eval/core/langid/LanguageIDWrapper.java | 26 +- .../core/metadata/TikaEvalMetadataFilter.java | 40 +- .../BasicTokenCountStatsCalculator.java | 22 +- .../core/textstats/BytesRefCalculator.java | 22 +- .../eval/core/textstats/CommonTokens.java | 28 +- .../textstats/CommonTokensBhattacharyya.java | 28 +- .../core/textstats/CommonTokensCosine.java | 26 +- .../core/textstats/CommonTokensHellinger.java | 28 +- .../core/textstats/CommonTokensKLDNormed.java | 28 +- .../textstats/CommonTokensKLDivergence.java | 28 +- .../CompositeTextStatsCalculator.java | 56 +- .../textstats/ContentLengthCalculator.java | 22 +- .../LanguageAwareTokenCountStats.java | 23 +- .../core/textstats/StringStatsCalculator.java | 22 +- .../core/textstats/TextProfileSignature.java | 35 +- .../core/textstats/TextSha256Signature.java | 23 +- .../core/textstats/TextStatsCalculator.java | 22 +- .../textstats/TokenCountPriorityQueue.java | 25 +- .../textstats/TokenCountStatsCalculator.java | 22 +- .../eval/core/textstats/TokenEntropy.java | 23 +- .../eval/core/textstats/TokenLengths.java | 24 +- .../tika/eval/core/textstats/TopNTokens.java | 24 +- .../core/textstats/UnicodeBlockCounter.java | 23 +- .../tokens/AlphaIdeographFilterFactory.java | 30 +- .../core/tokens/AnalyzerDeserializer.java | 102 +- .../eval/core/tokens/AnalyzerManager.java | 29 +- .../CJKBigramAwareLengthFilterFactory.java | 51 +- .../core/tokens/CommonTokenCountManager.java | 66 +- .../eval/core/tokens/CommonTokenResult.java | 30 +- .../eval/core/tokens/ContrastStatistics.java | 22 +- .../tika/eval/core/tokens/LangModel.java | 22 +- .../eval/core/tokens/TokenContraster.java | 31 +- .../core/tokens/TokenCountPriorityQueue.java | 24 +- .../tika/eval/core/tokens/TokenCounter.java | 50 +- .../tika/eval/core/tokens/TokenCounts.java | 23 +- .../tika/eval/core/tokens/TokenIntPair.java | 22 +- .../eval/core/tokens/TokenStatistics.java | 35 +- .../URLEmailNormalizingFilterFactory.java | 39 +- .../tika/eval/core/util/ContentTagParser.java | 
67 +- .../tika/eval/core/util/ContentTags.java | 22 +- .../eval/core/util/EvalExceptionUtils.java | 30 +- .../tika/eval/core/langid/LangIdTest.java | 38 +- .../metadata/TikaEvalMetadataFilterTest.java | 41 +- .../eval/core/textstats/TextStatsTest.java | 47 +- .../eval/core/tokens/LuceneTokenCounter.java | 124 +- .../eval/core/tokens/TokenCounterTest.java | 33 +- .../tika/eval/core/util/LanguageIdTest.java | 30 +- .../tika/eval/core/util/MimeUtilTest.java | 29 +- .../tika/example/AdvancedTypeDetector.java | 22 +- .../tika/example/ContentHandlerExample.java | 51 +- .../apache/tika/example/CustomMimeInfo.java | 25 +- .../apache/tika/example/DescribeMetadata.java | 24 +- .../apache/tika/example/DirListParser.java | 64 +- .../tika/example/DisplayMetInstance.java | 25 +- .../tika/example/DumpTikaConfigExample.java | 36 +- .../EncryptedPrescriptionDetector.java | 26 +- .../example/EncryptedPrescriptionParser.java | 31 +- .../tika/example/ExtractEmbeddedFiles.java | 53 +- .../tika/example/GrabPhoneNumbersExample.java | 44 +- .../tika/example/ImportContextImpl.java | 54 +- .../example/InterruptableParsingExample.java | 42 +- .../org/apache/tika/example/Language.java | 25 +- .../tika/example/LanguageDetectingParser.java | 30 +- .../tika/example/LanguageDetectorExample.java | 22 +- .../apache/tika/example/LuceneIndexer.java | 23 +- .../tika/example/LuceneIndexerExtended.java | 26 +- .../apache/tika/example/MediaTypeExample.java | 22 +- .../example/MetadataAwareLuceneIndexer.java | 31 +- .../org/apache/tika/example/MyFirstTika.java | 49 +- .../apache/tika/example/ParsingExample.java | 129 +- .../org/apache/tika/example/Pharmacy.java | 22 +- .../example/PickBestTextEncodingParser.java | 57 +- .../tika/example/PrescriptionParser.java | 34 +- .../org/apache/tika/example/RecentFiles.java | 51 +- .../apache/tika/example/RollbackSoftware.java | 63 +- .../tika/example/SimpleTextExtractor.java | 22 +- .../tika/example/SimpleTypeDetector.java | 22 +- 
.../apache/tika/example/SpringExample.java | 35 +- .../example/StandardsExtractionExample.java | 47 +- .../tika/example/TIAParsingExample.java | 35 +- .../tika/example/TextStatsFromTikaEval.java | 42 +- .../example/TranscribeTranslateExample.java | 66 +- .../tika/example/TranslatorExample.java | 22 +- .../tika/example/TrecDocumentGenerator.java | 34 +- .../org/apache/tika/example/ZipListFiles.java | 25 +- .../example/AdvancedTypeDetectorTest.java | 22 +- .../example/ContentHandlerExampleTest.java | 41 +- .../example/DumpTikaConfigExampleTest.java | 57 +- .../example/ExtractEmbeddedFilesTest.java | 26 +- .../example/LanguageDetectorExampleTest.java | 22 +- .../tika/example/SimpleTextExtractorTest.java | 36 +- .../tika/example/SimpleTypeDetectorTest.java | 31 +- .../tika/example/TestParsingExample.java | 55 +- .../example/TextStatsFromTikaEvalTest.java | 22 +- .../tika/example/TranslatorExampleTest.java | 22 +- .../tika/pipes/grpc/ExpiringFetcherStore.java | 60 +- .../tika/pipes/grpc/TikaGrpcServer.java | 97 +- .../tika/pipes/grpc/TikaGrpcServerImpl.java | 229 +-- .../pipes/grpc/ExpiringFetcherStoreTest.java | 52 +- ...BiDirectionalStreamingIntegrationTest.java | 135 +- .../tika/pipes/grpc/TikaGrpcServerTest.java | 223 +-- .../boilerpipe/BoilerpipeContentHandler.java | 85 +- .../pipes/kafka/tests/TikaPipesKafkaTest.java | 105 +- .../opensearch/tests/OpenSearchTest.java | 379 ++-- .../tests/OpensearchTestClient.java | 82 +- .../pipes/s3/tests/PipeIntegrationTests.java | 77 +- .../pipes/s3/tests/S3PipeIntegrationTest.java | 89 +- .../pipes/solr/tests/TikaPipesSolr8Test.java | 22 +- .../solr/tests/TikaPipesSolr8ZkTest.java | 30 +- .../pipes/solr/tests/TikaPipesSolr9Test.java | 22 +- .../solr/tests/TikaPipesSolr9ZkTest.java | 30 +- .../solr/tests/TikaPipesSolrTestBase.java | 166 +- .../custom/parser/CustomParserTest.java | 26 +- .../apache/custom/parser/MyCustomParser.java | 30 +- .../woodstox/WoodstoxXMLReaderUtilsTest.java | 159 +- .../TikaFileTypeDetector.java | 25 +- 
.../tika/filetypedetector/package-info.java | 22 +- .../TikaFileTypeDetectorTest.java | 32 +- .../lingo24/Lingo24LangDetector.java | 64 +- .../lingo24/Lingo24LangDetectorTest.java | 30 +- .../langdetect/mitll/TextLangDetector.java | 34 +- .../mitll/TextLangDetectorTest.java | 30 +- .../langdetect/opennlp/OpenNLPDetector.java | 126 +- .../opennlp/ProbingLanguageDetector.java | 150 +- .../metadatafilter/OpenNLPMetadataFilter.java | 25 +- .../opennlp/OpenNLPDetectorTest.java | 33 +- .../optimaize/OptimaizeLangDetector.java | 69 +- .../OptimaizeMetadataFilter.java | 25 +- .../optimaize/OptimaizeLangDetectorTest.java | 147 +- .../tika/langdetect/LanguageDetectorTest.java | 27 +- .../langdetect/tika/LanguageIdentifier.java | 78 +- .../tika/langdetect/tika/LanguageProfile.java | 45 +- .../tika/LanguageProfilerBuilder.java | 84 +- .../tika/langdetect/tika/ProfilingWriter.java | 37 +- .../langdetect/tika/TikaLanguageDetector.java | 40 +- .../tika/LanguageIdentifierTest.java | 62 +- .../langdetect/tika/LanguageProfileTest.java | 23 +- .../tika/LanguageProfilerBuilderTest.java | 30 +- .../langdetect/tika/ProfilingHandler.java | 37 +- .../langdetect/tika/ProfilingWriterTest.java | 23 +- tika-parent/checkstyle.xml | 28 +- tika-parent/intellij-code-style.xml | 75 - tika-parent/pom.xml | 35 +- .../tika/parser/envi/EnviHeaderParser.java | 106 +- .../apache/tika/parser/gdal/GDALParser.java | 213 ++- .../geoinfo/GeographicInformationParser.java | 194 +- .../apache/tika/parser/grib/GribParser.java | 58 +- .../org/apache/tika/parser/hdf/HDFParser.java | 59 +- .../tika/parser/isatab/ISATabUtils.java | 69 +- .../tika/parser/isatab/ISArchiveParser.java | 52 +- .../tika/parser/netcdf/NetCDFParser.java | 61 +- .../parser/envi/EnviHeaderParserTest.java | 70 +- .../tika/parser/gdal/TestGDALParser.java | 38 +- .../GeographicInformationParserTest.java | 25 +- .../tika/parser/grib/GribParserTest.java | 31 +- .../apache/tika/parser/hdf/HDFParserTest.java | 28 +- 
.../parser/isatab/ISArchiveParserTest.java | 53 +- .../tika/parser/netcdf/NetCDFParserTest.java | 34 +- .../scientific/integration/TestParsers.java | 50 +- .../tika/parser/geopkg/GeoPkgDBParser.java | 35 +- .../tika/parser/geopkg/GeoPkgParser.java | 53 +- .../tika/parser/geopkg/GeoPkgTableReader.java | 49 +- .../tika/parser/sqlite3/SQLite3DBParser.java | 69 +- .../tika/parser/sqlite3/SQLite3Parser.java | 58 +- .../parser/sqlite3/SQLite3TableReader.java | 44 +- .../parser/sqlite3/SQLite3ParserTest.java | 206 +- .../tika/mime/TestMimeTypesExtended.java | 30 +- .../org/apache/tika/parser/ocr/TestOCR.java | 50 +- .../parser/sqlite3/SQLite3ParserTest.java | 114 +- .../ctakes/CTAKESAnnotationProperty.java | 31 +- .../tika/parser/ctakes/CTAKESConfig.java | 61 +- .../parser/ctakes/CTAKESContentHandler.java | 71 +- .../tika/parser/ctakes/CTAKESParser.java | 50 +- .../tika/parser/ctakes/CTAKESSerializer.java | 26 +- .../tika/parser/ctakes/CTAKESUtils.java | 138 +- .../tika/parser/geo/topic/GeoParser.java | 36 +- .../parser/geo/topic/GeoParserConfig.java | 23 +- .../apache/tika/parser/geo/topic/GeoTag.java | 26 +- .../parser/geo/topic/NameEntityExtractor.java | 43 +- .../topic/gazetteer/GeoGazetteerClient.java | 38 +- .../parser/geo/topic/gazetteer/Location.java | 22 +- .../tika/parser/journal/GrobidRESTParser.java | 54 +- .../tika/parser/journal/JournalParser.java | 30 +- .../tika/parser/journal/TEIDOMParser.java | 72 +- .../apache/tika/parser/ner/NERecogniser.java | 24 +- .../tika/parser/ner/NamedEntityParser.java | 72 +- .../ner/corenlp/CoreNLPNERecogniser.java | 57 +- .../parser/ner/grobid/GrobidNERecogniser.java | 90 +- .../parser/ner/mitie/MITIENERecogniser.java | 65 +- .../parser/ner/nltk/NLTKNERecogniser.java | 41 +- .../ner/opennlp/OpenNLPNERecogniser.java | 96 +- .../parser/ner/opennlp/OpenNLPNameFinder.java | 38 +- .../parser/ner/regex/RegexNERecogniser.java | 52 +- .../sentiment/SentimentAnalysisParser.java | 59 +- .../tika/parser/geo/topic/GeoParserTest.java | 69 
+- .../parser/journal/JournalParserTest.java | 28 +- .../apache/tika/parser/journal/TEITest.java | 75 +- .../parser/ner/NamedEntityParserTest.java | 50 +- .../parser/ner/nltk/NLTKNERecogniserTest.java | 30 +- .../ner/regex/RegexNERecogniserTest.java | 32 +- .../SentimentAnalysisParserTest.java | 38 +- .../transcribe/aws/AmazonTranscribe.java | 219 ++- .../transcribe/aws/AmazonTranscribeTest.java | 147 +- .../tika/detect/apple/BPListDetector.java | 70 +- .../tika/detect/apple/IWorkDetector.java | 32 +- .../parser/apple/AppleSingleFileParser.java | 70 +- .../apache/tika/parser/apple/PListParser.java | 90 +- .../parser/iwork/AutoPageNumberUtils.java | 34 +- .../tika/parser/iwork/IWorkPackageParser.java | 72 +- .../parser/iwork/KeynoteContentHandler.java | 31 +- .../parser/iwork/NumbersContentHandler.java | 32 +- .../parser/iwork/PagesContentHandler.java | 68 +- .../iwork/iwana/IWork13PackageParser.java | 117 +- .../iwork/iwana/IWork18PackageParser.java | 46 +- .../tika/detect/apple/IWorkDetectorTest.java | 29 +- .../tika/parser/apple/PListParserTest.java | 34 +- .../parser/iwork/AutoPageNumberUtilsTest.java | 34 +- .../tika/parser/iwork/IWorkParserTest.java | 89 +- .../parser/iwork/iwana/IWork13ParserTest.java | 57 +- .../apache/tika/detect/MatroskaDetector.java | 47 +- .../apache/tika/parser/audio/AudioParser.java | 47 +- .../apache/tika/parser/audio/MidiParser.java | 33 +- .../apache/tika/parser/mp3/AudioFrame.java | 48 +- .../tika/parser/mp3/CompositeTagHandler.java | 26 +- .../org/apache/tika/parser/mp3/ID3Tags.java | 209 +-- .../apache/tika/parser/mp3/ID3v1Handler.java | 56 +- .../apache/tika/parser/mp3/ID3v22Handler.java | 35 +- .../apache/tika/parser/mp3/ID3v23Handler.java | 32 +- .../apache/tika/parser/mp3/ID3v24Handler.java | 29 +- .../apache/tika/parser/mp3/ID3v2Frame.java | 114 +- .../apache/tika/parser/mp3/LyricsHandler.java | 65 +- .../org/apache/tika/parser/mp3/MP3Frame.java | 25 +- .../org/apache/tika/parser/mp3/Mp3Parser.java | 50 +- 
.../apache/tika/parser/mp3/MpegStream.java | 127 +- .../org/apache/tika/parser/mp4/MP4Parser.java | 135 +- .../tika/parser/mp4/TikaMp4BoxHandler.java | 39 +- .../parser/mp4/boxes/TikaUserDataBox.java | 130 +- .../apache/tika/parser/video/FLVParser.java | 68 +- .../tika/detect/MatroskaDetectorTest.java | 54 +- .../tika/parser/audio/AudioParserTest.java | 37 +- .../tika/parser/audio/MidiParserTest.java | 26 +- .../apache/tika/parser/mp3/Mp3ParserTest.java | 95 +- .../tika/parser/mp3/MpegStreamTest.java | 34 +- .../apache/tika/parser/mp4/MP4ParserTest.java | 231 +-- .../tika/parser/video/FLVParserTest.java | 29 +- .../apache/tika/parser/dgn/DGN8Parser.java | 33 +- .../tika/parser/dwg/AbstractDWGParser.java | 42 +- .../org/apache/tika/parser/dwg/DWGParser.java | 105 +- .../tika/parser/dwg/DWGParserConfig.java | 36 +- .../tika/parser/dwg/DWGReadFormatRemover.java | 51 +- .../apache/tika/parser/dwg/DWGReadParser.java | 152 +- .../tika/parser/dwg/JulianDateUtil.java | 29 +- .../org/apache/tika/parser/prt/PRTParser.java | 64 +- .../tika/parser/dgn/DGN8ParserTest.java | 28 +- .../apache/tika/parser/dwg/DWGParserTest.java | 91 +- .../parser/dwg/DWGReadFormatRemoverTest.java | 42 +- .../apache/tika/parser/prt/PRTParserTest.java | 32 +- .../apache/tika/parser/asm/ClassParser.java | 32 +- .../tika/parser/asm/XHTMLClassVisitor.java | 57 +- .../tika/parser/code/SourceCodeParser.java | 90 +- .../parser/executable/ExecutableParser.java | 81 +- .../executable/UniversalExecutableParser.java | 89 +- .../org/apache/tika/parser/mat/MatParser.java | 75 +- .../tika/parser/sas/SAS7BDATParser.java | 59 +- .../tika/parser/asm/ClassParserTest.java | 43 +- .../parser/code/SourceCodeParserTest.java | 62 +- .../executable/ExecutableParserTest.java | 31 +- .../UniversalExecutableParserTest.java | 32 +- .../apache/tika/parser/mat/MatParserTest.java | 26 +- .../tika/parser/sas/SAS7BDATParserTest.java | 37 +- .../tika/parser/crypto/Pkcs7Parser.java | 46 +- 
.../apache/tika/parser/crypto/TSDParser.java | 92 +- .../tika/parser/crypto/Pkcs7ParserTest.java | 26 +- .../tika/parser/crypto/TSDParserTest.java | 56 +- .../digestutils/BouncyCastleDigester.java | 35 +- .../parser/digestutils/CommonsDigester.java | 79 +- .../digestutils/CommonsDigesterFactory.java | 25 +- .../parser/font/AdobeFontMetricParser.java | 36 +- .../tika/parser/font/TrueTypeParser.java | 34 +- .../tika/parser/font/FontParsersTest.java | 39 +- .../tika/parser/html/DataURIScheme.java | 34 +- .../html/DataURISchemeParseException.java | 22 +- .../tika/parser/html/DataURISchemeUtil.java | 36 +- .../tika/parser/html/DefaultHtmlMapper.java | 88 +- .../parser/html/HtmlEncodingDetector.java | 88 +- .../apache/tika/parser/html/HtmlHandler.java | 125 +- .../apache/tika/parser/html/HtmlMapper.java | 67 +- .../tika/parser/html/IdentityHtmlMapper.java | 25 +- .../apache/tika/parser/html/JSoupParser.java | 93 +- .../parser/html/XHTMLDowngradeHandler.java | 48 +- .../html/charsetdetector/CharsetAliases.java | 79 +- .../CharsetDetectionResult.java | 22 +- .../html/charsetdetector/MetaProcessor.java | 29 +- .../html/charsetdetector/PreScanner.java | 62 +- .../StandardHtmlEncodingDetector.java | 53 +- .../charsets/ReplacementCharset.java | 26 +- .../charsets/XUserDefinedCharset.java | 22 +- .../parser/html/DataURISchemeParserTest.java | 28 +- .../parser/html/HtmlEncodingDetectorTest.java | 57 +- .../tika/parser/html/HtmlParserTest.java | 623 +++--- .../apache/tika/parser/html/SrcDocTest.java | 30 +- .../StandardHtmlEncodingDetectorTest.java | 126 +- .../parser/image/AbstractImageParser.java | 57 +- .../apache/tika/parser/image/BPGParser.java | 61 +- .../apache/tika/parser/image/HeifParser.java | 35 +- .../apache/tika/parser/image/ICNSParser.java | 73 +- .../apache/tika/parser/image/ICNSType.java | 114 +- .../parser/image/ImageMetadataExtractor.java | 210 +-- .../apache/tika/parser/image/ImageParser.java | 91 +- .../apache/tika/parser/image/JXLParser.java | 34 +- 
.../apache/tika/parser/image/JpegParser.java | 33 +- .../tika/parser/image/MetadataFields.java | 33 +- .../apache/tika/parser/image/PSDParser.java | 68 +- .../apache/tika/parser/image/TiffParser.java | 33 +- .../apache/tika/parser/image/WebPParser.java | 32 +- .../tika/parser/image/HeifParserTest.java | 33 +- .../tika/parser/image/ICNSParserTest.java | 32 +- .../image/ImageMetadataExtractorTest.java | 70 +- .../tika/parser/image/ImageParserTest.java | 276 +-- .../org/apache/tika/parser/image/JXLTest.java | 27 +- .../tika/parser/image/JpegParserTest.java | 88 +- .../tika/parser/image/PSDParserTest.java | 36 +- .../tika/parser/image/WebPParserTest.java | 41 +- .../tika/parser/jdbc/AbstractDBParser.java | 76 +- .../tika/parser/jdbc/JDBCTableReader.java | 91 +- .../parser/mailcommons/MailDateParser.java | 656 +++---- .../tika/parser/mailcommons/MailUtil.java | 41 +- .../mailcommons/MailDateParserTest.java | 151 +- .../tika/parser/mailcommons/MailUtilTest.java | 31 +- .../tika/parser/mail/MailContentHandler.java | 179 +- .../apache/tika/parser/mail/RFC822Parser.java | 70 +- .../apache/tika/parser/mbox/MboxParser.java | 58 +- .../tika/parser/mail/RFC822ParserTest.java | 269 ++- .../tika/parser/mbox/MboxParserTest.java | 44 +- .../microsoft/POIFSContainerDetector.java | 189 +- .../microsoft/ooxml/OPCPackageDetector.java | 161 +- .../microsoft/MSEmbeddedStreamTranslator.java | 42 +- .../microsoft/PSTEmailStreamTranslator.java | 38 +- .../parser/microsoft/AbstractListManager.java | 99 +- .../microsoft/AbstractOfficeParser.java | 49 +- .../microsoft/AbstractPOIFSExtractor.java | 165 +- .../apache/tika/parser/microsoft/Cell.java | 32 +- .../tika/parser/microsoft/CellDecorator.java | 25 +- .../tika/parser/microsoft/EMFParser.java | 163 +- .../tika/parser/microsoft/ExcelExtractor.java | 220 +-- .../parser/microsoft/FormattingUtils.java | 45 +- .../tika/parser/microsoft/HSLFExtractor.java | 178 +- .../parser/microsoft/JackcessExtractor.java | 127 +- 
.../tika/parser/microsoft/JackcessParser.java | 74 +- .../tika/parser/microsoft/LinkedCell.java | 25 +- .../tika/parser/microsoft/ListManager.java | 79 +- .../parser/microsoft/MSOwnerFileParser.java | 45 +- .../tika/parser/microsoft/NumberCell.java | 26 +- .../tika/parser/microsoft/OfficeParser.java | 130 +- .../parser/microsoft/OfficeParserConfig.java | 102 +- .../tika/parser/microsoft/OldExcelParser.java | 47 +- .../parser/microsoft/OutlookExtractor.java | 372 ++-- .../parser/microsoft/SummaryExtractor.java | 48 +- .../tika/parser/microsoft/TNEFParser.java | 48 +- .../tika/parser/microsoft/TextCell.java | 25 +- .../microsoft/TikaExcelDataFormatter.java | 36 +- .../microsoft/TikaExcelGeneralFormat.java | 28 +- .../tika/parser/microsoft/WMFParser.java | 58 +- .../tika/parser/microsoft/WordExtractor.java | 186 +- .../activemime/ActiveMimeParser.java | 54 +- .../parser/microsoft/chm/ChmAccessor.java | 25 +- .../tika/parser/microsoft/chm/ChmAssert.java | 66 +- .../parser/microsoft/chm/ChmBlockInfo.java | 54 +- .../tika/parser/microsoft/chm/ChmCommons.java | 68 +- .../parser/microsoft/chm/ChmConstants.java | 62 +- .../microsoft/chm/ChmDirectoryListingSet.java | 176 +- .../parser/microsoft/chm/ChmExtractor.java | 155 +- .../parser/microsoft/chm/ChmItsfHeader.java | 95 +- .../parser/microsoft/chm/ChmItspHeader.java | 237 +-- .../parser/microsoft/chm/ChmLzxBlock.java | 279 +-- .../parser/microsoft/chm/ChmLzxState.java | 110 +- .../microsoft/chm/ChmLzxcControlData.java | 95 +- .../microsoft/chm/ChmLzxcResetTable.java | 62 +- .../tika/parser/microsoft/chm/ChmParser.java | 51 +- .../microsoft/chm/ChmParsingException.java | 22 +- .../parser/microsoft/chm/ChmPmgiHeader.java | 71 +- .../parser/microsoft/chm/ChmPmglHeader.java | 96 +- .../tika/parser/microsoft/chm/ChmSection.java | 55 +- .../tika/parser/microsoft/chm/ChmWrapper.java | 22 +- .../microsoft/chm/DirectoryListingEntry.java | 53 +- .../parser/microsoft/libpst/EmailVisitor.java | 49 +- 
.../parser/microsoft/libpst/LibPstParser.java | 111 +- .../microsoft/libpst/LibPstParserConfig.java | 32 +- .../msg/ExtendedMetadataExtractor.java | 122 +- .../tika/parser/microsoft/msg/MAPITag.java | 26 +- .../microsoft/msg/TikaNameIdChunks.java | 124 +- .../onenote/CheckedFileNodePushBack.java | 26 +- .../parser/microsoft/onenote/CompactID.java | 26 +- .../tika/parser/microsoft/onenote/Error.java | 25 +- .../microsoft/onenote/ExtendedGUID.java | 23 +- .../microsoft/onenote/FileChunkReference.java | 62 +- .../onenote/FileDataStoreObject.java | 24 +- .../onenote/FileDataStoreObjectReference.java | 22 +- .../parser/microsoft/onenote/FileNode.java | 133 +- .../microsoft/onenote/FileNodeList.java | 22 +- .../microsoft/onenote/FileNodeListHeader.java | 57 +- .../parser/microsoft/onenote/FileNodePtr.java | 28 +- .../onenote/FileNodePtrBackPush.java | 22 +- .../microsoft/onenote/FileNodeUnion.java | 44 +- .../onenote/FndStructureConstants.java | 25 +- .../tika/parser/microsoft/onenote/GUID.java | 31 +- .../onenote/GlobalIdTableEntry2FNDX.java | 22 +- .../onenote/GlobalIdTableEntry3FNDX.java | 22 +- .../onenote/GlobalIdTableEntryFNDX.java | 22 +- .../onenote/GlobalIdTableStartFNDX.java | 22 +- .../parser/microsoft/onenote/IndentUtil.java | 22 +- .../tika/parser/microsoft/onenote/Int24.java | 22 +- .../tika/parser/microsoft/onenote/JCID.java | 88 +- .../onenote/JCIDPropertySetTypeEnum.java | 28 +- .../ObjectDeclarationWithRefCount.java | 22 +- .../ObjectDeclarationWithRefCountBody.java | 24 +- .../ObjectInfoDependencyOverrideData.java | 22 +- .../ObjectInfoDependencyOverrides.java | 22 +- .../ObjectRevisionWithRefCountFNDX.java | 22 +- .../onenote/ObjectSpaceObjectPropSet.java | 30 +- ...ceObjectStreamOfOIDsOSIDsOrContextIDs.java | 26 +- .../onenote/ObjectStreamCounters.java | 22 +- .../onenote/OneNoteDirectFileResource.java | 29 +- .../microsoft/onenote/OneNoteDocument.java | 29 +- .../microsoft/onenote/OneNoteHeader.java | 32 +- .../onenote/OneNoteLegacyDumpStrings.java 
| 48 +- .../microsoft/onenote/OneNoteParser.java | 165 +- .../onenote/OneNotePropertyEnum.java | 24 +- .../microsoft/onenote/OneNotePropertyId.java | 29 +- .../parser/microsoft/onenote/OneNotePtr.java | 450 +++-- .../microsoft/onenote/OneNoteTreeWalker.java | 220 ++- .../onenote/OneNoteTreeWalkerOptions.java | 30 +- .../microsoft/onenote/PropertyIDType.java | 22 +- .../parser/microsoft/onenote/PropertySet.java | 67 +- .../microsoft/onenote/PropertyValue.java | 45 +- .../parser/microsoft/onenote/Revision.java | 22 +- .../microsoft/onenote/RevisionManifest.java | 24 +- .../onenote/RevisionManifestListStart.java | 22 +- .../onenote/RevisionRoleDeclaration.java | 22 +- .../onenote/RootObjectReference.java | 24 +- .../onenote/RootObjectReferenceBase.java | 22 +- .../fsshttpb/IFSSHTTPBSerializable.java | 23 +- .../onenote/fsshttpb/MSOneStorePackage.java | 117 +- .../onenote/fsshttpb/MSOneStoreParser.java | 151 +- .../DataElementParseErrorException.java | 22 +- .../fsshttpb/property/ArrayNumber.java | 28 +- .../fsshttpb/property/EightBytesOfData.java | 31 +- .../fsshttpb/property/FourBytesOfData.java | 31 +- .../onenote/fsshttpb/property/IProperty.java | 27 +- .../onenote/fsshttpb/property/NoData.java | 24 +- .../fsshttpb/property/OneByteOfData.java | 30 +- .../property/PrtArrayOfPropertyValues.java | 31 +- .../PrtFourBytesOfLengthFollowedByData.java | 35 +- .../fsshttpb/property/TwoBytesOfData.java | 33 +- .../CellManifestCurrentRevision.java | 32 +- .../CellManifestDataElementData.java | 31 +- .../fsshttpb/streamobj/DataElement.java | 90 +- .../fsshttpb/streamobj/DataElementData.java | 27 +- .../fsshttpb/streamobj/DataElementHash.java | 32 +- .../streamobj/DataElementPackage.java | 34 +- .../fsshttpb/streamobj/DataHashObject.java | 40 +- .../fsshttpb/streamobj/DataSizeObject.java | 33 +- .../fsshttpb/streamobj/EncryptionObject.java | 22 +- .../fsshttpb/streamobj/FileDataObject.java | 22 +- .../streamobj/IntermediateNodeObject.java | 38 +- 
.../fsshttpb/streamobj/JCIDObject.java | 29 +- .../fsshttpb/streamobj/LeafNodeObject.java | 125 +- .../fsshttpb/streamobj/NodeObject.java | 23 +- .../fsshttpb/streamobj/ObjectGroupData.java | 59 +- .../streamobj/ObjectGroupDataElementData.java | 90 +- .../streamobj/ObjectGroupDeclarations.java | 61 +- .../streamobj/ObjectGroupMetadata.java | 42 +- .../ObjectGroupMetadataDeclarations.java | 49 +- .../ObjectGroupObjectBLOBDataDeclaration.java | 39 +- .../streamobj/ObjectGroupObjectData.java | 32 +- .../ObjectGroupObjectDataBLOBReference.java | 33 +- .../streamobj/ObjectGroupObjectDeclare.java | 38 +- .../fsshttpb/streamobj/PropertySet.java | 28 +- .../fsshttpb/streamobj/PropertySetObject.java | 29 +- .../fsshttpb/streamobj/RevisionManifest.java | 32 +- .../RevisionManifestDataElementData.java | 56 +- ...RevisionManifestObjectGroupReferences.java | 37 +- .../RevisionManifestRootDeclare.java | 32 +- .../streamobj/RevisionStoreObject.java | 25 +- .../streamobj/RevisionStoreObjectGroup.java | 67 +- .../fsshttpb/streamobj/SignatureObject.java | 36 +- .../streamobj/StorageIndexCellMapping.java | 36 +- .../StorageIndexDataElementData.java | 62 +- .../StorageIndexManifestMapping.java | 32 +- .../StorageIndexRevisionMapping.java | 36 +- .../StorageManifestDataElementData.java | 48 +- .../streamobj/StorageManifestRootDeclare.java | 32 +- .../streamobj/StorageManifestSchemaGUID.java | 35 +- .../fsshttpb/streamobj/StreamObject.java | 178 +- .../streamobj/StreamObjectHeaderEnd.java | 25 +- .../streamobj/StreamObjectHeaderEnd16bit.java | 55 +- .../streamobj/StreamObjectHeaderEnd8bit.java | 55 +- .../streamobj/StreamObjectHeaderStart.java | 41 +- .../StreamObjectHeaderStart16bit.java | 77 +- .../StreamObjectHeaderStart32bit.java | 70 +- .../StreamObjectParseErrorException.java | 36 +- .../streamobj/StreamObjectTypeHeaderEnd.java | 22 +- .../StreamObjectTypeHeaderStart.java | 22 +- .../streamobj/basic/AdapterHelper.java | 27 +- .../streamobj/basic/AlternativePackaging.java | 27 +- 
.../fsshttpb/streamobj/basic/BasicObject.java | 40 +- .../fsshttpb/streamobj/basic/BinaryItem.java | 30 +- .../fsshttpb/streamobj/basic/CellID.java | 41 +- .../fsshttpb/streamobj/basic/CellIDArray.java | 35 +- .../streamobj/basic/Compact64bitInt.java | 34 +- .../fsshttpb/streamobj/basic/CompactID.java | 28 +- .../streamobj/basic/DataElementType.java | 22 +- .../streamobj/basic/DataNodeObjectData.java | 31 +- .../fsshttpb/streamobj/basic/ExGUIDArray.java | 33 +- .../fsshttpb/streamobj/basic/ExGuid.java | 33 +- .../fsshttpb/streamobj/basic/HeaderCell.java | 43 +- .../fsshttpb/streamobj/basic/JCID.java | 28 +- .../fsshttpb/streamobj/basic/PropertyID.java | 28 +- .../streamobj/basic/PropertyType.java | 48 +- .../streamobj/basic/RequestTypes.java | 22 +- .../streamobj/basic/SerialNumber.java | 44 +- .../fsshttpb/streamobj/basic/ZipHeader.java | 29 +- .../streamobj/chunking/AbstractChunking.java | 23 +- .../streamobj/chunking/ChunkingFactory.java | 42 +- .../streamobj/chunking/ChunkingMethod.java | 22 +- .../chunking/RDCAnalysisChunking.java | 152 +- .../streamobj/chunking/SimpleChunking.java | 33 +- .../streamobj/chunking/ZipFilesChunking.java | 53 +- .../space/ObjectSpaceObjectPropSet.java | 31 +- .../space/ObjectSpaceObjectStreamHeader.java | 32 +- .../ObjectSpaceObjectStreamOfContextIDs.java | 35 +- .../space/ObjectSpaceObjectStreamOfOIDs.java | 31 +- .../space/ObjectSpaceObjectStreamOfOSIDs.java | 31 +- .../onenote/fsshttpb/unsigned/UByte.java | 82 +- .../onenote/fsshttpb/unsigned/UInteger.java | 85 +- .../onenote/fsshttpb/unsigned/ULong.java | 76 +- .../onenote/fsshttpb/unsigned/UMath.java | 25 +- .../onenote/fsshttpb/unsigned/UNumber.java | 27 +- .../onenote/fsshttpb/unsigned/UShort.java | 62 +- .../onenote/fsshttpb/unsigned/Unsigned.java | 89 +- .../microsoft/onenote/fsshttpb/util/Bit.java | 28 +- .../onenote/fsshttpb/util/BitConverter.java | 57 +- .../onenote/fsshttpb/util/BitReader.java | 44 +- .../onenote/fsshttpb/util/BitWriter.java | 35 +- 
.../onenote/fsshttpb/util/ByteUtil.java | 22 +- .../fsshttpb/util/DataElementUtils.java | 291 ++- .../onenote/fsshttpb/util/GuidUtil.java | 22 +- .../util/LittleEndianBitConverter.java | 63 +- .../util/SequenceNumberGenerator.java | 23 +- .../onenote/fsshttpb/util/UuidUtils.java | 22 +- .../ooxml/AbstractOOXMLExtractor.java | 237 ++- .../microsoft/ooxml/CommentPersonHandler.java | 37 +- .../microsoft/ooxml/EmbeddedPartMetadata.java | 29 +- .../microsoft/ooxml/MetadataExtractor.java | 76 +- .../microsoft/ooxml/OOXMLExtractor.java | 40 +- .../ooxml/OOXMLExtractorFactory.java | 103 +- .../parser/microsoft/ooxml/OOXMLParser.java | 136 +- .../ooxml/OOXMLTikaBodyPartHandler.java | 74 +- .../OOXMLWordAndPowerPointTextHandler.java | 146 +- .../microsoft/ooxml/OPCPackageWrapper.java | 33 +- .../ooxml/POIXMLTextExtractorDecorator.java | 28 +- .../microsoft/ooxml/ParagraphProperties.java | 22 +- .../parser/microsoft/ooxml/RunProperties.java | 27 +- .../SXSLFPowerPointExtractorDecorator.java | 177 +- .../ooxml/SXWPFWordExtractorDecorator.java | 149 +- .../XSLFPowerPointExtractorDecorator.java | 122 +- .../ooxml/XSSFBExcelExtractorDecorator.java | 63 +- .../ooxml/XSSFExcelExtractorDecorator.java | 228 ++- .../microsoft/ooxml/XWPFListManager.java | 59 +- .../ooxml/XWPFWordExtractorDecorator.java | 180 +- .../ooxml/xps/XPSExtractorDecorator.java | 75 +- .../ooxml/xps/XPSPageContentHandler.java | 180 +- .../microsoft/ooxml/xps/XPSTextExtractor.java | 28 +- .../XSLFEventBasedPowerPointExtractor.java | 44 +- .../xwpf/XWPFEventBasedWordExtractor.java | 104 +- .../ooxml/xwpf/XWPFFeatureExtractor.java | 58 +- .../ooxml/xwpf/XWPFNumberingShim.java | 23 +- .../microsoft/ooxml/xwpf/XWPFStylesShim.java | 47 +- .../xwpf/ml2006/AbstractPartHandler.java | 27 +- .../ooxml/xwpf/ml2006/BinaryDataHandler.java | 35 +- .../xwpf/ml2006/CorePropertiesHandler.java | 41 +- .../ml2006/ExtendedPropertiesHandler.java | 25 +- .../ooxml/xwpf/ml2006/PartHandler.java | 25 +- 
.../ooxml/xwpf/ml2006/Relationship.java | 22 +- .../xwpf/ml2006/RelationshipsHandler.java | 46 +- .../xwpf/ml2006/RelationshipsManager.java | 25 +- .../xwpf/ml2006/Word2006MLDocHandler.java | 93 +- .../ooxml/xwpf/ml2006/Word2006MLParser.java | 44 +- .../WordAndPowerPointTextPartHandler.java | 41 +- .../microsoft/pst/OutlookPSTParser.java | 67 +- .../microsoft/pst/PSTMailItemParser.java | 141 +- .../tika/parser/microsoft/rtf/GroupState.java | 44 +- .../parser/microsoft/rtf/ListDescriptor.java | 22 +- .../microsoft/rtf/RTFEmbObjHandler.java | 77 +- .../microsoft/rtf/RTFObjDataParser.java | 109 +- .../tika/parser/microsoft/rtf/RTFParser.java | 44 +- .../parser/microsoft/rtf/TextExtractor.java | 318 ++-- .../microsoft/xml/AbstractXML2003Parser.java | 84 +- .../microsoft/xml/HyperlinkHandler.java | 40 +- .../microsoft/xml/SpreadsheetMLParser.java | 48 +- .../parser/microsoft/xml/WordMLParser.java | 110 +- .../microsoft/POIFSContainerDetectorTest.java | 51 +- .../AbstractPOIContainerExtractionTest.java | 43 +- .../tika/parser/microsoft/EMFParserTest.java | 42 +- .../parser/microsoft/ExcelParserTest.java | 114 +- .../parser/microsoft/JackcessParserTest.java | 87 +- .../microsoft/MSOwnerFileParserTest.java | 25 +- .../tika/parser/microsoft/OLE2CasingTest.java | 32 +- .../parser/microsoft/OfficeParserTest.java | 26 +- .../parser/microsoft/OldExcelParserTest.java | 31 +- .../parser/microsoft/OutlookParserTest.java | 135 +- .../microsoft/POIContainerExtractionTest.java | 71 +- .../microsoft/PowerPointParserTest.java | 113 +- .../parser/microsoft/ProjectParserTest.java | 45 +- .../parser/microsoft/PublisherParserTest.java | 30 +- .../microsoft/SolidworksParserTest.java | 68 +- .../tika/parser/microsoft/TNEFParserTest.java | 32 +- .../parser/microsoft/VisioParserTest.java | 30 +- .../tika/parser/microsoft/WMFParserTest.java | 31 +- .../tika/parser/microsoft/WordParserTest.java | 181 +- .../microsoft/WriteProtectedParserTest.java | 30 +- .../activemime/ActiveMimeParserTest.java 
| 35 +- .../microsoft/chm/TestChmBlockInfo.java | 58 +- .../microsoft/chm/TestChmExtraction.java | 86 +- .../microsoft/chm/TestChmExtractor.java | 28 +- .../microsoft/chm/TestChmItsfHeader.java | 24 +- .../microsoft/chm/TestChmItspHeader.java | 33 +- .../parser/microsoft/chm/TestChmLzxState.java | 49 +- .../microsoft/chm/TestChmLzxcControlData.java | 49 +- .../microsoft/chm/TestChmLzxcResetTable.java | 51 +- .../chm/TestDirectoryListingEntry.java | 24 +- .../parser/microsoft/chm/TestParameters.java | 28 +- .../parser/microsoft/chm/TestPmgiHeader.java | 22 +- .../parser/microsoft/chm/TestPmglHeader.java | 30 +- .../microsoft/libpst/TestLibPstParser.java | 78 +- .../microsoft/onenote/OneNoteParserTest.java | 188 +- .../ooxml/OOXMLContainerExtractionTest.java | 153 +- .../microsoft/ooxml/OOXMLParserTest.java | 468 +++-- .../microsoft/ooxml/SXSLFExtractorTest.java | 203 +- .../microsoft/ooxml/SXWPFExtractorTest.java | 228 ++- .../microsoft/ooxml/TruncatedOOXMLTest.java | 40 +- .../microsoft/ooxml/xps/XPSParserTest.java | 76 +- .../xwpf/ml2006/Word2006MLParserTest.java | 80 +- .../microsoft/pst/OutlookPSTParserTest.java | 62 +- .../parser/microsoft/rtf/RTFParserTest.java | 115 +- .../microsoft/xml/XML2003ParserTest.java | 37 +- .../tika/detect/ole/MiscOLEDetector.java | 80 +- .../org/apache/tika/parser/dbf/DBFCell.java | 61 +- .../tika/parser/dbf/DBFColumnHeader.java | 67 +- .../apache/tika/parser/dbf/DBFFileHeader.java | 51 +- .../org/apache/tika/parser/dbf/DBFParser.java | 49 +- .../org/apache/tika/parser/dbf/DBFReader.java | 63 +- .../org/apache/tika/parser/dbf/DBFRow.java | 27 +- .../tika/parser/dif/DIFContentHandler.java | 40 +- .../org/apache/tika/parser/dif/DIFParser.java | 40 +- .../tika/parser/epub/EpubContentParser.java | 34 +- .../apache/tika/parser/epub/EpubParser.java | 169 +- .../apache/tika/parser/epub/OPFParser.java | 41 +- .../tika/parser/hwp/HwpStreamReader.java | 29 +- .../tika/parser/hwp/HwpTextExtractorV5.java | 74 +- 
.../apache/tika/parser/hwp/HwpV5Parser.java | 32 +- .../indesign/ContentAndMetadataExtractor.java | 48 +- .../tika/parser/indesign/IDMLParser.java | 48 +- .../tika/parser/mif/MIFContentHandler.java | 33 +- .../apache/tika/parser/mif/MIFExtractor.java | 36 +- .../org/apache/tika/parser/mif/MIFParser.java | 43 +- .../odf/FlatOpenDocumentMacroHandler.java | 42 +- .../parser/odf/FlatOpenDocumentParser.java | 67 +- .../odf/NSNormalizerContentHandler.java | 45 +- .../parser/odf/OpenDocumentBodyHandler.java | 143 +- .../parser/odf/OpenDocumentContentParser.java | 36 +- .../parser/odf/OpenDocumentMacroHandler.java | 36 +- .../odf/OpenDocumentManifestHandler.java | 42 +- .../parser/odf/OpenDocumentMetaParser.java | 89 +- .../tika/parser/odf/OpenDocumentParser.java | 193 +- .../parser/wordperfect/QPWTextExtractor.java | 117 +- .../parser/wordperfect/QuattroProParser.java | 38 +- .../tika/parser/wordperfect/WP5Charsets.java | 334 ++-- .../wordperfect/WP5DocumentAreaExtractor.java | 81 +- .../tika/parser/wordperfect/WP6Charsets.java | 839 +++++---- .../wordperfect/WP6DocumentAreaExtractor.java | 85 +- .../wordperfect/WPDocumentAreaExtractor.java | 36 +- .../parser/wordperfect/WPInputStream.java | 38 +- .../tika/parser/wordperfect/WPPrefixArea.java | 36 +- .../wordperfect/WPPrefixAreaExtractor.java | 52 +- .../parser/wordperfect/WordPerfectParser.java | 57 +- .../apache/tika/parser/dbf/DBFParserTest.java | 83 +- .../apache/tika/parser/dif/DIFParserTest.java | 28 +- .../tika/parser/epub/EpubParserTest.java | 71 +- .../tika/parser/hwp/HwpV5ParserTest.java | 31 +- .../tika/parser/ibooks/iBooksParserTest.java | 56 +- .../tika/parser/indesign/IDMLParserTest.java | 31 +- .../apache/tika/parser/mif/MIFParserTest.java | 25 +- .../apache/tika/parser/odf/ODFParserTest.java | 166 +- .../odf/OpenDocumentContentParserTest.java | 43 +- .../parser/wordperfect/QuattroProTest.java | 29 +- .../parser/wordperfect/WPInputStreamTest.java | 41 +- .../parser/wordperfect/WordPerfectTest.java | 32 +- 
.../apache/tika/parser/feed/FeedParser.java | 55 +- .../tika/parser/iptc/IptcAnpaParser.java | 425 +++-- .../tika/parser/feed/FeedParserTest.java | 42 +- .../tika/parser/ocr/ImagePreprocessor.java | 49 +- .../tika/parser/ocr/TesseractOCRConfig.java | 180 +- .../tika/parser/ocr/TesseractOCRParser.java | 328 ++-- .../tika/parser/ocr/tess4j/ImageDeskew.java | 34 +- .../tika/parser/ocr/tess4j/ImageUtil.java | 38 +- .../parser/ocr/TesseractOCRConfigTest.java | 55 +- .../parser/ocr/TesseractOCRParserTest.java | 130 +- .../tika/parser/pdf/AbstractPDF2XHTML.java | 459 ++--- .../apache/tika/parser/pdf/AccessChecker.java | 46 +- .../org/apache/tika/parser/pdf/OCR2XHTML.java | 66 +- .../tika/parser/pdf/OCRPageCounter.java | 26 +- .../org/apache/tika/parser/pdf/PDF2XHTML.java | 113 +- .../apache/tika/parser/pdf/PDFDOMUtil.java | 40 +- .../parser/pdf/PDFEncodedStringDecoder.java | 64 +- .../parser/pdf/PDFMarkedContent2XHTML.java | 211 +-- .../org/apache/tika/parser/pdf/PDFParser.java | 384 ++-- .../tika/parser/pdf/PDFParserConfig.java | 286 ++- .../tika/parser/pdf/PDMetadataExtractor.java | 167 +- .../apache/tika/parser/pdf/XFAExtractor.java | 115 +- .../parser/pdf/image/ImageGraphicsEngine.java | 145 +- .../pdf/image/ImageGraphicsEngineFactory.java | 38 +- .../pdf/updates/IncrementalUpdateRecord.java | 22 +- .../pdf/updates/IsIncrementalUpdate.java | 22 +- .../parser/pdf/updates/StartXRefOffset.java | 34 +- .../parser/pdf/updates/StartXRefScanner.java | 82 +- .../pdf/xmpschemas/XMPSchemaIllustrator.java | 25 +- .../parser/pdf/xmpschemas/XMPSchemaPDFUA.java | 22 +- .../parser/pdf/xmpschemas/XMPSchemaPDFVT.java | 24 +- .../parser/pdf/xmpschemas/XMPSchemaPDFX.java | 26 +- .../pdf/xmpschemas/XMPSchemaPDFXId.java | 22 +- .../renderer/pdf/mutool/MuPDFRenderer.java | 60 +- .../pdf/pdfbox/NoTextPDFRenderer.java | 35 +- .../pdf/pdfbox/PDDocumentRenderer.java | 27 +- .../renderer/pdf/pdfbox/PDFBoxRenderer.java | 85 +- .../pdf/pdfbox/PDFRenderingState.java | 22 +- 
.../pdf/pdfbox/TextOnlyPDFRenderer.java | 50 +- .../pdfbox/VectorGraphicsOnlyPDFRenderer.java | 65 +- .../tika/parser/pdf/AccessCheckerTest.java | 47 +- .../tika/parser/pdf/CustomTikaXMPTest.java | 40 +- .../MyCustomImageGraphicsEngineFactory.java | 36 +- .../parser/pdf/PDFIncrementalUpdatesTest.java | 85 +- .../pdf/PDFMarkedContent2XHTMLTest.java | 38 +- .../apache/tika/parser/pdf/PDFParserTest.java | 620 +++--- .../tika/parser/pdf/PDFRenderingTest.java | 44 +- .../gzip/GZipSpecializationDetector.java | 49 +- .../tika/parser/pkg/CompressorParser.java | 78 +- .../parser/pkg/CompressorParserOptions.java | 26 +- .../apache/tika/parser/pkg/PackageParser.java | 297 ++- .../org/apache/tika/parser/pkg/RarParser.java | 46 +- .../apache/tika/parser/pkg/UnrarParser.java | 84 +- .../tika/parser/pkg/AbstractPkgTest.java | 34 +- .../apache/tika/parser/pkg/ArParserTest.java | 31 +- .../tika/parser/pkg/Bzip2ParserTest.java | 31 +- .../tika/parser/pkg/CompressParserTest.java | 33 +- .../tika/parser/pkg/CompressorParserTest.java | 58 +- .../tika/parser/pkg/GzipParserTest.java | 31 +- .../tika/parser/pkg/PackageParserTest.java | 50 +- .../apache/tika/parser/pkg/RarParserTest.java | 37 +- .../tika/parser/pkg/Seven7ParserTest.java | 33 +- .../apache/tika/parser/pkg/TarParserTest.java | 33 +- .../tika/parser/pkg/UnrarParserTest.java | 32 +- .../apache/tika/parser/pkg/ZipParserTest.java | 74 +- .../tika/parser/pkg/ZlibParserTest.java | 31 +- .../org/apache/tika/parser/csv/CSVParams.java | 26 +- .../org/apache/tika/parser/csv/CSVResult.java | 33 +- .../apache/tika/parser/csv/CSVSniffer.java | 122 +- .../tika/parser/csv/TextAndCSVConfig.java | 24 +- .../tika/parser/csv/TextAndCSVParser.java | 155 +- .../parser/strings/Latin1StringsParser.java | 96 +- .../tika/parser/strings/StringsConfig.java | 43 +- .../tika/parser/strings/StringsEncoding.java | 30 +- .../tika/parser/strings/StringsParser.java | 68 +- .../apache/tika/parser/txt/BOMDetector.java | 41 +- 
.../tika/parser/txt/CharsetDetector.java | 259 ++- .../apache/tika/parser/txt/CharsetMatch.java | 129 +- .../tika/parser/txt/CharsetRecog_2022.java | 85 +- .../tika/parser/txt/CharsetRecog_UTF8.java | 25 +- .../tika/parser/txt/CharsetRecog_Unicode.java | 36 +- .../tika/parser/txt/CharsetRecog_mbcs.java | 288 +-- .../tika/parser/txt/CharsetRecog_sbcs.java | 1663 +++++++++-------- .../tika/parser/txt/CharsetRecognizer.java | 35 +- .../parser/txt/Icu4jEncodingDetector.java | 39 +- .../org/apache/tika/parser/txt/TXTParser.java | 46 +- .../parser/txt/UniversalEncodingDetector.java | 26 +- .../parser/txt/UniversalEncodingListener.java | 42 +- .../tika/parser/csv/CSVSnifferTest.java | 78 +- .../tika/parser/csv/TextAndCSVParserTest.java | 115 +- .../strings/Latin1StringsParserTest.java | 33 +- .../parser/strings/StringsConfigTest.java | 46 +- .../parser/strings/StringsParserTest.java | 28 +- .../tika/parser/txt/BOMDetectorTest.java | 67 +- .../tika/parser/txt/CharsetDetectorTest.java | 53 +- .../apache/tika/parser/txt/TXTParserTest.java | 84 +- .../apache/tika/parser/http/HttpParser.java | 51 +- .../apache/tika/parser/wacz/WACZParser.java | 72 +- .../apache/tika/parser/warc/WARCParser.java | 77 +- .../tika/parser/http/HttpParserTest.java | 28 +- .../tika/parser/wacz/WACZParserTest.java | 28 +- .../tika/parser/warc/WARCParserTest.java | 59 +- .../tika/parser/tmx/TMXContentHandler.java | 49 +- .../org/apache/tika/parser/tmx/TMXParser.java | 33 +- .../parser/xliff/XLIFF12ContentHandler.java | 29 +- .../tika/parser/xliff/XLIFF12Parser.java | 32 +- .../apache/tika/parser/xliff/XLZParser.java | 38 +- .../parser/xml/AbstractMetadataHandler.java | 32 +- .../AttributeDependantMetadataHandler.java | 33 +- .../parser/xml/AttributeMetadataHandler.java | 38 +- .../apache/tika/parser/xml/DcXMLParser.java | 57 +- .../parser/xml/ElementMetadataHandler.java | 91 +- .../tika/parser/xml/FictionBookParser.java | 50 +- .../tika/parser/xml/MetadataHandler.java | 37 +- 
.../parser/xml/TextAndAttributeXMLParser.java | 27 +- .../org/apache/tika/parser/xml/XMLParser.java | 41 +- .../apache/tika/parser/xml/XMLProfiler.java | 73 +- .../apache/tika/parser/tmx/TMXParserTest.java | 28 +- .../tika/parser/xliff/XLIFF12ParserTest.java | 28 +- .../tika/parser/xliff/XLZParserTest.java | 30 +- .../tika/parser/xml/DcXMLParserTest.java | 36 +- ...mptyAndDuplicateElementsXMLParserTest.java | 50 +- .../parser/xml/FictionBookParserTest.java | 30 +- .../xml/TextAndAttributeXMLParserTest.java | 30 +- .../tika/parser/xmp/JempboxExtractor.java | 76 +- .../tika/parser/xmp/XMPMetadataExtractor.java | 52 +- .../tika/parser/xmp/XMPPacketScanner.java | 53 +- .../tika/parser/xmp/JempboxExtractorTest.java | 53 +- .../tika/detect/zip/CompressorConstants.java | 23 +- .../zip/DefaultZipContainerDetector.java | 145 +- ...precatedStreamingZipContainerDetector.java | 190 +- .../zip/DeprecatedZipContainerDetector.java | 37 +- .../zip/FrictionlessPackageDetector.java | 40 +- .../apache/tika/detect/zip/IPADetector.java | 26 +- .../apache/tika/detect/zip/JarDetector.java | 32 +- .../apache/tika/detect/zip/KMZDetector.java | 26 +- .../tika/detect/zip/OpenDocumentDetector.java | 38 +- .../tika/detect/zip/PackageConstants.java | 23 +- .../tika/detect/zip/StarOfficeDetector.java | 66 +- .../detect/zip/StreamingDetectContext.java | 40 +- .../zip/StreamingZipContainerDetector.java | 35 +- .../tika/detect/zip/ZipContainerDetector.java | 44 +- .../detect/zip/ZipContainerDetectorBase.java | 25 +- .../apache/tika/zip/utils/ZipSalvager.java | 70 +- .../zip/FrictionlessDataDetectionTest.java | 29 +- .../tika/detect/zip/ZipDetectionTest.java | 54 +- .../tika/parser/internal/Activator.java | 32 +- .../tika/config/TikaConfigSerializerTest.java | 70 +- .../tika/config/TikaDetectorConfigTest.java | 51 +- .../tika/config/TikaEncodingDetectorTest.java | 85 +- .../tika/config/TikaParserConfigTest.java | 36 +- .../tika/config/TikaTranslatorConfigTest.java | 34 +- 
.../detect/TestContainerAwareDetector.java | 317 ++-- .../tika/detect/TestDetectorLoading.java | 32 +- .../tika/detect/TestFileCommandDetector.java | 32 +- .../apache/tika/detect/TestZipDetector.java | 72 +- .../extractor/EmbeddedDocumentUtilTest.java | 27 +- .../org/apache/tika/mime/MimeTypeTest.java | 22 +- .../org/apache/tika/mime/MimeTypesTest.java | 22 +- .../org/apache/tika/mime/OneOffMimeTest.java | 40 +- .../org/apache/tika/mime/TestMimeTypes.java | 342 ++-- .../tika/ossfuzz/OssFuzzReplicator.java | 27 +- .../org/apache/tika/ossfuzz/ParserFuzzer.java | 48 +- .../parser/AutoDetectParserConfigTest.java | 98 +- .../tika/parser/AutoDetectParserTest.java | 185 +- .../parser/AutoDetectReaderParserTest.java | 42 +- .../BouncyCastleDigestingParserTest.java | 117 +- .../tika/parser/DigestingParserTest.java | 151 +- .../apache/tika/parser/ParsingReaderTest.java | 28 +- .../parser/RecursiveParserWrapperTest.java | 195 +- .../tika/parser/TabularFormatsTest.java | 192 +- .../org/apache/tika/parser/TestParsers.java | 72 +- .../tika/parser/TestXMLEntityExpansion.java | 90 +- .../org/apache/tika/parser/TestXXEInXML.java | 124 +- .../org/apache/tika/parser/XMLTestBase.java | 53 +- .../apple/AppleSingleFileParserTest.java | 32 +- .../apache/tika/parser/apple/IWorkTest.java | 26 +- .../tika/parser/apple/PListParserTest.java | 26 +- .../tika/parser/crypto/TSDParserTest.java | 32 +- .../fork/ForkParserIntegrationTest.java | 122 +- .../tika/parser/html/HtmlParserTest.java | 34 +- .../tika/parser/image/JpegParserTest.java | 35 +- .../tika/parser/mail/MboxParserTest.java | 30 +- .../tika/parser/mail/RFC822ParserTest.java | 58 +- .../tika/parser/microsoft/EMFParserTest.java | 36 +- .../parser/microsoft/ExcelParserTest.java | 26 +- .../microsoft/POIContainerExtractionTest.java | 54 +- .../microsoft/PowerPointParserTest.java | 28 +- .../parser/microsoft/XML2003ParserTest.java | 45 +- .../microsoft/ooxml/OOXMLParserTest.java | 71 +- .../microsoft/ooxml/SXWPFExtractorTest.java | 63 +- 
.../microsoft/ooxml/TruncatedOOXMLTest.java | 55 +- .../parser/microsoft/rtf/RTFParserTest.java | 82 +- .../tika/parser/mock/MockParserTest.java | 70 +- .../parser/ocr/TesseractOCRParserTest.java | 114 +- .../apache/tika/parser/odf/ODFParserTest.java | 77 +- .../apache/tika/parser/pdf/PDFParserTest.java | 208 +-- .../apache/tika/parser/pkg/ArParserTest.java | 28 +- .../tika/parser/pkg/Bzip2ParserTest.java | 28 +- .../CompositeZipContainerDetectorTest.java | 140 +- .../tika/parser/pkg/CompressParserTest.java | 28 +- .../tika/parser/pkg/CompressorParserTest.java | 30 +- .../tika/parser/pkg/GzipParserTest.java | 42 +- .../tika/parser/pkg/PackageParserTest.java | 26 +- .../apache/tika/parser/pkg/RarParserTest.java | 30 +- .../tika/parser/pkg/Seven7ParserTest.java | 54 +- .../apache/tika/parser/pkg/TarParserTest.java | 28 +- .../tika/parser/pkg/UnrarParserTest.java | 42 +- .../apache/tika/parser/pkg/ZipParserTest.java | 48 +- .../tika/parser/pkg/ZlibParserTest.java | 28 +- .../parser/xml/FictionBookParserTest.java | 28 +- .../tika/sax/BoilerpipeHandlerTest.java | 55 +- ...oublingContentHandlerDecoratorFactory.java | 32 +- .../PhoneExtractingContentHandlerTest.java | 41 +- ...StandardsExtractingContentHandlerTest.java | 30 +- ...pcasingContentHandlerDecoratorFactory.java | 30 +- .../tika/utils/ServiceLoaderUtilsTest.java | 33 +- .../tika/async/cli/SimpleAsyncConfig.java | 31 +- .../apache/tika/async/cli/TikaAsyncCLI.java | 120 +- .../tika/async/cli/TikaConfigAsyncWriter.java | 87 +- .../tika/async/cli/AsyncCliParserTest.java | 49 +- .../tika/async/cli/AsyncProcessorTest.java | 76 +- .../tika/async/cli/TikaAsyncCLITest.java | 34 +- .../async/cli/TikaConfigAsyncWriterTest.java | 51 +- .../pipes/emitter/azblob/AZBlobEmitter.java | 100 +- .../emitter/azblob/TestAZBlobEmitter.java | 33 +- .../tika/pipes/emitter/gcs/GCSEmitter.java | 85 +- .../pipes/emitter/gcs/TestGCSEmitter.java | 33 +- .../tika/pipes/emitter/jdbc/JDBCEmitter.java | 183 +- 
.../pipes/emitter/jdbc/JDBCEmitterTest.java | 80 +- .../pipes/emitter/kafka/KafkaEmitter.java | 61 +- .../emitter/opensearch/JsonResponse.java | 29 +- .../emitter/opensearch/OpenSearchClient.java | 150 +- .../emitter/opensearch/OpenSearchEmitter.java | 87 +- .../opensearch/OpenSearchClientTest.java | 38 +- .../tika/pipes/emitter/s3/S3Emitter.java | 156 +- .../tika/pipes/emitter/solr/SolrEmitter.java | 117 +- .../emitter/solr/SolrEmitterDevTest.java | 33 +- .../pipes/fetcher/azblob/AZBlobFetcher.java | 74 +- .../azblob/config/AZBlobFetcherConfig.java | 22 +- .../fetcher/azblob/TestAZBlobFetcher.java | 38 +- .../tika/pipes/fetcher/gcs/GCSFetcher.java | 49 +- .../fetcher/gcs/config/GCSFetcherConfig.java | 22 +- .../tika/pipes/fetcher/s3/TestGCSFetcher.java | 34 +- .../tika/pipes/fetcher/http/HttpFetcher.java | 243 ++- .../http/config/HttpFetcherConfig.java | 23 +- .../fetcher/http/config/HttpHeaders.java | 32 +- .../tika/pipes/fetcher/http/jwt/JwtCreds.java | 22 +- .../pipes/fetcher/http/jwt/JwtGenerator.java | 49 +- .../fetcher/http/jwt/JwtPrivateKeyCreds.java | 28 +- .../fetcher/http/jwt/JwtSecretCreds.java | 23 +- .../pipes/fetcher/http/HttpFetcherTest.java | 108 +- .../fetcher/http/config/HttpHeadersTest.java | 28 +- .../fetcher/http/jwt/JwtGeneratorTest.java | 39 +- .../microsoftgraph/MicrosoftGraphFetcher.java | 69 +- .../config/AadCredentialConfigBase.java | 22 +- .../Client2CertificateCredentialsConfig.java | 22 +- .../ClientCertificateCredentialsConfig.java | 25 +- .../config/ClientSecretCredentialsConfig.java | 25 +- .../config/MicrosoftGraphFetcherConfig.java | 29 +- .../MicrosoftGraphFetcherTest.java | 63 +- .../tika/pipes/fetcher/s3/S3Fetcher.java | 147 +- .../fetcher/s3/config/S3FetcherConfig.java | 22 +- .../tika/pipes/fetcher/s3/TestS3Fetcher.java | 30 +- .../apache/tika/client/HttpClientFactory.java | 119 +- .../apache/tika/client/HttpClientUtil.java | 35 +- .../tika/client/TikaClientException.java | 22 +- .../pipes/core/CompositePipesReporter.java | 
31 +- .../core/FailedToStartClientException.java | 22 +- .../tika/pipes/core/FetchEmitTuple.java | 51 +- .../apache/tika/pipes/core/HandlerConfig.java | 68 +- .../tika/pipes/core/LoggingPipesReporter.java | 22 +- .../tika/pipes/core/PassbackFilter.java | 23 +- .../apache/tika/pipes/core/PipesClient.java | 205 +- .../apache/tika/pipes/core/PipesConfig.java | 33 +- .../tika/pipes/core/PipesConfigBase.java | 56 +- .../tika/pipes/core/PipesException.java | 22 +- .../apache/tika/pipes/core/PipesParser.java | 30 +- .../apache/tika/pipes/core/PipesReporter.java | 62 +- .../tika/pipes/core/PipesReporterBase.java | 42 +- .../apache/tika/pipes/core/PipesResult.java | 66 +- .../apache/tika/pipes/core/PipesServer.java | 290 +-- .../tika/pipes/core/async/AsyncConfig.java | 34 +- .../tika/pipes/core/async/AsyncEmitter.java | 49 +- .../tika/pipes/core/async/AsyncProcessor.java | 131 +- .../tika/pipes/core/async/AsyncStatus.java | 40 +- .../core/async/OfferLargerThanQueueSize.java | 25 +- .../pipes/core/emitter/AbstractEmitter.java | 29 +- .../tika/pipes/core/emitter/EmitData.java | 39 +- .../tika/pipes/core/emitter/EmitKey.java | 29 +- .../tika/pipes/core/emitter/Emitter.java | 30 +- .../pipes/core/emitter/EmitterManager.java | 50 +- .../tika/pipes/core/emitter/EmptyEmitter.java | 25 +- .../pipes/core/emitter/StreamEmitter.java | 27 +- .../core/emitter/TikaEmitterException.java | 22 +- .../AbstractEmbeddedDocumentBytesHandler.java | 71 +- .../BasicEmbeddedDocumentBytesHandler.java | 40 +- .../EmbeddedDocumentBytesConfig.java | 60 +- .../EmittingEmbeddedDocumentBytesHandler.java | 44 +- .../pipes/core/fetcher/AbstractFetcher.java | 22 +- .../tika/pipes/core/fetcher/EmptyFetcher.java | 26 +- .../tika/pipes/core/fetcher/FetchKey.java | 38 +- .../tika/pipes/core/fetcher/Fetcher.java | 31 +- .../pipes/core/fetcher/FetcherManager.java | 52 +- .../core/fetcher/FetcherStringException.java | 22 +- .../tika/pipes/core/fetcher/RangeFetcher.java | 31 +- 
.../core/fetcher/config/AbstractConfig.java | 24 +- .../config/FetcherConfigContainer.java | 22 +- .../pipesiterator/CallablePipesIterator.java | 78 +- .../core/pipesiterator/PipesIterator.java | 79 +- .../core/pipesiterator/TotalCountResult.java | 27 +- .../core/pipesiterator/TotalCounter.java | 40 +- .../FetchEmitTupleDeserializer.java | 51 +- .../FetchEmitTupleSerializer.java | 34 +- .../core/serialization/JsonEmitData.java | 28 +- .../serialization/JsonFetchEmitTuple.java | 28 +- .../serialization/JsonFetchEmitTupleList.java | 30 +- .../pipes/emitter/fs/FileSystemEmitter.java | 55 +- .../pipes/fetcher/fs/FileSystemFetcher.java | 89 +- .../fs/config/FileSystemFetcherConfig.java | 22 +- .../tika/pipes/fetcher/url/UrlFetcher.java | 44 +- .../filelist/FileListPipesIterator.java | 47 +- .../fs/FileSystemPipesIterator.java | 59 +- .../tika/pipes/core/PassbackFilterTest.java | 75 +- .../tika/pipes/core/PipesClientTest.java | 80 +- .../tika/pipes/core/PipesServerTest.java | 146 +- .../tika/pipes/core/TikaPipesConfigTest.java | 58 +- .../core/async/AsyncChaosMonkeyTest.java | 138 +- .../pipes/core/async/MockDigesterFactory.java | 22 +- .../tika/pipes/core/async/MockEmitter.java | 33 +- .../tika/pipes/core/async/MockFetcher.java | 33 +- .../tika/pipes/core/async/MockReporter.java | 23 +- .../pipes/core/async/MockReporterTest.java | 34 +- .../tika/pipes/core/emitter/MockEmitter.java | 27 +- .../tika/pipes/core/fetcher/MockFetcher.java | 32 +- .../filelist/FileListPipesIteratorTest.java | 26 +- .../JsonFetchEmitTupleListTest.java | 29 +- .../serialization/JsonFetchEmitTupleTest.java | 63 +- .../fetcher/fs/FileSystemFetcherTest.java | 27 +- .../fs/FileSystemPipesIteratorTest.java | 31 +- .../azblob/AZBlobPipesIterator.java | 86 +- .../azblob/TestAZBlobPipesIterator.java | 28 +- .../pipesiterator/csv/CSVPipesIterator.java | 116 +- .../src/test/java/TestCSVPipesIterator.java | 43 +- .../pipesiterator/gcs/GCSPipesIterator.java | 56 +- .../gcs/TestGCSPipesIterator.java | 28 +- 
.../pipesiterator/jdbc/JDBCPipesIterator.java | 122 +- .../jdbc/TestJDBCPipesIterator.java | 93 +- .../pipesiterator/json/JsonPipesIterator.java | 31 +- .../json/TestJsonPipesIterator.java | 88 +- .../kafka/KafkaPipesIterator.java | 45 +- .../kafka/TestKafkaPipesIterator.java | 34 +- .../pipesiterator/s3/S3PipesIterator.java | 112 +- .../pipesiterator/s3/TestS3PipesIterator.java | 34 +- .../pipesiterator/solr/SolrPipesIterator.java | 86 +- .../fs/FileSystemStatusReporter.java | 77 +- .../fs/TestFileSystemStatusReporter.java | 66 +- .../reporters/jdbc/JDBCPipesReporter.java | 129 +- .../reporters/jdbc/TestJDBCPipesReporter.java | 74 +- .../reporters/opensearch/JsonResponse.java | 29 +- .../opensearch/OpenSearchClient.java | 70 +- .../opensearch/OpenSearchPipesReporter.java | 80 +- .../tika/serialization/JsonMetadata.java | 52 +- .../tika/serialization/JsonMetadataList.java | 63 +- .../serialization/MetadataDeserializer.java | 30 +- .../serialization/MetadataSerializer.java | 36 +- .../ParseContextDeserializer.java | 45 +- .../serialization/ParseContextSerializer.java | 29 +- .../PrettyMetadataKeyComparator.java | 25 +- .../serialization/TikaJsonDeserializer.java | 143 +- .../serialization/TikaJsonSerializer.java | 124 +- .../TikaSerializationException.java | 22 +- .../serialization/JsonMetadataListTest.java | 70 +- .../tika/serialization/JsonMetadataTest.java | 66 +- .../TestParseContextSerialization.java | 38 +- .../TikaJsonSerializationTest.java | 32 +- .../tika/serialization/mocks/ClassA.java | 32 +- .../tika/serialization/mocks/ClassB.java | 28 +- .../tika/serialization/mocks/ClassC.java | 22 +- .../server/client/TikaAsyncHttpClient.java | 25 +- .../apache/tika/server/client/TikaClient.java | 38 +- .../tika/server/client/TikaClientCLI.java | 75 +- .../client/TikaClientConfigException.java | 22 +- .../tika/server/client/TikaEmitterResult.java | 27 +- .../server/client/TikaPipesHttpClient.java | 52 +- .../server/client/TikaServerClientConfig.java | 37 +- 
.../apache/tika/server/client/TestBasic.java | 39 +- .../core/CompositeParseContextConfig.java | 32 +- .../core/DefaultInputStreamFactory.java | 31 +- .../server/core/FetcherStreamFactory.java | 82 +- .../apache/tika/server/core/HTMLHelper.java | 26 +- .../tika/server/core/InputStreamFactory.java | 35 +- .../apache/tika/server/core/MetadataList.java | 23 +- .../tika/server/core/ParseContextConfig.java | 52 +- .../core/ProduceTypeResourceComparator.java | 120 +- .../apache/tika/server/core/ServerStatus.java | 37 +- .../server/core/ServerStatusResource.java | 22 +- .../tika/server/core/ServerStatusWatcher.java | 101 +- .../apache/tika/server/core/TaskStatus.java | 25 +- .../tika/server/core/TikaLoggingFilter.java | 30 +- .../tika/server/core/TikaServerCli.java | 63 +- .../tika/server/core/TikaServerConfig.java | 193 +- .../server/core/TikaServerParseException.java | 26 +- .../core/TikaServerParseExceptionMapper.java | 55 +- .../tika/server/core/TikaServerProcess.java | 247 +-- .../tika/server/core/TikaServerWatchDog.java | 151 +- .../apache/tika/server/core/TlsConfig.java | 55 +- .../tika/server/core/WatchDogResult.java | 25 +- .../core/config/DocumentSelectorConfig.java | 26 +- .../core/config/PasswordProviderConfig.java | 26 +- .../server/core/config/TimeoutConfig.java | 26 +- .../server/core/resource/AsyncRequest.java | 23 +- .../server/core/resource/AsyncResource.java | 123 +- .../core/resource/DetectorResource.java | 47 +- .../core/resource/LanguageResource.java | 36 +- .../core/resource/MetadataResource.java | 98 +- .../server/core/resource/PipesResource.java | 74 +- .../resource/RecursiveMetadataResource.java | 133 +- .../server/core/resource/TikaDetectors.java | 61 +- .../server/core/resource/TikaMimeTypes.java | 253 +-- .../server/core/resource/TikaParsers.java | 83 +- .../server/core/resource/TikaResource.java | 339 ++-- .../core/resource/TikaServerResource.java | 22 +- .../core/resource/TikaServerStatus.java | 28 +- 
.../server/core/resource/TikaVersion.java | 23 +- .../server/core/resource/TikaWelcome.java | 68 +- .../core/resource/TranslateResource.java | 77 +- .../core/resource/UnpackerResource.java | 161 +- .../core/writer/CSVMessageBodyWriter.java | 51 +- .../core/writer/JSONMessageBodyWriter.java | 48 +- .../server/core/writer/JSONObjWriter.java | 54 +- .../writer/MetadataListMessageBodyWriter.java | 48 +- .../tika/server/core/writer/TarWriter.java | 48 +- .../core/writer/TextMessageBodyWriter.java | 54 +- .../server/core/writer/TikaServerWriter.java | 26 +- .../tika/server/core/writer/ZipWriter.java | 54 +- .../apache/tika/server/core/CXFTestBase.java | 74 +- .../tika/server/core/IntegrationTestBase.java | 67 +- .../server/core/LanguageResourceTest.java | 59 +- .../tika/server/core/NullWebClientLogger.java | 26 +- .../core/RecursiveMetadataResourceTest.java | 130 +- .../tika/server/core/ServerStatusTest.java | 32 +- .../tika/server/core/StackTraceOffTest.java | 84 +- .../tika/server/core/StackTraceTest.java | 83 +- .../tika/server/core/TikaMimeTypesTest.java | 55 +- .../tika/server/core/TikaPipesTest.java | 156 +- .../server/core/TikaResourceFetcherTest.java | 81 +- .../core/TikaResourceMetadataFilterTest.java | 40 +- .../server/core/TikaResourceNoStackTest.java | 51 +- .../tika/server/core/TikaResourceTest.java | 241 +-- .../core/TikaServerAsyncIntegrationTest.java | 115 +- .../server/core/TikaServerConfigTest.java | 83 +- .../core/TikaServerIntegrationTest.java | 379 ++-- .../core/TikaServerPipesIntegrationTest.java | 180 +- .../server/core/TikaServerStatusTest.java | 59 +- .../tika/server/core/TikaVersionTest.java | 41 +- .../tika/server/core/TikaWelcomeTest.java | 54 +- .../server/core/TranslateResourceTest.java | 48 +- .../tika/server/eval/TikaEvalResource.java | 78 +- .../server/eval/TikaEvalResourceTest.java | 97 +- .../standard/config/PDFServerConfig.java | 52 +- .../config/TesseractServerConfig.java | 49 +- .../resource/XMPMetadataResource.java | 47 +- 
.../standard/writer/XMPMessageBodyWriter.java | 46 +- .../server/standard/DetectorResourceTest.java | 61 +- .../tika/server/standard/FetcherTest.java | 46 +- .../standard/JsonMaxFieldLengthTest.java | 55 +- .../server/standard/MetadataResourceTest.java | 141 +- .../standard/OpenNLPMetadataFilterTest.java | 73 +- .../standard/OptimaizeMetadataFilterTest.java | 76 +- .../standard/RecursiveMetadataFilterTest.java | 50 +- .../RecursiveMetadataResourceTest.java | 448 ++--- .../server/standard/TikaDetectorsTest.java | 80 +- .../server/standard/TikaMimeTypesTest.java | 57 +- .../tika/server/standard/TikaParsersTest.java | 83 +- .../tika/server/standard/TikaPipesTest.java | 187 +- .../server/standard/TikaResourceTest.java | 585 +++--- .../server/standard/UnpackerResourceTest.java | 192 +- .../UnpackerResourceWithConfigTest.java | 133 +- .../translate/impl/AbstractTranslator.java | 23 +- .../translate/impl/CachedTranslator.java | 61 +- .../translate/impl/ExternalTranslator.java | 40 +- .../translate/impl/GoogleTranslator.java | 58 +- .../impl/JoshuaNetworkTranslator.java | 132 +- .../translate/impl/Lingo24Translator.java | 57 +- .../translate/impl/MarianTranslator.java | 120 +- .../translate/impl/MicrosoftTranslator.java | 62 +- .../translate/impl/MosesTranslator.java | 57 +- .../translate/impl/RTGTranslator.java | 57 +- .../translate/impl/YandexTranslator.java | 88 +- .../translate/impl/CachedTranslatorTest.java | 40 +- .../translate/impl/GoogleTranslatorTest.java | 30 +- .../impl/JoshuaNetworkTranslatorTest.java | 26 +- .../translate/impl/Lingo24TranslatorTest.java | 31 +- .../translate/impl/MarianTranslatorTest.java | 32 +- .../impl/MicrosoftTranslatorTest.java | 28 +- .../translate/impl/MosesTranslatorTest.java | 24 +- .../translate/impl/RTGTranslatorTest.java | 23 +- .../translate/impl/YandexTranslatorTest.java | 26 +- .../java/org/apache/tika/xmp/XMPMetadata.java | 95 +- .../tika/xmp/convert/AbstractConverter.java | 68 +- .../tika/xmp/convert/GenericConverter.java | 
42 +- .../tika/xmp/convert/ITikaToXMPConverter.java | 23 +- .../xmp/convert/MSOfficeBinaryConverter.java | 53 +- .../xmp/convert/MSOfficeXMLConverter.java | 65 +- .../apache/tika/xmp/convert/Namespace.java | 22 +- .../xmp/convert/OpenDocumentConverter.java | 42 +- .../apache/tika/xmp/convert/RTFConverter.java | 48 +- .../apache/tika/xmp/convert/TikaToXMP.java | 49 +- .../org/apache/tika/xmp/TikaToXMPTest.java | 59 +- .../org/apache/tika/xmp/XMPMetadataTest.java | 40 +- 1662 files changed, 50522 insertions(+), 58168 deletions(-) delete mode 100644 tika-parent/intellij-code-style.xml diff --git a/tika-app/src/main/java/org/apache/tika/cli/AsyncHelper.java b/tika-app/src/main/java/org/apache/tika/cli/AsyncHelper.java index f8189cf69e..d9b9c87496 100644 --- a/tika-app/src/main/java/org/apache/tika/cli/AsyncHelper.java +++ b/tika-app/src/main/java/org/apache/tika/cli/AsyncHelper.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.cli; @@ -33,7 +31,7 @@ public static String[] translateArgs(String[] args) { argList.add("-c"); argList.add(c); } else if (arg.equals("-a")) { - //do nothing + // do nothing } else { argList.add(args[i]); } diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java index a1db5f8bf0..3a9ec7f0e5 100644 --- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java +++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. 
The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.cli; @@ -55,17 +53,10 @@ import javax.xml.transform.sax.SAXTransformerFactory; import javax.xml.transform.sax.TransformerHandler; import javax.xml.transform.stream.StreamResult; - import org.apache.commons.io.FilenameUtils; import org.apache.commons.io.IOUtils; import org.apache.commons.io.input.CloseShieldInputStream; import org.apache.logging.log4j.Level; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; -import org.xml.sax.helpers.DefaultHandler; - import org.apache.tika.Tika; import org.apache.tika.async.cli.TikaAsyncCLI; import org.apache.tika.config.TikaConfig; @@ -109,15 +100,21 @@ import org.apache.tika.utils.StringUtils; import org.apache.tika.utils.XMLReaderUtils; import org.apache.tika.xmp.XMPMetadata; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; /** * Simple command line interface for Apache 
Tika. */ public class TikaCLI { private static final Logger LOG = LoggerFactory.getLogger(TikaCLI.class); - private static final Property NORMALIZED_EMBEDDED_NAME = Property.externalText("tk:normalized-embedded-name"); + private static final Property NORMALIZED_EMBEDDED_NAME = + Property.externalText("tk:normalized-embedded-name"); - private final int MAX_MARK = 20 * 1024 * 1024;//20MB + private final int MAX_MARK = 20 * 1024 * 1024;// 20MB private final OutputType NO_OUTPUT = new OutputType() { @Override @@ -139,46 +136,53 @@ protected ContentHandler getContentHandler(OutputStream output, Metadata metadat private String encoding = null; private final OutputType TEXT = new OutputType() { @Override - protected ContentHandler getContentHandler(OutputStream output, Metadata metadata) throws Exception { + protected ContentHandler getContentHandler(OutputStream output, Metadata metadata) + throws Exception { return new BodyContentHandler(getOutputWriter(output, encoding)); } }; private final OutputType TEXT_MAIN = new OutputType() { @Override - protected ContentHandler getContentHandler(OutputStream output, Metadata metadata) throws Exception { + protected ContentHandler getContentHandler(OutputStream output, Metadata metadata) + throws Exception { return new BoilerpipeContentHandler(getOutputWriter(output, encoding)); } }; private final OutputType TEXT_ALL = new OutputType() { @Override - protected ContentHandler getContentHandler(OutputStream output, Metadata metadata) throws Exception { + protected ContentHandler getContentHandler(OutputStream output, Metadata metadata) + throws Exception { return new WriteOutContentHandler(getOutputWriter(output, encoding)); } }; private final OutputType METADATA = new OutputType() { @Override - protected ContentHandler getContentHandler(OutputStream output, Metadata metadata) throws Exception { + protected ContentHandler getContentHandler(OutputStream output, Metadata metadata) + throws Exception { final PrintWriter writer = new 
PrintWriter(getOutputWriter(output, encoding)); return new NoDocumentMetHandler(metadata, writer); } }; private final OutputType JSON = new OutputType() { @Override - protected ContentHandler getContentHandler(OutputStream output, Metadata metadata) throws Exception { + protected ContentHandler getContentHandler(OutputStream output, Metadata metadata) + throws Exception { final PrintWriter writer = new PrintWriter(getOutputWriter(output, encoding)); return new NoDocumentJSONMetHandler(metadata, writer); } }; private final OutputType XMP = new OutputType() { @Override - protected ContentHandler getContentHandler(OutputStream output, final Metadata metadata) throws Exception { + protected ContentHandler getContentHandler(OutputStream output, final Metadata metadata) + throws Exception { final PrintWriter writer = new PrintWriter(getOutputWriter(output, encoding)); return new NoDocumentXMPMetaHandler(metadata, writer); } }; private final OutputType LANGUAGE = new OutputType() { @Override - protected ContentHandler getContentHandler(OutputStream output, Metadata metadata) throws Exception { + protected ContentHandler getContentHandler(OutputStream output, Metadata metadata) + throws Exception { final PrintWriter writer = new PrintWriter(getOutputWriter(output, encoding)); return new LanguageHandler() { public void endDocument() { @@ -190,11 +194,10 @@ public void endDocument() { }; private final OutputType DETECT = new OutputType() { @Override - public void process(InputStream stream, OutputStream output, Metadata metadata) throws Exception { + public void process(InputStream stream, OutputStream output, Metadata metadata) + throws Exception { PrintWriter writer = new PrintWriter(getOutputWriter(output, encoding)); - writer.println(detector - .detect(stream, metadata) - .toString()); + writer.println(detector.detect(stream, metadata).toString()); writer.flush(); } }; @@ -208,15 +211,18 @@ public void process(InputStream stream, OutputStream output, Metadata metadata) 
private boolean prettyPrint; private final OutputType XML = new OutputType() { @Override - protected ContentHandler getContentHandler(OutputStream output, Metadata metadata) throws Exception { + protected ContentHandler getContentHandler(OutputStream output, Metadata metadata) + throws Exception { return getTransformerHandler(output, "xml", encoding, prettyPrint); } }; private OutputType type = XML; private final OutputType HTML = new OutputType() { @Override - protected ContentHandler getContentHandler(OutputStream output, Metadata metadata) throws Exception { - return new ExpandedTitleContentHandler(getTransformerHandler(output, "html", encoding, prettyPrint)); + protected ContentHandler getContentHandler(OutputStream output, Metadata metadata) + throws Exception { + return new ExpandedTitleContentHandler( + getTransformerHandler(output, "html", encoding, prettyPrint)); } }; @@ -266,7 +272,7 @@ private static void async(String[] args) throws Exception { break; } } - if (! StringUtils.isBlank(tikaConfigPath)) { + if (!StringUtils.isBlank(tikaConfigPath)) { TikaAsyncCLI.main(args); return; } @@ -274,7 +280,7 @@ private static void async(String[] args) throws Exception { try { tmpConfig = Files.createTempFile("tika-config-", ".xml"); Files.copy(TikaCLI.class.getResourceAsStream("/tika-config-default-single-file.xml"), - tmpConfig, StandardCopyOption.REPLACE_EXISTING); + tmpConfig, StandardCopyOption.REPLACE_EXISTING); List argList = new ArrayList<>(); argList.add("-c"); argList.add(tmpConfig.toAbsolutePath().toString()); @@ -292,14 +298,14 @@ private static void async(String[] args) throws Exception { /** * Returns a output writer with the given encoding. 
* - * @param output output stream - * @param encoding output encoding, - * or null for the platform default + * @param output output stream + * @param encoding output encoding, or null for the platform default * @return output writer * @throws UnsupportedEncodingException if the given encoding is not supported * @see TIKA-277 */ - private static Writer getOutputWriter(OutputStream output, String encoding) throws UnsupportedEncodingException { + private static Writer getOutputWriter(OutputStream output, String encoding) + throws UnsupportedEncodingException { if (encoding != null) { return new OutputStreamWriter(output, encoding); } else { @@ -308,33 +314,26 @@ private static Writer getOutputWriter(OutputStream output, String encoding) thro } /** - * Returns a transformer handler that serializes incoming SAX events - * to XHTML or HTML (depending the given method) using the given output - * encoding. + * Returns a transformer handler that serializes incoming SAX events to XHTML or HTML (depending + * the given method) using the given output encoding. 
* - * @param output output stream - * @param method "xml" or "html" - * @param encoding output encoding, - * or null for the platform default + * @param output output stream + * @param method "xml" or "html" + * @param encoding output encoding, or null for the platform default * @return {@link System#out} transformer handler * @throws TransformerConfigurationException if the transformer can not be created * @see TIKA-277 */ - private static TransformerHandler getTransformerHandler(OutputStream output, String method, String encoding, boolean prettyPrint) - throws TransformerConfigurationException, TikaException { + private static TransformerHandler getTransformerHandler(OutputStream output, String method, + String encoding, boolean prettyPrint) + throws TransformerConfigurationException, TikaException { SAXTransformerFactory factory = XMLReaderUtils.getSAXTransformerFactory(); TransformerHandler handler = factory.newTransformerHandler(); - handler - .getTransformer() - .setOutputProperty(OutputKeys.METHOD, method); - handler - .getTransformer() - .setOutputProperty(OutputKeys.INDENT, prettyPrint ? "yes" : "no"); + handler.getTransformer().setOutputProperty(OutputKeys.METHOD, method); + handler.getTransformer().setOutputProperty(OutputKeys.INDENT, prettyPrint ? 
"yes" : "no"); if (encoding != null) { - handler - .getTransformer() - .setOutputProperty(OutputKeys.ENCODING, encoding); + handler.getTransformer().setOutputProperty(OutputKeys.ENCODING, encoding); } handler.setResult(new StreamResult(output)); return handler; @@ -377,7 +376,7 @@ public void process(String arg) throws Exception { } else if (arg.equals("-g") || arg.equals("--gui")) { pipeMode = false; if (configFilePath != null) { - TikaGUI.main(new String[]{configFilePath}); + TikaGUI.main(new String[] {configFilePath}); } else { TikaGUI.main(new String[0]); } @@ -390,7 +389,8 @@ public void process(String arg) throws Exception { } else if (arg.equals("--list-parser-detail") || arg.equals("--list-parser-details")) { pipeMode = false; displayParsers(true, false); - } else if (arg.equals("--list-parser-detail-apt") || arg.equals("--list-parser-details-apt")) { + } else if (arg.equals("--list-parser-detail-apt") + || arg.equals("--list-parser-details-apt")) { pipeMode = false; displayParsers(true, true); } else if (arg.equals("--list-met-models")) { @@ -454,8 +454,8 @@ public void process(String arg) throws Exception { type = DETECT; } else if (arg.startsWith("--extract-dir=")) { String dirPath = arg.substring("--extract-dir=".length()); - //if the user accidentally doesn't include - //a directory, set the directory to the cwd + // if the user accidentally doesn't include + // a directory, set the directory to the cwd if (dirPath.isEmpty()) { dirPath = "."; } @@ -465,8 +465,11 @@ public void process(String arg) throws Exception { context.set(EmbeddedDocumentExtractor.class, new FileEmbeddedDocumentExtractor()); } else if (arg.equals("-r") || arg.equals("--pretty-print")) { prettyPrint = true; - } else if (arg.equals("-p") || arg.equals("--port") || arg.equals("-s") || arg.equals("--server")) { - throw new IllegalArgumentException("As of Tika 2.0, the server option is no longer supported in tika-app.\n" + "See https://wiki.apache.org/tika/TikaJAXRS for usage."); + } 
else if (arg.equals("-p") || arg.equals("--port") || arg.equals("-s") + || arg.equals("--server")) { + throw new IllegalArgumentException( + "As of Tika 2.0, the server option is no longer supported in tika-app.\n" + + "See https://wiki.apache.org/tika/TikaJAXRS for usage."); } else if (arg.startsWith("-c")) { networkURI = new URI(arg.substring("-c".length())); } else if (arg.startsWith("--client=")) { @@ -476,16 +479,15 @@ public void process(String arg) throws Exception { configure(); if (arg.equals("-")) { - try (InputStream stream = TikaInputStream.get(CloseShieldInputStream.wrap(System.in))) { + try (InputStream stream = + TikaInputStream.get(CloseShieldInputStream.wrap(System.in))) { type.process(stream, System.out, new Metadata()); } } else { URL url; File file = new File(arg); if (file.isFile()) { - url = file - .toURI() - .toURL(); + url = file.toURI().toURL(); } else { url = new URL(arg); } @@ -507,13 +509,16 @@ private void dumpConfig(TikaConfigSerializer.Mode mode) throws Exception { configure(); TikaConfig localConfig = (config == null) ? 
TikaConfig.getDefaultConfig() : config; - TikaConfigSerializer.serialize(localConfig, mode, new OutputStreamWriter(System.out, UTF_8), UTF_8); + TikaConfigSerializer.serialize(localConfig, mode, new OutputStreamWriter(System.out, UTF_8), + UTF_8); } - private void handleRecursiveJson(URL url, OutputStream output) throws IOException, SAXException, TikaException { + private void handleRecursiveJson(URL url, OutputStream output) + throws IOException, SAXException, TikaException { Metadata metadata = new Metadata(); RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parser); - RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(getContentHandlerFactory(type), -1, config.getMetadataFilter()); + RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler( + getContentHandlerFactory(type), -1, config.getMetadataFilter()); try (InputStream input = TikaInputStream.get(url, metadata)) { wrapper.parse(input, handler, metadata, context); } @@ -526,7 +531,8 @@ private void handleRecursiveJson(URL url, OutputStream output) throws IOExceptio } private ContentHandlerFactory getContentHandlerFactory(OutputType type) { - BasicContentHandlerFactory.HANDLER_TYPE handlerType = BasicContentHandlerFactory.HANDLER_TYPE.IGNORE; + BasicContentHandlerFactory.HANDLER_TYPE handlerType = + BasicContentHandlerFactory.HANDLER_TYPE.IGNORE; if (type.equals(HTML)) { handlerType = BasicContentHandlerFactory.HANDLER_TYPE.HTML; } else if (type.equals(XML)) { @@ -571,7 +577,8 @@ private void usage() { out.println(" -J or --jsonRecursive Output metadata and content from all"); out.println(" embedded files (choose content type"); out.println(" with -x, -h, -t or -m; default is -x)"); - out.println(" -a or --async Run Tika in async mode; must specify details in a" + " tikaConfig file"); + out.println(" -a or --async Run Tika in async mode; must specify details in a" + + " tikaConfig file"); out.println(" -l or --language Output only language"); out.println(" -d 
or --detect Detect document type"); out.println(" --digest=X Include digest X (md2, md5, sha1,"); @@ -670,12 +677,13 @@ private void configure() throws TikaException, IOException, SAXException { if (configFilePath != null) { config = new TikaConfig(new File(configFilePath)); } else { - String warn = "As a convenience, TikaCLI has turned on several non-default features\n" + - "as specified in tika-app/src/main/resources/tika-config-default-single-file.xml.\n" + - "See: TIKA-2374, TIKA-4017, TIKA-4354 and TIKA-4472).\n" + - "This is not the default behavior in Tika generally or in tika-server."; + String warn = "As a convenience, TikaCLI has turned on several non-default features\n" + + "as specified in tika-app/src/main/resources/tika-config-default-single-file.xml.\n" + + "See: TIKA-2374, TIKA-4017, TIKA-4354 and TIKA-4472).\n" + + "This is not the default behavior in Tika generally or in tika-server."; LOG.info(warn); - try (InputStream is = getClass().getResourceAsStream("/tika-config-default-single-file.xml")) { + try (InputStream is = getClass() + .getResourceAsStream("/tika-config-default-single-file.xml")) { config = new TikaConfig(is); } } @@ -699,9 +707,7 @@ private void displayMetModels() { for (Class modelClass : modelClasses) { // we don't care about internal Tika met classes // if we do, then we can take this conditional out - if (!modelClass - .getSimpleName() - .contains("Tika")) { + if (!modelClass.getSimpleName().contains("Tika")) { System.out.println(modelClass.getSimpleName()); Field[] keyFields = modelClass.getFields(); Arrays.sort(keyFields, Comparator.comparing(Field::getName)); @@ -713,11 +719,11 @@ private void displayMetModels() { } /* - * Displays loaded parsers and their mime types - * If a parser is a composite parser, it will list the - * sub parsers and their mime-types. + * Displays loaded parsers and their mime types If a parser is a composite parser, it will list + * the sub parsers and their mime-types. 
*/ - private void displayParsers(boolean includeMimeTypes, boolean aptListFormat) throws TikaException, IOException, SAXException { + private void displayParsers(boolean includeMimeTypes, boolean aptListFormat) + throws TikaException, IOException, SAXException { configure(); displayParser(parser, includeMimeTypes, aptListFormat, 3); } @@ -731,17 +737,18 @@ private void displayParser(Parser p, boolean includeMimeTypes, boolean apt, int } boolean isComposite = (p instanceof CompositeParser); - String name = p - .getClass() - .getName(); + String name = p.getClass().getName(); if (apt) { - name = name.substring(0, name.lastIndexOf(".") + 1) + "{{{./api/" + name.replace(".", "/") + "}" + name.substring(name.lastIndexOf(".") + 1) + "}}"; + name = name.substring(0, name.lastIndexOf(".") + 1) + "{{{./api/" + + name.replace(".", "/") + "}" + + name.substring(name.lastIndexOf(".") + 1) + "}}"; } else if (decorated != null) { name += decorated; } - if ((apt && !isComposite) || !apt) { // Don't display Composite parsers in the apt output. - System.out.println(indent(i) + ((apt) ? "* " : "") + name + (isComposite ? " (Composite Parser):" : "")); + if ((apt && !isComposite) || !apt) { // Don't display Composite parsers in the apt output. + System.out.println(indent(i) + ((apt) ? "* " : "") + name + + (isComposite ? " (Composite Parser):" : "")); if (apt) { System.out.println(); } @@ -756,17 +763,18 @@ private void displayParser(Parser p, boolean includeMimeTypes, boolean apt, int } if (isComposite) { - Parser[] subParsers = sortParsers(invertMediaTypeMap(((CompositeParser) p).getParsers())); + Parser[] subParsers = + sortParsers(invertMediaTypeMap(((CompositeParser) p).getParsers())); for (Parser sp : subParsers) { - displayParser(sp, includeMimeTypes, apt, i + ((apt) ? 0 : 3)); // Don't indent for Composites in apt. + displayParser(sp, includeMimeTypes, apt, i + ((apt) ? 0 : 3)); // Don't indent for + // Composites in apt. 
} } } /* - * Displays loaded detectors and their mime types - * If a detector is a composite detector, it will list the - * sub detectors. + * Displays loaded detectors and their mime types If a detector is a composite detector, it will + * list the sub detectors. */ private void displayDetectors() throws TikaException, IOException, SAXException { configure(); @@ -775,9 +783,7 @@ private void displayDetectors() throws TikaException, IOException, SAXException private void displayDetector(Detector d, int i) { boolean isComposite = (d instanceof CompositeDetector); - String name = d - .getClass() - .getName(); + String name = d.getClass().getName(); System.out.println(indent(i) + name + (isComposite ? " (Composite Detector):" : "")); if (isComposite) { List subDetectors = ((CompositeDetector) d).getDetectors(); @@ -793,16 +799,10 @@ private String indent(int indent) { private Parser[] sortParsers(Map> parsers) { // Get a nicely sorted list of the parsers - Parser[] sortedParsers = parsers - .keySet() - .toArray(new Parser[0]); + Parser[] sortedParsers = parsers.keySet().toArray(new Parser[0]); Arrays.sort(sortedParsers, (p1, p2) -> { - String name1 = p1 - .getClass() - .getName(); - String name2 = p2 - .getClass() - .getName(); + String name1 = p1.getClass().getName(); + String name2 = p2.getClass().getName(); return name1.compareTo(name2); }); return sortedParsers; @@ -814,9 +814,7 @@ private Map> invertMediaTypeMap(Map su if (!parsers.containsKey(e.getValue())) { parsers.put(e.getValue(), new HashSet<>()); } - parsers - .get(e.getValue()) - .add(e.getKey()); + parsers.get(e.getValue()).add(e.getKey()); } return parsers; } @@ -841,21 +839,16 @@ private void displaySupportedTypes() { Parser p = parsers.get(type); if (p != null) { if (p instanceof CompositeParser) { - p = ((CompositeParser) p) - .getParsers() - .get(type); + p = ((CompositeParser) p).getParsers().get(type); } - System.out.println(" parser: " + p - .getClass() - .getName()); + System.out.println(" 
parser: " + p.getClass().getName()); } } } /** - * Compares our mime types registry with the File(1) tool's - * directory of (uncompiled) Magic entries. - * (Well, those with mimetypes anyway) + * Compares our mime types registry with the File(1) tool's directory of (uncompiled) Magic + * entries. (Well, those with mimetypes anyway) * * @param magicDir Path to the magic directory */ @@ -865,23 +858,24 @@ private void compareFileMagic(String magicDir) throws Exception { // Plausibility check File dir = new File(magicDir); - if ((new File(dir, "elf")).exists() && (new File(dir, "mime")).exists() && (new File(dir, "vorbis")).exists()) { + if ((new File(dir, "elf")).exists() && (new File(dir, "mime")).exists() + && (new File(dir, "vorbis")).exists()) { // Looks plausible } else { - throw new IllegalArgumentException(magicDir + " doesn't seem to hold uncompressed file magic entries"); + throw new IllegalArgumentException( + magicDir + " doesn't seem to hold uncompressed file magic entries"); } // Find all the mimetypes in the directory Set fileMimes = new HashSet<>(); for (File mf : dir.listFiles()) { if (mf.isFile()) { - try (BufferedReader r = new BufferedReader(new InputStreamReader(new FileInputStream(mf), UTF_8))) { + try (BufferedReader r = new BufferedReader( + new InputStreamReader(new FileInputStream(mf), UTF_8))) { String line; while ((line = r.readLine()) != null) { if (line.startsWith("!:mime") || line.startsWith("#!:mime")) { - String mime = line - .substring(7) - .trim(); + String mime = line.substring(7).trim(); fileMimes.add(mime); } } @@ -925,7 +919,9 @@ private void compareFileMagic(String magicDir) throws Exception { } else { // Check the parent next MediaType parent = registry.getSupertype(type.getType()); - if (parent == MediaType.APPLICATION_XML || parent == MediaType.TEXT_PLAIN || parent == MediaType.OCTET_STREAM) { + if (parent == MediaType.APPLICATION_XML + || parent == MediaType.TEXT_PLAIN + || parent == MediaType.OCTET_STREAM) { // Stop 
checking parents if we hit a top level type parent = null; } @@ -951,22 +947,23 @@ private void compareFileMagic(String magicDir) throws Exception { int tikaAliases = 0; for (MediaType type : registry.getTypes()) { tikaTypes++; - tikaAliases += registry - .getAliases(type) - .size(); + tikaAliases += registry.getAliases(type).size(); } // Report System.out.println("Tika knows about " + tikaTypes + " unique mime types"); - System.out.println("Tika knows about " + (tikaTypes + tikaAliases) + " mime types including aliases"); - System.out.println("The File Magic directory knows about " + fileMimes.size() + " unique mime types"); + System.out.println("Tika knows about " + (tikaTypes + tikaAliases) + + " mime types including aliases"); + System.out.println("The File Magic directory knows about " + fileMimes.size() + + " unique mime types"); System.out.println(); System.out.println("The following mime types are known to File but not Tika:"); for (String mime : tikaLacking) { System.out.println(" " + mime); } System.out.println(); - System.out.println("The following mime types from File have no Tika magic (but their children might):"); + System.out.println( + "The following mime types from File have no Tika magic (but their children might):"); for (String mime : tikaNoMagic) { System.out.println(" " + mime); } @@ -1050,7 +1047,8 @@ public String getPassword(Metadata metadata) { } private class OutputType { - public void process(InputStream input, OutputStream output, Metadata metadata) throws Exception { + public void process(InputStream input, OutputStream output, Metadata metadata) + throws Exception { Parser p = parser; if (fork) { p = new ForkParser(TikaCLI.class.getClassLoader(), p); @@ -1074,7 +1072,8 @@ public void process(InputStream input, OutputStream output, Metadata metadata) t } } - protected ContentHandler getContentHandler(OutputStream output, Metadata metadata) throws Exception { + protected ContentHandler getContentHandler(OutputStream output, Metadata 
metadata) + throws Exception { throw new UnsupportedOperationException(); } @@ -1082,7 +1081,8 @@ protected ContentHandler getContentHandler(OutputStream output, Metadata metadat private class FileEmbeddedDocumentExtractor implements EmbeddedDocumentExtractor { - private final EmbeddedStreamTranslator embeddedStreamTranslator = new DefaultEmbeddedStreamTranslator(); + private final EmbeddedStreamTranslator embeddedStreamTranslator = + new DefaultEmbeddedStreamTranslator(); private int count = 0; public boolean shouldParseEmbedded(Metadata metadata) { @@ -1090,7 +1090,8 @@ public boolean shouldParseEmbedded(Metadata metadata) { } @Override - public void parseEmbedded(TikaInputStream tis, ContentHandler contentHandler, Metadata metadata, boolean outputHtml) throws SAXException, IOException { + public void parseEmbedded(TikaInputStream tis, ContentHandler contentHandler, + Metadata metadata, boolean outputHtml) throws SAXException, IOException { String contentType = metadata.get(Metadata.CONTENT_TYPE); if (StringUtils.isBlank(contentType)) { MediaType mediaType = detector.detect(tis, metadata); @@ -1105,7 +1106,7 @@ public void parseEmbedded(TikaInputStream tis, ContentHandler contentHandler, Me String name = metadata.get(NORMALIZED_EMBEDDED_NAME); Path parent = outputFile.getParent(); - if (parent != null && ! 
Files.isDirectory(parent)) { + if (parent != null && !Files.isDirectory(parent)) { Files.createDirectories(parent); } System.out.println("Extracting '" + name + "' (" + contentType + ") to " + outputFile); @@ -1120,13 +1121,16 @@ public void parseEmbedded(TikaInputStream tis, ContentHandler contentHandler, Me // // being a CLI program messages should go to the stderr too // - String msg = String.format(Locale.ROOT, "Ignoring unexpected exception trying to save embedded file %s (%s)", name, e.getMessage()); + String msg = String.format(Locale.ROOT, + "Ignoring unexpected exception trying to save embedded file %s (%s)", + name, e.getMessage()); LOG.warn(msg, e); } } private Path getOutputFile(Metadata metadata) throws IOException { - String normalizedName = org.apache.tika.io.FilenameUtils.getSanitizedEmbeddedFilePath(metadata, ".bin", 50); + String normalizedName = org.apache.tika.io.FilenameUtils + .getSanitizedEmbeddedFilePath(metadata, ".bin", 50); if (normalizedName == null) { String ext = org.apache.tika.io.FilenameUtils.calculateExtension(metadata, ".bin"); normalizedName = "file-" + count++ + ext; @@ -1134,12 +1138,13 @@ private Path getOutputFile(Metadata metadata) throws IOException { metadata.set(NORMALIZED_EMBEDDED_NAME, normalizedName); Path outputFile = extractDir.resolve(normalizedName); - //if file already exists, prepend uuid + // if file already exists, prepend uuid if (Files.exists(outputFile)) { String fileName = FilenameUtils.getName(normalizedName); - outputFile = extractDir.resolve( UUID.randomUUID() + "-" + fileName); + outputFile = extractDir.resolve(UUID.randomUUID() + "-" + fileName); } - if (! 
outputFile.toAbsolutePath().normalize().startsWith(extractDir.toAbsolutePath().normalize())) { + if (!outputFile.toAbsolutePath().normalize() + .startsWith(extractDir.toAbsolutePath().normalize())) { throw new IOException("Path traversal?!: " + outputFile.toAbsolutePath()); } return outputFile; diff --git a/tika-app/src/main/java/org/apache/tika/gui/ParsingTransferHandler.java b/tika-app/src/main/java/org/apache/tika/gui/ParsingTransferHandler.java index debc24e5f3..05d4e85467 100644 --- a/tika-app/src/main/java/org/apache/tika/gui/ParsingTransferHandler.java +++ b/tika-app/src/main/java/org/apache/tika/gui/ParsingTransferHandler.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
+ * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.gui; @@ -47,7 +45,7 @@ class ParsingTransferHandler extends TransferHandler { uriListFlavor = new DataFlavor("text/uri-list;class=java.lang.String"); urlListFlavor = new DataFlavor("text/plain;class=java.lang.String"); } catch (ClassNotFoundException e) { - //swallow + // swallow } } @@ -70,7 +68,7 @@ private static List uriToFileList(Object data) { try { list.add(new File(new URI(s))); } catch (Exception e) { - //swallow + // swallow } } return list; @@ -78,7 +76,8 @@ private static List uriToFileList(Object data) { public boolean canImport(JComponent component, DataFlavor[] flavors) { for (DataFlavor flavor : flavors) { - if (flavor.equals(DataFlavor.javaFileListFlavor) || flavor.equals(uriListFlavor) || flavor.equals(urlListFlavor)) { + if (flavor.equals(DataFlavor.javaFileListFlavor) || flavor.equals(uriListFlavor) + || flavor.equals(urlListFlavor)) { return true; } } @@ -89,7 +88,8 @@ public boolean canImport(JComponent component, DataFlavor[] flavors) { public boolean importData(JComponent component, Transferable transferable) { try { if (transferable.isDataFlavorSupported(DataFlavor.javaFileListFlavor)) { - importFiles((List) transferable.getTransferData(DataFlavor.javaFileListFlavor)); + importFiles((List) transferable + .getTransferData(DataFlavor.javaFileListFlavor)); } else if (transferable.isDataFlavorSupported(urlListFlavor)) { Object data = transferable.getTransferData(urlListFlavor); tika.openURL(new URL(data.toString())); @@ -112,7 +112,8 @@ public void exportAsDrag(JComponent arg0, InputEvent arg1, int arg2) { delegate.exportAsDrag(arg0, arg1, arg2); } - public void exportToClipboard(JComponent arg0, 
Clipboard arg1, int arg2) throws IllegalStateException { + public void exportToClipboard(JComponent arg0, Clipboard arg1, int arg2) + throws IllegalStateException { delegate.exportToClipboard(arg0, arg1, arg2); } diff --git a/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java b/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java index 1da9cda9c0..23ea83e557 100644 --- a/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java +++ b/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
+ * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.gui; @@ -63,13 +61,7 @@ import javax.xml.transform.sax.SAXTransformerFactory; import javax.xml.transform.sax.TransformerHandler; import javax.xml.transform.stream.StreamResult; - import org.apache.commons.io.IOUtils; -import org.xml.sax.Attributes; -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; -import org.xml.sax.helpers.AttributesImpl; - import org.apache.tika.config.TikaConfig; import org.apache.tika.exception.TikaException; import org.apache.tika.extractor.DocumentSelector; @@ -92,15 +84,19 @@ import org.apache.tika.sax.boilerpipe.BoilerpipeContentHandler; import org.apache.tika.serialization.JsonMetadataList; import org.apache.tika.utils.XMLReaderUtils; +import org.xml.sax.Attributes; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.AttributesImpl; /** - * Simple Swing GUI for Apache Tika. You can drag and drop files on top - * of the window to have them parsed. + * Simple Swing GUI for Apache Tika. You can drag and drop files on top of the window to have them + * parsed. */ public class TikaGUI extends JFrame implements ActionListener, HyperlinkListener { - //maximum length to allow for mark for reparse to get JSON - private static final int MAX_MARK = 20 * 1024 * 1024;//20MB + // maximum length to allow for mark for reparse to get JSON + private static final int MAX_MARK = 20 * 1024 * 1024;// 20MB /** * Serial version UID. @@ -186,9 +182,8 @@ public TikaGUI(Parser parser, TikaConfig tikaConfig) { } /** - * Main method. 
Sets the Swing look and feel to the operating system - * settings, and starts the Tika GUI with an {@link AutoDetectParser} - * instance as the default parser. + * Main method. Sets the Swing look and feel to the operating system settings, and starts the + * Tika GUI with an {@link AutoDetectParser} instance as the default parser. * * @param args ignored * @throws Exception if an error occurs @@ -199,15 +194,17 @@ public static void main(String[] args) throws Exception { File configFile = new File(args[0]); config = new TikaConfig(configFile); } else { - try (InputStream is = TikaGUI.class.getResourceAsStream("/tika-config-default-single-file.xml")) { + try (InputStream is = TikaGUI.class + .getResourceAsStream("/tika-config-default-single-file.xml")) { config = new TikaConfig(is); } } UIManager.setLookAndFeel(UIManager.getSystemLookAndFeelClassName()); final TikaConfig finalConfig = config; - SwingUtilities.invokeLater(() -> new TikaGUI( - new DigestingParser(new AutoDetectParser(finalConfig), - new CommonsDigester(MAX_MARK, CommonsDigester.DigestAlgorithm.MD5, CommonsDigester.DigestAlgorithm.SHA256), + SwingUtilities.invokeLater(() -> new TikaGUI(new DigestingParser( + new AutoDetectParser(finalConfig), + new CommonsDigester(MAX_MARK, CommonsDigester.DigestAlgorithm.MD5, + CommonsDigester.DigestAlgorithm.SHA256), false), finalConfig).setVisible(true)); } @@ -256,16 +253,15 @@ public void actionPerformed(ActionEvent e) { openFile(chooser.getSelectedFile()); } } else if ("openurl".equals(command)) { - Object rv = JOptionPane.showInputDialog(this, "Enter the URL of the resource to be parsed:", "Open URL", JOptionPane.PLAIN_MESSAGE, null, null, ""); - if (rv != null && rv - .toString() - .length() > 0) { + Object rv = JOptionPane.showInputDialog(this, + "Enter the URL of the resource to be parsed:", "Open URL", + JOptionPane.PLAIN_MESSAGE, null, null, ""); + if (rv != null && rv.toString().length() > 0) { try { - openURL(new URL(rv - .toString() - .trim())); + 
openURL(new URL(rv.toString().trim())); } catch (MalformedURLException exception) { - JOptionPane.showMessageDialog(this, "The given string is not a valid URL", "Invalid URL", JOptionPane.ERROR_MESSAGE); + JOptionPane.showMessageDialog(this, "The given string is not a valid URL", + "Invalid URL", JOptionPane.ERROR_MESSAGE); } } } else if ("html".equals(command)) { @@ -283,10 +279,8 @@ public void actionPerformed(ActionEvent e) { } else if ("about".equals(command)) { textDialog("About Apache Tika", TikaGUI.class.getResource("about.html")); } else if ("exit".equals(command)) { - Toolkit - .getDefaultToolkit() - .getSystemEventQueue() - .postEvent(new WindowEvent(this, WindowEvent.WINDOW_CLOSING)); + Toolkit.getDefaultToolkit().getSystemEventQueue() + .postEvent(new WindowEvent(this, WindowEvent.WINDOW_CLOSING)); } } @@ -319,8 +313,9 @@ private void handleStream(InputStream input, Metadata md) throws Exception { StringWriter xmlBuffer = new StringWriter(); StringBuilder metadataBuffer = new StringBuilder(); - ContentHandler handler = - new TeeContentHandler(getHtmlHandler(htmlBuffer), getTextContentHandler(textBuffer), getTextMainContentHandler(textMainBuffer), getXmlContentHandler(xmlBuffer)); + ContentHandler handler = new TeeContentHandler(getHtmlHandler(htmlBuffer), + getTextContentHandler(textBuffer), + getTextMainContentHandler(textMainBuffer), getXmlContentHandler(xmlBuffer)); context.set(DocumentSelector.class, new ImageDocumentSelector()); @@ -373,12 +368,14 @@ private void handleStream(InputStream input, Metadata md) throws Exception { input.reset(); isReset = true; } catch (IOException e) { - setText(json, "Error during stream reset.\n" + "There's a limit of " + MAX_MARK + " bytes for this type of processing in the GUI.\n" + - "Try the app with command line argument of -J."); + setText(json, "Error during stream reset.\n" + "There's a limit of " + MAX_MARK + + " bytes for this type of processing in the GUI.\n" + + "Try the app with command line argument of 
-J."); } if (isReset) { RecursiveParserWrapperHandler recursiveParserWrapperHandler = - new RecursiveParserWrapperHandler(new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.BODY, -1), -1); + new RecursiveParserWrapperHandler(new BasicContentHandlerFactory( + BasicContentHandlerFactory.HANDLER_TYPE.BODY, -1), -1); RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parser); wrapper.parse(input, recursiveParserWrapperHandler, new Metadata(), new ParseContext()); StringWriter jsonBuffer = new StringWriter(); @@ -394,10 +391,7 @@ private void handleStream(InputStream input, Metadata md) throws Exception { private void handleError(String name, Throwable t) { StringWriter writer = new StringWriter(); writer.append("Apache Tika was unable to parse the document\n"); - writer - .append("at ") - .append(name) - .append(".\n\n"); + writer.append("at ").append(name).append(".\n\n"); writer.append("The full exception stack trace is included below:\n\n"); t.printStackTrace(new PrintWriter(writer)); @@ -419,7 +413,8 @@ private void addWelcomeCard(JPanel panel, String name) { editor.setContentType("text/html"); editor.setEditable(false); editor.setBackground(Color.WHITE); - editor.setTransferHandler(new ParsingTransferHandler(editor.getTransferHandler(), this)); + editor.setTransferHandler( + new ParsingTransferHandler(editor.getTransferHandler(), this)); panel.add(new JScrollPane(editor), name); } catch (IOException e) { e.printStackTrace(); @@ -457,7 +452,8 @@ public void hyperlinkUpdate(HyperlinkEvent e) { try { URL url = e.getURL(); try (InputStream stream = url.openStream()) { - JEditorPane editor = new JEditorPane("text/plain", IOUtils.toString(stream, UTF_8)); + JEditorPane editor = + new JEditorPane("text/plain", IOUtils.toString(stream, UTF_8)); editor.setEditable(false); editor.setBackground(Color.WHITE); editor.setCaretPosition(0); @@ -483,36 +479,33 @@ private void setText(JEditorPane editor, String text) { } /** - * Creates and returns a 
content handler that turns XHTML input to - * simplified HTML output that can be correctly parsed and displayed - * by {@link JEditorPane}. + * Creates and returns a content handler that turns XHTML input to simplified HTML output that + * can be correctly parsed and displayed by {@link JEditorPane}. *

- * The returned content handler is set to output html - * to the given writer. The XHTML namespace is removed from the output - * to prevent the serializer from using the <tag/> empty element - * syntax that causes extra ">" characters to be displayed. - * The <head> tags are dropped to prevent the serializer from - * generating a <META> content type tag that makes - * {@link JEditorPane} fail thinking that the document character set + * The returned content handler is set to output html to the given writer. The + * XHTML namespace is removed from the output to prevent the serializer from using the + * <tag/> empty element syntax that causes extra ">" characters to be displayed. The + * <head> tags are dropped to prevent the serializer from generating a <META> + * content type tag that makes {@link JEditorPane} fail thinking that the document character set * is inconsistent. *

- * Additionally, it will use ImageSavingParser to re-write embedded:(image) - * image links to be file:///(temporary file) so that they can be loaded. + * Additionally, it will use ImageSavingParser to re-write embedded:(image) image links to be + * file:///(temporary file) so that they can be loaded. * * @param writer output writer * @return HTML content handler * @throws TransformerConfigurationException if an error occurs */ - private ContentHandler getHtmlHandler(Writer writer) throws TransformerConfigurationException, TikaException { + private ContentHandler getHtmlHandler(Writer writer) + throws TransformerConfigurationException, TikaException { SAXTransformerFactory factory = XMLReaderUtils.getSAXTransformerFactory(); TransformerHandler handler = factory.newTransformerHandler(); - handler - .getTransformer() - .setOutputProperty(OutputKeys.METHOD, "html"); + handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "html"); handler.setResult(new StreamResult(writer)); return new ContentHandlerDecorator(handler) { @Override - public void startElement(String uri, String localName, String name, Attributes atts) throws SAXException { + public void startElement(String uri, String localName, String name, Attributes atts) + throws SAXException { if (XHTMLContentHandler.XHTML.equals(uri)) { uri = null; } @@ -532,13 +525,13 @@ public void startElement(String uri, String localName, String name, Attributes a String filename = src.substring(src.indexOf(':') + 1); try { File img = imageParser.requestSave(filename); - String newSrc = img - .toURI() - .toString(); + String newSrc = img.toURI().toString(); newAttrs.setValue(i, newSrc); } catch (IOException e) { - System.err.println("Error creating temp image file " + filename); - // The html viewer will show a broken image too to alert them + System.err.println("Error creating temp image file " + + filename); + // The html viewer will show a broken image too to alert + // them } } } @@ -561,12 +554,10 @@ public void 
endElement(String uri, String localName, String name) throws SAXExce } @Override - public void startPrefixMapping(String prefix, String uri) { - } + public void startPrefixMapping(String prefix, String uri) {} @Override - public void endPrefixMapping(String prefix) { - } + public void endPrefixMapping(String prefix) {} }; } @@ -578,12 +569,11 @@ private ContentHandler getTextMainContentHandler(Writer writer) { return new BoilerpipeContentHandler(writer); } - private ContentHandler getXmlContentHandler(Writer writer) throws TransformerConfigurationException, TikaException { + private ContentHandler getXmlContentHandler(Writer writer) + throws TransformerConfigurationException, TikaException { SAXTransformerFactory factory = XMLReaderUtils.getSAXTransformerFactory(); TransformerHandler handler = factory.newTransformerHandler(); - handler - .getTransformer() - .setOutputProperty(OutputKeys.METHOD, "xml"); + handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml"); handler.setResult(new StreamResult(writer)); return handler; } @@ -599,9 +589,8 @@ public boolean select(Metadata metadata) { } /** - * A recursive parser that saves certain images into the temporary - * directory, and delegates everything else to another downstream - * parser. + * A recursive parser that saves certain images into the temporary directory, and delegates + * everything else to another downstream parser. 
*/ private static class ImageSavingParser implements Parser { private Map wanted = new HashMap<>(); @@ -612,12 +601,10 @@ private ImageSavingParser(Parser downstreamParser) { this.downstreamParser = downstreamParser; try { - File t = Files - .createTempFile("tika", ".test") - .toFile(); + File t = Files.createTempFile("tika", ".test").toFile(); tmpDir = t.getParentFile(); } catch (IOException e) { - //swallow + // swallow } } @@ -629,9 +616,7 @@ public File requestSave(String embeddedName) throws IOException { embeddedName = embeddedName.substring(splitAt); } - File tmp = Files - .createTempFile("tika-embedded-", suffix) - .toFile(); + File tmp = Files.createTempFile("tika-embedded-", suffix).toFile(); wanted.put(embeddedName, tmp); return tmp; } @@ -641,7 +626,8 @@ public Set getSupportedTypes(ParseContext context) { } @Override - public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { + public void parse(InputStream stream, ContentHandler handler, Metadata metadata, + ParseContext context) throws IOException, SAXException, TikaException { String name = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY); if (name != null && wanted.containsKey(name)) { try (FileOutputStream out = new FileOutputStream(wanted.get(name))) { diff --git a/tika-app/src/test/java/org/apache/tika/cli/AsyncHelperTest.java b/tika-app/src/test/java/org/apache/tika/cli/AsyncHelperTest.java index 8b1d79d106..fe911a48fb 100644 --- a/tika-app/src/test/java/org/apache/tika/cli/AsyncHelperTest.java +++ b/tika-app/src/test/java/org/apache/tika/cli/AsyncHelperTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.cli; @@ -24,8 +22,9 @@ public class AsyncHelperTest { @Test public void testBasic() throws Exception { - String[] args = new String[]{"-a", "--config=blah.xml", "-i", "input.docx", "-o", "output/dir"}; - String[] expected = new String[]{"-c", "blah.xml", "-i", "input.docx", "-o", "output/dir"}; + String[] args = new String[] {"-a", "--config=blah.xml", "-i", "input.docx", "-o", + "output/dir"}; + String[] expected = new String[] {"-c", "blah.xml", "-i", "input.docx", "-o", "output/dir"}; assertArrayEquals(expected, AsyncHelper.translateArgs(args)); } } diff --git a/tika-app/src/test/java/org/apache/tika/cli/TikaCLIAsyncTest.java b/tika-app/src/test/java/org/apache/tika/cli/TikaCLIAsyncTest.java index faacd49a28..4151324b3c 100644 --- a/tika-app/src/test/java/org/apache/tika/cli/TikaCLIAsyncTest.java +++ b/tika-app/src/test/java/org/apache/tika/cli/TikaCLIAsyncTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.cli; @@ -26,7 +24,6 @@ import java.io.PrintStream; import java.nio.file.Files; import java.nio.file.Path; - import org.apache.commons.io.FileUtils; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeAll; @@ -52,21 +49,26 @@ public class TikaCLIAsyncTest { @BeforeAll public static void setUpClass() throws Exception { ASYNC_CONFIG = Files.createTempFile(ASYNC_OUTPUT_DIR, "async-config-", ".xml"); - String xml = "" + "" + "3" + "" + ASYNC_CONFIG.toAbsolutePath() + "" + "" + "" + - "" + "fsf" + "" + TEST_DATA_FILE.getAbsolutePath() + - "" + - "" + "" + "" + "" + "fse" + "" + - ASYNC_OUTPUT_DIR.toAbsolutePath() + "" + "true" + "" + "" + - "" + "" + TEST_DATA_FILE.getAbsolutePath() + "" + - "fsf" + "fse" + "" + ""; + String xml = "" + "" + "3" + "" + + ASYNC_CONFIG.toAbsolutePath() + "" + "" + + "" + + "" + + "fsf" + "" + TEST_DATA_FILE.getAbsolutePath() + + "" + "" + "" + "" + + "" + + "fse" + "" + ASYNC_OUTPUT_DIR.toAbsolutePath() + + "" + "true" + "" + + "" + + "" + + "" + TEST_DATA_FILE.getAbsolutePath() + "" + + "fsf" + "fse" + + "" + ""; Files.write(ASYNC_CONFIG, xml.getBytes(UTF_8)); } /** - * reset resourcePrefix - * save original 
System.out and System.err - * clear outContent and errContent if they are not empty - * set outContent and errContent as System.out and System.err + * reset resourcePrefix save original System.out and System.err clear outContent and errContent + * if they are not empty set outContent and errContent as System.out and System.err */ @BeforeEach public void setUp() throws Exception { @@ -85,8 +87,8 @@ public void tearDown() { } /** - * clear outContent and errContent if they are not empty by create a new one. - * set outContent and errContent as System.out and System.err + * clear outContent and errContent if they are not empty by create a new one. set outContent and + * errContent as System.out and System.err */ private void resetContent() throws Exception { if (outContent == null || outContent.size() > 0) { @@ -106,16 +108,10 @@ public void testAsync() throws Exception { String content = getParamOutContent("-a", "-c", ASYNC_CONFIG.toAbsolutePath().toString()); int json = 0; - for (File f : ASYNC_OUTPUT_DIR - .toFile() - .listFiles()) { - if (f - .getName() - .endsWith(".json")) { - //check one file for pretty print - if (f - .getName() - .equals("coffee.xls.json")) { + for (File f : ASYNC_OUTPUT_DIR.toFile().listFiles()) { + if (f.getName().endsWith(".json")) { + // check one file for pretty print + if (f.getName().equals("coffee.xls.json")) { checkForPrettyPrint(f); } json++; @@ -128,7 +124,8 @@ private void checkForPrettyPrint(File f) throws IOException { String json = FileUtils.readFileToString(f, UTF_8); int previous = json.indexOf("Content-Length"); assertTrue(previous > -1); - for (String k : new String[]{"Content-Type", "dc:creator", "dcterms:created", "dcterms:modified", "X-TIKA:content\""}) { + for (String k : new String[] {"Content-Type", "dc:creator", "dcterms:created", + "dcterms:modified", "X-TIKA:content\""}) { int i = json.indexOf(k); assertTrue(i > -1, "should have found " + k); assertTrue(i > previous, "bad order: " + k + " at " + i + " not less than 
" + previous); @@ -137,8 +134,8 @@ private void checkForPrettyPrint(File f) throws IOException { } /** - * reset outContent and errContent if they are not empty - * run given params in TikaCLI and return outContent String with UTF-8 + * reset outContent and errContent if they are not empty run given params in TikaCLI and return + * outContent String with UTF-8 */ String getParamOutContent(String... params) throws Exception { resetContent(); diff --git a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java index 391fffd616..b445363278 100644 --- a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java +++ b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.cli; @@ -37,20 +35,18 @@ import java.util.HashSet; import java.util.List; import java.util.Set; - -import org.jetbrains.annotations.NotNull; -import org.jetbrains.annotations.Nullable; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.io.TempDir; - import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.serialization.JsonMetadataList; import org.apache.tika.utils.ProcessUtils; import org.apache.tika.utils.StringUtils; +import org.jetbrains.annotations.NotNull; +import org.jetbrains.annotations.Nullable; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; /** * Tests the Tika's cli @@ -75,14 +71,13 @@ protected static void assertExtracted(Path p, String allFiles) throws IOExceptio assertFalse(Files.isDirectory(p), "File " + p.getFileName() + " is a directory!"); - assertTrue(Files.size(p) > 0, "File " + p.getFileName() + " wasn't extracted with " + "contents"); + assertTrue(Files.size(p) > 0, + "File " + p.getFileName() + " wasn't extracted with " + "contents"); } /** - * reset resourcePrefix - * save original System.out and System.err - * clear outContent and errContent if they are not empty - * set outContent and errContent as System.out and System.err + * reset resourcePrefix save original System.out and 
System.err clear outContent and errContent + * if they are not empty set outContent and errContent as System.out and System.err */ @BeforeEach public void setUp() throws Exception { @@ -102,8 +97,8 @@ public void tearDown() { } /** - * clear outContent and errContent if they are not empty by create a new one. - * set outContent and errContent as System.out and System.err + * clear outContent and errContent if they are not empty by create a new one. set outContent and + * errContent as System.out and System.err */ private void resetContent() throws Exception { if (outContent == null || outContent.size() > 0) { @@ -118,8 +113,7 @@ private void resetContent() throws Exception { } /** - * Tests --list-parser-detail option of the cli - * Tests --list-parser-details option of the cli + * Tests --list-parser-detail option of the cli Tests --list-parser-details option of the cli * * @throws Exception */ @@ -154,7 +148,8 @@ public void testXMLOutput() throws Exception { assertTrue(content.contains("?xml version=\"1.0\" encoding=\"UTF-8\"?")); content = getParamOutContent("-x", "--digest=sha256", resourcePrefix + "alice.cli.test"); - assertTrue(content.contains(""), "Expanded element should be present"); + assertTrue(content.contains(""), + "Expanded element should be present"); content = getParamOutContent("-h", "--digest=sha384", resourcePrefix + "alice.cli.test"); - assertTrue(content.contains(" metadataList = JsonMetadataList.fromJson(reader); @@ -354,56 +352,59 @@ public void testListMetModels() throws Exception { */ @Test public void testListSupportedTypes() throws Exception { - String content = getParamOutContent("--list-supported-types", resourcePrefix + "alice.cli.test"); + String content = getParamOutContent("--list-supported-types", + resourcePrefix + "alice.cli.test"); assertTrue(content.contains("supertype: application/octet-stream")); } @Test public void testExtractSimple() throws Exception { - String[] expectedChildren = new String[]{"MBD002B040A.cdx", 
"file-4.png", "MBD002B0FA6.bin", "MBD00262FE3.txt", "file-0.emf"}; + String[] expectedChildren = new String[] {"MBD002B040A.cdx", "file-4.png", + "MBD002B0FA6.bin", "MBD00262FE3.txt", "file-0.emf"}; testExtract("/coffee.xls", expectedChildren, 8); } @Test public void testExtractAbsolute() throws Exception { - String[] expectedChildren = new String[]{"dangerous/dont/touch.pl",}; + String[] expectedChildren = new String[] {"dangerous/dont/touch.pl",}; testExtract("testZip_absolutePath.zip", expectedChildren, 2); } @Test public void testExtractRelative() throws Exception { - String[] expectedChildren = new String[]{"dangerous/dont/touch.pl",}; + String[] expectedChildren = new String[] {"dangerous/dont/touch.pl",}; testExtract("testZip_relative.zip", expectedChildren); } @Test public void testExtractOverlapping() throws Exception { - //there should be two files, one with a prepended uuid-f1.txt - String[] expectedChildren = new String[]{"f1.txt",}; + // there should be two files, one with a prepended uuid-f1.txt + String[] expectedChildren = new String[] {"f1.txt",}; testExtract("testZip_overlappingNames.zip", expectedChildren, 2); } @Test public void testExtract0x00() throws Exception { - String[] expectedChildren = new String[]{"dang erous.pl",}; + String[] expectedChildren = new String[] {"dang erous.pl",}; testExtract("testZip_zeroByte.zip", expectedChildren); } - private void testRecursiveUnpack(String targetFile, String[] expectedChildrenFileNames) throws Exception { - testRecursiveUnpack(targetFile, expectedChildrenFileNames, expectedChildrenFileNames.length); + private void testRecursiveUnpack(String targetFile, String[] expectedChildrenFileNames) + throws Exception { + testRecursiveUnpack(targetFile, expectedChildrenFileNames, + expectedChildrenFileNames.length); } - private void testRecursiveUnpack(String targetFile, String[] expectedChildrenFileNames, int expectedLength) throws Exception { + private void testRecursiveUnpack(String targetFile, String[] 
expectedChildrenFileNames, + int expectedLength) throws Exception { Path input = Paths.get(new URI(resourcePrefix + "/" + targetFile)); String[] params = {"-Z", input.toAbsolutePath().toString(), - extractDir.toAbsolutePath().toString()}; + extractDir.toAbsolutePath().toString()}; TikaCLI.main(params); Set fileNames = getFileNames(extractDir); - String[] jsonFile = extractDir - .toFile() - .list(); + String[] jsonFile = extractDir.toFile().list(); assertNotNull(jsonFile); assertEquals(expectedLength, jsonFile.length); @@ -416,44 +417,49 @@ private Set getFileNames(Path extractDir) throws IOException { final Set names = new HashSet<>(); Files.walkFileTree(extractDir, new FileVisitor() { @Override - public @NotNull FileVisitResult preVisitDirectory(Path path, @NotNull BasicFileAttributes basicFileAttributes) throws IOException { + public @NotNull FileVisitResult preVisitDirectory(Path path, + @NotNull BasicFileAttributes basicFileAttributes) throws IOException { return FileVisitResult.CONTINUE; } @Override - public @NotNull FileVisitResult visitFile(Path path, @NotNull BasicFileAttributes basicFileAttributes) throws IOException { + public @NotNull FileVisitResult visitFile(Path path, + @NotNull BasicFileAttributes basicFileAttributes) throws IOException { names.add(extractDir.relativize(path).toString().replace('\\', '/')); return FileVisitResult.CONTINUE; } @Override - public @NotNull FileVisitResult visitFileFailed(Path path, @NotNull IOException e) throws IOException { + public @NotNull FileVisitResult visitFileFailed(Path path, @NotNull IOException e) + throws IOException { return FileVisitResult.CONTINUE; } @Override - public @NotNull FileVisitResult postVisitDirectory(Path path, @Nullable IOException e) throws IOException { + public @NotNull FileVisitResult postVisitDirectory(Path path, @Nullable IOException e) + throws IOException { return FileVisitResult.CONTINUE; } }); return names; } - private void testExtract(String targetFile, String[] 
expectedChildrenFileNames) throws Exception { + private void testExtract(String targetFile, String[] expectedChildrenFileNames) + throws Exception { testExtract(targetFile, expectedChildrenFileNames, expectedChildrenFileNames.length); } - private void testExtract(String targetFile, String[] expectedChildrenFileNames, int expectedLength) throws Exception { + private void testExtract(String targetFile, String[] expectedChildrenFileNames, + int expectedLength) throws Exception { - String[] params = {"--extract-dir=" + ProcessUtils.escapeCommandLine(extractDir - .toAbsolutePath() - .toString()), "-z", resourcePrefix + "/" + targetFile}; + String[] params = { + "--extract-dir=" + ProcessUtils + .escapeCommandLine(extractDir.toAbsolutePath().toString()), + "-z", resourcePrefix + "/" + targetFile}; TikaCLI.main(params); - String[] tempFileNames = extractDir - .toFile() - .list(); + String[] tempFileNames = extractDir.toFile().list(); assertNotNull(tempFileNames); assertEquals(expectedLength, tempFileNames.length); String allFiles = String.join(" : ", tempFileNames); @@ -465,15 +471,14 @@ private void testExtract(String targetFile, String[] expectedChildrenFileNames, @Test public void testExtractTgz() throws Exception { - //TIKA-2564 + // TIKA-2564 - String[] params = {"--extract-dir=" + extractDir.toAbsolutePath(), "-z", resourcePrefix + "/test-documents.tgz"}; + String[] params = {"--extract-dir=" + extractDir.toAbsolutePath(), "-z", + resourcePrefix + "/test-documents.tgz"}; TikaCLI.main(params); - String[] tempFileNames = extractDir - .toFile() - .list(); + String[] tempFileNames = extractDir.toFile().list(); assertNotNull(tempFileNames); String allFiles = String.join(" : ", tempFileNames); @@ -497,8 +502,9 @@ public void testMultiValuedMetadata() throws Exception { public void testZipWithSubdirs() throws Exception { new File("subdir/foo.txt").delete(); new File("subdir").delete(); - String content = getParamOutContent("-z", "--extract-dir=target", resourcePrefix + 
"testWithSubdirs.zip"); - //assertTrue(content.contains("Extracting 'subdir/foo.txt'")); + String content = getParamOutContent("-z", "--extract-dir=target", + resourcePrefix + "testWithSubdirs.zip"); + // assertTrue(content.contains("Extracting 'subdir/foo.txt'")); // clean up. TODO: These should be in target. assertTrue(new File("target/subdir/foo.txt").delete()); assertTrue(new File("target/subdir").delete()); @@ -506,19 +512,18 @@ public void testZipWithSubdirs() throws Exception { @Test public void testExtractInlineImages() throws Exception { - String[] params = {"--extract-dir=" + extractDir.toAbsolutePath(), "-z", resourcePrefix + "/testPDF_childAttachments.pdf"}; + String[] params = {"--extract-dir=" + extractDir.toAbsolutePath(), "-z", + resourcePrefix + "/testPDF_childAttachments.pdf"}; TikaCLI.main(params); - String[] tempFileNames = extractDir - .toFile() - .list(); + String[] tempFileNames = extractDir.toFile().list(); assertNotNull(tempFileNames); String allFiles = String.join(" : ", tempFileNames); Path jpeg = extractDir.resolve("image0.jpg"); - //tiff isn't extracted without optional image dependency -// File tiff = new File(tempFile, "image1.tif"); + // tiff isn't extracted without optional image dependency + // File tiff = new File(tempFile, "image1.tif"); Path jobOptions = extractDir.resolve("Press Quality(1).joboptions.txt"); Path doc = extractDir.resolve("Unit10.doc"); @@ -529,9 +534,9 @@ public void testExtractInlineImages() throws Exception { @Test public void testDefaultConfigException() throws Exception { - //default xml parser will throw TikaException - //this and TestConfig() are broken into separate tests so that - //setUp and tearDown() are called each time + // default xml parser will throw TikaException + // this and TestConfig() are broken into separate tests so that + // setUp and tearDown() are called each time String[] params = {resourcePrefix + "bad_xml.xml"}; boolean tikaEx = false; try { @@ -544,84 +549,105 @@ public void 
testDefaultConfigException() throws Exception { @Test public void testConfig() throws Exception { - String content = getParamOutContent("--config=" + TEST_DATA_FILE.toString() + "/tika-config1.xml", resourcePrefix + "bad_xml.xml"); + String content = getParamOutContent( + "--config=" + TEST_DATA_FILE.toString() + "/tika-config1.xml", + resourcePrefix + "bad_xml.xml"); assertTrue(content.contains("apple")); assertTrue(content.contains("org.apache.tika.parser.html.JSoupParser")); } @Test public void testConfigIgnoreInit() throws Exception { - String content = getParamOutContent("--config=" + TEST_DATA_FILE.toString() + "/TIKA-2389-ignore-init-problems.xml", resourcePrefix + "test_recursive_embedded.docx"); + String content = getParamOutContent( + "--config=" + TEST_DATA_FILE.toString() + + "/TIKA-2389-ignore-init-problems.xml", + resourcePrefix + "test_recursive_embedded.docx"); assertTrue(content.contains("embed_1a")); - //TODO: add a real unit test that configures logging to a file to test that nothing is - //written at the various logging levels + // TODO: add a real unit test that configures logging to a file to test that nothing is + // written at the various logging levels } @Test public void testJsonRecursiveMetadataParserMetadataOnly() throws Exception { - String content = getParamOutContent("-m", "-J", "-r", resourcePrefix + "test_recursive_embedded.docx"); + String content = getParamOutContent("-m", "-J", "-r", + resourcePrefix + "test_recursive_embedded.docx"); assertTrue(content.contains("\"extended-properties:AppVersion\" : \"15.0000\",")); - assertTrue(content.contains("\"extended-properties:Application\" : \"Microsoft Office Word\",")); + assertTrue(content.contains( + "\"extended-properties:Application\" : \"Microsoft Office Word\",")); assertTrue(content.contains("\"X-TIKA:embedded_resource_path\" : \"/embed1.zip\"")); assertFalse(content.contains("X-TIKA:content")); } @Test public void testJsonRecursiveMetadataParserDefault() throws Exception { - 
String content = getParamOutContent("-J", "-r", resourcePrefix + "test_recursive_embedded.docx"); - assertTrue(content.contains("\"X-TIKA:content\" : \"")); - //make sure Executable is there because follow on tests of custom config - //test that it has been turned off. - assertTrue(content.contains("")); + // make sure at least one detector is there + assertTrue(content.contains( + "")); + // make sure Executable is there because follow on tests of custom config + // test that it has been turned off. + assertTrue(content.contains( + "")); content = getParamOutContent("--dump-current-config"); - //make sure at least one detector is there - assertTrue(content.contains("")); - //and at least one parser + // make sure at least one detector is there + assertTrue(content + .contains("")); + // and at least one parser assertTrue(content.contains("")); } @Test public void testConfigSerializationCustomMinimal() throws Exception { - String content = getParamOutContent("--config=" + TEST_DATA_FILE.toString() + "/tika-config2.xml", "--dump-minimal-config").replaceAll("[\r\n\t ]+", " "); - - String expected = - "" + " application/pdf" + " image/jpeg " + - " " + "" + " application/pdf " + ""; + String content = getParamOutContent( + "--config=" + TEST_DATA_FILE.toString() + "/tika-config2.xml", + "--dump-minimal-config").replaceAll("[\r\n\t ]+", " "); + + String expected = "" + + " application/pdf" + + " image/jpeg " + " " + + "" + + " application/pdf " + ""; assertTrue(content.contains(expected)); } @Test public void testConfigSerializationCustomStatic() throws Exception { - String content = getParamOutContent("--config=" + TEST_DATA_FILE.toString() + "/tika-config2.xml", "--dump-static-config"); + String content = getParamOutContent( + "--config=" + TEST_DATA_FILE.toString() + "/tika-config2.xml", + "--dump-static-config"); assertFalse(content.contains("org.apache.tika.parser.executable.Executable")); } /** - * Tests --list-detector option of the cli - * Tests --list-detectors 
option of the cli + * Tests --list-detector option of the cli Tests --list-detectors option of the cli * * @throws Exception */ @@ -635,8 +661,8 @@ public void testListDetectors() throws Exception { } /** - * Tests --list-parser-detail-apt option of the cli - * Tests --list-parser-details-apt option of the cli + * Tests --list-parser-detail-apt option of the cli Tests --list-parser-details-apt option of + * the cli * * @throws Exception */ @@ -650,8 +676,8 @@ public void testListParserDetailApt() throws Exception { } /** - * reset outContent and errContent if they are not empty - * run given params in TikaCLI and return outContent String with UTF-8 + * reset outContent and errContent if they are not empty run given params in TikaCLI and return + * outContent String with UTF-8 */ String getParamOutContent(String... params) throws Exception { resetContent(); diff --git a/tika-bundles/tika-bundle-standard/src/test/java/org/apache/tika/bundle/BundleIT.java b/tika-bundles/tika-bundle-standard/src/test/java/org/apache/tika/bundle/BundleIT.java index 58d3f8aae5..9fb2dd3610 100644 --- a/tika-bundles/tika-bundle-standard/src/test/java/org/apache/tika/bundle/BundleIT.java +++ b/tika-bundles/tika-bundle-standard/src/test/java/org/apache/tika/bundle/BundleIT.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. 
The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.bundle; @@ -26,6 +24,7 @@ import static org.ops4j.pax.exam.CoreOptions.options; import static org.ops4j.pax.exam.CoreOptions.systemPackages; +import jakarta.inject.Inject; import java.io.ByteArrayInputStream; import java.io.File; import java.io.FileInputStream; @@ -41,22 +40,6 @@ import java.util.jar.Attributes; import java.util.jar.JarInputStream; import java.util.jar.Manifest; -import jakarta.inject.Inject; - -import org.junit.Ignore; -import org.junit.Test; -import org.junit.runner.RunWith; -import org.ops4j.pax.exam.Configuration; -import org.ops4j.pax.exam.Option; -import org.ops4j.pax.exam.junit.PaxExam; -import org.ops4j.pax.exam.spi.reactors.ExamReactorStrategy; -import org.ops4j.pax.exam.spi.reactors.PerMethod; -import org.osgi.framework.Bundle; -import org.osgi.framework.BundleContext; -import org.osgi.framework.ServiceReference; -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.Tika; import org.apache.tika.detect.DefaultDetector; import 
org.apache.tika.detect.Detector; @@ -75,6 +58,19 @@ import org.apache.tika.parser.internal.Activator; import org.apache.tika.parser.ocr.TesseractOCRParser; import org.apache.tika.sax.BodyContentHandler; +import org.junit.Ignore; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.ops4j.pax.exam.Configuration; +import org.ops4j.pax.exam.Option; +import org.ops4j.pax.exam.junit.PaxExam; +import org.ops4j.pax.exam.spi.reactors.ExamReactorStrategy; +import org.ops4j.pax.exam.spi.reactors.PerMethod; +import org.osgi.framework.Bundle; +import org.osgi.framework.BundleContext; +import org.osgi.framework.ServiceReference; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; @RunWith(PaxExam.class) @ExamReactorStrategy(PerMethod.class) @@ -95,16 +91,16 @@ public class BundleIT { public Option[] configuration() throws IOException, URISyntaxException, ClassNotFoundException { File base = new File(TARGET, "test-bundles"); return options(systemPackages("javax.xml.bind"), - bundle(new File(base, "tika-core.jar").toURI().toURL().toString()), - //I couldn't find a way to get the build of bundle to work via imports - //for this one - mavenBundle("commons-io", "commons-io", "2.11.0"), - mavenBundle("org.apache.logging.log4j", "log4j-core", "2.17.1"), - mavenBundle("org.apache.logging.log4j", "log4j-api", "2.17.1"), - mavenBundle("org.ops4j.pax.logging", "pax-logging-api", "1.8.5"), - mavenBundle("org.ops4j.pax.logging", "pax-logging-service", "1.8.5"), - junitBundles(), - bundle(new File(base, "tika-bundle-standard.jar").toURI().toURL().toString())); + bundle(new File(base, "tika-core.jar").toURI().toURL().toString()), + // I couldn't find a way to get the build of bundle to work via imports + // for this one + mavenBundle("commons-io", "commons-io", "2.11.0"), + mavenBundle("org.apache.logging.log4j", "log4j-core", "2.17.1"), + mavenBundle("org.apache.logging.log4j", "log4j-api", "2.17.1"), + mavenBundle("org.ops4j.pax.logging", "pax-logging-api", 
"1.8.5"), + mavenBundle("org.ops4j.pax.logging", "pax-logging-service", "1.8.5"), + junitBundles(), bundle(new File(base, "tika-bundle-standard.jar").toURI() + .toURL().toString())); } @Test @@ -158,8 +154,7 @@ public void testBundleDetection() throws Exception { @Test public void testForkParser() throws Exception { try (ForkParser parser = new ForkParser(Activator.class.getClassLoader(), defaultParser)) { - String data = - "\n

test content

"; + String data = "\n

test content

"; InputStream stream = new ByteArrayInputStream(data.getBytes(UTF_8)); Writer writer = new StringWriter(); ContentHandler contentHandler = new BodyContentHandler(writer); @@ -187,11 +182,11 @@ public void testBundleSimpleText() throws Exception { @Test public void testBundleDetectors() throws Exception { - //For some reason, the detector created by OSGi has a flat - //list of detectors, whereas the detector created by the traditional - //service loading method has children: DefaultDetector, MimeTypes. - //We have to flatten the service loaded DefaultDetector to get equivalence. - //Detection behavior should all be the same. + // For some reason, the detector created by OSGi has a flat + // list of detectors, whereas the detector created by the traditional + // service loading method has children: DefaultDetector, MimeTypes. + // We have to flatten the service loaded DefaultDetector to get equivalence. + // Detection behavior should all be the same. // Get the classes found within OSGi ServiceReference detectorRef = bc.getServiceReference(Detector.class); @@ -204,7 +199,7 @@ public void testBundleDetectors() throws Exception { // Check we did get a few, just in case... assertTrue("Should have several Detector names, found " + osgiDetectors.size(), - osgiDetectors.size() > 3); + osgiDetectors.size() > 3); // Get the raw detectors list from the traditional service loading mechanism DefaultDetector detector = new DefaultDetector(); @@ -215,7 +210,7 @@ public void testBundleDetectors() throws Exception { rawDetectors.add(dChild.getClass().getName()); } } else { - //TODO: figure out how to get this loaded correctly from tika-core + // TODO: figure out how to get this loaded correctly from tika-core if (!d.getClass().getName().equals("org.apache.tika.detect.OverrideDetector")) { rawDetectors.add(d.getClass().getName()); } @@ -237,7 +232,7 @@ public void testBundleParsers() throws Exception { // Check we did get a few, just in case... 
assertTrue("Should have lots Parser names, found " + osgiParsers.size(), - osgiParsers.size() > 15); + osgiParsers.size() > 15); // Get the raw parsers list from the traditional service loading mechanism CompositeParser parser = (CompositeParser) defaultParser; @@ -274,8 +269,8 @@ public void testTikaBundle() throws Exception { ParseContext context = new ParseContext(); context.set(Parser.class, parser); - try (InputStream stream = TikaInputStream.get( - Paths.get("src/test/resources/test-documents.zip"))) { + try (InputStream stream = + TikaInputStream.get(Paths.get("src/test/resources/test-documents.zip"))) { parser.parse(stream, handler, new Metadata(), context); } @@ -310,8 +305,8 @@ public void testPoiTikaBundle() throws Exception { ParseContext context = new ParseContext(); context.set(Parser.class, parser); - try (InputStream stream = TikaInputStream.get( - Paths.get("src/test/resources/testPPT.pptx"))) { + try (InputStream stream = + TikaInputStream.get(Paths.get("src/test/resources/testPPT.pptx"))) { parser.parse(stream, handler, new Metadata(), context); } @@ -329,7 +324,7 @@ public void testAll() throws Exception { ParseContext context = new ParseContext(); context.set(Parser.class, parser); Set needToFix = new HashSet<>(); - //needToFix.add("testAccess2_encrypted.accdb"); + // needToFix.add("testAccess2_encrypted.accdb"); System.out.println(getTestDir()); for (File f : getTestDir().listFiles()) { if (f.isDirectory()) { @@ -343,15 +338,15 @@ public void testAll() throws Exception { try (InputStream is = TikaInputStream.get(f.toPath())) { parser.parse(is, handler, metadata, context); } catch (EncryptedDocumentException e) { - //swallow + // swallow } catch (SAXException e) { - //swallow + // swallow } catch (TikaException e) { System.err.println("tika Exception " + f.getName()); e.printStackTrace(); } System.out.println( - Arrays.asList(metadata.getValues(TikaCoreProperties.TIKA_PARSED_BY))); + 
Arrays.asList(metadata.getValues(TikaCoreProperties.TIKA_PARSED_BY))); } } diff --git a/tika-core/src/main/java/org/apache/tika/Tika.java b/tika-core/src/main/java/org/apache/tika/Tika.java index 5d121e6d05..7c58fd2bdb 100644 --- a/tika-core/src/main/java/org/apache/tika/Tika.java +++ b/tika-core/src/main/java/org/apache/tika/Tika.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika; @@ -24,9 +22,6 @@ import java.net.URL; import java.nio.file.Path; import java.util.Properties; - -import org.xml.sax.SAXException; - import org.apache.tika.config.TikaConfig; import org.apache.tika.detect.Detector; import org.apache.tika.exception.TikaException; @@ -41,11 +36,12 @@ import org.apache.tika.parser.ParsingReader; import org.apache.tika.sax.BodyContentHandler; import org.apache.tika.sax.WriteOutContentHandler; +import org.xml.sax.SAXException; /** - * Facade class for accessing Tika functionality. This class hides much of - * the underlying complexity of the lower level Tika classes and provides - * simple methods for many common parsing and type detection operations. + * Facade class for accessing Tika functionality. This class hides much of the underlying complexity + * of the lower level Tika classes and provides simple methods for many common parsing and type + * detection operations. * * @see Parser * @see Detector @@ -69,9 +65,8 @@ public class Tika { private final Translator translator; /** - * Maximum length of the strings returned by the parseToString methods. - * Used to prevent out of memory problems with huge input documents. - * The default setting is 100k characters. + * Maximum length of the strings returned by the parseToString methods. Used to prevent out of + * memory problems with huge input documents. The default setting is 100k characters. */ private int maxStringLength = 100 * 1000; @@ -80,7 +75,7 @@ public class Tika { * Translator. * * @param detector type detector - * @param parser document parser + * @param parser document parser * @since Apache Tika 0.8 */ public Tika(Detector detector, Parser parser) { @@ -92,8 +87,8 @@ public Tika(Detector detector, Parser parser) { /** * Creates a Tika facade using the given detector, parser, and translator instances. 
* - * @param detector type detector - * @param parser document parser + * @param detector type detector + * @param parser document parser * @param translator text translator * @since Apache Tika 1.6 */ @@ -120,8 +115,8 @@ public Tika() { } /** - * Creates a Tika facade using the given detector instance, the - * default parser configuration, and the default Translator. + * Creates a Tika facade using the given detector instance, the default parser configuration, + * and the default Translator. * * @param detector type detector * @since Apache Tika 0.8 @@ -132,23 +127,20 @@ public Tika(Detector detector) { /** - * Detects the media type of the given document. The type detection is - * based on the content of the given document stream and any given - * document metadata. The document stream can be null, - * in which case only the given document metadata is used for type - * detection. + * Detects the media type of the given document. The type detection is based on the content of + * the given document stream and any given document metadata. The document stream can be + * null, in which case only the given document metadata is used for type detection. *

- * If the document stream supports the - * {@link InputStream#markSupported() mark feature}, then the stream is - * marked and reset to the original position before this method returns. - * Only a limited number of bytes are read from the stream. + * If the document stream supports the {@link InputStream#markSupported() mark feature}, then + * the stream is marked and reset to the original position before this method returns. Only a + * limited number of bytes are read from the stream. *

* The given document stream is not closed by this method. *

- * Unlike in the {@link #parse(InputStream, Metadata)} method, the - * given document metadata is not modified by this method. + * Unlike in the {@link #parse(InputStream, Metadata)} method, the given document metadata is + * not modified by this method. * - * @param stream the document stream, or null + * @param stream the document stream, or null * @param metadata document metadata * @return detected media type * @throws IOException if the stream can not be read @@ -162,19 +154,17 @@ public String detect(InputStream stream, Metadata metadata) throws IOException { } /** - * Detects the media type of the given document. The type detection is - * based on the content of the given document stream and the name of the - * document. + * Detects the media type of the given document. The type detection is based on the content of + * the given document stream and the name of the document. *

- * If the document stream supports the - * {@link InputStream#markSupported() mark feature}, then the stream is - * marked and reset to the original position before this method returns. - * Only a limited number of bytes are read from the stream. + * If the document stream supports the {@link InputStream#markSupported() mark feature}, then + * the stream is marked and reset to the original position before this method returns. Only a + * limited number of bytes are read from the stream. *

* The given document stream is not closed by this method. * * @param stream the document stream - * @param name document name + * @param name document name * @return detected media type * @throws IOException if the stream can not be read * @since Apache Tika 0.9 @@ -186,13 +176,12 @@ public String detect(InputStream stream, String name) throws IOException { } /** - * Detects the media type of the given document. The type detection is - * based on the content of the given document stream. + * Detects the media type of the given document. The type detection is based on the content of + * the given document stream. *

- * If the document stream supports the - * {@link InputStream#markSupported() mark feature}, then the stream is - * marked and reset to the original position before this method returns. - * Only a limited number of bytes are read from the stream. + * If the document stream supports the {@link InputStream#markSupported() mark feature}, then + * the stream is marked and reset to the original position before this method returns. Only a + * limited number of bytes are read from the stream. *

* The given document stream is not closed by this method. * @@ -205,16 +194,15 @@ public String detect(InputStream stream) throws IOException { } /** - * Detects the media type of the given document. The type detection is - * based on the first few bytes of a document and the document name. + * Detects the media type of the given document. The type detection is based on the first few + * bytes of a document and the document name. *

- * For best results at least a few kilobytes of the document data - * are needed. See also the other detect() methods for better - * alternatives when you have more than just the document prefix + * For best results at least a few kilobytes of the document data are needed. See also the other + * detect() methods for better alternatives when you have more than just the document prefix * available for type detection. * * @param prefix first few bytes of the document - * @param name document name + * @param name document name * @return detected media type * @since Apache Tika 0.9 */ @@ -229,12 +217,11 @@ public String detect(byte[] prefix, String name) { } /** - * Detects the media type of the given document. The type detection is - * based on the first few bytes of a document. + * Detects the media type of the given document. The type detection is based on the first few + * bytes of a document. *

- * For best results at least a few kilobytes of the document data - * are needed. See also the other detect() methods for better - * alternatives when you have more than just the document prefix + * For best results at least a few kilobytes of the document data are needed. See also the other + * detect() methods for better alternatives when you have more than just the document prefix * available for type detection. * * @param prefix first few bytes of the document @@ -252,12 +239,11 @@ public String detect(byte[] prefix) { } /** - * Detects the media type of the file at the given path. The type - * detection is based on the document content and a potential known - * file extension. + * Detects the media type of the file at the given path. The type detection is based on the + * document content and a potential known file extension. *

- * Use the {@link #detect(String)} method when you want to detect the - * type of the document without actually accessing the file. + * Use the {@link #detect(String)} method when you want to detect the type of the document + * without actually accessing the file. * * @param path the path of the file * @return detected media type @@ -271,11 +257,11 @@ public String detect(Path path) throws IOException { } /** - * Detects the media type of the given file. The type detection is - * based on the document content and a potential known file extension. + * Detects the media type of the given file. The type detection is based on the document content + * and a potential known file extension. *

- * Use the {@link #detect(String)} method when you want to detect the - * type of the document without actually accessing the file. + * Use the {@link #detect(String)} method when you want to detect the type of the document + * without actually accessing the file. * * @param file the file * @return detected media type @@ -284,19 +270,18 @@ public String detect(Path path) throws IOException { */ public String detect(File file) throws IOException { Metadata metadata = new Metadata(); - try (@SuppressWarnings("deprecation") InputStream stream = TikaInputStream - .get(file, metadata)) { + try (@SuppressWarnings("deprecation") + InputStream stream = TikaInputStream.get(file, metadata)) { return detect(stream, metadata); } } /** - * Detects the media type of the resource at the given URL. The type - * detection is based on the document content and a potential known - * file extension included in the URL. + * Detects the media type of the resource at the given URL. The type detection is based on the + * document content and a potential known file extension included in the URL. *

- * Use the {@link #detect(String)} method when you want to detect the - * type of the document without actually accessing the URL. + * Use the {@link #detect(String)} method when you want to detect the type of the document + * without actually accessing the URL. * * @param url the URL of the resource * @return detected media type @@ -310,11 +295,11 @@ public String detect(URL url) throws IOException { } /** - * Detects the media type of a document with the given file name. - * The type detection is based on known file name extensions. + * Detects the media type of a document with the given file name. The type detection is based on + * known file name extensions. *

- * The given name can also be a URL or a full file path. In such cases - * only the file name part of the string is used for type detection. + * The given name can also be a URL or a full file path. In such cases only the file name part + * of the string is used for type detection. * * @param name the file name of the document * @return detected media type @@ -330,11 +315,11 @@ public String detect(String name) { /** * Translate the given text String to and from the given languages. * - * @param text The text to translate. + * @param text The text to translate. * @param sourceLanguage The input text language (for example, "hi"). * @param targetLanguage The desired output language (for example, "fr"). - * @return The translated text. If translation is unavailable (client keys not set), returns - * the same text back. + * @return The translated text. If translation is unavailable (client keys not set), returns the + * same text back. * @see org.apache.tika.language.translate.Translator */ public String translate(String text, String sourceLanguage, String targetLanguage) { @@ -346,13 +331,13 @@ public String translate(String text, String sourceLanguage, String targetLanguag } /** - * Translate the given text String to the given language, attempting to auto-detect the - * source language. + * Translate the given text String to the given language, attempting to auto-detect the source + * language. * - * @param text The text to translate. + * @param text The text to translate. * @param targetLanguage The desired output language (for example, "en"). - * @return The translated text. If translation is unavailable (client keys not set), returns - * the same text back. + * @return The translated text. If translation is unavailable (client keys not set), returns the + * same text back. 
* @see org.apache.tika.language.translate.Translator */ public String translate(String text, String targetLanguage) { @@ -365,16 +350,15 @@ public String translate(String text, String targetLanguage) { /** - * Parses the given document and returns the extracted text content. - * Input metadata like a file name or a content type hint can be passed - * in the given metadata instance. Metadata information extracted from - * the document is returned in that same metadata instance. + * Parses the given document and returns the extracted text content. Input metadata like a file + * name or a content type hint can be passed in the given metadata instance. Metadata + * information extracted from the document is returned in that same metadata instance. *

- * The returned reader will be responsible for closing the given stream. - * The stream and any associated resources will be closed at or before - * the time when the {@link Reader#close()} method is called. + * The returned reader will be responsible for closing the given stream. The stream and any + * associated resources will be closed at or before the time when the {@link Reader#close()} + * method is called. * - * @param stream the document to be parsed + * @param stream the document to be parsed * @param metadata where document's metadata will be populated * @return extracted text content * @throws IOException if the document can not be read or parsed @@ -388,9 +372,9 @@ public Reader parse(InputStream stream, Metadata metadata) throws IOException { /** * Parses the given document and returns the extracted text content. *

- * The returned reader will be responsible for closing the given stream. - * The stream and any associated resources will be closed at or before - * the time when the {@link Reader#close()} method is called. + * The returned reader will be responsible for closing the given stream. The stream and any + * associated resources will be closed at or before the time when the {@link Reader#close()} + * method is called. * * @param stream the document to be parsed * @return extracted text content @@ -403,10 +387,10 @@ public Reader parse(InputStream stream) throws IOException { /** * Parses the file at the given path and returns the extracted text content. *

- * Metadata information extracted from the document is returned in - * the supplied metadata instance. + * Metadata information extracted from the document is returned in the supplied metadata + * instance. * - * @param path the path of the file to be parsed + * @param path the path of the file to be parsed * @param metadata where document's metadata will be populated * @return extracted text content * @throws IOException if the file can not be read or parsed @@ -430,17 +414,18 @@ public Reader parse(Path path) throws IOException { /** * Parses the given file and returns the extracted text content. *

- * Metadata information extracted from the document is returned in - * the supplied metadata instance. + * Metadata information extracted from the document is returned in the supplied metadata + * instance. * - * @param file the file to be parsed + * @param file the file to be parsed * @param metadata where document's metadata will be populated * @return extracted text content * @throws IOException if the file can not be read or parsed * @see #parse(Path) */ public Reader parse(File file, Metadata metadata) throws IOException { - @SuppressWarnings("deprecation") InputStream stream = TikaInputStream.get(file, metadata); + @SuppressWarnings("deprecation") + InputStream stream = TikaInputStream.get(file, metadata); return parse(stream, metadata); } @@ -457,8 +442,7 @@ public Reader parse(File file) throws IOException { } /** - * Parses the resource at the given URL and returns the extracted - * text content. + * Parses the resource at the given URL and returns the extracted text content. * * @param url the URL of the resource to be parsed * @return extracted text content @@ -471,53 +455,48 @@ public Reader parse(URL url) throws IOException { } /** - * Parses the given document and returns the extracted text content. - * The given input stream is closed by this method. + * Parses the given document and returns the extracted text content. The given input stream is + * closed by this method. *

- * To avoid unpredictable excess memory use, the returned string contains - * only up to {@link #getMaxStringLength()} first characters extracted - * from the input document. Use the {@link #setMaxStringLength(int)} - * method to adjust this limitation. + * To avoid unpredictable excess memory use, the returned string contains only up to + * {@link #getMaxStringLength()} first characters extracted from the input document. Use the + * {@link #setMaxStringLength(int)} method to adjust this limitation. *

- * NOTE: Unlike most other Tika methods that take an - * {@link InputStream}, this method will close the given stream for - * you as a convenience. With other methods you are still responsible - * for closing the stream or a wrapper instance returned by Tika. + * NOTE: Unlike most other Tika methods that take an {@link InputStream}, this + * method will close the given stream for you as a convenience. With other methods you are still + * responsible for closing the stream or a wrapper instance returned by Tika. * - * @param stream the document to be parsed + * @param stream the document to be parsed * @param metadata document metadata * @return extracted text content - * @throws IOException if the document can not be read + * @throws IOException if the document can not be read * @throws TikaException if the document can not be parsed */ public String parseToString(InputStream stream, Metadata metadata) - throws IOException, TikaException { + throws IOException, TikaException { return parseToString(stream, metadata, maxStringLength); } /** - * Parses the given document and returns the extracted text content. - * The given input stream is closed by this method. This method lets - * you control the maxStringLength per call. + * Parses the given document and returns the extracted text content. The given input stream is + * closed by this method. This method lets you control the maxStringLength per call. *

- * To avoid unpredictable excess memory use, the returned string contains - * only up to maxLength (parameter) first characters extracted - * from the input document. + * To avoid unpredictable excess memory use, the returned string contains only up to maxLength + * (parameter) first characters extracted from the input document. *

- * NOTE: Unlike most other Tika methods that take an - * {@link InputStream}, this method will close the given stream for - * you as a convenience. With other methods you are still responsible - * for closing the stream or a wrapper instance returned by Tika. + * NOTE: Unlike most other Tika methods that take an {@link InputStream}, this + * method will close the given stream for you as a convenience. With other methods you are still + * responsible for closing the stream or a wrapper instance returned by Tika. * - * @param stream the document to be parsed - * @param metadata document metadata + * @param stream the document to be parsed + * @param metadata document metadata * @param maxLength maximum length of the returned string * @return extracted text content - * @throws IOException if the document can not be read + * @throws IOException if the document can not be read * @throws TikaException if the document can not be parsed */ public String parseToString(InputStream stream, Metadata metadata, int maxLength) - throws IOException, TikaException { + throws IOException, TikaException { WriteOutContentHandler handler = new WriteOutContentHandler(maxLength); ParseContext context = new ParseContext(); context.set(Parser.class, parser); @@ -533,22 +512,20 @@ public String parseToString(InputStream stream, Metadata metadata, int maxLength } /** - * Parses the given document and returns the extracted text content. - * The given input stream is closed by this method. + * Parses the given document and returns the extracted text content. The given input stream is + * closed by this method. *

- * To avoid unpredictable excess memory use, the returned string contains - * only up to {@link #getMaxStringLength()} first characters extracted - * from the input document. Use the {@link #setMaxStringLength(int)} - * method to adjust this limitation. + * To avoid unpredictable excess memory use, the returned string contains only up to + * {@link #getMaxStringLength()} first characters extracted from the input document. Use the + * {@link #setMaxStringLength(int)} method to adjust this limitation. *

- * NOTE: Unlike most other Tika methods that take an - * {@link InputStream}, this method will close the given stream for - * you as a convenience. With other methods you are still responsible - * for closing the stream or a wrapper instance returned by Tika. + * NOTE: Unlike most other Tika methods that take an {@link InputStream}, this + * method will close the given stream for you as a convenience. With other methods you are still + * responsible for closing the stream or a wrapper instance returned by Tika. * * @param stream the document to be parsed * @return extracted text content - * @throws IOException if the document can not be read + * @throws IOException if the document can not be read * @throws TikaException if the document can not be parsed */ public String parseToString(InputStream stream) throws IOException, TikaException { @@ -558,14 +535,13 @@ public String parseToString(InputStream stream) throws IOException, TikaExceptio /** * Parses the file at the given path and returns the extracted text content. *

- * To avoid unpredictable excess memory use, the returned string contains - * only up to {@link #getMaxStringLength()} first characters extracted - * from the input document. Use the {@link #setMaxStringLength(int)} - * method to adjust this limitation. + * To avoid unpredictable excess memory use, the returned string contains only up to + * {@link #getMaxStringLength()} first characters extracted from the input document. Use the + * {@link #setMaxStringLength(int)} method to adjust this limitation. * * @param path the path of the file to be parsed * @return extracted text content - * @throws IOException if the file can not be read + * @throws IOException if the file can not be read * @throws TikaException if the file can not be parsed */ public String parseToString(Path path) throws IOException, TikaException { @@ -577,35 +553,33 @@ public String parseToString(Path path) throws IOException, TikaException { /** * Parses the given file and returns the extracted text content. *

- * To avoid unpredictable excess memory use, the returned string contains - * only up to {@link #getMaxStringLength()} first characters extracted - * from the input document. Use the {@link #setMaxStringLength(int)} - * method to adjust this limitation. + * To avoid unpredictable excess memory use, the returned string contains only up to + * {@link #getMaxStringLength()} first characters extracted from the input document. Use the + * {@link #setMaxStringLength(int)} method to adjust this limitation. * * @param file the file to be parsed * @return extracted text content - * @throws IOException if the file can not be read + * @throws IOException if the file can not be read * @throws TikaException if the file can not be parsed * @see #parseToString(Path) */ public String parseToString(File file) throws IOException, TikaException { Metadata metadata = new Metadata(); - @SuppressWarnings("deprecation") InputStream stream = TikaInputStream.get(file, metadata); + @SuppressWarnings("deprecation") + InputStream stream = TikaInputStream.get(file, metadata); return parseToString(stream, metadata); } /** - * Parses the resource at the given URL and returns the extracted - * text content. + * Parses the resource at the given URL and returns the extracted text content. *

- * To avoid unpredictable excess memory use, the returned string contains - * only up to {@link #getMaxStringLength()} first characters extracted - * from the input document. Use the {@link #setMaxStringLength(int)} - * method to adjust this limitation. + * To avoid unpredictable excess memory use, the returned string contains only up to + * {@link #getMaxStringLength()} first characters extracted from the input document. Use the + * {@link #setMaxStringLength(int)} method to adjust this limitation. * * @param url the URL of the resource to be parsed * @return extracted text content - * @throws IOException if the resource can not be read + * @throws IOException if the resource can not be read * @throws TikaException if the resource can not be parsed */ public String parseToString(URL url) throws IOException, TikaException { @@ -615,8 +589,7 @@ public String parseToString(URL url) throws IOException, TikaException { } /** - * Returns the maximum length of strings returned by the - * parseToString methods. + * Returns the maximum length of strings returned by the parseToString methods. * * @return maximum string length, or -1 if the limit has been disabled * @since Apache Tika 0.7 @@ -626,11 +599,9 @@ public int getMaxStringLength() { } /** - * Sets the maximum length of strings returned by the parseToString - * methods. + * Sets the maximum length of strings returned by the parseToString methods. 
* - * @param maxStringLength maximum string length, - * or -1 to disable this limit + * @param maxStringLength maximum string length, or -1 to disable this limit * @since Apache Tika 0.7 */ public void setMaxStringLength(int maxStringLength) { @@ -667,7 +638,7 @@ public Translator getTranslator() { return translator; } - //--------------------------------------------------------------< Object > + // --------------------------------------------------------------< Object > public String toString() { return getString(); @@ -676,8 +647,8 @@ public String toString() { public static String getString() { String version = null; - try (InputStream stream = Tika.class - .getResourceAsStream("/META-INF/maven/org.apache.tika/tika-core/pom.properties")) { + try (InputStream stream = Tika.class.getResourceAsStream( + "/META-INF/maven/org.apache.tika/tika-core/pom.properties")) { if (stream != null) { Properties properties = new Properties(); properties.load(stream); diff --git a/tika-core/src/main/java/org/apache/tika/concurrent/ConfigurableThreadPoolExecutor.java b/tika-core/src/main/java/org/apache/tika/concurrent/ConfigurableThreadPoolExecutor.java index 1f7c4a0567..943a798279 100644 --- a/tika-core/src/main/java/org/apache/tika/concurrent/ConfigurableThreadPoolExecutor.java +++ b/tika-core/src/main/java/org/apache/tika/concurrent/ConfigurableThreadPoolExecutor.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. 
See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.concurrent; @@ -24,9 +22,9 @@ * @since Apache Tika 1.11 */ public interface ConfigurableThreadPoolExecutor extends ExecutorService { - + public void setMaximumPoolSize(int threads); - + public void setCorePoolSize(int threads); } diff --git a/tika-core/src/main/java/org/apache/tika/concurrent/SimpleThreadPoolExecutor.java b/tika-core/src/main/java/org/apache/tika/concurrent/SimpleThreadPoolExecutor.java index a4385e279e..6e83a139c9 100644 --- a/tika-core/src/main/java/org/apache/tika/concurrent/SimpleThreadPoolExecutor.java +++ b/tika-core/src/main/java/org/apache/tika/concurrent/SimpleThreadPoolExecutor.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.concurrent; @@ -25,11 +23,11 @@ * * @since Apache Tika 1.11 */ -public class SimpleThreadPoolExecutor extends ThreadPoolExecutor implements - ConfigurableThreadPoolExecutor { +public class SimpleThreadPoolExecutor extends ThreadPoolExecutor + implements ConfigurableThreadPoolExecutor { public SimpleThreadPoolExecutor() { super(1, 2, 0L, TimeUnit.SECONDS, new LinkedBlockingQueue<>(), - r -> new Thread(r, "Tika Executor Thread")); + r -> new Thread(r, "Tika Executor Thread")); } } diff --git a/tika-core/src/main/java/org/apache/tika/config/ConfigBase.java b/tika-core/src/main/java/org/apache/tika/config/ConfigBase.java index 405294faed..f14f147a7f 100644 --- a/tika-core/src/main/java/org/apache/tika/config/ConfigBase.java +++ b/tika-core/src/main/java/org/apache/tika/config/ConfigBase.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.config; @@ -29,23 +27,20 @@ import java.util.Locale; import java.util.Map; import java.util.Set; - +import org.apache.tika.exception.TikaConfigException; +import org.apache.tika.exception.TikaException; +import org.apache.tika.utils.XMLReaderUtils; import org.w3c.dom.Element; import org.w3c.dom.NamedNodeMap; import org.w3c.dom.Node; import org.w3c.dom.NodeList; import org.xml.sax.SAXException; -import org.apache.tika.exception.TikaConfigException; -import org.apache.tika.exception.TikaException; -import org.apache.tika.utils.XMLReaderUtils; - public abstract class ConfigBase { - private static Class[] SUPPORTED_PRIMITIVES = - new Class[]{String.class, boolean.class, long.class, int.class, double.class, - float.class}; + private static Class[] SUPPORTED_PRIMITIVES = new Class[] {String.class, boolean.class, + long.class, int.class, double.class, float.class}; /** * Use this to build a single class, where the user specifies the instance class, e.g. 
@@ -57,7 +52,7 @@ public abstract class ConfigBase { * @throws IOException */ protected static T buildSingle(String itemName, Class itemClass, InputStream is) - throws TikaConfigException, IOException { + throws TikaConfigException, IOException { Element properties = null; try { properties = XMLReaderUtils.buildDOM(is).getDocumentElement(); @@ -82,7 +77,7 @@ protected static T buildSingle(String itemName, Class itemClass, InputStr * @throws IOException */ protected static T buildSingle(String itemName, Class itemClass, Element properties, - T defaultValue) throws TikaConfigException, IOException { + T defaultValue) throws TikaConfigException, IOException { NodeList children = properties.getChildNodes(); T toConfigure = null; @@ -94,7 +89,7 @@ protected static T buildSingle(String itemName, Class itemClass, Element if (itemName.equals(child.getLocalName())) { if (toConfigure != null) { throw new TikaConfigException( - "There can only be one " + itemName + " in a config"); + "There can only be one " + itemName + " in a config"); } T item = buildClass(child, itemName, itemClass); setParams(item, child, new HashSet<>()); @@ -112,8 +107,8 @@ protected static T buildSingle(String itemName, Class itemClass, Element /** - * Use this to build a list of components for a composite item (e.g. - * CompositeMetadataFilter, FetcherManager), each with their own configurations + * Use this to build a list of components for a composite item (e.g. CompositeMetadataFilter, + * FetcherManager), each with their own configurations * * @param compositeElementName * @param itemName @@ -122,8 +117,8 @@ protected static T buildSingle(String itemName, Class itemClass, Element * @throws IOException */ protected static P buildComposite(String compositeElementName, Class

compositeClass, - String itemName, Class itemClass, InputStream is) - throws TikaConfigException, IOException { + String itemName, Class itemClass, InputStream is) + throws TikaConfigException, IOException { Element properties = null; try { properties = XMLReaderUtils.buildDOM(is).getDocumentElement(); @@ -133,13 +128,12 @@ protected static P buildComposite(String compositeElementName, Class

c throw new TikaConfigException("problem loading xml to dom", e); } return buildComposite(compositeElementName, compositeClass, itemName, itemClass, - properties); + properties); } protected static P buildComposite(String compositeElementName, Class

compositeClass, - String itemName, Class itemClass, - Element properties) - throws TikaConfigException, IOException { + String itemName, Class itemClass, Element properties) + throws TikaConfigException, IOException { if (!properties.getLocalName().equals("properties")) { throw new TikaConfigException("expect properties as root node"); @@ -159,8 +153,8 @@ protected static P buildComposite(String compositeElementName, Class

c P composite = (P) constructor.newInstance(components); setParams(composite, child, new HashSet<>(), itemName); return composite; - } catch (NoSuchMethodException | InvocationTargetException | - InstantiationException | IllegalAccessException e) { + } catch (NoSuchMethodException | InvocationTargetException | InstantiationException + | IllegalAccessException e) { throw new TikaConfigException("can't build composite class", e); } } @@ -169,8 +163,7 @@ protected static P buildComposite(String compositeElementName, Class

c } private static List loadComposite(Node composite, String itemName, - Class itemClass) - throws TikaConfigException { + Class itemClass) throws TikaConfigException { NodeList children = composite.getChildNodes(); List items = new ArrayList<>(); for (int i = 0; i < children.getLength(); i++) { @@ -188,7 +181,7 @@ private static List loadComposite(Node composite, String itemName, } private static T buildClass(Node node, String elementName, Class itemClass) - throws TikaConfigException { + throws TikaConfigException { String className = itemClass.getName(); Node classNameNode = node.getAttributes().getNamedItem("class"); @@ -198,25 +191,25 @@ private static T buildClass(Node node, String elementName, Class itemClass) try { Class clazz = Class.forName(className); if (!itemClass.isAssignableFrom(clazz)) { - throw new TikaConfigException( - elementName + " with class name " + className + " must be of type '" + - itemClass.getName() + "'"); + throw new TikaConfigException(elementName + " with class name " + className + + " must be of type '" + itemClass.getName() + "'"); } return (T) clazz.getDeclaredConstructor().newInstance(); - } catch (InstantiationException | IllegalAccessException | ClassNotFoundException | - NoSuchMethodException | InvocationTargetException e) { - throw new TikaConfigException("problem loading " + elementName + - " with class " + itemClass.getName(), e); + } catch (InstantiationException | IllegalAccessException | ClassNotFoundException + | NoSuchMethodException | InvocationTargetException e) { + throw new TikaConfigException( + "problem loading " + elementName + " with class " + itemClass.getName(), + e); } } private static void setParams(Object object, Node targetNode, Set settings) - throws TikaConfigException { + throws TikaConfigException { setParams(object, targetNode, settings, null); } private static void setParams(Object object, Node targetNode, Set settings, - String exceptNodeName) throws TikaConfigException { + String exceptNodeName) 
throws TikaConfigException { NodeList children = targetNode.getChildNodes(); List params = new ArrayList<>(); for (int i = 0; i < children.getLength(); i++) { @@ -257,7 +250,7 @@ private static void setParams(Object object, Node targetNode, Set settin if (isPrimitive(setterClassPair.itemClass)) { tryToSetPrimitive(object, setterClassPair, param.getTextContent()); } else { - //tryToSetPrimitive(object, localName, txt); + // tryToSetPrimitive(object, localName, txt); Object item = buildClass(param, itemName, setterClassPair.itemClass); setParams(setterClassPair.itemClass.cast(item), param, new HashSet<>()); try { @@ -296,19 +289,19 @@ private static boolean hasClass(Node param) { } private static SetterClassPair findSetterClassPair(Object object, String itemName) - throws TikaConfigException { + throws TikaConfigException { - //TODO -- we could do more with info from the node -- is it complex, does it have - //a text value, does it have a class, etc... This works for now. - String setter = - "set" + itemName.substring(0, 1).toUpperCase(Locale.US) + itemName.substring(1); + // TODO -- we could do more with info from the node -- is it complex, does it have + // a text value, does it have a class, etc... This works for now. 
+ String setter = "set" + itemName.substring(0, 1).toUpperCase(Locale.US) + + itemName.substring(1); Class itemClass = null; Method setterMethod = null; for (Method method : object.getClass().getMethods()) { if (setter.equals(method.getName())) { Class[] classes = method.getParameterTypes(); if (classes.length == 1) { - //if both setX(String) and setX(Object), prefer setX(String) + // if both setX(String) and setX(Object), prefer setX(String) if (itemClass == null || classes[0].equals(String.class)) { itemClass = classes[0]; setterMethod = method; @@ -319,14 +312,14 @@ private static SetterClassPair findSetterClassPair(Object object, String itemNam if (setterMethod != null && itemClass != null) { return new SetterClassPair(setterMethod, itemClass); } - //now try adders - String adder = - "add" + itemName.substring(0, 1).toUpperCase(Locale.US) + itemName.substring(1); + // now try adders + String adder = "add" + itemName.substring(0, 1).toUpperCase(Locale.US) + + itemName.substring(1); for (Method method : object.getClass().getMethods()) { if (adder.equals(method.getName())) { Class[] classes = method.getParameterTypes(); if (classes.length == 1) { - //if both setX(String) and setX(Object), prefer setX(String) + // if both setX(String) and setX(Object), prefer setX(String) if (itemClass == null || classes[0].equals(String.class)) { itemClass = classes[0]; setterMethod = method; @@ -335,9 +328,8 @@ private static SetterClassPair findSetterClassPair(Object object, String itemNam } } if (setterMethod == null && itemClass == null) { - throw new TikaConfigException( - "Couldn't find setter '" + setter + "' or adder '" + adder + "' for " + itemName + - " of class: " + object.getClass()); + throw new TikaConfigException("Couldn't find setter '" + setter + "' or adder '" + adder + + "' for " + itemName + " of class: " + object.getClass()); } return new SetterClassPair(setterMethod, itemClass); } @@ -368,8 +360,8 @@ private static void tryToSetList(Object object, Node param) 
throws TikaConfigExc private static void tryToSetClassList(Object object, Node node) throws TikaConfigException { String name = node.getLocalName(); try { - Class interfaze = - Class.forName(node.getAttributes().getNamedItem("class").getTextContent()); + Class interfaze = Class + .forName(node.getAttributes().getNamedItem("class").getTextContent()); List items = new ArrayList(); NodeList nodeList = node.getChildNodes(); for (int i = 0; i < nodeList.getLength(); i++) { @@ -385,8 +377,8 @@ private static void tryToSetClassList(Object object, Node node) throws TikaConfi Method m = object.getClass().getMethod(setter, List.class); m.invoke(object, items); - } catch (ClassNotFoundException | InvocationTargetException | NoSuchMethodException | - IllegalAccessException e) { + } catch (ClassNotFoundException | InvocationTargetException | NoSuchMethodException + | IllegalAccessException e) { throw new TikaConfigException("couldn't build class for " + name, e); } } @@ -415,8 +407,8 @@ private static void tryToSetStringList(Object object, Node param) throws TikaCon private static void tryToSetMap(Object object, Node param) throws TikaConfigException { String name = param.getLocalName(); - //only supports string, string at this point - //use LinkedHashMap to keep insertion order! + // only supports string, string at this point + // use LinkedHashMap to keep insertion order! 
Map map = new LinkedHashMap<>(); NodeList nodeList = param.getChildNodes(); for (int i = 0; i < nodeList.getLength(); i++) { @@ -441,12 +433,12 @@ private static void tryToSetMap(Object object, Node param) throws TikaConfigExce value = m.getNamedItem("v").getTextContent(); } if (key == null) { - throw new TikaConfigException( - "must specify a 'key' or 'from' value in a map " + "object : " + param); + throw new TikaConfigException("must specify a 'key' or 'from' value in a map " + + "object : " + param); } if (value == null) { - throw new TikaConfigException( - "must specify a 'value' or 'to' value in a " + "map object : " + param); + throw new TikaConfigException("must specify a 'value' or 'to' value in a " + + "map object : " + param); } map.put(key, value); } @@ -467,11 +459,11 @@ private static boolean isMap(Node param) { Node n = nodeList.item(i); if (n.getNodeType() == 1) { if (n.hasAttributes()) { - if (n.getAttributes().getNamedItem("from") != null && - n.getAttributes().getNamedItem("to") != null) { + if (n.getAttributes().getNamedItem("from") != null + && n.getAttributes().getNamedItem("to") != null) { return true; - } else if (n.getAttributes().getNamedItem("k") != null && - n.getAttributes().getNamedItem("v") != null) { + } else if (n.getAttributes().getNamedItem("k") != null + && n.getAttributes().getNamedItem("v") != null) { return true; } } @@ -481,7 +473,7 @@ private static boolean isMap(Node param) { } private static void tryToSetPrimitive(Object object, SetterClassPair setterClassPair, - String value) throws TikaConfigException { + String value) throws TikaConfigException { try { if (setterClassPair.itemClass == int.class) { setterClassPair.setterMethod.invoke(object, Integer.parseInt(value)); @@ -503,13 +495,12 @@ private static void tryToSetPrimitive(Object object, SetterClassPair setterClass /** - * This should be overridden to do something with the settings - * after loading the object. 
+ * This should be overridden to do something with the settings after loading the object. * * @param settings */ protected void handleSettings(Set settings) { - //no-op + // no-op } /** @@ -522,7 +513,7 @@ protected void handleSettings(Set settings) { * @throws IOException */ protected Set configure(String nodeName, InputStream is) - throws TikaConfigException, IOException { + throws TikaConfigException, IOException { Set settings = new HashSet<>(); Node properties = null; @@ -559,8 +550,8 @@ public SetterClassPair(Method setterMethod, Class itemClass) { @Override public String toString() { - return "SetterClassPair{" + "setterMethod=" + setterMethod + ", itemClass=" + - itemClass + '}'; + return "SetterClassPair{" + "setterMethod=" + setterMethod + ", itemClass=" + itemClass + + '}'; } } } diff --git a/tika-core/src/main/java/org/apache/tika/config/Field.java b/tika-core/src/main/java/org/apache/tika/config/Field.java index 403ad6dcd3..660effd19a 100644 --- a/tika-core/src/main/java/org/apache/tika/config/Field.java +++ b/tika-core/src/main/java/org/apache/tika/config/Field.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.config; @@ -23,9 +21,8 @@ import java.lang.annotation.Target; /** - * Field annotation is a contract for binding {@link Param} value from - * Tika Configuration to an object. - * services + * Field annotation is a contract for binding {@link Param} value from Tika Configuration to an + * object. services * * @since Apache Tika 1.14 */ diff --git a/tika-core/src/main/java/org/apache/tika/config/Initializable.java b/tika-core/src/main/java/org/apache/tika/config/Initializable.java index f37bdd9095..8bca63f5b1 100644 --- a/tika-core/src/main/java/org/apache/tika/config/Initializable.java +++ b/tika-core/src/main/java/org/apache/tika/config/Initializable.java @@ -1,34 +1,30 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.config; import java.util.Map; - import org.apache.tika.exception.TikaConfigException; /** - * Components that must do special processing across multiple fields - * at initialization time should implement this interface. + * Components that must do special processing across multiple fields at initialization time should + * implement this interface. *

- * TikaConfig will call initialize on Initializable classes after - * setting the parameters for non-statically service loaded classes. + * TikaConfig will call initialize on Initializable classes after setting the parameters for + * non-statically service loaded classes. *

- * TikaConfig will call checkInitialization on all Initializables, - * whether loaded statically + * TikaConfig will call checkInitialization on all Initializables, whether loaded statically */ public interface Initializable { @@ -40,10 +36,8 @@ public interface Initializable { /** - * @param problemHandler if there is a problem and no - * custom initializableProblemHandler has been configured - * via Initializable parameters, - * this is called to respond. + * @param problemHandler if there is a problem and no custom initializableProblemHandler has + * been configured via Initializable parameters, this is called to respond. * @throws TikaConfigException */ void checkInitialization(InitializableProblemHandler problemHandler) throws TikaConfigException; diff --git a/tika-core/src/main/java/org/apache/tika/config/InitializableProblemHandler.java b/tika-core/src/main/java/org/apache/tika/config/InitializableProblemHandler.java index fdca6901c9..ca8164fd50 100644 --- a/tika-core/src/main/java/org/apache/tika/config/InitializableProblemHandler.java +++ b/tika-core/src/main/java/org/apache/tika/config/InitializableProblemHandler.java @@ -1,30 +1,27 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.config; -import org.slf4j.LoggerFactory; - import org.apache.tika.exception.TikaConfigException; +import org.slf4j.LoggerFactory; /** - * This is to be used to handle potential recoverable problems that - * might arise during initialization. + * This is to be used to handle potential recoverable problems that might arise during + * initialization. */ public interface InitializableProblemHandler { @@ -33,8 +30,7 @@ public interface InitializableProblemHandler { * Strategy that simply ignores all problems. */ InitializableProblemHandler IGNORE = new InitializableProblemHandler() { - public void handleInitializableProblem(String className, String message) { - } + public void handleInitializableProblem(String className, String message) {} @Override public String toString() { @@ -42,8 +38,8 @@ public String toString() { } }; /** - * Strategy that logs warnings of all problems using a {@link org.slf4j.Logger} - * created using the given class name. + * Strategy that logs warnings of all problems using a {@link org.slf4j.Logger} created using + * the given class name. 
*/ InitializableProblemHandler INFO = new InitializableProblemHandler() { public void handleInitializableProblem(String classname, String message) { @@ -56,8 +52,8 @@ public String toString() { } }; /** - * Strategy that logs warnings of all problems using a {@link org.slf4j.Logger} - * created using the given class name. + * Strategy that logs warnings of all problems using a {@link org.slf4j.Logger} created using + * the given class name. */ InitializableProblemHandler WARN = new InitializableProblemHandler() { public void handleInitializableProblem(String classname, String message) { @@ -71,7 +67,7 @@ public String toString() { }; InitializableProblemHandler THROW = new InitializableProblemHandler() { public void handleInitializableProblem(String classname, String message) - throws TikaConfigException { + throws TikaConfigException { throw new TikaConfigException(message); } diff --git a/tika-core/src/main/java/org/apache/tika/config/LoadErrorHandler.java b/tika-core/src/main/java/org/apache/tika/config/LoadErrorHandler.java index 666c20d23e..912806f26b 100644 --- a/tika-core/src/main/java/org/apache/tika/config/LoadErrorHandler.java +++ b/tika-core/src/main/java/org/apache/tika/config/LoadErrorHandler.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. 
The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.config; @@ -21,9 +19,8 @@ /** - * Interface for error handling strategies in service class loading. - * You can implement this interface for a custom error handling mechanism, - * or use one of the predefined strategies. + * Interface for error handling strategies in service class loading. You can implement this + * interface for a custom error handling mechanism, or use one of the predefined strategies. * * @since Apache Tika 0.9 */ @@ -33,8 +30,7 @@ public interface LoadErrorHandler { * Strategy that simply ignores all problems. */ LoadErrorHandler IGNORE = new LoadErrorHandler() { - public void handleLoadError(String classname, Throwable throwable) { - } + public void handleLoadError(String classname, Throwable throwable) {} @Override public String toString() { @@ -42,8 +38,8 @@ public String toString() { } }; /** - * Strategy that logs warnings of all problems using a {@link org.slf4j.Logger} - * created using the given class name. 
+ * Strategy that logs warnings of all problems using a {@link org.slf4j.Logger} created using + * the given class name. */ LoadErrorHandler WARN = new LoadErrorHandler() { public void handleLoadError(String classname, Throwable throwable) { @@ -56,9 +52,8 @@ public String toString() { } }; /** - * Strategy that throws a {@link RuntimeException} with the given - * throwable as the root cause, thus interrupting the entire service - * loading operation. + * Strategy that throws a {@link RuntimeException} with the given throwable as the root cause, + * thus interrupting the entire service loading operation. */ LoadErrorHandler THROW = new LoadErrorHandler() { public void handleLoadError(String classname, Throwable throwable) { @@ -72,11 +67,10 @@ public String toString() { }; /** - * Handles a problem encountered when trying to load the specified - * service class. The implementation can log or otherwise process - * the given error information. If the method returns normally, then - * the service loader simply skips this class and continues with the - * next one. + * Handles a problem encountered when trying to load the specified service class. The + * implementation can log or otherwise process the given error information. If the method + * returns normally, then the service loader simply skips this class and continues with the next + * one. * * @param classname name of the service class * @param throwable the encountered problem diff --git a/tika-core/src/main/java/org/apache/tika/config/Param.java b/tika-core/src/main/java/org/apache/tika/config/Param.java index 853abb7ec3..e0114c8d12 100644 --- a/tika-core/src/main/java/org/apache/tika/config/Param.java +++ b/tika-core/src/main/java/org/apache/tika/config/Param.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.config; @@ -37,24 +35,21 @@ import javax.xml.transform.TransformerException; import javax.xml.transform.dom.DOMSource; import javax.xml.transform.stream.StreamResult; - +import org.apache.tika.exception.TikaConfigException; +import org.apache.tika.exception.TikaException; +import org.apache.tika.parser.multiple.AbstractMultipleParser; +import org.apache.tika.utils.XMLReaderUtils; import org.w3c.dom.Document; import org.w3c.dom.Element; import org.w3c.dom.Node; import org.w3c.dom.NodeList; import org.xml.sax.SAXException; -import org.apache.tika.exception.TikaConfigException; -import org.apache.tika.exception.TikaException; -import org.apache.tika.parser.multiple.AbstractMultipleParser; -import org.apache.tika.utils.XMLReaderUtils; - /** * This is a serializable model class for parameters from configuration file. * - * @param value type. Should be serializable to string and have a constructor - * with string param + * @param value type. Should be serializable to string and have a constructor with string param * @since Apache Tika 1.14 */ public class Param implements Serializable { @@ -87,15 +82,14 @@ public class Param implements Serializable { wellKnownMap.put("metadataPolicy", AbstractMultipleParser.MetadataPolicy.class); } - //one of these two is used for serialization + // one of these two is used for serialization private final List valueStrings = new ArrayList<>(); private Class type; private String name; private T actualValue; - public Param() { - } + public Param() {} public Param(String name, Class type, T value) { this.name = name; @@ -116,7 +110,7 @@ public Param(String name, T value) { } public static Param load(InputStream stream) - throws SAXException, IOException, TikaException { + throws SAXException, IOException, TikaException { DocumentBuilder db = XMLReaderUtils.getDocumentBuilder(); Document document = db.parse(stream); @@ -152,8 +146,8 @@ public static Param load(Node node) throws TikaConfigException { String type = 
typeAttr.getTextContent(); if ("class".equals(type)) { if (classAttr == null) { - throw new TikaConfigException("must specify a class attribute if " + - "type=\"class\""); + throw new TikaConfigException( + "must specify a class attribute if " + "type=\"class\""); } ret.setType(clazz); } else { @@ -176,7 +170,7 @@ public static Param load(Node node) throws TikaConfigException { } else if (Map.class.isAssignableFrom(ret.type)) { loadMap(ret, node); } else { - //allow the empty string + // allow the empty string String textContent = ""; if (value != null) { textContent = value.getTextContent(); @@ -186,12 +180,14 @@ public static Param load(Node node) throws TikaConfigException { } return ret; } - private static void loadObject(Param ret, Node root, Class clazz) throws TikaConfigException { + + private static void loadObject(Param ret, Node root, Class clazz) + throws TikaConfigException { try { - ret.actualValue = (T)clazz.getDeclaredConstructor().newInstance(); - } catch (InstantiationException | IllegalAccessException | NoSuchMethodException | - InvocationTargetException e) { + ret.actualValue = (T) clazz.getDeclaredConstructor().newInstance(); + } catch (InstantiationException | IllegalAccessException | NoSuchMethodException + | InvocationTargetException e) { throw new TikaConfigException("can't build class: " + clazz, e); } @@ -205,19 +201,20 @@ private static void loadObject(Param ret, Node root, Class clazz) throws Param param = load(params.item(j)); Method method = null; - String methodName = "set" + - param.getName().substring(0,1).toUpperCase(Locale.US) + - param.getName().substring(1); + String methodName = "set" + + param.getName().substring(0, 1).toUpperCase(Locale.US) + + param.getName().substring(1); try { method = ret.actualValue.getClass().getMethod(methodName, - param.getType()); + param.getType()); } catch (NoSuchMethodException e) { throw new TikaConfigException("can't find method: " + methodName, e); } try { method.invoke(ret.actualValue, 
param.getValue()); } catch (IllegalAccessException | InvocationTargetException e) { - throw new TikaConfigException("can't set param value: " + param.getName(), e); + throw new TikaConfigException( + "can't set param value: " + param.getName(), e); } } } @@ -243,10 +240,10 @@ private static void loadMap(Param ret, Node root) throws TikaConfigExcept key = child.getLocalName(); value = child.getTextContent(); } - if (((Map)ret.actualValue).containsKey(key)) { + if (((Map) ret.actualValue).containsKey(key)) { throw new TikaConfigException("Duplicate keys are not allowed: " + key); } - ((Map)ret.actualValue).put(key, value); + ((Map) ret.actualValue).put(key, value); } child = child.getNextSibling(); } @@ -289,7 +286,7 @@ private static T getTypedValue(Class type, String value) { return constructor.newInstance(value); } catch (NoSuchMethodException e) { throw new RuntimeException(type + " doesnt have a constructor that takes String arg", - e); + e); } catch (IllegalAccessException | InstantiationException | InvocationTargetException e) { throw new RuntimeException(e); } @@ -339,8 +336,8 @@ public T getValue() { @Override public String toString() { - return "Param{" + "name='" + name + '\'' + ", valueStrings='" + valueStrings + '\'' + - ", actualValue=" + actualValue + '}'; + return "Param{" + "name='" + name + '\'' + ", valueStrings='" + valueStrings + '\'' + + ", actualValue=" + actualValue + '}'; } public void save(OutputStream stream) throws TransformerException, TikaException { @@ -376,9 +373,9 @@ public void save(Document doc, Node node) { el.appendChild(item); } } else if (Map.class.isAssignableFrom(actualValue.getClass())) { - for (Object key : ((Map)actualValue).keySet()) { + for (Object key : ((Map) actualValue).keySet()) { String keyString = (String) key; - String valueString = (String)((Map)actualValue).get(keyString); + String valueString = (String) ((Map) actualValue).get(keyString); Node item = doc.createElement(keyString); 
item.setTextContent(valueString); el.appendChild(item); diff --git a/tika-core/src/main/java/org/apache/tika/config/ParamField.java b/tika-core/src/main/java/org/apache/tika/config/ParamField.java index 15e977ae6d..0978434f39 100644 --- a/tika-core/src/main/java/org/apache/tika/config/ParamField.java +++ b/tika-core/src/main/java/org/apache/tika/config/ParamField.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.config; @@ -22,12 +20,11 @@ import java.util.HashMap; import java.util.Locale; import java.util.Map; - import org.apache.tika.exception.TikaConfigException; /** - * This class stores metdata for {@link Field} annotation are used to map them - * to {@link Param} at runtime + * This class stores metdata for {@link Field} annotation are used to map them to {@link Param} at + * runtime * * @since Apache Tika 1.14 */ @@ -35,18 +32,19 @@ public class ParamField { public static final String DEFAULT = "#default"; - //NOTE: since (primitive type) is NOT AssignableFrom (BoxedType), + // NOTE: since (primitive type) is NOT AssignableFrom (BoxedType), // we just use boxed type for everything! // Example : short.class.isAssignableFrom(Short.class) ? false - private static final Map, Class> PRIMITIVE_MAP = - new HashMap, Class>() {{ - put(int.class, Integer.class); - put(short.class, Short.class); - put(boolean.class, Boolean.class); - put(long.class, Long.class); - put(float.class, Float.class); - put(double.class, Double.class); - }}; + private static final Map, Class> PRIMITIVE_MAP = new HashMap, Class>() { + { + put(int.class, Integer.class); + put(short.class, Short.class); + put(boolean.class, Boolean.class); + put(long.class, Long.class); + put(float.class, Float.class); + put(double.class, Double.class); + } + }; private final String name; private final Class type; private final boolean required; @@ -94,13 +92,13 @@ public boolean isRequired() { /** * Sets given value to the annotated field of bean * - * @param bean bean with annotation for field + * @param bean bean with annotation for field * @param value value of field - * @throws IllegalAccessException when it occurs + * @throws IllegalAccessException when it occurs * @throws InvocationTargetException when it occurs */ public void assignValue(Object bean, Object value) - throws IllegalAccessException, InvocationTargetException { + throws IllegalAccessException, InvocationTargetException { 
if (field != null) { field.set(bean, value); } else { @@ -117,15 +115,15 @@ private Class retrieveType() throws TikaConfigException { if (params.length != 1) { String msg = "Invalid setter method. Must have one and only one parameter. "; if (setter.getName().startsWith("get")) { - msg += "Perhaps the annotation is misplaced on " + setter.getName() + - " while a set'X' is expected?"; + msg += "Perhaps the annotation is misplaced on " + setter.getName() + + " while a set'X' is expected?"; } throw new TikaConfigException(msg); } type = params[0]; } if (type.isPrimitive() && PRIMITIVE_MAP.containsKey(type)) { - type = PRIMITIVE_MAP.get(type); //primitive types have hard time + type = PRIMITIVE_MAP.get(type); // primitive types have hard time } return type; } @@ -138,8 +136,8 @@ private String retrieveParamName(Field annotation) { } else { String setterName = setter.getName(); if (setterName.startsWith("set") && setterName.length() > 3) { - name = setterName.substring(3, 4).toLowerCase(Locale.ROOT) + - setterName.substring(4); + name = setterName.substring(3, 4).toLowerCase(Locale.ROOT) + + setterName.substring(4); } else { name = setter.getName(); } @@ -152,7 +150,7 @@ private String retrieveParamName(Field annotation) { @Override public String toString() { - return "ParamField{" + "name='" + name + '\'' + ", type=" + type + ", required=" + - required + '}'; + return "ParamField{" + "name='" + name + '\'' + ", type=" + type + ", required=" + required + + '}'; } } diff --git a/tika-core/src/main/java/org/apache/tika/config/ServiceLoader.java b/tika-core/src/main/java/org/apache/tika/config/ServiceLoader.java index acc53ca885..12b57fbc77 100644 --- a/tika-core/src/main/java/org/apache/tika/config/ServiceLoader.java +++ b/tika-core/src/main/java/org/apache/tika/config/ServiceLoader.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.config; @@ -33,7 +31,6 @@ import java.util.Map; import java.util.Set; import java.util.regex.Pattern; - import org.apache.tika.exception.TikaConfigException; import org.apache.tika.utils.ServiceLoaderUtils; @@ -45,17 +42,16 @@ public class ServiceLoader { /** - * The dynamic set of services available in an OSGi environment. - * Managed by the {@link TikaActivator} class and used as an additional - * source of service instances in the {@link #loadServiceProviders(Class)} - * method. 
+ * The dynamic set of services available in an OSGi environment. Managed by the + * {@link TikaActivator} class and used as an additional source of service instances in the + * {@link #loadServiceProviders(Class)} method. */ private static final Map SERVICES = new HashMap<>(); private static final Pattern COMMENT = Pattern.compile("#.*"); private static final Pattern WHITESPACE = Pattern.compile("\\s+"); /** - * The default context class loader to use for all threads, or - * null to automatically select the context class loader. + * The default context class loader to use for all threads, or null to + * automatically select the context class loader. */ private static volatile ClassLoader CONTEXT_CLASS_LOADER = null; private final ClassLoader loader; @@ -64,13 +60,14 @@ public class ServiceLoader { private final boolean dynamic; public ServiceLoader(ClassLoader loader, LoadErrorHandler handler, - InitializableProblemHandler initializableProblemHandler, boolean dynamic) { + InitializableProblemHandler initializableProblemHandler, boolean dynamic) { this.loader = loader; this.handler = handler; this.initializableProblemHandler = initializableProblemHandler; this.dynamic = dynamic; } + public ServiceLoader(ClassLoader loader, LoadErrorHandler handler, boolean dynamic) { this(loader, handler, InitializableProblemHandler.WARN, dynamic); } @@ -80,24 +77,24 @@ public ServiceLoader(ClassLoader loader, LoadErrorHandler handler) { } public ServiceLoader(ClassLoader loader) { - this(loader, - Boolean.getBoolean("org.apache.tika.service.error.warn") ? LoadErrorHandler.WARN : - LoadErrorHandler.IGNORE); + this(loader, Boolean.getBoolean("org.apache.tika.service.error.warn") + ? LoadErrorHandler.WARN + : LoadErrorHandler.IGNORE); } public ServiceLoader() { this(getContextClassLoader(), - Boolean.getBoolean("org.apache.tika.service.error.warn") ? LoadErrorHandler.WARN : - LoadErrorHandler.IGNORE, true); + Boolean.getBoolean("org.apache.tika.service.error.warn") + ? 
LoadErrorHandler.WARN + : LoadErrorHandler.IGNORE, + true); } /** - * Returns the context class loader of the current thread. If such - * a class loader is not available, then the loader of this class or - * finally the system class loader is returned. + * Returns the context class loader of the current thread. If such a class loader is not + * available, then the loader of this class or finally the system class loader is returned. * - * @return context class loader, or null if no loader - * is available + * @return context class loader, or null if no loader is available * @see TIKA-441 */ static ClassLoader getContextClassLoader() { @@ -112,12 +109,11 @@ static ClassLoader getContextClassLoader() { } /** - * Sets the context class loader to use for all threads that access - * this class. Used for example in an OSGi environment to avoid problems - * with the default context class loader. + * Sets the context class loader to use for all threads that access this class. Used for example + * in an OSGi environment to avoid problems with the default context class loader. * - * @param loader default context class loader, - * or null to automatically pick the loader + * @param loader default context class loader, or null to automatically pick the + * loader */ public static void setContextClassLoader(ClassLoader loader) { CONTEXT_CLASS_LOADER = loader; @@ -166,8 +162,7 @@ public InitializableProblemHandler getInitializableProblemHandler() { } /** - * Returns an input stream for reading the specified resource from the - * configured class loader. + * Returns an input stream for reading the specified resource from the configured class loader. * * @param name resource name * @return input stream, or null if the resource was not found @@ -192,24 +187,23 @@ public ClassLoader getLoader() { } /** - * Loads and returns the named service class that's expected to implement - * the given interface. 
+ * Loads and returns the named service class that's expected to implement the given interface. *

* Note that this class does not use the {@link LoadErrorHandler}, a - * {@link ClassNotFoundException} is always returned for unknown - * classes or classes of the wrong type + * {@link ClassNotFoundException} is always returned for unknown classes or classes of the wrong + * type * * @param iface service interface - * @param name service class name + * @param name service class name * @return service class - * @throws ClassNotFoundException if the service class can not be found - * or does not implement the given interface + * @throws ClassNotFoundException if the service class can not be found or does not implement + * the given interface * @see Class#forName(String, boolean, ClassLoader) * @since Apache Tika 1.1 */ @SuppressWarnings("unchecked") public Class getServiceClass(Class iface, String name) - throws ClassNotFoundException { + throws ClassNotFoundException { if (loader == null) { throw new ClassNotFoundException("Service class " + name + " is not available"); } @@ -218,17 +212,15 @@ public Class getServiceClass(Class iface, String name) throw new ClassNotFoundException("Service class " + name + " is an interface"); } else if (!iface.isAssignableFrom(klass)) { throw new ClassNotFoundException( - "Service class " + name + " does not implement " + iface.getName()); + "Service class " + name + " does not implement " + iface.getName()); } else { return (Class) klass; } } /** - * Returns all the available service resources matching the - * given pattern, such as all instances of tika-mimetypes.xml - * on the classpath, or all org.apache.tika.parser.Parser - * service files. + * Returns all the available service resources matching the given pattern, such as all instances + * of tika-mimetypes.xml on the classpath, or all org.apache.tika.parser.Parser service files. 
*/ public Enumeration findServiceResources(String filePattern) { try { @@ -256,7 +248,7 @@ public List loadServiceProviders(Class iface) { List providers = new ArrayList<>(); Set seen = new HashSet<>(); for (T provider : tmp) { - if (! seen.contains(provider.getClass().getCanonicalName())) { + if (!seen.contains(provider.getClass().getCanonicalName())) { providers.add(provider); seen.add(provider.getClass().getCanonicalName()); } @@ -265,9 +257,8 @@ public List loadServiceProviders(Class iface) { } /** - * Returns the available dynamic service providers of the given type. - * The returned list is newly allocated and may be freely modified - * by the caller. + * Returns the available dynamic service providers of the given type. The returned list is newly + * allocated and may be freely modified by the caller. * * @param iface service provider interface * @return dynamic service providers @@ -294,10 +285,9 @@ public List loadDynamicServiceProviders(Class iface) { } /** - * Returns the defined static service providers of the given type, without - * attempting to load them. - * The providers are loaded using the service provider mechanism using - * the configured class loader (if any). + * Returns the defined static service providers of the given type, without attempting to load + * them. The providers are loaded using the service provider mechanism using the configured + * class loader (if any). * * @param iface service provider interface * @return static list of uninitialised service providers @@ -326,19 +316,18 @@ public List loadStaticServiceProviders(Class iface) { } /** - * Returns the available static service providers of the given type. - * The providers are loaded using the service provider mechanism using - * the configured class loader (if any). The returned list is newly - * allocated and may be freely modified by the caller. + * Returns the available static service providers of the given type. 
The providers are loaded + * using the service provider mechanism using the configured class loader (if any). The returned + * list is newly allocated and may be freely modified by the caller. * - * @param iface service provider interface + * @param iface service provider interface * @param excludes -- do not load these classes * @return static service providers * @since Apache Tika 1.2 */ @SuppressWarnings("unchecked") public List loadStaticServiceProviders(Class iface, - Collection> excludes) { + Collection> excludes) { List providers = new ArrayList<>(); if (loader != null) { @@ -359,13 +348,13 @@ public List loadStaticServiceProviders(Class iface, if (instance instanceof Initializable) { ((Initializable) instance).initialize(Collections.EMPTY_MAP); ((Initializable) instance) - .checkInitialization(initializableProblemHandler); + .checkInitialization(initializableProblemHandler); } providers.add(instance); } } else { throw new TikaConfigException( - "Class " + name + " is not of type: " + iface); + "Class " + name + " is not of type: " + iface); } } catch (Throwable t) { handler.handleLoadError(name, t); @@ -376,9 +365,10 @@ public List loadStaticServiceProviders(Class iface, } private void collectServiceClassNames(URL resource, Collection names) - throws IOException { + throws IOException { try (InputStream stream = resource.openStream(); - BufferedReader reader = new BufferedReader(new InputStreamReader(stream, UTF_8))) { + BufferedReader reader = + new BufferedReader(new InputStreamReader(stream, UTF_8))) { String line = reader.readLine(); while (line != null) { line = COMMENT.matcher(line).replaceFirst(""); diff --git a/tika-core/src/main/java/org/apache/tika/config/TikaActivator.java b/tika-core/src/main/java/org/apache/tika/config/TikaActivator.java index e076f1c2dd..85304c441a 100644 --- a/tika-core/src/main/java/org/apache/tika/config/TikaActivator.java +++ b/tika-core/src/main/java/org/apache/tika/config/TikaActivator.java @@ -1,21 +1,21 @@ /* - * 
Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.config; +import org.apache.tika.detect.Detector; +import org.apache.tika.parser.Parser; import org.osgi.framework.BundleActivator; import org.osgi.framework.BundleContext; import org.osgi.framework.Constants; @@ -23,17 +23,13 @@ import org.osgi.util.tracker.ServiceTracker; import org.osgi.util.tracker.ServiceTrackerCustomizer; -import org.apache.tika.detect.Detector; -import org.apache.tika.parser.Parser; - /** - * Bundle activator that adjust the class loading mechanism of the - * {@link ServiceLoader} class to work correctly in an OSGi environment. + * Bundle activator that adjust the class loading mechanism of the {@link ServiceLoader} class to + * work correctly in an OSGi environment. *

- * Note that you should not access this class directly. - * Instead the OSGi environment (if present) will automatically invoke the - * methods of this class based on the Bundle-Activator setting in the bundle - * manifest. + * Note that you should not access this class directly. Instead the OSGi + * environment (if present) will automatically invoke the methods of this class based on the + * Bundle-Activator setting in the bundle manifest. * * @since Apache Tika 0.9 */ @@ -44,7 +40,7 @@ public class TikaActivator implements BundleActivator, ServiceTrackerCustomizer private ServiceTracker parserTracker; private BundleContext bundleContext; - //-----------------------------------------------------< BundleActivator > + // -----------------------------------------------------< BundleActivator > public void start(final BundleContext context) throws Exception { bundleContext = context; @@ -73,8 +69,7 @@ public Object addingService(ServiceReference reference) { return service; } - public void modifiedService(ServiceReference reference, Object service) { - } + public void modifiedService(ServiceReference reference, Object service) {} public void removedService(ServiceReference reference, Object service) { ServiceLoader.removeService(reference); diff --git a/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java b/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java index 14ff21b117..6ef7906068 100644 --- a/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java +++ b/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.config; @@ -41,15 +39,6 @@ import java.util.concurrent.ExecutorService; import java.util.concurrent.atomic.AtomicInteger; import javax.imageio.spi.ServiceRegistry; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.w3c.dom.Document; -import org.w3c.dom.Element; -import org.w3c.dom.Node; -import org.w3c.dom.NodeList; -import org.xml.sax.SAXException; - import org.apache.tika.concurrent.ConfigurableThreadPoolExecutor; import org.apache.tika.concurrent.SimpleThreadPoolExecutor; import org.apache.tika.detect.CompositeDetector; @@ -85,16 +74,23 @@ import org.apache.tika.utils.AnnotationUtils; import org.apache.tika.utils.StringUtils; import org.apache.tika.utils.XMLReaderUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.w3c.dom.Document; +import org.w3c.dom.Element; +import org.w3c.dom.Node; +import org.w3c.dom.NodeList; +import org.xml.sax.SAXException; /** * Parse xml config file. */ public class TikaConfig { - public static int DEFAULT_MAX_JSON_STRING_FIELD_LENGTH = 20_000_000;//jackson's default + public static int DEFAULT_MAX_JSON_STRING_FIELD_LENGTH = 20_000_000;// jackson's default public static String MAX_JSON_STRING_FIELD_LENGTH_ELEMENT_NAME = "maxJsonStringFieldLength"; - //use this to look for unneeded instantiations of TikaConfig + // use this to look for unneeded instantiations of TikaConfig protected static final AtomicInteger TIMES_INSTANTIATED = new AtomicInteger(); private static final Logger LOG = LoggerFactory.getLogger(TikaConfig.class); @@ -114,10 +110,14 @@ public class TikaConfig { static { strategyMap.put("", InitializableProblemHandler.DEFAULT); - strategyMap.put(InitializableProblemHandler.IGNORE.toString(), InitializableProblemHandler.IGNORE); - strategyMap.put(InitializableProblemHandler.INFO.toString(), InitializableProblemHandler.INFO); - strategyMap.put(InitializableProblemHandler.WARN.toString(), InitializableProblemHandler.WARN); - 
strategyMap.put(InitializableProblemHandler.THROW.toString(), InitializableProblemHandler.THROW); + strategyMap.put(InitializableProblemHandler.IGNORE.toString(), + InitializableProblemHandler.IGNORE); + strategyMap.put(InitializableProblemHandler.INFO.toString(), + InitializableProblemHandler.INFO); + strategyMap.put(InitializableProblemHandler.WARN.toString(), + InitializableProblemHandler.WARN); + strategyMap.put(InitializableProblemHandler.THROW.toString(), + InitializableProblemHandler.THROW); } private static int MAX_JSON_STRING_FIELD_LENGTH = DEFAULT_MAX_JSON_STRING_FIELD_LENGTH; @@ -131,26 +131,29 @@ public TikaConfig(Path path) throws TikaException, IOException, SAXException { } public TikaConfig(Path path, ServiceLoader loader) - throws TikaException, IOException, SAXException { + throws TikaException, IOException, SAXException { this(XMLReaderUtils.buildDOM(path), loader); } public TikaConfig(File file) throws TikaException, IOException, SAXException { this(XMLReaderUtils.buildDOM(file.toPath())); } + public TikaConfig(File file, ServiceLoader loader) - throws TikaException, IOException, SAXException { + throws TikaException, IOException, SAXException { this(XMLReaderUtils.buildDOM(file.toPath()), loader); } + public TikaConfig(URL url) throws TikaException, IOException, SAXException { this(url, ServiceLoader.getContextClassLoader()); } + public TikaConfig(URL url, ClassLoader loader) throws TikaException, IOException, SAXException { this(XMLReaderUtils.buildDOM(url.toString()).getDocumentElement(), loader); } public TikaConfig(URL url, ServiceLoader loader) - throws TikaException, IOException, SAXException { + throws TikaException, IOException, SAXException { this(XMLReaderUtils.buildDOM(url.toString()).getDocumentElement(), loader); } @@ -199,15 +202,14 @@ private TikaConfig(Element element, ServiceLoader loader) throws TikaException, } /** - * Creates a Tika configuration from the built-in media type rules - * and all the {@link Parser} 
implementations available through the - * {@link ServiceRegistry service provider mechanism} in the given - * class loader. + * Creates a Tika configuration from the built-in media type rules and all the {@link Parser} + * implementations available through the {@link ServiceRegistry service provider mechanism} in + * the given class loader. * - * @param loader the class loader through which parser implementations - * are loaded, or null for no parsers + * @param loader the class loader through which parser implementations are loaded, or + * null for no parsers * @throws MimeTypeException if the built-in media type rules are broken - * @throws IOException if the built-in media type rules can not be read + * @throws IOException if the built-in media type rules can not be read * @since Apache Tika 0.8 */ public TikaConfig(ClassLoader loader) throws MimeTypeException, IOException { @@ -226,20 +228,22 @@ public TikaConfig(ClassLoader loader) throws MimeTypeException, IOException { } /** - * Creates a default Tika configuration. - * First checks whether an XML config file is specified, either in + * Creates a default Tika configuration. First checks whether an XML config file is specified, + * either in *

    *
  1. System property "tika.config", or
  2. *
  3. Environment variable TIKA_CONFIG
  4. *
- *

If one of these have a value, try to resolve it relative to file - * system or classpath.

- *

If XML config is not specified, initialize from the built-in media - * type rules and all the {@link Parser} implementations available through - * the {@link ServiceRegistry service provider mechanism} in the context - * class loader of the current thread.

+ *

+ * If one of these have a value, try to resolve it relative to file system or classpath. + *

+ *

+ * If XML config is not specified, initialize from the built-in media type rules and all the + * {@link Parser} implementations available through the {@link ServiceRegistry service provider + * mechanism} in the context class loader of the current thread. + *

* - * @throws IOException if the configuration can not be read + * @throws IOException if the configuration can not be read * @throws TikaException if problem with MimeTypes or parsing XML config */ public TikaConfig() throws TikaException, IOException { @@ -283,8 +287,8 @@ public TikaConfig() throws TikaException, IOException { ExecutorServiceXmlLoader executorLoader = new ExecutorServiceXmlLoader(); this.mimeTypes = typesFromDomElement(element); - this.encodingDetector = - encodingDetectorLoader.loadOverall(element, mimeTypes, serviceLoader); + this.encodingDetector = encodingDetectorLoader.loadOverall(element, mimeTypes, + serviceLoader); this.renderer = rendererLoader.loadOverall(element, mimeTypes, serviceLoader); @@ -293,14 +297,14 @@ public TikaConfig() throws TikaException, IOException { this.detector = detectorLoader.loadOverall(element, mimeTypes, serviceLoader); this.translator = translatorLoader.loadOverall(element, mimeTypes, serviceLoader); this.executorService = - executorLoader.loadOverall(element, mimeTypes, serviceLoader); + executorLoader.loadOverall(element, mimeTypes, serviceLoader); this.metadataFilter = MetadataFilter.load(element, true); this.metadataListFilter = MetadataListFilter.load(element, true); this.autoDetectParserConfig = AutoDetectParserConfig.load(element); setMaxJsonStringFieldLength(element); } catch (SAXException e) { throw new TikaException("Specified Tika configuration has syntax errors: " + config, - e); + e); } } TIMES_INSTANTIATED.incrementAndGet(); @@ -309,7 +313,7 @@ public TikaConfig() throws TikaException, IOException { /** * * @return maximum field length when serializing String fields in Tika's metadata or metadata - * list into JSON + * list into JSON */ public static int getMaxJsonStringFieldLength() { return MAX_JSON_STRING_FIELD_LENGTH; @@ -323,8 +327,8 @@ private void setMaxJsonStringFieldLength(Element properties) throws TikaConfigEx try { MAX_JSON_STRING_FIELD_LENGTH = Integer.parseInt(n.getTextContent()); } 
catch (NumberFormatException e) { - throw new TikaConfigException(MAX_JSON_STRING_FIELD_LENGTH_ELEMENT_NAME + " " + - "is not an integer", e); + throw new TikaConfigException(MAX_JSON_STRING_FIELD_LENGTH_ELEMENT_NAME + " " + + "is not an integer", e); } return; } @@ -346,8 +350,9 @@ protected static CompositeEncodingDetector getDefaultEncodingDetector(ServiceLoa protected static CompositeRenderer getDefaultRenderer(ServiceLoader loader) { return new CompositeRenderer(loader); } + private static CompositeParser getDefaultParser(MimeTypes types, ServiceLoader loader, - EncodingDetector encodingDetector, Renderer renderer) { + EncodingDetector encodingDetector, Renderer renderer) { return new DefaultParser(types.getMediaTypeRegistry(), loader, encodingDetector, renderer); } @@ -360,7 +365,7 @@ private static ConfigurableThreadPoolExecutor getDefaultExecutorService() { } private static InputStream getConfigInputStream(String config, ServiceLoader serviceLoader) - throws TikaException, IOException { + throws TikaException, IOException { InputStream stream = null; try { stream = new URI(config).toURL().openStream(); @@ -398,9 +403,9 @@ private static String getText(Node node) { } /** - * Provides a default configuration (TikaConfig). Currently creates a - * new instance each time it's called; we may be able to have it - * return a shared instance once it is completely immutable. + * Provides a default configuration (TikaConfig). Currently creates a new instance each time + * it's called; we may be able to have it return a shared instance once it is completely + * immutable. 
* * @return default configuration */ @@ -426,15 +431,14 @@ private static Element getChild(Element element, String name) { } private static List getTopLevelElementChildren(Element element, String parentName, - String childrenName) - throws TikaException { + String childrenName) throws TikaException { Node parentNode = null; if (parentName != null) { // Should be only zero or one / etc tag NodeList nodes = element.getElementsByTagName(parentName); if (nodes.getLength() > 1) { throw new TikaException( - "Properties may not contain multiple " + parentName + " entries"); + "Properties may not contain multiple " + parentName + " entries"); } else if (nodes.getLength() == 1) { parentNode = nodes.item(0); } @@ -464,7 +468,7 @@ private static List getTopLevelElementChildren(Element element, String } private static MimeTypes typesFromDomElement(Element element) - throws TikaException, IOException { + throws TikaException, IOException { Element mtr = getChild(element, "mimeTypeRepository"); if (mtr != null && mtr.hasAttribute("resource")) { return MimeTypesFactory.create(mtr.getAttribute("resource")); @@ -474,7 +478,7 @@ private static MimeTypes typesFromDomElement(Element element) } private static Set mediaTypesListFromDomElement(Element node, String tag) - throws TikaException { + throws TikaException { Set types = null; NodeList children = node.getChildNodes(); for (int i = 0; i < children.getLength(); i++) { @@ -502,7 +506,7 @@ private static Set mediaTypesListFromDomElement(Element node, String } private static ServiceLoader serviceLoaderFromDomElement(Element element, ClassLoader loader) - throws TikaConfigException { + throws TikaConfigException { Element serviceLoaderElement = getChild(element, "service-loader"); ServiceLoader serviceLoader; @@ -516,14 +520,14 @@ private static ServiceLoader serviceLoaderFromDomElement(Element element, ClassL loadErrorHandler = LoadErrorHandler.IGNORE; } InitializableProblemHandler initializableProblemHandler = - 
getInitializableProblemHandler( - serviceLoaderElement.getAttribute("initializableProblemHandler")); + getInitializableProblemHandler(serviceLoaderElement + .getAttribute("initializableProblemHandler")); if (loader == null) { loader = ServiceLoader.getContextClassLoader(); } serviceLoader = new ServiceLoader(loader, loadErrorHandler, initializableProblemHandler, - dynamic); + dynamic); } else if (loader != null) { serviceLoader = new ServiceLoader(loader); } else { @@ -535,34 +539,36 @@ private static ServiceLoader serviceLoaderFromDomElement(Element element, ClassL /** * Return an InitializableProblemHandler by name. * - * @param initializableProblemHandler can be empty, 'ignore', 'info', 'warn' or 'throw', but never null. + * @param initializableProblemHandler can be empty, 'ignore', 'info', 'warn' or 'throw', but + * never null. * @return an InitializableProblemHandler * @throws TikaConfigException if invalid name */ private static InitializableProblemHandler getInitializableProblemHandler( - String initializableProblemHandler) throws TikaConfigException { - InitializableProblemHandler handler = strategyMap.get(initializableProblemHandler.toUpperCase(Locale.US)); + String initializableProblemHandler) throws TikaConfigException { + InitializableProblemHandler handler = + strategyMap.get(initializableProblemHandler.toUpperCase(Locale.US)); if (handler != null) { return handler; } throw new TikaConfigException(String.format(Locale.US, - "Couldn't parse non-null '%s'. Must be one of 'ignore', 'info', 'warn' or 'throw'", - initializableProblemHandler)); + "Couldn't parse non-null '%s'. 
Must be one of 'ignore', 'info', 'warn' or 'throw'", + initializableProblemHandler)); } public static void mustNotBeEmpty(String paramName, String paramValue) - throws TikaConfigException { + throws TikaConfigException { if (StringUtils.isBlank(paramValue)) { throw new IllegalArgumentException( - "parameter '" + paramName + "' must be set in the config file"); + "parameter '" + paramName + "' must be set in the config file"); } } public static void mustNotBeEmpty(String paramName, Path paramValue) - throws TikaConfigException { + throws TikaConfigException { if (paramValue == null) { throw new IllegalArgumentException( - "parameter '" + paramName + "' must be set in the config file"); + "parameter '" + paramName + "' must be set in the config file"); } } @@ -574,7 +580,8 @@ private void updateXMLReaderUtils(Element element) throws TikaException { } if (child.hasAttribute("maxEntityExpansions")) { - XMLReaderUtils.setMaxEntityExpansions(Integer.parseInt(child.getAttribute("maxEntityExpansions"))); + XMLReaderUtils.setMaxEntityExpansions( + Integer.parseInt(child.getAttribute("maxEntityExpansions"))); } if (child.hasAttribute("maxNumReuses")) { @@ -668,29 +675,30 @@ private static abstract class XmlLoader { abstract boolean isComposite(Class loadedClass); abstract T preLoadOne(Class loadedClass, String classname, MimeTypes mimeTypes) - throws TikaException; + throws TikaException; abstract CT createDefault(MimeTypes mimeTypes, ServiceLoader loader); abstract CT createComposite(List loaded, MimeTypes mimeTypes, ServiceLoader loader); abstract T createComposite(Class compositeClass, List children, - Set> excludeChildren, - Map params, MimeTypes mimeTypes, - ServiceLoader loader) - throws InvocationTargetException, IllegalAccessException, InstantiationException; + Set> excludeChildren, Map params, + MimeTypes mimeTypes, ServiceLoader loader) throws InvocationTargetException, + IllegalAccessException, InstantiationException; - abstract T decorate(T created, Element 
element) - throws IOException, TikaException; // eg explicit mime types + abstract T decorate(T created, Element element) throws IOException, TikaException; // eg + // explicit + // mime + // types @SuppressWarnings("unchecked") CT loadOverall(Element element, MimeTypes mimeTypes, ServiceLoader loader) - throws TikaException, IOException { + throws TikaException, IOException { List loaded = new ArrayList<>(); // Find the children of the parent tag, if any for (Element le : getTopLevelElementChildren(element, getParentTagName(), - getLoaderTagName())) { + getLoaderTagName())) { T loadedChild = loadOne(le, mimeTypes, loader); if (loadedChild != null) { loaded.add(loadedChild); @@ -708,16 +716,15 @@ CT loadOverall(Element element, MimeTypes mimeTypes, ServiceLoader loader) return (CT) single; } } else if (!supportsComposite()) { - throw new TikaConfigException( - "Composite not supported for " + getParentTagName() + - ". Must specify only one child!"); + throw new TikaConfigException("Composite not supported for " + getParentTagName() + + ". 
Must specify only one child!"); } // Wrap the defined parsers/detectors up in a Composite return createComposite(loaded, mimeTypes, loader); } T loadOne(Element element, MimeTypes mimeTypes, ServiceLoader loader) - throws TikaException, IOException { + throws TikaException, IOException { String name = element.getAttribute("class"); if (name.isBlank()) { throw new TikaConfigException("class attribute must not be empty: " + element); @@ -736,7 +743,7 @@ T loadOne(Element element, MimeTypes mimeTypes, ServiceLoader loader) Class loadedClass = loader.getServiceClass(getLoaderClass(), name); // Do pre-load checks and short-circuits - //TODO : allow duplicate instances with different configurations + // TODO : allow duplicate instances with different configurations loaded = preLoadOne(loadedClass, name, mimeTypes); if (loaded != null) { return loaded; @@ -757,8 +764,8 @@ T loadOne(Element element, MimeTypes mimeTypes, ServiceLoader loader) NodeList childNodes = element.getElementsByTagName(getLoaderTagName()); if (childNodes.getLength() > 0) { for (int i = 0; i < childNodes.getLength(); i++) { - T loadedChild = - loadOne((Element) childNodes.item(i), mimeTypes, loader); + T loadedChild = loadOne((Element) childNodes.item(i), mimeTypes, + loader); if (loadedChild != null) { children.add(loadedChild); } @@ -768,25 +775,25 @@ T loadOne(Element element, MimeTypes mimeTypes, ServiceLoader loader) // Get the list of children to exclude Set> excludeChildren = new HashSet<>(); NodeList excludeChildNodes = - element.getElementsByTagName(getLoaderTagName() + "-exclude"); + element.getElementsByTagName(getLoaderTagName() + "-exclude"); if (excludeChildNodes.getLength() > 0) { for (int i = 0; i < excludeChildNodes.getLength(); i++) { Element excl = (Element) excludeChildNodes.item(i); String exclName = excl.getAttribute("class"); try { - excludeChildren - .add(loader.getServiceClass(getLoaderClass(), exclName)); + excludeChildren.add( + loader.getServiceClass(getLoaderClass(), 
exclName)); } catch (ClassNotFoundException e) { - //TIKA-3268 -- This should stop the world. + // TIKA-3268 -- This should stop the world. throw new TikaConfigException( - "Class not found in -exclude list: " + exclName); + "Class not found in -exclude list: " + exclName); } } } // Create the Composite loaded = createComposite(loadedClass, children, excludeChildren, params, - mimeTypes, loader); + mimeTypes, loader); // Default constructor fallback if (loaded == null) { @@ -799,7 +806,7 @@ T loadOne(Element element, MimeTypes mimeTypes, ServiceLoader loader) // See the thread "Configuring parsers and translators" for details } - //Assigning the params to bean fields/setters + // Assigning the params to bean fields/setters AnnotationUtils.assignFieldParams(loaded, params); if (loaded instanceof Initializable) { ((Initializable) loaded).initialize(params); @@ -813,31 +820,31 @@ T loadOne(Element element, MimeTypes mimeTypes, ServiceLoader loader) if (loader.getLoadErrorHandler() == LoadErrorHandler.THROW) { // Use a different exception signature here throw new TikaConfigException( - "Unable to find a " + getLoaderTagName() + " class: " + name, e); + "Unable to find a " + getLoaderTagName() + " class: " + name, + e); } // Report the problem loader.getLoadErrorHandler().handleLoadError(name, e); return null; } catch (IllegalAccessException e) { throw new TikaException( - "Unable to access a " + getLoaderTagName() + " class: " + name, e); + "Unable to access a " + getLoaderTagName() + " class: " + name, e); } catch (InvocationTargetException e) { throw new TikaException( - "Unable to create a " + getLoaderTagName() + " class: " + name, e); + "Unable to create a " + getLoaderTagName() + " class: " + name, e); } catch (InstantiationException e) { throw new TikaException( - "Unable to instantiate a " + getLoaderTagName() + " class: " + name, e); + "Unable to instantiate a " + getLoaderTagName() + " class: " + name, + e); } catch (NoSuchMethodException e) { - throw new 
TikaException( - "Unable to find the right constructor for " + getLoaderTagName() + - " class: " + name, e); + throw new TikaException("Unable to find the right constructor for " + + getLoaderTagName() + " class: " + name, e); } } - T newInstance(Class loadedClass) - throws IllegalAccessException, InstantiationException, NoSuchMethodException, - InvocationTargetException { + T newInstance(Class loadedClass) throws IllegalAccessException, + InstantiationException, NoSuchMethodException, InvocationTargetException { return loadedClass.getDeclaredConstructor().newInstance(); } @@ -850,8 +857,8 @@ T newInstance(Class loadedClass) Map getParams(Element el) throws TikaException { Map params = new HashMap<>(); for (Node child = el.getFirstChild(); child != null; child = child.getNextSibling()) { - if (PARAMS_TAG_NAME.equals(child.getNodeName())) { //found the node - if (child.hasChildNodes()) { //it has children + if (PARAMS_TAG_NAME.equals(child.getNodeName())) { // found the node + if (child.hasChildNodes()) { // it has children NodeList childNodes = child.getChildNodes(); for (int i = 0; i < childNodes.getLength(); i++) { Node item = childNodes.item(i); @@ -861,7 +868,7 @@ Map getParams(Element el) throws TikaException { } } } - break; //only the first one is used + break; // only the first one is used } } return params; @@ -898,12 +905,12 @@ Class getLoaderClass() { @Override Parser preLoadOne(Class loadedClass, String classname, - MimeTypes mimeTypes) throws TikaException { + MimeTypes mimeTypes) throws TikaException { // Check for classes which can't be set in config if (AutoDetectParser.class.isAssignableFrom(loadedClass)) { // https://issues.apache.org/jira/browse/TIKA-866 - throw new TikaException("AutoDetectParser not supported in a " + - " configuration element: " + classname); + throw new TikaException("AutoDetectParser not supported in a " + + " configuration element: " + classname); } // Continue with normal loading return null; @@ -916,9 +923,9 @@ boolean 
isComposite(Parser loaded) { @Override boolean isComposite(Class loadedClass) { - return CompositeParser.class.isAssignableFrom(loadedClass) || - AbstractMultipleParser.class.isAssignableFrom(loadedClass) || - ParserDecorator.class.isAssignableFrom(loadedClass); + return CompositeParser.class.isAssignableFrom(loadedClass) + || AbstractMultipleParser.class.isAssignableFrom(loadedClass) + || ParserDecorator.class.isAssignableFrom(loadedClass); } @Override @@ -928,16 +935,16 @@ CompositeParser createDefault(MimeTypes mimeTypes, ServiceLoader loader) { @Override CompositeParser createComposite(List parsers, MimeTypes mimeTypes, - ServiceLoader loader) { + ServiceLoader loader) { MediaTypeRegistry registry = mimeTypes.getMediaTypeRegistry(); return new CompositeParser(registry, parsers); } @Override Parser createComposite(Class parserClass, List childParsers, - Set> excludeParsers, - Map params, MimeTypes mimeTypes, ServiceLoader loader) - throws InvocationTargetException, IllegalAccessException, InstantiationException { + Set> excludeParsers, Map params, + MimeTypes mimeTypes, ServiceLoader loader) throws InvocationTargetException, + IllegalAccessException, InstantiationException { Parser parser = null; Constructor c; MediaTypeRegistry registry = mimeTypes.getMediaTypeRegistry(); @@ -945,45 +952,46 @@ Parser createComposite(Class parserClass, List childPa // Try the possible default and composite parser constructors try { c = parserClass.getConstructor(MediaTypeRegistry.class, ServiceLoader.class, - Collection.class, EncodingDetector.class, Renderer.class); - parser = c.newInstance(registry, loader, excludeParsers, encodingDetector, renderer); + Collection.class, EncodingDetector.class, Renderer.class); + parser = c.newInstance(registry, loader, excludeParsers, encodingDetector, + renderer); } catch (NoSuchMethodException me) { - //swallow + // swallow } if (parser == null) { try { c = parserClass.getConstructor(MediaTypeRegistry.class, ServiceLoader.class, - 
Collection.class, EncodingDetector.class); + Collection.class, EncodingDetector.class); parser = c.newInstance(registry, loader, excludeParsers, encodingDetector); } catch (NoSuchMethodException me) { - //swallow + // swallow } } if (parser == null) { try { c = parserClass.getConstructor(MediaTypeRegistry.class, ServiceLoader.class, - Collection.class); + Collection.class); parser = c.newInstance(registry, loader, excludeParsers); } catch (NoSuchMethodException me) { - //swallow + // swallow } } if (parser == null) { try { - c = parserClass - .getConstructor(MediaTypeRegistry.class, List.class, Collection.class); + c = parserClass.getConstructor(MediaTypeRegistry.class, List.class, + Collection.class); parser = c.newInstance(registry, childParsers, excludeParsers); } catch (NoSuchMethodException me) { - //swallow + // swallow } } if (parser == null) { try { - c = parserClass - .getConstructor(MediaTypeRegistry.class, Collection.class, Map.class); + c = parserClass.getConstructor(MediaTypeRegistry.class, Collection.class, + Map.class); parser = c.newInstance(registry, childParsers, params); } catch (NoSuchMethodException me) { - //swallow + // swallow } } if (parser == null) { @@ -991,7 +999,7 @@ Parser createComposite(Class parserClass, List childPa c = parserClass.getConstructor(MediaTypeRegistry.class, List.class); parser = c.newInstance(registry, childParsers); } catch (NoSuchMethodException me) { - //swallow + // swallow } } @@ -999,8 +1007,8 @@ Parser createComposite(Class parserClass, List childPa if (parser == null && ParserDecorator.class.isAssignableFrom(parserClass)) { try { CompositeParser cp; - if (childParsers.size() == 1 && excludeParsers.isEmpty() && - childParsers.get(0) instanceof CompositeParser) { + if (childParsers.size() == 1 && excludeParsers.isEmpty() + && childParsers.get(0) instanceof CompositeParser) { cp = (CompositeParser) childParsers.get(0); } else { cp = new CompositeParser(registry, childParsers, excludeParsers); @@ -1008,16 +1016,15 
@@ Parser createComposite(Class parserClass, List childPa c = parserClass.getConstructor(Parser.class); parser = c.newInstance(cp); } catch (NoSuchMethodException me) { - //swallow + // swallow } } return parser; } @Override - Parser newInstance(Class loadedClass) - throws IllegalAccessException, InstantiationException, NoSuchMethodException, - InvocationTargetException { + Parser newInstance(Class loadedClass) throws IllegalAccessException, + InstantiationException, NoSuchMethodException, InvocationTargetException { Parser parser; if (AbstractEncodingDetectorParser.class.isAssignableFrom(loadedClass)) { Constructor ctor = loadedClass.getConstructor(EncodingDetector.class); @@ -1027,7 +1034,7 @@ Parser newInstance(Class loadedClass) } if (parser instanceof RenderingParser) { - ((RenderingParser)parser).setRenderer(renderer); + ((RenderingParser) parser).setRenderer(renderer); } return parser; } @@ -1073,9 +1080,9 @@ Class getLoaderClass() { @Override Detector preLoadOne(Class loadedClass, String classname, - MimeTypes mimeTypes) { + MimeTypes mimeTypes) { // If they asked for the mime types as a detector, give - // them the one we've already created. TIKA-1708 + // them the one we've already created. 
TIKA-1708 if (MimeTypes.class.equals(loadedClass)) { return mimeTypes; } @@ -1100,18 +1107,17 @@ CompositeDetector createDefault(MimeTypes mimeTypes, ServiceLoader loader) { @Override CompositeDetector createComposite(List detectors, MimeTypes mimeTypes, - ServiceLoader loader) { + ServiceLoader loader) { MediaTypeRegistry registry = mimeTypes.getMediaTypeRegistry(); return new CompositeDetector(registry, detectors); } @Override Detector createComposite(Class detectorClass, - List childDetectors, - Set> excludeDetectors, - Map params, MimeTypes mimeTypes, - ServiceLoader loader) - throws InvocationTargetException, IllegalAccessException, InstantiationException { + List childDetectors, + Set> excludeDetectors, Map params, + MimeTypes mimeTypes, ServiceLoader loader) throws InvocationTargetException, + IllegalAccessException, InstantiationException { Detector detector = null; Constructor c; MediaTypeRegistry registry = mimeTypes.getMediaTypeRegistry(); @@ -1119,20 +1125,20 @@ Detector createComposite(Class detectorClass, // Try the possible default and composite detector constructors if (detector == null) { try { - c = detectorClass - .getConstructor(MimeTypes.class, ServiceLoader.class, Collection.class); + c = detectorClass.getConstructor(MimeTypes.class, ServiceLoader.class, + Collection.class); detector = c.newInstance(mimeTypes, loader, excludeDetectors); } catch (NoSuchMethodException me) { - //swallow + // swallow } } if (detector == null) { try { - c = detectorClass - .getConstructor(MediaTypeRegistry.class, List.class, Collection.class); + c = detectorClass.getConstructor(MediaTypeRegistry.class, List.class, + Collection.class); detector = c.newInstance(registry, childDetectors, excludeDetectors); } catch (NoSuchMethodException me) { - //swallow + // swallow } } if (detector == null) { @@ -1140,7 +1146,7 @@ Detector createComposite(Class detectorClass, c = detectorClass.getConstructor(MediaTypeRegistry.class, List.class); detector = c.newInstance(registry, 
childDetectors); } catch (NoSuchMethodException me) { - //swallow + // swallow } } if (detector == null) { @@ -1148,7 +1154,7 @@ Detector createComposite(Class detectorClass, c = detectorClass.getConstructor(List.class); detector = c.newInstance(childDetectors); } catch (NoSuchMethodException me) { - //swallow + // swallow } } @@ -1181,7 +1187,7 @@ Class getLoaderClass() { @Override Translator preLoadOne(Class loadedClass, String classname, - MimeTypes mimeTypes) { + MimeTypes mimeTypes) { // Continue with normal loading return null; } @@ -1203,17 +1209,15 @@ Translator createDefault(MimeTypes mimeTypes, ServiceLoader loader) { @Override Translator createComposite(List loaded, MimeTypes mimeTypes, - ServiceLoader loader) { + ServiceLoader loader) { return loaded.get(0); } @Override Translator createComposite(Class compositeClass, - List children, - Set> excludeChildren, - Map params, MimeTypes mimeTypes, - ServiceLoader loader) - throws InstantiationException { + List children, Set> excludeChildren, + Map params, MimeTypes mimeTypes, ServiceLoader loader) + throws InstantiationException { throw new InstantiationException("Only one translator supported"); } @@ -1223,21 +1227,21 @@ Translator decorate(Translator created, Element element) { } } - private static class ExecutorServiceXmlLoader - extends XmlLoader { + private static class ExecutorServiceXmlLoader extends + XmlLoader { @Override ConfigurableThreadPoolExecutor createComposite( - Class compositeClass, - List children, - Set> excludeChildren, - Map params, MimeTypes mimeTypes, ServiceLoader loader) - throws InstantiationException { + Class compositeClass, + List children, + Set> excludeChildren, + Map params, MimeTypes mimeTypes, ServiceLoader loader) + throws InstantiationException { throw new InstantiationException("Only one executor service supported"); } @Override ConfigurableThreadPoolExecutor createComposite(List loaded, - MimeTypes mimeTypes, ServiceLoader loader) { + MimeTypes mimeTypes, 
ServiceLoader loader) { return loaded.get(0); } @@ -1248,7 +1252,7 @@ ConfigurableThreadPoolExecutor createDefault(MimeTypes mimeTypes, ServiceLoader @Override ConfigurableThreadPoolExecutor decorate(ConfigurableThreadPoolExecutor created, - Element element) { + Element element) { Element maxThreadElement = getChild(element, "max-threads"); if (maxThreadElement != null) { @@ -1269,8 +1273,7 @@ Class getLoaderClass() { @Override ConfigurableThreadPoolExecutor loadOne(Element element, MimeTypes mimeTypes, - ServiceLoader loader) - throws TikaException, IOException { + ServiceLoader loader) throws TikaException, IOException { return super.loadOne(element, mimeTypes, loader); } @@ -1301,14 +1304,14 @@ boolean isComposite(Class loadedClass) @Override ConfigurableThreadPoolExecutor preLoadOne( - Class loadedClass, String classname, - MimeTypes mimeTypes) { + Class loadedClass, + String classname, MimeTypes mimeTypes) { return null; } } private static class EncodingDetectorXmlLoader - extends XmlLoader { + extends XmlLoader { boolean supportsComposite() { return true; @@ -1340,7 +1343,7 @@ boolean isComposite(Class loadedClass) { @Override EncodingDetector preLoadOne(Class loadedClass, String classname, - MimeTypes mimeTypes) { + MimeTypes mimeTypes) { // Check for classes which can't be set in config // Continue with normal loading return null; @@ -1353,17 +1356,17 @@ EncodingDetector createDefault(MimeTypes mimeTypes, ServiceLoader loader) { @Override CompositeEncodingDetector createComposite(List encodingDetectors, - MimeTypes mimeTypes, ServiceLoader loader) { + MimeTypes mimeTypes, ServiceLoader loader) { return new CompositeEncodingDetector(encodingDetectors); } @Override EncodingDetector createComposite(Class encodingDetectorClass, - List childEncodingDetectors, - Set> excludeDetectors, - Map params, MimeTypes mimeTypes, - ServiceLoader loader) - throws InvocationTargetException, IllegalAccessException, InstantiationException { + List childEncodingDetectors, + Set> 
excludeDetectors, + Map params, MimeTypes mimeTypes, ServiceLoader loader) + throws InvocationTargetException, IllegalAccessException, + InstantiationException { EncodingDetector encodingDetector = null; Constructor c; @@ -1374,7 +1377,7 @@ EncodingDetector createComposite(Class encodingDetec encodingDetector = c.newInstance(loader, excludeDetectors); } catch (NoSuchMethodException me) { LOG.debug("couldn't find constructor for service loader + collection for {}", - encodingDetectorClass); + encodingDetectorClass); } } if (encodingDetector == null) { @@ -1383,7 +1386,7 @@ EncodingDetector createComposite(Class encodingDetec encodingDetector = c.newInstance(childEncodingDetectors); } catch (NoSuchMethodException me) { LOG.debug("couldn't find constructor for EncodingDetector(List) for {}", - encodingDetectorClass); + encodingDetectorClass); } } @@ -1396,8 +1399,7 @@ EncodingDetector decorate(EncodingDetector created, Element element) { } } - private static class RendererXmlLoader - extends XmlLoader { + private static class RendererXmlLoader extends XmlLoader { boolean supportsComposite() { return true; @@ -1429,7 +1431,7 @@ boolean isComposite(Class loadedClass) { @Override Renderer preLoadOne(Class loadedClass, String classname, - MimeTypes mimeTypes) { + MimeTypes mimeTypes) { // Check for classes which can't be set in config // Continue with normal loading return null; @@ -1441,18 +1443,17 @@ Renderer createDefault(MimeTypes mimeTypes, ServiceLoader loader) { } @Override - Renderer createComposite(List renderers, - MimeTypes mimeTypes, ServiceLoader loader) { + Renderer createComposite(List renderers, MimeTypes mimeTypes, + ServiceLoader loader) { return new CompositeRenderer(renderers); } @Override Renderer createComposite(Class rendererClass, - List childRenderers, - Set> excludeRenderers, - Map params, MimeTypes mimeTypes, - ServiceLoader loader) - throws InvocationTargetException, IllegalAccessException, InstantiationException { + List childRenderers, + Set> 
excludeRenderers, Map params, + MimeTypes mimeTypes, ServiceLoader loader) throws InvocationTargetException, + IllegalAccessException, InstantiationException { Renderer renderer = null; Constructor c; @@ -1463,7 +1464,7 @@ Renderer createComposite(Class rendererClass, renderer = c.newInstance(loader, excludeRenderers); } catch (NoSuchMethodException me) { LOG.debug("couldn't find constructor for service loader + collection for {}", - renderer); + renderer); } } if (renderer == null) { @@ -1471,8 +1472,7 @@ Renderer createComposite(Class rendererClass, c = rendererClass.getConstructor(List.class); renderer = c.newInstance(childRenderers); } catch (NoSuchMethodException me) { - LOG.debug("couldn't find constructor for Renderer(List) for {}", - rendererClass); + LOG.debug("couldn't find constructor for Renderer(List) for {}", rendererClass); } } return renderer; diff --git a/tika-core/src/main/java/org/apache/tika/config/TikaConfigSerializer.java b/tika-core/src/main/java/org/apache/tika/config/TikaConfigSerializer.java index a2313f4081..283508b0e6 100644 --- a/tika-core/src/main/java/org/apache/tika/config/TikaConfigSerializer.java +++ b/tika-core/src/main/java/org/apache/tika/config/TikaConfigSerializer.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. 
The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.config; @@ -39,13 +37,6 @@ import javax.xml.transform.Transformer; import javax.xml.transform.dom.DOMSource; import javax.xml.transform.stream.StreamResult; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.w3c.dom.Document; -import org.w3c.dom.Element; -import org.w3c.dom.Node; - import org.apache.tika.detect.CompositeDetector; import org.apache.tika.detect.CompositeEncodingDetector; import org.apache.tika.detect.DefaultDetector; @@ -63,6 +54,11 @@ import org.apache.tika.parser.multiple.AbstractMultipleParser; import org.apache.tika.utils.StringUtils; import org.apache.tika.utils.XMLReaderUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.w3c.dom.Document; +import org.w3c.dom.Element; +import org.w3c.dom.Node; public class TikaConfigSerializer { @@ -86,14 +82,14 @@ public class TikaConfigSerializer { } /** - * @param config config to serialize - * @param mode serialization mode - * @param writer writer + * @param config config to serialize + * @param mode 
serialization mode + * @param writer writer * @param charset charset * @throws Exception */ public static void serialize(TikaConfig config, Mode mode, Writer writer, Charset charset) - throws Exception { + throws Exception { DocumentBuilder docBuilder = XMLReaderUtils.getDocumentBuilder(); // root elements @@ -122,7 +118,7 @@ public static void serialize(TikaConfig config, Mode mode, Writer writer, Charse } private static void addExecutorService(Mode mode, Element rootElement, Document doc, - TikaConfig config) { + TikaConfig config) { ExecutorService executor = config.getExecutorService(); // TODO Implement the reverse of ExecutorServiceXmlLoader @@ -131,7 +127,7 @@ private static void addExecutorService(Mode mode, Element rootElement, Document } private static void addServiceLoader(Mode mode, Element rootElement, Document doc, - TikaConfig config) { + TikaConfig config) { ServiceLoader loader = config.getServiceLoader(); if (mode == Mode.MINIMAL) { @@ -149,17 +145,17 @@ private static void addServiceLoader(Mode mode, Element rootElement, Document do } private static void addTranslator(Mode mode, Element rootElement, Document doc, - TikaConfig config) { + TikaConfig config) { // Unlike the other entries, TikaConfig only wants one of - // these, and no outer list + // these, and no outer list Translator translator = config.getTranslator(); if (mode == Mode.MINIMAL && translator instanceof DefaultTranslator) { - Node mimeComment = doc.createComment("for example: "); + Node mimeComment = doc.createComment("for example: "); rootElement.appendChild(mimeComment); } else { - if (translator instanceof DefaultTranslator && - (mode == Mode.STATIC || mode == Mode.STATIC_FULL)) { + if (translator instanceof DefaultTranslator + && (mode == Mode.STATIC || mode == Mode.STATIC_FULL)) { translator = ((DefaultTranslator) translator).getTranslator(); } if (translator != null) { @@ -173,34 +169,34 @@ private static void addTranslator(Mode mode, Element rootElement, Document doc, } 
private static void addMimeComment(Mode mode, Element rootElement, Document doc) { - Node mimeComment = doc.createComment("for example: "); + Node mimeComment = doc.createComment("for example: "); rootElement.appendChild(mimeComment); } private static void addEncodingDetectors(Mode mode, Element rootElement, Document doc, - TikaConfig config) throws Exception { + TikaConfig config) throws Exception { EncodingDetector encDetector = config.getEncodingDetector(); if (mode == Mode.MINIMAL && encDetector instanceof DefaultEncodingDetector) { // Don't output anything, all using defaults Node detComment = doc.createComment( - "for example: " + - ""); + "for example: " + + ""); rootElement.appendChild(detComment); return; } Element encDetectorsElement = doc.createElement("encodingDetectors"); - if (mode == Mode.CURRENT && encDetector instanceof DefaultEncodingDetector || - !(encDetector instanceof CompositeEncodingDetector)) { + if (mode == Mode.CURRENT && encDetector instanceof DefaultEncodingDetector + || !(encDetector instanceof CompositeEncodingDetector)) { Element encDetectorElement = doc.createElement("encodingDetector"); encDetectorElement.setAttribute("class", encDetector.getClass().getCanonicalName()); encDetectorsElement.appendChild(encDetectorElement); } else { List children = - ((CompositeEncodingDetector) encDetector).getDetectors(); + ((CompositeEncodingDetector) encDetector).getDetectors(); for (EncodingDetector d : children) { Element encDetectorElement = doc.createElement("encodingDetector"); encDetectorElement.setAttribute("class", d.getClass().getCanonicalName()); @@ -213,20 +209,20 @@ private static void addEncodingDetectors(Mode mode, Element rootElement, Documen } private static void addDetectors(Mode mode, Element rootElement, Document doc, - TikaConfig config) throws Exception { + TikaConfig config) throws Exception { Detector detector = config.getDetector(); if (mode == Mode.MINIMAL && detector instanceof DefaultDetector) { // Don't output 
anything, all using defaults - Node detComment = doc.createComment("for example: "); + Node detComment = doc.createComment("for example: "); rootElement.appendChild(detComment); return; } Element detectorsElement = doc.createElement("detectors"); - if (mode == Mode.CURRENT && detector instanceof DefaultDetector || - !(detector instanceof CompositeDetector)) { + if (mode == Mode.CURRENT && detector instanceof DefaultDetector + || !(detector instanceof CompositeDetector)) { Element detectorElement = doc.createElement("detector"); detectorElement.setAttribute("class", detector.getClass().getCanonicalName()); detectorsElement.appendChild(detectorElement); @@ -243,7 +239,7 @@ private static void addDetectors(Mode mode, Element rootElement, Document doc, } private static void addParsers(Mode mode, Element rootElement, Document doc, TikaConfig config) - throws Exception { + throws Exception { Parser parser = config.getParser(); if (mode == Mode.MINIMAL && parser instanceof DefaultParser) { // Don't output anything, all using defaults @@ -259,7 +255,7 @@ private static void addParsers(Mode mode, Element rootElement, Document doc, Tik } private static void addParser(Mode mode, Element rootElement, Document doc, Parser parser) - throws Exception { + throws Exception { // If the parser is decorated, is it a kind where we output the parser inside? 
ParserDecorator decoration = null; if (parser instanceof ParserDecorator) { @@ -280,8 +276,8 @@ private static void addParser(Mode mode, Element rootElement, Document doc, Pars outputParser = false; } // Special case for making Default to static - if (parser instanceof DefaultParser && - (mode == Mode.STATIC || mode == Mode.STATIC_FULL)) { + if (parser instanceof DefaultParser + && (mode == Mode.STATIC || mode == Mode.STATIC_FULL)) { outputParser = false; } } else if (parser instanceof AbstractMultipleParser) { @@ -299,7 +295,7 @@ private static void addParser(Mode mode, Element rootElement, Document doc, Pars } private static Element addParser(Mode mode, Element rootElement, Document doc, Parser parser, - ParserDecorator decorator) throws Exception { + ParserDecorator decorator) throws Exception { ParseContext context = new ParseContext(); Set addedTypes = new TreeSet<>(); @@ -343,7 +339,7 @@ public static void serializeParams(Document doc, Element element, Object object) Matcher setterMatcher = Pattern.compile("\\Aset([A-Z].*)").matcher(""); Matcher getterMatcher = Pattern.compile("\\A(?:get|is)([A-Z].+)\\Z").matcher(""); - //TODO -- check code base for setters with lowercase initial letters?! + // TODO -- check code base for setters with lowercase initial letters?! 
MethodTuples nonPrimitiveSetters = new MethodTuples(); MethodTuples primitiveSetters = new MethodTuples(); MethodTuples nonPrimitiveGetters = new MethodTuples(); @@ -353,18 +349,19 @@ public static void serializeParams(Document doc, Element element, Object object) if (setterMatcher.reset(method.getName()).find()) { if (!Modifier.isPublic(method.getModifiers())) { - //we could just call getMethods, but this can be helpful debugging inf + // we could just call getMethods, but this can be helpful debugging inf LOG.trace("inaccessible setter: {} in {}", method.getName(), object.getClass()); continue; } - //require @Field on setters + // require @Field on setters if (method.getAnnotation(Field.class) == null) { - // LOG.warn("unannotated setter {} in {}", method.getName(), object.getClass()); + // LOG.warn("unannotated setter {} in {}", method.getName(), object.getClass()); continue; } if (parameterTypes.length != 1) { - //TODO -- check code base for setX() zero parameters that set boolean to true - LOG.warn("setter with wrong number of params " + method.getName() + " " + parameterTypes.length); + // TODO -- check code base for setX() zero parameters that set boolean to true + LOG.warn("setter with wrong number of params " + method.getName() + " " + + parameterTypes.length); continue; } String paramName = methodToParamName(setterMatcher.group(1)); @@ -375,20 +372,22 @@ public static void serializeParams(Document doc, Element element, Object object) } } else if (getterMatcher.reset(method.getName()).find()) { if (parameterTypes.length != 0) { - //require 0 parameters for the getter + // require 0 parameters for the getter continue; } String paramName = methodToParamName(getterMatcher.group(1)); if (PRIMITIVES.containsKey(method.getReturnType())) { - primitiveGetters.add(new MethodTuple(paramName, method, method.getReturnType())); + primitiveGetters.add( + new MethodTuple(paramName, method, method.getReturnType())); } else { - nonPrimitiveGetters.add(new 
MethodTuple(paramName, method, method.getReturnType())); + nonPrimitiveGetters.add( + new MethodTuple(paramName, method, method.getReturnType())); } } } - //TODO -- remove nonprimitive setters/getters that have a string equivalent + // TODO -- remove nonprimitive setters/getters that have a string equivalent serializePrimitives(doc, element, object, primitiveSetters, primitiveGetters); serializeNonPrimitives(doc, element, object, nonPrimitiveSetters, nonPrimitiveGetters); @@ -402,25 +401,22 @@ private static String methodToParamName(String name) { } - private static void serializeNonPrimitives(Document doc, Element element, - Object object, - MethodTuples setterTuples, - MethodTuples getterTuples) { + private static void serializeNonPrimitives(Document doc, Element element, Object object, + MethodTuples setterTuples, MethodTuples getterTuples) { for (Map.Entry> e : setterTuples.tuples.entrySet()) { Set getters = getterTuples.tuples.get(e.getKey()); processNonPrimitive(e.getKey(), e.getValue(), getters, doc, element, object); if (!getterTuples.tuples.containsKey(e.getKey())) { LOG.warn("no getter for setter non-primitive: {} in {}", e.getKey(), - object.getClass()); + object.getClass()); continue; } } } private static void processNonPrimitive(String name, Set setters, - Set getters, Document doc, Element element, - Object object) { + Set getters, Document doc, Element element, Object object) { for (MethodTuple setter : setters) { for (MethodTuple getter : getters) { if (setter.singleParam.equals(getter.singleParam)) { @@ -432,8 +428,7 @@ private static void processNonPrimitive(String name, Set setters, } private static void serializeObject(String name, Document doc, Element element, - MethodTuple setter, - MethodTuple getter, Object object) { + MethodTuple setter, MethodTuple getter, Object object) { Object item = null; try { @@ -451,9 +446,8 @@ private static void serializeObject(String name, Document doc, Element element, serializeParams(doc, element, item); } - 
private static void serializePrimitives(Document doc, Element root, - Object object, - MethodTuples setterTuples, MethodTuples getterTuples) { + private static void serializePrimitives(Document doc, Element root, Object object, + MethodTuples setterTuples, MethodTuples getterTuples) { Element paramsElement = null; if (object instanceof AbstractMultipleParser) { @@ -461,7 +455,7 @@ private static void serializePrimitives(Document doc, Element root, Element paramElement = doc.createElement("param"); paramElement.setAttribute("name", "metadataPolicy"); paramElement.setAttribute("value", - ((AbstractMultipleParser) object).getMetadataPolicy().toString()); + ((AbstractMultipleParser) object).getMetadataPolicy().toString()); paramsElement.appendChild(paramElement); root.appendChild(paramsElement); } @@ -504,10 +498,10 @@ private static void serializePrimitives(Document doc, Element root, param.setAttribute("name", getterTuple.name); param.setAttribute("type", PRIMITIVES.get(getterTuple.singleParam)); if (List.class.isAssignableFrom(getterTuple.singleParam)) { - //this outputs even empty list elements, which I think is good. + // this outputs even empty list elements, which I think is good. addList(param, doc, getterTuple, (List) value); } else if (Map.class.isAssignableFrom(getterTuple.singleParam)) { - //this outputs even empty lists, which I think is good. + // this outputs even empty lists, which I think is good. 
addMap(param, doc, getterTuple, (Map) value); } else { param.setTextContent(valString); @@ -521,7 +515,7 @@ private static void serializePrimitives(Document doc, Element root, } private static void addMap(Element param, Document doc, MethodTuple getterTuple, - Map object) { + Map object) { for (Map.Entry e : new TreeMap(object).entrySet()) { Element element = doc.createElement("string"); element.setAttribute("key", e.getKey()); @@ -532,7 +526,7 @@ private static void addMap(Element param, Document doc, MethodTuple getterTuple, } private static void addList(Element param, Document doc, MethodTuple getterTuple, - List list) { + List list) { for (String s : list) { Element element = doc.createElement("string"); element.setTextContent(s); @@ -563,8 +557,8 @@ private static Method findGetter(MethodTuple setter, Object object) { } private static MethodTuple pickBestSetter(Set tuples) { - //TODO -- if both string and integer, which one do we pick? - //stub for now -- just pick the first + // TODO -- if both string and integer, which one do we pick? 
+ // stub for now -- just pick the first for (MethodTuple t : tuples) { return t; } @@ -607,8 +601,8 @@ public boolean equals(Object o) { return false; } MethodTuple that = (MethodTuple) o; - return name.equals(that.name) && method.equals(that.method) && - singleParam.equals(that.singleParam); + return name.equals(that.name) && method.equals(that.method) + && singleParam.equals(that.singleParam); } @Override @@ -630,8 +624,8 @@ public enum Mode { */ STATIC, /** - * Static version of the config, with explicit lists of decorators etc, - * and all parsers given with their detected supported mime types + * Static version of the config, with explicit lists of decorators etc, and all parsers + * given with their detected supported mime types */ STATIC_FULL } diff --git a/tika-core/src/main/java/org/apache/tika/config/TikaTaskTimeout.java b/tika-core/src/main/java/org/apache/tika/config/TikaTaskTimeout.java index 842c7a432d..67adb2df7a 100644 --- a/tika-core/src/main/java/org/apache/tika/config/TikaTaskTimeout.java +++ b/tika-core/src/main/java/org/apache/tika/config/TikaTaskTimeout.java @@ -1,23 +1,20 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.config; import java.io.Serializable; - import org.apache.tika.parser.ParseContext; public class TikaTaskTimeout implements Serializable { diff --git a/tika-core/src/main/java/org/apache/tika/config/package-info.java b/tika-core/src/main/java/org/apache/tika/config/package-info.java index 77a0559f7f..6a2f7a7f54 100644 --- a/tika-core/src/main/java/org/apache/tika/config/package-info.java +++ b/tika-core/src/main/java/org/apache/tika/config/package-info.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. 
The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ /** diff --git a/tika-core/src/main/java/org/apache/tika/detect/AutoDetectReader.java b/tika-core/src/main/java/org/apache/tika/detect/AutoDetectReader.java index bd7d4f2a95..26ce383636 100644 --- a/tika-core/src/main/java/org/apache/tika/detect/AutoDetectReader.java +++ b/tika-core/src/main/java/org/apache/tika/detect/AutoDetectReader.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. 
The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.detect; @@ -22,9 +20,6 @@ import java.io.InputStream; import java.io.InputStreamReader; import java.nio.charset.Charset; - -import org.xml.sax.InputSource; - import org.apache.tika.config.LoadErrorHandler; import org.apache.tika.config.ServiceLoader; import org.apache.tika.exception.TikaException; @@ -32,23 +27,24 @@ import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.mime.MediaType; import org.apache.tika.utils.CharsetUtils; +import org.xml.sax.InputSource; /** - * An input stream reader that automatically detects the character encoding - * to be used for converting bytes to characters. + * An input stream reader that automatically detects the character encoding to be used for + * converting bytes to characters. 
* * @since Apache Tika 1.2 */ public class AutoDetectReader extends BufferedReader { private static final ServiceLoader DEFAULT_LOADER = - new ServiceLoader(AutoDetectReader.class.getClassLoader()); + new ServiceLoader(AutoDetectReader.class.getClassLoader()); private static final EncodingDetector DEFAULT_DETECTOR; static { DEFAULT_DETECTOR = new CompositeEncodingDetector( - DEFAULT_LOADER.loadServiceProviders(EncodingDetector.class)); + DEFAULT_LOADER.loadServiceProviders(EncodingDetector.class)); } private final Charset charset; @@ -65,34 +61,33 @@ private AutoDetectReader(InputStream stream, Charset charset) throws IOException } /** - * @param stream stream from which to read -- make sure that it supports mark! + * @param stream stream from which to read -- make sure that it supports mark! * @param metadata * @param detector * @param handler * @throws IOException * @throws TikaException */ - private AutoDetectReader(InputStream stream, Metadata metadata, - EncodingDetector detector, LoadErrorHandler handler) - throws IOException, TikaException { + private AutoDetectReader(InputStream stream, Metadata metadata, EncodingDetector detector, + LoadErrorHandler handler) throws IOException, TikaException { this(stream, detect(stream, metadata, detector, handler)); } public AutoDetectReader(InputStream stream, Metadata metadata, - EncodingDetector encodingDetector) throws IOException, TikaException { - this(getBuffered(stream), metadata, encodingDetector, - DEFAULT_LOADER.getLoadErrorHandler()); + EncodingDetector encodingDetector) throws IOException, TikaException { + this(getBuffered(stream), metadata, encodingDetector, DEFAULT_LOADER.getLoadErrorHandler()); } public AutoDetectReader(InputStream stream, Metadata metadata, ServiceLoader loader) - throws IOException, TikaException { + throws IOException, TikaException { this(getBuffered(stream), metadata, - new CompositeEncodingDetector(loader.loadServiceProviders(EncodingDetector.class)), - 
loader.getLoadErrorHandler()); + new CompositeEncodingDetector( + loader.loadServiceProviders(EncodingDetector.class)), + loader.getLoadErrorHandler()); } public AutoDetectReader(InputStream stream, Metadata metadata) - throws IOException, TikaException { + throws IOException, TikaException { this(stream, metadata, DEFAULT_DETECTOR); } @@ -100,9 +95,8 @@ public AutoDetectReader(InputStream stream) throws IOException, TikaException { this(stream, new Metadata()); } - private static Charset detect(InputStream input, Metadata metadata, - EncodingDetector detector, LoadErrorHandler handler) - throws IOException, TikaException { + private static Charset detect(InputStream input, Metadata metadata, EncodingDetector detector, + LoadErrorHandler handler) throws IOException, TikaException { // Ask all given detectors for the character encoding try { Charset charset = detector.detect(input, metadata); @@ -123,7 +117,7 @@ private static Charset detect(InputStream input, Metadata metadata, Charset cs = CharsetUtils.forName(charset); metadata.set(TikaCoreProperties.DETECTED_ENCODING, cs.name()); metadata.set(TikaCoreProperties.ENCODING_DETECTOR, - "AutoDetectReader-charset-metadata-fallback"); + "AutoDetectReader-charset-metadata-fallback"); return cs; } catch (IllegalArgumentException e) { // ignore diff --git a/tika-core/src/main/java/org/apache/tika/detect/CompositeDetector.java b/tika-core/src/main/java/org/apache/tika/detect/CompositeDetector.java index ed53918540..5e9b10885e 100644 --- a/tika-core/src/main/java/org/apache/tika/detect/CompositeDetector.java +++ b/tika-core/src/main/java/org/apache/tika/detect/CompositeDetector.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.detect; @@ -23,7 +21,6 @@ import java.util.Collection; import java.util.Collections; import java.util.List; - import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.mime.MediaType; @@ -45,7 +42,7 @@ public class CompositeDetector implements Detector { private final List detectors; public CompositeDetector(MediaTypeRegistry registry, List detectors, - Collection> excludeDetectors) { + Collection> excludeDetectors) { if (excludeDetectors == null || excludeDetectors.isEmpty()) { this.detectors = detectors; } else { @@ -78,8 +75,8 @@ public MediaType detect(InputStream input, Metadata metadata) throws IOException } MediaType type = MediaType.OCTET_STREAM; - //we have to iterate through all detectors because the override detector may - //be within a CompositeDetector + // we have to iterate through all detectors because the override detector may + // be within a CompositeDetector for (Detector detector : getDetectors()) { MediaType detected = detector.detect(input, metadata); if (registry.isSpecializationOf(detected, type)) { @@ -111,6 +108,7 @@ private static MediaType detectOverrides(Metadata metadata) { } return null; } + /** * Returns the component detectors. 
*/ @@ -119,12 +117,12 @@ public List getDetectors() { } private boolean isExcluded(Collection> excludeDetectors, - Class d) { + Class d) { return excludeDetectors.contains(d) || assignableFrom(excludeDetectors, d); } private boolean assignableFrom(Collection> excludeDetectors, - Class d) { + Class d) { for (Class e : excludeDetectors) { if (e.isAssignableFrom(d)) { return true; diff --git a/tika-core/src/main/java/org/apache/tika/detect/CompositeEncodingDetector.java b/tika-core/src/main/java/org/apache/tika/detect/CompositeEncodingDetector.java index 7db79ccc7b..b24e7440a6 100644 --- a/tika-core/src/main/java/org/apache/tika/detect/CompositeEncodingDetector.java +++ b/tika-core/src/main/java/org/apache/tika/detect/CompositeEncodingDetector.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.detect; @@ -24,7 +22,6 @@ import java.util.Collections; import java.util.LinkedList; import java.util.List; - import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; @@ -38,8 +35,7 @@ public class CompositeEncodingDetector implements EncodingDetector, Serializable private final List detectors; public CompositeEncodingDetector(List detectors, - Collection> - excludeEncodingDetectors) { + Collection> excludeEncodingDetectors) { this.detectors = new LinkedList<>(); for (EncodingDetector encodingDetector : detectors) { if (!isExcluded(excludeEncodingDetectors, encodingDetector.getClass())) { @@ -55,7 +51,7 @@ public CompositeEncodingDetector(List detectors) { } /** - * @param input text document input stream, or null + * @param input text document input stream, or null * @param metadata input metadata for the document * @return the detected Charset or null if no charset could be detected * @throws IOException @@ -66,10 +62,10 @@ public Charset detect(InputStream input, Metadata metadata) throws IOException { Charset detected = detector.detect(input, metadata); if (detected != null) { metadata.set(TikaCoreProperties.DETECTED_ENCODING, detected.name()); - //if this has been set by a leaf detector, do not overwrite - if (! 
detector.getClass().getSimpleName().equals("CompositeEncodingDetector")) { + // if this has been set by a leaf detector, do not overwrite + if (!detector.getClass().getSimpleName().equals("CompositeEncodingDetector")) { metadata.set(TikaCoreProperties.ENCODING_DETECTOR, - detector.getClass().getSimpleName()); + detector.getClass().getSimpleName()); } return detected; } @@ -82,15 +78,15 @@ public List getDetectors() { } private boolean isExcluded( - Collection> excludeEncodingDetectors, - Class encodingDetector) { - return excludeEncodingDetectors.contains(encodingDetector) || - assignableFrom(excludeEncodingDetectors, encodingDetector); + Collection> excludeEncodingDetectors, + Class encodingDetector) { + return excludeEncodingDetectors.contains(encodingDetector) + || assignableFrom(excludeEncodingDetectors, encodingDetector); } private boolean assignableFrom( - Collection> excludeEncodingDetectors, - Class encodingDetector) { + Collection> excludeEncodingDetectors, + Class encodingDetector) { for (Class e : excludeEncodingDetectors) { if (e.isAssignableFrom(encodingDetector)) { return true; diff --git a/tika-core/src/main/java/org/apache/tika/detect/DefaultDetector.java b/tika-core/src/main/java/org/apache/tika/detect/DefaultDetector.java index 038d274e46..12ecd2c337 100644 --- a/tika-core/src/main/java/org/apache/tika/detect/DefaultDetector.java +++ b/tika-core/src/main/java/org/apache/tika/detect/DefaultDetector.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. 
See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.detect; @@ -20,20 +18,18 @@ import java.util.Collections; import java.util.List; import javax.imageio.spi.ServiceRegistry; - import org.apache.tika.config.ServiceLoader; import org.apache.tika.mime.MimeTypes; import org.apache.tika.utils.ServiceLoaderUtils; /** - * A composite detector based on all the {@link Detector} implementations - * available through the {@link ServiceRegistry service provider mechanism}. + * A composite detector based on all the {@link Detector} implementations available through the + * {@link ServiceRegistry service provider mechanism}. *

- * Detectors are loaded and returned in a specified order, of user supplied - * followed by non-MimeType Tika, followed by the Tika MimeType class. - * If you need to control the order of the Detectors, you should instead - * construct your own {@link CompositeDetector} and pass in the list - * of Detectors in the required order. + * Detectors are loaded and returned in a specified order, of user supplied followed by non-MimeType + * Tika, followed by the Tika MimeType class. If you need to control the order of the Detectors, you + * should instead construct your own {@link CompositeDetector} and pass in the list of Detectors in + * the required order. * * @since Apache Tika 0.9 */ @@ -46,7 +42,7 @@ public class DefaultDetector extends CompositeDetector { private transient final ServiceLoader loader; public DefaultDetector(MimeTypes types, ServiceLoader loader, - Collection> excludeDetectors) { + Collection> excludeDetectors) { super(types.getMediaTypeRegistry(), getDefaultDetectors(types, loader, excludeDetectors)); this.loader = loader; } @@ -72,25 +68,22 @@ public DefaultDetector() { } /** - * Finds all statically loadable detectors and sort the list by name, - * rather than discovery order. Detectors are used in the given order, - * so put the Tika parsers last so that non-Tika (user supplied) - * parsers can take precedence. + * Finds all statically loadable detectors and sort the list by name, rather than discovery + * order. Detectors are used in the given order, so put the Tika parsers last so that non-Tika + * (user supplied) parsers can take precedence. *

- * If an {@link OverrideDetector} is loaded, it takes precedence over - * all other detectors. + * If an {@link OverrideDetector} is loaded, it takes precedence over all other detectors. * * @param loader service loader * @return ordered list of statically loadable detectors */ private static List getDefaultDetectors(MimeTypes types, ServiceLoader loader, - Collection> - excludeDetectors) { + Collection> excludeDetectors) { List detectors = - loader.loadStaticServiceProviders(Detector.class, excludeDetectors); + loader.loadStaticServiceProviders(Detector.class, excludeDetectors); ServiceLoaderUtils.sortLoadedClasses(detectors); - //look for the override index and put that first + // look for the override index and put that first int overrideIndex = -1; int i = 0; for (Detector detector : detectors) { diff --git a/tika-core/src/main/java/org/apache/tika/detect/DefaultEncodingDetector.java b/tika-core/src/main/java/org/apache/tika/detect/DefaultEncodingDetector.java index 4cf64d5e97..2e9f6ad959 100644 --- a/tika-core/src/main/java/org/apache/tika/detect/DefaultEncodingDetector.java +++ b/tika-core/src/main/java/org/apache/tika/detect/DefaultEncodingDetector.java @@ -1,37 +1,33 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.detect; import java.util.Collection; import javax.imageio.spi.ServiceRegistry; - import org.apache.tika.config.ServiceLoader; /** - * A composite encoding detector based on all the {@link EncodingDetector} implementations - * available through the {@link ServiceRegistry service provider mechanism}. Those - * loaded via the service provider mechanism are ordered by how they appear in the - * file, if there is a single service file. If multiple, there is no guarantee of order. + * A composite encoding detector based on all the {@link EncodingDetector} implementations available + * through the {@link ServiceRegistry service provider mechanism}. Those loaded via the service + * provider mechanism are ordered by how they appear in the file, if there is a single service file. + * If multiple, there is no guarantee of order. *

*

- * If you need to control the order of the Detectors, you should instead - * construct your own {@link CompositeDetector} and pass in the list - * of Detectors in the required order. + * If you need to control the order of the Detectors, you should instead construct your own + * {@link CompositeDetector} and pass in the list of Detectors in the required order. * * @since Apache Tika 1.15 */ @@ -46,8 +42,7 @@ public DefaultEncodingDetector(ServiceLoader loader) { } public DefaultEncodingDetector(ServiceLoader loader, - Collection> - excludeEncodingDetectors) { + Collection> excludeEncodingDetectors) { super(loader.loadServiceProviders(EncodingDetector.class), excludeEncodingDetectors); } diff --git a/tika-core/src/main/java/org/apache/tika/detect/DefaultProbDetector.java b/tika-core/src/main/java/org/apache/tika/detect/DefaultProbDetector.java index b7df0b6fa6..197fed999d 100644 --- a/tika-core/src/main/java/org/apache/tika/detect/DefaultProbDetector.java +++ b/tika-core/src/main/java/org/apache/tika/detect/DefaultProbDetector.java @@ -1,34 +1,29 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.detect; import java.util.List; - import org.apache.tika.config.ServiceLoader; import org.apache.tika.mime.MimeTypes; import org.apache.tika.mime.ProbabilisticMimeDetectionSelector; import org.apache.tika.utils.ServiceLoaderUtils; /** - * A version of {@link DefaultDetector} for probabilistic mime - * detectors, which use statistical techniques to blend the - * results of differing underlying detectors when attempting - * to detect the type of a given file. - * TODO Link to documentation on configuring these probabilities + * A version of {@link DefaultDetector} for probabilistic mime detectors, which use statistical + * techniques to blend the results of differing underlying detectors when attempting to detect the + * type of a given file. 
TODO Link to documentation on configuring these probabilities */ public class DefaultProbDetector extends CompositeDetector { private static final long serialVersionUID = -8836240060532323352L; @@ -56,7 +51,7 @@ public DefaultProbDetector() { } private static List getDefaultDetectors(ProbabilisticMimeDetectionSelector sel, - ServiceLoader loader) { + ServiceLoader loader) { List detectors = loader.loadStaticServiceProviders(Detector.class); ServiceLoaderUtils.sortLoadedClasses(detectors); detectors.add(sel); diff --git a/tika-core/src/main/java/org/apache/tika/detect/Detector.java b/tika-core/src/main/java/org/apache/tika/detect/Detector.java index fc237aa5aa..d58368dbac 100644 --- a/tika-core/src/main/java/org/apache/tika/detect/Detector.java +++ b/tika-core/src/main/java/org/apache/tika/detect/Detector.java @@ -1,32 +1,29 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.detect; import java.io.IOException; import java.io.InputStream; import java.io.Serializable; - import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; /** - * Content type detector. Implementations of this interface use various - * heuristics to detect the content type of a document based on given - * input metadata or the first few bytes of the document stream. + * Content type detector. Implementations of this interface use various heuristics to detect the + * content type of a document based on given input metadata or the first few bytes of the document + * stream. * * @since Apache Tika 0.3 */ @@ -34,22 +31,19 @@ public interface Detector extends Serializable { /** * Detects the content type of the given input document. Returns - * application/octet-stream if the type of the document - * can not be detected. + * application/octet-stream if the type of the document can not be detected. *

- * If the document input stream is not available, then the first - * argument may be null. Otherwise the detector may - * read bytes from the start of the stream to help in type detection. - * The given stream is guaranteed to support the - * {@link InputStream#markSupported() mark feature} and the detector - * is expected to {@link InputStream#mark(int) mark} the stream before - * reading any bytes from it, and to {@link InputStream#reset() reset} - * the stream before returning. The stream must not be closed by the - * detector. + * If the document input stream is not available, then the first argument may be + * null. Otherwise the detector may read bytes from the start of the stream to help + * in type detection. The given stream is guaranteed to support the + * {@link InputStream#markSupported() mark feature} and the detector is expected to + * {@link InputStream#mark(int) mark} the stream before reading any bytes from it, and to + * {@link InputStream#reset() reset} the stream before returning. The stream must not be closed + * by the detector. *

* The given input metadata is only read, not modified, by the detector. * - * @param input document input stream, or null + * @param input document input stream, or null * @param metadata input metadata for the document * @return detected media type, or application/octet-stream * @throws IOException if the document input stream could not be read diff --git a/tika-core/src/main/java/org/apache/tika/detect/EmptyDetector.java b/tika-core/src/main/java/org/apache/tika/detect/EmptyDetector.java index 9f996301ce..89d82eb7af 100644 --- a/tika-core/src/main/java/org/apache/tika/detect/EmptyDetector.java +++ b/tika-core/src/main/java/org/apache/tika/detect/EmptyDetector.java @@ -1,24 +1,21 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
+ * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.detect; import java.io.IOException; import java.io.InputStream; - import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; diff --git a/tika-core/src/main/java/org/apache/tika/detect/EncodingDetector.java b/tika-core/src/main/java/org/apache/tika/detect/EncodingDetector.java index 9dbad4c277..a8f5460c49 100644 --- a/tika-core/src/main/java/org/apache/tika/detect/EncodingDetector.java +++ b/tika-core/src/main/java/org/apache/tika/detect/EncodingDetector.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.detect; @@ -20,35 +18,32 @@ import java.io.InputStream; import java.io.Serializable; import java.nio.charset.Charset; - import org.apache.tika.metadata.Metadata; /** - * Character encoding detector. Implementations of this interface use - * various heuristics to detect the character encoding of a text document - * based on given input metadata or the first few bytes of the document stream. + * Character encoding detector. Implementations of this interface use various heuristics to detect + * the character encoding of a text document based on given input metadata or the first few bytes of + * the document stream. * * @since Apache Tika 0.4 */ public interface EncodingDetector extends Serializable { /** - * Detects the character encoding of the given text document, or - * null if the encoding of the document can not be detected. + * Detects the character encoding of the given text document, or null if the + * encoding of the document can not be detected. *

- * If the document input stream is not available, then the first - * argument may be null. Otherwise the detector may - * read bytes from the start of the stream to help in encoding detection. - * The given stream is guaranteed to support the - * {@link InputStream#markSupported() mark feature} and the detector - * is expected to {@link InputStream#mark(int) mark} the stream before - * reading any bytes from it, and to {@link InputStream#reset() reset} - * the stream before returning. The stream must not be closed by the - * detector. + * If the document input stream is not available, then the first argument may be + * null. Otherwise the detector may read bytes from the start of the stream to help + * in encoding detection. The given stream is guaranteed to support the + * {@link InputStream#markSupported() mark feature} and the detector is expected to + * {@link InputStream#mark(int) mark} the stream before reading any bytes from it, and to + * {@link InputStream#reset() reset} the stream before returning. The stream must not be closed + * by the detector. *

* The given input metadata is only read, not modified, by the detector. * - * @param input text document input stream, or null + * @param input text document input stream, or null * @param metadata input metadata for the document * @return detected character encoding, or null * @throws IOException if the document input stream could not be read diff --git a/tika-core/src/main/java/org/apache/tika/detect/FileCommandDetector.java b/tika-core/src/main/java/org/apache/tika/detect/FileCommandDetector.java index 42349faec1..6a2f6388ae 100644 --- a/tika-core/src/main/java/org/apache/tika/detect/FileCommandDetector.java +++ b/tika-core/src/main/java/org/apache/tika/detect/FileCommandDetector.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
+ * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.detect; @@ -22,10 +20,6 @@ import java.io.InputStream; import java.nio.file.Files; import java.nio.file.Path; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - import org.apache.tika.config.Field; import org.apache.tika.io.BoundedInputStream; import org.apache.tika.io.TemporaryResources; @@ -38,26 +32,27 @@ import org.apache.tika.utils.FileProcessResult; import org.apache.tika.utils.ProcessUtils; import org.apache.tika.utils.StringUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** - * This runs the linux 'file' command against a file. If - * this is called on a TikaInputStream, it will use the underlying Path - * or spool the full file to disk and then run file against that. + * This runs the linux 'file' command against a file. If this is called on a TikaInputStream, it + * will use the underlying Path or spool the full file to disk and then run file against that. *

- * If this is run against any other type of InputStream, it will spool - * up to {@link #maxBytes} to disk and then run the detector. + * If this is run against any other type of InputStream, it will spool up to {@link #maxBytes} to + * disk and then run the detector. *

* As with all detectors, mark must be supported. *

- * If you want to use file's mime type in the parse, e.g. - * to select the parser in AutoDetectParser, set {@link FileCommandDetector#setUseMime(boolean)} - * to true. The default behavior is to store the value as {@link FileCommandDetector#FILE_MIME} - * but rely on other detectors for the "active" mime used by Tika. + * If you want to use file's mime type in the parse, e.g. to select the parser in AutoDetectParser, + * set {@link FileCommandDetector#setUseMime(boolean)} to true. The default behavior is to store the + * value as {@link FileCommandDetector#FILE_MIME} but rely on other detectors for the "active" mime + * used by Tika. */ public class FileCommandDetector implements Detector { - //TODO: file has some diff mimes names for some very common mimes - //should we map file mimes to Tika mimes, e.g. text/xml -> application/xml?? + // TODO: file has some diff mimes names for some very common mimes + // should we map file mimes to Tika mimes, e.g. text/xml -> application/xml?? public static Property FILE_MIME = Property.externalText("file:mime"); private static final Logger LOGGER = LoggerFactory.getLogger(FileCommandDetector.class); @@ -77,12 +72,12 @@ public static boolean checkHasFile() { public static boolean checkHasFile(String fileCommandPath) { - String[] commandline = new String[]{fileCommandPath, "-v"}; + String[] commandline = new String[] {fileCommandPath, "-v"}; return ExternalParser.check(commandline); } /** - * @param input document input stream, or null + * @param input document input stream, or null * @param metadata input metadata for the document * @return mime as identified by the file command or application/octet-stream otherwise * @throws IOException @@ -101,8 +96,8 @@ public MediaType detect(InputStream input, Metadata metadata) throws IOException } TikaInputStream tis = TikaInputStream.cast(input); if (tis != null) { - //spool the full file to disk, if called with a TikaInputStream - //and there is no underlying file + // spool the full 
file to disk, if called with a TikaInputStream + // and there is no underlying file return detectOnPath(tis.getPath(), metadata); } @@ -118,8 +113,8 @@ public MediaType detect(InputStream input, Metadata metadata) throws IOException private MediaType detectOnPath(Path path, Metadata metadata) throws IOException { - String[] args = - new String[]{ProcessUtils.escapeCommandLine(fileCommandPath), "-b", "--mime-type", + String[] args = new String[] {ProcessUtils.escapeCommandLine(fileCommandPath), "-b", + "--mime-type", ProcessUtils.escapeCommandLine(path.toAbsolutePath().toString())}; ProcessBuilder builder = new ProcessBuilder(args); FileProcessResult result = ProcessUtils.execute(builder, timeoutMs, 10000, 10000); @@ -149,8 +144,8 @@ private MediaType detectOnPath(Path path, Metadata metadata) throws IOException @Field public void setFilePath(String fileCommandPath) { - //this opens up a potential command vulnerability. - //Don't ever let an untrusted user set this. + // this opens up a potential command vulnerability. + // Don't ever let an untrusted user set this. this.fileCommandPath = fileCommandPath; checkHasFile(this.fileCommandPath); } @@ -163,10 +158,10 @@ public void setUseMime(boolean useMime) { public boolean isUseMime() { return useMime; } + /** - * If this is not called on a TikaInputStream, this detector - * will spool up to this many bytes to a file to be detected - * by the 'file' command. + * If this is not called on a TikaInputStream, this detector will spool up to this many bytes to + * a file to be detected by the 'file' command. 
* * @param maxBytes */ diff --git a/tika-core/src/main/java/org/apache/tika/detect/MagicDetector.java b/tika-core/src/main/java/org/apache/tika/detect/MagicDetector.java index 5e00779be2..57c50b628b 100644 --- a/tika-core/src/main/java/org/apache/tika/detect/MagicDetector.java +++ b/tika-core/src/main/java/org/apache/tika/detect/MagicDetector.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.detect; @@ -28,25 +26,24 @@ import java.util.Locale; import java.util.regex.Matcher; import java.util.regex.Pattern; - import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; /** - * Content type detection based on magic bytes, i.e. type-specific patterns - * near the beginning of the document input stream. + * Content type detection based on magic bytes, i.e. type-specific patterns near the beginning of + * the document input stream. *

- * Because this works on bytes, not characters, by default any string - * matching is done as ISO_8859_1. To use an explicit different - * encoding, supply a type other than "string" / "stringignorecase" + * Because this works on bytes, not characters, by default any string matching is done as + * ISO_8859_1. To use an explicit different encoding, supply a type other than "string" / + * "stringignorecase" * * @since Apache Tika 0.3 */ public class MagicDetector implements Detector { /** - * The matching media type. Returned by the - * {@link #detect(InputStream, Metadata)} method if a match is found. + * The matching media type. Returned by the {@link #detect(InputStream, Metadata)} method if a + * match is found. */ private final MediaType type; /** @@ -54,14 +51,14 @@ public class MagicDetector implements Detector { */ private final int length; /** - * The magic match pattern. If this byte pattern is equal to the - * possibly bit-masked bytes from the input stream, then the type - * detection succeeds and the configured {@link #type} is returned. + * The magic match pattern. If this byte pattern is equal to the possibly bit-masked bytes from + * the input stream, then the type detection succeeds and the configured {@link #type} is + * returned. */ private final byte[] pattern; /** - * Length of the pattern, which in the case of regular expressions will - * not be the same as the comparison window length. + * Length of the pattern, which in the case of regular expressions will not be the same as the + * comparison window length. */ private final int patternLength; /** @@ -77,26 +74,24 @@ public class MagicDetector implements Detector { */ private final byte[] mask; /** - * First offset (inclusive) of the comparison window within the - * document input stream. Greater than or equal to zero. + * First offset (inclusive) of the comparison window within the document input stream. Greater + * than or equal to zero. 
*/ private final int offsetRangeBegin; /** - * Last offset (inclusive) of the comparison window within the document - * input stream. Greater than or equal to the - * {@link #offsetRangeBegin first offset}. + * Last offset (inclusive) of the comparison window within the document input stream. Greater + * than or equal to the {@link #offsetRangeBegin first offset}. *

- * Note that this is not the offset of the last byte read from - * the document stream. Instead, the last window of bytes to be compared - * starts at this offset. + * Note that this is not the offset of the last byte read from the document stream. + * Instead, the last window of bytes to be compared starts at this offset. */ private final int offsetRangeEnd; /** - * Creates a detector for input documents that have the exact given byte - * pattern at the beginning of the document stream. + * Creates a detector for input documents that have the exact given byte pattern at the + * beginning of the document stream. * - * @param type matching media type + * @param type matching media type * @param pattern magic match pattern */ public MagicDetector(MediaType type, byte[] pattern) { @@ -104,49 +99,46 @@ public MagicDetector(MediaType type, byte[] pattern) { } /** - * Creates a detector for input documents that have the exact given byte - * pattern at the given offset of the document stream. + * Creates a detector for input documents that have the exact given byte pattern at the given + * offset of the document stream. * - * @param type matching media type + * @param type matching media type * @param pattern magic match pattern - * @param offset offset of the pattern match + * @param offset offset of the pattern match */ public MagicDetector(MediaType type, byte[] pattern, int offset) { this(type, pattern, null, offset, offset); } /** - * Creates a detector for input documents that meet the specified magic - * match. {@code pattern} must NOT be a regular expression. - * Constructor maintained for legacy reasons. + * Creates a detector for input documents that meet the specified magic match. {@code pattern} + * must NOT be a regular expression. Constructor maintained for legacy reasons. 
*/ public MagicDetector(MediaType type, byte[] pattern, byte[] mask, int offsetRangeBegin, - int offsetRangeEnd) { + int offsetRangeEnd) { this(type, pattern, mask, false, offsetRangeBegin, offsetRangeEnd); } /** - * Creates a detector for input documents that meet the specified - * magic match. + * Creates a detector for input documents that meet the specified magic match. */ public MagicDetector(MediaType type, byte[] pattern, byte[] mask, boolean isRegex, - int offsetRangeBegin, int offsetRangeEnd) { + int offsetRangeBegin, int offsetRangeEnd) { this(type, pattern, mask, isRegex, false, offsetRangeBegin, offsetRangeEnd); } /** - * Creates a detector for input documents that meet the specified - * magic match. + * Creates a detector for input documents that meet the specified magic match. */ public MagicDetector(MediaType type, byte[] pattern, byte[] mask, boolean isRegex, - boolean isStringIgnoreCase, int offsetRangeBegin, int offsetRangeEnd) { + boolean isStringIgnoreCase, int offsetRangeBegin, int offsetRangeEnd) { if (type == null) { throw new IllegalArgumentException("Matching media type is null"); } else if (pattern == null) { throw new IllegalArgumentException("Magic match pattern is null"); } else if (offsetRangeBegin < 0 || offsetRangeEnd < offsetRangeBegin) { - throw new IllegalArgumentException( - "Invalid offset range: [" + offsetRangeBegin + "," + offsetRangeEnd + "]"); + throw new IllegalArgumentException("Invalid offset range: [" + offsetRangeBegin + "," + + offsetRangeEnd + "]"); } this.type = type; @@ -185,7 +177,7 @@ public MagicDetector(MediaType type, byte[] pattern, byte[] mask, boolean isRege } public static MagicDetector parse(MediaType mediaType, String type, String offset, String value, - String mask) { + String mask) { int start = 0; int end = 0; if (offset != null) { @@ -206,7 +198,7 @@ public static MagicDetector parse(MediaType mediaType, String type, String offse } return new MagicDetector(mediaType, patternBytes, maskBytes, 
type.equals("regex"), - type.equals("stringignorecase"), start, end); + type.equals("stringignorecase"), start, end); } private static byte[] decodeValue(String value, String type) { @@ -244,25 +236,26 @@ private static byte[] decodeValue(String value, String type) { case "host16": case "little16": { int i = Integer.parseInt(tmpVal, radix); - decoded = new byte[]{(byte) (i & 0x00FF), (byte) (i >> 8)}; + decoded = new byte[] {(byte) (i & 0x00FF), (byte) (i >> 8)}; break; } case "big16": { int i = Integer.parseInt(tmpVal, radix); - decoded = new byte[]{(byte) (i >> 8), (byte) (i & 0x00FF)}; + decoded = new byte[] {(byte) (i >> 8), (byte) (i & 0x00FF)}; break; } case "host32": case "little32": { long i = Long.parseLong(tmpVal, radix); - decoded = new byte[]{(byte) ((i & 0x000000FF)), (byte) ((i & 0x0000FF00) >> 8), - (byte) ((i & 0x00FF0000) >> 16), (byte) ((i & 0xFF000000) >> 24)}; + decoded = new byte[] {(byte) ((i & 0x000000FF)), (byte) ((i & 0x0000FF00) >> 8), + (byte) ((i & 0x00FF0000) >> 16), (byte) ((i & 0xFF000000) >> 24)}; break; } case "big32": { long i = Long.parseLong(tmpVal, radix); - decoded = new byte[]{(byte) ((i & 0xFF000000) >> 24), (byte) ((i & 0x00FF0000) >> 16), - (byte) ((i & 0x0000FF00) >> 8), (byte) ((i & 0x000000FF))}; + decoded = new byte[] {(byte) ((i & 0xFF000000) >> 24), + (byte) ((i & 0x00FF0000) >> 16), (byte) ((i & 0x0000FF00) >> 8), + (byte) ((i & 0x000000FF))}; break; } } @@ -296,8 +289,8 @@ private static byte[] decodeString(String value, String type) { i++; } else { int j = i + 1; - while ((j < i + 4) && (j < value.length()) && - (Character.isDigit(value.charAt(j)))) { + while ((j < i + 4) && (j < value.length()) + && (Character.isDigit(value.charAt(j)))) { j++; } decoded.write(Short.decode("0" + value.substring(i + 1, j)).byteValue()); @@ -334,7 +327,7 @@ private static byte[] decodeString(String value, String type) { } /** - * @param input document input stream, or null + * @param input document input stream, or null * @param 
metadata ignored */ public MediaType detect(InputStream input, Metadata metadata) throws IOException { @@ -426,13 +419,12 @@ public int getLength() { } /** - * Returns a string representation of the Detection Rule. - * Should sort nicely by type and details, as we sometimes - * compare these. + * Returns a string representation of the Detection Rule. Should sort nicely by type and + * details, as we sometimes compare these. */ public String toString() { // Needs to be unique, as these get compared. - return "Magic Detection for " + type + " looking for " + pattern.length + " bytes = " + - Arrays.toString(this.pattern) + " mask = " + Arrays.toString(this.mask); + return "Magic Detection for " + type + " looking for " + pattern.length + " bytes = " + + Arrays.toString(this.pattern) + " mask = " + Arrays.toString(this.mask); } } diff --git a/tika-core/src/main/java/org/apache/tika/detect/NNExampleModelDetector.java b/tika-core/src/main/java/org/apache/tika/detect/NNExampleModelDetector.java index bcbf48feb5..14117bca94 100644 --- a/tika-core/src/main/java/org/apache/tika/detect/NNExampleModelDetector.java +++ b/tika-core/src/main/java/org/apache/tika/detect/NNExampleModelDetector.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. 
The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.detect; @@ -27,12 +25,10 @@ import java.net.URL; import java.nio.file.Path; import java.util.Objects; - +import org.apache.tika.mime.MediaType; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.apache.tika.mime.MediaType; - public class NNExampleModelDetector extends TrainedModelDetector { private static final String EXAMPLE_NNMODEL_FILE = "tika-example.nnmodel"; @@ -87,12 +83,12 @@ public void loadDefaultModels(ClassLoader classLoader) { // This allows us to replicate class.getResource() when using // the classloader directly String classPrefix = - TrainedModelDetector.class.getPackage().getName().replace('.', '/') + "/"; + TrainedModelDetector.class.getPackage().getName().replace('.', '/') + "/"; // Get the core URL, and all the extensions URLs URL modelURL = classLoader.getResource(classPrefix + EXAMPLE_NNMODEL_FILE); Objects.requireNonNull(modelURL, - "required resource " + classPrefix + EXAMPLE_NNMODEL_FILE + " not found"); + "required resource " + classPrefix + EXAMPLE_NNMODEL_FILE + 
" not found"); try (InputStream stream = modelURL.openStream()) { loadDefaultModels(stream); } catch (IOException e) { @@ -102,11 +98,10 @@ public void loadDefaultModels(ClassLoader classLoader) { } /** - * read the comments where the model configuration is written, e.g the - * number of inputs, hiddens and output please ensure the first char in the - * given string is # In this example grb model file, there are 4 elements 1) - * type 2) number of input units 3) number of hidden units. 4) number of - * output units. + * read the comments where the model configuration is written, e.g the number of inputs, hiddens + * and output please ensure the first char in the given string is # In this example grb model + * file, there are 4 elements 1) type 2) number of input units 3) number of hidden units. 4) + * number of output units. */ private void readDescription(final NNTrainedModelBuilder builder, final String line) { int numInputs; @@ -130,8 +125,8 @@ private void readDescription(final NNTrainedModelBuilder builder, final String l } /** - * Read the next line for the model parameters and populate the build which - * later will be used to instantiate the instance of TrainedModel + * Read the next line for the model parameters and populate the build which later will be used + * to instantiate the instance of TrainedModel * * @param builder * @param line diff --git a/tika-core/src/main/java/org/apache/tika/detect/NNTrainedModel.java b/tika-core/src/main/java/org/apache/tika/detect/NNTrainedModel.java index 73ee560db4..25de18f417 100644 --- a/tika-core/src/main/java/org/apache/tika/detect/NNTrainedModel.java +++ b/tika-core/src/main/java/org/apache/tika/detect/NNTrainedModel.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.detect; @@ -26,7 +24,7 @@ public class NNTrainedModel extends TrainedModel { private final float[][] Theta2; public NNTrainedModel(final int nInput, final int nHidden, final int nOutput, - final float[] nn_params) { + final float[] nn_params) { this.numOfInputs = nInput; this.numOfHidden = nHidden; this.numOfOutputs = nOutput; @@ -64,8 +62,7 @@ public double predict(double[] unseen) { } /** - * The given input vector of unseen is m=(256 + 1) * n= 1 this returns a - * prediction probability + * The given input vector of unseen is m=(256 + 1) * n= 1 this returns a prediction probability */ @Override public float predict(float[] unseen) { diff --git a/tika-core/src/main/java/org/apache/tika/detect/NNTrainedModelBuilder.java b/tika-core/src/main/java/org/apache/tika/detect/NNTrainedModelBuilder.java index 9b4eab3854..15c5ad14cd 100644 --- a/tika-core/src/main/java/org/apache/tika/detect/NNTrainedModelBuilder.java +++ b/tika-core/src/main/java/org/apache/tika/detect/NNTrainedModelBuilder.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ diff --git a/tika-core/src/main/java/org/apache/tika/detect/NameDetector.java b/tika-core/src/main/java/org/apache/tika/detect/NameDetector.java index 36d01e1711..6bd334187a 100644 --- a/tika-core/src/main/java/org/apache/tika/detect/NameDetector.java +++ b/tika-core/src/main/java/org/apache/tika/detect/NameDetector.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.detect; @@ -23,22 +21,20 @@ import java.net.URLDecoder; import java.util.Map; import java.util.regex.Pattern; - import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.mime.MediaType; /** - * Content type detection based on the resource name. An instance of this - * class contains a set of regular expression patterns that are matched - * against the resource name potentially given as a part of the input metadata. + * Content type detection based on the resource name. An instance of this class contains a set of + * regular expression patterns that are matched against the resource name potentially given as a + * part of the input metadata. *

- * If a pattern matches the given name, then the media type associated with - * that pattern is returned as the likely content type of the input document. - * Otherwise the returned type is application/octet-stream. + * If a pattern matches the given name, then the media type associated with that pattern is returned + * as the likely content type of the input document. Otherwise the returned type is + * application/octet-stream. *

- * See the {@link #detect(InputStream, Metadata)} method for more details - * of the matching algorithm. + * See the {@link #detect(InputStream, Metadata)} method for more details of the matching algorithm. * * @since Apache Tika 0.3 */ @@ -50,10 +46,9 @@ public class NameDetector implements Detector { private final Map patterns; /** - * Creates a new content type detector based on the given name patterns. - * The given pattern map is not copied, so the caller may update the - * mappings even after this detector instance has been created. However, - * the map must not be concurrently modified while this instance + * Creates a new content type detector based on the given name patterns. The given pattern map + * is not copied, so the caller may update the mappings even after this detector instance has + * been created. However, the map must not be concurrently modified while this instance * is used for type detection. * * @param patterns map from name patterns to corresponding media types @@ -63,34 +58,24 @@ public NameDetector(Map patterns) { } /** - * Detects the content type of an input document based on the document - * name given in the input metadata. The RESOURCE_NAME_KEY attribute of - * the given input metadata is expected to contain the name (normally - * a file name or a URL) of the input document. + * Detects the content type of an input document based on the document name given in the input + * metadata. The RESOURCE_NAME_KEY attribute of the given input metadata is expected to contain + * the name (normally a file name or a URL) of the input document. *

* If a resource name is given, then it is first processed as follows. *

    - *
  1. - * Potential URL query (?...) and fragment identifier (#...) - * parts are removed from the end of the resource name. - *
  2. - *
  3. - * Potential leading path elements (up to the last slash or backslash) - * are removed from the beginning of the resource name. - *
  4. - *
  5. - * Potential URL encodings (%nn, in UTF-8) are decoded. - *
  6. - *
  7. - * Any leading and trailing whitespace is removed. - *
  8. + *
  9. Potential URL query (?...) and fragment identifier (#...) parts are removed from the end + * of the resource name.
  10. + *
  11. Potential leading path elements (up to the last slash or backslash) are removed from the + * beginning of the resource name.
  12. + *
  13. Potential URL encodings (%nn, in UTF-8) are decoded.
  14. + *
  15. Any leading and trailing whitespace is removed.
  16. *
*

- * The resulting name string (if any) is then matched in sequence against - * all the configured name patterns. If a match is found, then the (first) - * matching media type is returned. + * The resulting name string (if any) is then matched in sequence against all the configured + * name patterns. If a match is found, then the (first) matching media type is returned. * - * @param input ignored + * @param input ignored * @param metadata input metadata, possibly with a RESOURCE_NAME_KEY value * @return detected media type, or application/octet-stream */ diff --git a/tika-core/src/main/java/org/apache/tika/detect/NonDetectingEncodingDetector.java b/tika-core/src/main/java/org/apache/tika/detect/NonDetectingEncodingDetector.java index 896a795318..7ec58f4d8b 100644 --- a/tika-core/src/main/java/org/apache/tika/detect/NonDetectingEncodingDetector.java +++ b/tika-core/src/main/java/org/apache/tika/detect/NonDetectingEncodingDetector.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.detect; @@ -21,7 +19,6 @@ import java.io.InputStream; import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; - import org.apache.tika.config.Field; import org.apache.tika.metadata.Metadata; @@ -29,8 +26,8 @@ * Always returns the charset passed in via the initializer */ public class NonDetectingEncodingDetector implements EncodingDetector { - //would have preferred final, but need mutability for - //loading via TikaConfig + // would have preferred final, but need mutability for + // loading via TikaConfig private Charset charset = StandardCharsets.UTF_8; /** diff --git a/tika-core/src/main/java/org/apache/tika/detect/OverrideDetector.java b/tika-core/src/main/java/org/apache/tika/detect/OverrideDetector.java index b6c5a41f8a..879cff0f78 100644 --- a/tika-core/src/main/java/org/apache/tika/detect/OverrideDetector.java +++ b/tika-core/src/main/java/org/apache/tika/detect/OverrideDetector.java @@ -1,24 +1,21 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.detect; import java.io.IOException; import java.io.InputStream; - import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.mime.MediaType; @@ -27,8 +24,8 @@ * Use this to force a content type detection via the * {@link TikaCoreProperties#CONTENT_TYPE_USER_OVERRIDE} key in the metadata object. *

- * This is also required to override detection by some parsers - * via {@link TikaCoreProperties#CONTENT_TYPE_PARSER_OVERRIDE}. + * This is also required to override detection by some parsers via + * {@link TikaCoreProperties#CONTENT_TYPE_PARSER_OVERRIDE}. * * @deprecated after 2.5.0 this functionality was moved to the CompositeDetector */ diff --git a/tika-core/src/main/java/org/apache/tika/detect/TextDetector.java b/tika-core/src/main/java/org/apache/tika/detect/TextDetector.java index 96583be7c2..796eb868aa 100644 --- a/tika-core/src/main/java/org/apache/tika/detect/TextDetector.java +++ b/tika-core/src/main/java/org/apache/tika/detect/TextDetector.java @@ -1,38 +1,34 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
+ * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.detect; import java.io.IOException; import java.io.InputStream; import java.util.Arrays; - import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; /** - * Content type detection of plain text documents. This detector looks at the - * beginning of the document input stream and considers the document to be - * a text document if no ASCII (ISO-Latin-1, UTF-8, etc.) control bytes are - * found. As a special case some control bytes (up to 2% of all characters) - * are also allowed in a text document if it also contains no or just a few - * (less than 10%) characters above the 7-bit ASCII range. + * Content type detection of plain text documents. This detector looks at the beginning of the + * document input stream and considers the document to be a text document if no ASCII (ISO-Latin-1, + * UTF-8, etc.) control bytes are found. As a special case some control bytes (up to 2% of all + * characters) are also allowed in a text document if it also contains no or just a few (less than + * 10%) characters above the 7-bit ASCII range. *

- * Note that text documents with a character encoding like UTF-16 are better - * detected with {@link MagicDetector} and an appropriate magic byte pattern. + * Note that text documents with a character encoding like UTF-16 are better detected with + * {@link MagicDetector} and an appropriate magic byte pattern. * * @since Apache Tika 0.3 */ @@ -44,21 +40,20 @@ public class TextDetector implements Detector { private static final long serialVersionUID = 4774601079503507765L; /** - * The number of bytes from the beginning of the document stream - * to test for control bytes. + * The number of bytes from the beginning of the document stream to test for control bytes. */ private static final int DEFAULT_NUMBER_OF_BYTES_TO_TEST = 512; /** - * Lookup table for all the ASCII/ISO-Latin/UTF-8/etc. control bytes - * in the range below 0x20 (the space character). If an entry in this - * table is true then that byte is very unlikely to occur - * in a plain text document. + * Lookup table for all the ASCII/ISO-Latin/UTF-8/etc. control bytes in the range below 0x20 + * (the space character). If an entry in this table is true then that byte is very + * unlikely to occur in a plain text document. *

- * The contents of this lookup table are based on the following definition - * from section 4 of the "Content-Type Processing Model" Internet-draft + * The contents of this lookup table are based on the following definition from section 4 of the + * "Content-Type Processing Model" Internet-draft * (draft-abarth-mime-sniff-01). + * *

      * +-------------------------+
      * | Binary data byte ranges |
@@ -86,29 +81,29 @@ public class TextDetector implements Detector {
     private final int bytesToTest;
 
     /**
-     * Constructs a {@link TextDetector} which will look at the default number
-     * of bytes from the beginning of the document.
+     * Constructs a {@link TextDetector} which will look at the default number of bytes from the
+     * beginning of the document.
      */
     public TextDetector() {
         this(DEFAULT_NUMBER_OF_BYTES_TO_TEST);
     }
 
     /**
-     * Constructs a {@link TextDetector} which will look at a given number of
-     * bytes from the beginning of the document.
+     * Constructs a {@link TextDetector} which will look at a given number of bytes from the
+     * beginning of the document.
      */
     public TextDetector(int bytesToTest) {
         this.bytesToTest = bytesToTest;
     }
 
     /**
-     * Looks at the beginning of the document input stream to determine
-     * whether the document is text or not.
+     * Looks at the beginning of the document input stream to determine whether the document is text
+     * or not.
      *
-     * @param input    document input stream, or null
+     * @param input document input stream, or null
      * @param metadata ignored
-     * @return "text/plain" if the input stream suggest a text document,
-     * "application/octet-stream" otherwise
+     * @return "text/plain" if the input stream suggests a text document, "application/octet-stream"
+     *         otherwise
      */
     public MediaType detect(InputStream input, Metadata metadata) throws IOException {
         if (input == null) {
diff --git a/tika-core/src/main/java/org/apache/tika/detect/TextStatistics.java b/tika-core/src/main/java/org/apache/tika/detect/TextStatistics.java
index 50f8d790aa..9434af8671 100644
--- a/tika-core/src/main/java/org/apache/tika/detect/TextStatistics.java
+++ b/tika-core/src/main/java/org/apache/tika/detect/TextStatistics.java
@@ -1,18 +1,16 @@
 /*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
+ * Licensed to the Apache Software Foundation (ASF) under one or more contributor license
+ * agreements. See the NOTICE file distributed with this work for additional information regarding
+ * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License. You may obtain a
+ * copy of the License at
  *
- *     http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
  *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
  */
 package org.apache.tika.detect;
 
@@ -35,11 +33,11 @@ public void addData(byte[] buffer, int offset, int length) {
     }
 
     /**
-     * Checks whether at least one byte was seen and that the bytes that
-     * were seen were mostly plain text (i.e. < 2% control, > 90% ASCII range).
+     * Checks whether at least one byte was seen and that the bytes that were seen were mostly plain
+     * text (i.e. < 2% control, > 90% ASCII range).
      *
-     * @return true if the seen bytes were mostly safe ASCII,
-     * false otherwise
+     * @return true if the seen bytes were mostly safe ASCII, false
+     *         otherwise
      * @see TIKA-483
      * @see TIKA-688
      */
@@ -53,8 +51,7 @@ public boolean isMostlyAscii() {
     /**
      * Checks whether the observed byte stream looks like UTF-8 encoded text.
      *
-     * @return true if the seen bytes look like UTF-8,
-     * false otherwise
+     * @return true if the seen bytes look like UTF-8, false otherwise
      * @since Apache Tika 1.3
      */
     public boolean looksLikeUTF8() {
@@ -63,16 +60,16 @@ public boolean looksLikeUTF8() {
         int safe = countSafeControl();
 
         int expectedContinuation = 0;
-        int[] leading = new int[]{count(0xc0, 0xe0), count(0xe0, 0xf0), count(0xf0, 0xf8)};
+        int[] leading = new int[] {count(0xc0, 0xe0), count(0xe0, 0xf0), count(0xf0, 0xf8)};
         for (int i = 0; i < leading.length; i++) {
             utf8 += leading[i];
             expectedContinuation += (i + 1) * leading[i];
         }
 
         int continuation = count(0x80, 0xc0);
-        return utf8 > 0 && continuation <= expectedContinuation &&
-                continuation >= expectedContinuation - 3 && count(0xf8, 0x100) == 0 &&
-                (control - safe) * 100 < utf8 * 2;
+        return utf8 > 0 && continuation <= expectedContinuation
+                        && continuation >= expectedContinuation - 3 && count(0xf8, 0x100) == 0
+                        && (control - safe) * 100 < utf8 * 2;
     }
 
     /**
@@ -95,13 +92,13 @@ public int count(int b) {
     }
 
     /**
-     * Counts control characters (i.e. < 0x20, excluding tab, CR, LF,
-     * page feed and escape).
+     * Counts control characters (i.e. < 0x20, excluding tab, CR, LF, page feed and escape).
      * 

- * This definition of control characters is based on section 4 of the - * "Content-Type Processing Model" Internet-draft + * This definition of control characters is based on section 4 of the "Content-Type Processing + * Model" Internet-draft * (draft-abarth-mime-sniff-01). + * *

      * +-------------------------+
      * | Binary data byte ranges |
@@ -150,7 +147,7 @@ private int count(int from, int to) {
 
     private int countSafeControl() {
         return count('\t') + count('\n') + count('\r') // tab, LF, CR
-                + count(0x0c) + count(0x1b);           // new page, escape
+                        + count(0x0c) + count(0x1b); // new page, escape
     }
 
 }
diff --git a/tika-core/src/main/java/org/apache/tika/detect/TrainedModel.java b/tika-core/src/main/java/org/apache/tika/detect/TrainedModel.java
index 0111b233bb..53e16dfcf1 100644
--- a/tika-core/src/main/java/org/apache/tika/detect/TrainedModel.java
+++ b/tika-core/src/main/java/org/apache/tika/detect/TrainedModel.java
@@ -1,18 +1,16 @@
 /*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
+ * Licensed to the Apache Software Foundation (ASF) under one or more contributor license
+ * agreements. See the NOTICE file distributed with this work for additional information regarding
+ * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License. You may obtain a
+ * copy of the License at
  *
- *     http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
  *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
  */
 package org.apache.tika.detect;
 
diff --git a/tika-core/src/main/java/org/apache/tika/detect/TrainedModelDetector.java b/tika-core/src/main/java/org/apache/tika/detect/TrainedModelDetector.java
index 25b9f085be..8592d67bc3 100644
--- a/tika-core/src/main/java/org/apache/tika/detect/TrainedModelDetector.java
+++ b/tika-core/src/main/java/org/apache/tika/detect/TrainedModelDetector.java
@@ -1,18 +1,16 @@
 /*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
+ * Licensed to the Apache Software Foundation (ASF) under one or more contributor license
+ * agreements. See the NOTICE file distributed with this work for additional information regarding
+ * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License. You may obtain a
+ * copy of the License at
  *
- *     http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
  *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
  */
 
 package org.apache.tika.detect;
@@ -31,7 +29,6 @@
 import java.nio.file.Path;
 import java.util.HashMap;
 import java.util.Map;
-
 import org.apache.tika.io.TemporaryResources;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
@@ -61,8 +58,7 @@ public MediaType detect(InputStream input, Metadata metadata) throws IOException
             MediaType maxType = MediaType.OCTET_STREAM;
 
             /*
-             * iterate the map to find out the one that gives the higher
-             * prediction value.
+             * iterate the map to find out the one that gives the higher prediction value.
              */
             for (Map.Entry entry : MODEL_MAP.entrySet()) {
                 MediaType key = entry.getKey();
diff --git a/tika-core/src/main/java/org/apache/tika/detect/TypeDetector.java b/tika-core/src/main/java/org/apache/tika/detect/TypeDetector.java
index 60d75c7b0c..6e4b14e079 100644
--- a/tika-core/src/main/java/org/apache/tika/detect/TypeDetector.java
+++ b/tika-core/src/main/java/org/apache/tika/detect/TypeDetector.java
@@ -1,43 +1,39 @@
 /*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
+ * Licensed to the Apache Software Foundation (ASF) under one or more contributor license
+ * agreements. See the NOTICE file distributed with this work for additional information regarding
+ * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License. You may obtain a
+ * copy of the License at
  *
- *     http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
  *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
  */
 package org.apache.tika.detect;
 
 import java.io.InputStream;
-
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
 
 /**
- * Content type detection based on a content type hint. This detector simply
- * trusts any valid content type hint given in the input metadata, and returns
- * that as the likely type of the input document.
+ * Content type detection based on a content type hint. This detector simply trusts any valid
+ * content type hint given in the input metadata, and returns that as the likely type of the input
+ * document.
  *
  * @since Apache Tika 0.3
  */
 public class TypeDetector implements Detector {
 
     /**
-     * Detects the content type of an input document based on a type hint
-     * given in the input metadata. The CONTENT_TYPE attribute of the given
-     * input metadata is expected to contain the type of the input document.
-     * If that attribute exists and contains a valid type name, then that
-     * type is returned.
+     * Detects the content type of an input document based on a type hint given in the input
+     * metadata. The CONTENT_TYPE attribute of the given input metadata is expected to contain the
+     * type of the input document. If that attribute exists and contains a valid type name, then
+     * that type is returned.
      *
-     * @param input    ignored
+     * @param input ignored
      * @param metadata input metadata, possibly with a CONTENT_TYPE value
      * @return detected media type, or application/octet-stream
      */
diff --git a/tika-core/src/main/java/org/apache/tika/detect/XmlRootExtractor.java b/tika-core/src/main/java/org/apache/tika/detect/XmlRootExtractor.java
index 94d8531498..5589318e74 100644
--- a/tika-core/src/main/java/org/apache/tika/detect/XmlRootExtractor.java
+++ b/tika-core/src/main/java/org/apache/tika/detect/XmlRootExtractor.java
@@ -1,18 +1,16 @@
 /*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
+ * Licensed to the Apache Software Foundation (ASF) under one or more contributor license
+ * agreements. See the NOTICE file distributed with this work for additional information regarding
+ * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License. You may obtain a
+ * copy of the License at
  *
- *     http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
  *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
  */
 package org.apache.tika.detect;
 
@@ -20,19 +18,17 @@
 import java.io.InputStream;
 import java.util.Arrays;
 import javax.xml.namespace.QName;
-
 import org.apache.commons.io.input.CloseShieldInputStream;
 import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.utils.XMLReaderUtils;
 import org.xml.sax.Attributes;
 import org.xml.sax.SAXException;
 import org.xml.sax.helpers.DefaultHandler;
 
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.utils.XMLReaderUtils;
-
 /**
- * Utility class that uses a {@link javax.xml.parsers.SAXParser} to determine
- * the namespace URI and local name of the root element of an XML file.
+ * Utility class that uses a {@link javax.xml.parsers.SAXParser} to determine the namespace URI and
+ * local name of the root element of an XML file.
  *
  * @since Apache Tika 0.4
  */
@@ -66,17 +62,16 @@ public QName extractRootElement(byte[] data) {
     public QName extractRootElement(InputStream stream) {
         return extractRootElement(stream, false);
     }
-    
+
     private QName extractRootElement(InputStream stream, boolean throwMalformed) {
         ExtractorHandler handler = new ExtractorHandler();
         try {
-            XMLReaderUtils.parseSAX(CloseShieldInputStream.wrap(stream),
-                    handler, EMPTY_CONTEXT);
+            XMLReaderUtils.parseSAX(CloseShieldInputStream.wrap(stream), handler, EMPTY_CONTEXT);
         } catch (SecurityException e) {
             throw e;
         } catch (Exception e) {
             if (throwMalformed && (e instanceof CharConversionException
-                    || e.getCause() instanceof CharConversionException)) {
+                            || e.getCause() instanceof CharConversionException)) {
                 throw new MalformedCharException(e);
             }
         }
@@ -89,7 +84,7 @@ private static class ExtractorHandler extends DefaultHandler {
 
         @Override
         public void startElement(String uri, String local, String name, Attributes attributes)
-                throws SAXException {
+                        throws SAXException {
             this.rootElement = new QName(uri, local);
             throw new SAXException("Aborting: root element received");
         }
diff --git a/tika-core/src/main/java/org/apache/tika/detect/ZeroSizeFileDetector.java b/tika-core/src/main/java/org/apache/tika/detect/ZeroSizeFileDetector.java
index 5ce52681e2..3ba9cbdb4a 100644
--- a/tika-core/src/main/java/org/apache/tika/detect/ZeroSizeFileDetector.java
+++ b/tika-core/src/main/java/org/apache/tika/detect/ZeroSizeFileDetector.java
@@ -1,24 +1,21 @@
 /*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
+ * Licensed to the Apache Software Foundation (ASF) under one or more contributor license
+ * agreements. See the NOTICE file distributed with this work for additional information regarding
+ * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License. You may obtain a
+ * copy of the License at
  *
- *     http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
  *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
  */
 package org.apache.tika.detect;
 
 import java.io.IOException;
 import java.io.InputStream;
-
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
 
diff --git a/tika-core/src/main/java/org/apache/tika/detect/package-info.java b/tika-core/src/main/java/org/apache/tika/detect/package-info.java
index dede49cfb6..7e6ae44762 100644
--- a/tika-core/src/main/java/org/apache/tika/detect/package-info.java
+++ b/tika-core/src/main/java/org/apache/tika/detect/package-info.java
@@ -1,18 +1,16 @@
 /*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
+ * Licensed to the Apache Software Foundation (ASF) under one or more contributor license
+ * agreements. See the NOTICE file distributed with this work for additional information regarding
+ * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License. You may obtain a
+ * copy of the License at
  *
- *     http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
  *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
  */
 
 /**
diff --git a/tika-core/src/main/java/org/apache/tika/embedder/Embedder.java b/tika-core/src/main/java/org/apache/tika/embedder/Embedder.java
index 2af59d326e..1aad277e55 100644
--- a/tika-core/src/main/java/org/apache/tika/embedder/Embedder.java
+++ b/tika-core/src/main/java/org/apache/tika/embedder/Embedder.java
@@ -1,18 +1,16 @@
 /*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
+ * Licensed to the Apache Software Foundation (ASF) under one or more contributor license
+ * agreements. See the NOTICE file distributed with this work for additional information regarding
+ * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License. You may obtain a
+ * copy of the License at
  *
- *     http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
  *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
  */
 package org.apache.tika.embedder;
 
@@ -21,7 +19,6 @@
 import java.io.OutputStream;
 import java.io.Serializable;
 import java.util.Set;
-
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
@@ -36,11 +33,11 @@
 public interface Embedder extends Serializable {
 
     /**
-     * Returns the set of media types supported by this embedder when used with
-     * the given parse context.
+     * Returns the set of media types supported by this embedder when used with the given parse
+     * context.
      * 

- * The name differs from the precedence of {@link Parser#getSupportedTypes(ParseContext)} - * so that parser implementations may also choose to implement this interface. + * The name differs from the precedence of {@link Parser#getSupportedTypes(ParseContext)} so + * that parser implementations may also choose to implement this interface. * * @param context parse context * @return immutable set of media types @@ -48,46 +45,41 @@ public interface Embedder extends Serializable { Set getSupportedEmbedTypes(ParseContext context); /** - * Embeds related document metadata from the given metadata object into the - * given output stream. + * Embeds related document metadata from the given metadata object into the given output stream. *

- * The given document stream is consumed but not closed by this method. The - * responsibility to close the stream remains on the caller. + * The given document stream is consumed but not closed by this method. The responsibility to + * close the stream remains on the caller. *

- * Information about the parsing context can be passed in the context - * parameter. See the parser implementations for the kinds of context - * information they expect. + * Information about the parsing context can be passed in the context parameter. See the parser + * implementations for the kinds of context information they expect. *

- * In general implementations should favor preserving the source file's metadata - * unless an update to a field is explicitly defined in the Metadata object. - * More specifically: + * In general implementations should favor preserving the source file's metadata unless an + * update to a field is explicitly defined in the Metadata object. More specifically: *

    - *
  • Embedder implementations should only attempt to update metadata fields - * present in the given Metadata object. Other fields should be left untouched.
  • - *
  • Embedder implementations should set properties as empty when the - * corresponding field in the Metadata object is an empty string, i.e. ""
  • - *
  • Embedder implementations should nullify or delete properties - * corresponding to fields with a null value in the given Metadata object.
  • - *
  • Embedder implementations should set the property - * corresponding to a particular field in the given Metadata object in all - * metadata containers whenever possible and appropriate for the file format at the time. - * If a particular metadata container falls out of use and/or is superseded by another - * (such as IIC vs XMP for IPTC) it is up to the implementation to decide if and when - * to cease embedding in the alternate container.
  • - *
  • Embedder implementations should attempt to embed as much of the metadata - * as accurately as possible. An implementation may choose a strict approach - * and throw an exception if a value to be embedded exceeds the length allowed - * or may choose to truncate the value.
  • + *
  • Embedder implementations should only attempt to update metadata fields present in the + * given Metadata object. Other fields should be left untouched.
  • + *
  • Embedder implementations should set properties as empty when the corresponding field in + * the Metadata object is an empty string, i.e. ""
  • + *
  • Embedder implementations should nullify or delete properties corresponding to fields with + * a null value in the given Metadata object.
  • + *
  • Embedder implementations should set the property corresponding to a particular field in + * the given Metadata object in all metadata containers whenever possible and appropriate for + * the file format at the time. If a particular metadata container falls out of use and/or is + * superseded by another (such as IIC vs XMP for IPTC) it is up to the implementation to decide + * if and when to cease embedding in the alternate container.
  • + *
  • Embedder implementations should attempt to embed as much of the metadata as accurately as + * possible. An implementation may choose a strict approach and throw an exception if a value to + * be embedded exceeds the length allowed or may choose to truncate the value.
  • *
* - * @param metadata document metadata (input and output) + * @param metadata document metadata (input and output) * @param originalStream the document stream (input) - * @param outputStream the output stream to write the metadata embedded data to - * @param context parse context - * @throws IOException if the document stream could not be read + * @param outputStream the output stream to write the metadata embedded data to + * @param context parse context + * @throws IOException if the document stream could not be read * @throws TikaException if the document could not be parsed */ void embed(Metadata metadata, InputStream originalStream, OutputStream outputStream, - ParseContext context) throws IOException, TikaException; + ParseContext context) throws IOException, TikaException; } diff --git a/tika-core/src/main/java/org/apache/tika/embedder/ExternalEmbedder.java b/tika-core/src/main/java/org/apache/tika/embedder/ExternalEmbedder.java index c58d57345f..edba0747b8 100644 --- a/tika-core/src/main/java/org/apache/tika/embedder/ExternalEmbedder.java +++ b/tika-core/src/main/java/org/apache/tika/embedder/ExternalEmbedder.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.embedder; @@ -29,10 +27,8 @@ import java.util.List; import java.util.Map; import java.util.Set; - import org.apache.commons.io.IOUtils; import org.apache.commons.io.output.UnsynchronizedByteArrayOutputStream; - import org.apache.tika.exception.TikaException; import org.apache.tika.io.TemporaryResources; import org.apache.tika.io.TikaInputStream; @@ -43,24 +39,22 @@ import org.apache.tika.parser.external.ExternalParser; /** - * Embedder that uses an external program (like sed or exiftool) to embed text - * content and metadata into a given document. + * Embedder that uses an external program (like sed or exiftool) to embed text content and metadata + * into a given document. 
* * @since Apache Tika 1.3 */ public class ExternalEmbedder implements Embedder { /** - * Token to be replaced with a String array of metadata assignment command - * arguments + * Token to be replaced with a String array of metadata assignment command arguments */ public static final String METADATA_COMMAND_ARGUMENTS_TOKEN = "${METADATA}"; /** - * Token to be replaced with a String array of metadata assignment command - * arguments + * Token to be replaced with a String array of metadata assignment command arguments */ public static final String METADATA_COMMAND_ARGUMENTS_SERIALIZED_TOKEN = - "${METADATA_SERIALIZED}"; + "${METADATA_SERIALIZED}"; private static final long serialVersionUID = -2828829275642475697L; private final TemporaryResources tmp = new TemporaryResources(); /** @@ -76,8 +70,8 @@ public class ExternalEmbedder implements Embedder { * * @see Runtime#exec(String[]) */ - private String[] command = - new String[]{"sed", "-e", "$a\\\n" + METADATA_COMMAND_ARGUMENTS_SERIALIZED_TOKEN, + private String[] command = new String[] {"sed", "-e", + "$a\\\n" + METADATA_COMMAND_ARGUMENTS_SERIALIZED_TOKEN, ExternalParser.INPUT_FILE_TOKEN}; private String commandAssignmentOperator = "="; private String commandAssignmentDelimeter = ", "; @@ -85,8 +79,7 @@ public class ExternalEmbedder implements Embedder { private boolean quoteAssignmentValues = false; /** - * Serializes a collection of metadata command line arguments into a single - * string. + * Serializes a collection of metadata command line arguments into a single string. * * @param metadataCommandArguments * @return the serialized metadata arguments string @@ -99,30 +92,28 @@ protected static String serializeMetadata(List metadataCommandArguments) } /** - * Checks to see if the command can be run. Typically used with something - * like "myapp --version" to check to see if "myapp" is installed and on the - * path. + * Checks to see if the command can be run. 
Typically used with something like "myapp --version" + * to check to see if "myapp" is installed and on the path. * - * @param checkCmd the check command to run + * @param checkCmd the check command to run * @param errorValue what is considered an error value? * @return whether or not the check completed without error */ public static boolean check(String checkCmd, int... errorValue) { - return check(new String[]{checkCmd}, errorValue); + return check(new String[] {checkCmd}, errorValue); } /** - * Checks to see if the command can be run. Typically used with something - * like "myapp --version" to check to see if "myapp" is installed and on the - * path. + * Checks to see if the command can be run. Typically used with something like "myapp --version" + * to check to see if "myapp" is installed and on the path. * - * @param checkCmd the check command to run + * @param checkCmd the check command to run * @param errorValue what is considered an error value? * @return whether or not the check completed without error */ public static boolean check(String[] checkCmd, int... errorValue) { if (errorValue.length == 0) { - errorValue = new int[]{127}; + errorValue = new int[] {127}; } try { @@ -156,15 +147,13 @@ public Set getSupportedEmbedTypes() { } public void setSupportedEmbedTypes(Set supportedEmbedTypes) { - this.supportedEmbedTypes = - Collections.unmodifiableSet(new HashSet<>(supportedEmbedTypes)); + this.supportedEmbedTypes = Collections.unmodifiableSet(new HashSet<>(supportedEmbedTypes)); } /** * Gets the command to be run. This can include either of - * {@link ExternalParser#INPUT_FILE_TOKEN} or - * {@link ExternalParser#OUTPUT_FILE_TOKEN} if the command - * needs filenames. + * {@link ExternalParser#INPUT_FILE_TOKEN} or {@link ExternalParser#OUTPUT_FILE_TOKEN} if the + * command needs filenames. * * @return */ @@ -174,9 +163,8 @@ public String[] getCommand() { /** * Sets the command to be run. 
This can include either of - * {@link ExternalParser#INPUT_FILE_TOKEN} or - * {@link ExternalParser#OUTPUT_FILE_TOKEN} if the command - * needs filenames. + * {@link ExternalParser#INPUT_FILE_TOKEN} or {@link ExternalParser#OUTPUT_FILE_TOKEN} if the + * command needs filenames. * * @see Runtime#exec(String[]) */ @@ -203,8 +191,7 @@ public void setCommandAssignmentOperator(String commandAssignmentOperator) { } /** - * Gets the delimiter for multiple assignments for the command line tool, - * i.e. ", ". + * Gets the delimiter for multiple assignments for the command line tool, i.e. ", ". * * @return the assignment delimiter */ @@ -213,8 +200,7 @@ public String getCommandAssignmentDelimeter() { } /** - * Sets the delimiter for multiple assignments for the command line tool, - * i.e. ", ". + * Sets the delimiter for multiple assignments for the command line tool, i.e. ", ". * * @param commandAssignmentDelimeter */ @@ -223,8 +209,7 @@ public void setCommandAssignmentDelimeter(String commandAssignmentDelimeter) { } /** - * Gets the operator to append rather than replace a value for the command - * line tool, i.e. "+=". + * Gets the operator to append rather than replace a value for the command line tool, i.e. "+=". * * @return the append operator */ @@ -233,8 +218,7 @@ public String getCommandAppendOperator() { } /** - * Sets the operator to append rather than replace a value for the command - * line tool, i.e. "+=". + * Sets the operator to append rather than replace a value for the command line tool, i.e. "+=". * * @param commandAppendOperator */ @@ -243,8 +227,7 @@ public void setCommandAppendOperator(String commandAppendOperator) { } /** - * Gets whether or not to quote assignment values, i.e. tag='value'. The - * default is false. + * Gets whether or not to quote assignment values, i.e. tag='value'. The default is false. 
* * @return whether or not to quote assignment values */ @@ -271,8 +254,8 @@ public Map getMetadataCommandArguments() { } /** - * Sets the map of Metadata keys to command line parameters. Set this to - * null to disable Metadata embedding. + * Sets the map of Metadata keys to command line parameters. Set this to null to disable + * Metadata embedding. * * @param arguments */ @@ -281,8 +264,8 @@ public void setMetadataCommandArguments(Map arguments) { } /** - * Constructs a collection of command line arguments responsible for setting - * individual metadata fields based on the given metadata. + * Constructs a collection of command line arguments responsible for setting individual metadata + * fields based on the given metadata. * * @param metadata the metadata to embed * @return the metadata-related command line arguments @@ -304,18 +287,16 @@ protected List getCommandMetadataSegments(Metadata metadata) { if (quoteAssignmentValues) { assignmentValue = "'" + assignmentValue + "'"; } - commandMetadataSegments - .add(metadataCommandArgument + commandAppendOperator + - assignmentValue); + commandMetadataSegments.add(metadataCommandArgument + + commandAppendOperator + assignmentValue); } } else { String assignmentValue = metadata.get(metadataName); if (quoteAssignmentValues) { assignmentValue = "'" + assignmentValue + "'"; } - commandMetadataSegments - .add(metadataCommandArgument + commandAssignmentOperator + - assignmentValue); + commandMetadataSegments.add(metadataCommandArgument + + commandAssignmentOperator + assignmentValue); } } } @@ -326,20 +307,19 @@ protected List getCommandMetadataSegments(Metadata metadata) { } /** - * Executes the configured external command and passes the given document - * stream as a simple XHTML document to the given SAX content handler. - * Metadata is only extracted if {@link #setMetadataCommandArguments(Map)} - * has been called to set arguments. 
+ * Executes the configured external command and passes the given document stream as a simple + * XHTML document to the given SAX content handler. Metadata is only extracted if + * {@link #setMetadataCommandArguments(Map)} has been called to set arguments. */ @Override public void embed(final Metadata metadata, final InputStream inputStream, - final OutputStream outputStream, final ParseContext context) - throws IOException, TikaException { + final OutputStream outputStream, final ParseContext context) + throws IOException, TikaException { boolean inputToStdIn = true; boolean outputFromStdOut = true; boolean hasMetadataCommandArguments = - (metadataCommandArguments != null && !metadataCommandArguments.isEmpty()); + (metadataCommandArguments != null && !metadataCommandArguments.isEmpty()); boolean serializeMetadataCommandArgumentsToken = false; boolean replacedMetadataCommandArgumentsToken = false; @@ -357,13 +337,13 @@ public void embed(final Metadata metadata, final InputStream inputStream, for (String commandSegment : origCmd) { if (commandSegment.contains(ExternalParser.INPUT_FILE_TOKEN)) { commandSegment = commandSegment.replace(ExternalParser.INPUT_FILE_TOKEN, - tikaInputStream.getFile().toString()); + tikaInputStream.getFile().toString()); inputToStdIn = false; } if (commandSegment.contains(ExternalParser.OUTPUT_FILE_TOKEN)) { tempOutputFile = tmp.createTemporaryFile(); - commandSegment = commandSegment - .replace(ExternalParser.OUTPUT_FILE_TOKEN, tempOutputFile.toString()); + commandSegment = commandSegment.replace(ExternalParser.OUTPUT_FILE_TOKEN, + tempOutputFile.toString()); outputFromStdOut = false; } if (commandSegment.contains(METADATA_COMMAND_ARGUMENTS_SERIALIZED_TOKEN)) { @@ -384,15 +364,15 @@ public void embed(final Metadata metadata, final InputStream inputStream, int i = 0; for (String commandSegment : cmd) { if (commandSegment.contains(METADATA_COMMAND_ARGUMENTS_SERIALIZED_TOKEN)) { - commandSegment = commandSegment - 
.replace(METADATA_COMMAND_ARGUMENTS_SERIALIZED_TOKEN, + commandSegment = commandSegment.replace( + METADATA_COMMAND_ARGUMENTS_SERIALIZED_TOKEN, serializeMetadata(commandMetadataSegments)); cmd.set(i, commandSegment); } i++; } - } else if (!replacedMetadataCommandArgumentsToken && - !serializeMetadataCommandArgumentsToken) { + } else if (!replacedMetadataCommandArgumentsToken + && !serializeMetadataCommandArgumentsToken) { // Tack metadata onto the end of the cmd as arguments cmd.addAll(commandMetadataSegments); } @@ -406,7 +386,8 @@ public void embed(final Metadata metadata, final InputStream inputStream, process = Runtime.getRuntime().exec(cmd.toArray(String[]::new)); } - UnsynchronizedByteArrayOutputStream stdErrOutputStream = UnsynchronizedByteArrayOutputStream.builder().get(); + UnsynchronizedByteArrayOutputStream stdErrOutputStream = + UnsynchronizedByteArrayOutputStream.builder().get(); try { sendStdErrToOutputStream(process, stdErrOutputStream); @@ -427,7 +408,8 @@ public void embed(final Metadata metadata, final InputStream inputStream, } catch (InterruptedException ignore) { } // The command is finished, read the output file into the given output stream - InputStream tempOutputFileInputStream = TikaInputStream.get(tempOutputFile.toPath()); + InputStream tempOutputFileInputStream = + TikaInputStream.get(tempOutputFile.toPath()); IOUtils.copy(tempOutputFileInputStream, outputStream); } } finally { @@ -441,7 +423,7 @@ public void embed(final Metadata metadata, final InputStream inputStream, // Clean up temp output files tempOutputFile.delete(); } catch (Exception e) { - //swallow + // swallow } } if (!inputToStdIn) { @@ -452,9 +434,9 @@ public void embed(final Metadata metadata, final InputStream inputStream, IOUtils.closeQuietly(outputStream); IOUtils.closeQuietly(stdErrOutputStream); if (process.exitValue() != 0) { - throw new TikaException("There was an error executing the command line" + - "\nExecutable Command:\n\n" + cmd + "\nExecutable Error:\n\n" + - 
stdErrOutputStream.toString(UTF_8.name())); + throw new TikaException("There was an error executing the command line" + + "\nExecutable Command:\n\n" + cmd + "\nExecutable Error:\n\n" + + stdErrOutputStream.toString(UTF_8.name())); } } } @@ -462,11 +444,11 @@ public void embed(final Metadata metadata, final InputStream inputStream, /** * Creates a new thread for copying a given input stream to a given output stream. * - * @param inputStream the source input stream + * @param inputStream the source input stream * @param outputStream the target output stream */ private void multiThreadedStreamCopy(final InputStream inputStream, - final OutputStream outputStream) { + final OutputStream outputStream) { new Thread(() -> { try { IOUtils.copy(inputStream, outputStream); @@ -477,13 +459,12 @@ private void multiThreadedStreamCopy(final InputStream inputStream, } /** - * Sends the contents of the given input stream to the - * standard input of the given process. Potential exceptions are - * ignored. + * Sends the contents of the given input stream to the standard input of the given process. + * Potential exceptions are ignored. *

* Note that the given input stream is not closed by this method. * - * @param process the process + * @param process the process * @param inputStream the input stream to send to standard input of the process */ private void sendInputStreamToStdIn(final InputStream inputStream, final Process process) { @@ -491,13 +472,12 @@ private void sendInputStreamToStdIn(final InputStream inputStream, final Process } /** - * Sends the standard output of the given - * process to the given output stream. Potential exceptions are - * ignored. + * Sends the standard output of the given process to the given output stream. Potential + * exceptions are ignored. *

* Note that the given output stream is not closed by this method. * - * @param process the process + * @param process the process * @param outputStream the putput stream to send to standard input of the process */ private void sendStdOutToOutputStream(final Process process, final OutputStream outputStream) { @@ -509,12 +489,11 @@ private void sendStdOutToOutputStream(final Process process, final OutputStream } /** - * Starts a thread that reads and discards the contents of the standard - * stream of the given process. Potential exceptions are ignored, and the - * stream is closed once fully processed. + * Starts a thread that reads and discards the contents of the standard stream of the given + * process. Potential exceptions are ignored, and the stream is closed once fully processed. * - * @param process the process - * param outputStream the output stream to send to standard error of the process + * @param process the process + * @param outputStream the output stream to send to standard error of the process */ private void sendStdErrToOutputStream(final Process process, final OutputStream outputStream) { multiThreadedStreamCopy(process.getErrorStream(), outputStream); diff --git a/tika-core/src/main/java/org/apache/tika/exception/AccessPermissionException.java b/tika-core/src/main/java/org/apache/tika/exception/AccessPermissionException.java index b5f2136ea9..52467d568e 100644 --- a/tika-core/src/main/java/org/apache/tika/exception/AccessPermissionException.java +++ b/tika-core/src/main/java/org/apache/tika/exception/AccessPermissionException.java @@ -1,25 +1,22 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.exception; /** - * Exception to be thrown when a document does not allow content extraction. - * As of this writing, PDF documents are the only type of document that might - * cause this type of exception. + * Exception to be thrown when a document does not allow content extraction. As of this writing, PDF + * documents are the only type of document that might cause this type of exception. 
*/ public class AccessPermissionException extends TikaException { public AccessPermissionException() { diff --git a/tika-core/src/main/java/org/apache/tika/exception/CorruptedFileException.java b/tika-core/src/main/java/org/apache/tika/exception/CorruptedFileException.java index 5ebad6d3a6..4872e530d3 100644 --- a/tika-core/src/main/java/org/apache/tika/exception/CorruptedFileException.java +++ b/tika-core/src/main/java/org/apache/tika/exception/CorruptedFileException.java @@ -1,24 +1,22 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. 
See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.exception; /** - * This exception should be thrown when the parse absolutely, positively has to stop. - * This exception must not be caught and swallowed if an embedded parser throws it. + * This exception should be thrown when the parse absolutely, positively has to stop. This exception + * must not be caught and swallowed if an embedded parser throws it. */ public class CorruptedFileException extends TikaException { public CorruptedFileException(String msg) { diff --git a/tika-core/src/main/java/org/apache/tika/exception/EncryptedDocumentException.java b/tika-core/src/main/java/org/apache/tika/exception/EncryptedDocumentException.java index f86739e77e..881860136c 100644 --- a/tika-core/src/main/java/org/apache/tika/exception/EncryptedDocumentException.java +++ b/tika-core/src/main/java/org/apache/tika/exception/EncryptedDocumentException.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.exception; diff --git a/tika-core/src/main/java/org/apache/tika/exception/FileTooLongException.java b/tika-core/src/main/java/org/apache/tika/exception/FileTooLongException.java index 3ec3294b31..4d20053937 100644 --- a/tika-core/src/main/java/org/apache/tika/exception/FileTooLongException.java +++ b/tika-core/src/main/java/org/apache/tika/exception/FileTooLongException.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. 
The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.exception; @@ -20,8 +18,7 @@ import java.io.IOException; /* - * Thrown when fetcher or similar has a maxLength set, but the - * underlying file is too long. + * Thrown when fetcher or similar has a maxLength set, but the underlying file is too long. */ public class FileTooLongException extends IOException { @@ -34,8 +31,8 @@ public FileTooLongException(long length, long maxLength) { } private static String msg(long length, long maxLength) { - return "File is " + length + " bytes, but " + maxLength + - " is the maximum length allowed. You can modify maxLength via " + - "the setter on the fetcher."; + return "File is " + length + " bytes, but " + maxLength + + " is the maximum length allowed. 
You can modify maxLength via " + + "the setter on the fetcher."; } } diff --git a/tika-core/src/main/java/org/apache/tika/exception/RuntimeSAXException.java b/tika-core/src/main/java/org/apache/tika/exception/RuntimeSAXException.java index 4e0bc43087..50f29583dd 100644 --- a/tika-core/src/main/java/org/apache/tika/exception/RuntimeSAXException.java +++ b/tika-core/src/main/java/org/apache/tika/exception/RuntimeSAXException.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. 
See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.exception; diff --git a/tika-core/src/main/java/org/apache/tika/exception/TikaConfigException.java b/tika-core/src/main/java/org/apache/tika/exception/TikaConfigException.java index 1dcd3275f9..7c5f52b337 100644 --- a/tika-core/src/main/java/org/apache/tika/exception/TikaConfigException.java +++ b/tika-core/src/main/java/org/apache/tika/exception/TikaConfigException.java @@ -1,25 +1,22 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
+ * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.exception; /** - * Tika Config Exception is an exception to occur when there is an error - * in Tika config file and/or one or more of the parsers failed to initialize - * from that erroneous config. + * Tika Config Exception is an exception to occur when there is an error in Tika config file and/or + * one or more of the parsers failed to initialize from that erroneous config. * * @since Apache Tika 1.14 */ diff --git a/tika-core/src/main/java/org/apache/tika/exception/TikaException.java b/tika-core/src/main/java/org/apache/tika/exception/TikaException.java index ceac19d6a6..a9dde433a0 100644 --- a/tika-core/src/main/java/org/apache/tika/exception/TikaException.java +++ b/tika-core/src/main/java/org/apache/tika/exception/TikaException.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.exception; diff --git a/tika-core/src/main/java/org/apache/tika/exception/TikaMemoryLimitException.java b/tika-core/src/main/java/org/apache/tika/exception/TikaMemoryLimitException.java index fbc1a95528..5eac8746a4 100644 --- a/tika-core/src/main/java/org/apache/tika/exception/TikaMemoryLimitException.java +++ b/tika-core/src/main/java/org/apache/tika/exception/TikaMemoryLimitException.java @@ -1,26 +1,24 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. 
The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.exception; /* - * Thrown when a parser is asked to allocate more memory than is allowable - * for a given threshold. For example, the ZCompressorInputStream might - * be asked to create an array many gigabytes of length by a corrupt file. + * Thrown when a parser is asked to allocate more memory than is allowable for a given threshold. + * For example, the ZCompressorInputStream might be asked to create an array many gigabytes of + * length by a corrupt file. */ public class TikaMemoryLimitException extends TikaException { @@ -33,8 +31,8 @@ public TikaMemoryLimitException(long triedToAllocate, long maxAllowable) { } private static String msg(long triedToAllocate, long maxAllowable) { - return "Tried to allocate " + triedToAllocate + " bytes, but " + maxAllowable + - " is the maximum allowed. 
Please open an issue https://issues.apache.org/jira/projects/TIKA" + - " if you believe this file is not corrupt."; + return "Tried to allocate " + triedToAllocate + " bytes, but " + maxAllowable + + " is the maximum allowed. Please open an issue https://issues.apache.org/jira/projects/TIKA" + + " if you believe this file is not corrupt."; } } diff --git a/tika-core/src/main/java/org/apache/tika/exception/TikaTimeoutException.java b/tika-core/src/main/java/org/apache/tika/exception/TikaTimeoutException.java index a53dbd6a31..42d1cb3a5e 100644 --- a/tika-core/src/main/java/org/apache/tika/exception/TikaTimeoutException.java +++ b/tika-core/src/main/java/org/apache/tika/exception/TikaTimeoutException.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
+ * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.exception; diff --git a/tika-core/src/main/java/org/apache/tika/exception/UnsupportedFormatException.java b/tika-core/src/main/java/org/apache/tika/exception/UnsupportedFormatException.java index 4322e64f9b..a0e778915a 100644 --- a/tika-core/src/main/java/org/apache/tika/exception/UnsupportedFormatException.java +++ b/tika-core/src/main/java/org/apache/tika/exception/UnsupportedFormatException.java @@ -1,35 +1,30 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.exception; /** - * Parsers should throw this exception when they encounter - * a file format that they do not support. This should only happen - * when we're not able to differentiate versions by the mime. For example, - * At the time of this writing, "application/wordperfect" covers all versions - * of the wordperfect format; however, the parser only handles 6.x. + * Parsers should throw this exception when they encounter a file format that they do not support. + * This should only happen when we're not able to differentiate versions by the mime. For example, + * At the time of this writing, "application/wordperfect" covers all versions of the wordperfect + * format; however, the parser only handles 6.x. *

- * Whenever possible/convenient, it is better to distinguish file formats by mime - * so that unsupported formats will be handled by the - * {@link org.apache.tika.parser.EmptyParser}. - * However, if we can't differentiate by mime or we need to rely on the parser - * to distinguish the versions (in the case that magic can't distinguish), - * this exception should be thrown. + * Whenever possible/convenient, it is better to distinguish file formats by mime so that + * unsupported formats will be handled by the {@link org.apache.tika.parser.EmptyParser}. However, + * if we can't differentiate by mime or we need to rely on the parser to distinguish the versions + * (in the case that magic can't distinguish), this exception should be thrown. */ public class UnsupportedFormatException extends TikaException { diff --git a/tika-core/src/main/java/org/apache/tika/exception/WriteLimitReachedException.java b/tika-core/src/main/java/org/apache/tika/exception/WriteLimitReachedException.java index 3e661ada5c..a66a12dbbf 100644 --- a/tika-core/src/main/java/org/apache/tika/exception/WriteLimitReachedException.java +++ b/tika-core/src/main/java/org/apache/tika/exception/WriteLimitReachedException.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. 
The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.exception; @@ -20,10 +18,11 @@ public class WriteLimitReachedException extends SAXException { - //in case of (hopefully impossible) cyclic exception + // in case of (hopefully impossible) cyclic exception private final static int MAX_DEPTH = 100; private final int writeLimit; + public WriteLimitReachedException(int writeLimit) { this.writeLimit = writeLimit; } @@ -31,18 +30,17 @@ public WriteLimitReachedException(int writeLimit) { @Override public String getMessage() { return "Your document contained more than " + writeLimit - + " characters, and so your requested limit has been" - + " reached. To receive the full text of the document," - + " increase your limit. (Text up to the limit is" - + " however available)."; + + " characters, and so your requested limit has been" + + " reached. To receive the full text of the document," + + " increase your limit. 
 (Text up to the limit is" + " however available)."; } + /** - * Checks whether the given exception (or any of it's root causes) was - * thrown by this handler as a signal of reaching the write limit. + * Checks whether the given exception (or any of its root causes) was thrown by this handler as + * a signal of reaching the write limit. * * @param t throwable - * @return true if the write limit was reached, - * false otherwise + * @return true if the write limit was reached, false otherwise * @since Apache Tika 2.0 */ public static boolean isWriteLimitReached(Throwable t) { diff --git a/tika-core/src/main/java/org/apache/tika/exception/ZeroByteFileException.java b/tika-core/src/main/java/org/apache/tika/exception/ZeroByteFileException.java index 125bc21b90..9f6bc3e943 100644 --- a/tika-core/src/main/java/org/apache/tika/exception/ZeroByteFileException.java +++ b/tika-core/src/main/java/org/apache/tika/exception/ZeroByteFileException.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.exception; @@ -26,15 +24,15 @@ public class ZeroByteFileException extends TikaException { /** * If this is in the {@link org.apache.tika.parser.ParseContext}, the * {@link org.apache.tika.parser.AutoDetectParser} and the - * {@link org.apache.tika.parser.RecursiveParserWrapper} will - * ignore embedded files with zero-byte length inputstreams + * {@link org.apache.tika.parser.RecursiveParserWrapper} will ignore embedded files with + * zero-byte length inputstreams */ public static IgnoreZeroByteFileException IGNORE_ZERO_BYTE_FILE_EXCEPTION = - new IgnoreZeroByteFileException(); + new IgnoreZeroByteFileException(); - //If this is in the parse context, the AutoDetectParser and the - //RecursiveParserWrapper should ignore zero byte files - //and not throw a Zero} + // If this is in the parse context, the AutoDetectParser and the + // RecursiveParserWrapper should ignore zero byte files + // and not throw a Zero} public ZeroByteFileException(String msg) { super(msg); } diff --git a/tika-core/src/main/java/org/apache/tika/exception/package-info.java b/tika-core/src/main/java/org/apache/tika/exception/package-info.java index 80ab125814..d078aa115c 100644 --- 
a/tika-core/src/main/java/org/apache/tika/exception/package-info.java +++ b/tika-core/src/main/java/org/apache/tika/exception/package-info.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ /** diff --git a/tika-core/src/main/java/org/apache/tika/extractor/BasicEmbeddedBytesSelector.java b/tika-core/src/main/java/org/apache/tika/extractor/BasicEmbeddedBytesSelector.java index 1d5a239db6..e6448c595c 100644 --- a/tika-core/src/main/java/org/apache/tika/extractor/BasicEmbeddedBytesSelector.java +++ b/tika-core/src/main/java/org/apache/tika/extractor/BasicEmbeddedBytesSelector.java @@ -1,23 +1,20 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. 
See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.extractor; import java.util.Set; - import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.mime.MediaType; @@ -34,8 +31,8 @@ public class BasicEmbeddedBytesSelector implements EmbeddedBytesSelector { private final Set excludeEmbeddedResourceTypes; public BasicEmbeddedBytesSelector(Set includeMimes, Set excludeMimes, - Set includeEmbeddedResourceTypes, - Set excludeEmbeddedResourceTypes) { + Set includeEmbeddedResourceTypes, + Set excludeEmbeddedResourceTypes) { this.includeMimes = includeMimes; this.excludeMimes = excludeMimes; this.includeEmbeddedResourceTypes = includeEmbeddedResourceTypes; @@ -47,7 +44,7 @@ public boolean select(Metadata metadata) { if (mime == null) { mime = ""; } else { - //if mime matters at all, make sure to get the mime without parameters + // if mime matters at all, make sure to get the mime without parameters if (includeMimes.size() > 0 || excludeMimes.size() > 0) { MediaType mt = MediaType.parse(mime); if (mt != null) { @@ -58,18 +55,19 @@ public boolean select(Metadata metadata) { if (excludeMimes.contains(mime)) { return false; } - if (includeMimes.size() > 0 && ! includeMimes.contains(mime)) { + if (includeMimes.size() > 0 && !includeMimes.contains(mime)) { return false; } String embeddedResourceType = metadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE); - //if a parser doesn't specify the type, treat it as ATTACHMENT - embeddedResourceType = StringUtils.isBlank(embeddedResourceType) ? "ATTACHMENT" : - embeddedResourceType; + // if a parser doesn't specify the type, treat it as ATTACHMENT + embeddedResourceType = StringUtils.isBlank(embeddedResourceType) ? 
"ATTACHMENT" + : embeddedResourceType; if (excludeEmbeddedResourceTypes.contains(embeddedResourceType)) { return false; } - if (includeEmbeddedResourceTypes.size() > 0 && includeEmbeddedResourceTypes.contains(embeddedResourceType)) { + if (includeEmbeddedResourceTypes.size() > 0 + && includeEmbeddedResourceTypes.contains(embeddedResourceType)) { return true; } return false; diff --git a/tika-core/src/main/java/org/apache/tika/extractor/ContainerExtractor.java b/tika-core/src/main/java/org/apache/tika/extractor/ContainerExtractor.java index cfc70b5f36..1c7ffd48fc 100644 --- a/tika-core/src/main/java/org/apache/tika/extractor/ContainerExtractor.java +++ b/tika-core/src/main/java/org/apache/tika/extractor/ContainerExtractor.java @@ -1,64 +1,56 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.extractor; import java.io.IOException; import java.io.Serializable; - import org.apache.tika.exception.TikaException; import org.apache.tika.io.TikaInputStream; /** - * Tika container extractor interface. - * Container Extractors provide access to the embedded - * resources within container formats such as .zip and .doc + * Tika container extractor interface. Container Extractors provide access to the embedded resources + * within container formats such as .zip and .doc */ public interface ContainerExtractor extends Serializable { /** - * Is this Container Extractor able to process the - * supplied container? + * Is this Container Extractor able to process the supplied container? * * @since Apache Tika 0.8 */ boolean isSupported(TikaInputStream input) throws IOException; /** - * Processes a container file, and extracts all the embedded - * resources from within it. + * Processes a container file, and extracts all the embedded resources from within it. *

- * The {@link EmbeddedResourceHandler} you supply will - * be called for each embedded resource in the container. It is - * up to you whether you process the contents of the resource or not. + * The {@link EmbeddedResourceHandler} you supply will be called for each embedded resource in + * the container. It is up to you whether you process the contents of the resource or not. *

- * The given document stream is consumed but not closed by this method. - * The responsibility to close the stream remains on the caller. + * The given document stream is consumed but not closed by this method. The responsibility to + * close the stream remains on the caller. *

- * If required, nested containers (such as a .docx within a .zip) - * can automatically be recursed into, and processed inline. If - * no recurseExtractor is given, the nested containers will be + * If required, nested containers (such as a .docx within a .zip) can automatically be recursed + * into, and processed inline. If no recurseExtractor is given, the nested containers will be * treated as with any other embedded resources. * - * @param stream the document stream (input) + * @param stream the document stream (input) * @param recurseExtractor the extractor to use on any embedded containers - * @param handler handler for the embedded files (output) - * @throws IOException if the document stream could not be read + * @param handler handler for the embedded files (output) + * @throws IOException if the document stream could not be read * @throws TikaException if the container could not be parsed * @since Apache Tika 0.8 */ void extract(TikaInputStream stream, ContainerExtractor recurseExtractor, - EmbeddedResourceHandler handler) throws IOException, TikaException; + EmbeddedResourceHandler handler) throws IOException, TikaException; } diff --git a/tika-core/src/main/java/org/apache/tika/extractor/DefaultEmbeddedStreamTranslator.java b/tika-core/src/main/java/org/apache/tika/extractor/DefaultEmbeddedStreamTranslator.java index b9d6985cce..41017a5cee 100644 --- a/tika-core/src/main/java/org/apache/tika/extractor/DefaultEmbeddedStreamTranslator.java +++ b/tika-core/src/main/java/org/apache/tika/extractor/DefaultEmbeddedStreamTranslator.java @@ -1,41 +1,38 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.extractor; import java.io.IOException; import java.io.OutputStream; import java.util.List; - import org.apache.tika.config.ServiceLoader; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.utils.ServiceLoaderUtils; /** - * Loads EmbeddedStreamTranslators via service loading. Tries to run each - * in turn. If a translator accepts the stream, it will do the translation but not close the stream. + * Loads EmbeddedStreamTranslators via service loading. Tries to run each in turn. If a translator + * accepts the stream, it will do the translation but not close the stream. 
*/ public class DefaultEmbeddedStreamTranslator implements EmbeddedStreamTranslator { final List translators; private static List getDefaultFilters(ServiceLoader loader) { - List embeddedStreamTranslators - = loader.loadServiceProviders(EmbeddedStreamTranslator.class); + List embeddedStreamTranslators = + loader.loadServiceProviders(EmbeddedStreamTranslator.class); ServiceLoaderUtils.sortLoadedClasses(embeddedStreamTranslators); return embeddedStreamTranslators; } @@ -49,15 +46,17 @@ private DefaultEmbeddedStreamTranslator(List translato } /** - * This should sniff the stream to determine if it needs to be translated. - * The translator is responsible for resetting the stream if any bytes have been read. + * This should sniff the stream to determine if it needs to be translated. The translator is + * responsible for resetting the stream if any bytes have been read. + * * @param inputStream * @param metadata * @return * @throws IOException */ @Override - public boolean shouldTranslate(TikaInputStream inputStream, Metadata metadata) throws IOException { + public boolean shouldTranslate(TikaInputStream inputStream, Metadata metadata) + throws IOException { for (EmbeddedStreamTranslator translator : translators) { if (translator.shouldTranslate(inputStream, metadata)) { return true; @@ -68,6 +67,7 @@ public boolean shouldTranslate(TikaInputStream inputStream, Metadata metadata) t /** * This will consume the InputStream and write the stream to the output stream + * * @param inputStream * @param metadata * @param outputStream to write to @@ -75,7 +75,8 @@ public boolean shouldTranslate(TikaInputStream inputStream, Metadata metadata) t * @throws IOException */ @Override - public void translate(TikaInputStream inputStream, Metadata metadata, OutputStream outputStream) throws IOException { + public void translate(TikaInputStream inputStream, Metadata metadata, OutputStream outputStream) + throws IOException { for (EmbeddedStreamTranslator translator : translators) { if 
(translator.shouldTranslate(inputStream, metadata)) { translator.translate(inputStream, metadata, outputStream); diff --git a/tika-core/src/main/java/org/apache/tika/extractor/DocumentSelector.java b/tika-core/src/main/java/org/apache/tika/extractor/DocumentSelector.java index aa34aa12bb..3e3cc70ee7 100644 --- a/tika-core/src/main/java/org/apache/tika/extractor/DocumentSelector.java +++ b/tika-core/src/main/java/org/apache/tika/extractor/DocumentSelector.java @@ -1,28 +1,25 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. 
See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.extractor; import org.apache.tika.metadata.Metadata; /** - * Interface for different document selection strategies for purposes like - * embedded document extraction by a {@link ContainerExtractor} instance. - * An implementation of this interface defines some specific selection - * criteria to be applied against the document metadata passed to the + * Interface for different document selection strategies for purposes like embedded document + * extraction by a {@link ContainerExtractor} instance. An implementation of this interface defines + * some specific selection criteria to be applied against the document metadata passed to the * {@link #select(Metadata)} method. * * @since Apache Tika 0.8 @@ -30,12 +27,11 @@ public interface DocumentSelector { /** - * Checks if a document with the given metadata matches the specified - * selection criteria. + * Checks if a document with the given metadata matches the specified selection criteria. * * @param metadata document metadata - * @return true if the document matches the selection criteria, - * false otherwise + * @return true if the document matches the selection criteria, false + * otherwise */ boolean select(Metadata metadata); diff --git a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedBytesSelector.java b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedBytesSelector.java index 2ec7df667e..9ccbccc174 100644 --- a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedBytesSelector.java +++ b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedBytesSelector.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.extractor; @@ -26,6 +24,7 @@ public boolean select(Metadata metadata) { return true; } } + EmbeddedBytesSelector ACCEPT_ALL = new AcceptAll(); boolean select(Metadata metadata); diff --git a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentByteStoreExtractorFactory.java b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentByteStoreExtractorFactory.java index f7237bd6ac..8e87302593 100644 --- a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentByteStoreExtractorFactory.java +++ b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentByteStoreExtractorFactory.java @@ -1,33 +1,31 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
+ * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.extractor; /** * This factory creates EmbeddedDocumentExtractors that require an - * {@link EmbeddedDocumentBytesHandler} in the - * {@link org.apache.tika.parser.ParseContext} should extend this. + * {@link EmbeddedDocumentBytesHandler} in the {@link org.apache.tika.parser.ParseContext} should + * extend this. * - * This is a shim interface to signal to {@link org.apache.tika.pipes.PipesServer} - * to use the {@link @RUnpackExtractor} if the user doesn't configure a custom - * EmbeddedDocumentExtractor. + * This is a shim interface to signal to {@link org.apache.tika.pipes.PipesServer} to use the + * {@link @RUnpackExtractor} if the user doesn't configure a custom EmbeddedDocumentExtractor. * * TODO: Figure out how to simplify this and allow for emitting of the source document. */ -public interface EmbeddedDocumentByteStoreExtractorFactory extends EmbeddedDocumentExtractorFactory { +public interface EmbeddedDocumentByteStoreExtractorFactory + extends EmbeddedDocumentExtractorFactory { } diff --git a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentBytesHandler.java b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentBytesHandler.java index 12357a7189..697ece05b9 100644 --- a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentBytesHandler.java +++ b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentBytesHandler.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.extractor; @@ -20,11 +18,10 @@ import java.io.IOException; import java.io.InputStream; import java.util.List; - import org.apache.tika.metadata.Metadata; public interface EmbeddedDocumentBytesHandler extends Closeable { - //we need metadata for the emitter store...can we get away without it? + // we need metadata for the emitter store...can we get away without it? 
void add(int id, Metadata metadata, InputStream inputStream) throws IOException; List getIds(); diff --git a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentExtractor.java b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentExtractor.java index 3f977e3dbe..0238c964db 100644 --- a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentExtractor.java +++ b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentExtractor.java @@ -1,36 +1,33 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. 
See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.extractor; import java.io.IOException; - -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; public interface EmbeddedDocumentExtractor { boolean shouldParseEmbedded(Metadata metadata); /** - * Processes the supplied embedded resource, calling the delegating - * parser with the appropriate details. + * Processes the supplied embedded resource, calling the delegating parser with the appropriate + * details. + * * @param stream The embedded resource * @param handler The handler to use * @param metadata The metadata for the embedded resource @@ -38,7 +35,6 @@ public interface EmbeddedDocumentExtractor { * @throws org.xml.sax.SAXException * @throws java.io.IOException */ - void parseEmbedded( - TikaInputStream stream, ContentHandler handler, Metadata metadata, boolean outputHtml) - throws SAXException, IOException; + void parseEmbedded(TikaInputStream stream, ContentHandler handler, Metadata metadata, + boolean outputHtml) throws SAXException, IOException; } diff --git a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentExtractorFactory.java b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentExtractorFactory.java index 4a55052aa3..919504daf8 100644 --- a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentExtractorFactory.java +++ b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentExtractorFactory.java @@ -1,24 +1,21 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.extractor; import java.io.Serializable; - import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; diff --git a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java index 4d73545c11..04f6b31000 100644 --- a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java +++ b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java @@ -1,28 +1,22 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
+ * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.extractor; import java.io.IOException; import java.io.Serializable; - -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.config.TikaConfig; import org.apache.tika.detect.Detector; import org.apache.tika.io.TikaInputStream; @@ -40,21 +34,23 @@ import org.apache.tika.parser.PasswordProvider; import org.apache.tika.parser.StatefulParser; import org.apache.tika.utils.ExceptionUtils; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; /** * Utility class to handle common issues with embedded documents. *

- * Use statically if all that is needed is getting the EmbeddedDocumentExtractor. - * Otherwise, instantiate an instance. + * Use statically if all that is needed is getting the EmbeddedDocumentExtractor. Otherwise, + * instantiate an instance. *

- * Note: This is not thread safe. Make sure to instantiate one per thread. + * Note: This is not thread safe. Make sure to instantiate one per thread. */ public class EmbeddedDocumentUtil implements Serializable { private final ParseContext context; private final EmbeddedDocumentExtractor embeddedDocumentExtractor; - //these are lazily initialized and can be null + // these are lazily initialized and can be null private TikaConfig tikaConfig; private MimeTypes mimeTypes; private Detector detector; @@ -65,12 +61,12 @@ public EmbeddedDocumentUtil(ParseContext context) { } /** - * This offers a uniform way to get an EmbeddedDocumentExtractor from a ParseContext. - * As of Tika 1.15, an AutoDetectParser will automatically be added to parse - * embedded documents if no Parser.class is specified in the ParseContext. + * This offers a uniform way to get an EmbeddedDocumentExtractor from a ParseContext. As of Tika + * 1.15, an AutoDetectParser will automatically be added to parse embedded documents if no + * Parser.class is specified in the ParseContext. *

- * If you'd prefer not to parse embedded documents, set Parser.class - * to {@link org.apache.tika.parser.EmptyParser} in the ParseContext. + * If you'd prefer not to parse embedded documents, set Parser.class to + * {@link org.apache.tika.parser.EmptyParser} in the ParseContext. * * @param context * @return EmbeddedDocumentExtractor @@ -80,8 +76,8 @@ public static EmbeddedDocumentExtractor getEmbeddedDocumentExtractor(ParseContex if (extractor != null) { return extractor; } - //ensure that an AutoDetectParser is - //available for parsing embedded docs TIKA-2096 + // ensure that an AutoDetectParser is + // available for parsing embedded docs TIKA-2096 Parser embeddedParser = context.get(Parser.class); if (embeddedParser == null) { TikaConfig tikaConfig = context.get(TikaConfig.class); @@ -97,9 +93,8 @@ public static EmbeddedDocumentExtractor getEmbeddedDocumentExtractor(ParseContex } /** - * Utility function to get the Parser that was sent in to the - * ParseContext to handle embedded documents. If it is stateful, - * unwrap it to get its stateless delegating parser. + * Utility function to get the Parser that was sent in to the ParseContext to handle embedded + * documents. If it is stateful, unwrap it to get its stateless delegating parser. *

* If there is no Parser in the parser context, this will return null. * @@ -122,7 +117,7 @@ public PasswordProvider getPasswordProvider() { } public Detector getDetector() { - //be as lazy as possible and cache + // be as lazy as possible and cache Detector localDetector = context.get(Detector.class); if (localDetector != null) { return localDetector; @@ -137,7 +132,7 @@ public Detector getDetector() { public MimeTypes getMimeTypes() { MimeTypes localMimeTypes = context.get(MimeTypes.class); - //be as lazy as possible and cache the mimeTypes + // be as lazy as possible and cache the mimeTypes if (localMimeTypes != null) { return localMimeTypes; } @@ -149,13 +144,13 @@ public MimeTypes getMimeTypes() { } /** - * @return Returns a {@link TikaConfig} -- trying to find it first in the ParseContext - * that was included during initialization, and then creating a new one from - * via {@link TikaConfig#getDefaultConfig()} if it can't find one in the - * ParseContext. This caches the default config so that it only has to be created once. + * @return Returns a {@link TikaConfig} -- trying to find it first in the ParseContext that was + * included during initialization, and then creating a new one from via + * {@link TikaConfig#getDefaultConfig()} if it can't find one in the ParseContext. This + * caches the default config so that it only has to be created once. 
*/ public TikaConfig getTikaConfig() { - //be as lazy as possible and cache the TikaConfig + // be as lazy as possible and cache the TikaConfig if (tikaConfig == null) { tikaConfig = context.get(TikaConfig.class); if (tikaConfig == null) { @@ -168,7 +163,7 @@ public TikaConfig getTikaConfig() { public String getExtension(TikaInputStream is, Metadata metadata) { String mimeString = metadata.get(Metadata.CONTENT_TYPE); - //use the buffered mimetypes as default + // use the buffered mimetypes as default MimeTypes localMimeTypes = getMimeTypes(); MimeType mimeType = null; @@ -177,7 +172,7 @@ public String getExtension(TikaInputStream is, Metadata metadata) { try { mimeType = localMimeTypes.forName(mimeString); } catch (MimeTypeException e) { - //swallow + // swallow } } if (mimeType == null) { @@ -187,12 +182,12 @@ public String getExtension(TikaInputStream is, Metadata metadata) { detected = true; is.reset(); } catch (IOException | MimeTypeException e) { - //swallow + // swallow } } if (mimeType != null) { if (detected) { - //set or correct the mime type + // set or correct the mime type metadata.set(Metadata.CONTENT_TYPE, mimeType.toString()); } return mimeType.getExtension(); @@ -219,20 +214,19 @@ private EmbeddedDocumentExtractor getEmbeddedDocumentExtractor() { } public void parseEmbedded(TikaInputStream tis, ContentHandler handler, Metadata metadata, - boolean outputHtml) throws IOException, SAXException { + boolean outputHtml) throws IOException, SAXException { embeddedDocumentExtractor.parseEmbedded(tis, handler, metadata, outputHtml); } /** - * Tries to find an existing parser within the ParseContext. - * It looks inside of CompositeParsers and ParserDecorators. - * The use case is when a parser needs to parse an internal stream - * that is _part_ of the document, e.g. rtf body inside an msg. + * Tries to find an existing parser within the ParseContext. It looks inside of CompositeParsers + * and ParserDecorators. 
The use case is when a parser needs to parse an internal stream that is + * _part_ of the document, e.g. rtf body inside an msg. *

- * Can return null if the context contains no parser or - * the correct parser can't be found. + * Can return null if the context contains no parser or the correct parser can't be + * found. * - * @param clazz parser class to search for + * @param clazz parser class to search for * @param context * @return */ diff --git a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedResourceHandler.java b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedResourceHandler.java index 23d00635d9..adb47d28d2 100644 --- a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedResourceHandler.java +++ b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedResourceHandler.java @@ -1,37 +1,32 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
+ * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.extractor; import java.io.InputStream; - import org.apache.tika.mime.MediaType; /** - * Tika container extractor callback interface. - * To work with a {@link ContainerExtractor}, your code needs - * to implement this interface. + * Tika container extractor callback interface. To work with a {@link ContainerExtractor}, your code + * needs to implement this interface. */ public interface EmbeddedResourceHandler { /** - * Called to process an embedded resource within the container. - * This will be called once per embedded resource within the - * container, along with whatever details are available on - * the embedded resource. - * + * Called to process an embedded resource within the container. This will be called once per + * embedded resource within the container, along with whatever details are available on the + * embedded resource. + * * @since Apache Tika 0.8 * @param filename The filename of the embedded resource, if known * @param mediaType The media type of the embedded resource, if known diff --git a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedStreamTranslator.java b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedStreamTranslator.java index 4a582506fa..3cc14600e9 100644 --- a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedStreamTranslator.java +++ b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedStreamTranslator.java @@ -1,31 +1,27 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.extractor; import java.io.IOException; import java.io.OutputStream; - import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; /** - * Interface for different filtering of embedded streams. - * Specifically, unravel OLE streams in tika-server unpack, - * and/or handle open containers in TikaInputStream + * Interface for different filtering of embedded streams. 
Specifically, unravel OLE streams in + * tika-server unpack, and/or handle open containers in TikaInputStream * * @since Apache Tika 2.0.0 */ @@ -33,6 +29,7 @@ public interface EmbeddedStreamTranslator { boolean shouldTranslate(TikaInputStream inputStream, Metadata metadata) throws IOException; - void translate(TikaInputStream inputStream, Metadata metadata, OutputStream os) throws IOException; + void translate(TikaInputStream inputStream, Metadata metadata, OutputStream os) + throws IOException; } diff --git a/tika-core/src/main/java/org/apache/tika/extractor/ParentContentHandler.java b/tika-core/src/main/java/org/apache/tika/extractor/ParentContentHandler.java index 83220f0d1b..6469821ff1 100644 --- a/tika-core/src/main/java/org/apache/tika/extractor/ParentContentHandler.java +++ b/tika-core/src/main/java/org/apache/tika/extractor/ParentContentHandler.java @@ -1,26 +1,24 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.extractor; import org.xml.sax.ContentHandler; /** - * Simple pointer class to allow parsers to pass on the parent contenthandler through - * to the embedded document's parse + * Simple pointer class to allow parsers to pass on the parent contenthandler through to the + * embedded document's parse */ public class ParentContentHandler { diff --git a/tika-core/src/main/java/org/apache/tika/extractor/ParserContainerExtractor.java b/tika-core/src/main/java/org/apache/tika/extractor/ParserContainerExtractor.java index caa4636eee..db8b0b16d2 100644 --- a/tika-core/src/main/java/org/apache/tika/extractor/ParserContainerExtractor.java +++ b/tika-core/src/main/java/org/apache/tika/extractor/ParserContainerExtractor.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.extractor; @@ -20,11 +18,6 @@ import java.io.IOException; import java.io.InputStream; import java.util.Set; - -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; -import org.xml.sax.helpers.DefaultHandler; - import org.apache.tika.config.TikaConfig; import org.apache.tika.detect.DefaultDetector; import org.apache.tika.detect.Detector; @@ -38,13 +31,15 @@ import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.parser.StatefulParser; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; /** - * An implementation of {@link ContainerExtractor} powered by the regular - * {@link Parser} API. 
This allows you to easily extract out all the - * embedded resources from within container files supported by normal Tika - * parsers. By default the {@link AutoDetectParser} will be used, to allow - * extraction from the widest range of containers. + * An implementation of {@link ContainerExtractor} powered by the regular {@link Parser} API. This + * allows you to easily extract out all the embedded resources from within container files supported + * by normal Tika parsers. By default the {@link AutoDetectParser} will be used, to allow extraction + * from the widest range of containers. */ public class ParserContainerExtractor implements ContainerExtractor { @@ -60,8 +55,7 @@ public ParserContainerExtractor() { } public ParserContainerExtractor(TikaConfig config) { - this(new AutoDetectParser(config), - new DefaultDetector(config.getMimeRepository())); + this(new AutoDetectParser(config), new DefaultDetector(config.getMimeRepository())); } public ParserContainerExtractor(Parser parser, Detector detector) { @@ -76,10 +70,8 @@ public boolean isSupported(TikaInputStream input) throws IOException { } @Override - public void extract( - TikaInputStream stream, ContainerExtractor recurseExtractor, - EmbeddedResourceHandler handler) - throws IOException, TikaException { + public void extract(TikaInputStream stream, ContainerExtractor recurseExtractor, + EmbeddedResourceHandler handler) throws IOException, TikaException { ParseContext context = new ParseContext(); context.set(Parser.class, new RecursiveParser(parser, recurseExtractor, handler)); try { @@ -95,9 +87,8 @@ private class RecursiveParser extends StatefulParser { private final EmbeddedResourceHandler handler; - private RecursiveParser(Parser statelessParser, - ContainerExtractor extractor, - EmbeddedResourceHandler handler) { + private RecursiveParser(Parser statelessParser, ContainerExtractor extractor, + EmbeddedResourceHandler handler) { super(statelessParser); this.extractor = extractor; this.handler = 
handler; @@ -109,10 +100,8 @@ public Set getSupportedTypes(ParseContext context) { } @Override - public void parse( - InputStream stream, ContentHandler ignored, - Metadata metadata, ParseContext context) - throws IOException, SAXException, TikaException { + public void parse(InputStream stream, ContentHandler ignored, Metadata metadata, + ParseContext context) throws IOException, SAXException, TikaException { TemporaryResources tmp = new TemporaryResources(); try { TikaInputStream tis = TikaInputStream.get(stream, tmp, metadata); @@ -122,7 +111,7 @@ public void parse( MediaType type = detector.detect(tis, metadata); if (extractor == null) { - // Let the handler process the embedded resource + // Let the handler process the embedded resource handler.handle(filename, type, tis); } else { // Use a temporary file to process the stream twice diff --git a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java b/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java index 21117b33b9..d356663cd5 100644 --- a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java +++ b/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. 
The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.extractor; @@ -21,11 +19,6 @@ import java.io.File; import java.io.FilenameFilter; import java.io.IOException; - -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; -import org.xml.sax.helpers.AttributesImpl; - import org.apache.tika.exception.CorruptedFileException; import org.apache.tika.exception.EncryptedDocumentException; import org.apache.tika.exception.TikaException; @@ -38,10 +31,13 @@ import org.apache.tika.parser.Parser; import org.apache.tika.sax.BodyContentHandler; import org.apache.tika.sax.EmbeddedContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.AttributesImpl; /** - * Helper class for parsers of package archives or other compound document - * formats that support embedded or attached component documents. + * Helper class for parsers of package archives or other compound document formats that support + * embedded or attached component documents. 
* * @since Apache Tika 0.8 */ @@ -78,9 +74,8 @@ public boolean shouldParseEmbedded(Metadata metadata) { } @Override - public void parseEmbedded( - TikaInputStream tis, ContentHandler handler, Metadata metadata, boolean outputHtml) - throws SAXException, IOException { + public void parseEmbedded(TikaInputStream tis, ContentHandler handler, Metadata metadata, + boolean outputHtml) throws SAXException, IOException { if (outputHtml) { AttributesImpl attributes = new AttributesImpl(); attributes.addAttribute("", "class", "class", "CDATA", "package-entry"); @@ -98,13 +93,14 @@ public void parseEmbedded( // Use the delegate parser to parse this entry try { tis.setCloseShield(); - DELEGATING_PARSER.parse(tis, new EmbeddedContentHandler(new BodyContentHandler(handler)), - metadata, context); + DELEGATING_PARSER.parse(tis, + new EmbeddedContentHandler(new BodyContentHandler(handler)), metadata, + context); } catch (EncryptedDocumentException ede) { recordException(ede, context); } catch (CorruptedFileException e) { - //necessary to stop the parse to avoid infinite loops - //on corrupt sqlite3 files + // necessary to stop the parse to avoid infinite loops + // on corrupt sqlite3 files throw new IOException(e); } catch (TikaException e) { recordException(e, context); diff --git a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractorFactory.java b/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractorFactory.java index 9136228c4a..c10a72f78d 100644 --- a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractorFactory.java +++ b/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractorFactory.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.extractor; @@ -20,8 +18,7 @@ import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; -public class ParsingEmbeddedDocumentExtractorFactory - implements EmbeddedDocumentExtractorFactory { +public class ParsingEmbeddedDocumentExtractorFactory implements EmbeddedDocumentExtractorFactory { private boolean writeFileNameToContent = true; @@ -32,8 +29,7 @@ public void setWriteFileNameToContent(boolean writeFileNameToContent) { @Override public EmbeddedDocumentExtractor newInstance(Metadata metadata, ParseContext parseContext) { - ParsingEmbeddedDocumentExtractor ex = - new ParsingEmbeddedDocumentExtractor(parseContext); + ParsingEmbeddedDocumentExtractor ex = new ParsingEmbeddedDocumentExtractor(parseContext); ex.setWriteFileNameToContent(writeFileNameToContent); return ex; } diff --git a/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractor.java b/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractor.java index 234c3155f1..83aaa662e5 100644 --- a/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractor.java +++ b/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractor.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. 
The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.extractor; @@ -24,13 +22,6 @@ import java.io.OutputStream; import java.nio.file.Files; import java.nio.file.Path; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; -import org.xml.sax.helpers.AttributesImpl; - import org.apache.tika.exception.CorruptedFileException; import org.apache.tika.exception.EncryptedDocumentException; import org.apache.tika.exception.TikaException; @@ -41,6 +32,11 @@ import org.apache.tika.parser.ParseContext; import org.apache.tika.sax.BodyContentHandler; import org.apache.tika.sax.EmbeddedContentHandler; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.AttributesImpl; /** * Recursive Unpacker and text and metadata extractor. 
@@ -50,13 +46,14 @@ public class RUnpackExtractor extends ParsingEmbeddedDocumentExtractor { private static final Logger LOGGER = - LoggerFactory.getLogger(ParsingEmbeddedDocumentExtractor.class); + LoggerFactory.getLogger(ParsingEmbeddedDocumentExtractor.class); private static final File ABSTRACT_PATH = new File(""); private EmbeddedBytesSelector embeddedBytesSelector = EmbeddedBytesSelector.ACCEPT_ALL; - private final EmbeddedStreamTranslator embeddedStreamTranslator = new DefaultEmbeddedStreamTranslator(); + private final EmbeddedStreamTranslator embeddedStreamTranslator = + new DefaultEmbeddedStreamTranslator(); private long bytesExtracted = 0; private final long maxEmbeddedBytesForExtraction; @@ -67,9 +64,8 @@ public RUnpackExtractor(ParseContext context, long maxEmbeddedBytesForExtraction @Override - public void parseEmbedded( - TikaInputStream tis, ContentHandler handler, Metadata metadata, boolean outputHtml) - throws SAXException, IOException { + public void parseEmbedded(TikaInputStream tis, ContentHandler handler, Metadata metadata, + boolean outputHtml) throws SAXException, IOException { if (outputHtml) { AttributesImpl attributes = new AttributesImpl(); attributes.addAttribute("", "class", "class", "CDATA", "package-entry"); @@ -86,7 +82,8 @@ public void parseEmbedded( // Use the delegate parser to parse this entry try { - EmbeddedDocumentBytesHandler bytesHandler = context.get(EmbeddedDocumentBytesHandler.class); + EmbeddedDocumentBytesHandler bytesHandler = + context.get(EmbeddedDocumentBytesHandler.class); tis.setCloseShield(); if (bytesHandler != null) { parseWithBytes(tis, handler, metadata); @@ -96,8 +93,8 @@ public void parseEmbedded( } catch (EncryptedDocumentException ede) { recordException(ede, context); } catch (CorruptedFileException e) { - //necessary to stop the parse to avoid infinite loops - //on corrupt sqlite3 files + // necessary to stop the parse to avoid infinite loops + // on corrupt sqlite3 files throw new IOException(e); } catch 
(TikaException e) { recordException(e, context); @@ -110,15 +107,16 @@ public void parseEmbedded( } } - private void parseWithBytes(TikaInputStream tis, ContentHandler handler, Metadata metadata) throws TikaException, IOException, SAXException { + private void parseWithBytes(TikaInputStream tis, ContentHandler handler, Metadata metadata) + throws TikaException, IOException, SAXException { - //trigger spool to disk + // trigger spool to disk Path rawBytes = tis.getPath(); - //There may be a "translated" path for OLE2 etc + // There may be a "translated" path for OLE2 etc Path translated = null; try { - //translate the stream or not + // translate the stream or not if (embeddedStreamTranslator.shouldTranslate(tis, metadata)) { translated = Files.createTempFile("tika-tmp-", ".bin"); try (OutputStream os = Files.newOutputStream(translated)) { @@ -142,10 +140,10 @@ private void parseWithBytes(TikaInputStream tis, ContentHandler handler, Metadat } private void parse(InputStream stream, ContentHandler handler, Metadata metadata) - throws TikaException, IOException, SAXException { + throws TikaException, IOException, SAXException { getDelegatingParser().parse(stream, - new EmbeddedContentHandler(new BodyContentHandler(handler)), - metadata, context); + new EmbeddedContentHandler(new BodyContentHandler(handler)), metadata, + context); } private void storeEmbeddedBytes(Path p, Metadata metadata) { @@ -153,21 +151,21 @@ private void storeEmbeddedBytes(Path p, Metadata metadata) { return; } - if (! 
embeddedBytesSelector.select(metadata)) { + if (!embeddedBytesSelector.select(metadata)) { if (LOGGER.isDebugEnabled()) { LOGGER.debug("skipping embedded bytes {} <-> {}", - metadata.get(Metadata.CONTENT_TYPE), - metadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE)); + metadata.get(Metadata.CONTENT_TYPE), + metadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE)); } return; } EmbeddedDocumentBytesHandler embeddedDocumentBytesHandler = - context.get(EmbeddedDocumentBytesHandler.class); + context.get(EmbeddedDocumentBytesHandler.class); int id = metadata.getInt(TikaCoreProperties.EMBEDDED_ID); try (InputStream is = Files.newInputStream(p)) { if (bytesExtracted >= maxEmbeddedBytesForExtraction) { - throw new IOException("Bytes extracted (" + bytesExtracted + - ") >= max allowed (" + maxEmbeddedBytesForExtraction + ")"); + throw new IOException("Bytes extracted (" + bytesExtracted + ") >= max allowed (" + + maxEmbeddedBytesForExtraction + ")"); } long maxToRead = maxEmbeddedBytesForExtraction - bytesExtracted; @@ -175,19 +173,19 @@ private void storeEmbeddedBytes(Path p, Metadata metadata) { embeddedDocumentBytesHandler.add(id, metadata, boundedIs); bytesExtracted += boundedIs.getPos(); if (boundedIs.hasHitBound()) { - throw new IOException("Bytes extracted (" + bytesExtracted + - ") >= max allowed (" + maxEmbeddedBytesForExtraction + "). Truncated " + - "bytes"); + throw new IOException("Bytes extracted (" + bytesExtracted + + ") >= max allowed (" + maxEmbeddedBytesForExtraction + + "). Truncated " + "bytes"); } } } catch (IOException e) { LOGGER.warn("problem writing out embedded bytes", e); - //info in metadata doesn't actually make it back to the metadata list - //because we're filtering and cloning the metadata at the end of the parse - //which happens before we try to copy out the files. 
- //TODO fix this - //metadata.set(TikaCoreProperties.EMBEDDED_BYTES_EXCEPTION, - // ExceptionUtils.getStackTrace(e)); + // info in metadata doesn't actually make it back to the metadata list + // because we're filtering and cloning the metadata at the end of the parse + // which happens before we try to copy out the files. + // TODO fix this + // metadata.set(TikaCoreProperties.EMBEDDED_BYTES_EXCEPTION, + // ExceptionUtils.getStackTrace(e)); } } diff --git a/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractorFactory.java b/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractorFactory.java index a715ed25f4..696ce9cef4 100644 --- a/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractorFactory.java +++ b/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractorFactory.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.extractor; @@ -20,7 +18,6 @@ import java.util.HashSet; import java.util.List; import java.util.Set; - import org.apache.tika.config.Field; import org.apache.tika.exception.TikaConfigException; import org.apache.tika.metadata.Metadata; @@ -37,6 +34,7 @@ public class RUnpackExtractorFactory implements EmbeddedDocumentByteStoreExtract private Set embeddedBytesExcludeEmbeddedResourceTypes = Collections.EMPTY_SET; private long maxEmbeddedBytesForExtraction = DEFAULT_MAX_EMBEDDED_BYTES_FOR_EXTRACTION; + @Field public void setWriteFileNameToContent(boolean writeFileNameToContent) { this.writeFileNameToContent = writeFileNameToContent; @@ -70,16 +68,17 @@ public void setEmbeddedBytesExcludeEmbeddedResourceTypes(List excludeAtt } /** - * Total number of bytes to write out. A good zip bomb may contain petabytes - * compressed into a few kb. Make sure that you can't fill up a disk! + * Total number of bytes to write out. A good zip bomb may contain petabytes compressed into a + * few kb. Make sure that you can't fill up a disk! * - * This does not include the container file in the count of bytes written out. - * This only counts the lengths of the embedded files. 
+ * This does not include the container file in the count of bytes written out. This only counts + * the lengths of the embedded files. * * @param maxEmbeddedBytesForExtraction */ @Field - public void setMaxEmbeddedBytesForExtraction(long maxEmbeddedBytesForExtraction) throws TikaConfigException { + public void setMaxEmbeddedBytesForExtraction(long maxEmbeddedBytesForExtraction) + throws TikaConfigException { if (maxEmbeddedBytesForExtraction < 0) { throw new TikaConfigException("maxEmbeddedBytesForExtraction must be >= 0"); } @@ -88,9 +87,7 @@ public void setMaxEmbeddedBytesForExtraction(long maxEmbeddedBytesForExtraction) @Override public EmbeddedDocumentExtractor newInstance(Metadata metadata, ParseContext parseContext) { - RUnpackExtractor ex = - new RUnpackExtractor(parseContext, - maxEmbeddedBytesForExtraction); + RUnpackExtractor ex = new RUnpackExtractor(parseContext, maxEmbeddedBytesForExtraction); ex.setWriteFileNameToContent(writeFileNameToContent); ex.setEmbeddedBytesSelector(createEmbeddedBytesSelector()); return ex; @@ -98,14 +95,13 @@ public EmbeddedDocumentExtractor newInstance(Metadata metadata, ParseContext par private EmbeddedBytesSelector createEmbeddedBytesSelector() { - if (embeddedBytesIncludeMimeTypes.size() == 0 && - embeddedBytesExcludeMimeTypes.size() == 0 && - embeddedBytesIncludeEmbeddedResourceTypes.size() == 0 && - embeddedBytesExcludeEmbeddedResourceTypes.size() == 0) { + if (embeddedBytesIncludeMimeTypes.size() == 0 && embeddedBytesExcludeMimeTypes.size() == 0 + && embeddedBytesIncludeEmbeddedResourceTypes.size() == 0 + && embeddedBytesExcludeEmbeddedResourceTypes.size() == 0) { return EmbeddedBytesSelector.ACCEPT_ALL; } return new BasicEmbeddedBytesSelector(embeddedBytesIncludeMimeTypes, - embeddedBytesExcludeMimeTypes, embeddedBytesIncludeEmbeddedResourceTypes, - embeddedBytesExcludeEmbeddedResourceTypes); + embeddedBytesExcludeMimeTypes, embeddedBytesIncludeEmbeddedResourceTypes, + embeddedBytesExcludeEmbeddedResourceTypes); } } 
diff --git a/tika-core/src/main/java/org/apache/tika/extractor/package-info.java b/tika-core/src/main/java/org/apache/tika/extractor/package-info.java index 3d3e92b525..f22c3d7f13 100644 --- a/tika-core/src/main/java/org/apache/tika/extractor/package-info.java +++ b/tika-core/src/main/java/org/apache/tika/extractor/package-info.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ /** diff --git a/tika-core/src/main/java/org/apache/tika/fork/ClassLoaderProxy.java b/tika-core/src/main/java/org/apache/tika/fork/ClassLoaderProxy.java index 51b1beeff1..18ffb0559f 100644 --- a/tika-core/src/main/java/org/apache/tika/fork/ClassLoaderProxy.java +++ b/tika-core/src/main/java/org/apache/tika/fork/ClassLoaderProxy.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.fork; @@ -36,8 +34,8 @@ class ClassLoaderProxy extends ClassLoader implements ForkProxy { private static final long serialVersionUID = -7303109260448540420L; /** - * Names of resources that could not be found. Used to avoid repeated - * lookup of commonly accessed, but often not present, resources like + * Names of resources that could not be found. Used to avoid repeated lookup of commonly + * accessed, but often not present, resources like * META-INF/services/javax.xml.parsers.SAXParserFactory. */ private final Set notFound = new HashSet<>(); diff --git a/tika-core/src/main/java/org/apache/tika/fork/ClassLoaderResource.java b/tika-core/src/main/java/org/apache/tika/fork/ClassLoaderResource.java index 7af85ada51..5e71e592fc 100644 --- a/tika-core/src/main/java/org/apache/tika/fork/ClassLoaderResource.java +++ b/tika-core/src/main/java/org/apache/tika/fork/ClassLoaderResource.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.fork; @@ -32,11 +30,10 @@ public ClassLoaderResource(ClassLoader loader) { } /** - * Processes a request for one (code 1) or many (code 2) class loader - * resources. The requested resources are sent preceded with a boolean - * true value. If the resource was not found (code 1) or - * when the last resource has been sent (code 2), a boolean - * false value is sent instead. + * Processes a request for one (code 1) or many (code 2) class loader resources. The requested + * resources are sent preceded with a boolean true value. If the resource was not + * found (code 1) or when the last resource has been sent (code 2), a boolean false + * value is sent instead. * * @param name resource name * @throws IOException if the resource could not be sent @@ -66,20 +63,18 @@ public Throwable process(DataInputStream input, DataOutputStream output) throws } /** - * Sends the contents of the given input stream to the given output. - * The stream is sent in chunks of less than 64kB, each preceded by - * a 16-bit integer value that indicates the length of the following - * chunk. A zero short value is sent at the end to signify the end of - * the stream. 
+ * Sends the contents of the given input stream to the given output. The stream is sent in + * chunks of less than 64kB, each preceded by a 16-bit integer value that indicates the length + * of the following chunk. A zero short value is sent at the end to signify the end of the + * stream. *

- * The stream is guaranteed to be closed by this method, regardless of - * the way it returns. + * The stream is guaranteed to be closed by this method, regardless of the way it returns. * * @param stream the stream to be sent * @throws IOException if the stream could not be sent */ private void writeAndCloseStream(DataOutputStream output, InputStream stream) - throws IOException { + throws IOException { try { byte[] buffer = new byte[0x10000 - 1]; int n; diff --git a/tika-core/src/main/java/org/apache/tika/fork/ContentHandlerProxy.java b/tika-core/src/main/java/org/apache/tika/fork/ContentHandlerProxy.java index 371dd05c57..c9802ac903 100644 --- a/tika-core/src/main/java/org/apache/tika/fork/ContentHandlerProxy.java +++ b/tika-core/src/main/java/org/apache/tika/fork/ContentHandlerProxy.java @@ -1,25 +1,22 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.fork; import java.io.DataInputStream; import java.io.DataOutputStream; import java.io.IOException; - import org.xml.sax.Attributes; import org.xml.sax.ContentHandler; import org.xml.sax.Locator; @@ -79,8 +76,8 @@ private void sendString(String string) throws SAXException { } /** - * Breaks the string in 21,845 size chunks to not - * throw UTFDataFormatException at least in Oracle JDK 8. + * Breaks the string in 21,845 size chunks to not throw UTFDataFormatException at least in + * Oracle JDK 8. */ private void writeString(String string) throws IOException { int max = 65535 / 3; @@ -139,7 +136,7 @@ public void endPrefixMapping(String prefix) throws SAXException { } public void startElement(String uri, String localName, String qName, Attributes atts) - throws SAXException { + throws SAXException { sendRequest(START_ELEMENT); sendString(uri); sendString(localName); diff --git a/tika-core/src/main/java/org/apache/tika/fork/ContentHandlerResource.java b/tika-core/src/main/java/org/apache/tika/fork/ContentHandlerResource.java index f8971b9a67..922e662bc8 100644 --- a/tika-core/src/main/java/org/apache/tika/fork/ContentHandlerResource.java +++ b/tika-core/src/main/java/org/apache/tika/fork/ContentHandlerResource.java @@ -1,25 +1,22 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.fork; import java.io.DataInputStream; import java.io.DataOutputStream; import java.io.IOException; - import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; import org.xml.sax.helpers.AttributesImpl; @@ -61,7 +58,7 @@ private void internalProcess(DataInputStream input) throws IOException, SAXExcep atts = new AttributesImpl(); for (int i = 0; i < n; i++) { atts.addAttribute(readString(input), readString(input), readString(input), - readString(input), readString(input)); + readString(input), readString(input)); } } handler.startElement(uri, localName, qName, atts); diff --git a/tika-core/src/main/java/org/apache/tika/fork/ForkClient.java b/tika-core/src/main/java/org/apache/tika/fork/ForkClient.java index f1a47206d9..e13b190a89 100644 --- a/tika-core/src/main/java/org/apache/tika/fork/ForkClient.java +++ b/tika-core/src/main/java/org/apache/tika/fork/ForkClient.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.fork; @@ -33,14 +31,12 @@ import java.util.jar.JarEntry; import java.util.jar.JarOutputStream; import java.util.zip.ZipEntry; - import org.apache.commons.io.IOUtils; -import org.xml.sax.ContentHandler; - import org.apache.tika.exception.TikaException; import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler; import org.apache.tika.sax.RecursiveParserWrapperHandler; import org.apache.tika.utils.ProcessUtils; +import org.xml.sax.ContentHandler; class ForkClient { private static final AtomicInteger CLIENT_COUNTER = new AtomicInteger(0); @@ -57,30 +53,28 @@ class ForkClient { private final DataInputStream input; - //this is used for debugging/smoke testing + // this is used for debugging/smoke testing private final int id = CLIENT_COUNTER.incrementAndGet(); private volatile int filesProcessed = 0; public ForkClient(Path tikaDir, ParserFactoryFactory parserFactoryFactory, List java, - TimeoutLimits timeoutLimits) throws IOException, TikaException { + TimeoutLimits timeoutLimits) throws IOException, TikaException { this(tikaDir, parserFactoryFactory, null, java, timeoutLimits); } /** - * @param tikaDir directory containing jars from which to start - * the child 
server and load the Parser - * @param parserFactoryFactory factory to send to forked process to build parser - * upon arrival - * @param classLoader class loader to use for non-parser resource - * (content-handler, etc.) - * @param java java commandline to use for the commandline server + * @param tikaDir directory containing jars from which to start the child server and load the + * Parser + * @param parserFactoryFactory factory to send to forked process to build parser upon arrival + * @param classLoader class loader to use for non-parser resource (content-handler, etc.) + * @param java java commandline to use for the commandline server * @throws IOException * @throws TikaException */ public ForkClient(Path tikaDir, ParserFactoryFactory parserFactoryFactory, - ClassLoader classLoader, List java, TimeoutLimits timeoutLimits) - throws IOException, TikaException { + ClassLoader classLoader, List java, TimeoutLimits timeoutLimits) + throws IOException, TikaException { jar = null; loader = null; boolean ok = false; @@ -132,7 +126,7 @@ public ForkClient(Path tikaDir, ParserFactoryFactory parserFactoryFactory, public ForkClient(ClassLoader loader, Object object, List java, - TimeoutLimits timeoutLimits) throws IOException, TikaException { + TimeoutLimits timeoutLimits) throws IOException, TikaException { boolean ok = false; try { this.loader = loader; @@ -168,8 +162,8 @@ public ForkClient(ClassLoader loader, Object object, List java, } /** - * Creates a temporary jar file that can be used to bootstrap the forked - * server process. Remember to remove the file when no longer used. + * Creates a temporary jar file that can be used to bootstrap the forked server process. + * Remember to remove the file when no longer used. 
* * @return the created jar file * @throws IOException if the bootstrap archive could not be created @@ -189,9 +183,9 @@ private static File createBootstrapJar() throws IOException { } /** - * Fills in the jar file used to bootstrap the forked server process. - * All the required .class files and a manifest with a - * Main-Class entry are written into the archive. + * Fills in the jar file used to bootstrap the forked server process. All the required + * .class files and a manifest with a Main-Class entry are written + * into the archive. * * @param file file to hold the bootstrap archive * @throws IOException if the bootstrap archive could not be created @@ -203,9 +197,9 @@ private static void fillBootstrapJar(File file) throws IOException { jar.write(manifest.getBytes(UTF_8)); Class[] bootstrap = {ForkServer.class, ForkObjectInputStream.class, ForkProxy.class, - ClassLoaderProxy.class, MemoryURLConnection.class, MemoryURLStreamHandler.class, - MemoryURLStreamHandlerFactory.class, MemoryURLStreamRecord.class, - TikaException.class}; + ClassLoaderProxy.class, MemoryURLConnection.class, + MemoryURLStreamHandler.class, MemoryURLStreamHandlerFactory.class, + MemoryURLStreamRecord.class, TikaException.class}; ClassLoader loader = ForkServer.class.getClassLoader(); for (Class klass : bootstrap) { String path = klass.getName().replace('.', '/') + ".class"; @@ -227,10 +221,10 @@ private void waitForStartBeacon() throws IOException { } else if (type == -1) { throw new IOException("EOF while waiting for start beacon"); } else { - //can't do this because of + // can't do this because of // ForkParserIntegrationTest // #testAttachingADebuggerOnTheForkedParserShouldWork -// throw new IOException("Unexpected byte while waiting for start beacon: "+type); + // throw new IOException("Unexpected byte while waiting for start beacon: "+type); } } } @@ -249,7 +243,7 @@ public synchronized boolean ping() { } public synchronized Throwable call(String method, Object... 
args) - throws IOException, TikaException { + throws IOException, TikaException { filesProcessed++; List r = new ArrayList<>(resources); output.writeByte(ForkServer.CALL); @@ -265,26 +259,26 @@ public int getFilesProcessed() { } /** - * Serializes the object first into an in-memory buffer and then - * writes it to the output stream with a preceding size integer. + * Serializes the object first into an in-memory buffer and then writes it to the output stream + * with a preceding size integer. * - * @param object object to be serialized + * @param object object to be serialized * @param resources list of fork resources, used when adding proxies * @throws IOException if the object could not be serialized */ private void sendObject(Object object, List resources) - throws IOException, TikaException { + throws IOException, TikaException { int n = resources.size(); if (object instanceof InputStream) { resources.add(new InputStreamResource((InputStream) object)); object = new InputStreamProxy(n); } else if (object instanceof RecursiveParserWrapperHandler) { resources.add(new RecursiveMetadataContentHandlerResource( - (RecursiveParserWrapperHandler) object)); + (RecursiveParserWrapperHandler) object)); object = new RecursiveMetadataContentHandlerProxy(n, - ((RecursiveParserWrapperHandler) object).getContentHandlerFactory()); - } else if (object instanceof ContentHandler && - !(object instanceof AbstractRecursiveParserWrapperHandler)) { + ((RecursiveParserWrapperHandler) object).getContentHandlerFactory()); + } else if (object instanceof ContentHandler + && !(object instanceof AbstractRecursiveParserWrapperHandler)) { resources.add(new ContentHandlerResource((ContentHandler) object)); object = new ContentHandlerProxy(n); } else if (object instanceof ClassLoader) { @@ -296,8 +290,8 @@ private void sendObject(Object object, List resources) ForkObjectInputStream.sendObject(object, output); } catch (NotSerializableException nse) { // Build a more friendly error message for this 
- throw new TikaException("Unable to serialize " + object.getClass().getSimpleName() + - " to pass to the Forked Parser", nse); + throw new TikaException("Unable to serialize " + object.getClass().getSimpleName() + + " to pass to the Forked Parser", nse); } waitForResponse(resources); @@ -316,10 +310,10 @@ public synchronized void close() { if (process != null) { process.destroyForcibly(); try { - //TIKA-1933 + // TIKA-1933 process.waitFor(); } catch (InterruptedException e) { - //swallow + // swallow } } if (jar != null) { diff --git a/tika-core/src/main/java/org/apache/tika/fork/ForkObjectInputStream.java b/tika-core/src/main/java/org/apache/tika/fork/ForkObjectInputStream.java index 61e2dae48f..4c850ee1c4 100644 --- a/tika-core/src/main/java/org/apache/tika/fork/ForkObjectInputStream.java +++ b/tika-core/src/main/java/org/apache/tika/fork/ForkObjectInputStream.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.fork; @@ -27,15 +25,13 @@ import java.io.ObjectStreamClass; /** - * An object input stream that uses a given class loader when deserializing - * objects. + * An object input stream that uses a given class loader when deserializing objects. *

- * Note that this functionality could easily be implemented as a simple - * anonymous {@link ObjectInputStream} subclass, but since the - * functionality is needed during the somewhat complicated bootstrapping - * of the stdin/out communication channel of a forked server process, - * it's better if class has a stable name that can be referenced at - * compile-time by the {@link ForkClient} class. + * Note that this functionality could easily be implemented as a simple anonymous + * {@link ObjectInputStream} subclass, but since the functionality is needed during the somewhat + * complicated bootstrapping of the stdin/out communication channel of a forked server process, it's + * better if class has a stable name that can be referenced at compile-time by the + * {@link ForkClient} class. */ class ForkObjectInputStream extends ObjectInputStream { @@ -45,10 +41,10 @@ class ForkObjectInputStream extends ObjectInputStream { private final ClassLoader loader; /** - * Creates a new object input stream that uses the given class loader - * when deserializing objects. + * Creates a new object input stream that uses the given class loader when deserializing + * objects. * - * @param input underlying input stream + * @param input underlying input stream * @param loader class loader used when deserializing objects * @throws IOException if this stream could not be initiated */ @@ -58,8 +54,8 @@ public ForkObjectInputStream(InputStream input, ClassLoader loader) throws IOExc } /** - * Serializes the object first into an in-memory buffer and then - * writes it to the output stream with a preceding size integer. + * Serializes the object first into an in-memory buffer and then writes it to the output stream + * with a preceding size integer. * * @param object object to be serialized * @param output output stream @@ -77,23 +73,23 @@ public static void sendObject(Object object, DataOutputStream output) throws IOE } /** - * Deserializes an object from the given stream. 
The serialized object - * is expected to be preceded by a size integer, that is used for reading - * the entire serialization into a memory before deserializing it. + * Deserializes an object from the given stream. The serialized object is expected to be + * preceded by a size integer, that is used for reading the entire serialization into a memory + * before deserializing it. * - * @param input input stream from which the serialized object is read + * @param input input stream from which the serialized object is read * @param loader class loader to be used for loading referenced classes - * @throws IOException if the object could not be deserialized + * @throws IOException if the object could not be deserialized * @throws ClassNotFoundException if a referenced class is not found */ public static Object readObject(DataInputStream input, ClassLoader loader) - throws IOException, ClassNotFoundException { + throws IOException, ClassNotFoundException { int n = input.readInt(); byte[] data = new byte[n]; input.readFully(data); ObjectInputStream deserializer = - new ForkObjectInputStream(new ByteArrayInputStream(data), loader); + new ForkObjectInputStream(new ByteArrayInputStream(data), loader); return deserializer.readObject(); } diff --git a/tika-core/src/main/java/org/apache/tika/fork/ForkParser.java b/tika-core/src/main/java/org/apache/tika/fork/ForkParser.java index 84d1156e2a..ce2af32c07 100644 --- a/tika-core/src/main/java/org/apache/tika/fork/ForkParser.java +++ b/tika-core/src/main/java/org/apache/tika/fork/ForkParser.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.fork; @@ -27,10 +25,6 @@ import java.util.List; import java.util.Queue; import java.util.Set; - -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.config.Field; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; @@ -41,6 +35,8 @@ import org.apache.tika.parser.Parser; import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler; import org.apache.tika.sax.TeeContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; public class ForkParser implements Parser, Closeable { @@ -49,12 +45,12 @@ public class ForkParser implements Parser, Closeable { */ private static final long serialVersionUID = -4962742892274663950L; - //these are used by the legacy usage + // these are used by the legacy usage private final ClassLoader loader; private final Parser parser; - //these are used when the server builds a parser via a directory - //of jars, not via legacy bootstrap etc. + // these are used when the server builds a parser via a directory + // of jars, not via legacy bootstrap etc. private final Path tikaBin; private final ParserFactoryFactory parserFactoryFactory; private final Queue pool = new LinkedList<>(); @@ -81,14 +77,12 @@ public class ForkParser implements Parser, Closeable { private int maxFilesProcessedPerClient = -1; /** - * If you have a directory with, say, tike-app.jar and you want the - * forked process/server to build a parser - * and run it from that -- so that you can keep all of those dependencies out of + * If you have a directory with, say, tike-app.jar and you want the forked process/server to + * build a parser and run it from that -- so that you can keep all of those dependencies out of * your client code, use this initializer. 
* - * @param tikaBin directory containing the tika-app.jar or similar -- - * full jar including tika-core and all - * desired parsers and dependencies + * @param tikaBin directory containing the tika-app.jar or similar -- full jar including + * tika-core and all desired parsers and dependencies * @param factoryFactory */ public ForkParser(Path tikaBin, ParserFactoryFactory factoryFactory) { @@ -101,16 +95,14 @@ public ForkParser(Path tikaBin, ParserFactoryFactory factoryFactory) { /** * EXPERT * - * @param tikaBin directory containing the tika-app.jar or similar - * -- full jar including tika-core and all - * desired parsers and dependencies - * @param parserFactoryFactory -- the factory to use to generate the parser factory - * in the forked process/server - * @param classLoader to use for all classes besides the parser in the - * forked process/server + * @param tikaBin directory containing the tika-app.jar or similar -- full jar including + * tika-core and all desired parsers and dependencies + * @param parserFactoryFactory -- the factory to use to generate the parser factory in the + * forked process/server + * @param classLoader to use for all classes besides the parser in the forked process/server */ public ForkParser(Path tikaBin, ParserFactoryFactory parserFactoryFactory, - ClassLoader classLoader) { + ClassLoader classLoader) { parser = null; loader = classLoader; this.tikaBin = tikaBin; @@ -124,8 +116,8 @@ public ForkParser(Path tikaBin, ParserFactoryFactory parserFactoryFactory, public ForkParser(ClassLoader loader, Parser parser) { if (parser instanceof ForkParser) { throw new IllegalArgumentException( - "The underlying parser of a ForkParser should not be a ForkParser, " + - "but a specific implementation."); + "The underlying parser of a ForkParser should not be a ForkParser, " + + "but a specific implementation."); } this.tikaBin = null; this.parserFactoryFactory = null; @@ -160,11 +152,9 @@ public synchronized void setPoolSize(int poolSize) { } /** 
- * Sets the command used to start the forked server process. - * The arguments "-jar" and "/path/to/bootstrap.jar" - * or "-cp" and "/path/to/tika_bin" are - * appended to the given command when starting the process. - * The default setting is {"java", "-Xmx32m"}. + * Sets the command used to start the forked server process. The arguments "-jar" and + * "/path/to/bootstrap.jar" or "-cp" and "/path/to/tika_bin" are appended to the given command + * when starting the process. The default setting is {"java", "-Xmx32m"}. *

* Creates a defensive copy. * @@ -190,51 +180,44 @@ public Set getSupportedTypes(ParseContext context) { } /** - * This sends the objects to the server for parsing, and the server via - * the proxies acts on the handler as if it were updating it directly. + * This sends the objects to the server for parsing, and the server via the proxies acts on the + * handler as if it were updating it directly. *

* If using a {@link org.apache.tika.parser.RecursiveParserWrapper}, there are two options: *

*

- *

    - *
  1. Send in a class that extends - * {@link org.apache.tika.sax.RecursiveParserWrapperHandler}, - * and the server will proxy back the data as best it can[0].
  2. - *
  3. Send in a class that extends {@link AbstractRecursiveParserWrapperHandler} - * and the server will act on the class but not proxy back the data. This - * can be used, for example, if all you want to do is write to disc, extend - * {@link AbstractRecursiveParserWrapperHandler} to write to disc when - * {@link AbstractRecursiveParserWrapperHandler#endDocument(ContentHandler, - * Metadata)} - * is called, and the server will take care of the writing via the handler.
  4. - *
+ *
    + *
  1. Send in a class that extends {@link org.apache.tika.sax.RecursiveParserWrapperHandler}, + * and the server will proxy back the data as best it can[0].
  2. + *
  3. Send in a class that extends {@link AbstractRecursiveParserWrapperHandler} and the server + * will act on the class but not proxy back the data. This can be used, for example, if all you + * want to do is write to disc, extend {@link AbstractRecursiveParserWrapperHandler} to write to + * disc when {@link AbstractRecursiveParserWrapperHandler#endDocument(ContentHandler, Metadata)} + * is called, and the server will take care of the writing via the handler.
  4. + *
*

*

- * NOTE:[0] "the server will proxy back the data as best it can". - * If the handler implements Serializable and is actually serializable, the - * server will send it and the - * {@link Metadata} back upon - * {@link org.apache.tika.sax.RecursiveParserWrapperHandler# - * endEmbeddedDocument(ContentHandler, Metadata)} - * or {@link org.apache.tika.sax.RecursiveParserWrapperHandler# - * endEmbeddedDocument(ContentHandler, Metadata)}. - * If the handler does not implement {@link java.io.Serializable} or if there is a - * {@link java.io.NotSerializableException} thrown during serialization, the server will - * call {@link ContentHandler#toString()} on the ContentHandler and set that value with the - * {@link TikaCoreProperties#TIKA_CONTENT} key and then - * serialize and proxy that data back. + * NOTE:[0] "the server will proxy back the data as best it can". If the + * handler implements Serializable and is actually serializable, the server will send it and the + * {@link Metadata} back upon {@link org.apache.tika.sax.RecursiveParserWrapperHandler# + * endEmbeddedDocument(ContentHandler, Metadata)} or + * {@link org.apache.tika.sax.RecursiveParserWrapperHandler# endEmbeddedDocument(ContentHandler, + * Metadata)}. If the handler does not implement {@link java.io.Serializable} or if there is a + * {@link java.io.NotSerializableException} thrown during serialization, the server will call + * {@link ContentHandler#toString()} on the ContentHandler and set that value with the + * {@link TikaCoreProperties#TIKA_CONTENT} key and then serialize and proxy that data back. *

* - * @param stream the document stream (input) - * @param handler handler for the XHTML SAX events (output) + * @param stream the document stream (input) + * @param handler handler for the XHTML SAX events (output) * @param metadata document metadata (input and output) - * @param context parse context + * @param context parse context * @throws IOException * @throws SAXException * @throws TikaException */ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, - ParseContext context) throws IOException, SAXException, TikaException { + ParseContext context) throws IOException, SAXException, TikaException { if (stream == null) { throw new NullPointerException("null stream"); } @@ -244,9 +227,9 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, boolean alive = false; ForkClient client = acquireClient(); try { - ContentHandler tee = - (handler instanceof AbstractRecursiveParserWrapperHandler) ? handler : - new TeeContentHandler(handler, new MetadataContentHandler(metadata)); + ContentHandler tee = (handler instanceof AbstractRecursiveParserWrapperHandler) + ? handler + : new TeeContentHandler(handler, new MetadataContentHandler(metadata)); t = client.call("parse", stream, tee, metadata, context); alive = true; @@ -256,10 +239,10 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, throw te; } catch (IOException e) { // Problem occurred on the other side - throw new TikaException("Failed to communicate with a forked parser process." + - " The process has most likely crashed due to some error" + - " like running out of memory. A new process will be" + - " started for the next parsing request.", e); + throw new TikaException("Failed to communicate with a forked parser process." + + " The process has most likely crashed due to some error" + + " like running out of memory. 
A new process will be" + + " started for the next parsing request.", e); } finally { releaseClient(client, alive); } @@ -313,17 +296,17 @@ private synchronized ForkClient acquireClient() throws IOException, TikaExceptio private ForkClient newClient() throws IOException, TikaException { TimeoutLimits timeoutLimits = new TimeoutLimits(serverPulseMillis, serverParseTimeoutMillis, - serverWaitTimeoutMillis); + serverWaitTimeoutMillis); if (loader == null && parser == null && tikaBin != null && parserFactoryFactory != null) { return new ForkClient(tikaBin, parserFactoryFactory, java, timeoutLimits); - } else if (loader != null && parser != null && tikaBin == null && - parserFactoryFactory == null) { + } else if (loader != null && parser != null && tikaBin == null + && parserFactoryFactory == null) { return new ForkClient(loader, parser, java, timeoutLimits); - } else if (loader != null && parser == null && tikaBin != null && - parserFactoryFactory != null) { + } else if (loader != null && parser == null && tikaBin != null + && parserFactoryFactory != null) { return new ForkClient(tikaBin, parserFactoryFactory, loader, java, timeoutLimits); } else { - //TODO: make this more useful + // TODO: make this more useful throw new IllegalStateException("Unexpected combination of state items"); } } @@ -331,8 +314,8 @@ private ForkClient newClient() throws IOException, TikaException { private synchronized void releaseClient(ForkClient client, boolean alive) { currentlyInUse--; if (currentlyInUse + pool.size() < poolSize && alive) { - if (maxFilesProcessedPerClient > 0 && - client.getFilesProcessed() >= maxFilesProcessedPerClient) { + if (maxFilesProcessedPerClient > 0 + && client.getFilesProcessed() >= maxFilesProcessedPerClient) { client.close(); } else { pool.offer(client); @@ -344,10 +327,8 @@ private synchronized void releaseClient(ForkClient client, boolean alive) { } /** - * The amount of time in milliseconds that the server - * should wait before checking to see if the 
parse has timed out - * or if the wait has timed out - * The default is 5 seconds. + * The amount of time in milliseconds that the server should wait before checking to see if the + * parse has timed out or if the wait has timed out The default is 5 seconds. * * @param serverPulseMillis milliseconds to sleep before checking if there has been any activity */ @@ -356,9 +337,8 @@ public void setServerPulseMillis(long serverPulseMillis) { } /** - * The maximum amount of time allowed for the server to try to parse a file. - * If more than this time elapses, the server shuts down, and the ForkParser - * throws an exception. + * The maximum amount of time allowed for the server to try to parse a file. If more than this + * time elapses, the server shuts down, and the ForkParser throws an exception. * * @param serverParseTimeoutMillis */ @@ -367,9 +347,9 @@ public void setServerParseTimeoutMillis(long serverParseTimeoutMillis) { } /** - * The maximum amount of time allowed for the server to wait for a new request to parse - * a file. The server will shutdown after this amount of time, and a new server will have - * to be started by a new client. + * The maximum amount of time allowed for the server to wait for a new request to parse a file. + * The server will shutdown after this amount of time, and a new server will have to be started + * by a new client. * * @param serverWaitTimeoutMillis */ @@ -378,14 +358,13 @@ public void setServerWaitTimeoutMillis(long serverWaitTimeoutMillis) { } /** - * If there is a slowly building memory leak in one of the parsers, - * it is useful to set a limit on the number of files processed - * by a server before it is shutdown and restarted. Default value is -1. + * If there is a slowly building memory leak in one of the parsers, it is useful to set a limit + * on the number of files processed by a server before it is shutdown and restarted. Default + * value is -1. 
* - * @param maxFilesProcessedPerClient maximum number of files that a server can handle - * before the parser shuts down a client and creates - * a new process. If set to -1, the server is never restarted - * because of the number of files handled. + * @param maxFilesProcessedPerClient maximum number of files that a server can handle before the + * parser shuts down a client and creates a new process. If set to -1, the server is + * never restarted because of the number of files handled. */ public void setMaxFilesProcessedPerServer(int maxFilesProcessedPerClient) { this.maxFilesProcessedPerClient = maxFilesProcessedPerClient; diff --git a/tika-core/src/main/java/org/apache/tika/fork/ForkProxy.java b/tika-core/src/main/java/org/apache/tika/fork/ForkProxy.java index b10eac8ded..308a1fb245 100644 --- a/tika-core/src/main/java/org/apache/tika/fork/ForkProxy.java +++ b/tika-core/src/main/java/org/apache/tika/fork/ForkProxy.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.fork; diff --git a/tika-core/src/main/java/org/apache/tika/fork/ForkResource.java b/tika-core/src/main/java/org/apache/tika/fork/ForkResource.java index 9bbd82bdd5..35716dcbfc 100644 --- a/tika-core/src/main/java/org/apache/tika/fork/ForkResource.java +++ b/tika-core/src/main/java/org/apache/tika/fork/ForkResource.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.fork; diff --git a/tika-core/src/main/java/org/apache/tika/fork/ForkServer.java b/tika-core/src/main/java/org/apache/tika/fork/ForkServer.java index c3249c1d1f..0b5a0a2f19 100644 --- a/tika-core/src/main/java/org/apache/tika/fork/ForkServer.java +++ b/tika-core/src/main/java/org/apache/tika/fork/ForkServer.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.fork; @@ -26,11 +24,9 @@ import java.lang.reflect.InvocationTargetException; import java.lang.reflect.Method; import java.net.URL; - -import org.xml.sax.SAXException; - import org.apache.tika.exception.TikaException; import org.apache.tika.parser.ParserFactory; +import org.xml.sax.SAXException; class ForkServer implements Runnable { @@ -61,28 +57,28 @@ class ForkServer implements Runnable { */ private final DataOutputStream output; private final boolean active = true; - //milliseconds to sleep before checking to see if there has been any reading/writing - //If no reading or writing in this time, shutdown the server. + // milliseconds to sleep before checking to see if there has been any reading/writing + // If no reading or writing in this time, shutdown the server. 
private long serverPulseMillis = 5000; private long serverParserTimeoutMillis = 60000; private long serverWaitTimeoutMillis = 60000; - //can't be class Parser because then you'd - //have to include that in bootstrap jar (legacy mode) + // can't be class Parser because then you'd + // have to include that in bootstrap jar (legacy mode) private Object parser; private ClassLoader classLoader; private boolean parsing = false; private long since; + /** - * Sets up a forked server instance using the given stdin/out - * communication channel. + * Sets up a forked server instance using the given stdin/out communication channel. * - * @param input input stream for reading from the parent process + * @param input input stream for reading from the parent process * @param output output stream for writing to the parent process * @throws IOException if the server instance could not be created */ public ForkServer(InputStream input, OutputStream output, long serverPulseMillis, - long serverParserTimeoutMillis, long serverWaitTimeoutMillis) - throws IOException { + long serverParserTimeoutMillis, long serverWaitTimeoutMillis) + throws IOException { this.input = new DataInputStream(input); this.output = new DataOutputStream(output); this.serverPulseMillis = serverPulseMillis; @@ -93,10 +89,9 @@ public ForkServer(InputStream input, OutputStream output, long serverPulseMillis } /** - * Starts a forked server process using the standard input and output - * streams for communication with the parent process. Any attempts by - * stray code to read from standard input or write to standard output - * is redirected to avoid interfering with the communication channel. + * Starts a forked server process using the standard input and output streams for communication + * with the parent process. Any attempts by stray code to read from standard input or write to + * standard output is redirected to avoid interfering with the communication channel. 
* * @param args command line arguments, ignored * @throws Exception if the server could not be started @@ -108,9 +103,8 @@ public static void main(String[] args) throws Exception { URL.setURLStreamHandlerFactory(new MemoryURLStreamHandlerFactory()); - ForkServer server = - new ForkServer(System.in, System.out, serverPulseMillis, serverParseTimeoutMillis, - serverWaitTimeoutMillis); + ForkServer server = new ForkServer(System.in, System.out, serverPulseMillis, + serverParseTimeoutMillis, serverWaitTimeoutMillis); System.setIn(new ByteArrayInputStream(new byte[0])); System.setOut(System.err); @@ -128,8 +122,8 @@ public void run() { long elapsed = System.currentTimeMillis() - since; if (parsing && elapsed > serverParserTimeoutMillis) { break; - } else if (!parsing && serverWaitTimeoutMillis > 0 && - elapsed > serverWaitTimeoutMillis) { + } else if (!parsing && serverWaitTimeoutMillis > 0 + && elapsed > serverWaitTimeoutMillis) { break; } } @@ -137,12 +131,12 @@ public void run() { } System.exit(0); } catch (InterruptedException e) { - //swallow + // swallow } } public void processRequests() { - //initialize + // initialize try { initializeParserAndLoader(); } catch (Throwable t) { @@ -157,7 +151,7 @@ public void processRequests() { } return; } - //main loop + // main loop try { while (true) { int request = input.read(); @@ -179,7 +173,7 @@ public void processRequests() { } private void initializeParserAndLoader() - throws IOException, ClassNotFoundException, TikaException, SAXException { + throws IOException, ClassNotFoundException, TikaException, SAXException { output.writeByte(READY); output.flush(); @@ -192,36 +186,36 @@ private void initializeParserAndLoader() switch (configIndex) { case INIT_PARSER_FACTORY_FACTORY: if (firstObject instanceof ParserFactoryFactory) { - //the user has submitted a parser factory, but no class loader + // the user has submitted a parser factory, but no class loader classLoader = ForkServer.class.getClassLoader(); ParserFactory 
parserFactory = ((ParserFactoryFactory) firstObject).build(); parser = parserFactory.build(); } else { throw new IllegalArgumentException( - "Expecting only one object of class ParserFactoryFactory"); + "Expecting only one object of class ParserFactoryFactory"); } break; case INIT_LOADER_PARSER: if (firstObject instanceof ClassLoader) { classLoader = (ClassLoader) firstObject; Thread.currentThread().setContextClassLoader(classLoader); - //parser from parent process + // parser from parent process parser = readObject(classLoader); } else { throw new IllegalArgumentException( - "Expecting ClassLoader followed by a Parser"); + "Expecting ClassLoader followed by a Parser"); } break; case INIT_PARSER_FACTORY_FACTORY_LOADER: if (firstObject instanceof ParserFactoryFactory) { - //the user has submitted a parser factory and a class loader + // the user has submitted a parser factory and a class loader ParserFactory parserFactory = ((ParserFactoryFactory) firstObject).build(); parser = parserFactory.build(); classLoader = (ClassLoader) readObject(ForkServer.class.getClassLoader()); Thread.currentThread().setContextClassLoader(classLoader); } else { throw new IllegalStateException( - "Expecing ParserFactoryFactory followed by a class loader"); + "Expecing ParserFactoryFactory followed by a class loader"); } break; } @@ -281,12 +275,12 @@ private Method getMethod(Object object, String name) { } /** - * Deserializes an object from the given stream. The serialized object - * is expected to be preceded by a size integer, that is used for reading - * the entire serialization into a memory before deserializing it. + * Deserializes an object from the given stream. The serialized object is expected to be + * preceded by a size integer, that is used for reading the entire serialization into a memory + * before deserializing it. 
* * @param loader class loader to be used for loading referenced classes - * @throws IOException if the object could not be deserialized + * @throws IOException if the object could not be deserialized * @throws ClassNotFoundException if a referenced class is not found */ private Object readObject(ClassLoader loader) throws IOException, ClassNotFoundException { diff --git a/tika-core/src/main/java/org/apache/tika/fork/InputStreamProxy.java b/tika-core/src/main/java/org/apache/tika/fork/InputStreamProxy.java index cca9b74711..eda9c855bf 100644 --- a/tika-core/src/main/java/org/apache/tika/fork/InputStreamProxy.java +++ b/tika-core/src/main/java/org/apache/tika/fork/InputStreamProxy.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
+ * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.fork; diff --git a/tika-core/src/main/java/org/apache/tika/fork/InputStreamResource.java b/tika-core/src/main/java/org/apache/tika/fork/InputStreamResource.java index 04ba93cdcb..de028d33cc 100644 --- a/tika-core/src/main/java/org/apache/tika/fork/InputStreamResource.java +++ b/tika-core/src/main/java/org/apache/tika/fork/InputStreamResource.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
+ * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.fork; diff --git a/tika-core/src/main/java/org/apache/tika/fork/MemoryURLConnection.java b/tika-core/src/main/java/org/apache/tika/fork/MemoryURLConnection.java index 74a16878dc..905d5a3a53 100644 --- a/tika-core/src/main/java/org/apache/tika/fork/MemoryURLConnection.java +++ b/tika-core/src/main/java/org/apache/tika/fork/MemoryURLConnection.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
+ * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.fork; @@ -31,8 +29,7 @@ class MemoryURLConnection extends URLConnection { } @Override - public void connect() { - } + public void connect() {} @Override public InputStream getInputStream() { diff --git a/tika-core/src/main/java/org/apache/tika/fork/MemoryURLStreamHandler.java b/tika-core/src/main/java/org/apache/tika/fork/MemoryURLStreamHandler.java index 6f4b1435e4..acc0105df0 100644 --- a/tika-core/src/main/java/org/apache/tika/fork/MemoryURLStreamHandler.java +++ b/tika-core/src/main/java/org/apache/tika/fork/MemoryURLStreamHandler.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.fork; @@ -33,8 +31,7 @@ class MemoryURLStreamHandler extends URLStreamHandler { private static final AtomicInteger counter = new AtomicInteger(); - private static final List records = - new LinkedList<>(); + private static final List records = new LinkedList<>(); public static URL createURL(byte[] data) { try { diff --git a/tika-core/src/main/java/org/apache/tika/fork/MemoryURLStreamHandlerFactory.java b/tika-core/src/main/java/org/apache/tika/fork/MemoryURLStreamHandlerFactory.java index 5f3d818d92..1f77ba390b 100644 --- a/tika-core/src/main/java/org/apache/tika/fork/MemoryURLStreamHandlerFactory.java +++ b/tika-core/src/main/java/org/apache/tika/fork/MemoryURLStreamHandlerFactory.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.fork; diff --git a/tika-core/src/main/java/org/apache/tika/fork/MemoryURLStreamRecord.java b/tika-core/src/main/java/org/apache/tika/fork/MemoryURLStreamRecord.java index 8a72035fda..f40e48a6b9 100644 --- a/tika-core/src/main/java/org/apache/tika/fork/MemoryURLStreamRecord.java +++ b/tika-core/src/main/java/org/apache/tika/fork/MemoryURLStreamRecord.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.fork; diff --git a/tika-core/src/main/java/org/apache/tika/fork/MetadataContentHandler.java b/tika-core/src/main/java/org/apache/tika/fork/MetadataContentHandler.java index c1f1f5612b..f2c035c1fc 100644 --- a/tika-core/src/main/java/org/apache/tika/fork/MetadataContentHandler.java +++ b/tika-core/src/main/java/org/apache/tika/fork/MetadataContentHandler.java @@ -1,27 +1,24 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.fork; +import org.apache.tika.metadata.Metadata; import org.xml.sax.Attributes; import org.xml.sax.SAXException; import org.xml.sax.helpers.DefaultHandler; -import org.apache.tika.metadata.Metadata; - class MetadataContentHandler extends DefaultHandler { private final Metadata metadata; @@ -31,7 +28,7 @@ public MetadataContentHandler(Metadata metadata) { } public void startElement(String uri, String local, String name, Attributes attributes) - throws SAXException { + throws SAXException { if ("meta".equals(local)) { String aname = attributes.getValue("name"); String content = attributes.getValue("content"); diff --git a/tika-core/src/main/java/org/apache/tika/fork/ParserFactoryFactory.java b/tika-core/src/main/java/org/apache/tika/fork/ParserFactoryFactory.java index 580b1ef435..0f5e2625c3 100644 --- a/tika-core/src/main/java/org/apache/tika/fork/ParserFactoryFactory.java +++ b/tika-core/src/main/java/org/apache/tika/fork/ParserFactoryFactory.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.fork; @@ -21,13 +19,12 @@ import java.io.Serializable; import java.lang.reflect.Constructor; import java.util.Map; - import org.apache.tika.exception.TikaException; import org.apache.tika.parser.ParserFactory; /** - * Lightweight, easily serializable class that contains enough information - * to build a {@link ParserFactory} + * Lightweight, easily serializable class that contains enough information to build a + * {@link ParserFactory} */ public class ParserFactoryFactory implements Serializable { diff --git a/tika-core/src/main/java/org/apache/tika/fork/RecursiveMetadataContentHandlerProxy.java b/tika-core/src/main/java/org/apache/tika/fork/RecursiveMetadataContentHandlerProxy.java index 1195ef20d0..9bf5104bd5 100644 --- a/tika-core/src/main/java/org/apache/tika/fork/RecursiveMetadataContentHandlerProxy.java +++ b/tika-core/src/main/java/org/apache/tika/fork/RecursiveMetadataContentHandlerProxy.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.fork; @@ -23,25 +21,24 @@ import java.io.NotSerializableException; import java.io.ObjectOutputStream; import java.io.Serializable; - -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.sax.ContentHandlerFactory; import org.apache.tika.sax.RecursiveParserWrapperHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; /** - *

This class calls #toString() on the ContentHandler, inserts it into the Metadata object - * and serializes the Metadata object. + *

+ * This class calls #toString() on the ContentHandler, inserts it into the Metadata object and + * serializes the Metadata object. *

- * Ideally, this would serialize the ContentHandler and the Metadata object as separate objects, - * but we can't guarantee that the ContentHandler is Serializable (e.g. the StringWriter in - * the WriteOutContentHandler). + * Ideally, this would serialize the ContentHandler and the Metadata object as separate objects, but + * we can't guarantee that the ContentHandler is Serializable (e.g. the StringWriter in the + * WriteOutContentHandler). */ class RecursiveMetadataContentHandlerProxy extends RecursiveParserWrapperHandler - implements ForkProxy { + implements ForkProxy { public static final byte EMBEDDED_DOCUMENT = 1; public static final byte MAIN_DOCUMENT = 2; @@ -59,7 +56,7 @@ class RecursiveMetadataContentHandlerProxy extends RecursiveParserWrapperHandler private transient DataOutputStream output; public RecursiveMetadataContentHandlerProxy(int resource, - ContentHandlerFactory contentHandlerFactory) { + ContentHandlerFactory contentHandlerFactory) { super(contentHandlerFactory); this.resource = resource; } @@ -70,7 +67,7 @@ public void init(DataInputStream input, DataOutputStream output) { @Override public void endEmbeddedDocument(ContentHandler contentHandler, Metadata metadata) - throws SAXException { + throws SAXException { proxyBackToClient(EMBEDDED_DOCUMENT, contentHandler, metadata); decrementEmbeddedDepth(); } @@ -84,7 +81,7 @@ public void endDocument(ContentHandler contentHandler, Metadata metadata) throws } private void proxyBackToClient(int embeddedOrMainDocument, ContentHandler contentHandler, - Metadata metadata) throws SAXException { + Metadata metadata) throws SAXException { try { output.write(ForkServer.RESOURCE); output.writeByte(resource); @@ -96,7 +93,7 @@ private void proxyBackToClient(int embeddedOrMainDocument, ContentHandler conten bytes = serialize(contentHandler); success = true; } catch (NotSerializableException e) { - //object lied + // object lied } if (success) { @@ -107,9 +104,9 @@ private void proxyBackToClient(int 
embeddedOrMainDocument, ContentHandler conten return; } } - //if contenthandler is not allegedly or actually Serializable - //fall back to adding contentHandler.toString() to the metadata object - //and send that. + // if contenthandler is not allegedly or actually Serializable + // fall back to adding contentHandler.toString() to the metadata object + // and send that. metadata.set(TikaCoreProperties.TIKA_CONTENT, contentHandler.toString()); output.writeByte(METADATA_ONLY); send(metadata); @@ -133,9 +130,9 @@ private void sendBytes(byte[] bytes) throws IOException { } private byte[] serialize(Object object) throws IOException { - //can't figure out why I'm getting an IllegalAccessException - //when I try to use ForkedObjectInputStream, but - //not when I do this manually ?! + // can't figure out why I'm getting an IllegalAccessException + // when I try to use ForkedObjectInputStream, but + // not when I do this manually ?! ByteArrayOutputStream bos = new ByteArrayOutputStream(); try (ObjectOutputStream oos = new ObjectOutputStream(bos)) { oos.writeObject(object); diff --git a/tika-core/src/main/java/org/apache/tika/fork/RecursiveMetadataContentHandlerResource.java b/tika-core/src/main/java/org/apache/tika/fork/RecursiveMetadataContentHandlerResource.java index 638e24daab..94b0e89c5f 100644 --- a/tika-core/src/main/java/org/apache/tika/fork/RecursiveMetadataContentHandlerResource.java +++ b/tika-core/src/main/java/org/apache/tika/fork/RecursiveMetadataContentHandlerResource.java @@ -1,32 +1,28 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.fork; import java.io.DataInputStream; import java.io.DataOutputStream; import java.io.IOException; - -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; -import org.xml.sax.helpers.DefaultHandler; - import org.apache.tika.metadata.Metadata; import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler; import org.apache.tika.sax.RecursiveParserWrapperHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; class RecursiveMetadataContentHandlerResource implements ForkResource { @@ -51,14 +47,12 @@ private void internalProcess(DataInputStream input) throws IOException, SAXExcep byte handlerAndMetadataOrMetadataOnly = input.readByte(); ContentHandler localContentHandler = DEFAULT_HANDLER; - if (handlerAndMetadataOrMetadataOnly == - RecursiveMetadataContentHandlerProxy.HANDLER_AND_METADATA) { + if (handlerAndMetadataOrMetadataOnly == RecursiveMetadataContentHandlerProxy.HANDLER_AND_METADATA) { localContentHandler = (ContentHandler) readObject(input); - } else if (handlerAndMetadataOrMetadataOnly != - RecursiveMetadataContentHandlerProxy.METADATA_ONLY) { + } else if (handlerAndMetadataOrMetadataOnly != RecursiveMetadataContentHandlerProxy.METADATA_ONLY) { throw new IllegalArgumentException( - "Expected HANDLER_AND_METADATA or METADATA_ONLY, but got:" + - handlerAndMetadataOrMetadataOnly); + "Expected HANDLER_AND_METADATA or METADATA_ONLY, but got:" + + handlerAndMetadataOrMetadataOnly); } Metadata metadata = (Metadata) readObject(input); @@ -68,7 +62,7 @@ private void internalProcess(DataInputStream input) throws IOException, SAXExcep handler.endDocument(localContentHandler, metadata); } else { throw new IllegalArgumentException( - "Expected either 0x01 or 0x02, but got: " + embeddedOrMain); + "Expected either 0x01 or 0x02, but got: " + embeddedOrMain); } byte isComplete = input.readByte(); if (isComplete != RecursiveMetadataContentHandlerProxy.COMPLETE) { 
diff --git a/tika-core/src/main/java/org/apache/tika/fork/TimeoutLimits.java b/tika-core/src/main/java/org/apache/tika/fork/TimeoutLimits.java index 6610437c80..52be577547 100644 --- a/tika-core/src/main/java/org/apache/tika/fork/TimeoutLimits.java +++ b/tika-core/src/main/java/org/apache/tika/fork/TimeoutLimits.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.fork; diff --git a/tika-core/src/main/java/org/apache/tika/fork/package-info.java b/tika-core/src/main/java/org/apache/tika/fork/package-info.java index 74cdd062d2..c0d8732fe4 100644 --- a/tika-core/src/main/java/org/apache/tika/fork/package-info.java +++ b/tika-core/src/main/java/org/apache/tika/fork/package-info.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ /** diff --git a/tika-core/src/main/java/org/apache/tika/io/BoundedInputStream.java b/tika-core/src/main/java/org/apache/tika/io/BoundedInputStream.java index 7515761a26..795430a8b2 100644 --- a/tika-core/src/main/java/org/apache/tika/io/BoundedInputStream.java +++ b/tika-core/src/main/java/org/apache/tika/io/BoundedInputStream.java @@ -1,33 +1,29 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.io; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; - import org.apache.commons.io.IOUtils; /** - * Very slight modification of Commons' BoundedInputStream - * so that we can figure out if this hit the bound or not. + * Very slight modification of Commons' BoundedInputStream so that we can figure out if this hit the + * bound or not. *

- * This relies on IOUtils' skip and read to try to fully - * read/skip inputstream. + * This relies on IOUtils' skip and read to try to fully read/skip inputstream. */ public class BoundedInputStream extends InputStream { @@ -56,8 +52,7 @@ public int read() throws IOException { * Invokes the delegate's read(byte[]) method. * * @param b the buffer to read the bytes into - * @return the number of bytes read or -1 if the end of stream or - * the limit has been reached. + * @return the number of bytes read or -1 if the end of stream or the limit has been reached. * @throws IOException if an I/O error occurs */ @Override @@ -70,11 +65,10 @@ public int read(final byte[] b) throws IOException { *

* This does not have the same guarantees as IOUtil's readFully()...be careful. * - * @param b the buffer to read the bytes into + * @param b the buffer to read the bytes into * @param off The start offset * @param len The number of bytes to read - * @return the number of bytes read or -1 if the end of stream or - * the limit has been reached. + * @return the number of bytes read or -1 if the end of stream or the limit has been reached. * @throws IOException if an I/O error occurs */ @Override @@ -94,9 +88,8 @@ public int read(final byte[] b, final int off, final int len) throws IOException } /** - * Invokes the delegate's skip(long) method. - * As with InputStream generally, this does not guarantee reading n bytes. - * Use IOUtils' skipFully for that functionality. + * Invokes the delegate's skip(long) method. As with InputStream generally, this + * does not guarantee reading n bytes. Use IOUtils' skipFully for that functionality. * * @param n the number of bytes to skip * @return the actual number of bytes skipped diff --git a/tika-core/src/main/java/org/apache/tika/io/EndianUtils.java b/tika-core/src/main/java/org/apache/tika/io/EndianUtils.java index 242dd8c748..3cbc29713b 100644 --- a/tika-core/src/main/java/org/apache/tika/io/EndianUtils.java +++ b/tika-core/src/main/java/org/apache/tika/io/EndianUtils.java @@ -1,31 +1,28 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. 
The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.io; import java.io.IOException; import java.io.InputStream; - import org.apache.tika.exception.TikaException; /** * General Endian Related Utilties. *

- * This class provides static utility methods for input/output operations - * on numbers in Big and Little Endian formats. + * This class provides static utility methods for input/output operations on numbers in Big and + * Little Endian formats. *

* Origin of code: Based on the version in POI */ @@ -37,11 +34,11 @@ public class EndianUtils { * * @param stream the InputStream from which the short is to be read * @return the short (16-bit) value - * @throws IOException will be propagated back to the caller + * @throws IOException will be propagated back to the caller * @throws BufferUnderrunException if the stream cannot provide enough bytes */ public static short readShortLE(InputStream stream) - throws IOException, BufferUnderrunException { + throws IOException, BufferUnderrunException { return (short) readUShortLE(stream); } @@ -50,11 +47,11 @@ public static short readShortLE(InputStream stream) * * @param stream the InputStream from which the short is to be read * @return the short (16-bit) value - * @throws IOException will be propagated back to the caller + * @throws IOException will be propagated back to the caller * @throws BufferUnderrunException if the stream cannot provide enough bytes */ public static short readShortBE(InputStream stream) - throws IOException, BufferUnderrunException { + throws IOException, BufferUnderrunException { return (short) readUShortBE(stream); } @@ -81,7 +78,7 @@ public static int readUShortBE(InputStream stream) throws IOException, BufferUnd * * @param stream the InputStream from which the int is to be read * @return the int (32-bit) value - * @throws IOException will be propagated back to the caller + * @throws IOException will be propagated back to the caller * @throws BufferUnderrunException if the stream cannot provide enough bytes */ public static long readUIntLE(InputStream stream) throws IOException, BufferUnderrunException { @@ -100,7 +97,7 @@ public static long readUIntLE(InputStream stream) throws IOException, BufferUnde * * @param stream the InputStream from which the int is to be read * @return the int (32-bit) value - * @throws IOException will be propagated back to the caller + * @throws IOException will be propagated back to the caller * @throws 
BufferUnderrunException if the stream cannot provide enough bytes */ public static long readUIntBE(InputStream stream) throws IOException, BufferUnderrunException { @@ -119,7 +116,7 @@ public static long readUIntBE(InputStream stream) throws IOException, BufferUnde * * @param stream the InputStream from which the int is to be read * @return the int (32-bit) value - * @throws IOException will be propagated back to the caller + * @throws IOException will be propagated back to the caller * @throws BufferUnderrunException if the stream cannot provide enough bytes */ public static int readIntLE(InputStream stream) throws IOException, BufferUnderrunException { @@ -138,7 +135,7 @@ public static int readIntLE(InputStream stream) throws IOException, BufferUnderr * * @param stream the InputStream from which the int is to be read * @return the int (32-bit) value - * @throws IOException will be propagated back to the caller + * @throws IOException will be propagated back to the caller * @throws BufferUnderrunException if the stream cannot provide enough bytes */ public static int readIntBE(InputStream stream) throws IOException, BufferUnderrunException { @@ -157,7 +154,7 @@ public static int readIntBE(InputStream stream) throws IOException, BufferUnderr * * @param stream the InputStream from which the int is to be read * @return the int (32-bit) value - * @throws IOException will be propagated back to the caller + * @throws IOException will be propagated back to the caller * @throws BufferUnderrunException if the stream cannot provide enough bytes */ public static int readIntME(InputStream stream) throws IOException, BufferUnderrunException { @@ -176,7 +173,7 @@ public static int readIntME(InputStream stream) throws IOException, BufferUnderr * * @param stream the InputStream from which the long is to be read * @return the long (64-bit) value - * @throws IOException will be propagated back to the caller + * @throws IOException will be propagated back to the caller * @throws 
BufferUnderrunException if the stream cannot provide enough bytes */ public static long readLongLE(InputStream stream) throws IOException, BufferUnderrunException { @@ -192,9 +189,10 @@ public static long readLongLE(InputStream stream) throws IOException, BufferUnde throw new BufferUnderrunException(); } - return ((long) ch8 << 56) + ((long) ch7 << 48) + ((long) ch6 << 40) + ((long) ch5 << 32) + - ((long) ch4 << 24) + // cast to long to preserve bit 31 (sign bit for ints) - (ch3 << 16) + (ch2 << 8) + (ch1); + return ((long) ch8 << 56) + ((long) ch7 << 48) + ((long) ch6 << 40) + ((long) ch5 << 32) + + ((long) ch4 << 24) + // cast to long to preserve bit 31 (sign bit for + // ints) + (ch3 << 16) + (ch2 << 8) + (ch1); } /** @@ -202,7 +200,7 @@ public static long readLongLE(InputStream stream) throws IOException, BufferUnde * * @param stream the InputStream from which the long is to be read * @return the long (64-bit) value - * @throws IOException will be propagated back to the caller + * @throws IOException will be propagated back to the caller * @throws BufferUnderrunException if the stream cannot provide enough bytes */ public static long readLongBE(InputStream stream) throws IOException, BufferUnderrunException { @@ -218,14 +216,15 @@ public static long readLongBE(InputStream stream) throws IOException, BufferUnde throw new BufferUnderrunException(); } - return ((long) ch1 << 56) + ((long) ch2 << 48) + ((long) ch3 << 40) + ((long) ch4 << 32) + - ((long) ch5 << 24) + // cast to long to preserve bit 31 (sign bit for ints) - (ch6 << 16) + (ch7 << 8) + (ch8); + return ((long) ch1 << 56) + ((long) ch2 << 48) + ((long) ch3 << 40) + ((long) ch4 << 32) + + ((long) ch5 << 24) + // cast to long to preserve bit 31 (sign bit for + // ints) + (ch6 << 16) + (ch7 << 8) + (ch8); } /** - * Gets the integer value that is stored in UTF-8 like fashion, in Big Endian - * but with the high bit on each number indicating if it continues or not + * Gets the integer value that is stored in 
UTF-8 like fashion, in Big Endian but with the high + * bit on each number indicating if it continues or not */ public static long readUE7(InputStream stream) throws IOException { int i; @@ -262,7 +261,7 @@ public static short getShortLE(byte[] data) { /** * Get a LE short value from a byte array * - * @param data the byte array + * @param data the byte array * @param offset a starting offset into the byte array * @return the short (16-bit) value */ @@ -283,7 +282,7 @@ public static int getUShortLE(byte[] data) { /** * Get a LE unsigned short value from a byte array * - * @param data the byte array + * @param data the byte array * @param offset a starting offset into the byte array * @return the unsigned short (16-bit) value in an integer */ @@ -306,7 +305,7 @@ public static short getShortBE(byte[] data) { /** * Get a BE short value from a byte array * - * @param data the byte array + * @param data the byte array * @param offset a starting offset into the byte array * @return the short (16-bit) value */ @@ -327,7 +326,7 @@ public static int getUShortBE(byte[] data) { /** * Get a BE unsigned short value from a byte array * - * @param data the byte array + * @param data the byte array * @param offset a starting offset into the byte array * @return the unsigned short (16-bit) value in an integer */ @@ -350,7 +349,7 @@ public static int getIntLE(byte[] data) { /** * Get a LE int value from a byte array * - * @param data the byte array + * @param data the byte array * @param offset a starting offset into the byte array * @return the int (32-bit) value */ @@ -376,7 +375,7 @@ public static int getIntBE(byte[] data) { /** * Get a BE int value from a byte array * - * @param data the byte array + * @param data the byte array * @param offset a starting offset into the byte array * @return the int (32-bit) value */ @@ -402,7 +401,7 @@ public static long getUIntLE(byte[] data) { /** * Get a LE unsigned int value from a byte array * - * @param data the byte array + * @param data 
the byte array * @param offset a starting offset into the byte array * @return the unsigned int (32-bit) value in a long */ @@ -424,7 +423,7 @@ public static long getUIntBE(byte[] data) { /** * Get a BE unsigned int value from a byte array * - * @param data the byte array + * @param data the byte array * @param offset a starting offset into the byte array * @return the unsigned int (32-bit) value in a long */ @@ -436,7 +435,7 @@ public static long getUIntBE(byte[] data, int offset) { /** * Get a LE long value from a byte array * - * @param data the byte array + * @param data the byte array * @param offset a starting offset into the byte array * @return the long (64-bit) value */ @@ -451,8 +450,7 @@ public static long getLongLE(byte[] data, int offset) { } /** - * Convert an 'unsigned' byte to an integer. ie, don't carry across the - * sign. + * Convert an 'unsigned' byte to an integer. ie, don't carry across the sign. * * @param b Description of the Parameter * @return Description of the Return Value @@ -464,7 +462,7 @@ public static int ubyteToInt(byte b) { /** * get the unsigned value of a byte. * - * @param data the byte array. + * @param data the byte array. * @param offset a starting offset into the byte array. * @return the unsigned value of the byte as a 16 bit short */ diff --git a/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java b/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java index 234347c251..fe36d4f181 100644 --- a/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java +++ b/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.io; @@ -20,7 +18,6 @@ import java.util.Locale; import java.util.regex.Matcher; import java.util.regex.Pattern; - import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.mime.MimeTypeException; @@ -35,10 +32,10 @@ public class FilenameUtils { /** * Reserved characters */ - public final static char[] RESERVED_FILENAME_CHARACTERS = - {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, - 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, - 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, '?', ':', '*', '<', '>', '|', '"', '\''}; + public final static char[] RESERVED_FILENAME_CHARACTERS = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, + 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, + 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, + '?', ':', '*', '<', '>', '|', '"', '\''}; private final static HashSet RESERVED = new HashSet<>(38); @@ -52,9 +49,9 @@ public class FilenameUtils { private final static Pattern ASCII_NUMERIC = Pattern.compile("\\A\\.(?i)[a-z0-9]{1,5}\\Z"); /** - * Scans the given file name for reserved characters on different OSs and - * file systems and returns a sanitized version of the name with the - * reserved chars replaced by their hexadecimal value. + * Scans the given file name for reserved characters on different OSs and file systems and + * returns a sanitized version of the name with the reserved chars replaced by their hexadecimal + * value. *

* For example why?.zip will be converted into why%3F.zip * @@ -72,7 +69,7 @@ public static String normalize(final String name) { for (char c : name.toCharArray()) { if (RESERVED.contains(c)) { sb.append('%').append((c < 16) ? "0" : "") - .append(Integer.toHexString(c).toUpperCase(Locale.ROOT)); + .append(Integer.toHexString(c).toUpperCase(Locale.ROOT)); } else { sb.append(c); } @@ -82,18 +79,15 @@ public static String normalize(final String name) { } /** - * This is a duplication of the algorithm and functionality - * available in commons io FilenameUtils. If Java's File were - * able handle Windows file paths correctly in linux, - * we wouldn't need this. + * This is a duplication of the algorithm and functionality available in commons io + * FilenameUtils. If Java's File were able handle Windows file paths correctly in linux, we + * wouldn't need this. *

- * The goal of this is to get a filename from a path. - * The package parsers and some other embedded doc - * extractors could put anything into TikaCoreProperties.RESOURCE_NAME_KEY. + * The goal of this is to get a filename from a path. The package parsers and some other + * embedded doc extractors could put anything into TikaCoreProperties.RESOURCE_NAME_KEY. *

- * If a careless client used that filename as if it were a - * filename and not a path when writing embedded files, - * bad things could happen. Consider: "../../../my_ppt.ppt". + * If a careless client used that filename as if it were a filename and not a path when writing + * embedded files, bad things could happen. Consider: "../../../my_ppt.ppt". *

* Consider using this in combination with {@link #normalize(String)}. * @@ -107,8 +101,8 @@ public static String getName(final String path) { } int unix = path.lastIndexOf("/"); int windows = path.lastIndexOf("\\"); - //some macintosh file names are stored with : as the delimiter - //also necessary to properly handle C:somefilename + // some macintosh file names are stored with : as the delimiter + // also necessary to properly handle C:somefilename int colon = path.lastIndexOf(":"); String cand = path.substring(Math.max(colon, Math.max(unix, windows)) + 1); if (cand.equals("..") || cand.equals(".")) { @@ -118,16 +112,16 @@ public static String getName(final String path) { } /** - * This includes the period, e.g. ".pdf". - * This requires that an extension contain only ascii alphanumerics - * and it requires that an extension length be 5 or less. + * This includes the period, e.g. ".pdf". This requires that an extension contain only ascii + * alphanumerics and it requires that an extension length be 5 or less. 
+ * * @param path * @return the suffix or an empty string if one could not be found */ public static String getSuffixFromPath(String path) { String n = getName(path); int i = n.lastIndexOf("."); - //arbitrarily sets max extension length + // arbitrarily sets max extension length if (i > -1 && n.length() - i < 6) { String suffix = n.substring(i); if (ASCII_NUMERIC.matcher(suffix).matches()) { @@ -137,10 +131,10 @@ public static String getSuffixFromPath(String path) { return StringUtils.EMPTY; } - public static String getSanitizedEmbeddedFileName(Metadata metadata, - String defaultExtension, int maxLength) { + public static String getSanitizedEmbeddedFileName(Metadata metadata, String defaultExtension, + int maxLength) { String path = getEmbeddedName(metadata); - //fName could be a full path or null + // fName could be a full path or null if (StringUtils.isBlank(path)) { return null; } @@ -170,9 +164,9 @@ public static String getSanitizedEmbeddedFileName(Metadata metadata, if (StringUtils.isBlank(namePart)) { return null; } - //remove all initial . + // remove all initial . namePart = namePart.replaceAll("\\A\\.+", "_"); - //defense in depth. We shouldn't need this + // defense in depth. We shouldn't need this namePart = namePart.replaceAll("(\\.\\.)+", "_"); namePart = namePart.replaceAll("[/\\\\]+", "_"); namePart = namePart.replaceAll(":+", "_"); @@ -182,7 +176,7 @@ public static String getSanitizedEmbeddedFileName(Metadata metadata, return null; } - //if path is > max length, return only the name part + // if path is > max length, return only the name part if (namePart.length() > maxLength) { return namePart.substring(0, maxLength - extension.length() - 3) + "..." + extension; } @@ -191,19 +185,19 @@ public static String getSanitizedEmbeddedFileName(Metadata metadata, } /** - * This tries to sanitize dangerous user generated embedded file paths. 
- * If trusting these paths for writing files, users should run checks to make - * sure that the generated file path does not zipslip out of the target directory. + * This tries to sanitize dangerous user generated embedded file paths. If trusting these paths + * for writing files, users should run checks to make sure that the generated file path does not + * zipslip out of the target directory. * * @param metadata * @param defaultExtension * @param maxLength * @return */ - public static String getSanitizedEmbeddedFilePath(Metadata metadata, - String defaultExtension, int maxLength) { + public static String getSanitizedEmbeddedFilePath(Metadata metadata, String defaultExtension, + int maxLength) { String path = getEmbeddedPath(metadata); - //fName could be a full path or null + // fName could be a full path or null if (StringUtils.isBlank(path)) { return null; } @@ -246,21 +240,23 @@ public static String getSanitizedEmbeddedFilePath(Metadata metadata, if (StringUtils.isBlank(namePart)) { return null; } - //remove all initial . + // remove all initial . namePart = namePart.replaceAll("\\A\\.+", "_"); - //defense in depth. We shouldn't need this + // defense in depth. We shouldn't need this namePart = namePart.replaceAll("\\.{2,}", "."); namePart = namePart.replaceAll("[/\\\\]+", "_"); if (StringUtils.isBlank(namePart)) { return null; } - String retPath = StringUtils.isBlank(relPath) ? namePart + extension : relPath + "/" + namePart + extension; + String retPath = StringUtils.isBlank(relPath) ? namePart + extension + : relPath + "/" + namePart + extension; - //if path is > max length, return only the name part + // if path is > max length, return only the name part if (retPath.length() > maxLength) { if (namePart.length() > maxLength) { - return namePart.substring(0, maxLength - extension.length() - 3) + "..." + extension; + return namePart.substring(0, maxLength - extension.length() - 3) + "..." 
+ + extension; } return namePart + extension; } @@ -272,7 +268,8 @@ private static int getPrefixLength(String path) { if (prefixLength > 0) { return prefixLength; } - if (path.length() == 2 && path.charAt(0) >= 'A' && path.charAt(0) <= 'Z' && path.charAt(1) == ':') { + if (path.length() == 2 && path.charAt(0) >= 'A' && path.charAt(0) <= 'Z' + && path.charAt(1) == ':') { return 2; } return 0; @@ -290,40 +287,40 @@ private static String removeProtocol(String path) { return path; } - //may return null + // may return null private static String getEmbeddedPath(Metadata metadata) { - //potentially look for other values in embedded path or original file name, etc... - //maybe different fallback order? + // potentially look for other values in embedded path or original file name, etc... + // maybe different fallback order? String path = metadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH); - if (! StringUtils.isBlank(path)) { + if (!StringUtils.isBlank(path)) { return path; } path = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY); - if (! StringUtils.isBlank(path)) { + if (!StringUtils.isBlank(path)) { return path; } path = metadata.get(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID); - if (! StringUtils.isBlank(path)) { + if (!StringUtils.isBlank(path)) { return path; } return metadata.get(TikaCoreProperties.ORIGINAL_RESOURCE_NAME); } - //this tries for resource name first, and then backs off to path + // this tries for resource name first, and then backs off to path private static String getEmbeddedName(Metadata metadata) { - //potentially look for other values in embedded path or original file name, etc... - //maybe different fallback order? + // potentially look for other values in embedded path or original file name, etc... + // maybe different fallback order? String path = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY); - if (! 
StringUtils.isBlank(path)) { + if (!StringUtils.isBlank(path)) { return path; } path = metadata.get(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID); - if (! StringUtils.isBlank(path)) { + if (!StringUtils.isBlank(path)) { return path; } path = metadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH); - if (! StringUtils.isBlank(path)) { + if (!StringUtils.isBlank(path)) { return path; } @@ -331,8 +328,8 @@ private static String getEmbeddedName(Metadata metadata) { } /** - * Calculate the extension based on the {@link Metadata#CONTENT_TYPE} value. - * On parse exception or null value, return the default value. + * Calculate the extension based on the {@link Metadata#CONTENT_TYPE} value. On parse exception + * or null value, return the default value. * * @param metadata * @param defaultValue @@ -344,16 +341,14 @@ public static String calculateExtension(Metadata metadata, String defaultValue) return defaultValue; } try { - String ext = MIME_TYPES - .forName(mime) - .getExtension(); + String ext = MIME_TYPES.forName(mime).getExtension(); if (ext == null) { return ".bin"; } else { return ext; } } catch (MimeTypeException e) { - //swallow + // swallow } return ".bin"; } diff --git a/tika-core/src/main/java/org/apache/tika/io/IOUtils.java b/tika-core/src/main/java/org/apache/tika/io/IOUtils.java index 247705b0d5..5b6f960ffd 100644 --- a/tika-core/src/main/java/org/apache/tika/io/IOUtils.java +++ b/tika-core/src/main/java/org/apache/tika/io/IOUtils.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.io; @@ -22,21 +20,21 @@ public class IOUtils { public static long skip(final InputStream input, final long toSkip, byte[] buffer) - throws IOException { + throws IOException { if (toSkip < 0) { throw new IllegalArgumentException( - "Skip count must be non-negative, actual: " + toSkip); + "Skip count must be non-negative, actual: " + toSkip); } /* - * N.B. no need to synchronize this because: - we don't care if the - * buffer is created multiple times (the data is ignored) - we always use the same size - * buffer, so if it it is recreated it - * will still be OK (if the buffer size were variable, we would need to synch. to ensure - * some other thread did not create a smaller one) + * N.B. 
no need to synchronize this because: - we don't care if the buffer is created + * multiple times (the data is ignored) - we always use the same size buffer, so if it it is + * recreated it will still be OK (if the buffer size were variable, we would need to synch. + * to ensure some other thread did not create a smaller one) */ long remain = toSkip; while (remain > 0) { - // See https://issues.apache.org/jira/browse/IO-203 for why we use read() rather than delegating to skip() + // See https://issues.apache.org/jira/browse/IO-203 for why we use read() rather than + // delegating to skip() final long n = input.read(buffer, 0, (int) Math.min(remain, buffer.length)); if (n < 0) { // EOF break; diff --git a/tika-core/src/main/java/org/apache/tika/io/InputStreamFactory.java b/tika-core/src/main/java/org/apache/tika/io/InputStreamFactory.java index 17e416a579..c59f9fbef1 100644 --- a/tika-core/src/main/java/org/apache/tika/io/InputStreamFactory.java +++ b/tika-core/src/main/java/org/apache/tika/io/InputStreamFactory.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.io; @@ -20,14 +18,17 @@ import java.io.InputStream; /** - *

A factory which returns a fresh {@link InputStream} for the same - * resource each time.

- *

This is typically desired where it is easier / quicker / simpler to - * fetch a fresh {@link InputStream} to re-read a given resource, rather - * than do any kind of buffering.

- *

It is typically used with {@link TikaInputStream#get(InputStreamFactory)} - * when combined with a Parser which needs to read the resource's stream - * multiple times when processing.

+ *

+ * A factory which returns a fresh {@link InputStream} for the same resource each time. + *

+ *

+ * This is typically desired where it is easier / quicker / simpler to fetch a fresh + * {@link InputStream} to re-read a given resource, rather than do any kind of buffering. + *

+ *

+ * It is typically used with {@link TikaInputStream#get(InputStreamFactory)} when combined with a + * Parser which needs to read the resource's stream multiple times when processing. + *

*/ public interface InputStreamFactory { InputStream getInputStream() throws IOException; diff --git a/tika-core/src/main/java/org/apache/tika/io/LookaheadInputStream.java b/tika-core/src/main/java/org/apache/tika/io/LookaheadInputStream.java index 32e671e78b..361b062289 100644 --- a/tika-core/src/main/java/org/apache/tika/io/LookaheadInputStream.java +++ b/tika-core/src/main/java/org/apache/tika/io/LookaheadInputStream.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. 
See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.io; @@ -20,24 +18,23 @@ import java.io.InputStream; /** - * Stream wrapper that make it easy to read up to n bytes ahead from - * a stream that supports the mark feature. This class insulates the - * underlying stream from things like possible mark(), reset() and close() - * calls by external components that might otherwise invalidate the marked - * state of a stream. + * Stream wrapper that make it easy to read up to n bytes ahead from a stream that supports the mark + * feature. This class insulates the underlying stream from things like possible mark(), reset() and + * close() calls by external components that might otherwise invalidate the marked state of a + * stream. *

* The recommended usage pattern of this class is: + * *

- *     try (InputStream lookahead = new LookaheadInputStream(stream, n)) {
- *         processStream(lookahead);
- *     }
+ * try (InputStream lookahead = new LookaheadInputStream(stream, n)) {
+ *     processStream(lookahead);
+ * }
  * 
*

- * This usage pattern guarantees that only up to n bytes from the original - * stream can ever be read, and that the stream will have been marked and - * then reset to its original state once the above code block exits. No - * code in the fictional processStream() method can affect the the state of - * the original stream. + * This usage pattern guarantees that only up to n bytes from the original stream can ever be read, + * and that the stream will have been marked and then reset to its original state once the above + * code block exits. No code in the fictional processStream() method can affect the the state of the + * original stream. * * @since Apache Tika 0.10 */ @@ -52,14 +49,13 @@ public class LookaheadInputStream extends InputStream { private int mark = 0; /** - * Creates a lookahead wrapper for the given input stream. - * The given input stream should support the mark feature, - * as otherwise the state of that stream will be undefined - * after the lookahead wrapper has been closed. As a special - * case a null stream is treated as an empty stream. + * Creates a lookahead wrapper for the given input stream. The given input stream should support + * the mark feature, as otherwise the state of that stream will be undefined after the lookahead + * wrapper has been closed. As a special case a null stream is treated as an empty + * stream. 
* * @param stream input stream, can be null - * @param n maximum number of bytes to look ahead + * @param n maximum number of bytes to look ahead */ public LookaheadInputStream(InputStream stream, int n) { this.stream = stream; diff --git a/tika-core/src/main/java/org/apache/tika/io/TailStream.java b/tika-core/src/main/java/org/apache/tika/io/TailStream.java index a1621c20b3..202237d7cc 100644 --- a/tika-core/src/main/java/org/apache/tika/io/TailStream.java +++ b/tika-core/src/main/java/org/apache/tika/io/TailStream.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
+ * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.io; @@ -22,18 +20,17 @@ /** *

- * A specialized input stream implementation which records the last portion read - * from an underlying stream. + * A specialized input stream implementation which records the last portion read from an underlying + * stream. *

*

- * This stream implementation is useful to deal with information which is known - * to be located at the end of a stream (e.g. ID3 v1 tags). While reading bytes - * from the underlying stream, a given number of bytes is kept in an internal - * buffer. This buffer can then be queried after the whole stream was read. It - * contains the last bytes read from the original input stream. + * This stream implementation is useful to deal with information which is known to be located at the + * end of a stream (e.g. ID3 v1 tags). While reading bytes from the underlying stream, a given + * number of bytes is kept in an internal buffer. This buffer can then be queried after the whole + * stream was read. It contains the last bytes read from the original input stream. *

* - * @param in the underlying input stream + * @param in the underlying input stream * @param tailSize the size of the tail buffer */ public class TailStream extends FilterInputStream { @@ -80,7 +77,7 @@ public class TailStream extends FilterInputStream { /** * Creates a new instance of {@code TailStream}. * - * @param in the underlying input stream + * @param in the underlying input stream * @param size the size of the tail buffer */ public TailStream(InputStream in, int size) { @@ -90,8 +87,7 @@ public TailStream(InputStream in, int size) { } /** - * {@inheritDoc} This implementation adds the read byte to the internal tail - * buffer. + * {@inheritDoc} This implementation adds the read byte to the internal tail buffer. */ @Override public int read() throws IOException { @@ -103,9 +99,8 @@ public int read() throws IOException { } /** - * {@inheritDoc} This implementation delegates to the underlying stream and - * then adds the correct portion of the read buffer to the internal tail - * buffer. + * {@inheritDoc} This implementation delegates to the underlying stream and then adds the + * correct portion of the read buffer to the internal tail buffer. */ @Override public int read(byte[] buf) throws IOException { @@ -117,9 +112,8 @@ public int read(byte[] buf) throws IOException { } /** - * {@inheritDoc} This implementation delegates to the underlying stream and - * then adds the correct portion of the read buffer to the internal tail - * buffer. + * {@inheritDoc} This implementation delegates to the underlying stream and then adds the + * correct portion of the read buffer to the internal tail buffer. */ @Override public int read(byte[] buf, int ofs, int length) throws IOException { @@ -131,8 +125,8 @@ public int read(byte[] buf, int ofs, int length) throws IOException { } /** - * {@inheritDoc} This implementation delegates to the {@code read()} method - * to ensure that the tail buffer is also filled if data is skipped. 
+ * {@inheritDoc} This implementation delegates to the {@code read()} method to ensure that the + * tail buffer is also filled if data is skipped. */ @Override public long skip(long n) throws IOException { @@ -153,9 +147,8 @@ public long skip(long n) throws IOException { } /** - * {@inheritDoc} This implementation saves the internal state including the - * content of the tail buffer so that it can be restored when ''reset()'' is - * called later. + * {@inheritDoc} This implementation saves the internal state including the content of the tail + * buffer so that it can be restored when ''reset()'' is called later. */ @Override public void mark(int limit) { @@ -166,9 +159,9 @@ public void mark(int limit) { } /** - * {@inheritDoc} This implementation restores this stream's state to the - * state when ''mark()'' was called the last time. If ''mark()'' has not - * been called before, this method has no effect. + * {@inheritDoc} This implementation restores this stream's state to the state when ''mark()'' + * was called the last time. If ''mark()'' has not been called before, this method has no + * effect. */ @Override public void reset() { @@ -180,10 +173,9 @@ public void reset() { } /** - * Returns an array with the last data read from the underlying stream. If - * the underlying stream contained more data than the ''tailSize'' - * constructor argument, the returned array has a length of ''tailSize''. - * Otherwise, its length equals the number of bytes read. + * Returns an array with the last data read from the underlying stream. If the underlying stream + * contained more data than the ''tailSize'' constructor argument, the returned array has a + * length of ''tailSize''. Otherwise, its length equals the number of bytes read. * * @return an array with the last data read from the underlying stream */ @@ -211,8 +203,8 @@ private void appendByte(byte b) { /** * Adds the content of the given buffer to the internal tail buffer. 
* - * @param buf the buffer - * @param ofs the start offset in the buffer + * @param buf the buffer + * @param ofs the start offset in the buffer * @param length the number of bytes to be copied */ private void appendBuf(byte[] buf, int ofs, int length) { @@ -226,12 +218,12 @@ private void appendBuf(byte[] buf, int ofs, int length) { } /** - * Replaces the content of the internal tail buffer by the last portion of - * the given buffer. This method is called if a buffer was read from the - * underlying stream whose length is larger than the tail buffer. + * Replaces the content of the internal tail buffer by the last portion of the given buffer. + * This method is called if a buffer was read from the underlying stream whose length is larger + * than the tail buffer. * - * @param buf the buffer - * @param ofs the start offset in the buffer + * @param buf the buffer + * @param ofs the start offset in the buffer * @param length the number of bytes to be copied */ private void replaceTailBuffer(byte[] buf, int ofs, int length) { @@ -240,13 +232,12 @@ private void replaceTailBuffer(byte[] buf, int ofs, int length) { } /** - * Copies the given buffer into the internal tail buffer at the current - * position. This method is called if a buffer is read from the underlying - * stream whose length is smaller than the tail buffer. In this case the - * tail buffer is only partly overwritten. + * Copies the given buffer into the internal tail buffer at the current position. This method is + * called if a buffer is read from the underlying stream whose length is smaller than the tail + * buffer. In this case the tail buffer is only partly overwritten. 
* - * @param buf the buffer - * @param ofs the start offset in the buffer + * @param buf the buffer + * @param ofs the start offset in the buffer * @param length the number of bytes to be copied */ private void copyToTailBuffer(byte[] buf, int ofs, int length) { diff --git a/tika-core/src/main/java/org/apache/tika/io/TemporaryResources.java b/tika-core/src/main/java/org/apache/tika/io/TemporaryResources.java index c1565ab86d..7a309a8d74 100644 --- a/tika-core/src/main/java/org/apache/tika/io/TemporaryResources.java +++ b/tika-core/src/main/java/org/apache/tika/io/TemporaryResources.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
+ * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.io; @@ -22,18 +20,16 @@ import java.nio.file.Files; import java.nio.file.Path; import java.util.LinkedList; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.utils.StringUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** - * Utility class for tracking and ultimately closing or otherwise disposing - * a collection of temporary resources. + * Utility class for tracking and ultimately closing or otherwise disposing a collection of + * temporary resources. *

* Note that this class is not thread-safe. * @@ -54,22 +50,20 @@ public class TemporaryResources implements Closeable { private Path tempFileDir = null; /** - * Sets the directory to be used for the temporary files created by - * the {@link #createTempFile(String)} method. + * Sets the directory to be used for the temporary files created by the + * {@link #createTempFile(String)} method. * - * @param tempFileDir temporary file directory, - * or null for the system default + * @param tempFileDir temporary file directory, or null for the system default */ public void setTemporaryFileDirectory(Path tempFileDir) { this.tempFileDir = tempFileDir; } /** - * Sets the directory to be used for the temporary files created by - * the {@link #createTempFile(String)} method. + * Sets the directory to be used for the temporary files created by the + * {@link #createTempFile(String)} method. * - * @param tempFileDir temporary file directory, - * or null for the system default + * @param tempFileDir temporary file directory, or null for the system default * @see #setTemporaryFileDirectory(Path) */ public void setTemporaryFileDirectory(File tempFileDir) { @@ -77,8 +71,9 @@ public void setTemporaryFileDirectory(File tempFileDir) { } /** - * Creates a temporary file that will automatically be deleted when - * the {@link #close()} method is called, returning its path. + * Creates a temporary file that will automatically be deleted when the {@link #close()} method + * is called, returning its path. + * * @param suffix -- the suffix of the file if known, starting with "." as in ".pdf" * @return Path to created temporary file that will be deleted after closing * @throws IOException @@ -86,8 +81,8 @@ public void setTemporaryFileDirectory(File tempFileDir) { public Path createTempFile(String suffix) throws IOException { String actualSuffix = StringUtils.isBlank(suffix) ? ".tmp" : suffix; - final Path path = tempFileDir == null ? 
Files.createTempFile("apache-tika-", actualSuffix) : - Files.createTempFile(tempFileDir, "apache-tika-", actualSuffix); + final Path path = tempFileDir == null ? Files.createTempFile("apache-tika-", actualSuffix) + : Files.createTempFile(tempFileDir, "apache-tika-", actualSuffix); addResource(() -> { try { Files.delete(path); @@ -105,8 +100,8 @@ public Path createTempFile() throws IOException { } /** - * Creates a temporary file that will automatically be deleted when - * the {@link #close()} method is called, returning its path. + * Creates a temporary file that will automatically be deleted when the {@link #close()} method + * is called, returning its path. * * @return Path to created temporary file that will be deleted after closing * @throws IOException @@ -118,9 +113,10 @@ public Path createTempFile(Metadata metadata) throws IOException { } return createTempFile(FilenameUtils.getSuffixFromPath(resourceName)); } + /** - * Creates and returns a temporary file that will automatically be - * deleted when the {@link #close()} method is called. + * Creates and returns a temporary file that will automatically be deleted when the + * {@link #close()} method is called. * * @return Created temporary file that'll be deleted after closing * @throws IOException @@ -131,8 +127,8 @@ public File createTemporaryFile() throws IOException { } /** - * Adds a new resource to the set of tracked resources that will all be - * closed when the {@link #close()} method is called. + * Adds a new resource to the set of tracked resources that will all be closed when the + * {@link #close()} method is called. * * @param resource resource to be tracked */ @@ -141,8 +137,8 @@ public void addResource(Closeable resource) { } /** - * Returns the latest of the tracked resources that implements or - * extends the given interface or class. + * Returns the latest of the tracked resources that implements or extends the given interface or + * class. 
* * @param klass interface or class * @return matching resource, or null if not found @@ -158,15 +154,13 @@ public T getResource(Class klass) { } /** - * Closes all tracked resources. The resources are closed in reverse order - * from how they were added. + * Closes all tracked resources. The resources are closed in reverse order from how they were + * added. *

- * Any suppressed exceptions from managed resources are collected and - * then added to the first thrown exception, which is re-thrown once - * all the resources have been closed. + * Any suppressed exceptions from managed resources are collected and then added to the first + * thrown exception, which is re-thrown once all the resources have been closed. * - * @throws IOException if one or more of the tracked resources - * could not be closed + * @throws IOException if one or more of the tracked resources could not be closed */ public void close() throws IOException { // Release all resources and keep track of any exceptions @@ -191,12 +185,10 @@ public void close() throws IOException { } /** - * Calls the {@link #close()} method and wraps the potential - * {@link IOException} into a {@link TikaException} for convenience - * when used within Tika. + * Calls the {@link #close()} method and wraps the potential {@link IOException} into a + * {@link TikaException} for convenience when used within Tika. * - * @throws TikaException if one or more of the tracked resources - * could not be closed + * @throws TikaException if one or more of the tracked resources could not be closed */ public void dispose() throws TikaException { try { diff --git a/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java b/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java index 254bff7ba2..7f461cdaf0 100644 --- a/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java +++ b/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.io; @@ -35,40 +33,33 @@ import java.nio.file.Paths; import java.sql.Blob; import java.sql.SQLException; - import org.apache.commons.io.input.TaggedInputStream; import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream; - import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.parser.Parser; import org.apache.tika.utils.StringUtils; /** - * Input stream with extended capabilities. The purpose of this class is - * to allow files and other resources and information to be associated with - * the {@link InputStream} instance passed through the - * {@link org.apache.tika.parser.Parser} interface and other similar APIs. + * Input stream with extended capabilities. 
The purpose of this class is to allow files and other + * resources and information to be associated with the {@link InputStream} instance passed through + * the {@link org.apache.tika.parser.Parser} interface and other similar APIs. *

- * TikaInputStream instances can be created using the various static - * get() factory methods. Most of these methods take an optional - * {@link Metadata} argument that is then filled with the available input - * metadata from the given resource. The created TikaInputStream instance - * keeps track of the original resource used to create it, while behaving - * otherwise just like a normal, buffered {@link InputStream}. - * A TikaInputStream instance is also guaranteed to support the - * {@link #mark(int)} feature. + * TikaInputStream instances can be created using the various static get() factory + * methods. Most of these methods take an optional {@link Metadata} argument that is then filled + * with the available input metadata from the given resource. The created TikaInputStream instance + * keeps track of the original resource used to create it, while behaving otherwise just like a + * normal, buffered {@link InputStream}. A TikaInputStream instance is also guaranteed to support + * the {@link #mark(int)} feature. *

- * Code that wants to access the underlying file or other resources - * associated with a TikaInputStream should first use the - * {@link #get(InputStream)} factory method to cast or wrap a given - * {@link InputStream} into a TikaInputStream instance. + * Code that wants to access the underlying file or other resources associated with a + * TikaInputStream should first use the {@link #get(InputStream)} factory method to cast or wrap a + * given {@link InputStream} into a TikaInputStream instance. *

- * TikaInputStream includes a few safety features to protect against parsers - * that may fail to check for an EOF or may incorrectly rely on the unreliable - * value returned from {@link FileInputStream#skip}. These parser failures - * can lead to infinite loops. We strongly encourage the use of - * TikaInputStream. + * TikaInputStream includes a few safety features to protect against parsers that may fail to check + * for an EOF or may incorrectly rely on the unreliable value returned from + * {@link FileInputStream#skip}. These parser failures can lead to infinite loops. We strongly + * encourage the use of TikaInputStream. * * @since Apache Tika 0.8 */ @@ -76,9 +67,8 @@ public class TikaInputStream extends TaggedInputStream { private static final int MAX_CONSECUTIVE_EOFS = 1000; /** - * Blob size threshold that limits the largest BLOB size to be - * buffered fully in memory by the {@link #get(Blob, Metadata)} - * method. + * Blob size threshold that limits the largest BLOB size to be buffered fully in memory by the + * {@link #get(Blob, Metadata)} method. */ private static final int BLOB_SIZE_THRESHOLD = 1024 * 1024; /** @@ -86,16 +76,15 @@ public class TikaInputStream extends TaggedInputStream { */ private final TemporaryResources tmp; /** - * The Factory that can create fresh {@link InputStream}s for - * the resource this reads for, eg when needing to re-read. + * The Factory that can create fresh {@link InputStream}s for the resource this reads for, eg + * when needing to re-read. */ private InputStreamFactory streamFactory; /** - * The path to the file that contains the contents of this stream. - * This is either the original file passed to the - * {@link #TikaInputStream(Path)} constructor or a temporary file created - * by a call to the {@link #getPath()} method. If neither has been called, - * then the value is null. + * The path to the file that contains the contents of this stream. 
This is either the original + * file passed to the {@link #TikaInputStream(Path)} constructor or a temporary file created by + * a call to the {@link #getPath()} method. If neither has been called, then the value is + * null. */ private Path path; /** @@ -111,9 +100,8 @@ public class TikaInputStream extends TaggedInputStream { */ private long mark = -1; /** - * A opened container, such as a POIFS FileSystem - * for an OLE2 document, or a Zip file for a - * zip based (eg ooxml, odf) document. + * A opened container, such as a POIFS FileSystem for an OLE2 document, or a Zip file for a zip + * based (eg ooxml, odf) document. */ private Object openContainer; private int consecutiveEOFs = 0; @@ -124,13 +112,13 @@ public class TikaInputStream extends TaggedInputStream { */ private int closeShieldDepth = 0; - //suffix of the file if known. This is used to create temp files - //with the right suffixes. This should include the initial . as in ".doc" + // suffix of the file if known. This is used to create temp files + // with the right suffixes. This should include the initial . as in ".doc" private String suffix = null; /** - * Creates a TikaInputStream instance. This private constructor is used - * by the static factory methods based on the available information. + * Creates a TikaInputStream instance. This private constructor is used by the static factory + * methods based on the available information. * * @param path the path to the file that contains the stream * @throws IOException if an I/O error occurs @@ -152,8 +140,8 @@ private TikaInputStream(Path path, TemporaryResources tmp, long length) throws I } /** - * Creates a TikaInputStream instance. This private constructor is used - * by the static factory methods based on the available information. + * Creates a TikaInputStream instance. This private constructor is used by the static factory + * methods based on the available information. 
* * @param file the file that contains the stream * @throws FileNotFoundException if the file does not exist @@ -170,18 +158,18 @@ private TikaInputStream(File file) throws FileNotFoundException { } /** - * Creates a TikaInputStream instance. This private constructor is used - * by the static factory methods based on the available information. + * Creates a TikaInputStream instance. This private constructor is used by the static factory + * methods based on the available information. *

- * The given stream needs to be included in the given temporary resource - * collection if the caller wants it also to get closed when the - * {@link #close()} method is invoked. + * The given stream needs to be included in the given temporary resource collection if the + * caller wants it also to get closed when the {@link #close()} method is invoked. * * @param stream buffered stream (must support the mark feature) - * @param tmp tracker for temporary resources associated with this stream + * @param tmp tracker for temporary resources associated with this stream * @param length total length of the stream, or -1 if unknown */ - private TikaInputStream(InputStream stream, TemporaryResources tmp, long length, String suffix) { + private TikaInputStream(InputStream stream, TemporaryResources tmp, long length, + String suffix) { super(stream); this.path = null; this.tmp = tmp; @@ -190,29 +178,27 @@ private TikaInputStream(InputStream stream, TemporaryResources tmp, long length, } /** - * Checks whether the given stream is a TikaInputStream instance. - * The given stream can be null, in which case the return - * value is false. + * Checks whether the given stream is a TikaInputStream instance. The given stream can be + * null, in which case the return value is false. * * @param stream input stream, possibly null - * @return true if the stream is a TikaInputStream instance, - * false otherwise + * @return true if the stream is a TikaInputStream instance, false + * otherwise */ public static boolean isTikaInputStream(InputStream stream) { return stream instanceof TikaInputStream; } /** - * Casts or wraps the given stream to a TikaInputStream instance. - * This method can be used to access the functionality of this class - * even when given just a normal input stream instance. + * Casts or wraps the given stream to a TikaInputStream instance. This method can be used to + * access the functionality of this class even when given just a normal input stream instance. *

- * The given temporary file provider is used for any temporary files, - * and should be disposed when the returned stream is no longer used. + * The given temporary file provider is used for any temporary files, and should be disposed + * when the returned stream is no longer used. *

- * Use this method instead of the {@link #get(InputStream)} alternative - * when you don't explicitly close the returned stream. The - * recommended access pattern is: + * Use this method instead of the {@link #get(InputStream)} alternative when you don't + * explicitly close the returned stream. The recommended access pattern is: + * *

      * try (TemporaryResources tmp = new TemporaryResources()) {
      *     TikaInputStream stream = TikaInputStream.get(..., tmp);
@@ -221,15 +207,15 @@ public static boolean isTikaInputStream(InputStream stream) {
      * 
*

* The given stream instance will not be closed when the - * {@link TemporaryResources#close()} method is called by the - * try-with-resources statement. The caller is expected to explicitly - * close the original stream when it's no longer used. + * {@link TemporaryResources#close()} method is called by the try-with-resources statement. The + * caller is expected to explicitly close the original stream when it's no longer used. * * @param stream normal input stream * @return a TikaInputStream instance * @since Apache Tika 0.10 */ - public static TikaInputStream get(InputStream stream, TemporaryResources tmp, Metadata metadata) { + public static TikaInputStream get(InputStream stream, TemporaryResources tmp, + Metadata metadata) { if (stream == null) { throw new NullPointerException("The Stream must not be null"); } @@ -246,9 +232,8 @@ public static TikaInputStream get(InputStream stream, TemporaryResources tmp, Me } /** - * Use this if there is no actual underlying InputStream. It is important - * to set a length so that the zip bomb detector won't be triggered - * in the SecurityHandler. + * Use this if there is no actual underlying InputStream. It is important to set a length so + * that the zip bomb detector won't be triggered in the SecurityHandler. *

* If your stream has underlying bytes and a length, see {@link #setOpenContainer(Object)} * @@ -257,33 +242,33 @@ public static TikaInputStream get(InputStream stream, TemporaryResources tmp, Me * @param metadata * @return */ - public static TikaInputStream getFromContainer(Object openContainer, long length, Metadata metadata) { + public static TikaInputStream getFromContainer(Object openContainer, long length, + Metadata metadata) { TikaInputStream tis = TikaInputStream.get(new byte[0], metadata); tis.setOpenContainer(openContainer); - //this overwrites the length that was set in the constructor above + // this overwrites the length that was set in the constructor above tis.setLength(length); metadata.set(Metadata.CONTENT_LENGTH, Long.toString(length)); return tis; } /** - * Casts or wraps the given stream to a TikaInputStream instance. - * This method can be used to access the functionality of this class - * even when given just a normal input stream instance. + * Casts or wraps the given stream to a TikaInputStream instance. This method can be used to + * access the functionality of this class even when given just a normal input stream instance. *

- * Use this method instead of the - * {@link #get(InputStream, TemporaryResources, Metadata)} alternative when you - * do explicitly close the returned stream. The recommended - * access pattern is: + * Use this method instead of the {@link #get(InputStream, TemporaryResources, Metadata)} + * alternative when you do explicitly close the returned stream. The recommended access + * pattern is: + * *

      * try (TikaInputStream stream = TikaInputStream.get(...)) {
      *     // process stream
      * }
      * 
*

- * The given stream instance will be closed along with any other resources - * associated with the returned TikaInputStream instance when the - * {@link #close()} method is called by the try-with-resources statement. + * The given stream instance will be closed along with any other resources associated with the + * returned TikaInputStream instance when the {@link #close()} method is called by the + * try-with-resources statement. * * @param stream normal input stream * @return a TikaInputStream instance @@ -293,23 +278,22 @@ public static TikaInputStream get(InputStream stream) { } /** - * Casts or wraps the given stream to a TikaInputStream instance. - * This method can be used to access the functionality of this class - * even when given just a normal input stream instance. + * Casts or wraps the given stream to a TikaInputStream instance. This method can be used to + * access the functionality of this class even when given just a normal input stream instance. *

- * Use this method instead of the - * {@link #get(InputStream, TemporaryResources, Metadata)} alternative when you - * do explicitly close the returned stream. The recommended - * access pattern is: + * Use this method instead of the {@link #get(InputStream, TemporaryResources, Metadata)} + * alternative when you do explicitly close the returned stream. The recommended access + * pattern is: + * *

      * try (TikaInputStream stream = TikaInputStream.get(...)) {
      *     // process stream
      * }
      * 
*

- * The given stream instance will be closed along with any other resources - * associated with the returned TikaInputStream instance when the - * {@link #close()} method is called by the try-with-resources statement. + * The given stream instance will be closed along with any other resources associated with the + * returned TikaInputStream instance when the {@link #close()} method is called by the + * try-with-resources statement. * * @param stream normal input stream * @return a TikaInputStream instance @@ -319,8 +303,8 @@ public static TikaInputStream get(InputStream stream, Metadata metadata) { } /** - * Returns the given stream casts to a TikaInputStream, or - * null if the stream is not a TikaInputStream. + * Returns the given stream casts to a TikaInputStream, or null if the stream is + * not a TikaInputStream. * * @param stream normal input stream * @return a TikaInputStream instance @@ -337,8 +321,8 @@ public static TikaInputStream cast(InputStream stream) { /** * Creates a TikaInputStream from the given array of bytes. *

- * Note that you must always explicitly close the returned stream as in - * some cases it may end up writing the given data to a temporary file. + * Note that you must always explicitly close the returned stream as in some cases it may end up + * writing the given data to a temporary file. * * @param data input data * @return a TikaInputStream instance @@ -348,27 +332,27 @@ public static TikaInputStream get(byte[] data) { } /** - * Creates a TikaInputStream from the given array of bytes. The length of - * the array is stored as input metadata in the given metadata instance. + * Creates a TikaInputStream from the given array of bytes. The length of the array is stored as + * input metadata in the given metadata instance. *

- * Note that you must always explicitly close the returned stream as in - * some cases it may end up writing the given data to a temporary file. + * Note that you must always explicitly close the returned stream as in some cases it may end up + * writing the given data to a temporary file. * - * @param data input data + * @param data input data * @param metadata metadata instance * @return a TikaInputStream instance */ public static TikaInputStream get(byte[] data, Metadata metadata) { metadata.set(Metadata.CONTENT_LENGTH, Integer.toString(data.length)); - return new TikaInputStream(new UnsynchronizedByteArrayInputStream(data), new TemporaryResources(), - data.length, getExtension(metadata)); + return new TikaInputStream(new UnsynchronizedByteArrayInputStream(data), + new TemporaryResources(), data.length, getExtension(metadata)); } /** * Creates a TikaInputStream from the file at the given path. *

- * Note that you must always explicitly close the returned stream to - * prevent leaking open file handles. + * Note that you must always explicitly close the returned stream to prevent leaking open file + * handles. * * @param path input file * @return a TikaInputStream instance @@ -379,16 +363,16 @@ public static TikaInputStream get(Path path) throws IOException { } /** - * Creates a TikaInputStream from the file at the given path. The file name - * and length are stored as input metadata in the given metadata instance. + * Creates a TikaInputStream from the file at the given path. The file name and length are + * stored as input metadata in the given metadata instance. *

- * If there's an {@link TikaCoreProperties#RESOURCE_NAME_KEY} in the - * metadata object, this will not overwrite that value with the path's name. + * If there's an {@link TikaCoreProperties#RESOURCE_NAME_KEY} in the metadata object, this will + * not overwrite that value with the path's name. *

- * Note that you must always explicitly close the returned stream to - * prevent leaking open file handles. + * Note that you must always explicitly close the returned stream to prevent leaking open file + * handles. * - * @param path input file + * @param path input file * @param metadata metadata instance * @return a TikaInputStream instance * @throws IOException if an I/O error occurs @@ -402,7 +386,7 @@ public static TikaInputStream get(Path path, Metadata metadata) throws IOExcepti } public static TikaInputStream get(Path path, Metadata metadata, TemporaryResources tmp) - throws IOException { + throws IOException { long length = Files.size(path); if (StringUtils.isBlank(metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY))) { metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, path.getFileName().toString()); @@ -414,14 +398,14 @@ public static TikaInputStream get(Path path, Metadata metadata, TemporaryResourc /** * Creates a TikaInputStream from the given file. *

- * Note that you must always explicitly close the returned stream to - * prevent leaking open file handles. + * Note that you must always explicitly close the returned stream to prevent leaking open file + * handles. * * @param file input file * @return a TikaInputStream instance * @throws FileNotFoundException if the file does not exist - * @deprecated use {@link #get(Path)}. In Tika 2.0, this will be removed - * or modified to throw an IOException. + * @deprecated use {@link #get(Path)}. In Tika 2.0, this will be removed or modified to throw an + * IOException. */ @Deprecated public static TikaInputStream get(File file) throws FileNotFoundException { @@ -429,19 +413,18 @@ public static TikaInputStream get(File file) throws FileNotFoundException { } /** - * Creates a TikaInputStream from the given file. The file name and - * length are stored as input metadata in the given metadata instance. + * Creates a TikaInputStream from the given file. The file name and length are stored as input + * metadata in the given metadata instance. *

- * Note that you must always explicitly close the returned stream to - * prevent leaking open file handles. + * Note that you must always explicitly close the returned stream to prevent leaking open file + * handles. * - * @param file input file + * @param file input file * @param metadata metadata instance * @return a TikaInputStream instance - * @throws FileNotFoundException if the file does not exist - * or cannot be opened for reading - * @deprecated use {@link #get(Path, Metadata)}. In Tika 2.0, - * this will be removed or modified to throw an IOException. + * @throws FileNotFoundException if the file does not exist or cannot be opened for reading + * @deprecated use {@link #get(Path, Metadata)}. In Tika 2.0, this will be removed or modified + * to throw an IOException. */ @Deprecated public static TikaInputStream get(File file, Metadata metadata) throws FileNotFoundException { @@ -453,11 +436,11 @@ public static TikaInputStream get(File file, Metadata metadata) throws FileNotFo } /** - * Creates a TikaInputStream from a Factory which can create - * fresh {@link InputStream}s for the same resource multiple times. - *

This is typically desired when working with {@link Parser}s that - * need to re-read the stream multiple times, where other forms - * of buffering (eg File) are slower than just getting a fresh + * Creates a TikaInputStream from a Factory which can create fresh {@link InputStream}s for the + * same resource multiple times. + *

+ * This is typically desired when working with {@link Parser}s that need to re-read the stream + * multiple times, where other forms of buffering (eg File) are slower than just getting a fresh * new stream each time. */ public static TikaInputStream get(InputStreamFactory factory) throws IOException { @@ -465,15 +448,15 @@ public static TikaInputStream get(InputStreamFactory factory) throws IOException } /** - * Creates a TikaInputStream from a Factory which can create - * fresh {@link InputStream}s for the same resource multiple times. - *

This is typically desired when working with {@link Parser}s that - * need to re-read the stream multiple times, where other forms - * of buffering (eg File) are slower than just getting a fresh + * Creates a TikaInputStream from a Factory which can create fresh {@link InputStream}s for the + * same resource multiple times. + *

+ * This is typically desired when working with {@link Parser}s that need to re-read the stream + * multiple times, where other forms of buffering (eg File) are slower than just getting a fresh * new stream each time. */ public static TikaInputStream get(InputStreamFactory factory, TemporaryResources tmp) - throws IOException { + throws IOException { TikaInputStream stream = get(factory.getInputStream(), tmp, null); stream.streamFactory = factory; return stream; @@ -482,10 +465,9 @@ public static TikaInputStream get(InputStreamFactory factory, TemporaryResources /** * Creates a TikaInputStream from the given database BLOB. *

- * Note that the result set containing the BLOB may need to be kept open - * until the returned TikaInputStream has been processed and closed. - * You must also always explicitly close the returned stream as in - * some cases it may end up writing the blob data to a temporary file. + * Note that the result set containing the BLOB may need to be kept open until the returned + * TikaInputStream has been processed and closed. You must also always explicitly close the + * returned stream as in some cases it may end up writing the blob data to a temporary file. * * @param blob database BLOB * @return a TikaInputStream instance @@ -496,16 +478,14 @@ public static TikaInputStream get(Blob blob) throws SQLException { } /** - * Creates a TikaInputStream from the given database BLOB. The BLOB - * length (if available) is stored as input metadata in the given - * metadata instance. + * Creates a TikaInputStream from the given database BLOB. The BLOB length (if available) is + * stored as input metadata in the given metadata instance. *

- * Note that the result set containing the BLOB may need to be kept open - * until the returned TikaInputStream has been processed and closed. - * You must also always explicitly close the returned stream as in - * some cases it may end up writing the blob data to a temporary file. + * Note that the result set containing the BLOB may need to be kept open until the returned + * TikaInputStream has been processed and closed. You must also always explicitly close the + * returned stream as in some cases it may end up writing the blob data to a temporary file. * - * @param blob database BLOB + * @param blob database BLOB * @param metadata metadata instance * @return a TikaInputStream instance * @throws SQLException if BLOB data can not be accessed @@ -526,8 +506,7 @@ public static TikaInputStream get(Blob blob, Metadata metadata) throws SQLExcept return get(blob.getBytes(1, (int) length), metadata); } else { return new TikaInputStream(new BufferedInputStream(blob.getBinaryStream()), - new TemporaryResources(), length, - getExtension(metadata)); + new TemporaryResources(), length, getExtension(metadata)); } } @@ -542,8 +521,8 @@ private static String getExtension(Metadata metadata) { /** * Creates a TikaInputStream from the resource at the given URI. *

- * Note that you must always explicitly close the returned stream as in - * some cases it may end up writing the resource to a temporary file. + * Note that you must always explicitly close the returned stream as in some cases it may end up + * writing the resource to a temporary file. * * @param uri resource URI * @return a TikaInputStream instance @@ -554,13 +533,13 @@ public static TikaInputStream get(URI uri) throws IOException { } /** - * Creates a TikaInputStream from the resource at the given URI. The - * available input metadata is stored in the given metadata instance. + * Creates a TikaInputStream from the resource at the given URI. The available input metadata is + * stored in the given metadata instance. *

- * Note that you must always explicitly close the returned stream as in - * some cases it may end up writing the resource to a temporary file. + * Note that you must always explicitly close the returned stream as in some cases it may end up + * writing the resource to a temporary file. * - * @param uri resource URI + * @param uri resource URI * @param metadata metadata instance * @return a TikaInputStream instance * @throws IOException if the resource can not be accessed @@ -580,8 +559,8 @@ public static TikaInputStream get(URI uri, Metadata metadata) throws IOException /** * Creates a TikaInputStream from the resource at the given URL. *

- * Note that you must always explicitly close the returned stream as in - * some cases it may end up writing the resource to a temporary file. + * Note that you must always explicitly close the returned stream as in some cases it may end up + * writing the resource to a temporary file. * * @param url resource URL * @return a TikaInputStream instance @@ -592,13 +571,13 @@ public static TikaInputStream get(URL url) throws IOException { } /** - * Creates a TikaInputStream from the resource at the given URL. The - * available input metadata is stored in the given metadata instance. + * Creates a TikaInputStream from the resource at the given URL. The available input metadata is + * stored in the given metadata instance. *

- * Note that you must always explicitly close the returned stream as in - * some cases it may end up writing the resource to a temporary file. + * Note that you must always explicitly close the returned stream as in some cases it may end up + * writing the resource to a temporary file. * - * @param url resource URL + * @param url resource URL * @param metadata metadata instance * @return a TikaInputStream instance * @throws IOException if the resource can not be accessed @@ -640,14 +619,13 @@ public static TikaInputStream get(URL url, Metadata metadata) throws IOException } return new TikaInputStream(new BufferedInputStream(connection.getInputStream()), - new TemporaryResources(), length, getExtension(metadata)); + new TemporaryResources(), length, getExtension(metadata)); } /** - * Fills the given buffer with upcoming bytes from this stream without - * advancing the current stream position. The buffer is filled up unless - * the end of stream is encountered before that. This method will block - * if not enough bytes are immediately available. + * Fills the given buffer with upcoming bytes from this stream without advancing the current + * stream position. The buffer is filled up unless the end of stream is encountered before that. + * This method will block if not enough bytes are immediately available. * * @param buffer byte buffer * @return number of bytes written to the buffer @@ -674,9 +652,8 @@ public int peek(byte[] buffer) throws IOException { } /** - * Returns the open container object if any, such as a - * POIFS FileSystem in the event of an OLE2 document - * being detected and processed by the OLE2 detector. + * Returns the open container object if any, such as a POIFS FileSystem in the event of an OLE2 + * document being detected and processed by the OLE2 detector. 
* * @return Open Container for this stream, or null if none */ @@ -685,14 +662,12 @@ public Object getOpenContainer() { } /** - * Stores the open container object against - * the stream, eg after a Zip contents - * detector has loaded the file to decide - * what it contains. + * Stores the open container object against the stream, eg after a Zip contents detector has + * loaded the file to decide what it contains. *

* If there's no undelrying stream, consider {@link #getFromContainer(Object, long, Metadata)} - * because that will avoid potential improper zip bomb exceptions from the SecurityHandler if - * it thinks the length of the stream == 0. + * because that will avoid potential improper zip bomb exceptions from the SecurityHandler if it + * thinks the length of the stream == 0. */ public void setOpenContainer(Object container) { openContainer = container; @@ -714,8 +689,8 @@ public boolean hasInputStreamFactory() { } /** - * If the Stream was created from an {@link InputStreamFactory}, - * return that, otherwise null. + * If the Stream was created from an {@link InputStreamFactory}, return that, otherwise + * null. */ public InputStreamFactory getInputStreamFactory() { return streamFactory; @@ -727,10 +702,9 @@ public boolean hasFile() { /** - * If the user created this TikaInputStream with a file, - * the original file will be returned. If not, the entire stream - * will be spooled to a temporary file which will be deleted - * upon the close of this TikaInputStream + * If the user created this TikaInputStream with a file, the original file will be returned. If + * not, the entire stream will be spooled to a temporary file which will be deleted upon the + * close of this TikaInputStream * * @return * @throws IOException @@ -740,11 +714,11 @@ public Path getPath() throws IOException { } /** - * @param maxBytes if this is less than 0 and if an underlying file doesn't already exist, - * the full file will be spooled to disk - * @return the original path used in the initialization of this TikaInputStream, - * a temporary file if the stream was shorter than maxBytes, or null - * if the underlying stream was longer than maxBytes. 
+ * @param maxBytes if this is less than 0 and if an underlying file doesn't already exist, the + * full file will be spooled to disk + * @return the original path used in the initialization of this TikaInputStream, a temporary + * file if the stream was shorter than maxBytes, or null if + * the underlying stream was longer than maxBytes. * @throws IOException */ public Path getPath(int maxBytes) throws IOException { @@ -756,12 +730,13 @@ public Path getPath(int maxBytes) throws IOException { } else { Path tmpFile = tmp.createTempFile(suffix); if (maxBytes > -1) { - try (BoundedInputStream boundedInputStream = new BoundedInputStream(maxBytes, this)) { + try (BoundedInputStream boundedInputStream = + new BoundedInputStream(maxBytes, this)) { boundedInputStream.mark(maxBytes); try { Files.copy(boundedInputStream, tmpFile, REPLACE_EXISTING); if (boundedInputStream.hasHitBound()) { - //tmpFile will be cleaned up when this TikaInputStream is closed + // tmpFile will be cleaned up when this TikaInputStream is closed return null; } } finally { @@ -772,7 +747,7 @@ public Path getPath(int maxBytes) throws IOException { // Spool the entire stream into a temporary file Files.copy(this, tmpFile, REPLACE_EXISTING); } - //successful so far, set tis' path to tmpFile + // successful so far, set tis' path to tmpFile path = tmpFile; // Create a new input stream and make sure it'll get closed @@ -794,8 +769,8 @@ public void close() throws IOException { // Update length to file size. Update position, mark long sz = Files.size(path); if (getOpenContainer() != null && sz == 0 && length > -1) { - //don't update size if there's an open container and the sz == 0 - //hope that the length was sent in earlier via getFromContainer + // don't update size if there's an open container and the sz == 0 + // hope that the length was sent in earlier via getFromContainer } else { length = sz; } @@ -824,11 +799,10 @@ public boolean hasLength() { } /** - * Returns the length (in bytes) of this stream. 
Note that if the length - * was not available when this stream was instantiated, then this method - * will use the {@link #getPath()} method to buffer the entire stream to - * a temporary file in order to calculate the stream length. This case - * will only work if the stream has not yet been consumed. + * Returns the length (in bytes) of this stream. Note that if the length was not available when + * this stream was instantiated, then this method will use the {@link #getPath()} method to + * buffer the entire stream to a temporary file in order to calculate the stream length. This + * case will only work if the stream has not yet been consumed. * * @return stream length * @throws IOException if the length can not be determined @@ -850,8 +824,8 @@ public long getPosition() { } /** - * This should only be called by the constructor for an open container with a 0 length - * byte inputStream + * This should only be called by the constructor for an open container with a 0 length byte + * inputStream * * @param length */ @@ -860,19 +834,19 @@ private void setLength(long length) { } /** - * This relies on {@link IOUtils#skip(InputStream, long, byte[])} to ensure - * that the alleged bytes skipped were actually skipped. + * This relies on {@link IOUtils#skip(InputStream, long, byte[])} to ensure that the alleged + * bytes skipped were actually skipped. * * @param ln the number of bytes to skip * @return the number of bytes skipped - * @throws IOException if the number of bytes requested to be skipped does not match the - * number of bytes skipped or if there's an IOException during the read. + * @throws IOException if the number of bytes requested to be skipped does not match the number + * of bytes skipped or if there's an IOException during the read. 
*/ @Override public long skip(long ln) throws IOException { - //On TIKA-3092, we found that using the static byte array buffer - //caused problems with multithreading with the FlateInputStream - //from a POIFS document stream + // On TIKA-3092, we found that using the static byte array buffer + // caused problems with multithreading with the FlateInputStream + // from a POIFS document stream if (skipBuffer == null) { skipBuffer = new byte[4096]; } @@ -924,9 +898,9 @@ protected void afterRead(int n) throws IOException { } else { consecutiveEOFs++; if (consecutiveEOFs > MAX_CONSECUTIVE_EOFS) { - throw new IOException("Read too many -1 (EOFs); there could be an infinite loop." + - "If you think your file is not corrupt, please open an issue on Tika's " + - "JIRA"); + throw new IOException("Read too many -1 (EOFs); there could be an infinite loop." + + "If you think your file is not corrupt, please open an issue on Tika's " + + "JIRA"); } } } @@ -942,6 +916,7 @@ public void removeCloseShield() { public boolean isCloseShield() { return closeShieldDepth > 0; } + @Override public String toString() { String str = "TikaInputStream of "; diff --git a/tika-core/src/main/java/org/apache/tika/io/package-info.java b/tika-core/src/main/java/org/apache/tika/io/package-info.java index 36c7274da5..d88b7f4b94 100644 --- a/tika-core/src/main/java/org/apache/tika/io/package-info.java +++ b/tika-core/src/main/java/org/apache/tika/io/package-info.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ /** diff --git a/tika-core/src/main/java/org/apache/tika/language/detect/LanguageConfidence.java b/tika-core/src/main/java/org/apache/tika/language/detect/LanguageConfidence.java index e5b520ffc1..7b51fda92f 100644 --- a/tika-core/src/main/java/org/apache/tika/language/detect/LanguageConfidence.java +++ b/tika-core/src/main/java/org/apache/tika/language/detect/LanguageConfidence.java @@ -1,22 +1,20 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.language.detect; public enum LanguageConfidence { - HIGH, MEDIUM, LOW, NONE // Special value when no language is detected + HIGH, MEDIUM, LOW, NONE // Special value when no language is detected } diff --git a/tika-core/src/main/java/org/apache/tika/language/detect/LanguageDetector.java b/tika-core/src/main/java/org/apache/tika/language/detect/LanguageDetector.java index 722ded343b..cc8ed0cd91 100644 --- a/tika-core/src/main/java/org/apache/tika/language/detect/LanguageDetector.java +++ b/tika-core/src/main/java/org/apache/tika/language/detect/LanguageDetector.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.language.detect; @@ -20,7 +18,6 @@ import java.util.List; import java.util.Map; import java.util.Set; - import org.apache.tika.config.ServiceLoader; import org.apache.tika.utils.CompareUtils; @@ -49,8 +46,8 @@ public abstract class LanguageDetector { private static final ServiceLoader DEFAULT_SERVICE_LOADER = new ServiceLoader(); - //if a user calls detect on a huge string, break it into this size - //and add sequentially until hasEnoughText() is true + // if a user calls detect on a huge string, break it into this size + // and add sequentially until hasEnoughText() is true private static final int BUFFER_LENGTH = 4096; // True if text is expected to be a mix of languages, and thus higher-resolution @@ -76,7 +73,7 @@ public static List getLanguageDetectors() { public static List getLanguageDetectors(ServiceLoader loader) { List detectors = - loader.loadStaticServiceProviders(LanguageDetector.class); + loader.loadStaticServiceProviders(LanguageDetector.class); detectors.sort(CompareUtils::compareClassName); return detectors; } @@ -100,20 +97,18 @@ public LanguageDetector setShortText(boolean shortText) { } /** - * Load (or re-load) all available language models. This must - * be called after any settings that would impact the models - * being loaded (e.g. mixed language/short text), but - * before any of the document processing routines (below) - * are called. Note that it only needs to be called once. + * Load (or re-load) all available language models. This must be called after any settings that + * would impact the models being loaded (e.g. mixed language/short text), but before any of the + * document processing routines (below) are called. Note that it only needs to be called once. * * @return this */ public abstract LanguageDetector loadModels() throws IOException; /** - * Load (or re-load) the models specified in . These use the - * ISO 639-1 names, with an optional "-" for more - * specific specification (e.g. 
"zh-CN" for Chinese in China). + * Load (or re-load) the models specified in . These use the ISO 639-1 names, with an + * optional "-" for more specific specification (e.g. "zh-CN" for Chinese in + * China). * * @param languages list of target languages. * @return this @@ -121,8 +116,7 @@ public LanguageDetector setShortText(boolean shortText) { public abstract LanguageDetector loadModels(Set languages) throws IOException; /** - * Provide information about whether a model exists for a specific - * language. + * Provide information about whether a model exists for a specific language. * * @param language ISO 639-1 name for language * @return true if a model for this language exists. @@ -130,20 +124,20 @@ public LanguageDetector setShortText(boolean shortText) { public abstract boolean hasModel(String language); /** - * Set the a-priori probabilities for these languages. The provided map uses the language - * as the key, and the probability (0.0 > probability < 1.0) of text being in that language. - * Note that if the probabilities don't sum to 1.0, these values will be normalized. + * Set the a-priori probabilities for these languages. The provided map uses the language as the + * key, and the probability (0.0 > probability < 1.0) of text being in that language. Note that + * if the probabilities don't sum to 1.0, these values will be normalized. *

* If hasModel() returns false for any of the languages, an IllegalArgumentException is thrown. *

- * Use of these probabilities is detector-specific, and thus might not impact the results at + * Use of these probabilities is detector-specific, and thus might not impact the results at * all. As such, these should be viewed as a hint. * * @param languageProbabilities Map from language to probability * @return this */ public abstract LanguageDetector setPriors(Map languageProbabilities) - throws IOException; + throws IOException; // ============================================================ // The routines below are called when processing a document @@ -155,20 +149,18 @@ public abstract LanguageDetector setPriors(Map languageProbabilit public abstract void reset(); /** - * Add statistics about this text for the current document. Note - * that we assume an implicit word break exists before/after - * each of these runs of text. + * Add statistics about this text for the current document. Note that we assume an implicit word + * break exists before/after each of these runs of text. * * @param cbuf Character buffer - * @param off Offset into cbuf to first character in the run of text - * @param len Number of characters in the run of text. + * @param off Offset into cbuf to first character in the run of text + * @param len Number of characters in the run of text. */ public abstract void addText(char[] cbuf, int off, int len); /** - * Add to the statistics being accumulated for the current - * document. Note that this is a default implementation for adding - * a string (not optimized) + * Add to the statistics being accumulated for the current document. Note that this is a + * default implementation for adding a string (not optimized) * * @param text Characters to add to current statistics. */ @@ -180,7 +172,7 @@ public void addText(CharSequence text) { return; } int start = 0; - while (! 
hasEnoughText() && start < len) { + while (!hasEnoughText() && start < len) { int end = Math.min(start + BUFFER_LENGTH, len); char[] chars = text.subSequence(start, end).toString().toCharArray(); addText(chars, 0, chars.length); @@ -191,11 +183,11 @@ public void addText(CharSequence text) { /** - * Tell the caller whether more text is required for the current document - * before the language can be reliably detected. + * Tell the caller whether more text is required for the current document before the language + * can be reliably detected. *

- * Implementations can override this to do early termination of stats - * collection, which can improve performance with longer documents. + * Implementations can override this to do early termination of stats collection, which can + * improve performance with longer documents. *

* Note that detect() can be called even when this returns false * @@ -208,9 +200,9 @@ public boolean hasEnoughText() { /** * Detect languages based on previously submitted text (via addText calls). * - * @return list of all possible languages with at least medium confidence, - * sorted by confidence from highest to lowest. There will always - * be at least one result, which might have a confidence of NONE. + * @return list of all possible languages with at least medium confidence, sorted by confidence + * from highest to lowest. There will always be at least one result, which might have a + * confidence of NONE. */ public abstract List detectAll(); @@ -223,8 +215,8 @@ public LanguageResult detect() { * Utility wrapper that detects the language of a given chunk of text. * * @param text String to add to current statistics. - * @return list of all possible languages with at least medium confidence, - * sorted by confidence from highest to lowest. + * @return list of all possible languages with at least medium confidence, sorted by confidence + * from highest to lowest. */ public List detectAll(String text) { reset(); diff --git a/tika-core/src/main/java/org/apache/tika/language/detect/LanguageHandler.java b/tika-core/src/main/java/org/apache/tika/language/detect/LanguageHandler.java index af3e1bd02c..1aa57f3d7f 100644 --- a/tika-core/src/main/java/org/apache/tika/language/detect/LanguageHandler.java +++ b/tika-core/src/main/java/org/apache/tika/language/detect/LanguageHandler.java @@ -1,28 +1,24 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.language.detect; import java.io.IOException; - import org.apache.tika.sax.WriteOutContentHandler; /** - * SAX content handler that updates a language detector based on all the - * received character content. + * SAX content handler that updates a language detector based on all the received character content. * * @since Apache Tika 0.10 */ @@ -45,9 +41,8 @@ public LanguageHandler(LanguageDetector detector) { } /** - * Returns the language detector used by this content handler. - * Note that the returned detector gets updated whenever new SAX events - * are received by this content handler. + * Returns the language detector used by this content handler. 
Note that the returned detector + * gets updated whenever new SAX events are received by this content handler. * * @return language detector */ diff --git a/tika-core/src/main/java/org/apache/tika/language/detect/LanguageNames.java b/tika-core/src/main/java/org/apache/tika/language/detect/LanguageNames.java index ed52640bef..c96b1cd9f3 100644 --- a/tika-core/src/main/java/org/apache/tika/language/detect/LanguageNames.java +++ b/tika-core/src/main/java/org/apache/tika/language/detect/LanguageNames.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
+ * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.language.detect; @@ -21,18 +19,16 @@ /** * Support for language tags (as defined by https://tools.ietf.org/html/bcp47) *

- * See https://en.wikipedia.org/wiki/List_of_ISO_639-3_codes for a list of - * three character language codes. + * See https://en.wikipedia.org/wiki/List_of_ISO_639-3_codes for a list of three character language + * codes. *

- * TODO change to LanguageTag, and use these vs. strings everywhere in the - * language detector API? + * TODO change to LanguageTag, and use these vs. strings everywhere in the language detector API? */ public class LanguageNames { public static String makeName(String language, String script, String region) { - Locale locale = - new Locale.Builder().setLanguage(language).setScript(script).setRegion(region) - .build(); + Locale locale = new Locale.Builder().setLanguage(language).setScript(script) + .setRegion(region).build(); return locale.toLanguageTag(); } @@ -74,7 +70,7 @@ public static boolean equals(String languageTagA, String languageTagB) { // e.g. zh-CN => zh-Hans-CN, zh-TW => zh-Hant-TW. // TODO Treat missing script == present script, if present script is default - // (suppressed) for the language. So "en-Latn" == "en" + // (suppressed) for the language. So "en-Latn" == "en" // TODO probably OK to ignore extensions diff --git a/tika-core/src/main/java/org/apache/tika/language/detect/LanguageResult.java b/tika-core/src/main/java/org/apache/tika/language/detect/LanguageResult.java index dada5fda17..1ec662dfdc 100644 --- a/tika-core/src/main/java/org/apache/tika/language/detect/LanguageResult.java +++ b/tika-core/src/main/java/org/apache/tika/language/detect/LanguageResult.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. 
The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.language.detect; @@ -67,16 +65,11 @@ public boolean isUnknown() { } /** - * Return true if the target language matches the detected language. We consider - * it a match if, for the precision requested or detected, it matches. This means: + * Return true if the target language matches the detected language. We consider it a match if, + * for the precision requested or detected, it matches. This means: *

- * target | detected | match? - * zh | en | false - * zh | zh | true - * zh | zh-CN | true - * zh-CN | zh | true - * zh-CN | zh-TW | false - * zh-CN | zh-cn | true (case-insensitive) + * target | detected | match? zh | en | false zh | zh | true zh | zh-CN | true zh-CN | zh | true + * zh-CN | zh-TW | false zh-CN | zh-cn | true (case-insensitive) * * @param language * @return diff --git a/tika-core/src/main/java/org/apache/tika/language/detect/LanguageWriter.java b/tika-core/src/main/java/org/apache/tika/language/detect/LanguageWriter.java index 92cd630c9d..f3989b87b1 100644 --- a/tika-core/src/main/java/org/apache/tika/language/detect/LanguageWriter.java +++ b/tika-core/src/main/java/org/apache/tika/language/detect/LanguageWriter.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.language.detect; @@ -34,9 +32,8 @@ public LanguageWriter(LanguageDetector detector) { } /** - * Returns the language detector used by this writer. Note that - * the returned language detector gets updated whenever new characters - * are written. + * Returns the language detector used by this writer. Note that the returned language detector + * gets updated whenever new characters are written. * * @return language detector */ @@ -62,15 +59,13 @@ public void write(char[] cbuf, int off, int len) { * Ignored. */ @Override - public void close() throws IOException { - } + public void close() throws IOException {} /** * Ignored. */ @Override - public void flush() { - } + public void flush() {} public void reset() { detector.reset(); diff --git a/tika-core/src/main/java/org/apache/tika/language/translate/DefaultTranslator.java b/tika-core/src/main/java/org/apache/tika/language/translate/DefaultTranslator.java index 11b45d5930..df51bb4c5d 100644 --- a/tika-core/src/main/java/org/apache/tika/language/translate/DefaultTranslator.java +++ b/tika-core/src/main/java/org/apache/tika/language/translate/DefaultTranslator.java @@ -1,33 +1,29 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.language.translate; import java.io.IOException; import java.util.List; - import org.apache.tika.config.ServiceLoader; import org.apache.tika.exception.TikaException; import org.apache.tika.utils.CompareUtils; /** - * A translator which picks the first available {@link Translator} - * implementations available through the - * {@link javax.imageio.spi.ServiceRegistry service provider mechanism}. + * A translator which picks the first available {@link Translator} implementations available through + * the {@link javax.imageio.spi.ServiceRegistry service provider mechanism}. * * @since Apache Tika 1.6 */ @@ -43,8 +39,8 @@ public DefaultTranslator() { } /** - * Finds all statically loadable translators and sort the list by name, - * rather than discovery order. 
+ * Finds all statically loadable translators and sort the list by name, rather than discovery + * order. * * @param loader service loader * @return ordered list of statically loadable translators @@ -71,7 +67,7 @@ private static Translator getFirstAvailable(ServiceLoader loader) { * Translate, using the first available service-loaded translator */ public String translate(String text, String sourceLanguage, String targetLanguage) - throws TikaException, IOException { + throws TikaException, IOException { Translator t = getFirstAvailable(loader); if (t != null) { return t.translate(text, sourceLanguage, targetLanguage); diff --git a/tika-core/src/main/java/org/apache/tika/language/translate/EmptyTranslator.java b/tika-core/src/main/java/org/apache/tika/language/translate/EmptyTranslator.java index 9324af224a..360470e6c4 100644 --- a/tika-core/src/main/java/org/apache/tika/language/translate/EmptyTranslator.java +++ b/tika-core/src/main/java/org/apache/tika/language/translate/EmptyTranslator.java @@ -1,25 +1,22 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.language.translate; /** - * Dummy translator that always declines to give any text. Useful as a - * sentinel translator for when none others are available. - * for unknown document types. + * Dummy translator that always declines to give any text. Useful as a sentinel translator for when + * none others are available. for unknown document types. */ public class EmptyTranslator implements Translator { public String translate(String text, String sourceLanguage, String targetLanguage) { diff --git a/tika-core/src/main/java/org/apache/tika/language/translate/Translator.java b/tika-core/src/main/java/org/apache/tika/language/translate/Translator.java index 563e6c4fc5..ced6170318 100644 --- a/tika-core/src/main/java/org/apache/tika/language/translate/Translator.java +++ b/tika-core/src/main/java/org/apache/tika/language/translate/Translator.java @@ -1,23 +1,20 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.language.translate; import java.io.IOException; - import org.apache.tika.exception.TikaException; /** @@ -29,25 +26,25 @@ public interface Translator { /** * Translate text between given languages. * - * @param text The text to translate. + * @param text The text to translate. * @param sourceLanguage The input text language (for example, "en"). * @param targetLanguage The desired language to translate to (for example, "fr"). * @return The translation result. If translation is unavailable, returns the same text back. - * @throws TikaException When there is an error translating. 
+ * @throws TikaException When there is an error translating. * @throws java.io.IOException * @since Tika 1.6 */ String translate(String text, String sourceLanguage, String targetLanguage) - throws TikaException, IOException; + throws TikaException, IOException; /** - * Translate text to the given language - * This method attempts to auto-detect the source language of the text. + * Translate text to the given language This method attempts to auto-detect the source language + * of the text. * - * @param text The text to translate. + * @param text The text to translate. * @param targetLanguage The desired language to translate to (for example, "hi"). * @return The translation result. If translation is unavailable, returns the same text back. - * @throws TikaException When there is an error translating. + * @throws TikaException When there is an error translating. * @throws java.io.IOException * @since Tika 1.6 */ diff --git a/tika-core/src/main/java/org/apache/tika/metadata/AccessPermissions.java b/tika-core/src/main/java/org/apache/tika/metadata/AccessPermissions.java index db689f9120..b32fccd13b 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/AccessPermissions.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/AccessPermissions.java @@ -1,31 +1,28 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. 
The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.metadata; /** - * Until we can find a common standard, we'll use these options. They - * were mostly derived from PDFBox's AccessPermission, but some can - * apply to other document formats, especially CAN_MODIFY and FILL_IN_FORM. + * Until we can find a common standard, we'll use these options. They were mostly derived from + * PDFBox's AccessPermission, but some can apply to other document formats, especially CAN_MODIFY + * and FILL_IN_FORM. */ public interface AccessPermissions { - String PREFIX = - "access_permission" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER; + String PREFIX = "access_permission" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER; /** * Can any modifications be made to the document @@ -38,11 +35,10 @@ public interface AccessPermissions { Property EXTRACT_CONTENT = Property.externalText(PREFIX + "extract_content"); /** - * Should content be extracted for the purposes - * of accessibility. + * Should content be extracted for the purposes of accessibility. 
*/ Property EXTRACT_FOR_ACCESSIBILITY = - Property.externalText(PREFIX + "extract_for_accessibility"); + Property.externalText(PREFIX + "extract_for_accessibility"); /** * Can the user insert/rotate/delete pages. diff --git a/tika-core/src/main/java/org/apache/tika/metadata/ClimateForcast.java b/tika-core/src/main/java/org/apache/tika/metadata/ClimateForcast.java index 9ad1632837..fe8d32f5a8 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/ClimateForcast.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/ClimateForcast.java @@ -1,24 +1,22 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
+ * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.metadata; /** - * Met keys from NCAR CCSM files in the Climate Forecast Convention. + * Met keys from NCAR CCSM files in the Climate Forecast + * Convention. */ public interface ClimateForcast { diff --git a/tika-core/src/main/java/org/apache/tika/metadata/CreativeCommons.java b/tika-core/src/main/java/org/apache/tika/metadata/CreativeCommons.java index 122a1fc578..f9978d1965 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/CreativeCommons.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/CreativeCommons.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.metadata; diff --git a/tika-core/src/main/java/org/apache/tika/metadata/Database.java b/tika-core/src/main/java/org/apache/tika/metadata/Database.java index 736213942b..23ab31f797 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/Database.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/Database.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.metadata; diff --git a/tika-core/src/main/java/org/apache/tika/metadata/DublinCore.java b/tika-core/src/main/java/org/apache/tika/metadata/DublinCore.java index 283080a0d7..df5fdd3c1e 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/DublinCore.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/DublinCore.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.metadata; @@ -21,15 +19,13 @@ * * @see dublincore.org * - *

- * These keys are intended to be a unifying normalization of information - * within a file. For some formats, like PDF, where there may be conflicting - * information in different parts of the file (xmp vs. docinfo) for the - * same metadata key, we do what we can, and these keys represent a - * normalization of metadata values within a file. - *

- * For Dublin Core information that derives specifically and only from - * XMP, see {@link XMPDC}. + *

+ * These keys are intended to be a unifying normalization of information within a file. For + * some formats, like PDF, where there may be conflicting information in different parts of the + * file (xmp vs. docinfo) for the same metadata key, we do what we can, and these keys + * represent a normalization of metadata values within a file. + *

+ * For Dublin Core information that derives specifically and only from XMP, see {@link XMPDC}. */ public interface DublinCore { @@ -39,162 +35,146 @@ public interface DublinCore { String PREFIX_DC_TERMS = "dcterms"; /** - * Typically, Format may include the media-type or dimensions of the - * resource. Format may be used to determine the software, hardware or - * other equipment needed to display or operate the resource. Examples - * of dimensions include size and duration. Recommended best practice is - * to select a value from a controlled vocabulary (for example, the list - * of Internet Media Types [MIME] defining computer media formats). + * Typically, Format may include the media-type or dimensions of the resource. Format may be + * used to determine the software, hardware or other equipment needed to display or operate the + * resource. Examples of dimensions include size and duration. Recommended best practice is to + * select a value from a controlled vocabulary (for example, the list of Internet Media Types + * [MIME] defining computer media formats). */ Property FORMAT = Property.internalText( - PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "format"); + PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "format"); /** - * Recommended best practice is to identify the resource by means of - * a string or number conforming to a formal identification system. - * Example formal identification systems include the Uniform Resource - * Identifier (URI) (including the Uniform Resource Locator (URL)), - * the Digital Object Identifier (DOI) and the International Standard - * Book Number (ISBN). + * Recommended best practice is to identify the resource by means of a string or number + * conforming to a formal identification system. 
Example formal identification systems include + * the Uniform Resource Identifier (URI) (including the Uniform Resource Locator (URL)), the + * Digital Object Identifier (DOI) and the International Standard Book Number (ISBN). */ Property IDENTIFIER = Property.internalTextBag( - PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "identifier"); + PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "identifier"); /** * Date on which the resource was changed. */ Property MODIFIED = Property.internalDate( - PREFIX_DC_TERMS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "modified"); + PREFIX_DC_TERMS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "modified"); /** - * An entity responsible for making contributions to the content of the - * resource. Examples of a Contributor include a person, an organisation, - * or a service. Typically, the name of a Contributor should be used to - * indicate the entity. + * An entity responsible for making contributions to the content of the resource. Examples of a + * Contributor include a person, an organisation, or a service. Typically, the name of a + * Contributor should be used to indicate the entity. */ Property CONTRIBUTOR = Property.internalTextBag( - PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "contributor"); + PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "contributor"); /** - * The extent or scope of the content of the resource. Coverage will - * typically include spatial location (a place name or geographic - * coordinates), temporal period (a period label, date, or date range) - * or jurisdiction (such as a named administrative entity). Recommended - * best practice is to select a value from a controlled vocabulary (for - * example, the Thesaurus of Geographic Names [TGN]) and that, where - * appropriate, named places or time periods be used in preference to - * numeric identifiers such as sets of coordinates or date ranges. 
+ * The extent or scope of the content of the resource. Coverage will typically include spatial + * location (a place name or geographic coordinates), temporal period (a period label, date, or + * date range) or jurisdiction (such as a named administrative entity). Recommended best + * practice is to select a value from a controlled vocabulary (for example, the Thesaurus of + * Geographic Names [TGN]) and that, where appropriate, named places or time periods be used in + * preference to numeric identifiers such as sets of coordinates or date ranges. */ Property COVERAGE = Property.internalTextBag( - PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "coverage"); + PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "coverage"); /** - * An entity primarily responsible for making the content of the resource. - * Examples of a Creator include a person, an organisation, or a service. - * Typically, the name of a Creator should be used to indicate the entity. + * An entity primarily responsible for making the content of the resource. Examples of a Creator + * include a person, an organisation, or a service. Typically, the name of a Creator should be + * used to indicate the entity. */ Property CREATOR = Property.internalTextBag( - PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "creator"); + PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "creator"); /** * Date of creation of the resource. */ Property CREATED = Property.internalDate( - PREFIX_DC_TERMS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "created"); + PREFIX_DC_TERMS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "created"); /** - * A date associated with an event in the life cycle of the resource. - * Typically, Date will be associated with the creation or availability of - * the resource. Recommended best practice for encoding the date value is - * defined in a profile of ISO 8601 [W3CDTF] and follows the YYYY-MM-DD - * format. 
+ * A date associated with an event in the life cycle of the resource. Typically, Date will be + * associated with the creation or availability of the resource. Recommended best practice for + * encoding the date value is defined in a profile of ISO 8601 [W3CDTF] and follows the + * YYYY-MM-DD format. */ Property DATE = Property.internalDate( - PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "date"); + PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "date"); /** - * An account of the content of the resource. Description may include - * but is not limited to: an abstract, table of contents, reference to - * a graphical representation of content or a free-text account of - * the content. + * An account of the content of the resource. Description may include but is not limited to: an + * abstract, table of contents, reference to a graphical representation of content or a + * free-text account of the content. */ Property DESCRIPTION = Property.internalTextBag( - PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "description"); + PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "description"); /** - * A language of the intellectual content of the resource. Recommended - * best practice is to use RFC 3066 [RFC3066], which, in conjunction - * with ISO 639 [ISO639], defines two- and three-letter primary language - * tags with optional subtags. Examples include "en" or "eng" for English, - * "akk" for Akkadian, and "en-GB" for English used in the United Kingdom. + * A language of the intellectual content of the resource. Recommended best practice is to use + * RFC 3066 [RFC3066], which, in conjunction with ISO 639 [ISO639], defines two- and + * three-letter primary language tags with optional subtags. Examples include "en" or "eng" for + * English, "akk" for Akkadian, and "en-GB" for English used in the United Kingdom. 
*/ Property LANGUAGE = Property.internalTextBag( - PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "language"); + PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "language"); /** - * An entity responsible for making the resource available. Examples of - * a Publisher include a person, an organisation, or a service. Typically, - * the name of a Publisher should be used to indicate the entity. + * An entity responsible for making the resource available. Examples of a Publisher include a + * person, an organisation, or a service. Typically, the name of a Publisher should be used to + * indicate the entity. */ Property PUBLISHER = Property.internalTextBag( - PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "publisher"); + PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "publisher"); /** - * A reference to a related resource. Recommended best practice is to - * reference the resource by means of a string or number conforming to - * a formal identification system. + * A reference to a related resource. Recommended best practice is to reference the resource by + * means of a string or number conforming to a formal identification system. */ Property RELATION = Property.internalTextBag( - PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "relation"); + PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "relation"); /** - * Information about rights held in and over the resource. Typically, - * a Rights element will contain a rights management statement for - * the resource, or reference a service providing such information. - * Rights information often encompasses Intellectual Property Rights - * (IPR), Copyright, and various Property Rights. If the Rights element - * is absent, no assumptions can be made about the status of these and - * other rights with respect to the resource. + * Information about rights held in and over the resource. 
Typically, a Rights element will + * contain a rights management statement for the resource, or reference a service providing such + * information. Rights information often encompasses Intellectual Property Rights (IPR), + * Copyright, and various Property Rights. If the Rights element is absent, no assumptions can + * be made about the status of these and other rights with respect to the resource. */ Property RIGHTS = Property.internalTextBag( - PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "rights"); + PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "rights"); /** - * A reference to a resource from which the present resource is derived. - * The present resource may be derived from the Source resource in whole - * or in part. Recommended best practice is to reference the resource by - * means of a string or number conforming to a formal identification + * A reference to a resource from which the present resource is derived. The present resource + * may be derived from the Source resource in whole or in part. Recommended best practice is to + * reference the resource by means of a string or number conforming to a formal identification * system. */ Property SOURCE = Property.internalTextBag( - PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "source"); + PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "source"); /** - * The topic of the content of the resource. Typically, a Subject will - * be expressed as keywords, key phrases or classification codes that - * describe a topic of the resource. Recommended best practice is to - * select a value from a controlled vocabulary or formal classification - * scheme. + * The topic of the content of the resource. Typically, a Subject will be expressed as keywords, + * key phrases or classification codes that describe a topic of the resource. Recommended best + * practice is to select a value from a controlled vocabulary or formal classification scheme. 
*/ Property SUBJECT = Property.internalTextBag( - PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "subject"); + PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "subject"); /** - * A name given to the resource. Typically, a Title will be a name by - * which the resource is formally known. + * A name given to the resource. Typically, a Title will be a name by which the resource is + * formally known. */ Property TITLE = Property.internalTextBag( - PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "title"); + PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "title"); /** - * The nature or genre of the content of the resource. Type includes terms - * describing general categories, functions, genres, or aggregation levels - * for content. Recommended best practice is to select a value from a - * controlled vocabulary (for example, the DCMI Type Vocabulary - * [DCMITYPE]). To describe the physical or digital manifestation of - * the resource, use the Format element. + * The nature or genre of the content of the resource. Type includes terms describing general + * categories, functions, genres, or aggregation levels for content. Recommended best practice + * is to select a value from a controlled vocabulary (for example, the DCMI Type Vocabulary + * [DCMITYPE]). To describe the physical or digital manifestation of the resource, use the + * Format element. 
*/ Property TYPE = Property.internalTextBag( - PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "type"); + PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "type"); } diff --git a/tika-core/src/main/java/org/apache/tika/metadata/Epub.java b/tika-core/src/main/java/org/apache/tika/metadata/Epub.java index c6e3c3c33a..657e1064de 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/Epub.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/Epub.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. 
See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.metadata; @@ -26,11 +24,11 @@ public interface Epub { String EPUB_PREFIX = "epub" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER; /** - * This is set to "pre-paginated" if any itemref on the spine or the - * metadata has a "pre-paginated" value, "reflowable" otherwise. + * This is set to "pre-paginated" if any itemref on the spine or the metadata has a + * "pre-paginated" value, "reflowable" otherwise. */ Property RENDITION_LAYOUT = Property.externalClosedChoise(EPUB_PREFIX + "rendition:layout", - "pre-paginated", "reflowable"); + "pre-paginated", "reflowable"); Property VERSION = Property.externalText(EPUB_PREFIX + "version"); } diff --git a/tika-core/src/main/java/org/apache/tika/metadata/ExternalProcess.java b/tika-core/src/main/java/org/apache/tika/metadata/ExternalProcess.java index 8636969f12..2c5ac6d6c7 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/ExternalProcess.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/ExternalProcess.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.metadata; @@ -23,57 +21,52 @@ public interface ExternalProcess { /** * STD_OUT */ - Property STD_OUT = Property.externalText( - PREFIX_EXTERNAL_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "stdout"); + Property STD_OUT = Property.externalText(PREFIX_EXTERNAL_META + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "stdout"); /** * STD_ERR */ - Property STD_ERR = Property.externalText( - PREFIX_EXTERNAL_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "stderr"); + Property STD_ERR = Property.externalText(PREFIX_EXTERNAL_META + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "stderr"); /** * Whether or not stdout was truncated */ - Property STD_OUT_IS_TRUNCATED = Property.externalBoolean( - PREFIX_EXTERNAL_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + - "stdout-truncated"); + Property STD_OUT_IS_TRUNCATED = Property.externalBoolean(PREFIX_EXTERNAL_META + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "stdout-truncated"); /** * Whether or not stderr was truncated */ - Property STD_ERR_IS_TRUNCATED = Property.externalBoolean( - PREFIX_EXTERNAL_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + - "stderr-truncated"); + Property 
STD_ERR_IS_TRUNCATED = Property.externalBoolean(PREFIX_EXTERNAL_META + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "stderr-truncated"); /** - * Stdout length whether or not it was truncated. If it was truncated, - * what would its length have been; if it wasn't, what is its length. + * Stdout length whether or not it was truncated. If it was truncated, what would its length + * have been; if it wasn't, what is its length. */ - Property STD_OUT_LENGTH = Property.externalReal( - PREFIX_EXTERNAL_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + - "stdout-length"); + Property STD_OUT_LENGTH = Property.externalReal(PREFIX_EXTERNAL_META + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "stdout-length"); /** - * Stderr length whether or not it was truncated. If it was truncated, - * what would its length have been; if it wasn't, what is its length. + * Stderr length whether or not it was truncated. If it was truncated, what would its length + * have been; if it wasn't, what is its length. 
*/ - Property STD_ERR_LENGTH = Property.externalReal( - PREFIX_EXTERNAL_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + - "stderr-length"); + Property STD_ERR_LENGTH = Property.externalReal(PREFIX_EXTERNAL_META + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "stderr-length"); /** * Exit value of the sub process */ - Property EXIT_VALUE = Property.externalInteger( - PREFIX_EXTERNAL_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + - "exit-value"); + Property EXIT_VALUE = Property.externalInteger(PREFIX_EXTERNAL_META + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "exit-value"); /** * Was the process timed out */ - Property IS_TIMEOUT = Property.externalBoolean( - PREFIX_EXTERNAL_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "timeout"); + Property IS_TIMEOUT = Property.externalBoolean(PREFIX_EXTERNAL_META + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "timeout"); } diff --git a/tika-core/src/main/java/org/apache/tika/metadata/FileSystem.java b/tika-core/src/main/java/org/apache/tika/metadata/FileSystem.java index 87afab71c4..0b7f2c1e84 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/FileSystem.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/FileSystem.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. 
The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.metadata; diff --git a/tika-core/src/main/java/org/apache/tika/metadata/Font.java b/tika-core/src/main/java/org/apache/tika/metadata/Font.java index 706e199dbc..21eb92fd1c 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/Font.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/Font.java @@ -1,19 +1,18 @@ -package org.apache.tika.metadata; /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at +/* + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. 
The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ +package org.apache.tika.metadata; public interface Font { @@ -23,6 +22,6 @@ public interface Font { * Basic name of a font used in a file */ Property FONT_NAME = Property.internalTextBag( - PREFIX_FONT_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "name"); + PREFIX_FONT_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "name"); } diff --git a/tika-core/src/main/java/org/apache/tika/metadata/Geographic.java b/tika-core/src/main/java/org/apache/tika/metadata/Geographic.java index f46d511899..e3d6c4d1ad 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/Geographic.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/Geographic.java @@ -1,28 +1,24 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.metadata; /** - * Geographic schema. This is a collection of - * {@link Property property definition} constants for geographic - * information, as defined in the W3C Geo Vocabularies. + * Geographic schema. This is a collection of {@link Property property definition} constants for + * geographic information, as defined in the W3C Geo Vocabularies. 
* - * @see W3C Basic Geo Vocabulary + * @see W3C Basic Geo Vocabulary * @since Apache Tika 0.8 */ public interface Geographic { diff --git a/tika-core/src/main/java/org/apache/tika/metadata/HTML.java b/tika-core/src/main/java/org/apache/tika/metadata/HTML.java index 2d6eb485f6..f285643026 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/HTML.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/HTML.java @@ -1,29 +1,27 @@ -package org.apache.tika.metadata; /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at +/* + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. 
See the License for the specific language governing permissions and limitations under + * the License. */ +package org.apache.tika.metadata; public interface HTML { String PREFIX_HTML_META = "html" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER; /** - * If a script element contains a src value, this value - * is set in the embedded document's metadata + * If a script element contains a src value, this value is set in the embedded document's + * metadata */ - Property SCRIPT_SOURCE = Property.internalText( - PREFIX_HTML_META + "scriptSrc"); + Property SCRIPT_SOURCE = Property.internalText(PREFIX_HTML_META + "scriptSrc"); } diff --git a/tika-core/src/main/java/org/apache/tika/metadata/HttpHeaders.java b/tika-core/src/main/java/org/apache/tika/metadata/HttpHeaders.java index 937f365acb..e42eae176c 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/HttpHeaders.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/HttpHeaders.java @@ -1,26 +1,24 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.metadata; /** * A collection of HTTP header names. * - * @see Hypertext Transfer Protocol -- - * HTTP/1.1 (RFC 2616) + * @see Hypertext Transfer Protocol -- HTTP/1.1 (RFC + * 2616) */ public interface HttpHeaders { diff --git a/tika-core/src/main/java/org/apache/tika/metadata/IPTC.java b/tika-core/src/main/java/org/apache/tika/metadata/IPTC.java index f5fa6442b6..e86a471a23 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/IPTC.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/IPTC.java @@ -1,33 +1,31 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. 
The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. * - * IPTC Metadata Descriptions taken from the IPTC Photo Metadata (July 2010) - * standard. These parts Copyright 2010 International Press Telecommunications - * Council. + * IPTC Metadata Descriptions taken from the IPTC Photo Metadata (July 2010) standard. These parts + * Copyright 2010 International Press Telecommunications Council. */ package org.apache.tika.metadata; /** * IPTC photo metadata schema. *

- * A collection of - * {@link Property property definition} constants for the photo metadata - * properties defined in the IPTC standard. + * A collection of {@link Property property definition} constants for the photo metadata properties + * defined in the IPTC standard. * - * @see IPTC Photo Metadata + * @see IPTC + * Photo Metadata * @since Apache Tika 1.1 */ public interface IPTC { @@ -41,15 +39,14 @@ public interface IPTC { String PREFIX_PLUS = "plus"; /** - * Name of the city the content is focussing on -- either the place shown - * in visual media or referenced by text or audio media. This element is at - * the third level of a top-down geographical hierarchy. + * Name of the city the content is focussing on -- either the place shown in visual media or + * referenced by text or audio media. This element is at the third level of a top-down + * geographical hierarchy. *

- * This is a detail of a location with blurred semantics as it does not - * clearly indicate whether it is the location in the image or the location - * the photo was taken - which can be different. Two more concise properties - * are available in IPTC Extension with Location Created and Location Shown - * in the Image. + * This is a detail of a location with blurred semantics as it does not clearly indicate whether + * it is the location in the image or the location the photo was taken - which can be different. + * Two more concise properties are available in IPTC Extension with Location Created and + * Location Shown in the Image. *

* Maps to this IIM property: 2:90 City * @@ -58,17 +55,15 @@ public interface IPTC { Property CITY = Photoshop.CITY; /** - * Full name of the country the content is focussing on -- either the - * country shown in visual media or referenced in text or audio media. This - * element is at the top/first level of a top- down geographical hierarchy. - * The full name should be expressed as a verbal name and not as a code, a - * code should go to the element "CountryCode" + * Full name of the country the content is focussing on -- either the country shown in visual + * media or referenced in text or audio media. This element is at the top/first level of a top- + * down geographical hierarchy. The full name should be expressed as a verbal name and not as a + * code, a code should go to the element "CountryCode" *

- * This is a detail of a location with blurred semantics as it does not - * clearly indicate whether it is the location in the image or the location - * the photo was taken - which can be different. Two more concise properties - * are available in IPTC Extension with Location Created and Location Shown - * in the Image. + * This is a detail of a location with blurred semantics as it does not clearly indicate whether + * it is the location in the image or the location the photo was taken - which can be different. + * Two more concise properties are available in IPTC Extension with Location Created and + * Location Shown in the Image. *

* Maps to this IIM property: 2:101 Country/Primary Location Name * @@ -77,31 +72,28 @@ public interface IPTC { Property COUNTRY = Photoshop.COUNTRY; /** - * Code of the country the content is focussing on -- either the country - * shown in visual media or referenced in text or audio media. This element - * is at the top/first level of a top-down geographical hierarchy. The code - * should be taken from ISO 3166 two or three letter code. The full name of - * a country should go to the "Country" element. + * Code of the country the content is focussing on -- either the country shown in visual media + * or referenced in text or audio media. This element is at the top/first level of a top-down + * geographical hierarchy. The code should be taken from ISO 3166 two or three letter code. The + * full name of a country should go to the "Country" element. *

- * This is a detail of a location with blurred semantics as it does not - * clearly indicate whether it is the location in the image or the location - * the photo was taken - which can be different. Two more concise properties - * are available in IPTC Extension with Location Created and Location Shown - * in the Image. + * This is a detail of a location with blurred semantics as it does not clearly indicate whether + * it is the location in the image or the location the photo was taken - which can be different. + * Two more concise properties are available in IPTC Extension with Location Created and + * Location Shown in the Image. *

* Maps to this IIM property: 2:100 Country/Primary Location Code */ - Property COUNTRY_CODE = Property.internalText( - PREFIX_IPTC_CORE + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "CountryCode"); + Property COUNTRY_CODE = Property.internalText(PREFIX_IPTC_CORE + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "CountryCode"); /** - * A textual description, including captions, of the item's content, - * particularly used where the object is not text. + * A textual description, including captions, of the item's content, particularly used where the + * object is not text. *

- * Note: the XMP property (dc:description) which stores the value of this - * IPTC Core property is of type Lang Alt. Hence any software agent dealing - * with this property must abide to the processing rules for - * Lang Alt value type as specified by the XMP specifications. + * Note: the XMP property (dc:description) which stores the value of this IPTC Core property is + * of type Lang Alt. Hence any software agent dealing with this property must abide to the + * processing rules for Lang Alt value type as specified by the XMP specifications. *

* Maps to this IIM property: 2:120 Caption/Abstract * @@ -119,26 +111,25 @@ public interface IPTC { Property HEADLINE = Photoshop.HEADLINE; /** - * Describes the nature, intellectual, artistic or journalistic - * characteristic of a item, not specifically its content. + * Describes the nature, intellectual, artistic or journalistic characteristic of a item, not + * specifically its content. *

- * The IPTC recognizes that the corresponding IPTC Genre NewsCodes needs - * photo specific extension to be better usable with this field (as of the - * release of this standard in the year 2008). + * The IPTC recognizes that the corresponding IPTC Genre NewsCodes needs photo specific + * extension to be better usable with this field (as of the release of this standard in the year + * 2008). *

* Maps to this IIM property: 2:04 Object Attribute Reference */ - Property INTELLECTUAL_GENRE = Property.internalText( - PREFIX_IPTC_CORE + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "IntellectualGenre"); + Property INTELLECTUAL_GENRE = Property.internalText(PREFIX_IPTC_CORE + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "IntellectualGenre"); /** - * Keywords to express the subject of the content. Keywords may be free - * text and don't have to be taken from a controlled vocabulary. Codes from - * the controlled vocabulary IPTC Subject NewsCodes must go to the - * "Subject Code" field. + * Keywords to express the subject of the content. Keywords may be free text and don't have to + * be taken from a controlled vocabulary. Codes from the controlled vocabulary IPTC Subject + * NewsCodes must go to the "Subject Code" field. *

- * Single values of this field should not be restricted to single words - * but must allow for phrases as well. + * Single values of this field should not be restricted to single words but must allow for + * phrases as well. *

* Maps to this IIM property: 2:25 Keywords * @@ -147,16 +138,14 @@ public interface IPTC { Property KEYWORDS = DublinCore.SUBJECT; /** - * Name of the subregion of a country -- either called province or state or - * anything else -- the content is focussing on -- either the subregion - * shown in visual media or referenced by text or audio media. This element - * is at the second level of a top-down geographical hierarchy. + * Name of the subregion of a country -- either called province or state or anything else -- the + * content is focussing on -- either the subregion shown in visual media or referenced by text + * or audio media. This element is at the second level of a top-down geographical hierarchy. *

- * This is a detail of a location with blurred semantics as it does not - * clearly indicate whether it is the location in the image or the location - * the photo was taken - which can be different. Two more concise properties - * are available in IPTC Extension with Location Created and Location Shown - * in the Image. + * This is a detail of a location with blurred semantics as it does not clearly indicate whether + * it is the location in the image or the location the photo was taken - which can be different. + * Two more concise properties are available in IPTC Extension with Location Created and + * Location Shown in the Image. *

* Maps to this IIM property: 2:95 Province/State * @@ -165,62 +154,53 @@ public interface IPTC { Property PROVINCE_OR_STATE = Photoshop.STATE; /** - * Describes the scene of a news content. Specifies one or more terms - * from the IPTC "Scene-NewsCodes". Each Scene is represented as a string of - * 6 digits in an unordered list. + * Describes the scene of a news content. Specifies one or more terms from the IPTC + * "Scene-NewsCodes". Each Scene is represented as a string of 6 digits in an unordered list. *

- * Note: Only Scene values from this IPTC taxonomy should be used here. More - * about the IPTC Scene-NewsCodes at www.newscodes.org. + * Note: Only Scene values from this IPTC taxonomy should be used here. More about the IPTC + * Scene-NewsCodes at www.newscodes.org. */ Property SCENE_CODE = Property.internalTextBag( - PREFIX_IPTC_CORE + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Scene"); + PREFIX_IPTC_CORE + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Scene"); /** - * Specifies one or more Subjects from the IPTC Subject-NewsCodes taxonomy - * to categorise the content. Each Subject is represented as a string of 8 - * digits in an unordered list. + * Specifies one or more Subjects from the IPTC Subject-NewsCodes taxonomy to categorise the + * content. Each Subject is represented as a string of 8 digits in an unordered list. *

- * Note: Only Subjects from a controlled vocabulary should be used here, - * free text has to be put into the Keyword element. More about - * IPTC Subject-NewsCodes at www.newscodes.org. + * Note: Only Subjects from a controlled vocabulary should be used here, free text has to be put + * into the Keyword element. More about IPTC Subject-NewsCodes at www.newscodes.org. */ - Property SUBJECT_CODE = Property.internalTextBag( - PREFIX_IPTC_CORE + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "SubjectCode"); + Property SUBJECT_CODE = Property.internalTextBag(PREFIX_IPTC_CORE + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "SubjectCode"); /** - * Name of a sublocation the content is focussing on -- either the - * location shown in visual media or referenced by text or audio media. This - * location name could either be the name of a sublocation to a city or the - * name of a well known location or (natural) monument outside a city. In - * the sense of a sublocation to a city this element is at the fourth level - * of a top-down geographical hierarchy. + * Name of a sublocation the content is focussing on -- either the location shown in visual + * media or referenced by text or audio media. This location name could either be the name of a + * sublocation to a city or the name of a well known location or (natural) monument outside a + * city. In the sense of a sublocation to a city this element is at the fourth level of a + * top-down geographical hierarchy. *

- * This is a detail of a location with blurred semantics as it does not - * clearly indicate whether it is the location in the image or the location - * the photo was taken - which can be different. Two more concise properties - * are available in IPTC Extension with Location Created and Location Shown - * in the Image. + * This is a detail of a location with blurred semantics as it does not clearly indicate whether + * it is the location in the image or the location the photo was taken - which can be different. + * Two more concise properties are available in IPTC Extension with Location Created and + * Location Shown in the Image. *

* Maps to this IIM property: 2:92 Sublocation */ Property SUBLOCATION = Property.internalText( - PREFIX_IPTC_CORE + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Location"); + PREFIX_IPTC_CORE + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Location"); /** - * Designates the date and optionally the time the intellectual content was - * created rather than the date of the creation of the physical - * representation. + * Designates the date and optionally the time the intellectual content was created rather than + * the date of the creation of the physical representation. *

- * If a software system requires explicit time values and no time is given - * by the Date Created property the software system should default the time - * to 00:00:00. If the software system does not require an explicit time - * value the time part should be left empty as it is. + * If a software system requires explicit time values and no time is given by the Date Created + * property the software system should default the time to 00:00:00. If the software system does + * not require an explicit time value the time part should be left empty as it is. *

- * Note 1: Any content of the IIM dataset 2:60, Time Created, should be - * merged to this element. - * Note 2: Implementers are encouraged to provide - * the creation date and time from the EXIF data of a digital - * camera to the user for entering this date for the first time. + * Note 1: Any content of the IIM dataset 2:60, Time Created, should be merged to this element. + * Note 2: Implementers are encouraged to provide the creation date and time from the EXIF data + * of a digital camera to the user for entering this date for the first time. *

* Maps to this IIM property: 2:55 Date Created * @@ -229,8 +209,8 @@ public interface IPTC { Property DATE_CREATED = Photoshop.DATE_CREATED; /** - * Identifier or the name of the person involved in writing, editing or - * correcting the description of the content. + * Identifier or the name of the person involved in writing, editing or correcting the + * description of the content. *

* Maps to this IIM property: 2:122 Writer/Editor * @@ -239,8 +219,7 @@ public interface IPTC { Property DESCRIPTION_WRITER = Photoshop.CAPTION_WRITER; /** - * Any of a number of instructions from the provider or creator to the - * receiver of the item. + * Any of a number of instructions from the provider or creator to the receiver of the item. *

* Maps to this IIM property: 2:40 Special Instruction * @@ -249,14 +228,12 @@ public interface IPTC { Property INSTRUCTIONS = Photoshop.INSTRUCTIONS; /** - * Number or identifier for the purpose of improved workflow handling. This - * is a user created identifier related to the job for which the item is - * supplied. + * Number or identifier for the purpose of improved workflow handling. This is a user created + * identifier related to the job for which the item is supplied. *

- * Note: As this identifier references a job of the receiver's workflow it - * must first be issued by the receiver, then transmitted to the creator or - * provider of the news object and finally added by the creator - * to this field. + * Note: As this identifier references a job of the receiver's workflow it must first be issued + * by the receiver, then transmitted to the creator or provider of the news object and finally + * added by the creator to this field. *

* Maps to this IIM property: 2:103 Original Transmission Reference * @@ -265,20 +242,17 @@ public interface IPTC { Property JOB_ID = Photoshop.TRANSMISSION_REFERENCE; /** - * A shorthand reference for the item. Title provides a short human readable - * name which can be a text and/or numeric reference. It is not the same as - * Headline. + * A shorthand reference for the item. Title provides a short human readable name which can be a + * text and/or numeric reference. It is not the same as Headline. *

- * Many use the Title field to store the filename of the image, though the - * field may be used in many ways. Formal identifiers are provided by the - * Digital Image Id, or the Registry Entry property of the IPTC Extension. + * Many use the Title field to store the filename of the image, though the field may be used in + * many ways. Formal identifiers are provided by the Digital Image Id, or the Registry Entry + * property of the IPTC Extension. *

- * Note 1: This element aligns with the use of Dublin Core's "Title" - * element. - * Note 2: the XMP property (dc:title) which stores the value of - * this IPTC Core property is of type Lang Alt. Hence any software agent - * dealing with this property must abide to the processing rules for Lang - * Alt value type as specified by the XMP specifications. + * Note 1: This element aligns with the use of Dublin Core's "Title" element. Note 2: the XMP + * property (dc:title) which stores the value of this IPTC Core property is of type Lang Alt. + * Hence any software agent dealing with this property must abide to the processing rules for + * Lang Alt value type as specified by the XMP specifications. *

* Maps to this IIM property: 2:05 Object Name * @@ -287,21 +261,18 @@ public interface IPTC { Property TITLE = DublinCore.TITLE; /** - * Contains any necessary copyright notice for claiming the intellectual - * property for this item and should identify the current owner of the - * copyright for the item. Other entities like the creator of the item may - * be added in the corresponding field. Notes on usage rights should be + * Contains any necessary copyright notice for claiming the intellectual property for this item + * and should identify the current owner of the copyright for the item. Other entities like the + * creator of the item may be added in the corresponding field. Notes on usage rights should be * provided in "Rights usage terms". *

- * Copyright ownership can be expressed in a more controlled way using the - * PLUS fields "Copyright Owner", "Copyright Owner ID", - * "Copyright Owner Name" of the IPTC Extension. It is the user's - * responsibility to keep the values of the four fields in sync. + * Copyright ownership can be expressed in a more controlled way using the PLUS fields + * "Copyright Owner", "Copyright Owner ID", "Copyright Owner Name" of the IPTC Extension. It is + * the user's responsibility to keep the values of the four fields in sync. *

- * Note: the XMP property (dc:rights) which stores the value of this IPTC - * Core property is of type Lang Alt. Hence any software agent dealing with - * this property must abide to the processing rules for Lang Alt - * value type as specified by the XMP specifications. + * Note: the XMP property (dc:rights) which stores the value of this IPTC Core property is of + * type Lang Alt. Hence any software agent dealing with this property must abide to the + * processing rules for Lang Alt value type as specified by the XMP specifications. *

* Maps to this IIM property: 2:116 Copyright Notice * @@ -310,15 +281,14 @@ public interface IPTC { Property COPYRIGHT_NOTICE = DublinCore.RIGHTS; /** - * Contains the name of the person who created the content of this item, a - * photographer for photos, a graphic artist for graphics, or a writer for - * textual news, but in cases where the photographer should not be - * identified the name of a company or organisation may be appropriate. + * Contains the name of the person who created the content of this item, a photographer for + * photos, a graphic artist for graphics, or a writer for textual news, but in cases where the + * photographer should not be identified the name of a company or organisation may be + * appropriate. *

- * The creator can be expressed in a more controlled way using the - * "Image Creator" of PLUS in the IPTC Extension additionally. It is the - * user's responsibility to keep the values of the IPTC Core and the PLUS - * fields in sync. + * The creator can be expressed in a more controlled way using the "Image Creator" of PLUS in + * the IPTC Extension additionally. It is the user's responsibility to keep the values of the + * IPTC Core and the PLUS fields in sync. *

* Maps to this IIM property: 2:80 By-line * @@ -327,29 +297,24 @@ public interface IPTC { Property CREATOR = DublinCore.CREATOR; /** - * The creator's contact information provides all necessary information to - * get in contact with the creator of this item and comprises a set of - * sub-properties for proper addressing. + * The creator's contact information provides all necessary information to get in contact with + * the creator of this item and comprises a set of sub-properties for proper addressing. *

- * The IPTC Extension Licensor fields should be used instead of these - * Creator's Contact Info fields if you are using IPTC Extension fields. If - * the creator is also the licensor his or her contact information should be - * provided in the Licensor fields. + * The IPTC Extension Licensor fields should be used instead of these Creator's Contact Info + * fields if you are using IPTC Extension fields. If the creator is also the licensor his or her + * contact information should be provided in the Licensor fields. *

- * Note 1 to user interface implementers: All sub-properties of "Creator's - * contact information" should be shown as group on the form. - * Note 2: the - * CreatorContactInfo sub-properties' naming aligns with the vCard - * specification RFC 2426. + * Note 1 to user interface implementers: All sub-properties of "Creator's contact information" + * should be shown as group on the form. Note 2: the CreatorContactInfo sub-properties' naming + * aligns with the vCard specification RFC 2426. */ - Property CREATORS_CONTACT_INFO = Property.internalText( - PREFIX_IPTC_CORE + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + - "CreatorContactInfo"); + Property CREATORS_CONTACT_INFO = Property.internalText(PREFIX_IPTC_CORE + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "CreatorContactInfo"); /** - * Contains the job title of the person who created the content of this - * item. As this is sort of a qualifier the Creator element has to be filled - * in as mandatory prerequisite for using Creator's Jobtitle. + * Contains the job title of the person who created the content of this item. As this is sort of + * a qualifier the Creator element has to be filled in as mandatory prerequisite for using + * Creator's Jobtitle. *

* Maps to this IIM property: 2:85 By-line Title * @@ -358,15 +323,13 @@ public interface IPTC { Property CREATORS_JOB_TITLE = Photoshop.AUTHORS_POSITION; /** - * The credit to person(s) and/or organisation(s) required by the supplier - * of the item to be used when published. This is a free-text field. + * The credit to person(s) and/or organisation(s) required by the supplier of the item to be + * used when published. This is a free-text field. *

- * Note 1: For more formal identifications of the creator or the owner of - * the copyrights of this image other rights properties may be used. - * Note 2: - * This property was named "Credit" by the IIM metadata, then it was renamed - * to "Provider" in IPTC Core 1.0. In IPTC Core 1.1. it has been renamed to - * "Credit Line" as the field is used for this purpose by many users. + * Note 1: For more formal identifications of the creator or the owner of the copyrights of this + * image other rights properties may be used. Note 2: This property was named "Credit" by the + * IIM metadata, then it was renamed to "Provider" in IPTC Core 1.0. In IPTC Core 1.1. it has + * been renamed to "Credit Line" as the field is used for this purpose by many users. *

* Maps to this IIM property: 2:110 Credit * @@ -377,20 +340,19 @@ public interface IPTC { /** * The licensing parameters of the item expressed in free-text. *

- * The PLUS fields of the IPTC Extension can be used in parallel to express - * the licensed usage in more controlled terms. + * The PLUS fields of the IPTC Extension can be used in parallel to express the licensed usage + * in more controlled terms. */ Property RIGHTS_USAGE_TERMS = XMPRights.USAGE_TERMS; /** - * Identifies the original owner of the copyright for the intellectual - * content of the item. This could be an agency, a member of an agency or an - * individual. Source could be different from Creator and from the entities - * in the CopyrightNotice. + * Identifies the original owner of the copyright for the intellectual content of the item. This + * could be an agency, a member of an agency or an individual. Source could be different from + * Creator and from the entities in the CopyrightNotice. *

- * The original owner can never change. For that reason the content of this - * property should never be changed or deleted after the information is - * entered following the news object's initial creation. + * The original owner can never change. For that reason the content of this property should + * never be changed or deleted after the information is entered following the news object's + * initial creation. *

* Maps to this IIM property: 2:115 Source * @@ -399,110 +361,99 @@ public interface IPTC { Property SOURCE = Photoshop.SOURCE; /** - * The contact information address part. Comprises an optional company name - * and all required information to locate the building or postbox to which - * mail should be sent. To that end, the address is a multiline field. + * The contact information address part. Comprises an optional company name and all required + * information to locate the building or postbox to which mail should be sent. To that end, the + * address is a multiline field. *

- * Note 1: to user interface implementers: This field should be part of a - * "Contact information" group on the form. - * Note 2: the ContactInfo naming aligns with the vCard specification RFC 2426. + * Note 1: to user interface implementers: This field should be part of a "Contact information" + * group on the form. Note 2: the ContactInfo naming aligns with the vCard specification RFC + * 2426. */ - Property CONTACT_INFO_ADDRESS = Property.internalTextBag( - PREFIX_IPTC_CORE + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "CiAdrExtadr"); + Property CONTACT_INFO_ADDRESS = Property.internalTextBag(PREFIX_IPTC_CORE + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "CiAdrExtadr"); /** * The contact information city part. *

- * Note 1: to user interface implementers: This field should be part of a - * "Contact information" group on the form. - * Note 2: the ContactInfo naming aligns with the vCard specification RFC 2426. + * Note 1: to user interface implementers: This field should be part of a "Contact information" + * group on the form. Note 2: the ContactInfo naming aligns with the vCard specification RFC + * 2426. */ Property CONTACT_INFO_CITY = Property.internalText( - PREFIX_IPTC_CORE + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "CiAdrCity"); + PREFIX_IPTC_CORE + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "CiAdrCity"); /** * The contact information country part. *

- * Note 1: to user interface implementers: This field should be part of a - * "Contact information" group on the form. - * Note 2: the ContactInfo naming aligns with the vCard specification RFC 2426. + * Note 1: to user interface implementers: This field should be part of a "Contact information" + * group on the form. Note 2: the ContactInfo naming aligns with the vCard specification RFC + * 2426. */ Property CONTACT_INFO_COUNTRY = Property.internalText( - PREFIX_IPTC_CORE + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "CiAdrCtry"); + PREFIX_IPTC_CORE + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "CiAdrCtry"); /** * The contact information email address part. *

- * Multiple email addresses can be given. May have to be separated by a - * comma in the user interface. + * Multiple email addresses can be given. May have to be separated by a comma in the user + * interface. *

- * Note 1: to user interface implementers: This field should be part of a - * "Contact information" group on the form. - * Note 2 to user interface - * implementers: provide sufficient space to fill in multiple e-mail - * addresses. - * Note 3: the ContactInfo naming aligns with the vCard - * specification RFC 2426. + * Note 1: to user interface implementers: This field should be part of a "Contact information" + * group on the form. Note 2 to user interface implementers: provide sufficient space to fill in + * multiple e-mail addresses. Note 3: the ContactInfo naming aligns with the vCard specification + * RFC 2426. */ - Property CONTACT_INFO_EMAIL = Property.internalTextBag( - PREFIX_IPTC_CORE + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "CiEmailWork"); + Property CONTACT_INFO_EMAIL = Property.internalTextBag(PREFIX_IPTC_CORE + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "CiEmailWork"); /** * The contact information phone number part. *

- * Multiple numbers can be given. May have to be separated by a - * comma in the user interface. + * Multiple numbers can be given. May have to be separated by a comma in the user interface. *

- * Note 1: to user interface implementers: This field should be part of a - * "Contact information" group on the form. - * Note 2 to user interface - * implementers: provide sufficient space to fill in multiple international - * numbers. - * Note 3: the ContactInfo naming aligns with the vCard + * Note 1: to user interface implementers: This field should be part of a "Contact information" + * group on the form. Note 2 to user interface implementers: provide sufficient space to fill in + * multiple international numbers. Note 3: the ContactInfo naming aligns with the vCard * specification RFC 2426. */ Property CONTACT_INFO_PHONE = Property.internalTextBag( - PREFIX_IPTC_CORE + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "CiTelWork"); + PREFIX_IPTC_CORE + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "CiTelWork"); /** * The contact information part denoting the local postal code. *

- * Note 1: to user interface implementers: This field should be part of a - * "Contact information" group on the form. - * Note 2: the ContactInfo naming aligns with the vCard specification RFC 2426. + * Note 1: to user interface implementers: This field should be part of a "Contact information" + * group on the form. Note 2: the ContactInfo naming aligns with the vCard specification RFC + * 2426. */ - Property CONTACT_INFO_POSTAL_CODE = Property.internalText( - PREFIX_IPTC_CORE + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "CiAdrPcode"); + Property CONTACT_INFO_POSTAL_CODE = Property.internalText(PREFIX_IPTC_CORE + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "CiAdrPcode"); /** * The contact information part denoting regional information such as state or province. *

- * Note 1: to user interface implementers: This field should be part of a - * "Contact information" group on the form. - * Note 2: the ContactInfo naming aligns with the vCard specification RFC 2426. + * Note 1: to user interface implementers: This field should be part of a "Contact information" + * group on the form. Note 2: the ContactInfo naming aligns with the vCard specification RFC + * 2426. */ - Property CONTACT_INFO_STATE_PROVINCE = Property.internalText( - PREFIX_IPTC_CORE + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "CiAdrRegion"); + Property CONTACT_INFO_STATE_PROVINCE = Property.internalText(PREFIX_IPTC_CORE + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "CiAdrRegion"); /** * The contact information web address part. Multiple addresses can be given, separated by a * comma. *

- * Note 1: to user interface implementers: This field should be part of a - * "Contact information" group on the form. - * Note 2 to user interface - * implementers: provide sufficient space to fill in multiple URLs. - * Note 3: the ContactInfo naming aligns with the vCard - * specification RFC 2426. + * Note 1: to user interface implementers: This field should be part of a "Contact information" + * group on the form. Note 2 to user interface implementers: provide sufficient space to fill in + * multiple URLs. Note 3: the ContactInfo naming aligns with the vCard specification RFC 2426. */ Property CONTACT_INFO_WEB_URL = Property.internalTextBag( - PREFIX_IPTC_CORE + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "CiUrlWork"); + PREFIX_IPTC_CORE + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "CiUrlWork"); /** - * As this metadata element pertains to distribution management, it was not - * adopted. However, this data is still synchronised with the XMP property - * [photoshop:Urgency], and hence, available for future use, but outside the - * IPTC Core. + * As this metadata element pertains to distribution management, it was not adopted. However, + * this data is still synchronised with the XMP property [photoshop:Urgency], and hence, + * available for future use, but outside the IPTC Core. * * @deprecated */ @@ -510,11 +461,10 @@ public interface IPTC { Property URGENCY = Photoshop.URGENCY; /** - * As this metadata element was earmarked as deprecated already for IIM 4.1, - * it was not adopted. However, this data is still synchronised with the XMP - * property [photoshop:Category], and hence available for future use - but - * outside the IPTC Core. For migrating from Category codes to Subject Codes - * please read the Guideline for mapping Category Codes to Subject NewsCodes + * As this metadata element was earmarked as deprecated already for IIM 4.1, it was not adopted. 
+ * However, this data is still synchronised with the XMP property [photoshop:Category], and + * hence available for future use - but outside the IPTC Core. For migrating from Category codes + * to Subject Codes please read the Guideline for mapping Category Codes to Subject NewsCodes * section below. * * @deprecated @@ -523,10 +473,10 @@ public interface IPTC { Property CATEGORY = Photoshop.CATEGORY; /** - * As this metadata element was earmarked as deprecated already for IIM 4.1, - * it was not adopted. However, this data is still synchronised with the XMP - * property [photoshop:SupplementalCategories], and hence available for - * future use - but outside the IPTC Core. + * As this metadata element was earmarked as deprecated already for IIM 4.1, it was not adopted. + * However, this data is still synchronised with the XMP property + * [photoshop:SupplementalCategories], and hence available for future use - but outside the IPTC + * Core. * * @deprecated */ @@ -534,105 +484,93 @@ public interface IPTC { Property SUPPLEMENTAL_CATEGORIES = Photoshop.SUPPLEMENTAL_CATEGORIES; /** - * Information about the ethnicity and other facets of the model(s) in a - * model-released image. + * Information about the ethnicity and other facets of the model(s) in a model-released image. *

* Use the Model Age field for the age of model(s). */ - Property ADDITIONAL_MODEL_INFO = Property.internalText( - PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "AddlModelInfo"); + Property ADDITIONAL_MODEL_INFO = Property.internalText(PREFIX_IPTC_EXT + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "AddlModelInfo"); /** * A set of metadata about artwork or an object in the item */ - Property ARTWORK_OR_OBJECT = Property.internalTextBag( - PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "ArtworkOrObject"); + Property ARTWORK_OR_OBJECT = Property.internalTextBag(PREFIX_IPTC_EXT + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "ArtworkOrObject"); /** * A set of metadata about artwork or an object in the item */ - Property ORGANISATION_CODE = Property.internalTextBag( - PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + - "OrganisationInImageCode"); + Property ORGANISATION_CODE = Property.internalTextBag(PREFIX_IPTC_EXT + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "OrganisationInImageCode"); /** - * A term to describe the content of the image by a value from a Controlled - * Vocabulary. + * A term to describe the content of the image by a value from a Controlled Vocabulary. *

- * This property is part of the Photo Metadata 2008 specifications, but - * should not released to the public on the standard Adobe Custom Panels for - * IPTC metadata or other user interfaces unless agreed by the IPTC. + * This property is part of the Photo Metadata 2008 specifications, but should not released to + * the public on the standard Adobe Custom Panels for IPTC metadata or other user interfaces + * unless agreed by the IPTC. */ Property CONTROLLED_VOCABULARY_TERM = Property.internalTextBag( - PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "CVterm"); + PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "CVterm"); /** - * A location the content of the item is about. For photos that is a - * location shown in the image. + * A location the content of the item is about. For photos that is a location shown in the + * image. *

- * If the location the image was taken in is different from this location - * the property Location Created should be used too. + * If the location the image was taken in is different from this location the property Location + * Created should be used too. */ - Property LOCATION_SHOWN = Property.internalTextBag( - PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "LocationShown"); + Property LOCATION_SHOWN = Property.internalTextBag(PREFIX_IPTC_EXT + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "LocationShown"); /** - * Age of the human model(s) at the time this image was taken in a model - * released image. + * Age of the human model(s) at the time this image was taken in a model released image. *

- * The user should be aware of any legal implications of providing ages for - * young models. Ages below 18 years should not be included. + * The user should be aware of any legal implications of providing ages for young models. Ages + * below 18 years should not be included. */ Property MODEL_AGE = Property.internalTextBag( - PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "ModelAge"); + PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "ModelAge"); /** * Name of the organisation or company which is featured in the content. *

- * May be supplemented by values from a controlled vocabulary in the - * Organisation Code field. + * May be supplemented by values from a controlled vocabulary in the Organisation Code field. */ - Property ORGANISATION_NAME = Property.internalTextBag( - PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + - "OrganisationInImageName"); + Property ORGANISATION_NAME = Property.internalTextBag(PREFIX_IPTC_EXT + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "OrganisationInImageName"); /** - * Name of a person the content of the item is about. For photos that is a - * person shown in the image. + * Name of a person the content of the item is about. For photos that is a person shown in the + * image. */ - Property PERSON = Property.internalTextBag( - PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "PersonInImage"); + Property PERSON = Property.internalTextBag(PREFIX_IPTC_EXT + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "PersonInImage"); /** - * Globally unique identifier for the item. It is created and applied by the - * creator of the item at the time of its creation . This value shall not be - * changed after that time. + * Globally unique identifier for the item. It is created and applied by the creator of the item + * at the time of its creation . This value shall not be changed after that time. *

- * The identifier will probably be generated by the technical means of an - * imaging device or software and should be applied to the digital image - * file as early as possible in its life cycle. This identifier does not - * identify any pictured content, particularly in case of a scan of non- - * digital images, only this digital representation. + * The identifier will probably be generated by the technical means of an imaging device or + * software and should be applied to the digital image file as early as possible in its life + * cycle. This identifier does not identify any pictured content, particularly in case of a scan + * of non- digital images, only this digital representation. *

- * Any algorithm to create this identifier has to comply with the technical - * requirements to create a globally unique id. Any device creating digital - * images - e.g. still image cameras, video cameras, scanners - should - * create such an identifer right at the time of the creation of the digital - * data and add the id to the set of metadata without compromising - * performance. It is recommended that this image identifier allows - * identifying the device by which the image data and the GUID were created. - * IPTC's basic requirements for unique ids are: - * - It must be globally unique. Algorithms for this purpose exist. - * - It should identify the camera body. - * - It should identify each individual photo from this camera body. - * - It should identify the date and time of the creation of the picture. - * - It should be secured against tampering. - * This field should be implemented in a way to prove it has not been changed since its - * value has been applied. If the identifier has been created by the imaging device - * its type and brand can be found in the Exif/technical metadata. + * Any algorithm to create this identifier has to comply with the technical requirements to + * create a globally unique id. Any device creating digital images - e.g. still image cameras, + * video cameras, scanners - should create such an identifer right at the time of the creation + * of the digital data and add the id to the set of metadata without compromising performance. + * It is recommended that this image identifier allows identifying the device by which the image + * data and the GUID were created. IPTC's basic requirements for unique ids are: - It must be + * globally unique. Algorithms for this purpose exist. - It should identify the camera body. - + * It should identify each individual photo from this camera body. - It should identify the date + * and time of the creation of the picture. - It should be secured against tampering. 
This field + * should be implemented in a way to prove it has not been changed since its value has been + * applied. If the identifier has been created by the imaging device its type and brand can be + * found in the Exif/technical metadata. */ - Property DIGITAL_IMAGE_GUID = Property.internalText( - PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "DigImageGUID"); + Property DIGITAL_IMAGE_GUID = Property.internalText(PREFIX_IPTC_EXT + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "DigImageGUID"); /** * The type of the source digital file. @@ -642,621 +580,562 @@ public interface IPTC { * @deprecated */ @Deprecated - Property DIGITAL_SOURCE_FILE_TYPE = Property.internalText( - PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + - "DigitalSourcefileType"); + Property DIGITAL_SOURCE_FILE_TYPE = Property.internalText(PREFIX_IPTC_EXT + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "DigitalSourcefileType"); /** * The type of the source of this digital image */ - Property DIGITAL_SOURCE_TYPE = Property.internalText( - PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "DigitalSourceType"); + Property DIGITAL_SOURCE_TYPE = Property.internalText(PREFIX_IPTC_EXT + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "DigitalSourceType"); /** * Names or describes the specific event the content relates to. *

- * Examples are: a press conference, dedication ceremony, etc. If this is a - * sub-event of a larger event both can be provided by the field: e.g. XXXIX - * Olympic Summer Games (Beijing): opening ceremony. Unplanned events could - * be named by this property too. + * Examples are: a press conference, dedication ceremony, etc. If this is a sub-event of a + * larger event both can be provided by the field: e.g. XXXIX Olympic Summer Games (Beijing): + * opening ceremony. Unplanned events could be named by this property too. */ Property EVENT = Property.internalText( - PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Event"); + PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Event"); /** - * Both a Registry Item Id and a Registry Organisation Id to record any - * registration of this item with a registry. + * Both a Registry Item Id and a Registry Organisation Id to record any registration of this + * item with a registry. *

- * Typically an id from a registry is negotiated and applied after the - * creation of the digital image. + * Typically an id from a registry is negotiated and applied after the creation of the digital + * image. *

- * Any user interface implementation must show both sub-properties - Item Id - * and Organisation Id - as corresponding values. Further an input to both - * fields should be made mandatory. + * Any user interface implementation must show both sub-properties - Item Id and Organisation Id + * - as corresponding values. Further an input to both fields should be made mandatory. */ Property IMAGE_REGISTRY_ENTRY = Property.internalTextBag( - PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "RegistryId"); + PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "RegistryId"); /** - * Identifies the most recent supplier of the item, who is not necessarily - * its owner or creator. + * Identifies the most recent supplier of the item, who is not necessarily its owner or creator. *

- * For identifying the supplier either a well known and/or registered - * company name or a URL of the company's web site may be used. This - * property succeeds the Provider property of IPTC Core 1.0 by its semantics - * as that Provider was renamed to Credit Line. + * For identifying the supplier either a well known and/or registered company name or a URL of + * the company's web site may be used. This property succeeds the Provider property of IPTC Core + * 1.0 by its semantics as that Provider was renamed to Credit Line. *

- * This is a PLUS version 1.2 property included in the IPTC Extension - * schema. + * This is a PLUS version 1.2 property included in the IPTC Extension schema. */ Property IMAGE_SUPPLIER = Property.internalText( - PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "ImageSupplier"); + PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "ImageSupplier"); /** * @deprecated use {@link IPTC#IMAGE_SUPPLIER_ID} */ @Deprecated String IMAGE_SUPPLIER_ID_WRONG_CASE = - PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "ImageSupplierId"; + PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "ImageSupplierId"; /** - * Identifies the most recent supplier of the item, who is not necessarily - * its owner or creator. + * Identifies the most recent supplier of the item, who is not necessarily its owner or creator. *

- * For identifying the supplier either a well known and/or registered - * company name or a URL of the company's web site may be used. This - * property succeeds the Provider property of IPTC Core 1.0 by its semantics - * as that Provider was renamed to Credit Line. + * For identifying the supplier either a well known and/or registered company name or a URL of + * the company's web site may be used. This property succeeds the Provider property of IPTC Core + * 1.0 by its semantics as that Provider was renamed to Credit Line. *

- * This is a PLUS version 1.2 property included in the IPTC Extension - * schema. + * This is a PLUS version 1.2 property included in the IPTC Extension schema. */ - Property IMAGE_SUPPLIER_ID = Property.composite(Property.internalText( - PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "ImageSupplierID"), - new Property[]{Property.internalText(IPTC.IMAGE_SUPPLIER_ID_WRONG_CASE)}); + Property IMAGE_SUPPLIER_ID = Property.composite( + Property.internalText( + PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "ImageSupplierID"), + new Property[] {Property.internalText(IPTC.IMAGE_SUPPLIER_ID_WRONG_CASE)}); /** - * Identifies the most recent supplier of the item, who is not necessarily - * its owner or creator. + * Identifies the most recent supplier of the item, who is not necessarily its owner or creator. *

- * For identifying the supplier either a well known and/or registered - * company name or a URL of the company's web site may be used. This - * property succeeds the Provider property of IPTC Core 1.0 by its semantics - * as that Provider was renamed to Credit Line. + * For identifying the supplier either a well known and/or registered company name or a URL of + * the company's web site may be used. This property succeeds the Provider property of IPTC Core + * 1.0 by its semantics as that Provider was renamed to Credit Line. *

- * This is a PLUS version 1.2 property included in the IPTC Extension - * schema. + * This is a PLUS version 1.2 property included in the IPTC Extension schema. */ - Property IMAGE_SUPPLIER_NAME = Property.internalText( - PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "ImageSupplierName"); + Property IMAGE_SUPPLIER_NAME = Property.internalText(PREFIX_PLUS + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "ImageSupplierName"); /** * Optional identifier assigned by the Image Supplier to the image. *

- * This is a PLUS version 1.2 property included in the IPTC Extension - * schema. + * This is a PLUS version 1.2 property included in the IPTC Extension schema. */ - Property IMAGE_SUPPLIER_IMAGE_ID = Property.internalText( - PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "ImageSupplierImageID"); + Property IMAGE_SUPPLIER_IMAGE_ID = Property.internalText(PREFIX_PLUS + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "ImageSupplierImageID"); /** - * The date and optionally time when any of the IPTC photo metadata fields - * has been last edited + * The date and optionally time when any of the IPTC photo metadata fields has been last edited *

- * The public use of this property is deprecated by IPTC Extension version - * 1.1. It may only still be used by a private user interface for a use - * scoped to a company. If used this field should be a timestamp of the - * latest change applied to any of the fields. + * The public use of this property is deprecated by IPTC Extension version 1.1. It may only + * still be used by a private user interface for a use scoped to a company. If used this field + * should be a timestamp of the latest change applied to any of the fields. *

- * The value of this property should never be set by software. XMP-aware - * software should reflect any changes to metadata by the xmp:MetadataDate - * property of the XMP Basic scheme. + * The value of this property should never be set by software. XMP-aware software should reflect + * any changes to metadata by the xmp:MetadataDate property of the XMP Basic scheme. */ - Property IPTC_LAST_EDITED = Property.internalDate( - PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "IptcLastEdited"); + Property IPTC_LAST_EDITED = Property.internalDate(PREFIX_IPTC_EXT + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "IptcLastEdited"); /** * The location the content of the item was created. *

- * If the location in the image is different from the location the photo was - * taken the IPTC Extension property Location Shown in the Image should be - * used. + * If the location in the image is different from the location the photo was taken the IPTC + * Extension property Location Shown in the Image should be used. */ - Property LOCATION_CREATED = Property.internalTextBag( - PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "LocationCreated"); + Property LOCATION_CREATED = Property.internalTextBag(PREFIX_IPTC_EXT + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "LocationCreated"); /** - * The maximum available height in pixels of the original photo from which - * this photo has been derived by downsizing. + * The maximum available height in pixels of the original photo from which this photo has been + * derived by downsizing. */ - Property MAX_AVAIL_HEIGHT = Property.internalInteger( - PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "MaxAvailHeight"); + Property MAX_AVAIL_HEIGHT = Property.internalInteger(PREFIX_IPTC_EXT + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "MaxAvailHeight"); /** - * The maximum available width in pixels of the original photo from which - * this photo has been derived by downsizing. + * The maximum available width in pixels of the original photo from which this photo has been + * derived by downsizing. */ - Property MAX_AVAIL_WIDTH = Property.internalInteger( - PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "MaxAvailWidth"); + Property MAX_AVAIL_WIDTH = Property.internalInteger(PREFIX_IPTC_EXT + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "MaxAvailWidth"); /** - * The version number of the PLUS standards in place at the time of the - * transaction. + * The version number of the PLUS standards in place at the time of the transaction. *

- * This property was included into the IPTC Extension schema from PLUS - * version 1.2 as all other PLUS properties. To reflect this the value of - * "PLUS Version" should be set to the string "1.2.0" + * This property was included into the IPTC Extension schema from PLUS version 1.2 as all other + * PLUS properties. To reflect this the value of "PLUS Version" should be set to the string + * "1.2.0" */ Property PLUS_VERSION = Property.internalText( - PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Version"); + PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Version"); /** * Owner or owners of the copyright in the licensed image. *

- * Serves to identify the rights holder/s for the image. The Copyright - * Owner, Image Creator and Licensor may be the same or different entities. + * Serves to identify the rights holder/s for the image. The Copyright Owner, Image Creator and + * Licensor may be the same or different entities. *

- * This is a PLUS version 1.2 property included in the IPTC Extension - * schema. + * This is a PLUS version 1.2 property included in the IPTC Extension schema. */ Property COPYRIGHT_OWNER = Property.internalTextBag( - PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "CopyrightOwner"); + PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "CopyrightOwner"); /** * @deprecated use {@link IPTC#COPYRIGHT_OWNER_ID} */ @Deprecated - String COPYRIGHT_OWNER_ID_WRONG_CASE = - PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "CopyrightOwnerId"; + String COPYRIGHT_OWNER_ID_WRONG_CASE = PREFIX_PLUS + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "CopyrightOwnerId"; /** * The ID of the owner or owners of the copyright in the licensed image. *

- * Serves to identify the rights holder/s for the image. The Copyright - * Owner, Image Creator and Licensor may be the same or different entities. + * Serves to identify the rights holder/s for the image. The Copyright Owner, Image Creator and + * Licensor may be the same or different entities. *

- * This is a PLUS version 1.2 property included in the IPTC Extension - * schema. + * This is a PLUS version 1.2 property included in the IPTC Extension schema. */ - Property COPYRIGHT_OWNER_ID = Property.composite(Property.internalTextBag( - PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "CopyrightOwnerID"), - new Property[]{Property.internalTextBag(IPTC.COPYRIGHT_OWNER_ID_WRONG_CASE)}); + Property COPYRIGHT_OWNER_ID = Property.composite( + Property.internalTextBag( + PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "CopyrightOwnerID"), + new Property[] {Property.internalTextBag(IPTC.COPYRIGHT_OWNER_ID_WRONG_CASE)}); /** * The name of the owner or owners of the copyright in the licensed image. *

- * Serves to identify the rights holder/s for the image. The Copyright - * Owner, Image Creator and Licensor may be the same or different entities. + * Serves to identify the rights holder/s for the image. The Copyright Owner, Image Creator and + * Licensor may be the same or different entities. *

- * This is a PLUS version 1.2 property included in the IPTC Extension - * schema. + * This is a PLUS version 1.2 property included in the IPTC Extension schema. */ - Property COPYRIGHT_OWNER_NAME = Property.internalTextBag( - PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "CopyrightOwnerName"); + Property COPYRIGHT_OWNER_NAME = Property.internalTextBag(PREFIX_PLUS + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "CopyrightOwnerName"); /** * Creator or creators of the image. *

- * The creator can be additionally expressed in free-text using the IPTC - * Core Creator field. In many countries, the Image Creator must be - * attributed in association with any use of the image. The Image Creator, - * Copyright Owner, Image Supplier and Licensor may be the same or different - * entities. + * The creator can be additionally expressed in free-text using the IPTC Core Creator field. In + * many countries, the Image Creator must be attributed in association with any use of the + * image. The Image Creator, Copyright Owner, Image Supplier and Licensor may be the same or + * different entities. *

- * This is a PLUS version 1.2 property included in the IPTC Extension - * schema. + * This is a PLUS version 1.2 property included in the IPTC Extension schema. */ Property IMAGE_CREATOR = Property.internalTextBag( - PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "ImageCreator"); + PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "ImageCreator"); /** * @deprecated use {@link IPTC#IMAGE_CREATOR_ID} */ @Deprecated String IMAGE_CREATOR_ID_WRONG_CASE = - PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "ImageCreatorId"; + PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "ImageCreatorId"; /** * The ID of the creator or creators of the image. *

- * The creator can be additionally expressed in free-text using the IPTC - * Core Creator field. In many countries, the Image Creator must be - * attributed in association with any use of the image. The Image Creator, - * Copyright Owner, Image Supplier and Licensor may be the same or different - * entities. + * The creator can be additionally expressed in free-text using the IPTC Core Creator field. In + * many countries, the Image Creator must be attributed in association with any use of the + * image. The Image Creator, Copyright Owner, Image Supplier and Licensor may be the same or + * different entities. *

- * This is a PLUS version 1.2 property included in the IPTC Extension - * schema. + * This is a PLUS version 1.2 property included in the IPTC Extension schema. */ Property IMAGE_CREATOR_ID = Property.composite(Property.internalTextBag( - PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "ImageCreatorID"), - new Property[]{Property.internalTextBag(IPTC.IMAGE_CREATOR_ID_WRONG_CASE)}); + PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "ImageCreatorID"), + new Property[] {Property.internalTextBag(IPTC.IMAGE_CREATOR_ID_WRONG_CASE)}); /** * The name of the creator or creators of the image. *

- * The creator can be additionally expressed in free-text using the IPTC - * Core Creator field. In many countries, the Image Creator must be - * attributed in association with any use of the image. The Image Creator, - * Copyright Owner, Image Supplier and Licensor may be the same or different - * entities. + * The creator can be additionally expressed in free-text using the IPTC Core Creator field. In + * many countries, the Image Creator must be attributed in association with any use of the + * image. The Image Creator, Copyright Owner, Image Supplier and Licensor may be the same or + * different entities. *

- * This is a PLUS version 1.2 property included in the IPTC Extension - * schema. + * This is a PLUS version 1.2 property included in the IPTC Extension schema. */ - Property IMAGE_CREATOR_NAME = Property.internalTextBag( - PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "ImageCreatorName"); + Property IMAGE_CREATOR_NAME = Property.internalTextBag(PREFIX_PLUS + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "ImageCreatorName"); /** - * A person or company that should be contacted to obtain a licence for - * using the item or who has licensed the item. + * A person or company that should be contacted to obtain a licence for using the item or who + * has licensed the item. *

- * This is a PLUS version 1.2 property included in the IPTC Extension - * schema. + * This is a PLUS version 1.2 property included in the IPTC Extension schema. */ Property LICENSOR = Property.internalTextBag( - PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Licensor"); + PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Licensor"); /** * @deprecated use {@link IPTC#LICENSOR_ID} */ @Deprecated String LICENSOR_ID_WRONG_CASE = - PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "LicensorId"; + PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "LicensorId"; /** - * The ID of the person or company that should be contacted to obtain a licence for - * using the item or who has licensed the item. + * The ID of the person or company that should be contacted to obtain a licence for using the + * item or who has licensed the item. *

- * This is a PLUS version 1.2 property included in the IPTC Extension - * schema. + * This is a PLUS version 1.2 property included in the IPTC Extension schema. */ - Property LICENSOR_ID = Property.composite(Property.internalTextBag( - PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "LicensorID"), - new Property[]{Property.internalTextBag(IPTC.LICENSOR_ID_WRONG_CASE)}); + Property LICENSOR_ID = Property.composite( + Property.internalTextBag(PREFIX_PLUS + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "LicensorID"), + new Property[] {Property.internalTextBag(IPTC.LICENSOR_ID_WRONG_CASE)}); /** - * The name of the person or company that should be contacted to obtain a licence for - * using the item or who has licensed the item. + * The name of the person or company that should be contacted to obtain a licence for using the + * item or who has licensed the item. *

- * This is a PLUS version 1.2 property included in the IPTC Extension - * schema. + * This is a PLUS version 1.2 property included in the IPTC Extension schema. */ Property LICENSOR_NAME = Property.internalTextBag( - PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "LicensorName"); + PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "LicensorName"); /** - * The city of a person or company that should be contacted to obtain a licence for - * using the item or who has licensed the item. + * The city of a person or company that should be contacted to obtain a licence for using the + * item or who has licensed the item. *

- * This is a PLUS version 1.2 property included in the IPTC Extension - * schema. + * This is a PLUS version 1.2 property included in the IPTC Extension schema. */ Property LICENSOR_CITY = Property.internalTextBag( - PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "LicensorCity"); + PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "LicensorCity"); /** - * The country of a person or company that should be contacted to obtain a licence for - * using the item or who has licensed the item. + * The country of a person or company that should be contacted to obtain a licence for using the + * item or who has licensed the item. *

- * This is a PLUS version 1.2 property included in the IPTC Extension - * schema. + * This is a PLUS version 1.2 property included in the IPTC Extension schema. */ - Property LICENSOR_COUNTRY = Property.internalTextBag( - PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "LicensorCountry"); + Property LICENSOR_COUNTRY = Property.internalTextBag(PREFIX_PLUS + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "LicensorCountry"); /** - * The email of a person or company that should be contacted to obtain a licence for - * using the item or who has licensed the item. + * The email of a person or company that should be contacted to obtain a licence for using the + * item or who has licensed the item. *

- * This is a PLUS version 1.2 property included in the IPTC Extension - * schema. + * This is a PLUS version 1.2 property included in the IPTC Extension schema. */ Property LICENSOR_EMAIL = Property.internalTextBag( - PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "LicensorEmail"); + PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "LicensorEmail"); /** * The extended address of a person or company that should be contacted to obtain a licence for * using the item or who has licensed the item. *

- * This is a PLUS version 1.2 property included in the IPTC Extension - * schema. + * This is a PLUS version 1.2 property included in the IPTC Extension schema. */ - Property LICENSOR_EXTENDED_ADDRESS = Property.internalTextBag( - PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + - "LicensorExtendedAddress"); + Property LICENSOR_EXTENDED_ADDRESS = Property.internalTextBag(PREFIX_PLUS + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "LicensorExtendedAddress"); /** - * The postal code of a person or company that should be contacted to obtain a licence for - * using the item or who has licensed the item. + * The postal code of a person or company that should be contacted to obtain a licence for using + * the item or who has licensed the item. *

- * This is a PLUS version 1.2 property included in the IPTC Extension - * schema. + * This is a PLUS version 1.2 property included in the IPTC Extension schema. */ - Property LICENSOR_POSTAL_CODE = Property.internalTextBag( - PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "LicensorPostalCode"); + Property LICENSOR_POSTAL_CODE = Property.internalTextBag(PREFIX_PLUS + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "LicensorPostalCode"); /** - * The region of a person or company that should be contacted to obtain a licence for - * using the item or who has licensed the item. + * The region of a person or company that should be contacted to obtain a licence for using the + * item or who has licensed the item. *

- * This is a PLUS version 1.2 property included in the IPTC Extension - * schema. + * This is a PLUS version 1.2 property included in the IPTC Extension schema. */ Property LICENSOR_REGION = Property.internalTextBag( - PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "LicensorRegion"); + PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "LicensorRegion"); /** * The street address of a person or company that should be contacted to obtain a licence for * using the item or who has licensed the item. *

- * This is a PLUS version 1.2 property included in the IPTC Extension - * schema. + * This is a PLUS version 1.2 property included in the IPTC Extension schema. */ - Property LICENSOR_STREET_ADDRESS = Property.internalTextBag( - PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "LicensorStreetAddress"); + Property LICENSOR_STREET_ADDRESS = Property.internalTextBag(PREFIX_PLUS + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "LicensorStreetAddress"); /** * The phone number of a person or company that should be contacted to obtain a licence for * using the item or who has licensed the item. *

- * This is a PLUS version 1.2 property included in the IPTC Extension - * schema. + * This is a PLUS version 1.2 property included in the IPTC Extension schema. */ - Property LICENSOR_TELEPHONE_1 = Property.internalTextBag( - PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "LicensorTelephone1"); + Property LICENSOR_TELEPHONE_1 = Property.internalTextBag(PREFIX_PLUS + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "LicensorTelephone1"); /** * The phone number of a person or company that should be contacted to obtain a licence for * using the item or who has licensed the item. *

- * This is a PLUS version 1.2 property included in the IPTC Extension - * schema. + * This is a PLUS version 1.2 property included in the IPTC Extension schema. */ - Property LICENSOR_TELEPHONE_2 = Property.internalTextBag( - PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "LicensorTelephone2"); + Property LICENSOR_TELEPHONE_2 = Property.internalTextBag(PREFIX_PLUS + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "LicensorTelephone2"); /** - * The URL of a person or company that should be contacted to obtain a licence for - * using the item or who has licensed the item. + * The URL of a person or company that should be contacted to obtain a licence for using the + * item or who has licensed the item. *

- * This is a PLUS version 1.2 property included in the IPTC Extension - * schema. + * This is a PLUS version 1.2 property included in the IPTC Extension schema. */ Property LICENSOR_URL = Property.internalTextBag( - PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "LicensorURL"); + PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "LicensorURL"); /** - * Age of the youngest model pictured in the image, at the time that the - * image was made. + * Age of the youngest model pictured in the image, at the time that the image was made. *

- * This age should not be displayed to the public on open web portals and - * the like. But it may be used by image repositories in a - * B2B enviroment. + * This age should not be displayed to the public on open web portals and the like. But it may + * be used by image repositories in a B2B enviroment. *

- * This is a PLUS version 1.2 property included in the IPTC Extension - * schema. + * This is a PLUS version 1.2 property included in the IPTC Extension schema. */ - Property MINOR_MODEL_AGE_DISCLOSURE = Property.internalText( - PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + - "MinorModelAgeDisclosure"); + Property MINOR_MODEL_AGE_DISCLOSURE = Property.internalText(PREFIX_PLUS + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "MinorModelAgeDisclosure"); /** * Optional identifier associated with each Model Release. *

- * This is a PLUS version 1.2 property included in the IPTC Extension - * schema. + * This is a PLUS version 1.2 property included in the IPTC Extension schema. */ Property MODEL_RELEASE_ID = Property.internalTextBag( - PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "ModelReleaseID"); + PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "ModelReleaseID"); /** - * Summarizes the availability and scope of model releases authorizing usage - * of the likenesses of persons appearing in the photograph. + * Summarizes the availability and scope of model releases authorizing usage of the likenesses + * of persons appearing in the photograph. *

- * It is recommended to apply the PLUS controlled value Unlimited Model - * Releases (MR- UMR) very carefully and to check the wording of the model - * release thoroughly before applying it. + * It is recommended to apply the PLUS controlled value Unlimited Model Releases (MR- UMR) very + * carefully and to check the wording of the model release thoroughly before applying it. *

- * This is a PLUS version 1.2 property included in the IPTC Extension - * schema. + * This is a PLUS version 1.2 property included in the IPTC Extension schema. */ - Property MODEL_RELEASE_STATUS = Property.internalText( - PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "ModelReleaseStatus"); + Property MODEL_RELEASE_STATUS = Property.internalText(PREFIX_PLUS + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "ModelReleaseStatus"); /** * Optional identifier associated with each Property Release. *

- * This is a PLUS version 1.2 property included in the IPTC Extension - * schema. + * This is a PLUS version 1.2 property included in the IPTC Extension schema. */ - Property PROPERTY_RELEASE_ID = Property.internalTextBag( - PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "PropertyReleaseID"); + Property PROPERTY_RELEASE_ID = Property.internalTextBag(PREFIX_PLUS + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "PropertyReleaseID"); /** - * Summarises the availability and scope of property releases authorizing - * usage of the properties appearing in the photograph. + * Summarises the availability and scope of property releases authorizing usage of the + * properties appearing in the photograph. *

- * It is recommended to apply the value PR-UPR very carefully and to check - * the wording of the property release thoroughly before applying it. + * It is recommended to apply the value PR-UPR very carefully and to check the wording of the + * property release thoroughly before applying it. *

- * This is a PLUS version 1.2 property included in the IPTC Extension - * schema. + * This is a PLUS version 1.2 property included in the IPTC Extension schema. */ - Property PROPERTY_RELEASE_STATUS = Property.internalText( - PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "PropertyReleaseStatus"); + Property PROPERTY_RELEASE_STATUS = Property.internalText(PREFIX_PLUS + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "PropertyReleaseStatus"); /** - * Contains any necessary copyright notice for claiming the intellectual - * property for artwork or an object in the image and should identify the - * current owner of the copyright of this work with associated intellectual - * property rights. + * Contains any necessary copyright notice for claiming the intellectual property for artwork or + * an object in the image and should identify the current owner of the copyright of this work + * with associated intellectual property rights. */ - Property ARTWORK_OR_OBJECT_DETAIL_COPYRIGHT_NOTICE = Property.internalTextBag( - PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "AOCopyrightNotice"); + Property ARTWORK_OR_OBJECT_DETAIL_COPYRIGHT_NOTICE = Property.internalTextBag(PREFIX_IPTC_EXT + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "AOCopyrightNotice"); /** * Contains the name of the artist who has created artwork or an object in the image. */ Property ARTWORK_OR_OBJECT_DETAIL_CREATOR = Property.internalTextBag( - PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "AOCreator"); + PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "AOCreator"); /** - * Designates the date and optionally the time the artwork or object in the - * image was created. This relates to artwork or objects with associated - * intellectual property rights. + * Designates the date and optionally the time the artwork or object in the image was created. + * This relates to artwork or objects with associated intellectual property rights. 
*/ - Property ARTWORK_OR_OBJECT_DETAIL_DATE_CREATED = Property.internalTextBag( - PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "AODateCreated"); + Property ARTWORK_OR_OBJECT_DETAIL_DATE_CREATED = Property.internalTextBag(PREFIX_IPTC_EXT + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "AODateCreated"); /** - * The organisation or body holding and registering the artwork or object in - * the image for inventory purposes. + * The organisation or body holding and registering the artwork or object in the image for + * inventory purposes. */ Property ARTWORK_OR_OBJECT_DETAIL_SOURCE = Property.internalTextBag( - PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "AOSource"); + PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "AOSource"); /** - * The inventory number issued by the organisation or body holding and - * registering the artwork or object in the image. + * The inventory number issued by the organisation or body holding and registering the artwork + * or object in the image. */ - Property ARTWORK_OR_OBJECT_DETAIL_SOURCE_INVENTORY_NUMBER = Property.internalTextBag( - PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "AOSourceInvNo"); + Property ARTWORK_OR_OBJECT_DETAIL_SOURCE_INVENTORY_NUMBER = Property + .internalTextBag(PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "AOSourceInvNo"); /** * A reference for the artwork or object in the image. */ Property ARTWORK_OR_OBJECT_DETAIL_TITLE = Property.internalTextBag( - PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "AOTitle"); + PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "AOTitle"); /** - * Name of the city of a location. This element is at the fourth level of a - * top-down geographical hierarchy. + * Name of the city of a location. This element is at the fourth level of a top-down + * geographical hierarchy. 
*/ - Property LOCATION_SHOWN_CITY = Property.internalTextBag( - PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "LocationShownCity"); + Property LOCATION_SHOWN_CITY = Property.internalTextBag(PREFIX_IPTC_EXT + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "LocationShownCity"); /** - * The ISO code of a country of a location. This element is at the second - * level of a top-down geographical hierarchy. + * The ISO code of a country of a location. This element is at the second level of a top-down + * geographical hierarchy. *

- * Note 1: an implementer would have to derive from the length of the value - * string whether this is the country code from the two or three letter - * scheme as no explicit indication can be provided. + * Note 1: an implementer would have to derive from the length of the value string whether this + * is the country code from the two or three letter scheme as no explicit indication can be + * provided. */ - Property LOCATION_SHOWN_COUNTRY_CODE = Property.internalTextBag( - PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + - "LocationShownCountryCode"); + Property LOCATION_SHOWN_COUNTRY_CODE = Property.internalTextBag(PREFIX_IPTC_EXT + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "LocationShownCountryCode"); /** - * The name of a country of a location. This element is at the second level - * of a top-down geographical hierarchy. + * The name of a country of a location. This element is at the second level of a top-down + * geographical hierarchy. */ - Property LOCATION_SHOWN_COUNTRY_NAME = Property.internalTextBag( - PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + - "LocationShownCountryName"); + Property LOCATION_SHOWN_COUNTRY_NAME = Property.internalTextBag(PREFIX_IPTC_EXT + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "LocationShownCountryName"); /** - * The name of a subregion of a country - a province or state - of a - * location. This element is at the third level of a top-down geographical - * hierarchy. + * The name of a subregion of a country - a province or state - of a location. This element is + * at the third level of a top-down geographical hierarchy. 
*/ - Property LOCATION_SHOWN_PROVINCE_OR_STATE = Property.internalTextBag( - PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + - "LocationShownProvinceState"); + Property LOCATION_SHOWN_PROVINCE_OR_STATE = Property.internalTextBag(PREFIX_IPTC_EXT + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "LocationShownProvinceState"); /** - * Name of a sublocation. This sublocation name could either be the name of - * a sublocation to a city or the name of a well known location or (natural) - * monument outside a city. In the sense of a sublocation to a city this - * element is at the fifth level of a top-down geographical hierarchy. + * Name of a sublocation. This sublocation name could either be the name of a sublocation to a + * city or the name of a well known location or (natural) monument outside a city. In the sense + * of a sublocation to a city this element is at the fifth level of a top-down geographical + * hierarchy. */ - Property LOCATION_SHOWN_SUBLOCATION = Property.internalTextBag( - PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + - "LocationShownSublocation"); + Property LOCATION_SHOWN_SUBLOCATION = Property.internalTextBag(PREFIX_IPTC_EXT + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "LocationShownSublocation"); /** - * The name of a world region of a location. This element is at the first - * (topI) level of a top- down geographical hierarchy. + * The name of a world region of a location. This element is at the first (topI) level of a top- + * down geographical hierarchy. */ - Property LOCATION_SHOWN_WORLD_REGION = Property.internalTextBag( - PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + - "LocationShownWorldRegion"); + Property LOCATION_SHOWN_WORLD_REGION = Property.internalTextBag(PREFIX_IPTC_EXT + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "LocationShownWorldRegion"); /** - * Name of the city of a location. This element is at the fourth level of a - * top-down geographical hierarchy. 
+ * Name of the city of a location. This element is at the fourth level of a top-down + * geographical hierarchy. */ - Property LOCATION_CREATED_CITY = Property.internalText( - PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + - "LocationCreatedCity"); + Property LOCATION_CREATED_CITY = Property.internalText(PREFIX_IPTC_EXT + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "LocationCreatedCity"); /** - * The ISO code of a country of a location. This element is at the second - * level of a top-down geographical hierarchy. + * The ISO code of a country of a location. This element is at the second level of a top-down + * geographical hierarchy. *

- * Note 1: an implementer would have to derive from the length of the value - * string whether this is the country code from the two or three letter - * scheme as no explicit indication can be provided. + * Note 1: an implementer would have to derive from the length of the value string whether this + * is the country code from the two or three letter scheme as no explicit indication can be + * provided. */ - Property LOCATION_CREATED_COUNTRY_CODE = Property.internalText( - PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + - "LocationCreatedCountryCode"); + Property LOCATION_CREATED_COUNTRY_CODE = Property.internalText(PREFIX_IPTC_EXT + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "LocationCreatedCountryCode"); /** - * The name of a country of a location. This element is at the second level - * of a top-down geographical hierarchy. + * The name of a country of a location. This element is at the second level of a top-down + * geographical hierarchy. */ - Property LOCATION_CREATED_COUNTRY_NAME = Property.internalText( - PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + - "LocationCreatedCountryName"); + Property LOCATION_CREATED_COUNTRY_NAME = Property.internalText(PREFIX_IPTC_EXT + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "LocationCreatedCountryName"); /** - * The name of a subregion of a country - a province or state - of a - * location. This element is at the third level of a top-down geographical - * hierarchy. + * The name of a subregion of a country - a province or state - of a location. This element is + * at the third level of a top-down geographical hierarchy. 
*/ - Property LOCATION_CREATED_PROVINCE_OR_STATE = Property.internalText( - PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + - "LocationCreatedProvinceState"); + Property LOCATION_CREATED_PROVINCE_OR_STATE = Property + .internalText(PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "LocationCreatedProvinceState"); /** - * Name of a sublocation. This sublocation name could either be the name of - * a sublocation to a city or the name of a well known location or (natural) - * monument outside a city. In the sense of a sublocation to a city this - * element is at the fifth level of a top-down geographical hierarchy. + * Name of a sublocation. This sublocation name could either be the name of a sublocation to a + * city or the name of a well known location or (natural) monument outside a city. In the sense + * of a sublocation to a city this element is at the fifth level of a top-down geographical + * hierarchy. */ - Property LOCATION_CREATED_SUBLOCATION = Property.internalText( - PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + - "LocationCreatedSublocation"); + Property LOCATION_CREATED_SUBLOCATION = Property.internalText(PREFIX_IPTC_EXT + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "LocationCreatedSublocation"); /** - * The name of a world region of a location. This element is at the first - * (topI) level of a top- down geographical hierarchy. + * The name of a world region of a location. This element is at the first (topI) level of a top- + * down geographical hierarchy. */ - Property LOCATION_CREATED_WORLD_REGION = Property.internalText( - PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + - "LocationCreatedWorldRegion"); + Property LOCATION_CREATED_WORLD_REGION = Property.internalText(PREFIX_IPTC_EXT + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "LocationCreatedWorldRegion"); /** - * A unique identifier created by a registry and applied by the creator of - * the item. 
This value shall not be changed after being applied. This - * identifier is linked to a corresponding Registry Organisation Identifier. + * A unique identifier created by a registry and applied by the creator of the item. This value + * shall not be changed after being applied. This identifier is linked to a corresponding + * Registry Organisation Identifier. */ Property REGISTRY_ENTRY_CREATED_ITEM_ID = Property.internalTextBag( - PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "RegItemId"); + PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "RegItemId"); /** * An identifier for the registry which issued the corresponding Registry Image Id. */ Property REGISTRY_ENTRY_CREATED_ORGANISATION_ID = Property.internalTextBag( - PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "RegOrgId"); - - - Property[] PROPERTY_GROUP_IPTC_CORE = - new Property[]{CITY, COUNTRY, COUNTRY_CODE, DESCRIPTION, HEADLINE, INTELLECTUAL_GENRE, - KEYWORDS, PROVINCE_OR_STATE, SCENE_CODE, SUBJECT_CODE, SUBLOCATION, - DATE_CREATED, DESCRIPTION_WRITER, INSTRUCTIONS, JOB_ID, TITLE, COPYRIGHT_NOTICE, - CREATOR, CREATORS_JOB_TITLE, CREDIT_LINE, RIGHTS_USAGE_TERMS, SOURCE, - CONTACT_INFO_ADDRESS, CONTACT_INFO_CITY, CONTACT_INFO_COUNTRY, - CONTACT_INFO_EMAIL, CONTACT_INFO_PHONE, CONTACT_INFO_POSTAL_CODE, - CONTACT_INFO_STATE_PROVINCE, CONTACT_INFO_WEB_URL}; - - Property[] PROPERTY_GROUP_IPTC_EXT = - new Property[]{ADDITIONAL_MODEL_INFO, ORGANISATION_CODE, CONTROLLED_VOCABULARY_TERM, - MODEL_AGE, ORGANISATION_NAME, PERSON, DIGITAL_IMAGE_GUID, DIGITAL_SOURCE_TYPE, - EVENT, IMAGE_SUPPLIER_ID, IMAGE_SUPPLIER_NAME, IMAGE_SUPPLIER_IMAGE_ID, - IPTC_LAST_EDITED, MAX_AVAIL_HEIGHT, MAX_AVAIL_WIDTH, PLUS_VERSION, - COPYRIGHT_OWNER_ID, COPYRIGHT_OWNER_NAME, IMAGE_CREATOR_ID, IMAGE_CREATOR_NAME, - LICENSOR_ID, LICENSOR_NAME, LICENSOR_CITY, LICENSOR_COUNTRY, LICENSOR_EMAIL, + PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "RegOrgId"); + + + Property[] 
PROPERTY_GROUP_IPTC_CORE = new Property[] {CITY, COUNTRY, COUNTRY_CODE, DESCRIPTION, + HEADLINE, INTELLECTUAL_GENRE, KEYWORDS, PROVINCE_OR_STATE, SCENE_CODE, + SUBJECT_CODE, SUBLOCATION, DATE_CREATED, DESCRIPTION_WRITER, INSTRUCTIONS, + JOB_ID, TITLE, COPYRIGHT_NOTICE, CREATOR, CREATORS_JOB_TITLE, CREDIT_LINE, + RIGHTS_USAGE_TERMS, SOURCE, CONTACT_INFO_ADDRESS, CONTACT_INFO_CITY, + CONTACT_INFO_COUNTRY, CONTACT_INFO_EMAIL, CONTACT_INFO_PHONE, + CONTACT_INFO_POSTAL_CODE, CONTACT_INFO_STATE_PROVINCE, CONTACT_INFO_WEB_URL}; + + Property[] PROPERTY_GROUP_IPTC_EXT = new Property[] {ADDITIONAL_MODEL_INFO, ORGANISATION_CODE, + CONTROLLED_VOCABULARY_TERM, MODEL_AGE, ORGANISATION_NAME, PERSON, + DIGITAL_IMAGE_GUID, DIGITAL_SOURCE_TYPE, EVENT, IMAGE_SUPPLIER_ID, + IMAGE_SUPPLIER_NAME, IMAGE_SUPPLIER_IMAGE_ID, IPTC_LAST_EDITED, + MAX_AVAIL_HEIGHT, MAX_AVAIL_WIDTH, PLUS_VERSION, COPYRIGHT_OWNER_ID, + COPYRIGHT_OWNER_NAME, IMAGE_CREATOR_ID, IMAGE_CREATOR_NAME, LICENSOR_ID, + LICENSOR_NAME, LICENSOR_CITY, LICENSOR_COUNTRY, LICENSOR_EMAIL, LICENSOR_EXTENDED_ADDRESS, LICENSOR_POSTAL_CODE, LICENSOR_REGION, LICENSOR_STREET_ADDRESS, LICENSOR_TELEPHONE_1, LICENSOR_TELEPHONE_2, LICENSOR_URL, MINOR_MODEL_AGE_DISCLOSURE, MODEL_RELEASE_ID, diff --git a/tika-core/src/main/java/org/apache/tika/metadata/MAPI.java b/tika-core/src/main/java/org/apache/tika/metadata/MAPI.java index d52b673514..8dd7ec8b03 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/MAPI.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/MAPI.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.metadata; @@ -26,33 +24,38 @@ public interface MAPI { String PREFIX_MAPI_META = "mapi" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER; String PREFIX_MAPI_ATTACH_META = "mapi:attach" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER; - String PREFIX_MAPI_PROPERTY = PREFIX_MAPI_META + "property" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER; + String PREFIX_MAPI_PROPERTY = + PREFIX_MAPI_META + "property" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER; /** - * MAPI message class. What type of .msg/MAPI file is it? - * This is normalized via "mapi_message_classes.properties + * MAPI message class. What type of .msg/MAPI file is it? 
This is normalized via + * "mapi_message_classes.properties */ Property MESSAGE_CLASS = Property.internalText(PREFIX_MAPI_META + "message-class"); /** - * MAPI message class. What type of .msg/MAPI file is it? - * This is the raw value that is retrieved from the underlying chunk + * MAPI message class. What type of .msg/MAPI file is it? This is the raw value that is + * retrieved from the underlying chunk */ Property MESSAGE_CLASS_RAW = Property.internalText(PREFIX_MAPI_META + "message-class-raw"); Property SENT_BY_SERVER_TYPE = Property.internalText(PREFIX_MAPI_META + "sent-by-server-type"); - Property FROM_REPRESENTING_NAME = Property.internalText(PREFIX_MAPI_META + "from-representing-name"); + Property FROM_REPRESENTING_NAME = + Property.internalText(PREFIX_MAPI_META + "from-representing-name"); - Property FROM_REPRESENTING_EMAIL = Property.internalText(PREFIX_MAPI_META + "from-representing-email"); + Property FROM_REPRESENTING_EMAIL = + Property.internalText(PREFIX_MAPI_META + "from-representing-email"); - Property SUBMISSION_ACCEPTED_AT_TIME = Property.internalDate(PREFIX_MAPI_META + "msg-submission-accepted-at-time"); + Property SUBMISSION_ACCEPTED_AT_TIME = + Property.internalDate(PREFIX_MAPI_META + "msg-submission-accepted-at-time"); Property SUBMISSION_ID = Property.internalText(PREFIX_MAPI_META + "msg-submission-id"); Property INTERNET_MESSAGE_ID = Property.internalText(PREFIX_MAPI_META + "internet-message-id"); - Property INTERNET_REFERENCES = Property.internalTextBag(PREFIX_MAPI_META + "internet-references"); + Property INTERNET_REFERENCES = + Property.internalTextBag(PREFIX_MAPI_META + "internet-references"); Property CONVERSATION_TOPIC = Property.internalText(PREFIX_MAPI_META + "conversation-topic"); @@ -65,17 +68,20 @@ public interface MAPI { Property PRIORTY = Property.internalInteger(PREFIX_MAPI_META + "priority"); Property IS_FLAGGED = Property.internalBoolean(PREFIX_MAPI_META + "is-flagged"); - Property BODY_TYPES_PROCESSED = 
Property.internalTextBag(PREFIX_MAPI_META + "body-types-processed"); + Property BODY_TYPES_PROCESSED = + Property.internalTextBag(PREFIX_MAPI_META + "body-types-processed"); - Property ATTACH_LONG_PATH_NAME = Property.internalText(PREFIX_MAPI_ATTACH_META + "long-path-name"); - Property ATTACH_LONG_FILE_NAME = Property.internalText(PREFIX_MAPI_ATTACH_META + "long-file-name"); + Property ATTACH_LONG_PATH_NAME = + Property.internalText(PREFIX_MAPI_ATTACH_META + "long-path-name"); + Property ATTACH_LONG_FILE_NAME = + Property.internalText(PREFIX_MAPI_ATTACH_META + "long-file-name"); Property ATTACH_FILE_NAME = Property.internalText(PREFIX_MAPI_ATTACH_META + "file-name"); Property ATTACH_CONTENT_ID = Property.internalText(PREFIX_MAPI_ATTACH_META + "content-id"); - Property ATTACH_CONTENT_LOCATION = Property.internalText(PREFIX_MAPI_ATTACH_META + "content-location"); + Property ATTACH_CONTENT_LOCATION = + Property.internalText(PREFIX_MAPI_ATTACH_META + "content-location"); Property ATTACH_DISPLAY_NAME = Property.internalText(PREFIX_MAPI_ATTACH_META + "display-name"); Property ATTACH_EXTENSION = Property.internalText(PREFIX_MAPI_ATTACH_META + "extension"); Property ATTACH_MIME = Property.internalText(PREFIX_MAPI_ATTACH_META + "mime"); Property ATTACH_LANGUAGE = Property.internalText(PREFIX_MAPI_ATTACH_META + "language"); } - diff --git a/tika-core/src/main/java/org/apache/tika/metadata/MachineMetadata.java b/tika-core/src/main/java/org/apache/tika/metadata/MachineMetadata.java index 44faa14bc7..65b1212ed0 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/MachineMetadata.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/MachineMetadata.java @@ -1,30 +1,27 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.metadata; /** - * Metadata for describing machines, such as their - * architecture, type and endian-ness + * Metadata for describing machines, such as their architecture, type and endian-ness */ public interface MachineMetadata { String PREFIX = "machine:"; - Property ARCHITECTURE_BITS = - Property.internalClosedChoise(PREFIX + "architectureBits", "8", "16", "32", "64"); + Property ARCHITECTURE_BITS = Property.internalClosedChoise(PREFIX + "architectureBits", "8", + "16", "32", "64"); String PLATFORM_SYSV = "System V"; String PLATFORM_HPUX = "HP-UX"; @@ -39,11 +36,10 @@ public interface MachineMetadata { String PLATFORM_EMBEDDED = "Embedded"; // Stand-alone (embedded) ABI String PLATFORM_WINDOWS = "Windows"; - Property PLATFORM = - Property.internalClosedChoise(PREFIX + "platform", PLATFORM_SYSV, PLATFORM_HPUX, - PLATFORM_NETBSD, PLATFORM_LINUX, PLATFORM_SOLARIS, PLATFORM_AIX, PLATFORM_IRIX, - PLATFORM_FREEBSD, PLATFORM_TRU64, PLATFORM_ARM, PLATFORM_EMBEDDED, - PLATFORM_WINDOWS); + Property PLATFORM = Property.internalClosedChoise(PREFIX + "platform", PLATFORM_SYSV, + PLATFORM_HPUX, PLATFORM_NETBSD, PLATFORM_LINUX, PLATFORM_SOLARIS, PLATFORM_AIX, + PLATFORM_IRIX, PLATFORM_FREEBSD, PLATFORM_TRU64, PLATFORM_ARM, + PLATFORM_EMBEDDED, PLATFORM_WINDOWS); String MACHINE_x86_32 = "x86-32"; String MACHINE_x86_64 = "x86-64"; @@ -65,14 +61,13 @@ public interface MachineMetadata { String MACHINE_SH5 = "SH5"; String MACHINE_UNKNOWN = "Unknown"; - Property MACHINE_TYPE = - Property.internalClosedChoise(PREFIX + "machineType", MACHINE_x86_32, MACHINE_x86_64, - MACHINE_IA_64, MACHINE_SPARC, MACHINE_M68K, MACHINE_M88K, MACHINE_MIPS, - MACHINE_PPC, MACHINE_S370, MACHINE_S390, MACHINE_ARM, MACHINE_VAX, + Property MACHINE_TYPE = Property.internalClosedChoise(PREFIX + "machineType", MACHINE_x86_32, + MACHINE_x86_64, MACHINE_IA_64, MACHINE_SPARC, MACHINE_M68K, MACHINE_M88K, + MACHINE_MIPS, MACHINE_PPC, MACHINE_S370, MACHINE_S390, MACHINE_ARM, MACHINE_VAX, 
MACHINE_ALPHA, MACHINE_EFI, MACHINE_M32R, MACHINE_SH3, MACHINE_SH4, MACHINE_SH5, MACHINE_UNKNOWN); - Property ENDIAN = - Property.internalClosedChoise(PREFIX + "endian", Endian.LITTLE.name, Endian.BIG.name); + Property ENDIAN = Property.internalClosedChoise(PREFIX + "endian", Endian.LITTLE.name, + Endian.BIG.name); final class Endian { public static final Endian LITTLE = new Endian("Little", false); diff --git a/tika-core/src/main/java/org/apache/tika/metadata/Message.java b/tika-core/src/main/java/org/apache/tika/metadata/Message.java index fcb1421f3c..4ce8eb59e5 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/Message.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/Message.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
+ * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.metadata; @@ -25,7 +23,7 @@ public interface Message { String MESSAGE_PREFIX = "Message" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER; String MESSAGE_RAW_HEADER_PREFIX = - MESSAGE_PREFIX + "Raw-Header" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER; + MESSAGE_PREFIX + "Raw-Header" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER; String MESSAGE_RECIPIENT_ADDRESS = "Message-Recipient-Address"; @@ -42,51 +40,51 @@ public interface Message { String MULTIPART_BOUNDARY = "Multipart-Boundary"; /** - * Where possible, this records the value from the name field. - * Even in MAPI messages, though, this can be an email address. + * Where possible, this records the value from the name field. Even in MAPI messages, though, + * this can be an email address. */ Property MESSAGE_FROM_NAME = Property.internalTextBag(MESSAGE_PREFIX + "From-Name"); /** - * Where possible, this records the value from the name field. - * Even in MAPI messages, though, this can be a name. + * Where possible, this records the value from the name field. Even in MAPI messages, though, + * this can be a name. *

- * Note that the value may also be an X400/x500 Exchange format: - * /o=ExchangeLabs/ou=Exchange Administrative Group/cn=Recipients/cn=someone.or.other + * Note that the value may also be an X400/x500 Exchange format: /o=ExchangeLabs/ou=Exchange + * Administrative Group/cn=Recipients/cn=someone.or.other */ Property MESSAGE_FROM_EMAIL = Property.internalTextBag(MESSAGE_PREFIX + "From-Email"); /** - * In Outlook messages, there are sometimes separate fields for "to-name" and - * "to-display-name" name. + * In Outlook messages, there are sometimes separate fields for "to-name" and "to-display-name" + * name. */ Property MESSAGE_TO_NAME = Property.internalTextBag(MESSAGE_PREFIX + "To-Name"); Property MESSAGE_TO_DISPLAY_NAME = Property.internalTextBag(MESSAGE_PREFIX + "To-Display-Name"); /** - * Where possible, this records the email value in the to field. - * Even in MAPI messages, though, this can be a name. + * Where possible, this records the email value in the to field. Even in MAPI messages, though, + * this can be a name. *

- * Note that the value may also be an X400/x500 Exchange format: - * /o=ExchangeLabs/ou=Exchange Administrative Group/cn=Recipients/cn=someone.or.other + * Note that the value may also be an X400/x500 Exchange format: /o=ExchangeLabs/ou=Exchange + * Administrative Group/cn=Recipients/cn=someone.or.other */ Property MESSAGE_TO_EMAIL = Property.internalTextBag(MESSAGE_PREFIX + "To-Email"); /** - * In Outlook messages, there are sometimes separate fields for "cc-name" and - * "cc-display-name" name. + * In Outlook messages, there are sometimes separate fields for "cc-name" and "cc-display-name" + * name. */ Property MESSAGE_CC_NAME = Property.internalTextBag(MESSAGE_PREFIX + "CC-Name"); Property MESSAGE_CC_DISPLAY_NAME = Property.internalTextBag(MESSAGE_PREFIX + "CC-Display-Name"); /** - * Where possible, this records the email value in the cc field. - * Even in MAPI messages, though, this can be a name. + * Where possible, this records the email value in the cc field. Even in MAPI messages, though, + * this can be a name. *

- * Note that the value may also be an X400/x500 Exchange format: - * /o=ExchangeLabs/ou=Exchange Administrative Group/cn=Recipients/cn=someone.or.other + * Note that the value may also be an X400/x500 Exchange format: /o=ExchangeLabs/ou=Exchange + * Administrative Group/cn=Recipients/cn=someone.or.other */ Property MESSAGE_CC_EMAIL = Property.internalTextBag(MESSAGE_PREFIX + "CC-Email"); @@ -97,14 +95,14 @@ public interface Message { Property MESSAGE_BCC_NAME = Property.internalTextBag(MESSAGE_PREFIX + "BCC-Name"); Property MESSAGE_BCC_DISPLAY_NAME = - Property.internalTextBag(MESSAGE_PREFIX + "BCC-Display-Name"); + Property.internalTextBag(MESSAGE_PREFIX + "BCC-Display-Name"); /** - * Where possible, this records the email value in the bcc field. - * Even in MAPI messages, though, this can be a name. + * Where possible, this records the email value in the bcc field. Even in MAPI messages, though, + * this can be a name. *

- * Note that the value may also be an X400/x500 Exchange format: - * /o=ExchangeLabs/ou=Exchange Administrative Group/cn=Recipients/cn=someone.or.other + * Note that the value may also be an X400/x500 Exchange format: /o=ExchangeLabs/ou=Exchange + * Administrative Group/cn=Recipients/cn=someone.or.other */ Property MESSAGE_BCC_EMAIL = Property.internalTextBag(MESSAGE_PREFIX + "BCC-Email"); diff --git a/tika-core/src/main/java/org/apache/tika/metadata/Metadata.java b/tika-core/src/main/java/org/apache/tika/metadata/Metadata.java index a5f1a2390f..9f1fde1b8e 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/Metadata.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/Metadata.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
+ * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.metadata; @@ -33,7 +31,6 @@ import java.util.Objects; import java.util.Properties; import java.util.TimeZone; - import org.apache.tika.metadata.Property.PropertyType; import org.apache.tika.metadata.writefilter.MetadataWriteFilter; import org.apache.tika.utils.DateUtils; @@ -41,15 +38,14 @@ /** * A multi-valued metadata container. */ -public class Metadata - implements CreativeCommons, Geographic, HttpHeaders, Message, ClimateForcast, TIFF, - TikaMimeKeys, Serializable { +public class Metadata implements CreativeCommons, Geographic, HttpHeaders, Message, ClimateForcast, + TIFF, TikaMimeKeys, Serializable { private static final MetadataWriteFilter ACCEPT_ALL = new MetadataWriteFilter() { @Override public void filterExisting(Map data) { - //no-op + // no-op } @Override @@ -62,11 +58,11 @@ public void add(String field, String value, Map data) { } } - //legacy behavior -- remove the field if value is null + // legacy behavior -- remove the field if value is null @Override public void set(String field, String value, Map data) { if (value != null) { - data.put(field, new String[]{ value }); + data.put(field, new String[] {value}); } else { data.remove(field); } @@ -88,8 +84,8 @@ private String[] appendValues(String[] values, final String value) { */ private static final long serialVersionUID = 5623926545693153182L; /** - * Some parsers will have the date as a ISO-8601 string - * already, and will set that into the Metadata object. + * Some parsers will have the date as a ISO-8601 string already, and will set that into the + * Metadata object. 
*/ private static final DateUtils DATE_UTILS = new DateUtils(); /** @@ -99,6 +95,7 @@ private String[] appendValues(String[] values, final String value) { private MetadataWriteFilter writeFilter = ACCEPT_ALL; + /** * Constructs a new, empty metadata. */ @@ -115,8 +112,8 @@ private static DateFormat createDateFormat(String format, TimeZone timezone) { } /** - * Parses the given date string. This method is synchronized to prevent - * concurrent access to the thread-unsafe date formats. + * Parses the given date string. This method is synchronized to prevent concurrent access to the + * thread-unsafe date formats. * * @param date date string * @return parsed date, or null if the date can't be parsed @@ -133,8 +130,8 @@ private static synchronized Date parseDate(String date) { * @return true is named value is multivalued, false if single value or null */ public boolean isMultiValued(final Property property) { - return metadata.get(property.getName()) != null && - metadata.get(property.getName()).length > 1; + return metadata.get(property.getName()) != null + && metadata.get(property.getName()).length > 1; } /** @@ -174,12 +171,11 @@ public String get(final String name) { /** * Sets the writeFilter that is called before {@link #set(String, String)} - * {@link #set(String, String[])}, {@link #add(String, String)}, - * {@link #add(String, String[])}. The default is {@link #ACCEPT_ALL}. + * {@link #set(String, String[])}, {@link #add(String, String)}, {@link #add(String, String[])}. + * The default is {@link #ACCEPT_ALL}. * - * This is intended for expert use only. Some parsers rely on metadata - * during the parse, and if the metadata they need is excluded, they - * will not function properly. + * This is intended for expert use only. Some parsers rely on metadata during the parse, and if + * the metadata they need is excluded, they will not function properly. 
* * @param writeFilter * @since 2.4.0 @@ -206,8 +202,8 @@ public String get(Property property) { * associated to the specified property, then the first one is returned. * * @param property simple integer property definition - * @return property value as a Integer, or null if the property is not set, or - * not a valid Integer + * @return property value as a Integer, or null if the property is not set, or not + * a valid Integer * @since Apache Tika 0.8 */ public Integer getInt(Property property) { @@ -234,8 +230,8 @@ public Integer getInt(Property property) { * associated to the specified property, then the first one is returned. * * @param property simple date property definition - * @return property value as a Date, or null if the property is not set, or not - * a valid Date + * @return property value as a Date, or null if the property is not set, or not a + * valid Date * @since Apache Tika 0.8 */ public Date getDate(Property property) { @@ -283,10 +279,10 @@ private String[] _getValues(final String name) { } /** - * Add a metadata name/value mapping. Add the specified value to the list of - * values associated to the specified metadata name. + * Add a metadata name/value mapping. Add the specified value to the list of values associated + * to the specified metadata name. * - * @param name the metadata name. + * @param name the metadata name. * @param value the metadata value. */ public void add(final String name, final String value) { @@ -294,10 +290,10 @@ public void add(final String name, final String value) { } /** - * Add a metadata name/value mapping. Add the specified value to the list of - * values associated to the specified metadata name. + * Add a metadata name/value mapping. Add the specified value to the list of values associated + * to the specified metadata name. * - * @param name the metadata name. + * @param name the metadata name. 
* @param newValues the metadata values */ protected void add(final String name, final String[] newValues) { @@ -312,11 +308,11 @@ protected void add(final String name, final String[] newValues) { } /** - * Add a metadata property/value mapping. Add the specified value to the list of - * values associated to the specified metadata property. + * Add a metadata property/value mapping. Add the specified value to the list of values + * associated to the specified metadata property. * * @param property the metadata property. - * @param value the metadata value. + * @param value the metadata value. */ public void add(final Property property, final String value) { @@ -340,7 +336,7 @@ public void add(final Property property, final String value) { add(property.getName(), value); } else { throw new PropertyTypeException( - property.getName() + " : " + property.getPropertyType()); + property.getName() + " : " + property.getPropertyType()); } } } @@ -356,17 +352,16 @@ public void setAll(Properties properties) { Enumeration names = (Enumeration) properties.propertyNames(); while (names.hasMoreElements()) { String name = names.nextElement(); - metadata.put(name, new String[]{properties.getProperty(name)}); + metadata.put(name, new String[] {properties.getProperty(name)}); } } /** - * Set metadata name/value. Associate the specified value to the specified - * metadata name. If some previous values were associated to this name, - * they are removed. If the given value is null, then the - * metadata entry is removed. + * Set metadata name/value. Associate the specified value to the specified metadata name. If + * some previous values were associated to this name, they are removed. If the given value is + * null, then the metadata entry is removed. * - * @param name the metadata name. + * @param name the metadata name. 
* @param value the metadata value, or null */ public void set(String name, String value) { @@ -374,8 +369,8 @@ public void set(String name, String value) { } protected void set(String name, String[] values) { - //TODO: optimize this to not copy if all - //values are to be included "as is" + // TODO: optimize this to not copy if all + // values are to be included "as is" if (values != null) { metadata.remove(name); for (String v : values) { @@ -390,7 +385,7 @@ protected void set(String name, String[] values) { * Sets the value of the identified metadata property. * * @param property property definition - * @param value property value + * @param value property value * @since Apache Tika 0.7 */ public void set(Property property, String value) { @@ -413,7 +408,7 @@ public void set(Property property, String value) { * Sets the values of the identified metadata property. * * @param property property definition - * @param values property values + * @param values property values * @since Apache Tika 1.2 */ public void set(Property property, String[] values) { @@ -436,17 +431,17 @@ public void set(Property property, String[] values) { * Sets the integer value of the identified metadata property. 
* * @param property simple integer property definition - * @param value property value + * @param value property value * @since Apache Tika 0.8 */ public void set(Property property, int value) { if (property.getPrimaryProperty().getPropertyType() != Property.PropertyType.SIMPLE) { throw new PropertyTypeException(Property.PropertyType.SIMPLE, - property.getPrimaryProperty().getPropertyType()); + property.getPrimaryProperty().getPropertyType()); } if (property.getPrimaryProperty().getValueType() != Property.ValueType.INTEGER) { throw new PropertyTypeException(Property.ValueType.INTEGER, - property.getPrimaryProperty().getValueType()); + property.getPrimaryProperty().getValueType()); } set(property, Integer.toString(value)); } @@ -455,35 +450,36 @@ public void set(Property property, int value) { * Sets the integer value of the identified metadata property. * * @param property simple integer property definition - * @param value property value + * @param value property value * @since Apache Tika 0.8 */ public void set(Property property, long value) { if (property.getPrimaryProperty().getPropertyType() != Property.PropertyType.SIMPLE) { throw new PropertyTypeException(Property.PropertyType.SIMPLE, - property.getPrimaryProperty().getPropertyType()); + property.getPrimaryProperty().getPropertyType()); } if (property.getPrimaryProperty().getValueType() != Property.ValueType.REAL) { throw new PropertyTypeException(Property.ValueType.REAL, - property.getPrimaryProperty().getValueType()); + property.getPrimaryProperty().getValueType()); } set(property, Long.toString(value)); } + /** * Sets the integer value of the identified metadata property. 
* * @param property simple integer property definition - * @param value property value + * @param value property value * @since Apache Tika 2.1.1 */ public void set(Property property, boolean value) { if (property.getPrimaryProperty().getPropertyType() != Property.PropertyType.SIMPLE) { throw new PropertyTypeException(Property.PropertyType.SIMPLE, - property.getPrimaryProperty().getPropertyType()); + property.getPrimaryProperty().getPropertyType()); } if (property.getPrimaryProperty().getValueType() != Property.ValueType.BOOLEAN) { throw new PropertyTypeException(Property.ValueType.BOOLEAN, - property.getPrimaryProperty().getValueType()); + property.getPrimaryProperty().getValueType()); } set(property, Boolean.toString(value)); } @@ -492,17 +488,17 @@ public void set(Property property, boolean value) { * Adds the integer value of the identified metadata property. * * @param property seq integer property definition - * @param value property value + * @param value property value * @since Apache Tika 1.21 */ public void add(Property property, int value) { if (property.getPrimaryProperty().getPropertyType() != PropertyType.SEQ) { throw new PropertyTypeException(PropertyType.SEQ, - property.getPrimaryProperty().getPropertyType()); + property.getPrimaryProperty().getPropertyType()); } if (property.getPrimaryProperty().getValueType() != Property.ValueType.INTEGER) { throw new PropertyTypeException(Property.ValueType.INTEGER, - property.getPrimaryProperty().getValueType()); + property.getPrimaryProperty().getValueType()); } add(property, Integer.toString(value)); } @@ -517,11 +513,11 @@ public void add(Property property, int value) { public int[] getIntValues(Property property) { if (property.getPrimaryProperty().getPropertyType() != PropertyType.SEQ) { throw new PropertyTypeException(PropertyType.SEQ, - property.getPrimaryProperty().getPropertyType()); + property.getPrimaryProperty().getPropertyType()); } if (property.getPrimaryProperty().getValueType() != 
Property.ValueType.INTEGER) { throw new PropertyTypeException(Property.ValueType.INTEGER, - property.getPrimaryProperty().getValueType()); + property.getPrimaryProperty().getValueType()); } String[] vals = getValues(property); int[] ret = new int[vals.length]; @@ -541,11 +537,11 @@ public int[] getIntValues(Property property) { public long[] getLongValues(Property property) { if (property.getPrimaryProperty().getPropertyType() != PropertyType.SEQ) { throw new PropertyTypeException(PropertyType.SEQ, - property.getPrimaryProperty().getPropertyType()); + property.getPrimaryProperty().getPropertyType()); } if (property.getPrimaryProperty().getValueType() != Property.ValueType.REAL) { throw new PropertyTypeException(Property.ValueType.REAL, - property.getPrimaryProperty().getValueType()); + property.getPrimaryProperty().getValueType()); } String[] vals = getValues(property); long[] ret = new long[vals.length]; @@ -559,14 +555,14 @@ public long[] getLongValues(Property property) { * Sets the real or rational value of the identified metadata property. * * @param property simple real or simple rational property definition - * @param value property value + * @param value property value * @since Apache Tika 0.8 */ public void set(Property property, double value) { - if (property.getPrimaryProperty().getValueType() != Property.ValueType.REAL && - property.getPrimaryProperty().getValueType() != Property.ValueType.RATIONAL) { + if (property.getPrimaryProperty().getValueType() != Property.ValueType.REAL && property + .getPrimaryProperty().getValueType() != Property.ValueType.RATIONAL) { throw new PropertyTypeException(Property.ValueType.REAL, - property.getPrimaryProperty().getValueType()); + property.getPrimaryProperty().getValueType()); } set(property, Double.toString(value)); } @@ -575,17 +571,17 @@ public void set(Property property, double value) { * Sets the date value of the identified metadata property. 
* * @param property simple integer property definition - * @param date property value + * @param date property value * @since Apache Tika 0.8 */ public void set(Property property, Date date) { if (property.getPrimaryProperty().getPropertyType() != Property.PropertyType.SIMPLE) { throw new PropertyTypeException(Property.PropertyType.SIMPLE, - property.getPrimaryProperty().getPropertyType()); + property.getPrimaryProperty().getPropertyType()); } if (property.getPrimaryProperty().getValueType() != Property.ValueType.DATE) { throw new PropertyTypeException(Property.ValueType.DATE, - property.getPrimaryProperty().getValueType()); + property.getPrimaryProperty().getValueType()); } String dateString = null; if (date != null) { @@ -598,17 +594,17 @@ public void set(Property property, Date date) { * Sets the date value of the identified metadata property. * * @param property simple integer property definition - * @param date property value + * @param date property value * @since Apache Tika 0.8 */ public void set(Property property, Calendar date) { if (property.getPrimaryProperty().getPropertyType() != Property.PropertyType.SIMPLE) { throw new PropertyTypeException(Property.PropertyType.SIMPLE, - property.getPrimaryProperty().getPropertyType()); + property.getPrimaryProperty().getPropertyType()); } if (property.getPrimaryProperty().getValueType() != Property.ValueType.DATE) { throw new PropertyTypeException(Property.ValueType.DATE, - property.getPrimaryProperty().getValueType()); + property.getPrimaryProperty().getValueType()); } String dateString = null; if (date != null) { @@ -621,13 +617,13 @@ public void set(Property property, Calendar date) { * Adds the date value of the identified metadata property. 
* * @param property simple calendar property definition - * @param date property value + * @param date property value * @since Apache Tika 2.5.0 */ public void add(Property property, Calendar date) { if (property.getPrimaryProperty().getValueType() != Property.ValueType.DATE) { throw new PropertyTypeException(Property.ValueType.DATE, - property.getPrimaryProperty().getValueType()); + property.getPrimaryProperty().getValueType()); } String dateString = null; if (date != null) { diff --git a/tika-core/src/main/java/org/apache/tika/metadata/Office.java b/tika-core/src/main/java/org/apache/tika/metadata/Office.java index 477ffef140..1cd8456e0f 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/Office.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/Office.java @@ -1,27 +1,23 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.metadata; /** - * Office Document properties collection. These properties apply to - * Office / Productivity Documents of all forms, including (but not limited - * to) MS Office and OpenDocument formats. - * This is a logical collection of properties, which may be drawn from a - * few different external definitions. + * Office Document properties collection. These properties apply to Office / Productivity Documents + * of all forms, including (but not limited to) MS Office and OpenDocument formats. This is a + * logical collection of properties, which may be drawn from a few different external definitions. * * @since Apache Tika 1.2 */ @@ -31,9 +27,9 @@ public interface Office { String PREFIX_DOC_META = "meta"; /** - * For user defined metadata entries in the document, - * what prefix should be attached to the key names. - * eg Text1 becomes custom:Info1=Text1 + * For user defined metadata entries in the document, what prefix should be attached to the key + * names. eg Text1 becomes + * custom:Info1=Text1 */ String USER_DEFINED_METADATA_NAME_PREFIX = "custom:"; @@ -41,113 +37,114 @@ public interface Office { /** * Keywords pertaining to a document. Also populates {@link DublinCore#SUBJECT}. 
*/ - Property KEYWORDS = Property.composite(Property.internalTextBag( - PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "keyword"), - new Property[]{DublinCore.SUBJECT,}); + Property KEYWORDS = Property.composite( + Property.internalTextBag(PREFIX_DOC_META + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "keyword"), + new Property[] {DublinCore.SUBJECT,}); /** * Name of the initial creator/author of a document */ - Property INITIAL_AUTHOR = Property.internalText( - PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "initial-author"); + Property INITIAL_AUTHOR = Property.internalText(PREFIX_DOC_META + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "initial-author"); /** * Name of the last (most recent) author of a document */ - Property LAST_AUTHOR = Property.internalText( - PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "last-author"); + Property LAST_AUTHOR = Property.internalText(PREFIX_DOC_META + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "last-author"); /** * Name of the principal author(s) of a document */ Property AUTHOR = Property.internalTextBag( - PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "author"); + PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "author"); /** * When was the document created? */ - Property CREATION_DATE = Property.internalDate( - PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "creation-date"); + Property CREATION_DATE = Property.internalDate(PREFIX_DOC_META + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "creation-date"); /** * When was the document last saved? */ Property SAVE_DATE = Property.internalDate( - PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "save-date"); + PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "save-date"); /** * When was the document last printed? 
*/ Property PRINT_DATE = Property.internalDate( - PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "print-date"); + PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "print-date"); /** * The number of Slides are there in the (presentation) document */ - Property SLIDE_COUNT = Property.internalInteger( - PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "slide-count"); + Property SLIDE_COUNT = Property.internalInteger(PREFIX_DOC_META + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "slide-count"); /** * The number of Pages are there in the (paged) document */ Property PAGE_COUNT = Property.internalInteger( - PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "page-count"); + PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "page-count"); /** * The number of individual Paragraphs in the document */ - Property PARAGRAPH_COUNT = Property.internalInteger( - PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "paragraph-count"); + Property PARAGRAPH_COUNT = Property.internalInteger(PREFIX_DOC_META + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "paragraph-count"); /** * The number of lines in the document */ Property LINE_COUNT = Property.internalInteger( - PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "line-count"); + PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "line-count"); /** * The number of Words in the document */ Property WORD_COUNT = Property.internalInteger( - PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "word-count"); + PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "word-count"); /** * The number of Characters in the document */ - Property CHARACTER_COUNT = Property.internalInteger( - PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "character-count"); + Property CHARACTER_COUNT = Property.internalInteger(PREFIX_DOC_META + + 
TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "character-count"); /** * The number of Characters in the document, including spaces */ - Property CHARACTER_COUNT_WITH_SPACES = Property.internalInteger( - PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + - "character-count-with-spaces"); + Property CHARACTER_COUNT_WITH_SPACES = Property + .internalInteger(PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + + "character-count-with-spaces"); /** * The number of Tables in the document */ - Property TABLE_COUNT = Property.internalInteger( - PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "table-count"); + Property TABLE_COUNT = Property.internalInteger(PREFIX_DOC_META + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "table-count"); /** * The number of Images in the document */ - Property IMAGE_COUNT = Property.internalInteger( - PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "image-count"); + Property IMAGE_COUNT = Property.internalInteger(PREFIX_DOC_META + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "image-count"); /** - * The number of Objects in the document. These are typically non-Image resources - * embedded in the document, such as other documents or non-Image media. + * The number of Objects in the document. These are typically non-Image resources embedded in + * the document, such as other documents or non-Image media. 
*/ - Property OBJECT_COUNT = Property.internalInteger( - PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "object-count"); + Property OBJECT_COUNT = Property.internalInteger(PREFIX_DOC_META + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "object-count"); /** - * Embedded files may have a "progID" associated with them, such as - * Word.Document.12 or AcroExch.Document.DC + * Embedded files may have a "progID" associated with them, such as Word.Document.12 or + * AcroExch.Document.DC */ Property PROG_ID = Property.internalText("msoffice:progID"); @@ -161,11 +158,13 @@ public interface Office { Property HAS_HIDDEN_ROWS = Property.internalBoolean("msoffice:excel:has-hidden-rows"); - Property HAS_VERY_HIDDEN_SHEETS = Property.internalBoolean("msoffice:excel:has-very-hidden-sheets"); + Property HAS_VERY_HIDDEN_SHEETS = + Property.internalBoolean("msoffice:excel:has-very-hidden-sheets"); Property HIDDEN_SHEET_NAMES = Property.internalTextBag("msoffice:excel:hidden-sheet-names"); - Property VERY_HIDDEN_SHEET_NAMES = Property.internalTextBag("msoffice:excel:very-hidden-sheet-names"); + Property VERY_HIDDEN_SHEET_NAMES = + Property.internalTextBag("msoffice:excel:very-hidden-sheet-names"); Property PROTECTED_WORKSHEET = Property.internalBoolean("msoffice:excel:protected-worksheet"); @@ -181,7 +180,7 @@ public interface Office { Property HAS_ANIMATIONS = Property.internalBoolean("msoffice:ppt:has-animations"); - //w:vanish or isVanish or isFldVanish + // w:vanish or isVanish or isFldVanish Property HAS_HIDDEN_TEXT = Property.internalBoolean("msoffice:doc:has-hidden-text"); Property HAS_TRACK_CHANGES = Property.internalBoolean("msoffice:has-track-changes"); diff --git a/tika-core/src/main/java/org/apache/tika/metadata/OfficeOpenXMLCore.java b/tika-core/src/main/java/org/apache/tika/metadata/OfficeOpenXMLCore.java index 1259719e16..4c3e763d21 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/OfficeOpenXMLCore.java +++ 
b/tika-core/src/main/java/org/apache/tika/metadata/OfficeOpenXMLCore.java @@ -1,78 +1,76 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.metadata; /** - * Core properties as defined in the Office Open XML specification part Two that are not - * in the DublinCore namespace. 
- * There is also a keyword property definition in the specification which is omitted here, - * because Tika should stick to the DublinCore/IPTC definition. + * Core properties as defined in the Office Open XML specification part Two that are not in the + * DublinCore namespace. There is also a keyword property definition in the specification which is + * omitted here, because Tika should stick to the DublinCore/IPTC definition. * - * @see ISO document of Office Open XML specification - * @see ECMA document of Office Open XML specification + * @see ISO + * document of Office Open XML specification + * @see ECMA + * document of Office Open XML specification */ public interface OfficeOpenXMLCore { String NAMESPACE_URI = - "http://schemas.openxmlformats.org/package/2006/metadata/core-properties/"; + "http://schemas.openxmlformats.org/package/2006/metadata/core-properties/"; String PREFIX = "cp"; /** * A categorization of the content of this package. */ Property CATEGORY = Property.externalText( - PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "category"); + PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "category"); /** * The status of the content. */ Property CONTENT_STATUS = Property.externalText( - PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "contentStatus"); + PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "contentStatus"); /** * The user who performed the last modification. The identification is environment-specific. */ Property LAST_MODIFIED_BY = Property.externalText( - PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "lastModifiedBy"); + PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "lastModifiedBy"); /** * The date and time of the last printing. */ Property LAST_PRINTED = Property.externalDate( - PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "lastPrinted"); + PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "lastPrinted"); /** * The revision number. 
*/ Property REVISION = Property.externalText( - PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "revision"); + PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "revision"); /** * The version number. This value is set by the user or by the application. */ Property VERSION = Property.externalText( - PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "version"); + PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "version"); /** - * The document's subject. Also populates {@link DublinCore#SUBJECT} + * The document's subject. Also populates {@link DublinCore#SUBJECT} */ @Deprecated Property SUBJECT = Property.composite(Property.externalText( - PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "subject"), - new Property[]{DublinCore.SUBJECT,}); + PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "subject"), + new Property[] {DublinCore.SUBJECT,}); } diff --git a/tika-core/src/main/java/org/apache/tika/metadata/OfficeOpenXMLExtended.java b/tika-core/src/main/java/org/apache/tika/metadata/OfficeOpenXMLExtended.java index 6919c216b6..bc64330168 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/OfficeOpenXMLExtended.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/OfficeOpenXMLExtended.java @@ -1,38 +1,36 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. 
The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.metadata; /** - * Extended properties as defined in the Office Open XML specification part Four. - * Those properties are omitted which have equivalent properties defined in the ODF - * namespace like "word count". - * Also not all properties from the specification are defined here, yet. Only those which have - * been in use by the parsers so far. + * Extended properties as defined in the Office Open XML specification part Four. Those properties + * are omitted which have equivalent properties defined in the ODF namespace like "word count". Also + * not all properties from the specification are defined here, yet. Only those which have been in + * use by the parsers so far. 
* - * @see ISO document of Office Open XML specification - * @see ECMA document of Office Open XML specification + * @see ISO + * document of Office Open XML specification + * @see ECMA + * document of Office Open XML specification */ public interface OfficeOpenXMLExtended { String NAMESPACE_URI = - "http://schemas.openxmlformats.org/officeDocument/2006/extended-properties/"; + "http://schemas.openxmlformats.org/officeDocument/2006/extended-properties/"; String WORD_PROCESSING_NAMESPACE_URI = - "http://schemas.openxmlformats.org/wordprocessingml/2006/main"; + "http://schemas.openxmlformats.org/wordprocessingml/2006/main"; String PREFIX = "extended-properties"; String WORD_PROCESSING_PREFIX = "w"; String SECURITY_NONE = "None"; @@ -43,41 +41,41 @@ public interface OfficeOpenXMLExtended { String SECURITY_UNKNOWN = "Unknown"; Property TEMPLATE = Property.externalText( - PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Template"); + PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Template"); Property MANAGER = Property.externalTextBag( - PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Manager"); + PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Manager"); Property COMPANY = Property.externalText( - PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Company"); + PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Company"); Property PRESENTATION_FORMAT = Property.externalText( - PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "PresentationFormat"); + PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "PresentationFormat"); Property NOTES = Property.externalInteger( - PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Notes"); + PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Notes"); Property TOTAL_TIME = Property.externalInteger( - PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "TotalTime"); + PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "TotalTime"); Property 
HIDDEN_SLIDES = Property.externalInteger( - PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "HiddedSlides"); + PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "HiddedSlides"); Property APPLICATION = Property.externalText( - PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Application"); + PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Application"); Property APP_VERSION = Property.externalText( - PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "AppVersion"); - //Integer flag + PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "AppVersion"); + // Integer flag Property DOC_SECURITY = Property.externalInteger( - PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "DocSecurity"); + PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "DocSecurity"); - //Human readable string explaining doc security flag + // Human readable string explaining doc security flag Property DOC_SECURITY_STRING = Property.externalClosedChoise( - PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "DocSecurityString", - SECURITY_NONE, SECURITY_PASSWORD_PROTECTED, SECURITY_READ_ONLY_RECOMMENDED, - SECURITY_READ_ONLY_ENFORCED, SECURITY_LOCKED_FOR_ANNOTATIONS, SECURITY_UNKNOWN); + PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "DocSecurityString", + SECURITY_NONE, SECURITY_PASSWORD_PROTECTED, SECURITY_READ_ONLY_RECOMMENDED, + SECURITY_READ_ONLY_ENFORCED, SECURITY_LOCKED_FOR_ANNOTATIONS, SECURITY_UNKNOWN); - Property COMMENTS = Property.externalTextBag( - WORD_PROCESSING_PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Comments"); + Property COMMENTS = Property.externalTextBag(WORD_PROCESSING_PREFIX + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Comments"); } diff --git a/tika-core/src/main/java/org/apache/tika/metadata/PDF.java b/tika-core/src/main/java/org/apache/tika/metadata/PDF.java index f852189365..fd5810b30d 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/PDF.java +++ 
b/tika-core/src/main/java/org/apache/tika/metadata/PDF.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.metadata; @@ -29,24 +27,23 @@ public interface PDF { /** - * Number of %%EOF as extracted by the StartXRefScanner. See - * that class for limitations. + * Number of %%EOF as extracted by the StartXRefScanner. See that class for limitations. 
* - * This includes the final %%EOF, which may or may not be at the literal - * end of the file. This does not include an %%EOF - * if the startxref=0, as would happen in a dummy %%EOF in a linearized PDF. + * This includes the final %%EOF, which may or may not be at the literal end of the file. This + * does not include an %%EOF if the startxref=0, as would happen in a dummy %%EOF in a + * linearized PDF. */ Property EOF_OFFSETS = Property.externalRealSeq(PDF_PREFIX + "eofOffsets"); /** - * Prefix to be used for properties that record what was stored - * in the docinfo section (as opposed to XMP) + * Prefix to be used for properties that record what was stored in the docinfo section (as + * opposed to XMP) */ String PDF_DOC_INFO_PREFIX = - PDF_PREFIX + "docinfo" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER; + PDF_PREFIX + "docinfo" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER; String PDF_DOC_INFO_CUSTOM_PREFIX = - PDF_DOC_INFO_PREFIX + "custom" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER; + PDF_DOC_INFO_PREFIX + "custom" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER; Property DOC_INFO_CREATED = Property.internalDate(PDF_DOC_INFO_PREFIX + "created"); @@ -92,16 +89,15 @@ public interface PDF { Property PRODUCER = Property.internalText(PDF_PREFIX + "producer"); /** - * This specifies where an action or destination would be found/triggered - * in the document: on document open, before close, etc. + * This specifies where an action or destination would be found/triggered in the document: on + * document open, before close, etc. * * This is included in the embedded document (js only for now?), not the container PDF. */ Property ACTION_TRIGGER = Property.internalText(PDF_PREFIX + "actionTrigger"); /** - * This is a list of all action or destination triggers contained - * within a given PDF. + * This is a list of all action or destination triggers contained within a given PDF. 
*/ Property ACTION_TRIGGERS = Property.internalTextBag(PDF_PREFIX + "actionTriggers"); @@ -110,25 +106,24 @@ public interface PDF { Property CHARACTERS_PER_PAGE = Property.internalIntegerSequence(PDF_PREFIX + "charsPerPage"); Property UNMAPPED_UNICODE_CHARS_PER_PAGE = - Property.internalIntegerSequence(PDF_PREFIX + "unmappedUnicodeCharsPerPage"); + Property.internalIntegerSequence(PDF_PREFIX + "unmappedUnicodeCharsPerPage"); Property TOTAL_UNMAPPED_UNICODE_CHARS = - Property.internalInteger(PDF_PREFIX + "totalUnmappedUnicodeChars"); + Property.internalInteger(PDF_PREFIX + "totalUnmappedUnicodeChars"); Property OVERALL_PERCENTAGE_UNMAPPED_UNICODE_CHARS = - Property.internalReal(PDF_PREFIX + "overallPercentageUnmappedUnicodeChars"); + Property.internalReal(PDF_PREFIX + "overallPercentageUnmappedUnicodeChars"); /** * Contains at least one damaged font for at least one character */ - Property CONTAINS_DAMAGED_FONT = - Property.internalBoolean(PDF_PREFIX + "containsDamagedFont"); + Property CONTAINS_DAMAGED_FONT = Property.internalBoolean(PDF_PREFIX + "containsDamagedFont"); /** * Contains at least one font that is not embedded */ Property CONTAINS_NON_EMBEDDED_FONT = - Property.internalBoolean(PDF_PREFIX + "containsNonEmbeddedFont"); + Property.internalBoolean(PDF_PREFIX + "containsNonEmbeddedFont"); /** * Has XFA @@ -141,8 +136,8 @@ public interface PDF { Property HAS_XMP = Property.internalBoolean(PDF_PREFIX + "hasXMP"); /** - * If xmp is extracted by, e.g. the XMLProfiler, where did it come from? - * The document's document catalog or a specific page...or? + * If xmp is extracted by, e.g. the XMLProfiler, where did it come from? The document's document + * catalog or a specific page...or? */ Property XMP_LOCATION = Property.internalText(PDF_PREFIX + "xmpLocation"); @@ -154,25 +149,24 @@ public interface PDF { Property HAS_MARKED_CONTENT = Property.internalBoolean(PDF_PREFIX + "hasMarkedContent"); /** - * Has a collection element in the root. 
If true, this is likely a PDF Portfolio. + * Has a collection element in the root. If true, this is likely a PDF Portfolio. */ Property HAS_COLLECTION = Property.internalBoolean(PDF_PREFIX + "hasCollection"); - Property EMBEDDED_FILE_DESCRIPTION = Property.externalText(PDF_PREFIX + - "embeddedFileDescription"); + Property EMBEDDED_FILE_DESCRIPTION = + Property.externalText(PDF_PREFIX + "embeddedFileDescription"); /** * If the file came from an annotation and there was a type */ - Property EMBEDDED_FILE_ANNOTATION_TYPE = Property.internalText(PDF_PREFIX + - "embeddedFileAnnotationType"); + Property EMBEDDED_FILE_ANNOTATION_TYPE = + Property.internalText(PDF_PREFIX + "embeddedFileAnnotationType"); /** - * literal string from the PDEmbeddedFile#getSubtype(), should be what the PDF - * alleges is the embedded file's mime type + * literal string from the PDEmbeddedFile#getSubtype(), should be what the PDF alleges is the + * embedded file's mime type */ - Property EMBEDDED_FILE_SUBTYPE = Property.internalText(PDF_PREFIX + - "embeddedFileSubtype"); + Property EMBEDDED_FILE_SUBTYPE = Property.internalText(PDF_PREFIX + "embeddedFileSubtype"); /** * If the PDF has an annotation of type 3D */ @@ -183,42 +177,41 @@ public interface PDF { Property ANNOTATION_SUBTYPES = Property.internalTextBag(PDF_PREFIX + "annotationSubtypes"); /** - * Number of 3D annotations a PDF contains. This makes {@link PDF#HAS_3D} redundant. + * Number of 3D annotations a PDF contains. This makes {@link PDF#HAS_3D} redundant. */ Property NUM_3D_ANNOTATIONS = Property.internalInteger(PDF_PREFIX + "num3DAnnotations"); - Property ASSOCIATED_FILE_RELATIONSHIP = Property.internalText(PDF_PREFIX + - "associatedFileRelationship"); + Property ASSOCIATED_FILE_RELATIONSHIP = + Property.internalText(PDF_PREFIX + "associatedFileRelationship"); /** - * This is a zero-based number for incremental updates within a PDF -- 0 is the first - * update, 1 is the second, etc. The final version of the PDF (e.g. 
the last update) - * does not have an incremental update number. + * This is a zero-based number for incremental updates within a PDF -- 0 is the first update, 1 + * is the second, etc. The final version of the PDF (e.g. the last update) does not have an + * incremental update number. * - * This value is populated with the parse incremental updates feature is selected - * in the PDFParser. + * This value is populated with the parse incremental updates feature is selected in the + * PDFParser. */ - Property INCREMENTAL_UPDATE_NUMBER = - Property.composite(Property.internalInteger(PDF_PREFIX + "incrementalUpdateNumber"), - new Property[]{ TikaCoreProperties.VERSION_NUMBER }); + Property INCREMENTAL_UPDATE_NUMBER = Property.composite( + Property.internalInteger(PDF_PREFIX + "incrementalUpdateNumber"), + new Property[] {TikaCoreProperties.VERSION_NUMBER}); /** - * Incremental updates as extracted by the StartXRefScanner. See - * that class for limitations. + * Incremental updates as extracted by the StartXRefScanner. See that class for limitations. */ - Property PDF_INCREMENTAL_UPDATE_COUNT = - Property.composite( Property.externalInteger(PDF_PREFIX + "incrementalUpdateCount"), - new Property[]{ TikaCoreProperties.VERSION_COUNT }); + Property PDF_INCREMENTAL_UPDATE_COUNT = Property.composite( + Property.externalInteger(PDF_PREFIX + "incrementalUpdateCount"), + new Property[] {TikaCoreProperties.VERSION_COUNT}); /** - * This counts the number of pages that would have been OCR'd or were OCR'd depending - * on the OCR settings. If NO_OCR is selected, this will + * This counts the number of pages that would have been OCR'd or were OCR'd depending on the OCR + * settings. If NO_OCR is selected, this will */ Property OCR_PAGE_COUNT = Property.externalInteger(PDF_PREFIX + "ocrPageCount"); /** - * When javascript is stored in the names tree, there's a name associated with that script. - * This is that name. 
When javascript is stored in an action, there is no name, and this - * metadata will not be populated. + * When javascript is stored in the names tree, there's a name associated with that script. This + * is that name. When javascript is stored in an action, there is no name, and this metadata + * will not be populated. */ Property JS_NAME = Property.internalText(PDF_PREFIX + "jsName"); } diff --git a/tika-core/src/main/java/org/apache/tika/metadata/PST.java b/tika-core/src/main/java/org/apache/tika/metadata/PST.java index d977c2e19f..dab2241ac7 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/PST.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/PST.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
+ * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. * */ package org.apache.tika.metadata; diff --git a/tika-core/src/main/java/org/apache/tika/metadata/PagedText.java b/tika-core/src/main/java/org/apache/tika/metadata/PagedText.java index 4ba79090e5..e2cfeda175 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/PagedText.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/PagedText.java @@ -1,38 +1,35 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
+ * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.metadata; /** - * XMP Paged-text schema. This is a collection of - * {@link Property property definition} constants for the paged text - * properties defined in the XMP standard. + * XMP Paged-text schema. This is a collection of {@link Property property definition} constants for + * the paged text properties defined in the XMP standard. * - * @see XMP Specification, Part 2: Standard Schemas + * @see XMP Specification, Part 2: Standard Schemas * @since Apache Tika 0.8 */ public interface PagedText { /** - * "The number of pages in the document (including any in contained - * documents)." + * "The number of pages in the document (including any in contained documents)." */ Property N_PAGES = Property.internalInteger("xmpTPg:NPages"); - //TODO MaxPageSize, Fonts, Colorants, PlateNames + // TODO MaxPageSize, Fonts, Colorants, PlateNames } diff --git a/tika-core/src/main/java/org/apache/tika/metadata/Photoshop.java b/tika-core/src/main/java/org/apache/tika/metadata/Photoshop.java index af4ababb08..96ead7cb60 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/Photoshop.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/Photoshop.java @@ -1,33 +1,30 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. * - * IPTC Metadata Descriptions taken from the IPTC Photo Metadata (July 2010) - * standard. These parts Copyright 2010 International Press Telecommunications - * Council. + * IPTC Metadata Descriptions taken from the IPTC Photo Metadata (July 2010) standard. These parts + * Copyright 2010 International Press Telecommunications Council. */ package org.apache.tika.metadata; /** * XMP Photoshop metadata schema. *

- * A collection of property constants for the - * Photo Metadata properties defined in the XMP Photoshop + * A collection of property constants for the Photo Metadata properties defined in the XMP Photoshop * standard. * - * @see XMP Photoshop + * @see XMP + * Photoshop * @since Apache Tika 1.2 */ public interface Photoshop { @@ -35,57 +32,54 @@ public interface Photoshop { String NAMESPACE_URI_PHOTOSHOP = "http://ns.adobe.com/photoshop/1.0/"; String PREFIX_PHOTOSHOP = "photoshop"; - Property AUTHORS_POSITION = Property.internalText( - PREFIX_PHOTOSHOP + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "AuthorsPosition"); + Property AUTHORS_POSITION = Property.internalText(PREFIX_PHOTOSHOP + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "AuthorsPosition"); // TODO Replace this with proper indexed choices support - String[] _COLOR_MODE_CHOICES_INDEXED = - {"Bitmap", "Greyscale", "Indexed Colour", "RGB Color", "CMYK Colour", "Multi-Channel", - "Duotone", "LAB Colour", "reserved", "reserved", "YCbCr Colour", "YCgCo Colour", - "YCbCrK Colour"}; + String[] _COLOR_MODE_CHOICES_INDEXED = {"Bitmap", "Greyscale", "Indexed Colour", "RGB Color", + "CMYK Colour", "Multi-Channel", "Duotone", "LAB Colour", "reserved", "reserved", + "YCbCr Colour", "YCgCo Colour", "YCbCrK Colour"}; Property COLOR_MODE = Property.internalClosedChoise( - PREFIX_PHOTOSHOP + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "ColorMode", - _COLOR_MODE_CHOICES_INDEXED); + PREFIX_PHOTOSHOP + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "ColorMode", + _COLOR_MODE_CHOICES_INDEXED); - Property CAPTION_WRITER = Property.internalText( - PREFIX_PHOTOSHOP + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "CaptionWriter"); + Property CAPTION_WRITER = Property.internalText(PREFIX_PHOTOSHOP + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "CaptionWriter"); Property CATEGORY = Property.internalText( - PREFIX_PHOTOSHOP + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Category"); + PREFIX_PHOTOSHOP + 
TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Category"); Property CITY = Property.internalText( - PREFIX_PHOTOSHOP + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "City"); + PREFIX_PHOTOSHOP + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "City"); Property COUNTRY = Property.internalText( - PREFIX_PHOTOSHOP + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Country"); + PREFIX_PHOTOSHOP + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Country"); Property CREDIT = Property.internalText( - PREFIX_PHOTOSHOP + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Credit"); + PREFIX_PHOTOSHOP + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Credit"); - Property DATE_CREATED = Property.internalDate( - PREFIX_PHOTOSHOP + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "DateCreated"); + Property DATE_CREATED = Property.internalDate(PREFIX_PHOTOSHOP + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "DateCreated"); Property HEADLINE = Property.internalText( - PREFIX_PHOTOSHOP + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Headline"); + PREFIX_PHOTOSHOP + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Headline"); - Property INSTRUCTIONS = Property.internalText( - PREFIX_PHOTOSHOP + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Instructions"); + Property INSTRUCTIONS = Property.internalText(PREFIX_PHOTOSHOP + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Instructions"); Property SOURCE = Property.internalText( - PREFIX_PHOTOSHOP + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Source"); + PREFIX_PHOTOSHOP + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Source"); Property STATE = Property.internalText( - PREFIX_PHOTOSHOP + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "State"); + PREFIX_PHOTOSHOP + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "State"); - Property SUPPLEMENTAL_CATEGORIES = Property.internalTextBag( - PREFIX_PHOTOSHOP + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + - "SupplementalCategories"); + Property SUPPLEMENTAL_CATEGORIES 
= Property.internalTextBag(PREFIX_PHOTOSHOP + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "SupplementalCategories"); - Property TRANSMISSION_REFERENCE = Property.internalText( - PREFIX_PHOTOSHOP + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + - "TransmissionReference"); + Property TRANSMISSION_REFERENCE = Property.internalText(PREFIX_PHOTOSHOP + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "TransmissionReference"); Property URGENCY = Property.internalText( - PREFIX_PHOTOSHOP + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Urgency"); + PREFIX_PHOTOSHOP + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Urgency"); } diff --git a/tika-core/src/main/java/org/apache/tika/metadata/Property.java b/tika-core/src/main/java/org/apache/tika/metadata/Property.java index 3d67141414..9e6cfa0710 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/Property.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/Property.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.metadata; @@ -26,10 +24,9 @@ import java.util.concurrent.ConcurrentHashMap; /** - * XMP property definition. Each instance of this class defines a single - * metadata property like "dc:format". In addition to the property name, - * the {@link ValueType value type} and category (internal or external) - * of the property are included in the property definition. The available + * XMP property definition. Each instance of this class defines a single metadata property like + * "dc:format". In addition to the property name, the {@link ValueType value type} and category + * (internal or external) of the property are included in the property definition. The available * choice values are also stored for open and closed choice value types. 
* * @since Apache Tika 0.7 @@ -49,15 +46,15 @@ public final class Property implements Comparable { private final Set choices; private Property(String name, boolean internal, PropertyType propertyType, ValueType valueType, - String[] choices, Property primaryProperty, - Property[] secondaryExtractProperties) { + String[] choices, Property primaryProperty, + Property[] secondaryExtractProperties) { this.name = name; this.internal = internal; this.propertyType = propertyType; this.valueType = valueType; if (choices != null) { this.choices = Collections - .unmodifiableSet(new HashSet<>(Arrays.asList(choices.clone()))); + .unmodifiableSet(new HashSet<>(Arrays.asList(choices.clone()))); } else { this.choices = null; } @@ -77,7 +74,7 @@ private Property(String name, boolean internal, PropertyType propertyType, Value } private Property(String name, boolean internal, PropertyType propertyType, ValueType valueType, - String[] choices) { + String[] choices) { this(name, internal, propertyType, valueType, choices, null, null); } @@ -90,7 +87,7 @@ private Property(String name, boolean internal, ValueType valueType) { } private Property(String name, boolean internal, PropertyType propertyType, - ValueType valueType) { + ValueType valueType) { this(name, internal, propertyType, valueType, null); } @@ -223,15 +220,15 @@ public static Property externalTextBag(String name) { /** * Constructs a new composite property from the given primary and array of secondary properties. *

- * Note that name of the composite property is taken from its primary property, - * and primary and secondary properties must not be composite properties themselves. + * Note that name of the composite property is taken from its primary property, and primary and + * secondary properties must not be composite properties themselves. * * @param primaryProperty * @param secondaryExtractProperties * @return the composite property */ public static Property composite(Property primaryProperty, - Property[] secondaryExtractProperties) { + Property[] secondaryExtractProperties) { if (primaryProperty == null) { throw new NullPointerException("primaryProperty must not be null"); } @@ -250,8 +247,8 @@ public static Property composite(Property primaryProperty, choices = primaryProperty.getChoices().toArray(new String[0]); } return new Property(primaryProperty.getName(), primaryProperty.isInternal(), - PropertyType.COMPOSITE, ValueType.PROPERTY, choices, primaryProperty, - secondaryExtractProperties); + PropertyType.COMPOSITE, ValueType.PROPERTY, choices, primaryProperty, + secondaryExtractProperties); } public String getName() { @@ -270,8 +267,8 @@ public boolean isExternal() { * Is the PropertyType one which accepts multiple values? */ public boolean isMultiValuePermitted() { - if (propertyType == PropertyType.BAG || propertyType == PropertyType.SEQ || - propertyType == PropertyType.ALT) { + if (propertyType == PropertyType.BAG || propertyType == PropertyType.SEQ + || propertyType == PropertyType.ALT) { return true; } else if (propertyType == PropertyType.COMPOSITE) { // Base it on the primary property's behaviour @@ -289,9 +286,9 @@ public ValueType getValueType() { } /** - * Returns the (immutable) set of choices for the values of this property. - * Only defined for {@link ValueType#OPEN_CHOICE open} and - * {@link ValueType#CLOSED_CHOICE closed choice} value types. + * Returns the (immutable) set of choices for the values of this property. 
Only defined for + * {@link ValueType#OPEN_CHOICE open} and {@link ValueType#CLOSED_CHOICE closed choice} value + * types. * * @return available choices, or null */ @@ -325,13 +322,13 @@ public boolean equals(Object o) { return o instanceof Property && name.equals(((Property) o).name); } - //----------------------------------------------------------< Comparable > + // ----------------------------------------------------------< Comparable > public int hashCode() { return name.hashCode(); } - //--------------------------------------------------------------< Object > + // --------------------------------------------------------------< Object > public enum PropertyType { /** @@ -357,8 +354,7 @@ public enum PropertyType { } public enum ValueType { - BOOLEAN, OPEN_CHOICE, CLOSED_CHOICE, DATE, INTEGER, LOCALE, MIME_TYPE, PROPER_NAME, - RATIONAL, REAL, TEXT, URI, URL, XPATH, PROPERTY + BOOLEAN, OPEN_CHOICE, CLOSED_CHOICE, DATE, INTEGER, LOCALE, MIME_TYPE, PROPER_NAME, RATIONAL, REAL, TEXT, URI, URL, XPATH, PROPERTY } } diff --git a/tika-core/src/main/java/org/apache/tika/metadata/PropertyTypeException.java b/tika-core/src/main/java/org/apache/tika/metadata/PropertyTypeException.java index ff1f926ba4..b00a94c08a 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/PropertyTypeException.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/PropertyTypeException.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. 
See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.metadata; @@ -21,9 +19,9 @@ /** - * XMP property definition violation exception. This is thrown when - * you try to set a {@link Property} value with an incorrect type, - * such as storing an Integer when the property is of type Date. + * XMP property definition violation exception. This is thrown when you try to set a + * {@link Property} value with an incorrect type, such as storing an Integer when the property is of + * type Date. * * @since Apache Tika 0.8 */ @@ -42,9 +40,9 @@ public PropertyTypeException(ValueType expected, ValueType found) { } public PropertyTypeException(PropertyType unsupportedPropertyType) { - super((unsupportedPropertyType != PropertyType.COMPOSITE) ? - unsupportedPropertyType + " is not supported" : - "Composite Properties must not include other Composite" + - " Properties as either Primary or Secondary"); + super((unsupportedPropertyType != PropertyType.COMPOSITE) + ? 
unsupportedPropertyType + " is not supported" + : "Composite Properties must not include other Composite" + + " Properties as either Primary or Secondary"); } } diff --git a/tika-core/src/main/java/org/apache/tika/metadata/QuattroPro.java b/tika-core/src/main/java/org/apache/tika/metadata/QuattroPro.java index 066348804f..d8066cb7d2 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/QuattroPro.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/QuattroPro.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. 
See the License for the specific language governing permissions and limitations under + * the License. * * Copyright 2016 Norconex Inc. */ @@ -29,24 +27,21 @@ public interface QuattroPro { /** * ID. */ - Property ID = Property.internalText( - QUATTROPRO_METADATA_NAME_PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Id"); + Property ID = Property.internalText(QUATTROPRO_METADATA_NAME_PREFIX + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Id"); /** * Version. */ - Property VERSION = Property.internalInteger( - QUATTROPRO_METADATA_NAME_PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + - "Version"); + Property VERSION = Property.internalInteger(QUATTROPRO_METADATA_NAME_PREFIX + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Version"); /** * Build. */ - Property BUILD = Property.internalInteger( - QUATTROPRO_METADATA_NAME_PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + - "Build"); + Property BUILD = Property.internalInteger(QUATTROPRO_METADATA_NAME_PREFIX + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Build"); /** * Lowest version. */ - Property LOWEST_VERSION = Property.internalInteger( - QUATTROPRO_METADATA_NAME_PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + - "LowestVersion"); + Property LOWEST_VERSION = Property.internalInteger(QUATTROPRO_METADATA_NAME_PREFIX + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "LowestVersion"); } diff --git a/tika-core/src/main/java/org/apache/tika/metadata/RTFMetadata.java b/tika-core/src/main/java/org/apache/tika/metadata/RTFMetadata.java index 22842391f9..053b6cbf3c 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/RTFMetadata.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/RTFMetadata.java @@ -1,19 +1,18 @@ -package org.apache.tika.metadata; /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at +/* + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ +package org.apache.tika.metadata; public interface RTFMetadata { String PREFIX_RTF_META = "rtf_meta"; @@ -22,29 +21,29 @@ public interface RTFMetadata { String RTF_PICT_META_PREFIX = "rtf_pict:"; /** - * if set to true, this means that an image file is probably a "thumbnail" - * any time a pict/emf/wmf is in an object + * if set to true, this means that an image file is probably a "thumbnail" any time a + * pict/emf/wmf is in an object */ Property THUMBNAIL = Property.internalBoolean( - PREFIX_RTF_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "thumbnail"); + PREFIX_RTF_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "thumbnail"); /** - * if an application and version is given as part of the - * embedded object, this is the literal string + * if an application and version is given as part of the embedded object, this is the literal + * string */ - Property EMB_APP_VERSION = Property.internalText( - PREFIX_RTF_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "emb_app_version"); + Property EMB_APP_VERSION = Property.internalText(PREFIX_RTF_META + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "emb_app_version"); Property EMB_CLASS = Property.internalText( - PREFIX_RTF_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "emb_class"); + PREFIX_RTF_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "emb_class"); Property EMB_TOPIC = Property.internalText( - PREFIX_RTF_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "emb_topic"); + PREFIX_RTF_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "emb_topic"); Property EMB_ITEM = Property.internalText( - PREFIX_RTF_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "emb_item"); + PREFIX_RTF_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "emb_item"); - Property CONTAINS_ENCAPSULATED_HTML = Property.internalBoolean( - PREFIX_RTF_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "contains_encapsulated_html"); + Property CONTAINS_ENCAPSULATED_HTML = 
Property.internalBoolean(PREFIX_RTF_META + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "contains_encapsulated_html"); } diff --git a/tika-core/src/main/java/org/apache/tika/metadata/Rendering.java b/tika-core/src/main/java/org/apache/tika/metadata/Rendering.java index 31732c97a9..7ad6422d0b 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/Rendering.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/Rendering.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. 
See the License for the specific language governing permissions and limitations under + * the License. * */ diff --git a/tika-core/src/main/java/org/apache/tika/metadata/TIFF.java b/tika-core/src/main/java/org/apache/tika/metadata/TIFF.java index fe5fd0ec39..a244bf5df1 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/TIFF.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/TIFF.java @@ -1,28 +1,26 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. 
See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.metadata; /** - * XMP Exif TIFF schema. This is a collection of - * {@link Property property definition} constants for the Exif TIFF - * properties defined in the XMP standard. + * XMP Exif TIFF schema. This is a collection of {@link Property property definition} constants for + * the Exif TIFF properties defined in the XMP standard. * - * @see XMP Specification, Part 2: Standard Schemas + * @see XMP Specification, Part 2: Standard Schemas * @since Apache Tika 0.8 */ public interface TIFF { @@ -58,9 +56,8 @@ public interface TIFF { Property EXPOSURE_TIME = Property.internalRational("exif:ExposureTime"); /** - * "F-Number." - * The f-number is the focal length divided by the "effective" aperture - * diameter. It is a dimensionless number that is a measure of lens speed. + * "F-Number." The f-number is the focal length divided by the "effective" aperture diameter. It + * is a dimensionless number that is a measure of lens speed. */ Property F_NUMBER = Property.internalRational("exif:FNumber"); @@ -90,19 +87,13 @@ public interface TIFF { Property SOFTWARE = Property.internalText("tiff:Software"); /** - * "The Orientation of the image." - * 1 = 0th row at top, 0th column at left - * 2 = 0th row at top, 0th column at right - * 3 = 0th row at bottom, 0th column at right - * 4 = 0th row at bottom, 0th column at left - * 5 = 0th row at left, 0th column at top - * 6 = 0th row at right, 0th column at top - * 7 = 0th row at right, 0th column at bottom - * 8 = 0th row at left, 0th column at bottom + * "The Orientation of the image." 
1 = 0th row at top, 0th column at left 2 = 0th row at top, + * 0th column at right 3 = 0th row at bottom, 0th column at right 4 = 0th row at bottom, 0th + * column at left 5 = 0th row at left, 0th column at top 6 = 0th row at right, 0th column at top + * 7 = 0th row at right, 0th column at bottom 8 = 0th row at left, 0th column at bottom */ - Property ORIENTATION = - Property.internalClosedChoise("tiff:Orientation", "1", "2", "3", "4", "5", "6", "7", - "8"); + Property ORIENTATION = Property.internalClosedChoise("tiff:Orientation", "1", "2", "3", "4", + "5", "6", "7", "8"); /** * "Horizontal resolution in pixels per unit." @@ -115,8 +106,7 @@ public interface TIFF { Property RESOLUTION_VERTICAL = Property.internalRational("tiff:YResolution"); /** - * "Units used for Horizontal and Vertical Resolutions." - * One of "Inch" or "cm" + * "Units used for Horizontal and Vertical Resolutions." One of "Inch" or "cm" */ Property RESOLUTION_UNIT = Property.internalClosedChoise("tiff:ResolutionUnit", "Inch", "cm"); diff --git a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java index 7e36624c5c..de170f1c8b 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java @@ -1,36 +1,32 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. 
See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.metadata; /** - * Contains a core set of basic Tika metadata properties, which all parsers - * will attempt to supply (where the file format permits). These are all - * defined in terms of other standard namespaces. + * Contains a core set of basic Tika metadata properties, which all parsers will attempt to supply + * (where the file format permits). These are all defined in terms of other standard namespaces. *

- * Users of Tika who wish to have consistent metadata across file formats - * can make use of these Properties, knowing that where present they will - * have consistent semantic meaning between different file formats. (No - * matter if one file format calls it Title, another Long-Title and another - * Long-Name, if they all mean the same thing as defined by - * {@link DublinCore#TITLE} then they will all be present as such) + * Users of Tika who wish to have consistent metadata across file formats can make use of these + * Properties, knowing that where present they will have consistent semantic meaning between + * different file formats. (No matter if one file format calls it Title, another Long-Title and + * another Long-Name, if they all mean the same thing as defined by {@link DublinCore#TITLE} then + * they will all be present as such) *

- * For now, most of these properties are composite ones including the deprecated - * non-prefixed String properties from the Metadata class. In Tika 2.0, most - * of these will revert back to simple assignments. + * For now, most of these properties are composite ones including the deprecated non-prefixed String + * properties from the Metadata class. In Tika 2.0, most of these will revert back to simple + * assignments. * * @since Apache Tika 1.2 */ @@ -43,62 +39,58 @@ public interface TikaCoreProperties { String NAMESPACE_PREFIX_DELIMITER = ":"; /** - * Use this to prefix metadata properties that store information - * about the parsing process. Users should be able to distinguish - * between metadata that was contained within the document and - * metadata about the parsing process. + * Use this to prefix metadata properties that store information about the parsing process. + * Users should be able to distinguish between metadata that was contained within the document + * and metadata about the parsing process. */ String TIKA_META_PREFIX = "X-TIKA" + NAMESPACE_PREFIX_DELIMITER; Property EMBEDDED_DEPTH = Property.internalInteger(TIKA_META_PREFIX + "embedded_depth"); /** - * This tracks the embedded file paths based on the name of embedded files - * where available. + * This tracks the embedded file paths based on the name of embedded files where available. *

- * This field should be treated with great care and should NOT - * be used for creating a directory structure to write out attachments - * because: there may be path collisions or illegal characters or other mayhem. + * This field should be treated with great care and should NOT be used for creating a directory + * structure to write out attachments because: there may be path collisions or illegal + * characters or other mayhem. *

* For a more robust path, see {@link TikaCoreProperties#EMBEDDED_ID_PATH}. */ Property EMBEDDED_RESOURCE_PATH = - Property.internalText(TIKA_META_PREFIX + "embedded_resource_path"); + Property.internalText(TIKA_META_PREFIX + "embedded_resource_path"); /** - * This is calculated in {@link org.apache.tika.sax.RecursiveParserWrapperHandler}. - * It differs from {@link TikaCoreProperties#EMBEDDED_RESOURCE_PATH} in that - * it is calculated at the end of the full parse of a file. {@link TikaCoreProperties#EMBEDDED_RESOURCE_PATH} - * is calculated during the parse, and, for some parsers, an embedded file's name isn't known until - * after its child files have been parsed. + * This is calculated in {@link org.apache.tika.sax.RecursiveParserWrapperHandler}. It differs + * from {@link TikaCoreProperties#EMBEDDED_RESOURCE_PATH} in that it is calculated at the end of + * the full parse of a file. {@link TikaCoreProperties#EMBEDDED_RESOURCE_PATH} is calculated + * during the parse, and, for some parsers, an embedded file's name isn't known until after its + * child files have been parsed. *

- * Note that the unknown file count may differ between {@link TikaCoreProperties#EMBEDDED_RESOURCE_PATH} - * because there should be fewer unknown files when this is calculated. More simply, - * there is no connection between "embedded-1" in this field and "embedded-1" in - * {@link TikaCoreProperties#EMBEDDED_RESOURCE_PATH}. + * Note that the unknown file count may differ between + * {@link TikaCoreProperties#EMBEDDED_RESOURCE_PATH} because there should be fewer unknown files + * when this is calculated. More simply, there is no connection between "embedded-1" in this + * field and "embedded-1" in {@link TikaCoreProperties#EMBEDDED_RESOURCE_PATH}. *

- * This field should be treated with great care and should NOT - * be used for creating a directory structure to write out attachments - * because: there may be path collisions or illegal characters or other mayhem. + * This field should be treated with great care and should NOT be used for creating a directory + * structure to write out attachments because: there may be path collisions or illegal + * characters or other mayhem. *

* * For a more robust path, see {@link TikaCoreProperties#EMBEDDED_ID_PATH}. */ Property FINAL_EMBEDDED_RESOURCE_PATH = - Property.internalText(TIKA_META_PREFIX + "final_embedded_resource_path"); + Property.internalText(TIKA_META_PREFIX + "final_embedded_resource_path"); /** * This tracks the embedded file paths based on the embedded file's * {@link TikaCoreProperties#EMBEDDED_ID}. */ - Property EMBEDDED_ID_PATH = - Property.internalText(TIKA_META_PREFIX + "embedded_id_path"); + Property EMBEDDED_ID_PATH = Property.internalText(TIKA_META_PREFIX + "embedded_id_path"); /** * This is a 1-index counter for embedded files, used by the RecursiveParserWrapper */ - Property EMBEDDED_ID = - Property.internalInteger(TIKA_META_PREFIX + "embedded_id"); + Property EMBEDDED_ID = Property.internalInteger(TIKA_META_PREFIX + "embedded_id"); Property PARSE_TIME_MILLIS = Property.internalText(TIKA_META_PREFIX + "parse_time_millis"); /** @@ -116,103 +108,100 @@ public interface TikaCoreProperties { */ String TIKA_META_WARN_PREFIX = TIKA_META_PREFIX + "WARN" + NAMESPACE_PREFIX_DELIMITER; - //exception in main file + // exception in main file Property CONTAINER_EXCEPTION = - Property.internalText(TIKA_META_EXCEPTION_PREFIX + "container_exception"); + Property.internalText(TIKA_META_EXCEPTION_PREFIX + "container_exception"); - //exception in an embedded file + // exception in an embedded file Property EMBEDDED_EXCEPTION = - Property.internalTextBag(TIKA_META_EXCEPTION_PREFIX + "embedded_exception"); + Property.internalTextBag(TIKA_META_EXCEPTION_PREFIX + "embedded_exception"); - //exception handling the raw bytes of an embedded file by an EmbeddedDocumentByteStore - Property EMBEDDED_BYTES_EXCEPTION = - Property.internalTextBag(TIKA_META_EXCEPTION_PREFIX + "embedded_bytes_exception"); + // exception handling the raw bytes of an embedded file by an EmbeddedDocumentByteStore + Property EMBEDDED_BYTES_EXCEPTION = Property + .internalTextBag(TIKA_META_EXCEPTION_PREFIX + 
"embedded_bytes_exception"); - //warning while parsing in an embedded file + // warning while parsing in an embedded file Property EMBEDDED_WARNING = - Property.internalTextBag(TIKA_META_EXCEPTION_PREFIX + "embedded_warning"); + Property.internalTextBag(TIKA_META_EXCEPTION_PREFIX + "embedded_warning"); Property WRITE_LIMIT_REACHED = - Property.internalBoolean(TIKA_META_EXCEPTION_PREFIX + "write_limit_reached"); + Property.internalBoolean(TIKA_META_EXCEPTION_PREFIX + "write_limit_reached"); /** - * Use this to store exceptions caught during a parse that are - * non-fatal, e.g. if a parser is in lenient mode and more - * content can be extracted if we ignore an exception thrown by - * a dependency. + * Use this to store exceptions caught during a parse that are non-fatal, e.g. if a parser is in + * lenient mode and more content can be extracted if we ignore an exception thrown by a + * dependency. */ Property TIKA_META_EXCEPTION_WARNING = - Property.internalTextBag(TIKA_META_EXCEPTION_PREFIX + "warn"); + Property.internalTextBag(TIKA_META_EXCEPTION_PREFIX + "warn"); /** - * This means that metadata keys or metadata values were truncated. - * If there is an "include" filter, this should not be set if - * a field is not in the "include" set. + * This means that metadata keys or metadata values were truncated. If there is an "include" + * filter, this should not be set if a field is not in the "include" set. */ Property TRUNCATED_METADATA = - Property.internalBoolean(TIKA_META_WARN_PREFIX + "truncated_metadata"); + Property.internalBoolean(TIKA_META_WARN_PREFIX + "truncated_metadata"); /** - * Use this to store exceptions caught while trying to read the - * stream of an embedded resource. Do not use this if there is - * a parse exception on the embedded resource. + * Use this to store exceptions caught while trying to read the stream of an embedded resource. + * Do not use this if there is a parse exception on the embedded resource. 
*/ - Property TIKA_META_EXCEPTION_EMBEDDED_STREAM = - Property.internalTextBag(TIKA_META_EXCEPTION_PREFIX + "embedded_stream_exception"); + Property TIKA_META_EXCEPTION_EMBEDDED_STREAM = Property + .internalTextBag(TIKA_META_EXCEPTION_PREFIX + "embedded_stream_exception"); Property TIKA_PARSED_BY = Property.internalTextBag(TIKA_META_PREFIX + "Parsed-By"); /** - * Use this to store a record of all parsers that touched a given file - * in the container file's metadata. + * Use this to store a record of all parsers that touched a given file in the container file's + * metadata. */ - Property TIKA_PARSED_BY_FULL_SET = Property.internalTextBag(TIKA_META_PREFIX + "Parsed-By-Full-Set"); + Property TIKA_PARSED_BY_FULL_SET = + Property.internalTextBag(TIKA_META_PREFIX + "Parsed-By-Full-Set"); - Property TIKA_DETECTED_LANGUAGE = Property.externalTextBag(TIKA_META_PREFIX + - "detected_language"); + Property TIKA_DETECTED_LANGUAGE = + Property.externalTextBag(TIKA_META_PREFIX + "detected_language"); - Property TIKA_DETECTED_LANGUAGE_CONFIDENCE = Property.externalTextBag(TIKA_META_PREFIX + - "detected_language_confidence"); + Property TIKA_DETECTED_LANGUAGE_CONFIDENCE = + Property.externalTextBag(TIKA_META_PREFIX + "detected_language_confidence"); - Property TIKA_DETECTED_LANGUAGE_CONFIDENCE_RAW = Property.externalRealSeq(TIKA_META_PREFIX + - "detected_language_confidence_raw"); + Property TIKA_DETECTED_LANGUAGE_CONFIDENCE_RAW = + Property.externalRealSeq(TIKA_META_PREFIX + "detected_language_confidence_raw"); Property RESOURCE_NAME_KEY = Property.internalText(TIKA_META_PREFIX + "resourceName"); - Property EMBEDDED_RELATIONSHIP_ID = Property.internalText(TIKA_META_PREFIX + "embeddedRelationshipId"); + Property EMBEDDED_RELATIONSHIP_ID = + Property.internalText(TIKA_META_PREFIX + "embeddedRelationshipId"); String EMBEDDED_RESOURCE_TYPE_KEY = "embeddedResourceType"; /** - * Some file formats can store information about their original - * file name/location or about their 
attachment's original file name/location - * within the file. + * Some file formats can store information about their original file name/location or about + * their attachment's original file name/location within the file. */ Property ORIGINAL_RESOURCE_NAME = - Property.internalTextBag(TIKA_META_PREFIX + "origResourceName"); + Property.internalTextBag(TIKA_META_PREFIX + "origResourceName"); /** - * This should be used to store the path (relative or full) - * of the source file, including the file name, - * e.g. doc/path/to/my_pdf.pdf + * This should be used to store the path (relative or full) of the source file, including the + * file name, e.g. doc/path/to/my_pdf.pdf *

* This can also be used for a primary key within a database. */ Property SOURCE_PATH = Property.internalText(TIKA_META_PREFIX + "sourcePath"); /** - * This is currently used to identify Content-Type that may be - * included within a document, such as in html documents - * (e.g. ) - * , or the value might come from outside the document. This information - * may be faulty and should be treated only as a hint. + * This is currently used to identify Content-Type that may be included within a document, such + * as in html documents (e.g. + * ) , or the value might + * come from outside the document. This information may be faulty and should be treated only as + * a hint. */ Property CONTENT_TYPE_HINT = Property.internalText(HttpHeaders.CONTENT_TYPE + "-Hint"); /** * This is used by users to override detection with the override detector. */ Property CONTENT_TYPE_USER_OVERRIDE = - Property.internalText(HttpHeaders.CONTENT_TYPE + "-Override"); + Property.internalText(HttpHeaders.CONTENT_TYPE + "-Override"); /** - * This is used by parsers to override detection of embedded resources - * with the override detector. + * This is used by parsers to override detection of embedded resources with the override + * detector. */ Property CONTENT_TYPE_PARSER_OVERRIDE = - Property.internalText(HttpHeaders.CONTENT_TYPE + "-Parser-Override"); + Property.internalText(HttpHeaders.CONTENT_TYPE + "-Parser-Override"); /** * @see DublinCore#FORMAT */ @@ -276,9 +265,8 @@ public interface TikaCoreProperties { */ Property DESCRIPTION = DublinCore.DESCRIPTION; /** - * {@link DublinCore#SUBJECT}; should include both subject and keywords - * if a document format has both. See also {@link Office#KEYWORDS} - * and {@link OfficeOpenXMLCore#SUBJECT}. + * {@link DublinCore#SUBJECT}; should include both subject and keywords if a document format has + * both. See also {@link Office#KEYWORDS} and {@link OfficeOpenXMLCore#SUBJECT}. 
*/ Property SUBJECT = DublinCore.SUBJECT; /** @@ -321,9 +309,9 @@ public interface TikaCoreProperties { Property RATING = XMP.RATING; /** - * This is the number of images (as in a multi-frame gif) returned by - * Java's {@link javax.imageio.ImageReader#getNumImages(boolean)}. See - * the javadocs for known limitations. + * This is the number of images (as in a multi-frame gif) returned by Java's + * {@link javax.imageio.ImageReader#getNumImages(boolean)}. See the javadocs for known + * limitations. */ Property NUM_IMAGES = Property.internalInteger("imagereader:NumImages"); @@ -336,9 +324,11 @@ public interface TikaCoreProperties { * Embedded resource type property */ Property EMBEDDED_RESOURCE_TYPE = Property.internalClosedChoise(EMBEDDED_RESOURCE_TYPE_KEY, - EmbeddedResourceType.ATTACHMENT.toString(), EmbeddedResourceType.INLINE.toString(), - EmbeddedResourceType.METADATA.toString(), EmbeddedResourceType.MACRO.toString(), - EmbeddedResourceType.THUMBNAIL.toString(), EmbeddedResourceType.RENDERING.toString()); + EmbeddedResourceType.ATTACHMENT.toString(), + EmbeddedResourceType.INLINE.toString(), + EmbeddedResourceType.METADATA.toString(), EmbeddedResourceType.MACRO.toString(), + EmbeddedResourceType.THUMBNAIL.toString(), + EmbeddedResourceType.RENDERING.toString()); Property HAS_SIGNATURE = Property.internalBoolean("hasSignature"); Property SIGNATURE_NAME = Property.internalTextBag("signature:name"); @@ -348,69 +338,68 @@ public interface TikaCoreProperties { Property SIGNATURE_FILTER = Property.internalTextBag("signature:filter"); Property SIGNATURE_CONTACT_INFO = Property.internalTextBag("signature:contact-info"); - //is the file encrypted + // is the file encrypted Property IS_ENCRYPTED = Property.internalBoolean(TIKA_META_PREFIX + "encrypted"); /** * When an EncodingDetector detects an encoding, the encoding should be stored in this field. 
* This is different from {@link Metadata#CONTENT_ENCODING} because that is what a parser - * chooses to use for processing a file. If an EncodingDetector returns "null", a parser - * may choose to use a default encoding. We want to differentiate between a parser using a - * default encoding and the output of an EncodingDetector. + * chooses to use for processing a file. If an EncodingDetector returns "null", a parser may + * choose to use a default encoding. We want to differentiate between a parser using a default + * encoding and the output of an EncodingDetector. */ Property DETECTED_ENCODING = Property.externalText(TIKA_META_PREFIX + "detectedEncoding"); /** - * This should be the simple class name for the EncodingDetectors whose detected encoding - * was used in the parse. + * This should be the simple class name for the EncodingDetectors whose detected encoding was + * used in the parse. */ Property ENCODING_DETECTOR = Property.externalText(TIKA_META_PREFIX + "encodingDetector"); /** - * General metadata key for the count of non-final versions available within a file. This - * was added initially to support generalizing incremental updates in PDF. + * General metadata key for the count of non-final versions available within a file. This was + * added initially to support generalizing incremental updates in PDF. */ Property VERSION_COUNT = Property.externalInteger(TIKA_META_PREFIX + "versionCount"); /** - * General metadata key for the version number of a given file that contains - * earlier versions within it. This number is 0-indexed for the earliest version. - * The latest version does not have this metadata value. This was added initially - * to support generalizing incremental updates in PDF. + * General metadata key for the version number of a given file that contains earlier versions + * within it. This number is 0-indexed for the earliest version. The latest version does not + * have this metadata value. 
This was added initially to support generalizing incremental + * updates in PDF. */ Property VERSION_NUMBER = Property.externalInteger(TIKA_META_PREFIX + "versionNumber"); Property PIPES_RESULT = Property.externalText(TIKA_META_PREFIX + "pipes_result"); + /** - * A file might contain different types of embedded documents. - * The most common is the ATTACHMENT. + * A file might contain different types of embedded documents. The most common is the + * ATTACHMENT. *

- * An INLINE embedded resource should be used for embedded image - * files that are used to render the page image (as in PDXObjImages in PDF files). + * An INLINE embedded resource should be used for embedded image files that are used to render + * the page image (as in PDXObjImages in PDF files). *

- * A MACRO is code that is embedded in the document and is intended - * to be executable within the application that opens the document. This - * includes traditional macros within Microsoft Office files and - * javascript within PDFActions. This would not include, e.g., an - * .exe file embedded in a .zip file. + * A MACRO is code that is embedded in the document and is intended to be executable within the + * application that opens the document. This includes traditional macros within Microsoft Office + * files and javascript within PDFActions. This would not include, e.g., an .exe file embedded + * in a .zip file. *

- * A VERSION is an earlier version of the file as in incremental updates. - * The initial use case for this is incremental updates in PDFs, but - * it could be applied to other file formats as well where earlier versions - * are recoverable. See also {@link PDF#INCREMENTAL_UPDATE_NUMBER} + * A VERSION is an earlier version of the file as in incremental updates. The initial use case + * for this is incremental updates in PDFs, but it could be applied to other file formats as + * well where earlier versions are recoverable. See also {@link PDF#INCREMENTAL_UPDATE_NUMBER} *

* Not all parsers have yet implemented this. */ enum EmbeddedResourceType { - INLINE, //image that is intended to be displayed in a rendering of the file - ATTACHMENT,//standard attachment as in email - MACRO, //any code that is intended to be run by the application - METADATA, //e.g. xmp, xfa - FONT,//embedded font files - THUMBNAIL, //TODO: set this in parsers that handle thumbnails - RENDERING, //if a file has been rendered - VERSION, //an earlier version of a file - ALTERNATE_FORMAT_CHUNK //OOXML inline alternate format chunk + INLINE, // image that is intended to be displayed in a rendering of the file + ATTACHMENT, // standard attachment as in email + MACRO, // any code that is intended to be run by the application + METADATA, // e.g. xmp, xfa + FONT, // embedded font files + THUMBNAIL, // TODO: set this in parsers that handle thumbnails + RENDERING, // if a file has been rendered + VERSION, // an earlier version of a file + ALTERNATE_FORMAT_CHUNK // OOXML inline alternate format chunk } } diff --git a/tika-core/src/main/java/org/apache/tika/metadata/TikaMimeKeys.java b/tika-core/src/main/java/org/apache/tika/metadata/TikaMimeKeys.java index 7ae685e05e..53a519f999 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/TikaMimeKeys.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/TikaMimeKeys.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. 
See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.metadata; diff --git a/tika-core/src/main/java/org/apache/tika/metadata/TikaPagedText.java b/tika-core/src/main/java/org/apache/tika/metadata/TikaPagedText.java index e4bf1454e2..0f42ae9549 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/TikaPagedText.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/TikaPagedText.java @@ -1,25 +1,22 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. 
See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.metadata; /** - * Metadata properties for paged text, metadata appropriate - * for an individual page (useful for embedded document handlers - * called on individual pages). + * Metadata properties for paged text, metadata appropriate for an individual page (useful for + * embedded document handlers called on individual pages). * * Use {@link PagedText} where possible */ diff --git a/tika-core/src/main/java/org/apache/tika/metadata/WARC.java b/tika-core/src/main/java/org/apache/tika/metadata/WARC.java index 359236bdd0..d0e427b775 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/WARC.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/WARC.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.metadata; @@ -27,5 +25,5 @@ public interface WARC { Property WARC_RECORD_ID = Property.externalText(PREFIX + "WARC-Record-ID"); - //TODO: lots + // TODO: lots } diff --git a/tika-core/src/main/java/org/apache/tika/metadata/WordPerfect.java b/tika-core/src/main/java/org/apache/tika/metadata/WordPerfect.java index 4fd37f07cd..36054fc22b 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/WordPerfect.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/WordPerfect.java @@ -1,19 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * Copyright 2016 Norconex Inc. 
+ * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. Copyright 2016 Norconex Inc. */ package org.apache.tika.metadata; @@ -28,43 +25,36 @@ public interface WordPerfect { /** * File size as defined in document header. */ - Property FILE_SIZE = Property.internalText( - WORDPERFECT_METADATA_NAME_PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + - "FileSize"); + Property FILE_SIZE = Property.internalText(WORDPERFECT_METADATA_NAME_PREFIX + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "FileSize"); /** * File identifier. */ - Property FILE_ID = Property.internalText( - WORDPERFECT_METADATA_NAME_PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + - "FileId"); + Property FILE_ID = Property.internalText(WORDPERFECT_METADATA_NAME_PREFIX + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "FileId"); /** * Product type. */ - Property PRODUCT_TYPE = Property.internalInteger( - WORDPERFECT_METADATA_NAME_PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + - "ProductType"); + Property PRODUCT_TYPE = Property.internalInteger(WORDPERFECT_METADATA_NAME_PREFIX + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "ProductType"); /** * File type. */ - Property FILE_TYPE = Property.internalInteger( - WORDPERFECT_METADATA_NAME_PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + - "FileType"); + Property FILE_TYPE = Property.internalInteger(WORDPERFECT_METADATA_NAME_PREFIX + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "FileType"); /** * Major version. 
*/ - Property MAJOR_VERSION = Property.internalInteger( - WORDPERFECT_METADATA_NAME_PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + - "MajorVersion"); + Property MAJOR_VERSION = Property.internalInteger(WORDPERFECT_METADATA_NAME_PREFIX + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "MajorVersion"); /** * Minor version. */ - Property MINOR_VERSION = Property.internalInteger( - WORDPERFECT_METADATA_NAME_PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + - "MinorVersion"); + Property MINOR_VERSION = Property.internalInteger(WORDPERFECT_METADATA_NAME_PREFIX + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "MinorVersion"); /** * Is encrypted?. */ - Property ENCRYPTED = Property.internalBoolean( - WORDPERFECT_METADATA_NAME_PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + - "Encrypted"); + Property ENCRYPTED = Property.internalBoolean(WORDPERFECT_METADATA_NAME_PREFIX + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Encrypted"); } diff --git a/tika-core/src/main/java/org/apache/tika/metadata/XMP.java b/tika-core/src/main/java/org/apache/tika/metadata/XMP.java index 12842c5f80..69f1a8d49b 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/XMP.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/XMP.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. 
The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.metadata; @@ -41,10 +39,10 @@ public interface XMP { Property ADVISORY = Property.externalTextBag(PREFIX_ + "Advisory"); /** - * The date and time the resource was created. For a digital file, this need not - * match a file-system creation time. For a freshly created resource, it should - * be close to that time, modulo the time taken to write the file. Later file - * transfer, copying, and so on, can make the file-system time arbitrarily different. + * The date and time the resource was created. For a digital file, this need not match a + * file-system creation time. For a freshly created resource, it should be close to that time, + * modulo the time taken to write the file. Later file transfer, copying, and so on, can make + * the file-system time arbitrarily different. 
*/ Property CREATE_DATE = Property.externalDate(PREFIX_ + "CreateDate"); @@ -54,10 +52,9 @@ public interface XMP { Property CREATOR_TOOL = Property.externalText(PREFIX_ + "CreatorTool"); /** - * An unordered array of text strings that unambiguously identify the resource - * within a given context. An array item may be qualified with xmpidq:Scheme - * (see 8.7, “xmpidq namespace”) to denote the formal identification system to - * which that identifier conforms. + * An unordered array of text strings that unambiguously identify the resource within a given + * context. An array item may be qualified with xmpidq:Scheme (see 8.7, “xmpidq namespace”) to + * denote the formal identification system to which that identifier conforms. */ Property IDENTIFIER = Property.externalTextBag(PREFIX_ + "Identifier"); @@ -67,8 +64,8 @@ public interface XMP { Property LABEL = Property.externalText(PREFIX_ + "Label"); /** - * The date and time that any metadata for this resource was last changed. It - * should be the same as or more recent than xmp:ModifyDate + * The date and time that any metadata for this resource was last changed. It should be the same + * as or more recent than xmp:ModifyDate */ Property METADATA_DATE = Property.externalDate(PREFIX_ + "MetadataDate"); @@ -83,15 +80,15 @@ public interface XMP { Property NICKNAME = Property.externalText(PREFIX_ + "NickName"); /** - * A user-assigned rating for this file. The value shall be -1 or in the range - * [0..5], where -1 indicates “rejected” and 0 indicates “unrated”. If xmp:Rating - * is not present, a value of 0 should be assumed. + * A user-assigned rating for this file. The value shall be -1 or in the range [0..5], where -1 + * indicates “rejected” and 0 indicates “unrated”. If xmp:Rating is not present, a value of 0 + * should be assumed. */ Property RATING = Property.externalInteger(PREFIX_ + "Rating"); /** - * This doesn't belong to the XMP Basic schema. 
However, because it is part of - * JempBox's XMPBasicSchema, we include this here. + * This doesn't belong to the XMP Basic schema. However, because it is part of JempBox's + * XMPBasicSchema, we include this here. */ Property TITLE = Property.externalText(PREFIX_ + "Title"); } diff --git a/tika-core/src/main/java/org/apache/tika/metadata/XMPDC.java b/tika-core/src/main/java/org/apache/tika/metadata/XMPDC.java index 26f60407ff..c7047da0ff 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/XMPDC.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/XMPDC.java @@ -1,26 +1,24 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
+ * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.metadata; /** - * Metadata keys for the XMP DublinCore schema. This differs from {@link DublinCore} in - * that this data must derive strictly from XMP. Tika applies logic to normalize - * metadata keys and values into {@link DublinCore}. This process can make it difficult to determine - * if the underlying metadata derived from a literal XMP component or from another source within the file. + * Metadata keys for the XMP DublinCore schema. This differs from {@link DublinCore} in that this + * data must derive strictly from XMP. Tika applies logic to normalize metadata keys and values into + * {@link DublinCore}. This process can make it difficult to determine if the underlying metadata + * derived from a literal XMP component or from another source within the file. *

* See TIKA-4444. */ @@ -30,162 +28,146 @@ public interface XMPDC { String PREFIX_DC_TERMS = "xmp" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "dcterms"; /** - * Typically, Format may include the media-type or dimensions of the - * resource. Format may be used to determine the software, hardware or - * other equipment needed to display or operate the resource. Examples - * of dimensions include size and duration. Recommended best practice is - * to select a value from a controlled vocabulary (for example, the list - * of Internet Media Types [MIME] defining computer media formats). + * Typically, Format may include the media-type or dimensions of the resource. Format may be + * used to determine the software, hardware or other equipment needed to display or operate the + * resource. Examples of dimensions include size and duration. Recommended best practice is to + * select a value from a controlled vocabulary (for example, the list of Internet Media Types + * [MIME] defining computer media formats). */ Property FORMAT = Property.internalText( - PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "format"); + PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "format"); /** - * Recommended best practice is to identify the resource by means of - * a string or number conforming to a formal identification system. - * Example formal identification systems include the Uniform Resource - * Identifier (URI) (including the Uniform Resource Locator (URL)), - * the Digital Object Identifier (DOI) and the International Standard - * Book Number (ISBN). + * Recommended best practice is to identify the resource by means of a string or number + * conforming to a formal identification system. Example formal identification systems include + * the Uniform Resource Identifier (URI) (including the Uniform Resource Locator (URL)), the + * Digital Object Identifier (DOI) and the International Standard Book Number (ISBN). 
*/ Property IDENTIFIER = Property.internalText( - PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "identifier"); + PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "identifier"); /** * Date on which the resource was changed. */ Property MODIFIED = Property.internalDate( - PREFIX_DC_TERMS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "modified"); + PREFIX_DC_TERMS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "modified"); /** - * An entity responsible for making contributions to the content of the - * resource. Examples of a Contributor include a person, an organisation, - * or a service. Typically, the name of a Contributor should be used to - * indicate the entity. + * An entity responsible for making contributions to the content of the resource. Examples of a + * Contributor include a person, an organisation, or a service. Typically, the name of a + * Contributor should be used to indicate the entity. */ Property CONTRIBUTOR = Property.internalTextBag( - PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "contributor"); + PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "contributor"); /** - * The extent or scope of the content of the resource. Coverage will - * typically include spatial location (a place name or geographic - * coordinates), temporal period (a period label, date, or date range) - * or jurisdiction (such as a named administrative entity). Recommended - * best practice is to select a value from a controlled vocabulary (for - * example, the Thesaurus of Geographic Names [TGN]) and that, where - * appropriate, named places or time periods be used in preference to - * numeric identifiers such as sets of coordinates or date ranges. + * The extent or scope of the content of the resource. Coverage will typically include spatial + * location (a place name or geographic coordinates), temporal period (a period label, date, or + * date range) or jurisdiction (such as a named administrative entity). 
Recommended best + * practice is to select a value from a controlled vocabulary (for example, the Thesaurus of + * Geographic Names [TGN]) and that, where appropriate, named places or time periods be used in + * preference to numeric identifiers such as sets of coordinates or date ranges. */ Property COVERAGE = Property.internalText( - PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "coverage"); + PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "coverage"); /** - * An entity primarily responsible for making the content of the resource. - * Examples of a Creator include a person, an organisation, or a service. - * Typically, the name of a Creator should be used to indicate the entity. + * An entity primarily responsible for making the content of the resource. Examples of a Creator + * include a person, an organisation, or a service. Typically, the name of a Creator should be + * used to indicate the entity. */ Property CREATOR = Property.internalTextBag( - PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "creator"); + PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "creator"); /** * Date of creation of the resource. */ Property CREATED = Property.internalDate( - PREFIX_DC_TERMS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "created"); + PREFIX_DC_TERMS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "created"); /** - * A date associated with an event in the life cycle of the resource. - * Typically, Date will be associated with the creation or availability of - * the resource. Recommended best practice for encoding the date value is - * defined in a profile of ISO 8601 [W3CDTF] and follows the YYYY-MM-DD - * format. + * A date associated with an event in the life cycle of the resource. Typically, Date will be + * associated with the creation or availability of the resource. Recommended best practice for + * encoding the date value is defined in a profile of ISO 8601 [W3CDTF] and follows the + * YYYY-MM-DD format. 
*/ Property DATE = Property.internalDate( - PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "date"); + PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "date"); /** - * An account of the content of the resource. Description may include - * but is not limited to: an abstract, table of contents, reference to - * a graphical representation of content or a free-text account of - * the content. + * An account of the content of the resource. Description may include but is not limited to: an + * abstract, table of contents, reference to a graphical representation of content or a + * free-text account of the content. */ Property DESCRIPTION = Property.internalText( - PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "description"); + PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "description"); /** - * A language of the intellectual content of the resource. Recommended - * best practice is to use RFC 3066 [RFC3066], which, in conjunction - * with ISO 639 [ISO639], defines two- and three-letter primary language - * tags with optional subtags. Examples include "en" or "eng" for English, - * "akk" for Akkadian, and "en-GB" for English used in the United Kingdom. + * A language of the intellectual content of the resource. Recommended best practice is to use + * RFC 3066 [RFC3066], which, in conjunction with ISO 639 [ISO639], defines two- and + * three-letter primary language tags with optional subtags. Examples include "en" or "eng" for + * English, "akk" for Akkadian, and "en-GB" for English used in the United Kingdom. */ Property LANGUAGE = Property.internalText( - PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "language"); + PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "language"); /** - * An entity responsible for making the resource available. Examples of - * a Publisher include a person, an organisation, or a service. Typically, - * the name of a Publisher should be used to indicate the entity. 
+ * An entity responsible for making the resource available. Examples of a Publisher include a + * person, an organisation, or a service. Typically, the name of a Publisher should be used to + * indicate the entity. */ Property PUBLISHER = Property.internalText( - PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "publisher"); + PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "publisher"); /** - * A reference to a related resource. Recommended best practice is to - * reference the resource by means of a string or number conforming to - * a formal identification system. + * A reference to a related resource. Recommended best practice is to reference the resource by + * means of a string or number conforming to a formal identification system. */ Property RELATION = Property.internalText( - PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "relation"); + PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "relation"); /** - * Information about rights held in and over the resource. Typically, - * a Rights element will contain a rights management statement for - * the resource, or reference a service providing such information. - * Rights information often encompasses Intellectual Property Rights - * (IPR), Copyright, and various Property Rights. If the Rights element - * is absent, no assumptions can be made about the status of these and - * other rights with respect to the resource. + * Information about rights held in and over the resource. Typically, a Rights element will + * contain a rights management statement for the resource, or reference a service providing such + * information. Rights information often encompasses Intellectual Property Rights (IPR), + * Copyright, and various Property Rights. If the Rights element is absent, no assumptions can + * be made about the status of these and other rights with respect to the resource. 
*/ Property RIGHTS = Property.internalText( - PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "rights"); + PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "rights"); /** - * A reference to a resource from which the present resource is derived. - * The present resource may be derived from the Source resource in whole - * or in part. Recommended best practice is to reference the resource by - * means of a string or number conforming to a formal identification + * A reference to a resource from which the present resource is derived. The present resource + * may be derived from the Source resource in whole or in part. Recommended best practice is to + * reference the resource by means of a string or number conforming to a formal identification * system. */ Property SOURCE = Property.internalText( - PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "source"); + PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "source"); /** - * The topic of the content of the resource. Typically, a Subject will - * be expressed as keywords, key phrases or classification codes that - * describe a topic of the resource. Recommended best practice is to - * select a value from a controlled vocabulary or formal classification - * scheme. + * The topic of the content of the resource. Typically, a Subject will be expressed as keywords, + * key phrases or classification codes that describe a topic of the resource. Recommended best + * practice is to select a value from a controlled vocabulary or formal classification scheme. */ Property SUBJECT = Property.internalTextBag( - PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "subject"); + PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "subject"); /** - * A name given to the resource. Typically, a Title will be a name by - * which the resource is formally known. + * A name given to the resource. Typically, a Title will be a name by which the resource is + * formally known. 
*/ Property TITLE = Property.internalText( - PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "title"); + PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "title"); /** - * The nature or genre of the content of the resource. Type includes terms - * describing general categories, functions, genres, or aggregation levels - * for content. Recommended best practice is to select a value from a - * controlled vocabulary (for example, the DCMI Type Vocabulary - * [DCMITYPE]). To describe the physical or digital manifestation of - * the resource, use the Format element. + * The nature or genre of the content of the resource. Type includes terms describing general + * categories, functions, genres, or aggregation levels for content. Recommended best practice + * is to select a value from a controlled vocabulary (for example, the DCMI Type Vocabulary + * [DCMITYPE]). To describe the physical or digital manifestation of the resource, use the + * Format element. */ Property TYPE = Property.internalText( - PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "type"); + PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "type"); } diff --git a/tika-core/src/main/java/org/apache/tika/metadata/XMPDM.java b/tika-core/src/main/java/org/apache/tika/metadata/XMPDM.java index d7faa4483f..b44fb07ac1 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/XMPDM.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/XMPDM.java @@ -1,37 +1,34 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.metadata; import java.util.Date; /** - * XMP Dynamic Media schema. This is a collection of - * {@link Property property definition} constants for the dynamic media - * properties defined in the XMP standard. + * XMP Dynamic Media schema. This is a collection of {@link Property property definition} constants + * for the dynamic media properties defined in the XMP standard. * - * @see XMP Specification, Part 2: Standard Schemas + * @see XMP Specification, Part 2: Standard Schemas * @since Apache Tika 0.7 */ public interface XMPDM { /** - * "The absolute path to the file's peak audio file. If empty, no peak - * file exists." + * "The absolute path to the file's peak audio file. If empty, no peak file exists." 
*/ Property ABS_PEAK_AUDIO_FILE_PATH = Property.internalURI("xmpDM:absPeakAudioFilePath"); @@ -41,17 +38,16 @@ public interface XMPDM { Property ALBUM = Property.externalText("xmpDM:album"); /** - * "An alternative tape name, set via the project window or timecode - * dialog in Premiere. If an alternative name has been set and has not - * been reverted, that name is displayed." + * "An alternative tape name, set via the project window or timecode dialog in Premiere. If an + * alternative name has been set and has not been reverted, that name is displayed." */ Property ALT_TAPE_NAME = Property.externalText("xmpDM:altTapeName"); -// /** -// * "A timecode set by the user. When specified, it is used instead -// * of the startTimecode." -// */ -// Property ALT_TIMECODE = "xmpDM:altTimecode"; + // /** + // * "A timecode set by the user. When specified, it is used instead + // * of the startTimecode." + // */ + // Property ALT_TIMECODE = "xmpDM:altTimecode"; /** * "The name of the artist or artists." @@ -69,23 +65,21 @@ public interface XMPDM { Property AUDIO_MOD_DATE = Property.internalDate("xmpDM:audioModDate"); /** - * "The audio sample rate. Can be any value, but commonly 32000, 41100, - * or 48000." + * "The audio sample rate. Can be any value, but commonly 32000, 41100, or 48000." */ Property AUDIO_SAMPLE_RATE = Property.internalInteger("xmpDM:audioSampleRate"); /** * "The audio sample type." */ - Property AUDIO_SAMPLE_TYPE = - Property.internalClosedChoise("xmpDM:audioSampleType", "8Int", "16Int", "32Int", - "32Float"); + Property AUDIO_SAMPLE_TYPE = Property.internalClosedChoise("xmpDM:audioSampleType", "8Int", + "16Int", "32Int", "32Float"); /** * "The audio channel type." */ - Property AUDIO_CHANNEL_TYPE = - Property.internalClosedChoise("xmpDM:audioChannelType", "Mono", "Stereo", "5.1", "7.1"); + Property AUDIO_CHANNEL_TYPE = Property.internalClosedChoise("xmpDM:audioChannelType", "Mono", + "Stereo", "5.1", "7.1"); /** * "The audio compression used. 
For example, MP3." */ @@ -95,10 +89,10 @@ public interface XMPDM { */ Property COMPILATION = Property.externalInteger("xmpDM:compilation"); -// /** -// * "Additional parameters for Beat Splice stretch mode." -// */ -// Property BEAT_SPLICE_PARAMS = "xmpDM:beatSpliceParams"; + // /** + // * "Additional parameters for Beat Splice stretch mode." + // */ + // Property BEAT_SPLICE_PARAMS = "xmpDM:beatSpliceParams"; /** * "The composer's name." */ @@ -108,17 +102,16 @@ public interface XMPDM { */ Property COPYRIGHT = Property.externalText("xmpDM:copyright"); -// /** -// * "An unordered list of all media used to create this media." -// */ -// Property CONTRIBUTED_MEDIA = "xmpDM:contributedMedia"; + // /** + // * "An unordered list of all media used to create this media." + // */ + // Property CONTRIBUTED_MEDIA = "xmpDM:contributedMedia"; /** * "The disc number for part of an album set." */ Property DISC_NUMBER = Property.externalInteger("xmpDM:discNumber"); /** - * "The duration of the media file." - * Value is in Seconds, unless xmpDM:scale is also set. + * "The duration of the media file." Value is in Seconds, unless xmpDM:scale is also set. */ Property DURATION = Property.externalReal("xmpDM:duration"); /** @@ -126,8 +119,7 @@ public interface XMPDM { */ Property ENGINEER = Property.externalText("xmpDM:engineer"); /** - * "The file data rate in megabytes per second. For example: - * '36/10' = 3.6 MB/sec" + * "The file data rate in megabytes per second. For example: '36/10' = 3.6 MB/sec" */ Property FILE_DATA_RATE = Property.internalRational("xmpDM:fileDataRate"); /** @@ -141,14 +133,13 @@ public interface XMPDM { /** * "The audio's musical key." */ - Property KEY = - Property.internalClosedChoise("xmpDM:key", "C", "C#", "D", "D#", "E", "F", "F#", "G", - "G#", "A", "A#", "B"); + Property KEY = Property.internalClosedChoise("xmpDM:key", "C", "C#", "D", "D#", "E", "F", "F#", + "G", "G#", "A", "A#", "B"); -// /** -// * "The duration of lead time for queuing music." 
-// */ -// Property INTRO_TIME = "xmpDM:introTime"; + // /** + // * "The duration of lead time for queuing music." + // */ + // Property INTRO_TIME = "xmpDM:introTime"; /** * "User's log comments." */ @@ -166,52 +157,51 @@ public interface XMPDM { */ Property METADATA_MOD_DATE = Property.internalDate("xmpDM:metadataModDate"); -// /** -// * An ordered list of markers. See also {@link #TRACKS xmpDM:Tracks}. -// */ -// Property MARKERS = "xmpDM:markers"; + // /** + // * An ordered list of markers. See also {@link #TRACKS xmpDM:Tracks}. + // */ + // Property MARKERS = "xmpDM:markers"; /** * "The sampling phase of film to be converted to video (pull-down)." */ - Property PULL_DOWN = - Property.internalClosedChoise("xmpDM:pullDown", "WSSWW", "SSWWW", "SWWWS", "WWWSS", - "WWSSW", "WSSWW_24p", "SSWWW_24p", "SWWWS_24p", "WWWSS_24p", "WWSSW_24p"); + Property PULL_DOWN = Property.internalClosedChoise("xmpDM:pullDown", "WSSWW", "SSWWW", "SWWWS", + "WWWSS", "WWSSW", "WSSWW_24p", "SSWWW_24p", "SWWWS_24p", "WWWSS_24p", + "WWSSW_24p"); -// /** -// * "The time at which to fade out." -// */ -// Property OUT_CUE = "xmpDM:outCue"; + // /** + // * "The time at which to fade out." + // */ + // Property OUT_CUE = "xmpDM:outCue"; -// /** -// * "A reference to the project that created this file." -// */ -// Property PROJECT_REF = "xmpDM:projectRef"; + // /** + // * "A reference to the project that created this file." + // */ + // Property PROJECT_REF = "xmpDM:projectRef"; /** - * "The relative path to the file's peak audio file. If empty, no peak - * file exists." + * "The relative path to the file's peak audio file. If empty, no peak file exists." */ Property RELATIVE_PEAK_AUDIO_FILE_PATH = - Property.internalURI("xmpDM:relativePeakAudioFilePath"); + Property.internalURI("xmpDM:relativePeakAudioFilePath"); /** * "The date the title was released." */ Property RELEASE_DATE = Property.externalDate("xmpDM:releaseDate"); -// /** -// * "The start time of the media inside the audio project." 
-// */ -// Property RELATIVE_TIMESTAMP = "xmpDM:relativeTimestamp"; + // /** + // * "The start time of the media inside the audio project." + // */ + // Property RELATIVE_TIMESTAMP = "xmpDM:relativeTimestamp"; /** - * "The musical scale used in the music. 'Neither' is most often used - * for instruments with no associated scale, such as drums." + * "The musical scale used in the music. 'Neither' is most often used for instruments with no + * associated scale, such as drums." */ - Property SCALE_TYPE = - Property.internalClosedChoise("xmpDM:scaleType", "Major", "Minor", "Both", "Neither"); + Property SCALE_TYPE = Property.internalClosedChoise("xmpDM:scaleType", "Major", "Minor", "Both", + "Neither"); -// /** -// * "Additional parameters for Resample stretch mode." -// */ -// Property RESAMPLE_PARAMS = "xmpDM:resampleParams"; + // /** + // * "Additional parameters for Resample stretch mode." + // */ + // Property RESAMPLE_PARAMS = "xmpDM:resampleParams"; /** * "The name of the scene." */ @@ -221,9 +211,8 @@ public interface XMPDM { */ Property SHOT_DATE = Property.externalDate("xmpDM:shotDate"); /** - * "The name of the location where the video was shot. For example: - * 'Oktoberfest, Munich, Germany'. For more accurate positioning, - * use the EXIF GPS values." + * "The name of the location where the video was shot. For example: 'Oktoberfest, Munich, + * Germany'. For more accurate positioning, use the EXIF GPS values." */ Property SHOT_LOCATION = Property.externalText("xmpDM:shotLocation"); /** @@ -231,26 +220,23 @@ public interface XMPDM { */ Property SHOT_NAME = Property.externalText("xmpDM:shotName"); /** - * "A description of the speaker angles from center front in degrees. - * For example: 'Left = -30, Right = 30, Center = 0, LFE = 45, - * Left Surround = -110, Right Surround = 110'" + * "A description of the speaker angles from center front in degrees. 
For example: 'Left = -30, + * Right = 30, Center = 0, LFE = 45, Left Surround = -110, Right Surround = 110'" */ Property SPEAKER_PLACEMENT = Property.externalText("xmpDM:speakerPlacement"); /** * "The audio stretch mode." */ - Property STRETCH_MODE = - Property.internalClosedChoise("xmpDM:stretchMode", "Fixed length", "Time-Scale", - "Resample", "Beat Splice", "Hybrid"); + Property STRETCH_MODE = Property.internalClosedChoise("xmpDM:stretchMode", "Fixed length", + "Time-Scale", "Resample", "Beat Splice", "Hybrid"); -// /** -// * "The timecode of the first frame of video in the file, as obtained -// * from the device control." -// */ -// Property START_TIMECODE = "xmpDM:startTimecode"; + // /** + // * "The timecode of the first frame of video in the file, as obtained + // * from the device control." + // */ + // Property START_TIMECODE = "xmpDM:startTimecode"; /** - * "The name of the tape from which the clip was captured, as set during - * the capture process." + * "The name of the tape from which the clip was captured, as set during the capture process." */ Property TAPE_NAME = Property.externalText("xmpDM:tapeName"); /** @@ -260,47 +246,45 @@ public interface XMPDM { /** * "The time signature of the music." */ - Property TIME_SIGNATURE = - Property.internalClosedChoise("xmpDM:timeSignature", "2/4", "3/4", "4/4", "5/4", "7/4", - "6/8", "9/8", "12/8", "other"); + Property TIME_SIGNATURE = Property.internalClosedChoise("xmpDM:timeSignature", "2/4", "3/4", + "4/4", "5/4", "7/4", "6/8", "9/8", "12/8", "other"); -// /** -// * "Additional parameters for Time-Scale stretch mode." -// */ -// Property TIME_SCALE_PARAMS = "xmpDM:timeScaleParams"; + // /** + // * "Additional parameters for Time-Scale stretch mode." + // */ + // Property TIME_SCALE_PARAMS = "xmpDM:timeScaleParams"; /** - * "A numeric value indicating the order of the audio file within its - * original recording." + * "A numeric value indicating the order of the audio file within its original recording." 
*/ Property TRACK_NUMBER = Property.externalInteger("xmpDM:trackNumber"); /** * "The alpha mode." */ - Property VIDEO_ALPHA_MODE = - Property.externalClosedChoise("xmpDM:videoAlphaMode", "straight", "pre-multiplied"); + Property VIDEO_ALPHA_MODE = Property.externalClosedChoise("xmpDM:videoAlphaMode", "straight", + "pre-multiplied"); -// /** -// * "An unordered list of tracks. A track is a named set of markers, -// * which can specify a frame rate for all markers in the set. -// * See also {@link #MARKERS xmpDM:markers}." -// */ -// Property TRACKS = "xmpDM:Tracks"; + // /** + // * "An unordered list of tracks. A track is a named set of markers, + // * which can specify a frame rate for all markers in the set. + // * See also {@link #MARKERS xmpDM:markers}." + // */ + // Property TRACKS = "xmpDM:Tracks"; /** * "When true, unity is clear, when false, it is opaque." */ Property VIDEO_ALPHA_UNITY_IS_TRANSPARENT = - Property.internalBoolean("xmpDM:videoAlphaUnityIsTransparent"); + Property.internalBoolean("xmpDM:videoAlphaUnityIsTransparent"); -// /** -// * "A color in CMYK or RGB to be used as the pre-multiple color when -// * alpha mode is pre-multiplied." -// */ -// Property VIDEO_ALPHA_PREMULTIPLE_COLOR = "xmpDM:videoAlphaPremultipleColor"; + // /** + // * "A color in CMYK or RGB to be used as the pre-multiple color when + // * alpha mode is pre-multiplied." + // */ + // Property VIDEO_ALPHA_PREMULTIPLE_COLOR = "xmpDM:videoAlphaPremultipleColor"; /** * "The color space." */ - Property VIDEO_COLOR_SPACE = - Property.internalClosedChoise("xmpDM:videoColorSpace", "sRGB", "CCIR-601", "CCIR-709"); + Property VIDEO_COLOR_SPACE = Property.internalClosedChoise("xmpDM:videoColorSpace", "sRGB", + "CCIR-601", "CCIR-709"); /** * "Video compression used. For example, jpeg." */ @@ -308,29 +292,28 @@ public interface XMPDM { /** * "The field order for video." 
*/ - Property VIDEO_FIELD_ORDER = - Property.internalClosedChoise("xmpDM:videoFieldOrder", "Upper", "Lower", "Progressive"); + Property VIDEO_FIELD_ORDER = Property.internalClosedChoise("xmpDM:videoFieldOrder", "Upper", + "Lower", "Progressive"); /** * "The video frame rate." */ Property VIDEO_FRAME_RATE = - Property.internalOpenChoise("xmpDM:videoFrameRate", "24", "NTSC", "PAL"); + Property.internalOpenChoise("xmpDM:videoFrameRate", "24", "NTSC", "PAL"); /** * "The date and time when the video was last modified." */ Property VIDEO_MOD_DATE = Property.internalDate("xmpDM:videoModDate"); -// /** -// * "The frame size. For example: w:720, h: 480, unit:pixels" -// */ -// Property VIDEO_FRAME_SIZE = "xmpDM:videoFrameSize"; + // /** + // * "The frame size. For example: w:720, h: 480, unit:pixels" + // */ + // Property VIDEO_FRAME_SIZE = "xmpDM:videoFrameSize"; /** - * "The size in bits of each color component of a pixel. Standard - * Windows 32-bit pixels have 8 bits per component." + * "The size in bits of each color component of a pixel. Standard Windows 32-bit pixels have 8 + * bits per component." */ - Property VIDEO_PIXEL_DEPTH = - Property.internalClosedChoise("xmpDM:videoPixelDepth", "8Int", "16Int", "32Int", - "32Float"); + Property VIDEO_PIXEL_DEPTH = Property.internalClosedChoise("xmpDM:videoPixelDepth", "8Int", + "16Int", "32Int", "32Float"); /** * "The aspect ratio, expressed as wd/ht. 
For example: '648/720' = 0.9" */ @@ -377,7 +360,7 @@ public static void convertAndSet(Metadata metadata, Object value) { } if (value instanceof Date) { // Won't happen in this case, just an example of already - // converted to a type metadata.set(property) handles + // converted to a type metadata.set(property) handles metadata.set(property, (Date) value); } if (value instanceof String) { diff --git a/tika-core/src/main/java/org/apache/tika/metadata/XMPIdq.java b/tika-core/src/main/java/org/apache/tika/metadata/XMPIdq.java index 015b0657d8..4bd670e8f6 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/XMPIdq.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/XMPIdq.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
+ * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.metadata; @@ -28,8 +26,8 @@ public interface XMPIdq { String PREFIX_ = PREFIX + ":"; /** - * A qualifier providing the name of the formal identification - * scheme used for an item in the xmp:Identifier array. + * A qualifier providing the name of the formal identification scheme used for an item in the + * xmp:Identifier array. */ Property SCHEME = Property.externalText(PREFIX_ + "Scheme"); diff --git a/tika-core/src/main/java/org/apache/tika/metadata/XMPMM.java b/tika-core/src/main/java/org/apache/tika/metadata/XMPMM.java index 2a81fa254f..e8d7845337 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/XMPMM.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/XMPMM.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.metadata; @@ -28,13 +26,12 @@ public interface XMPMM { String PREFIX_ = PREFIX + ":"; /** - * A reference to the resource from which this one is derived. - * This should be a minimal reference, in which missing - * components can be assumed to be unchanged. + * A reference to the resource from which this one is derived. This should be a minimal + * reference, in which missing components can be assumed to be unchanged. * * TODO This property is of type RessourceRef which is a struct */ -// Property DERIVED_FROM = Property.externalText(PREFIX_ + "DerivedFrom"); + // Property DERIVED_FROM = Property.externalText(PREFIX_ + "DerivedFrom"); /** * The common identifier for all versions and renditions of a resource. @@ -42,33 +39,28 @@ public interface XMPMM { Property DOCUMENTID = Property.externalText(PREFIX_ + "DocumentID"); /** - * An identifier for a specific incarnation of a resource, updated - * each time a file is saved. + * An identifier for a specific incarnation of a resource, updated each time a file is saved. 
*/ Property INSTANCEID = Property.externalText(PREFIX_ + "InstanceID"); /** - * The common identifier for the original resource from which - * the current resource is derived. For example, if you save a - * resource to a different format, then save that one to another - * format, each save operation should generate a new - * xmpMM:DocumentID that uniquely identifies the resource in - * that format, but should retain the ID of the source file here. + * The common identifier for the original resource from which the current resource is derived. + * For example, if you save a resource to a different format, then save that one to another + * format, each save operation should generate a new xmpMM:DocumentID that uniquely identifies + * the resource in that format, but should retain the ID of the source file here. */ Property ORIGINAL_DOCUMENTID = Property.externalText(PREFIX_ + "OriginalDocumentID"); /** - * The rendition class name for this resource. This property - * should be absent or set to default for a resource that is not - * a derived rendition + * The rendition class name for this resource. 
This property should be absent or set to default + * for a resource that is not a derived rendition */ - Property RENDITION_CLASS = - Property.externalOpenChoise(PREFIX_ + "RenditionClass", "default", "draft", "low-res", - "proof", "screen", "thumbnail"); + Property RENDITION_CLASS = Property.externalOpenChoise(PREFIX_ + "RenditionClass", "default", + "draft", "low-res", "proof", "screen", "thumbnail"); /** - * Can be used to provide additional rendition parameters that - * are too complex or verbose to encode in xmpMM:RenditionClass + * Can be used to provide additional rendition parameters that are too complex or verbose to + * encode in xmpMM:RenditionClass */ Property RENDITION_PARAMS = Property.externalText(PREFIX_ + "RenditionParams"); @@ -87,20 +79,17 @@ public interface XMPMM { Property HISTORY_WHEN = Property.externalTextBag(PREFIX_ + "History:When"); /** - * Software agent that created the action in the XMPMM's - * history section + * Software agent that created the action in the XMPMM's history section */ Property HISTORY_SOFTWARE_AGENT = Property.externalTextBag(PREFIX_ + "History:SoftwareAgent"); /** - * Document id for the document that this document - * was derived from + * Document id for the document that this document was derived from */ Property DERIVED_FROM_DOCUMENTID = Property.externalText(PREFIX_ + "DerivedFrom:DocumentID"); /** - * Instance id for the document instance that this - * document was derived from + * Instance id for the document instance that this document was derived from */ Property DERIVED_FROM_INSTANCEID = Property.externalText(PREFIX_ + "DerivedFrom:InstanceID"); diff --git a/tika-core/src/main/java/org/apache/tika/metadata/XMPPDF.java b/tika-core/src/main/java/org/apache/tika/metadata/XMPPDF.java index a4d1bb13a7..3f96e5ee4c 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/XMPPDF.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/XMPPDF.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software 
Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.metadata; @@ -23,7 +21,7 @@ public interface XMPPDF { String PREFIX = "xmp" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "pdf" - + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER; + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER; /** * Unordered text strings of about. 
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/XMPRights.java b/tika-core/src/main/java/org/apache/tika/metadata/XMPRights.java index 6254dbf266..7deb6c9201 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/XMPRights.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/XMPRights.java @@ -1,33 +1,30 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
* - * IPTC Metadata Descriptions taken from the IPTC Photo Metadata (July 2010) - * standard. These parts Copyright 2010 International Press Telecommunications - * Council. + * IPTC Metadata Descriptions taken from the IPTC Photo Metadata (July 2010) standard. These parts + * Copyright 2010 International Press Telecommunications Council. */ package org.apache.tika.metadata; /** * XMP Rights management schema. *

- * A collection of property constants for the - * rights management properties defined in the XMP + * A collection of property constants for the rights management properties defined in the XMP * standard. * - * @see XMP Photoshop + * @see XMP + * Photoshop * @since Apache Tika 1.2 */ public interface XMPRights { @@ -46,9 +43,8 @@ public interface XMPRights { Property CERTIFICATE = Property.internalText(PREFIX_ + "Certificate"); /** - * When true, indicates that this is a rights-managed resource. When - * false, indicates that this is a public-domain resource. Omit if the - * state is unknown. + * When true, indicates that this is a rights-managed resource. When false, indicates that this + * is a public-domain resource. Omit if the state is unknown. */ Property MARKED = Property.internalBoolean(PREFIX_ + "Marked"); diff --git a/tika-core/src/main/java/org/apache/tika/metadata/filter/CaptureGroupMetadataFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/filter/CaptureGroupMetadataFilter.java index 7023c4cb8b..cf217771fe 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/filter/CaptureGroupMetadataFilter.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/filter/CaptureGroupMetadataFilter.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. 
The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.metadata.filter; @@ -20,7 +18,6 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; - import org.apache.tika.config.Field; import org.apache.tika.config.Initializable; import org.apache.tika.config.InitializableProblemHandler; @@ -32,26 +29,23 @@ /** - * This filter runs a regex against the first value in the "sourceField". - * If the pattern matches, it extracts the first group of the first match and - * set's the "targetField"'s value to that first group. + * This filter runs a regex against the first value in the "sourceField". If the pattern matches, it + * extracts the first group of the first match and set's the "targetField"'s value to that first + * group. *

- * If there is a match, this will overwrite whatever value is in the - * "targetField". + * If there is a match, this will overwrite whatever value is in the "targetField". *

* If there is not a match, this filter will be a no-op. *

- * If there are multiple matches, this filter will capture only the first. - * Open a ticket if you need different behavior. + * If there are multiple matches, this filter will capture only the first. Open a ticket if you need + * different behavior. *

- * If the source field has multiple values, this will run the regex - * against only the first value. + * If the source field has multiple values, this will run the regex against only the first value. *

* If the source field does not exist, this filter will be a no-op. *

- * If the target field is the same value as the source field, this filter - * will overwrite the value in that field. Again, if there are multiple - * values in that field, those will all be overwritten. + * If the target field is the same value as the source field, this filter will overwrite the value + * in that field. Again, if there are multiple values in that field, those will all be overwritten. */ public class CaptureGroupMetadataFilter extends MetadataFilter implements Initializable { @@ -111,7 +105,7 @@ public void initialize(Map params) throws TikaConfigException { @Override public void checkInitialization(InitializableProblemHandler problemHandler) - throws TikaConfigException { + throws TikaConfigException { if (StringUtils.isBlank(sourceField)) { throw new TikaConfigException("Must specify a 'sourceField'"); } diff --git a/tika-core/src/main/java/org/apache/tika/metadata/filter/ClearByAttachmentTypeMetadataFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/filter/ClearByAttachmentTypeMetadataFilter.java index 6157b4e5c4..b5e0c64a68 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/filter/ClearByAttachmentTypeMetadataFilter.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/filter/ClearByAttachmentTypeMetadataFilter.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. 
The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.metadata.filter; @@ -20,7 +18,6 @@ import java.util.HashSet; import java.util.List; import java.util.Set; - import org.apache.tika.config.Field; import org.apache.tika.exception.TikaConfigException; import org.apache.tika.exception.TikaException; @@ -28,9 +25,8 @@ import org.apache.tika.metadata.TikaCoreProperties; /** - * This class clears the entire metadata object if the - * attachment type matches one of the types. The idea is that you might not want - * to store/transmit metadata for images or specific file types. + * This class clears the entire metadata object if the attachment type matches one of the types. The + * idea is that you might not want to store/transmit metadata for images or specific file types. */ public class ClearByAttachmentTypeMetadataFilter extends MetadataFilter { private final Set types; @@ -49,7 +45,7 @@ public void filter(Metadata metadata) throws TikaException { if (type == null) { return; } - if (! 
types.contains(type)) { + if (!types.contains(type)) { return; } for (String n : metadata.names()) { @@ -71,14 +67,16 @@ public void setTypes(List types) throws TikaConfigException { } catch (IllegalArgumentException e) { StringBuilder sb = new StringBuilder(); int i = 0; - for (TikaCoreProperties.EmbeddedResourceType type : TikaCoreProperties.EmbeddedResourceType.values()) { + for (TikaCoreProperties.EmbeddedResourceType type : TikaCoreProperties.EmbeddedResourceType + .values()) { if (i++ > 0) { sb.append(", "); } sb.append(type.name()); } - throw new TikaConfigException("I'm sorry. I regret I don't recognise " + t + - ". I do recognize the following (case-sensitive):" + sb.toString()); + throw new TikaConfigException("I'm sorry. I regret I don't recognise " + t + + ". I do recognize the following (case-sensitive):" + + sb.toString()); } } this.types.addAll(types); diff --git a/tika-core/src/main/java/org/apache/tika/metadata/filter/ClearByMimeMetadataFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/filter/ClearByMimeMetadataFilter.java index adcffc5adb..b3a80f87de 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/filter/ClearByMimeMetadataFilter.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/filter/ClearByMimeMetadataFilter.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. 
The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.metadata.filter; @@ -20,16 +18,14 @@ import java.util.HashSet; import java.util.List; import java.util.Set; - import org.apache.tika.config.Field; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; /** - * This class clears the entire metadata object if the - * mime matches the mime filter. The idea is that you might not want - * to store/transmit metadata for images or specific file types. + * This class clears the entire metadata object if the mime matches the mime filter. The idea is + * that you might not want to store/transmit metadata for images or specific file types. 
*/ public class ClearByMimeMetadataFilter extends MetadataFilter { private final Set mimes; diff --git a/tika-core/src/main/java/org/apache/tika/metadata/filter/CompositeMetadataFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/filter/CompositeMetadataFilter.java index 8f5907c443..864fd7612c 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/filter/CompositeMetadataFilter.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/filter/CompositeMetadataFilter.java @@ -1,35 +1,33 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. 
See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.metadata.filter; import java.util.ArrayList; import java.util.List; - import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; public class CompositeMetadataFilter extends MetadataFilter { - //no longer final to allow for no arg initialization during serialization + // no longer final to allow for no arg initialization during serialization private List filters; public CompositeMetadataFilter() { filters = new ArrayList<>(); } + public CompositeMetadataFilter(List filters) { this.filters = filters; } diff --git a/tika-core/src/main/java/org/apache/tika/metadata/filter/DateNormalizingMetadataFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/filter/DateNormalizingMetadataFilter.java index ed8280bcc9..e6e6f8943d 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/filter/DateNormalizingMetadataFilter.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/filter/DateNormalizingMetadataFilter.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.metadata.filter; @@ -22,32 +20,30 @@ import java.util.Date; import java.util.Locale; import java.util.TimeZone; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - import org.apache.tika.config.Field; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.Property; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** - * Some dates in some file formats do not have a timezone. - * Tika correctly stores these without a timezone, e.g. 'yyyy-MM-dd'T'HH:mm:ss' - * This can be a problem if end points expect a 'Z' timezone. - * This filter makes the assumption that dates without timezones are UTC - * and always modifies the date to: "yyyy-MM-dd'T'HH:mm:ss'Z'" + * Some dates in some file formats do not have a timezone. Tika correctly stores these without a + * timezone, e.g. 'yyyy-MM-dd'T'HH:mm:ss' This can be a problem if end points expect a 'Z' timezone. 
+ * This filter makes the assumption that dates without timezones are UTC and always modifies the + * date to: "yyyy-MM-dd'T'HH:mm:ss'Z'" * * Users can specify an alternate defaultTimeZone with - * {@link DateNormalizingMetadataFilter#setDefaultTimeZone(String)} to apply - * if the file format does not specify a timezone. + * {@link DateNormalizingMetadataFilter#setDefaultTimeZone(String)} to apply if the file format does + * not specify a timezone. * */ public class DateNormalizingMetadataFilter extends MetadataFilter { private static TimeZone UTC = TimeZone.getTimeZone("UTC"); - private static final Logger LOGGER = LoggerFactory.getLogger(DateNormalizingMetadataFilter.class); + private static final Logger LOGGER = + LoggerFactory.getLogger(DateNormalizingMetadataFilter.class); private TimeZone defaultTimeZone = UTC; @@ -75,8 +71,8 @@ public void filter(Metadata metadata) throws TikaException { d = dateFormatter.parse(dateString); metadata.set(property, utcFormatter.format(d)); } catch (ParseException e) { - LOGGER.warn("Couldn't convert date to default time zone: >" - + dateString + "<"); + LOGGER.warn("Couldn't convert date to default time zone: >" + dateString + + "<"); } } } diff --git a/tika-core/src/main/java/org/apache/tika/metadata/filter/DefaultMetadataFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/filter/DefaultMetadataFilter.java index 64a7d0ad61..82d2fd84d6 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/filter/DefaultMetadataFilter.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/filter/DefaultMetadataFilter.java @@ -1,23 +1,20 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.metadata.filter; import java.util.List; - import org.apache.tika.config.ServiceLoader; import org.apache.tika.utils.ServiceLoaderUtils; @@ -36,7 +33,8 @@ public DefaultMetadataFilter() { } private static List getDefaultFilters(ServiceLoader loader) { - List metadataFilters = loader.loadStaticServiceProviders(MetadataFilter.class); + List metadataFilters = + loader.loadStaticServiceProviders(MetadataFilter.class); ServiceLoaderUtils.sortLoadedClasses(metadataFilters); return metadataFilters; diff --git a/tika-core/src/main/java/org/apache/tika/metadata/filter/ExcludeFieldMetadataFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/filter/ExcludeFieldMetadataFilter.java index e000899101..0f174bff01 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/filter/ExcludeFieldMetadataFilter.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/filter/ExcludeFieldMetadataFilter.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.metadata.filter; @@ -20,7 +18,6 @@ import java.util.HashSet; import java.util.List; import java.util.Set; - import org.apache.tika.config.Field; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; diff --git a/tika-core/src/main/java/org/apache/tika/metadata/filter/FieldNameMappingFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/filter/FieldNameMappingFilter.java index fe346e74df..6976cb9c0e 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/filter/FieldNameMappingFilter.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/filter/FieldNameMappingFilter.java @@ -1,24 +1,21 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.metadata.filter; import java.util.LinkedHashMap; import java.util.Map; - import org.apache.tika.config.Field; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; @@ -57,10 +54,9 @@ public void filter(Metadata metadata) throws TikaException { } /** - * If this is true (default), this means that only the fields that - * have a "from" value in the mapper will be passed through. Otherwise, - * this will pass through all keys/values and mutate the keys - * that exist in the mappings. + * If this is true (default), this means that only the fields that have a "from" + * value in the mapper will be passed through. Otherwise, this will pass through all keys/values + * and mutate the keys that exist in the mappings. 
* * @param excludeUnmapped */ @@ -82,6 +78,7 @@ public Map getMappins() { @Override public String toString() { - return "FieldNameMappingFilter{" + "mappings=" + mappings + ", excludeUnmapped=" + excludeUnmapped + '}'; + return "FieldNameMappingFilter{" + "mappings=" + mappings + ", excludeUnmapped=" + + excludeUnmapped + '}'; } } diff --git a/tika-core/src/main/java/org/apache/tika/metadata/filter/GeoPointMetadataFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/filter/GeoPointMetadataFilter.java index ac48454cf5..58a8c25c09 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/filter/GeoPointMetadataFilter.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/filter/GeoPointMetadataFilter.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.metadata.filter; @@ -23,9 +21,9 @@ import org.apache.tika.utils.StringUtils; /** - * If {@link Metadata} contains a {@link TikaCoreProperties#LATITUDE} and - * a {@link TikaCoreProperties#LONGITUDE}, this filter concatenates those with a - * comma in the order LATITUDE,LONGITUDE. + * If {@link Metadata} contains a {@link TikaCoreProperties#LATITUDE} and a + * {@link TikaCoreProperties#LONGITUDE}, this filter concatenates those with a comma in the order + * LATITUDE,LONGITUDE. * * If you need any other mappings, please open a ticket on our JIRA. */ @@ -34,8 +32,8 @@ public class GeoPointMetadataFilter extends MetadataFilter { String geoPointFieldName = "location"; /** - * Set the field for the concatenated LATITUDE,LONGITUDE string. - * The default if &dquot;location&dquot; + * Set the field for the concatenated LATITUDE,LONGITUDE string. The default if + * &dquot;location&dquot; * * @param geoPointFieldName field name to use for the geopoint field */ diff --git a/tika-core/src/main/java/org/apache/tika/metadata/filter/IncludeFieldMetadataFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/filter/IncludeFieldMetadataFilter.java index 3fe2a90fad..4f4441e8bd 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/filter/IncludeFieldMetadataFilter.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/filter/IncludeFieldMetadataFilter.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.metadata.filter; @@ -20,7 +18,6 @@ import java.util.HashSet; import java.util.List; import java.util.Set; - import org.apache.tika.config.Field; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; diff --git a/tika-core/src/main/java/org/apache/tika/metadata/filter/MetadataFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/filter/MetadataFilter.java index 21eb3eced1..8a22f0974e 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/filter/MetadataFilter.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/filter/MetadataFilter.java @@ -1,31 +1,27 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
+ * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.metadata.filter; import java.io.IOException; import java.io.Serializable; - -import org.w3c.dom.Element; - import org.apache.tika.config.ConfigBase; import org.apache.tika.exception.TikaConfigException; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; +import org.w3c.dom.Element; /** * Filters the metadata in place after the parse @@ -36,16 +32,17 @@ public abstract class MetadataFilter extends ConfigBase implements Serializable /** * Loads the metadata filter from the config file if it exists, otherwise returns NoOpFilter + * * @param root * @return * @throws TikaConfigException * @throws IOException */ - public static MetadataFilter load(Element root, boolean allowMissing) throws TikaConfigException, - IOException { + public static MetadataFilter load(Element root, boolean allowMissing) + throws TikaConfigException, IOException { try { return buildComposite("metadataFilters", CompositeMetadataFilter.class, - "metadataFilter", MetadataFilter.class, root); + "metadataFilter", MetadataFilter.class, root); } catch (TikaConfigException e) { if (allowMissing && e.getMessage().contains("could not find metadataFilters")) { return new NoOpFilter(); diff --git a/tika-core/src/main/java/org/apache/tika/metadata/filter/NoOpFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/filter/NoOpFilter.java index d95472a1b9..349a0f8b0d 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/filter/NoOpFilter.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/filter/NoOpFilter.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * 
contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.metadata.filter; @@ -20,8 +18,7 @@ import org.apache.tika.metadata.Metadata; /** - * This filter performs no operations on the metadata - * and leaves it untouched. + * This filter performs no operations on the metadata and leaves it untouched. 
*/ public class NoOpFilter extends MetadataFilter { @@ -29,6 +26,6 @@ public class NoOpFilter extends MetadataFilter { @Override public void filter(Metadata metadata) throws TikaException { - //no op + // no op } } diff --git a/tika-core/src/main/java/org/apache/tika/metadata/listfilter/CompositeMetadataListFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/listfilter/CompositeMetadataListFilter.java index cede25bd52..3e347fb61c 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/listfilter/CompositeMetadataListFilter.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/listfilter/CompositeMetadataListFilter.java @@ -1,35 +1,33 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
+ * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.metadata.listfilter; import java.util.ArrayList; import java.util.List; - import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; public class CompositeMetadataListFilter extends MetadataListFilter { - //no longer final to allow for no arg initialization during serialization + // no longer final to allow for no arg initialization during serialization private List filters; public CompositeMetadataListFilter() { filters = new ArrayList<>(); } + public CompositeMetadataListFilter(List filters) { this.filters = filters; } diff --git a/tika-core/src/main/java/org/apache/tika/metadata/listfilter/MetadataListFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/listfilter/MetadataListFilter.java index 0735a98a1a..840a0718fe 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/listfilter/MetadataListFilter.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/listfilter/MetadataListFilter.java @@ -1,45 +1,43 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. 
The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.metadata.listfilter; import java.io.IOException; import java.io.Serializable; import java.util.List; - -import org.w3c.dom.Element; - import org.apache.tika.config.ConfigBase; import org.apache.tika.exception.TikaConfigException; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; +import org.w3c.dom.Element; public abstract class MetadataListFilter extends ConfigBase implements Serializable { /** - * Loads the metadata list filter from the config file if it exists, otherwise returns NoOpFilter + * Loads the metadata list filter from the config file if it exists, otherwise returns + * NoOpFilter + * * @param root * @return * @throws TikaConfigException * @throws IOException */ - public static MetadataListFilter load(Element root, boolean allowMissing) throws TikaConfigException, - IOException { + public static MetadataListFilter load(Element root, boolean allowMissing) + throws TikaConfigException, IOException { try { return buildComposite("metadataListFilters", 
CompositeMetadataListFilter.class, - "metadataListFilter", MetadataListFilter.class, root); + "metadataListFilter", MetadataListFilter.class, root); } catch (TikaConfigException e) { if (allowMissing && e.getMessage().contains("could not find metadataListFilters")) { return new NoOpListFilter(); @@ -47,5 +45,6 @@ public static MetadataListFilter load(Element root, boolean allowMissing) throws throw e; } } + public abstract List filter(List metadataList) throws TikaException; } diff --git a/tika-core/src/main/java/org/apache/tika/metadata/listfilter/NoOpListFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/listfilter/NoOpListFilter.java index 68654e4f2c..a20d62861b 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/listfilter/NoOpListFilter.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/listfilter/NoOpListFilter.java @@ -1,23 +1,20 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.metadata.listfilter; import java.util.List; - import org.apache.tika.metadata.Metadata; public class NoOpListFilter extends MetadataListFilter { diff --git a/tika-core/src/main/java/org/apache/tika/metadata/package-info.java b/tika-core/src/main/java/org/apache/tika/metadata/package-info.java index 02fcae3ec8..c509c743e9 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/package-info.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/package-info.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. 
The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ /** diff --git a/tika-core/src/main/java/org/apache/tika/metadata/writefilter/MetadataWriteFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/writefilter/MetadataWriteFilter.java index e03367c773..35790aeb01 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/writefilter/MetadataWriteFilter.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/writefilter/MetadataWriteFilter.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. 
See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.metadata.writefilter; @@ -24,13 +22,12 @@ public interface MetadataWriteFilter extends Serializable { void filterExisting(Map data); /** - * Based on the field and value, this filter modifies the field - * and/or the value to something that should be added to the Metadata object. + * Based on the field and value, this filter modifies the field and/or the value to something + * that should be added to the Metadata object. * * If the value is null, no value is set or added. * - * Status updates (e.g. write limit reached) can be added directly to the - * underlying metadata. + * Status updates (e.g. write limit reached) can be added directly to the underlying metadata. 
* * @param field * @param value @@ -40,9 +37,8 @@ public interface MetadataWriteFilter extends Serializable { void add(String field, String value, Map data); /** - * Based on the field and the value, this filter modifies - * the field and/or the value to something that should be set in the - * Metadata object. + * Based on the field and the value, this filter modifies the field and/or the value to + * something that should be set in the Metadata object. * * @param field * @param value diff --git a/tika-core/src/main/java/org/apache/tika/metadata/writefilter/MetadataWriteFilterFactory.java b/tika-core/src/main/java/org/apache/tika/metadata/writefilter/MetadataWriteFilterFactory.java index 49d87dc468..4605de4134 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/writefilter/MetadataWriteFilterFactory.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/writefilter/MetadataWriteFilterFactory.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.metadata.writefilter; diff --git a/tika-core/src/main/java/org/apache/tika/metadata/writefilter/StandardWriteFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/writefilter/StandardWriteFilter.java index 8e11b9805a..28dbf490a3 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/writefilter/StandardWriteFilter.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/writefilter/StandardWriteFilter.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. 
The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.metadata.writefilter; @@ -26,44 +24,38 @@ import java.util.HashSet; import java.util.Map; import java.util.Set; - import org.apache.tika.metadata.AccessPermissions; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.utils.StringUtils; /** - * This is to be used to limit the amount of metadata that a - * parser can add based on the {@link #maxTotalEstimatedSize}, - * {@link #maxFieldSize}, {@link #maxValuesPerField}, and - * {@link #maxKeySize}. This can also be used to limit which - * fields are stored in the metadata object at write-time - * with {@link #includeFields}. + * This is to be used to limit the amount of metadata that a parser can add based on the + * {@link #maxTotalEstimatedSize}, {@link #maxFieldSize}, {@link #maxValuesPerField}, and + * {@link #maxKeySize}. This can also be used to limit which fields are stored in the metadata + * object at write-time with {@link #includeFields}. * - * All sizes are measured in UTF-16 bytes. 
The size is estimated - * as a rough order of magnitude of what is - * required to store the string in memory in Java. We recognize - * that Java uses more bytes to store length, offset etc. for strings. But - * the extra overhead varies by Java version and implementation, - * and we just need a basic estimate. We also recognize actual - * memory usage is affected by interning strings, etc. - * Please forgive us ... or consider writing your own write filter. :) + * All sizes are measured in UTF-16 bytes. The size is estimated as a rough order of magnitude of + * what is required to store the string in memory in Java. We recognize that Java uses more bytes to + * store length, offset etc. for strings. But the extra overhead varies by Java version and + * implementation, and we just need a basic estimate. We also recognize actual memory usage is + * affected by interning strings, etc. Please forgive us ... or consider writing your own write + * filter. :) * * - * NOTE: Fields in {@link #ALWAYS_SET_FIELDS} are - * always set no matter the current state of {@link #maxTotalEstimatedSize}. - * Except for {@link TikaCoreProperties#TIKA_CONTENT}, they are truncated at - * {@link #maxFieldSize}, and their sizes contribute to the {@link #maxTotalEstimatedSize}. + * NOTE: Fields in {@link #ALWAYS_SET_FIELDS} are always set no matter the current state of + * {@link #maxTotalEstimatedSize}. Except for {@link TikaCoreProperties#TIKA_CONTENT}, they are + * truncated at {@link #maxFieldSize}, and their sizes contribute to the + * {@link #maxTotalEstimatedSize}. * - * NOTE: Fields in {@link #ALWAYS_ADD_FIELDS} are - * always added no matter the current state of {@link #maxTotalEstimatedSize}. - * Except for {@link TikaCoreProperties#TIKA_CONTENT}, each addition is truncated at - * {@link #maxFieldSize}, and their sizes contribute to the {@link #maxTotalEstimatedSize}. 
+ * NOTE: Fields in {@link #ALWAYS_ADD_FIELDS} are always added no matter the current state of + * {@link #maxTotalEstimatedSize}. Except for {@link TikaCoreProperties#TIKA_CONTENT}, each addition + * is truncated at {@link #maxFieldSize}, and their sizes contribute to the + * {@link #maxTotalEstimatedSize}. * - * This class {@link #minimumMaxFieldSizeInAlwaysFields} to protect the - * {@link #ALWAYS_ADD_FIELDS} and {@link #ALWAYS_SET_FIELDS}. If we didn't - * have this and a user sets the {@link #maxFieldSize} to, say, 10 bytes, - * the internal parser behavior would be broken because parsers rely on + * This class {@link #minimumMaxFieldSizeInAlwaysFields} to protect the {@link #ALWAYS_ADD_FIELDS} + * and {@link #ALWAYS_SET_FIELDS}. If we didn't have this and a user sets the {@link #maxFieldSize} + * to, say, 10 bytes, the internal parser behavior would be broken because parsers rely on * {@link Metadata#CONTENT_TYPE} to determine which parser to call. * * NOTE: as with {@link Metadata}, this object is not thread safe. @@ -87,7 +79,7 @@ public class StandardWriteFilter implements MetadataWriteFilter, Serializable { ALWAYS_SET_FIELDS.add(Metadata.CONTENT_DISPOSITION); ALWAYS_SET_FIELDS.add(TikaCoreProperties.CONTAINER_EXCEPTION.getName()); ALWAYS_SET_FIELDS.add(TikaCoreProperties.EMBEDDED_EXCEPTION.getName()); - //Metadata.CONTENT_LOCATION? used by the html parser + // Metadata.CONTENT_LOCATION? used by the html parser } static { @@ -95,13 +87,13 @@ public class StandardWriteFilter implements MetadataWriteFilter, Serializable { } private static final String METADATA_TRUNCATED_KEY = - TikaCoreProperties.TRUNCATED_METADATA.getName(); + TikaCoreProperties.TRUNCATED_METADATA.getName(); private static final String TIKA_CONTENT_KEY = TikaCoreProperties.TIKA_CONTENT.getName(); - private static final String[] TRUE = new String[]{"true"}; + private static final String[] TRUE = new String[] {"true"}; - //allow at least these many bytes in the "always" fields. 
- //As of 2022-03, the longest mime is 146. Doubling that gives - //us some leeway. If a mime is truncated, bad things will happen. + // allow at least these many bytes in the "always" fields. + // As of 2022-03, the longest mime is 146. Doubling that gives + // us some leeway. If a mime is truncated, bad things will happen. private final int minimumMaxFieldSizeInAlwaysFields = 300; @@ -117,24 +109,23 @@ public class StandardWriteFilter implements MetadataWriteFilter, Serializable { private Map fieldSizes = new HashMap<>(); - //tracks the estimated size in utf16 bytes. Can be > maxEstimated size + // tracks the estimated size in utf16 bytes. Can be > maxEstimated size int estimatedSize = 0; /** - * @param maxKeySize maximum key size in UTF-16 bytes-- keys will be truncated to this - * length; if less than 0, keys will not be truncated + * @param maxKeySize maximum key size in UTF-16 bytes-- keys will be truncated to this length; + * if less than 0, keys will not be truncated * @param maxEstimatedSize - * @param includeFields if null or empty, all fields are included; otherwise, which fields - * to add to the metadata object. - * @param excludeFields these fields will not be included (unless they're in {@link StandardWriteFilter#ALWAYS_SET_FIELDS}) - * @param includeEmpty if true, this will set or add an empty value to the - * metadata object. + * @param includeFields if null or empty, all fields are included; otherwise, which fields to + * add to the metadata object. + * @param excludeFields these fields will not be included (unless they're in + * {@link StandardWriteFilter#ALWAYS_SET_FIELDS}) + * @param includeEmpty if true, this will set or add an empty value to the metadata + * object. 
*/ protected StandardWriteFilter(int maxKeySize, int maxFieldSize, int maxEstimatedSize, - int maxValuesPerField, - Set includeFields, - Set excludeFields, - boolean includeEmpty) { + int maxValuesPerField, Set includeFields, Set excludeFields, + boolean includeEmpty) { this.maxKeySize = maxKeySize; this.maxFieldSize = maxFieldSize; @@ -147,16 +138,16 @@ protected StandardWriteFilter(int maxKeySize, int maxFieldSize, int maxEstimated @Override public void filterExisting(Map data) { - //this is somewhat costly, but it ensures that - //metadata that was placed in the metadata object before this - //filter was applied is removed. - //It should only be called once, and probably not on that - //many fields. + // this is somewhat costly, but it ensures that + // metadata that was placed in the metadata object before this + // filter was applied is removed. + // It should only be called once, and probably not on that + // many fields. Map tmp = new HashMap<>(); for (Map.Entry e : data.entrySet()) { String name = e.getKey(); String[] vals = e.getValue(); - if (! includeField(name)) { + if (!includeField(name)) { continue; } for (int i = 0; i < vals.length; i++) { @@ -173,12 +164,12 @@ public void filterExisting(Map data) { @Override public void set(String field, String value, Map data) { - //legacy behavior is that setting(null) removes the key + // legacy behavior is that setting(null) removes the key if (value == null) { data.remove(field); return; } - if (! 
include(field, value)) { + if (!include(field, value)) { return; } if (ALWAYS_SET_FIELDS.contains(field) || ALWAYS_ADD_FIELDS.contains(field)) { @@ -192,12 +183,12 @@ public void set(String field, String value, Map data) { private void setAlwaysInclude(String field, String value, Map data) { if (TIKA_CONTENT_KEY.equals(field)) { - data.put(field, new String[]{ value }); + data.put(field, new String[] {value}); return; } int sizeToAdd = estimateSize(value); - //if the maxFieldSize is < minimumMaxFieldSizeInAlwaysFields, use the minmax - //we do not want to truncate a mime! + // if the maxFieldSize is < minimumMaxFieldSizeInAlwaysFields, use the minmax + // we do not want to truncate a mime! int alwaysMaxFieldLength = Math.max(minimumMaxFieldSizeInAlwaysFields, maxFieldSize); String toSet = value; if (sizeToAdd > alwaysMaxFieldLength) { @@ -208,29 +199,29 @@ private void setAlwaysInclude(String field, String value, Map totalAdded += sizeToAdd; if (data.containsKey(field)) { String[] vals = data.get(field); - //this should only ever be single valued!!! + // this should only ever be single valued!!! if (vals.length > 0) { totalAdded -= estimateSize(vals[0]); } } estimatedSize += totalAdded; - data.put(field, new String[]{toSet}); + data.put(field, new String[] {toSet}); } private void addAlwaysInclude(String field, String value, Map data) { if (TIKA_CONTENT_KEY.equals(field)) { - data.put(field, new String[]{ value }); + data.put(field, new String[] {value}); return; } - if (! data.containsKey(field)) { + if (!data.containsKey(field)) { setAlwaysInclude(field, value, data); return; } - //TODO: should we limit the number of field values? + // TODO: should we limit the number of field values? int toAddSize = estimateSize(value); - //if the maxFieldSize is < minimumMaxFieldSizeInAlwaysFields, use the minmax - //we do not want to truncate a mime! + // if the maxFieldSize is < minimumMaxFieldSizeInAlwaysFields, use the minmax + // we do not want to truncate a mime! 
int alwaysMaxFieldLength = Math.max(minimumMaxFieldSizeInAlwaysFields, maxFieldSize); String toAddValue = value; if (toAddSize > alwaysMaxFieldLength) { @@ -245,19 +236,19 @@ private void addAlwaysInclude(String field, String value, Map } - //calculate the max field length allowed if we are - //setting a value + // calculate the max field length allowed if we are + // setting a value private int maxAllowedToSet(StringSizePair filterKey) { Integer existingSizeInt = fieldSizes.get(filterKey.string); int existingSize = existingSizeInt == null ? 0 : existingSizeInt; - //this is how much is allowed by the overall total limit + // this is how much is allowed by the overall total limit int allowedByMaxTotal = maxTotalEstimatedSize - estimatedSize; - //if we're overwriting a value, that value's data size is now available + // if we're overwriting a value, that value's data size is now available allowedByMaxTotal += existingSize; - //if we're adding a key, we need to subtract that value + // if we're adding a key, we need to subtract that value allowedByMaxTotal -= existingSizeInt == null ? filterKey.size : 0; return Math.min(maxFieldSize, allowedByMaxTotal); @@ -266,7 +257,7 @@ private int maxAllowedToSet(StringSizePair filterKey) { @Override public void add(String field, String value, Map data) { - if (! include(field, value)) { + if (!include(field, value)) { return; } if (ALWAYS_SET_FIELDS.contains(field)) { @@ -277,7 +268,7 @@ public void add(String field, String value, Map data) { return; } StringSizePair filterKey = filterKey(field, value, data); - if (! data.containsKey(filterKey.string)) { + if (!data.containsKey(filterKey.string)) { setFilterKey(filterKey, value, data); return; } @@ -308,16 +299,16 @@ public void add(String field, String value, Map data) { int addedOverall = valueLength; if (fieldSizeInteger == null) { - //if there was no value before, we're adding - //a key. If there was a value before, do not - //add the key length. 
+ // if there was no value before, we're adding + // a key. If there was a value before, do not + // add the key length. addedOverall += filterKey.size; } estimatedSize += addedOverall; fieldSizes.put(filterKey.string, valueLength + fieldSize); - data.put(filterKey.string, appendValue(data.get(filterKey.string), toAdd )); + data.put(filterKey.string, appendValue(data.get(filterKey.string), toAdd)); } private String[] appendValue(String[] values, final String value) { @@ -330,28 +321,27 @@ private String[] appendValue(String[] values, final String value) { return newValues; } - //calculate the max field length allowed if we are - //adding a value + // calculate the max field length allowed if we are + // adding a value private int maxAllowedToAdd(StringSizePair filterKey) { Integer existingSizeInt = fieldSizes.get(filterKey.string); int existingSize = existingSizeInt == null ? 0 : existingSizeInt; - //how much can we add to this field + // how much can we add to this field int allowedByMaxField = maxFieldSize - existingSize; - //this is how much is allowed by the overall total limit + // this is how much is allowed by the overall total limit int allowedByMaxTotal = maxTotalEstimatedSize - estimatedSize - 1; - //if we're adding a new key, we need to subtract that value + // if we're adding a new key, we need to subtract that value allowedByMaxTotal -= existingSizeInt == null ? filterKey.size : 0; return Math.min(allowedByMaxField, allowedByMaxTotal); } - private void setFilterKey(StringSizePair filterKey, String value, - Map data) { - //if you can't even add the key, give up now - if (! 
data.containsKey(filterKey.string) && - (filterKey.size + estimatedSize > maxTotalEstimatedSize)) { + private void setFilterKey(StringSizePair filterKey, String value, Map data) { + // if you can't even add the key, give up now + if (!data.containsKey(filterKey.string) + && (filterKey.size + estimatedSize > maxTotalEstimatedSize)) { setTruncated(data); return; } @@ -375,9 +365,9 @@ private void setFilterKey(StringSizePair filterKey, String value, int addedOverall = 0; if (fieldSizeInteger == null) { - //if there was no value before, we're adding - //a key. If there was a value before, do not - //add the key length. + // if there was no value before, we're adding + // a key. If there was a value before, do not + // add the key length. addedOverall += filterKey.size; } addedOverall += valueLength - fieldSize; @@ -385,7 +375,7 @@ private void setFilterKey(StringSizePair filterKey, String value, fieldSizes.put(filterKey.string, valueLength); - data.put(filterKey.string, new String[]{ toSet }); + data.put(filterKey.string, new String[] {toSet}); } @@ -400,15 +390,13 @@ private StringSizePair filterKey(String field, String value, Map data) { setTruncated(data); - //correctly handle multibyte characters + // correctly handle multibyte characters byte[] bytes = value.getBytes(StandardCharsets.UTF_16BE); ByteBuffer bb = ByteBuffer.wrap(bytes, 0, length); CharBuffer cb = CharBuffer.allocate(length); @@ -426,6 +414,7 @@ private boolean include(String field, String value) { /** * Tests for null or empty. 
Does not check for length + * * @param value * @return */ @@ -461,7 +450,7 @@ private static int estimateSize(String s) { private static class StringSizePair { final String string; - final int size;//utf-16 bytes -- estimated + final int size;// utf-16 bytes -- estimated final boolean truncated; public StringSizePair(String string, int size, boolean truncated) { diff --git a/tika-core/src/main/java/org/apache/tika/metadata/writefilter/StandardWriteFilterFactory.java b/tika-core/src/main/java/org/apache/tika/metadata/writefilter/StandardWriteFilterFactory.java index df6d8b42d1..0cc3489510 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/writefilter/StandardWriteFilterFactory.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/writefilter/StandardWriteFilterFactory.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.metadata.writefilter; @@ -22,8 +20,8 @@ import java.util.concurrent.ConcurrentHashMap; /** - * Factory class for {@link StandardWriteFilter}. See that class - * for how the estimated sizes are calculated on Strings. + * Factory class for {@link StandardWriteFilter}. See that class for how the estimated sizes are + * calculated on Strings. */ public class StandardWriteFilterFactory implements MetadataWriteFilterFactory { @@ -55,9 +53,8 @@ public MetadataWriteFilter newInstance() { throw new IllegalArgumentException("max estimated size must be > 0"); } - return new StandardWriteFilter(maxKeySize, maxFieldSize, - maxTotalEstimatedBytes, maxValuesPerField, includeFields, - excludeFields, includeEmpty); + return new StandardWriteFilter(maxKeySize, maxFieldSize, maxTotalEstimatedBytes, + maxValuesPerField, includeFields, excludeFields, includeEmpty); } public void setIncludeFields(List includeFields) { @@ -118,9 +115,10 @@ public boolean isIncludeEmpty() { @Override public String toString() { - return "StandardWriteFilterFactory{" + "includeFields=" + includeFields + ", maxKeySize=" + - maxKeySize + ", maxFieldSize=" + maxFieldSize + ", maxTotalEstimatedBytes=" + - maxTotalEstimatedBytes + ", maxValuesPerField=" + maxValuesPerField + - ", includeEmpty=" + includeEmpty + '}'; + return "StandardWriteFilterFactory{" + "includeFields=" + includeFields + ", maxKeySize=" + + maxKeySize + ", maxFieldSize=" + maxFieldSize + + ", maxTotalEstimatedBytes=" + maxTotalEstimatedBytes + + ", maxValuesPerField=" + maxValuesPerField + ", 
includeEmpty=" + + includeEmpty + '}'; } } diff --git a/tika-core/src/main/java/org/apache/tika/mime/AndClause.java b/tika-core/src/main/java/org/apache/tika/mime/AndClause.java index c7a2184820..1cab8c1b14 100644 --- a/tika-core/src/main/java/org/apache/tika/mime/AndClause.java +++ b/tika-core/src/main/java/org/apache/tika/mime/AndClause.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.mime; diff --git a/tika-core/src/main/java/org/apache/tika/mime/Clause.java b/tika-core/src/main/java/org/apache/tika/mime/Clause.java index fc3bcc1e39..c74bbd6e95 100644 --- a/tika-core/src/main/java/org/apache/tika/mime/Clause.java +++ b/tika-core/src/main/java/org/apache/tika/mime/Clause.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.mime; @@ -29,8 +27,8 @@ interface Clause extends Serializable { boolean eval(byte[] data); /** - * Returns the size of this clause. The size of a clause is the number of - * chars it is composed of. + * Returns the size of this clause. The size of a clause is the number of chars it is composed + * of. */ int size(); diff --git a/tika-core/src/main/java/org/apache/tika/mime/HexCoDec.java b/tika-core/src/main/java/org/apache/tika/mime/HexCoDec.java index 1ba53fe9bb..b4772ad3ca 100644 --- a/tika-core/src/main/java/org/apache/tika/mime/HexCoDec.java +++ b/tika-core/src/main/java/org/apache/tika/mime/HexCoDec.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
+ * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.mime; @@ -21,8 +19,8 @@ */ public class HexCoDec { - private static final char[] HEX_CHARS = - {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'}; + private static final char[] HEX_CHARS = {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', + 'b', 'c', 'd', 'e', 'f'}; /** * Decode a hex string @@ -47,9 +45,9 @@ public static byte[] decode(char[] hexChars) { /** * Decode an array of hex chars. * - * @param hexChars an array of hex characters. + * @param hexChars an array of hex characters. * @param startIndex the index of the first character to decode - * @param length the number of characters to decode. + * @param length the number of characters to decode. * @return the decode hex chars as bytes. */ public static byte[] decode(char[] hexChars, int startIndex, int length) { @@ -59,8 +57,8 @@ public static byte[] decode(char[] hexChars, int startIndex, int length) { byte[] result = new byte[length / 2]; for (int j = 0; j < result.length; j++) { - result[j] = (byte) (hexCharToNibble(hexChars[startIndex++]) * 16 + - hexCharToNibble(hexChars[startIndex++])); + result[j] = (byte) (hexCharToNibble(hexChars[startIndex++]) * 16 + + hexCharToNibble(hexChars[startIndex++])); } return result; } @@ -78,9 +76,9 @@ public static char[] encode(byte[] bites) { /** * Hex encode an array of bytes * - * @param bites the array of bytes to encode. + * @param bites the array of bytes to encode. * @param startIndex the index of the first character to encode. - * @param length the number of characters to encode. + * @param length the number of characters to encode. * @return the array of hex characters. 
*/ public static char[] encode(byte[] bites, int startIndex, int length) { diff --git a/tika-core/src/main/java/org/apache/tika/mime/Magic.java b/tika-core/src/main/java/org/apache/tika/mime/Magic.java index 9c3b3adf62..68729fd191 100644 --- a/tika-core/src/main/java/org/apache/tika/mime/Magic.java +++ b/tika-core/src/main/java/org/apache/tika/mime/Magic.java @@ -1,24 +1,21 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.mime; /** - * Defines a magic for a MimeType. A magic is made of one or several - * MagicClause. + * Defines a magic for a MimeType. A magic is made of one or several MagicClause. */ class Magic implements Clause, Comparable { diff --git a/tika-core/src/main/java/org/apache/tika/mime/MagicMatch.java b/tika-core/src/main/java/org/apache/tika/mime/MagicMatch.java index 4917ea5428..b8fcdac6a2 100644 --- a/tika-core/src/main/java/org/apache/tika/mime/MagicMatch.java +++ b/tika-core/src/main/java/org/apache/tika/mime/MagicMatch.java @@ -1,25 +1,21 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
+ * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.mime; import java.io.IOException; - import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream; - import org.apache.tika.detect.MagicDetector; import org.apache.tika.metadata.Metadata; @@ -57,8 +53,9 @@ private synchronized MagicDetector getDetector() { public boolean eval(byte[] data) { try { - return getDetector().detect(UnsynchronizedByteArrayInputStream.builder().setByteArray(data).get(), new Metadata()) != - MediaType.OCTET_STREAM; + return getDetector().detect( + UnsynchronizedByteArrayInputStream.builder().setByteArray(data).get(), + new Metadata()) != MediaType.OCTET_STREAM; } catch (IOException e) { // Should never happen with a ByteArrayInputStream return false; diff --git a/tika-core/src/main/java/org/apache/tika/mime/MediaType.java b/tika-core/src/main/java/org/apache/tika/mime/MediaType.java index 23a8ea58d0..6df12b3bd8 100644 --- a/tika-core/src/main/java/org/apache/tika/mime/MediaType.java +++ b/tika-core/src/main/java/org/apache/tika/mime/MediaType.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. 
See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.mime; @@ -42,25 +40,24 @@ public final class MediaType implements Comparable, Serializable { private static final Pattern SPECIAL = Pattern.compile("[\\(\\)<>@,;:\\\\\"/\\[\\]\\?=]"); private static final Pattern SPECIAL_OR_WHITESPACE = - Pattern.compile("[\\(\\)<>@,;:\\\\\"/\\[\\]\\?=\\s]"); + Pattern.compile("[\\(\\)<>@,;:\\\\\"/\\[\\]\\?=\\s]"); /** * See http://www.ietf.org/rfc/rfc2045.txt for valid mime-type characters. 
*/ private static final String VALID_CHARS = "([^\\c\\(\\)<>@,;:\\\\\"/\\[\\]\\?=\\s]+)"; - private static final Pattern TYPE_PATTERN = - Pattern.compile("(?s)\\s*" + VALID_CHARS + "\\s*/\\s*" + VALID_CHARS + "\\s*($|;.*)"); + private static final Pattern TYPE_PATTERN = Pattern + .compile("(?s)\\s*" + VALID_CHARS + "\\s*/\\s*" + VALID_CHARS + "\\s*($|;.*)"); // TIKA-350: handle charset as first element in content-type - private static final Pattern CHARSET_FIRST_PATTERN = Pattern.compile( - "(?is)\\s*(charset\\s*=\\s*[^\\c;\\s]+)\\s*;\\s*" + VALID_CHARS + "\\s*/\\s*" + - VALID_CHARS + "\\s*"); + private static final Pattern CHARSET_FIRST_PATTERN = + Pattern.compile("(?is)\\s*(charset\\s*=\\s*[^\\c;\\s]+)\\s*;\\s*" + VALID_CHARS + + "\\s*/\\s*" + VALID_CHARS + "\\s*"); /** - * Set of basic types with normalized "type/subtype" names. - * Used to optimize type lookup and to avoid having too many - * {@link MediaType} instances in memory. + * Set of basic types with normalized "type/subtype" names. Used to optimize type lookup and to + * avoid having too many {@link MediaType} instances in memory. */ private static final Map SIMPLE_TYPES = new HashMap<>(); @@ -80,14 +77,12 @@ public final class MediaType implements Comparable, Serializable { */ private final String string; /** - * Location of the "/" character separating the type and the subtype - * tokens in {@link #string}. + * Location of the "/" character separating the type and the subtype tokens in {@link #string}. */ private final int slash; /** - * Location of the first ";" character separating the type part of - * {@link #string} from possible parameters. Length of {@link #string} - * in case there are no parameters. + * Location of the first ";" character separating the type part of {@link #string} from possible + * parameters. Length of {@link #string} in case there are no parameters. 
*/ private final int semicolon; /** @@ -157,8 +152,8 @@ public MediaType(MediaType type, Map parameters) { /** * Creates a media type by adding a parameter to a base type. * - * @param type base type - * @param name parameter name + * @param type base type + * @param name parameter name * @param value parameter value * @since Apache Tika 1.2 */ @@ -169,7 +164,7 @@ public MediaType(MediaType type, String name, String value) { /** * Creates a media type by adding the "charset" parameter to a base type. * - * @param type base type + * @param type base type * @param charset charset value * @since Apache Tika 1.2 */ @@ -198,8 +193,7 @@ public static MediaType video(String type) { } /** - * Convenience method that returns an unmodifiable set that contains - * all the given media types. + * Convenience method that returns an unmodifiable set that contains all the given media types. * * @param types media types * @return unmodifiable set of the given types @@ -216,8 +210,8 @@ public static Set set(MediaType... types) { } /** - * Convenience method that parses the given media type strings and - * returns an unmodifiable set that contains all the parsed types. + * Convenience method that parses the given media type strings and returns an unmodifiable set + * that contains all the parsed types. * * @param types media type strings * @return unmodifiable set of the parsed types @@ -235,10 +229,9 @@ public static Set set(String... types) { } /** - * Parses the given string to a media type. The string is expected - * to be of the form "type/subtype(; parameter=...)*" as defined in - * RFC 2045, though we also handle "charset=xxx; type/subtype" for - * broken web servers. + * Parses the given string to a media type. The string is expected to be of the form + * "type/subtype(; parameter=...)*" as defined in RFC 2045, though we also handle "charset=xxx; + * type/subtype" for broken web servers. 
* * @param string media type string to be parsed * @return parsed media type, or null if parsing fails @@ -255,9 +248,8 @@ public static MediaType parse(String string) { int slash = string.indexOf('/'); if (slash == -1) { return null; - } else if (SIMPLE_TYPES.size() < 10000 && - isSimpleName(string.substring(0, slash)) && - isSimpleName(string.substring(slash + 1))) { + } else if (SIMPLE_TYPES.size() < 10000 && isSimpleName(string.substring(0, slash)) + && isSimpleName(string.substring(slash + 1))) { type = new MediaType(string, slash); SIMPLE_TYPES.put(string, type); } @@ -271,12 +263,12 @@ public static MediaType parse(String string) { matcher = TYPE_PATTERN.matcher(string); if (matcher.matches()) { return new MediaType(matcher.group(1), matcher.group(2), - parseParameters(matcher.group(3))); + parseParameters(matcher.group(3))); } matcher = CHARSET_FIRST_PATTERN.matcher(string); if (matcher.matches()) { return new MediaType(matcher.group(2), matcher.group(3), - parseParameters(matcher.group(1))); + parseParameters(matcher.group(1))); } return null; @@ -285,8 +277,8 @@ public static MediaType parse(String string) { private static boolean isSimpleName(String name) { for (int i = 0; i < name.length(); i++) { char c = name.charAt(i); - if (c != '-' && c != '+' && c != '.' && c != '_' && !('0' <= c && c <= '9') && - !('a' <= c && c <= 'z')) { + if (c != '-' && c != '+' && c != '.' 
&& c != '_' && !('0' <= c && c <= '9') + && !('a' <= c && c <= 'z')) { return false; } } @@ -300,7 +292,7 @@ private static Map parseParameters(String string) { // Extracts k1=v1, k2=v2 from mime/type; k1=v1; k2=v2 // Note - this logic isn't fully RFC2045 compliant yet, as it - // doesn't fully handle quoted keys or values (eg containing ; or =) + // doesn't fully handle quoted keys or values (eg containing ; or =) Map parameters = new HashMap<>(); while (string.length() > 0) { String key = string; @@ -329,8 +321,7 @@ private static Map parseParameters(String string) { } /** - * Fuzzy unquoting mechanism that works also with somewhat malformed - * quotes. + * Fuzzy unquoting mechanism that works also with somewhat malformed quotes. * * @param s string to unquote * @return unquoted string @@ -359,8 +350,7 @@ private static Map union(Map a, Maptrue if this type has one or more parameters, - * false otherwise + * @return true if this type has one or more parameters, false + * otherwise * @since Apache Tika 0.8 */ public boolean hasParameters() { @@ -399,8 +387,8 @@ public boolean hasParameters() { } /** - * Returns an immutable sorted map of the parameters of this media type. - * The parameter names are guaranteed to be trimmed and in lower case. + * Returns an immutable sorted map of the parameters of this media type. The parameter names are + * guaranteed to be trimmed and in lower case. * * @return sorted map of parameters */ diff --git a/tika-core/src/main/java/org/apache/tika/mime/MediaTypeRegistry.java b/tika-core/src/main/java/org/apache/tika/mime/MediaTypeRegistry.java index ac5b3add87..1d474872e1 100644 --- a/tika-core/src/main/java/org/apache/tika/mime/MediaTypeRegistry.java +++ b/tika-core/src/main/java/org/apache/tika/mime/MediaTypeRegistry.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.mime; @@ -33,14 +31,14 @@ public class MediaTypeRegistry implements Serializable { */ private static final long serialVersionUID = 4710974869988895410L; /** - * Registry of known media types, including type aliases. A canonical - * media type is handled as an identity mapping, while an alias is stored - * as a mapping from the alias to the corresponding canonical type. + * Registry of known media types, including type aliases. 
A canonical media type is handled as + * an identity mapping, while an alias is stored as a mapping from the alias to the + * corresponding canonical type. */ private final Map registry = new ConcurrentHashMap<>(); /** - * Known type inheritance relationships. The mapping is from a media type - * to the closest supertype. + * Known type inheritance relationships. The mapping is from a media type to the closest + * supertype. */ private final Map inheritance = new HashMap<>(); @@ -55,8 +53,8 @@ public static MediaTypeRegistry getDefaultRegistry() { } /** - * Returns the set of all known canonical media types. Type aliases are - * not included in the returned set. + * Returns the set of all known canonical media types. Type aliases are not included in the + * returned set. * * @return canonical media types * @since Apache Tika 0.8 @@ -126,13 +124,12 @@ public MediaType normalize(MediaType type) { } /** - * Checks whether the given media type a is a specialization of a more - * generic type b. Both types should be already normalised. + * Checks whether the given media type a is a specialization of a more generic type b. Both + * types should be already normalised. * * @param a media type, normalised * @param b suspected supertype, normalised - * @return true if b is a supertype of a, - * false otherwise + * @return true if b is a supertype of a, false otherwise * @since Apache Tika 0.8 */ public boolean isSpecializationOf(MediaType a, MediaType b) { @@ -140,13 +137,13 @@ public boolean isSpecializationOf(MediaType a, MediaType b) { } /** - * Checks whether the given media type equals the given base type or - * is a specialization of it. Both types should be already normalised. + * Checks whether the given media type equals the given base type or is a specialization of it. + * Both types should be already normalised. 
* * @param a media type, normalised * @param b base type, normalised - * @return true if b equals a or is a specialization of it, - * false otherwise + * @return true if b equals a or is a specialization of it, false + * otherwise * @since Apache Tika 1.2 */ public boolean isInstanceOf(MediaType a, MediaType b) { @@ -154,14 +151,14 @@ public boolean isInstanceOf(MediaType a, MediaType b) { } /** - * Parses and normalises the given media type string and checks whether - * the result equals the given base type or is a specialization of it. - * The given base type should already be normalised. + * Parses and normalises the given media type string and checks whether the result equals the + * given base type or is a specialization of it. The given base type should already be + * normalised. * * @param a media type * @param b base type, normalised - * @return true if b equals a or is a specialization of it, - * false otherwise + * @return true if b equals a or is a specialization of it, false + * otherwise * @since Apache Tika 1.2 */ public boolean isInstanceOf(String a, MediaType b) { @@ -169,14 +166,12 @@ public boolean isInstanceOf(String a, MediaType b) { } /** - * Returns the supertype of the given type. If the media type database - * has an explicit inheritance rule for the type, then that is used. - * Next, if the given type has any parameters, then the respective base - * type (parameter-less) is returned. Otherwise built-in heuristics like - * text/... -> text/plain and .../...+xml -> application/xml are used. - * Finally application/octet-stream is returned for all types for which no other - * supertype is known, and the return value for application/octet-stream - * is null. + * Returns the supertype of the given type. If the media type database has an explicit + * inheritance rule for the type, then that is used. Next, if the given type has any parameters, + * then the respective base type (parameter-less) is returned. 
Otherwise built-in heuristics + * like text/... -> text/plain and .../...+xml -> application/xml are used. Finally + * application/octet-stream is returned for all types for which no other supertype is known, and + * the return value for application/octet-stream is null. * * @param type media type * @return supertype, or null for application/octet-stream diff --git a/tika-core/src/main/java/org/apache/tika/mime/MimeType.java b/tika-core/src/main/java/org/apache/tika/mime/MimeType.java index 8dc3ddba54..c33562e1f5 100644 --- a/tika-core/src/main/java/org/apache/tika/mime/MimeType.java +++ b/tika-core/src/main/java/org/apache/tika/mime/MimeType.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
+ * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.mime; @@ -69,22 +67,19 @@ public final class MimeType implements Comparable, Serializable { */ private List rootXML = null; /** - * All known file extensions of this type, in order of preference - * (best first). + * All known file extensions of this type, in order of preference (best first). */ private List extensions = null; /** - * Whether this mime-type is used for server-side scripts, - * and thus cannot reliably be used for filename-based type detection + * Whether this mime-type is used for server-side scripts, and thus cannot reliably be used for + * filename-based type detection */ private boolean isInterpreted = false; /** - * Creates a media type with the give name and containing media type - * registry. The name is expected to be valid and normalized to lower - * case. This constructor should only be called by - * {@link MimeTypes#forName(String)} to keep the media type registry - * up to date. + * Creates a media type with the given name and containing media type registry. The name is + * expected to be valid and normalized to lower case. This constructor should only be called by + * {@link MimeTypes#forName(String)} to keep the media type registry up to date. * * @param type normalized media type name */ @@ -96,9 +91,9 @@ public final class MimeType implements Comparable, Serializable { } /** - * Checks that the given string is a valid Internet media type name - * based on rules from RFC 2054 section 5.3. For validation purposes the - * rules can be simplified to the following: + * Checks that the given string is a valid Internet media type name based on rules from RFC 2045 + * section 5.1. 
For validation purposes the rules can be simplified to the following: + * *

      * name := token "/" token
      * token := 1*<any (US-ASCII) CHAR except SPACE, CTLs, or tspecials>
@@ -107,8 +102,8 @@ public final class MimeType implements Comparable, Serializable {
      * 
* * @param name name string - * @return true if the string is a valid media type name, - * false otherwise + * @return true if the string is a valid media type name, false + * otherwise */ public static boolean isValid(String name) { if (name == null) { @@ -118,9 +113,9 @@ public static boolean isValid(String name) { boolean slash = false; for (int i = 0; i < name.length(); i++) { char ch = name.charAt(i); - if (ch <= ' ' || ch >= 127 || ch == '(' || ch == ')' || ch == '<' || ch == '>' || - ch == '@' || ch == ',' || ch == ';' || ch == ':' || ch == '\\' || ch == '"' || - ch == '[' || ch == ']' || ch == '?' || ch == '=') { + if (ch <= ' ' || ch >= 127 || ch == '(' || ch == ')' || ch == '<' || ch == '>' + || ch == '@' || ch == ',' || ch == ';' || ch == ':' || ch == '\\' + || ch == '"' || ch == '[' || ch == ']' || ch == '?' || ch == '=') { return false; } else if (ch == '/') { if (slash || i == 0 || i + 1 == name.length()) { @@ -197,7 +192,8 @@ void setAcronym(String v) { * Get the UTI for this mime type. * * @return The Uniform Type Identifier - * @see http://en.wikipedia.org/wiki/Uniform_Type_Identifier + * @see http://en.wikipedia.org/wiki/Uniform_Type_Identifier */ public String getUniformTypeIdentifier() { return uti; @@ -323,7 +319,7 @@ public int compareTo(MimeType mime) { return type.compareTo(mime.type); } - //----------------------------------------------------------< Comparable > + // ----------------------------------------------------------< Comparable > public boolean equals(Object o) { if (o instanceof MimeType) { @@ -334,7 +330,7 @@ public boolean equals(Object o) { return false; } - //--------------------------------------------------------------< Object > + // --------------------------------------------------------------< Object > public int hashCode() { return type.hashCode(); @@ -350,9 +346,9 @@ public String toString() { } /** - * Returns the preferred file extension of this type, or an empty string - * if no extensions are known. 
Use the {@link #getExtensions()} method to - * get the full list of known extensions of this type. + * Returns the preferred file extension of this type, or an empty string if no extensions are + * known. Use the {@link #getExtensions()} method to get the full list of known extensions of + * this type. * * @return preferred file extension or empty string * @since Apache Tika 0.9 @@ -396,8 +392,7 @@ void addExtension(String extension) { } /** - * Defines a RootXML description. RootXML is made of a localName and/or a - * namespaceURI. + * Defines a RootXML description. RootXML is made of a localName and/or a namespaceURI. */ static class RootXML implements Serializable { @@ -415,7 +410,7 @@ static class RootXML implements Serializable { RootXML(MimeType type, String namespaceURI, String localName) { if (isEmpty(namespaceURI) && isEmpty(localName)) { throw new IllegalArgumentException( - "Both namespaceURI and localName cannot be empty"); + "Both namespaceURI and localName cannot be empty"); } this.type = type; this.namespaceURI = namespaceURI; @@ -423,7 +418,7 @@ static class RootXML implements Serializable { } boolean matches(String namespaceURI, String localName) { - //Compare namespaces + // Compare namespaces if (!isEmpty(this.namespaceURI)) { if (!this.namespaceURI.equals(namespaceURI)) { return false; @@ -436,7 +431,7 @@ boolean matches(String namespaceURI, String localName) { } } - //Compare root element's local name + // Compare root element's local name if (!isEmpty(this.localName)) { return this.localName.equals(localName); } else { diff --git a/tika-core/src/main/java/org/apache/tika/mime/MimeTypeException.java b/tika-core/src/main/java/org/apache/tika/mime/MimeTypeException.java index 31bc8a1400..051c7a3800 100644 --- a/tika-core/src/main/java/org/apache/tika/mime/MimeTypeException.java +++ b/tika-core/src/main/java/org/apache/tika/mime/MimeTypeException.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * 
contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.mime; @@ -33,11 +31,10 @@ public MimeTypeException(String message) { } /** - * Constructs a MimeTypeException with the specified detail message - * and root cause. + * Constructs a MimeTypeException with the specified detail message and root cause. * * @param message the detail message. 
- * @param cause root cause + * @param cause root cause */ public MimeTypeException(String message, Throwable cause) { super(message, cause); diff --git a/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java b/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java index e146e8e0f6..b7c943b6d5 100644 --- a/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java +++ b/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. 
See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.mime; @@ -31,9 +29,7 @@ import java.util.Locale; import java.util.Map; import javax.xml.namespace.QName; - import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream; - import org.apache.tika.Tika; import org.apache.tika.detect.Detector; import org.apache.tika.detect.TextDetector; @@ -42,18 +38,16 @@ import org.apache.tika.metadata.TikaCoreProperties; /** - * This class is a MimeType repository. It gathers a set of MimeTypes and - * enables to retrieves a content-type from its name, from a file name, or from - * a magic character sequence. + * This class is a MimeType repository. It gathers a set of MimeTypes and enables retrieval of a + * content-type from its name, from a file name, or from a magic character sequence. *

- * The MIME type detection methods that take an {@link InputStream} as - * an argument will never reads more than {@link #getMinLength()} bytes - * from the stream. Also the given stream is never - * {@link InputStream#close() closed}, {@link InputStream#mark(int) marked}, - * or {@link InputStream#reset() reset} by the methods. Thus a client can - * use the {@link InputStream#markSupported() mark feature} of the stream - * (if available) to restore the stream back to the state it was before type - * detection if it wants to process the stream based on the detected type. + * The MIME type detection methods that take an {@link InputStream} as an argument will never read + * more than {@link #getMinLength()} bytes from the stream. Also the given stream is never + * {@link InputStream#close() closed}, {@link InputStream#mark(int) marked}, or + * {@link InputStream#reset() reset} by the methods. Thus a client can use the + * {@link InputStream#markSupported() mark feature} of the stream (if available) to restore the + * stream back to the state it was before type detection if it wants to process the stream based on + * the detected type. */ public final class MimeTypes implements Detector, Serializable { @@ -74,7 +68,7 @@ public final class MimeTypes implements Detector, Serializable { */ private static final long serialVersionUID = -1350863170146349036L; private static final Map CLASSLOADER_SPECIFIC_DEFAULT_TYPES = - new HashMap<>(); + new HashMap<>(); private static MimeTypes DEFAULT_TYPES = null; /** * Root type, application/octet-stream. */ @@ -128,8 +122,8 @@ public MimeTypes() { } /** - * Get the default MimeTypes. This includes all the build in - * media types, and any custom override ones present. + * Get the default MimeTypes. This includes all the built-in media types, and any custom + * override ones present. 
* * @return MimeTypes default type registry */ @@ -138,8 +132,8 @@ public static synchronized MimeTypes getDefaultMimeTypes() { } /** - * Get the default MimeTypes. This includes all the built-in - * media types, and any custom override ones present. + * Get the default MimeTypes. This includes all the built-in media types, and any custom + * override ones present. * * @param classLoader to use, if not the default * @return MimeTypes default type registry @@ -152,8 +146,8 @@ public static synchronized MimeTypes getDefaultMimeTypes(ClassLoader classLoader if (types == null) { try { - types = MimeTypesFactory - .create("tika-mimetypes.xml", "custom-mimetypes.xml", classLoader); + types = MimeTypesFactory.create("tika-mimetypes.xml", "custom-mimetypes.xml", + classLoader); } catch (MimeTypeException e) { throw new RuntimeException("Unable to parse the default media type registry", e); } catch (IOException e) { @@ -170,8 +164,8 @@ public static synchronized MimeTypes getDefaultMimeTypes(ClassLoader classLoader } /** - * Find the Mime Content Type of a document from its name. - * Returns application/octet-stream if no better match is found. + * Find the Mime Content Type of a document from its name. Returns application/octet-stream if + * no better match is found. * * @param name of the document to analyze. * @return the Mime Content Type of the specified document name @@ -192,13 +186,13 @@ public MimeType getMimeType(String name) { } /** - * Find the Mime Content Type of a document stored in the given file. - * Returns application/octet-stream if no better match is found. + * Find the Mime Content Type of a document stored in the given file. Returns + * application/octet-stream if no better match is found. 
* * @param file file to analyze * @return the Mime Content Type of the specified document * @throws MimeTypeException if the type can't be detected - * @throws IOException if the file can't be read + * @throws IOException if the file can't be read * @deprecated Use {@link Tika#detect(File)} instead */ @Deprecated @@ -207,16 +201,14 @@ public MimeType getMimeType(File file) throws MimeTypeException, IOException { } /** - * Returns the MIME type that best matches the given first few bytes - * of a document stream. Returns application/octet-stream if no better - * match is found. + * Returns the MIME type that best matches the given first few bytes of a document stream. + * Returns application/octet-stream if no better match is found. *

- * If multiple matches are found, the best (highest priority) matching - * type is returned. If multiple matches are found with the same priority, - * then all of these are returned. + * If multiple matches are found, the best (highest priority) matching type is returned. If + * multiple matches are found with the same priority, then all of these are returned. *

- * The given byte array is expected to be at least {@link #getMinLength()} - * long, or shorter only if the document stream itself is shorter. + * The given byte array is expected to be at least {@link #getMinLength()} long, or shorter only + * if the document stream itself is shorter. * * @param data first few bytes of a document stream * @return matching MIME type @@ -248,15 +240,15 @@ List getMimeType(byte[] data) { // When detecting generic XML (or possibly XHTML), // extract the root element and match it against known types - if ("application/xml".equals(matched.getName()) || - "text/html".equals(matched.getName())) { + if ("application/xml".equals(matched.getName()) + || "text/html".equals(matched.getName())) { XmlRootExtractor extractor = new XmlRootExtractor(); QName rootElement = extractor.extractRootElement(data); if (rootElement != null) { for (MimeType type : xmls) { if (type.matchesXML(rootElement.getNamespaceURI(), - rootElement.getLocalPart())) { + rootElement.getLocalPart())) { result.set(i, type); break; } @@ -264,7 +256,7 @@ List getMimeType(byte[] data) { } else if ("application/xml".equals(matched.getName())) { // Our XML magic is higher than our HTML magic // So, if we got here, we might have a HTML file that's - // invalid XML. So, try our HTML magics explicitly (TIKA-2419) + // invalid XML. So, try our HTML magics explicitly (TIKA-2419) boolean isHTML = false; for (Magic magic : magics) { if (!magic.getType().equals(htmlMimeType)) { @@ -277,7 +269,7 @@ List getMimeType(byte[] data) { } // Otherwise, downgrade from application/xml to text/plain - // since the document seems not to be well-formed. + // since the document seems not to be well-formed. 
if (isHTML) { result.set(i, htmlMimeType); } else { @@ -293,7 +285,7 @@ List getMimeType(byte[] data) { try { TextDetector detector = new TextDetector(getMinLength()); UnsynchronizedByteArrayInputStream stream = - UnsynchronizedByteArrayInputStream.builder().setByteArray(data).get(); + UnsynchronizedByteArrayInputStream.builder().setByteArray(data).get(); MimeType type = forName(detector.detect(stream, new Metadata()).toString()); return Collections.singletonList(type); } catch (Exception e) { @@ -302,13 +294,11 @@ List getMimeType(byte[] data) { } /** - * Reads the first {@link #getMinLength()} bytes from the given stream. - * If the stream is shorter, then the entire content of the stream is - * returned. + * Reads the first {@link #getMinLength()} bytes from the given stream. If the stream is + * shorter, then the entire content of the stream is returned. *

- * The given stream is never {@link InputStream#close() closed}, - * {@link InputStream#mark(int) marked}, or - * {@link InputStream#reset() reset} by this method. + * The given stream is never {@link InputStream#close() closed}, {@link InputStream#mark(int) + * marked}, or {@link InputStream#reset() reset} by this method. * * @param stream stream to be read * @return first {@link #getMinLength()} (or fewer) bytes of the stream @@ -337,9 +327,8 @@ byte[] readMagicHeader(InputStream stream) throws IOException { } /** - * Returns the registered media type with the given name (or alias). - * The named media type is automatically registered (and returned) if - * it doesn't already exist. + * Returns the registered media type with the given name (or alias). The named media type is + * automatically registered (and returned) if it doesn't already exist. * * @param name media type name (case-insensitive) * @return the registered media type with the given name or alias @@ -356,7 +345,7 @@ public MimeType forName(String name) throws MimeTypeException { if (mime == null) { synchronized (this) { // Double check it didn't already get added while - // we were waiting for the lock + // we were waiting for the lock mime = types.get(normalisedType); if (mime == null) { mime = new MimeType(type); @@ -371,18 +360,19 @@ public MimeType forName(String name) throws MimeTypeException { /** * Returns the registered, normalised media type with the given name (or alias). * - *

Unlike {@link #forName(String)}, this function will not create a - * new MimeType and register it. Instead, null will be returned if - * there is no definition available for the given name. + *

+ * Unlike {@link #forName(String)}, this function will not create a new MimeType and + * register it. Instead, null will be returned if there is no definition available + * for the given name. * - *

Also, unlike {@link #forName(String)}, this function may return a - * mime type that has fewer parameters than were included in the supplied name. - * If the registered mime type has parameters (e.g. - * application/dita+xml;format=map), then those will be maintained. - * However, if the supplied name has paramenters that the registered mime - * type does not (e.g. application/xml; charset=UTF-8 as a name, - * compared to just application/xml for the type in the registry), - * then those parameters will not be included in the returned type. + *

+ * Also, unlike {@link #forName(String)}, this function may return a mime type that has fewer + * parameters than were included in the supplied name. If the registered mime type has + * parameters (e.g. application/dita+xml;format=map), then those will be + * maintained. However, if the supplied name has parameters that the registered mime + * type does not (e.g. application/xml; charset=UTF-8 as a name, compared to just + * application/xml for the type in the registry), then those parameters will not be + * included in the returned type. * * @param name media type name (case-insensitive) * @return the registered media type with the given name or alias, or null if not found @@ -410,10 +400,10 @@ public synchronized void setSuperType(MimeType type, MediaType parent) { } /** - * Adds an alias for the given media type. This method should only - * be called from {@link MimeType#addAlias(String)}. + * Adds an alias for the given media type. This method should only be called from + * {@link MimeType#addAlias(String)}. * - * @param type media type + * @param type media type * @param alias media type alias (normalized to lower case) */ synchronized void addAlias(MimeType type, MediaType alias) { @@ -421,10 +411,10 @@ synchronized void addAlias(MimeType type, MediaType alias) { } /** - * Adds a file name pattern for the given media type. Assumes that the - * pattern being added is not a JDK standard regular expression. + * Adds a file name pattern for the given media type. Assumes that the pattern being added is + * not a JDK standard regular expression. * - * @param type media type + * @param type media type * @param pattern file name pattern * @throws MimeTypeException if the pattern conflicts with existing ones */ @@ -433,20 +423,18 @@ public void addPattern(MimeType type, String pattern) throws MimeTypeException { } /** - * Adds a file name pattern for the given media type. 
The caller can specify - * whether the pattern being added is or is not a JDK standard - * regular expression via the isRegex parameter. If the value - * is set to true, then a JDK standard regex is assumed, otherwise the - * freedesktop glob type is assumed. + * Adds a file name pattern for the given media type. The caller can specify whether the pattern + * being added is or is not a JDK standard regular expression via the + * isRegex parameter. If the value is set to true, then a JDK standard regex is + * assumed, otherwise the freedesktop glob type is assumed. * - * @param type media type + * @param type media type * @param pattern file name pattern - * @param isRegex set to true if JDK std regexs are desired, otherwise set to - * false. + * @param isRegex set to true if JDK std regexs are desired, otherwise set to false. * @throws MimeTypeException if the pattern conflicts with existing ones. */ public void addPattern(MimeType type, String pattern, boolean isRegex) - throws MimeTypeException { + throws MimeTypeException { patterns.add(pattern, isRegex, type); } @@ -455,8 +443,8 @@ public MediaTypeRegistry getMediaTypeRegistry() { } /** - * Return the minimum length of data to provide to analyzing methods based - * on the document's content in order to check all the known MimeTypes. + * Return the minimum length of data to provide to analyzing methods based on the document's + * content in order to check all the known MimeTypes. * * @return the minimum length of data to provide. * @see #getMimeType(byte[]) @@ -488,8 +476,7 @@ void add(MimeType type) { } /** - * Called after all configured types have been loaded. - * Initializes the magics and xmls sets. + * Called after all configured types have been loaded. Initializes the magics and xmls sets. 
*/ void init() { for (MimeType type : types.values()) { @@ -503,14 +490,13 @@ void init() { } /** - * Automatically detects the MIME type of a document based on magic - * markers in the stream prefix and any given metadata hints. + * Automatically detects the MIME type of a document based on magic markers in the stream prefix + * and any given metadata hints. *

- * The given stream is expected to support marks, so that this method - * can reset the stream to the position it was in before this method - * was called. + * The given stream is expected to support marks, so that this method can reset the stream to + * the position it was in before this method was called. * - * @param input document stream, or null + * @param input document stream, or null * @param metadata metadata hints * @return MIME type of the document * @throws IOException if the document stream could not be read @@ -536,7 +522,7 @@ public MediaType detect(InputStream input, Metadata metadata) throws IOException String name = null; boolean isHttp = false; - // Deal with a URI or a path name in as the resource name + // Deal with a URI or a path name in as the resource name try { URI uri = new URI(resourceName); String scheme = uri.getScheme(); @@ -559,7 +545,7 @@ public MediaType detect(InputStream input, Metadata metadata) throws IOException // the mime type if (!(isHttp && hint.isInterpreted())) { // If we have some types based on mime magic, try to specialise - // and/or select the type based on that + // and/or select the type based on that // Otherwise, use the type identified from the name possibleTypes = applyHint(possibleTypes, hint); } @@ -586,18 +572,17 @@ public MediaType detect(InputStream input, Metadata metadata) throws IOException } /** - * Use the MimeType hint to try to clarify or specialise the current - * possible types list. - * If the hint is a specialised form, use that instead - * If there are multiple possible types, use the hint to select one + * Use the MimeType hint to try to clarify or specialise the current possible types list. 
If the + * hint is a specialised form, use that instead If there are multiple possible types, use the + * hint to select one */ private List applyHint(List possibleTypes, MimeType hint) { if (possibleTypes == null || possibleTypes.isEmpty()) { return Collections.singletonList(hint); } else { for (final MimeType type : possibleTypes) { - if (hint.equals(type) || - registry.isSpecializationOf(hint.getType(), type.getType())) { + if (hint.equals(type) + || registry.isSpecializationOf(hint.getType(), type.getType())) { // Use just this type return Collections.singletonList(hint); } diff --git a/tika-core/src/main/java/org/apache/tika/mime/MimeTypesFactory.java b/tika-core/src/main/java/org/apache/tika/mime/MimeTypesFactory.java index afec1f1993..2546ac9cb4 100644 --- a/tika-core/src/main/java/org/apache/tika/mime/MimeTypesFactory.java +++ b/tika-core/src/main/java/org/apache/tika/mime/MimeTypesFactory.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.mime; @@ -23,7 +21,6 @@ import java.util.ArrayList; import java.util.Collections; import java.util.List; - import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.w3c.dom.Document; @@ -37,8 +34,8 @@ public class MimeTypesFactory { /** - * System property to set a path to an additional external custom mimetypes - * XML file to be loaded. + * System property to set a path to an additional external custom mimetypes XML file to be + * loaded. */ public static final String CUSTOM_MIMES_SYS_PROP = "tika.custom-mimetypes"; @@ -64,14 +61,14 @@ public static MimeTypes create(Document document) throws MimeTypeException { } /** - * Creates and returns a MimeTypes instance from the specified input stream. - * Does not close the input stream(s). + * Creates and returns a MimeTypes instance from the specified input stream. Does not close the + * input stream(s). * - * @throws IOException if the stream can not be read + * @throws IOException if the stream can not be read * @throws MimeTypeException if the type configuration is invalid */ public static MimeTypes create(InputStream... 
inputStreams) - throws IOException, MimeTypeException { + throws IOException, MimeTypeException { MimeTypes mimeTypes = new MimeTypes(); MimeTypesReader reader = new MimeTypesReader(mimeTypes); for (InputStream inputStream : inputStreams) { @@ -85,16 +82,15 @@ public static MimeTypes create(InputStream... inputStreams) * @see #create(InputStream...) */ public static MimeTypes create(InputStream stream) throws IOException, MimeTypeException { - return create(new InputStream[]{stream}); + return create(new InputStream[] {stream}); } /** - * Creates and returns a MimeTypes instance from the resource - * at the location specified by the URL. Opens and closes the - * InputStream from the URL. - * If multiple URLs are supplied, then they are loaded in turn. + * Creates and returns a MimeTypes instance from the resource at the location specified by the + * URL. Opens and closes the InputStream from the URL. If multiple URLs are supplied, then they + * are loaded in turn. * - * @throws IOException if the URL can not be accessed + * @throws IOException if the URL can not be accessed * @throws MimeTypeException if the type configuration is invalid */ public static MimeTypes create(URL... urls) throws IOException, MimeTypeException { @@ -116,14 +112,14 @@ public static MimeTypes create(URL... urls) throws IOException, MimeTypeExceptio * @see #create(URL...) */ public static MimeTypes create(URL url) throws IOException, MimeTypeException { - return create(new URL[]{url}); + return create(new URL[] {url}); } /** - * Creates and returns a MimeTypes instance from the specified file path, - * as interpreted by the class loader in getResource(). + * Creates and returns a MimeTypes instance from the specified file path, as interpreted by the + * class loader in getResource(). 
* - * @throws IOException if the file can not be accessed + * @throws IOException if the file can not be accessed * @throws MimeTypeException if the type configuration is invalid */ public static MimeTypes create(String filePath) throws IOException, MimeTypeException { @@ -131,60 +127,52 @@ public static MimeTypes create(String filePath) throws IOException, MimeTypeExce } /** - * Creates and returns a MimeTypes instance. The core mimetypes - * will be loaded from the specified file path, and any custom - * override mimetypes found will loaded afterwards. - * The file paths will be interpreted by the default class loader in - * getResource(). + * Creates and returns a MimeTypes instance. The core mimetypes will be loaded from the + * specified file path, and any custom override mimetypes found will loaded afterwards. The file + * paths will be interpreted by the default class loader in getResource(). * - * @param coreFilePath The main MimeTypes file to load + * @param coreFilePath The main MimeTypes file to load * @param extensionFilePath The name of extension MimeType files to load afterwards - * @throws IOException if the file can not be accessed + * @throws IOException if the file can not be accessed * @throws MimeTypeException if the type configuration is invalid */ public static MimeTypes create(String coreFilePath, String extensionFilePath) - throws IOException, MimeTypeException { + throws IOException, MimeTypeException { return create(coreFilePath, extensionFilePath, null); } /** - * Creates and returns a MimeTypes instance. The core mimetypes - * will be loaded from the specified file path, and any custom - * override mimetypes found will loaded afterwards. - * The file paths will be interpreted by the specified class - * loader in getResource(). - * It will also load custom mimetypes from the system property - * {@link #CUSTOM_MIMES_SYS_PROP}, if specified. + * Creates and returns a MimeTypes instance. 
The core mimetypes will be loaded from the + * specified file path, and any custom override mimetypes found will loaded afterwards. The file + * paths will be interpreted by the specified class loader in getResource(). It will also load + * custom mimetypes from the system property {@link #CUSTOM_MIMES_SYS_PROP}, if specified. * - * @param coreFilePath The main MimeTypes file to load + * @param coreFilePath The main MimeTypes file to load * @param extensionFilePath The name of extension MimeType files to load afterwards - * @throws IOException if the file can not be accessed + * @throws IOException if the file can not be accessed * @throws MimeTypeException if the type configuration is invalid */ public static MimeTypes create(String coreFilePath, String extensionFilePath, - ClassLoader classLoader) throws IOException, MimeTypeException { + ClassLoader classLoader) throws IOException, MimeTypeException { // If no specific classloader was requested, use our own class's one if (classLoader == null) { classLoader = MimeTypesReader.class.getClassLoader(); } // This allows us to replicate class.getResource() when using - // the classloader directly + // the classloader directly String classPrefix = MimeTypesReader.class.getPackage().getName().replace('.', '/') + "/"; // Get the core URL, and all the extensions URLs URL coreURL = classLoader.getResource(classPrefix + coreFilePath); - List extensionURLs = - Collections.list(classLoader.getResources(extensionFilePath)); + List extensionURLs = Collections.list(classLoader.getResources(extensionFilePath)); // Swap that into an Array, and process List urls = new ArrayList<>(); urls.add(coreURL); urls.addAll(extensionURLs); if (LOG.isDebugEnabled()) { - urls.stream().forEach( u -> - LOG.debug("Loaded custom mimes file: {}", u) - ); + urls.stream().forEach(u -> LOG.debug("Loaded custom mimes file: {}", u)); } String customMimesPath = System.getProperty(CUSTOM_MIMES_SYS_PROP); @@ -192,12 +180,13 @@ public static MimeTypes 
create(String coreFilePath, String extensionFilePath, File externalFile = new File(customMimesPath); if (!externalFile.exists()) { throw new IOException( - "Specified custom mimetypes file not found: " + customMimesPath); + "Specified custom mimetypes file not found: " + customMimesPath); } URL externalURL = externalFile.toURI().toURL(); urls.add(externalURL); if (LOG.isDebugEnabled()) { - LOG.debug("Loaded external custom mimetypes file: {}", externalFile.getAbsolutePath()); + LOG.debug("Loaded external custom mimetypes file: {}", + externalFile.getAbsolutePath()); } } diff --git a/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java b/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java index 76bc5c7525..e04f710fe8 100644 --- a/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java +++ b/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.mime; @@ -34,8 +32,9 @@ import javax.xml.transform.TransformerException; import javax.xml.transform.dom.DOMSource; import javax.xml.transform.sax.SAXResult; - import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream; +import org.apache.tika.exception.TikaException; +import org.apache.tika.utils.XMLReaderUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.w3c.dom.Document; @@ -44,9 +43,6 @@ import org.xml.sax.SAXException; import org.xml.sax.helpers.DefaultHandler; -import org.apache.tika.exception.TikaException; -import org.apache.tika.utils.XMLReaderUtils; - /** * A reader for XML files compliant with the freedesktop MIME-info DTD. * @@ -104,11 +100,10 @@ * ]> *

*

- * In addition to the standard fields, this will also read two Tika specific fields: - * - link - * - uti + * In addition to the standard fields, this will also read two Tika specific fields: - link - uti * - * @see https://freedesktop.org/wiki/Specifications/shared-mime-info-spec/ + * @see https://freedesktop.org/wiki/Specifications/shared-mime-info-spec/ */ public class MimeTypesReader extends DefaultHandler implements MimeTypesReaderMetKeys { private static final ReentrantReadWriteLock READ_WRITE_LOCK = new ReentrantReadWriteLock(); @@ -143,9 +138,8 @@ protected MimeTypesReader(MimeTypes types) { } /** - * Acquire a SAXParser from the pool; create one if it - * doesn't exist. Make sure to {@link #releaseParser(SAXParser)} in - * a finally block every time you call this. + * Acquire a SAXParser from the pool; create one if it doesn't exist. Make sure to + * {@link #releaseParser(SAXParser)} in a finally block every time you call this. * * @return a SAXParser * @throws TikaException @@ -177,11 +171,11 @@ private static void releaseParser(SAXParser parser) { try { parser.reset(); } catch (UnsupportedOperationException e) { - //ignore + // ignore } try { READ_WRITE_LOCK.readLock().lock(); - //if there are extra parsers (e.g. after a reset of the pool to a smaller size), + // if there are extra parsers (e.g. 
after a reset of the pool to a smaller size), // this parser will not be added and will then be gc'd SAX_PARSERS.offer(parser); } finally { @@ -196,9 +190,9 @@ private static void releaseParser(SAXParser parser) { */ public static void setPoolSize(int poolSize) throws TikaException { try { - //stop the world with a write lock - //parsers that are currently in use will be offered, but not - //accepted and will be gc'd + // stop the world with a write lock + // parsers that are currently in use will be offered, but not + // accepted and will be gc'd READ_WRITE_LOCK.writeLock().lock(); SAX_PARSERS = new ArrayBlockingQueue<>(poolSize); for (int i = 0; i < poolSize; i++) { @@ -216,8 +210,8 @@ private static SAXParser newSAXParser() throws TikaException { try { factory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true); } catch (ParserConfigurationException | SAXException e) { - LOG.warn("can't set secure processing feature on: " + factory.getClass() + - ". User assumes responsibility for consequences."); + LOG.warn("can't set secure processing feature on: " + factory.getClass() + + ". 
User assumes responsibility for consequences."); } try { return factory.newSAXParser(); @@ -259,7 +253,7 @@ public InputSource resolveEntity(String publicId, String systemId) { @Override public void startElement(String uri, String localName, String qName, Attributes attributes) - throws SAXException { + throws SAXException { if (type == null) { if (MIME_TYPE_TAG.equals(qName)) { String name = attributes.getValue(MIME_TYPE_TYPE_ATTR); @@ -278,8 +272,8 @@ public void startElement(String uri, String localName, String qName, Attributes } else if (SUB_CLASS_OF_TAG.equals(qName)) { String parent = attributes.getValue(SUB_CLASS_TYPE_ATTR); types.setSuperType(type, MediaType.parse(parent)); - } else if (ACRONYM_TAG.equals(qName) || COMMENT_TAG.equals(qName) || - TIKA_LINK_TAG.equals(qName) || TIKA_UTI_TAG.equals(qName)) { + } else if (ACRONYM_TAG.equals(qName) || COMMENT_TAG.equals(qName) + || TIKA_LINK_TAG.equals(qName) || TIKA_UTI_TAG.equals(qName)) { characters = new StringBuilder(); } else if (GLOB_TAG.equals(qName)) { String pattern = attributes.getValue(PATTERN_ATTR); @@ -298,7 +292,7 @@ public void startElement(String uri, String localName, String qName, Attributes } else if (MATCH_TAG.equals(qName)) { if (attributes.getValue(MATCH_MINSHOULDMATCH_ATTR) != null) { current = new ClauseRecord(new MinShouldMatchVal( - Integer.parseInt(attributes.getValue(MATCH_MINSHOULDMATCH_ATTR)))); + Integer.parseInt(attributes.getValue(MATCH_MINSHOULDMATCH_ATTR)))); } else { String kind = attributes.getValue(MATCH_TYPE_ATTR); String offset = attributes.getValue(MATCH_OFFSET_ATTR); @@ -307,8 +301,8 @@ public void startElement(String uri, String localName, String qName, Attributes if (kind == null) { kind = "string"; } - current = - new ClauseRecord(new MagicMatch(type.getType(), kind, offset, value, mask)); + current = new ClauseRecord( + new MagicMatch(type.getType(), kind, offset, value, mask)); } } else if (MAGIC_TAG.equals(qName)) { String value = 
attributes.getValue(MAGIC_PRIORITY_ATTR); @@ -361,19 +355,18 @@ public void characters(char[] ch, int start, int length) { } protected void handleMimeError(String input, MimeTypeException ex, String qName, - Attributes attributes) throws SAXException { + Attributes attributes) throws SAXException { throw new SAXException(ex); } protected void handleGlobError(MimeType type, String pattern, MimeTypeException ex, - String qName, Attributes attributes) throws SAXException { + String qName, Attributes attributes) throws SAXException { throw new SAXException(ex); } /** - * Shim class used during building of actual classes. - * This temporarily holds the value of the minShouldMatchClause - * so that the actual MinShouldMatchClause can have a cleaner/immutable + * Shim class used during building of actual classes. This temporarily holds the value of the + * minShouldMatchClause so that the actual MinShouldMatchClause can have a cleaner/immutable * initialization. */ private static class MinShouldMatchVal implements Clause { @@ -391,7 +384,7 @@ int getVal() { @Override public boolean eval(byte[] data) { throw new IllegalStateException( - "This should never be used " + "on this placeholder class"); + "This should never be used " + "on this placeholder class"); } @Override @@ -415,8 +408,8 @@ public ClauseRecord(Clause clause) { public void stop() { if (clause instanceof MinShouldMatchVal) { - clause = - new MinShouldMatchClause(((MinShouldMatchVal) clause).getVal(), subclauses); + clause = new MinShouldMatchClause(((MinShouldMatchVal) clause).getVal(), + subclauses); } else if (subclauses != null) { Clause subclause; if (subclauses.size() == 1) { diff --git a/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReaderMetKeys.java b/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReaderMetKeys.java index df35134571..7d110dfb8d 100644 --- a/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReaderMetKeys.java +++ 
b/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReaderMetKeys.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.mime; diff --git a/tika-core/src/main/java/org/apache/tika/mime/MinShouldMatchClause.java b/tika-core/src/main/java/org/apache/tika/mime/MinShouldMatchClause.java index 0a18f4e703..aab1e7d966 100644 --- a/tika-core/src/main/java/org/apache/tika/mime/MinShouldMatchClause.java +++ b/tika-core/src/main/java/org/apache/tika/mime/MinShouldMatchClause.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.mime; @@ -26,8 +24,8 @@ class MinShouldMatchClause implements Clause { /** * Minimum number of clauses that need to match. *

- * Throws IllegalArgumentException if min <= 0, - * if clauses is null or has size == 0, or if min > clauses.size() + * Throws IllegalArgumentException if min <= 0, if clauses is null or has size == 0, or if min > + * clauses.size() * * @param min * @param clauses @@ -40,7 +38,7 @@ class MinShouldMatchClause implements Clause { if (min > clauses.size()) { throw new IllegalArgumentException( - "min (" + min + ") cannot be > clauses.size (" + clauses.size() + ")"); + "min (" + min + ") cannot be > clauses.size (" + clauses.size() + ")"); } else if (min <= 0) { throw new IllegalArgumentException("min cannot be <= 0: " + min); } diff --git a/tika-core/src/main/java/org/apache/tika/mime/OrClause.java b/tika-core/src/main/java/org/apache/tika/mime/OrClause.java index 6a2f212f5a..5d2f0a2806 100644 --- a/tika-core/src/main/java/org/apache/tika/mime/OrClause.java +++ b/tika-core/src/main/java/org/apache/tika/mime/OrClause.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.mime; diff --git a/tika-core/src/main/java/org/apache/tika/mime/Patterns.java b/tika-core/src/main/java/org/apache/tika/mime/Patterns.java index 48c0329f06..4ab844d2b1 100644 --- a/tika-core/src/main/java/org/apache/tika/mime/Patterns.java +++ b/tika-core/src/main/java/org/apache/tika/mime/Patterns.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.mime; @@ -47,8 +45,7 @@ class Patterns implements Serializable { /** * Index of generic glob patterns, sorted by length. */ - private final SortedMap globs = - new TreeMap<>(new LengthComparator()); + private final SortedMap globs = new TreeMap<>(new LengthComparator()); private int minExtensionLength = Integer.MAX_VALUE; private int maxExtensionLength = 0; @@ -71,11 +68,11 @@ public void add(String pattern, boolean isJavaRegex, MimeType type) throws MimeT addGlob(pattern, type); } else { - if (pattern.indexOf('*') == -1 && pattern.indexOf('?') == -1 && - pattern.indexOf('[') == -1) { + if (pattern.indexOf('*') == -1 && pattern.indexOf('?') == -1 + && pattern.indexOf('[') == -1) { addName(pattern, type); - } else if (pattern.startsWith("*") && pattern.indexOf('*', 1) == -1 && - pattern.indexOf('?') == -1 && pattern.indexOf('[') == -1) { + } else if (pattern.startsWith("*") && pattern.indexOf('*', 1) == -1 + && pattern.indexOf('?') == -1 && pattern.indexOf('[') == -1) { String extension = pattern.substring(1); addExtension(extension, type); type.addExtension(extension); @@ -89,8 +86,8 @@ private void addName(String name, MimeType type) throws MimeTypeException { 
MimeType previous = names.get(name); if (previous == null || registry.isSpecializationOf(previous.getType(), type.getType())) { names.put(name, type); - } else if (previous == type || - registry.isSpecializationOf(type.getType(), previous.getType())) { + } else if (previous == type + || registry.isSpecializationOf(type.getType(), previous.getType())) { // do nothing } else { throw new MimeTypeException("Conflicting name pattern: " + name); @@ -104,8 +101,8 @@ private void addExtension(String extension, MimeType type) throws MimeTypeExcept int length = extension.length(); minExtensionLength = Math.min(minExtensionLength, length); maxExtensionLength = Math.max(maxExtensionLength, length); - } else if (previous == type || - registry.isSpecializationOf(type.getType(), previous.getType())) { + } else if (previous == type + || registry.isSpecializationOf(type.getType(), previous.getType())) { // do nothing } else { throw new MimeTypeException("Conflicting extension pattern: " + extension); @@ -116,8 +113,8 @@ private void addGlob(String glob, MimeType type) throws MimeTypeException { MimeType previous = globs.get(glob); if (previous == null || registry.isSpecializationOf(previous.getType(), type.getType())) { globs.put(glob, type); - } else if (previous == type || - registry.isSpecializationOf(type.getType(), previous.getType())) { + } else if (previous == type + || registry.isSpecializationOf(type.getType(), previous.getType())) { // do nothing } else { throw new MimeTypeException("Conflicting glob pattern: " + glob); @@ -127,15 +124,13 @@ private void addGlob(String glob, MimeType type) throws MimeTypeException { /** * Find the MimeType corresponding to a resource name. *

- * It applies the recommendations detailed in FreeDesktop Shared MIME-info - * Database for guessing MimeType from a resource name: It first tries a - * case-sensitive match, then try again with the resource name converted to - * lower-case if that fails. If several patterns match then the longest - * pattern is used. In particular, files with multiple extensions (such as - * Data.tar.gz) match the longest sequence of extensions (eg '*.tar.gz' in - * preference to '*.gz'). Literal patterns (eg, 'Makefile') are matched - * before all others. Patterns beginning with `*.' and containing no other - * special characters (`*?[') are matched before other wildcarded patterns + * It applies the recommendations detailed in FreeDesktop Shared MIME-info Database for guessing + * MimeType from a resource name: It first tries a case-sensitive match, then try again with the + * resource name converted to lower-case if that fails. If several patterns match then the + * longest pattern is used. In particular, files with multiple extensions (such as Data.tar.gz) + * match the longest sequence of extensions (eg '*.tar.gz' in preference to '*.gz'). Literal + * patterns (eg, 'Makefile') are matched before all others. Patterns beginning with `*.' and + * containing no other special characters (`*?[') are matched before other wildcarded patterns * (since this covers the majority of the patterns). */ public MimeType matches(String name) { diff --git a/tika-core/src/main/java/org/apache/tika/mime/ProbabilisticMimeDetectionSelector.java b/tika-core/src/main/java/org/apache/tika/mime/ProbabilisticMimeDetectionSelector.java index 5e33b85795..cec7e3dcaf 100644 --- a/tika-core/src/main/java/org/apache/tika/mime/ProbabilisticMimeDetectionSelector.java +++ b/tika-core/src/main/java/org/apache/tika/mime/ProbabilisticMimeDetectionSelector.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.mime; @@ -22,14 +20,12 @@ import java.net.URISyntaxException; import java.util.ArrayList; import java.util.List; - import org.apache.tika.detect.Detector; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; /** - * Selector for combining different mime detection results - * based on probability + * Selector for combining different mime detection results based on probability */ public class ProbabilisticMimeDetectionSelector implements Detector { private static final long serialVersionUID = 224589862960269260L; @@ -42,10 +38,9 @@ public class ProbabilisticMimeDetectionSelector implements Detector { private final MimeTypes mimeTypes; private final MediaType rootMediaType; /* - * this change rate is used when there are multiple types predicted by - * magic-bytes. the first predicted type has the highest probability, and - * the probability for the next type predicted by magic-bytes will decay - * with this change rate. The idea is to have the first one to take + * this change rate is used when there are multiple types predicted by magic-bytes. the first + * predicted type has the highest probability, and the probability for the next type predicted + * by magic-bytes will decay with this change rate. The idea is to have the first one to take * precedence among the multiple possible types predicted by MAGIC-bytes. 
*/ private final float changeRate; @@ -53,8 +48,8 @@ public class ProbabilisticMimeDetectionSelector implements Detector { private float magic_trust, extension_trust, meta_trust; private float magic_neg, extension_neg, meta_neg; /* - * any posterior probability lower than the threshold, will be considered as - * an oct-stream type, the default value is 0.5 + * any posterior probability lower than the threshold, will be considered as an oct-stream type, + * the default value is 0.5 */ private float threshold; @@ -80,16 +75,16 @@ public ProbabilisticMimeDetectionSelector(final MimeTypes mimeTypes, final Build this.initializeDefaultProbabilityParameters(); this.changeRate = 0.1f; if (builder != null) { - priorMagicFileType = builder.priorMagicFileType == 0f ? priorMagicFileType : - builder.priorMagicFileType; - priorExtensionFileType = builder.priorExtensionFileType == 0f ? priorExtensionFileType : - builder.priorExtensionFileType; - priorMetaFileType = - builder.priorMetaFileType == 0f ? priorMetaFileType : builder.priorMetaFileType; + priorMagicFileType = builder.priorMagicFileType == 0f ? priorMagicFileType + : builder.priorMagicFileType; + priorExtensionFileType = builder.priorExtensionFileType == 0f ? priorExtensionFileType + : builder.priorExtensionFileType; + priorMetaFileType = builder.priorMetaFileType == 0f ? priorMetaFileType + : builder.priorMetaFileType; magic_trust = builder.magic_trust == 0f ? magic_trust : builder.extension_neg; - extension_trust = - builder.extension_trust == 0f ? extension_trust : builder.extension_trust; + extension_trust = builder.extension_trust == 0f ? extension_trust + : builder.extension_trust; meta_trust = builder.meta_trust == 0f ? meta_trust : builder.meta_trust; magic_neg = builder.magic_neg == 0f ? 
magic_neg : builder.magic_neg; @@ -130,7 +125,7 @@ public MediaType detect(InputStream input, Metadata metadata) throws IOException input.mark(mimeTypes.getMinLength()); try { byte[] prefix = mimeTypes.readMagicHeader(input); - //defensive copy + // defensive copy possibleTypes.addAll(mimeTypes.getMimeType(prefix)); } finally { input.reset(); @@ -187,8 +182,7 @@ public MediaType detect(InputStream input, Metadata metadata) throws IOException } private MediaType applyProbilities(final List possibleTypes, - final MimeType extMimeType, - final MimeType metadataMimeType) { + final MimeType extMimeType, final MimeType metadataMimeType) { /* initialize some probability variables */ MediaType extensionMediaType_ = extMimeType == null ? null : extMimeType.getType(); @@ -206,8 +200,7 @@ private MediaType applyProbilities(final List possibleTypes, /* pre-process some probability variables */ if (extensionMediaType_ == null || extensionMediaType_.compareTo(rootMediaType) == 0) { /* - * this is a root type, that means the extension method fails to - * identify any type. + * this is a root type, that means the extension method fails to identify any type. 
*/ ext_trust = 1; ext_neg = 1; @@ -231,8 +224,8 @@ private MediaType applyProbilities(final List possibleTypes, } else { // check if each identified type belongs to the same class; if (extensionMediaType_ != null) { - if (extensionMediaType_.equals(magictype) || - registry.isSpecializationOf(extensionMediaType_, magictype)) { + if (extensionMediaType_.equals(magictype) || registry + .isSpecializationOf(extensionMediaType_, magictype)) { // Use just this type possibleTypes.set(i, extMimeType); } else if (registry.isSpecializationOf(magictype, extensionMediaType_)) { @@ -240,8 +233,8 @@ private MediaType applyProbilities(final List possibleTypes, } } if (metaMediaType_ != null) { - if (metaMediaType_.equals(magictype) || - registry.isSpecializationOf(metaMediaType_, magictype)) { + if (metaMediaType_.equals(magictype) + || registry.isSpecializationOf(metaMediaType_, magictype)) { // Use just this type possibleTypes.set(i, metadataMimeType); } else if (registry.isSpecializationOf(magictype, metaMediaType_)) { @@ -261,8 +254,7 @@ private MediaType applyProbilities(final List possibleTypes, if (i > 0) { /* - * decay as our trust goes down with next type predicted by - * magic + * decay as our trust goes down with next type predicted by magic */ mag_trust = mag_trust * (1 - changeRate); /* @@ -371,8 +363,8 @@ private MediaType applyProbilities(final List possibleTypes, results[2] = 0.1f; } /* - * compute the posterior probability for each predicted file - * type and store them into the "results" array. + * compute the posterior probability for each predicted file type and store them + * into the "results" array. */ float pPrime = priorMagicFileType; float deno = 1 - priorMagicFileType; @@ -444,19 +436,16 @@ public MediaTypeRegistry getMediaTypeRegistry() { */ public static class Builder { /* - * the following are the prior probabilities for the file type - * identified by each method. 
+ * the following are the prior probabilities for the file type identified by each method. */ private float priorMagicFileType, priorExtensionFileType, priorMetaFileType; /* - * the following are the conditional probability for each method with - * positive conditions + * the following are the conditional probability for each method with positive conditions */ private float magic_trust, extension_trust, meta_trust; /* - * the following *_neg are the conditional probabilities with negative - * conditions + * the following *_neg are the conditional probabilities with negative conditions */ private float magic_neg, extension_neg, meta_neg; diff --git a/tika-core/src/main/java/org/apache/tika/mime/package-info.java b/tika-core/src/main/java/org/apache/tika/mime/package-info.java index 104dc3acf9..de0a4cab16 100644 --- a/tika-core/src/main/java/org/apache/tika/mime/package-info.java +++ b/tika-core/src/main/java/org/apache/tika/mime/package-info.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ /** diff --git a/tika-core/src/main/java/org/apache/tika/package-info.java b/tika-core/src/main/java/org/apache/tika/package-info.java index cf4352ddcb..8a371af2ca 100644 --- a/tika-core/src/main/java/org/apache/tika/package-info.java +++ b/tika-core/src/main/java/org/apache/tika/package-info.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ /** diff --git a/tika-core/src/main/java/org/apache/tika/parser/AbstractEncodingDetectorParser.java b/tika-core/src/main/java/org/apache/tika/parser/AbstractEncodingDetectorParser.java index 2e9f3936fe..3186ea4abd 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/AbstractEncodingDetectorParser.java +++ b/tika-core/src/main/java/org/apache/tika/parser/AbstractEncodingDetectorParser.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. 
The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser; @@ -22,8 +20,8 @@ /** - * Abstract base class for parsers that use the AutoDetectReader and need - * to use the {@link EncodingDetector} configured by {@link TikaConfig} + * Abstract base class for parsers that use the AutoDetectReader and need to use the + * {@link EncodingDetector} configured by {@link TikaConfig} */ public abstract class AbstractEncodingDetectorParser implements Parser { @@ -39,8 +37,8 @@ public AbstractEncodingDetectorParser(EncodingDetector encodingDetector) { } /** - * Look for an EncodingDetetor in the ParseContext. If it hasn't been - * passed in, use the original EncodingDetector from initialization. + * Look for an EncodingDetetor in the ParseContext. If it hasn't been passed in, use the + * original EncodingDetector from initialization. 
* * @param parseContext * @return diff --git a/tika-core/src/main/java/org/apache/tika/parser/AbstractExternalProcessParser.java b/tika-core/src/main/java/org/apache/tika/parser/AbstractExternalProcessParser.java index c5c3315f92..efdc02ccc5 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/AbstractExternalProcessParser.java +++ b/tika-core/src/main/java/org/apache/tika/parser/AbstractExternalProcessParser.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. 
See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser; @@ -20,12 +18,10 @@ import java.util.concurrent.ConcurrentHashMap; /** - * Abstract base class for parsers that call external processes. This - * adds one more layer of 'hope' that processes won't be orphaned if - * the jvm has to be restarted. This does not guarantee that the - * processes won't be orphaned in case of, e.g. kill -9, but this - * increases the chances that under normal circumstances or if the jvm - * itself exits, that external processes won't be orphaned. + * Abstract base class for parsers that call external processes. This adds one more layer of 'hope' + * that processes won't be orphaned if the jvm has to be restarted. This does not guarantee that the + * processes won't be orphaned in case of, e.g. kill -9, but this increases the chances that under + * normal circumstances or if the jvm itself exits, that external processes won't be orphaned. * * @since Apache Tika 1.27 */ @@ -54,4 +50,3 @@ protected Process release(String id) { return PROCESS_MAP.remove(id); } } - diff --git a/tika-core/src/main/java/org/apache/tika/parser/AbstractParser.java b/tika-core/src/main/java/org/apache/tika/parser/AbstractParser.java index f6017d6c0f..b5d106a68e 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/AbstractParser.java +++ b/tika-core/src/main/java/org/apache/tika/parser/AbstractParser.java @@ -1,33 +1,29 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser; import java.io.IOException; import java.io.InputStream; - -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; /** - * Abstract base class for new parsers. This method implements the old - * deprecated parse method so subclasses won't have to. + * Abstract base class for new parsers. This method implements the old deprecated parse method so + * subclasses won't have to. 
* * @deprecated for removal in 4.x * @since Apache Tika 0.10 @@ -41,21 +37,18 @@ public abstract class AbstractParser implements Parser { private static final long serialVersionUID = 7186985395903074255L; /** - * Calls the - * {@link Parser#parse(InputStream, ContentHandler, Metadata, ParseContext)} - * method with an empty {@link ParseContext}. This method exists as a - * leftover from Tika 0.x when the three-argument parse() method still - * existed in the {@link Parser} interface. No new code should call this - * method anymore, it's only here for backwards compatibility. + * Calls the {@link Parser#parse(InputStream, ContentHandler, Metadata, ParseContext)} method + * with an empty {@link ParseContext}. This method exists as a leftover from Tika 0.x when the + * three-argument parse() method still existed in the {@link Parser} interface. No new code + * should call this method anymore, it's only here for backwards compatibility. * - * @deprecated use the {@link Parser#parse(InputStream, ContentHandler, - * Metadata, ParseContext)} method instead + * @deprecated use the {@link Parser#parse(InputStream, ContentHandler, Metadata, ParseContext)} + * method instead */ @Deprecated public void parse(InputStream stream, ContentHandler handler, Metadata metadata) - throws IOException, SAXException, TikaException { + throws IOException, SAXException, TikaException { parse(stream, handler, metadata, new ParseContext()); } } - diff --git a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java index 86eae692a0..49f0c4999d 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java +++ b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java @@ -1,27 +1,21 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.parser; import java.io.IOException; import java.io.InputStream; - -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.config.TikaConfig; import org.apache.tika.detect.DefaultDetector; import org.apache.tika.detect.Detector; @@ -38,6 +32,8 @@ import org.apache.tika.mime.MediaType; import org.apache.tika.mime.MediaTypeRegistry; import org.apache.tika.sax.SecureContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; public class AutoDetectParser extends CompositeParser { @@ -45,23 +41,20 @@ public class AutoDetectParser extends CompositeParser { * Serial version UID */ private static final long serialVersionUID = 6110455808615143122L; - //private final TikaConfig config; + // private final TikaConfig config; /** - * The type detector used by this parser to auto-detect the type - * of a document. + * The type detector used by this parser to auto-detect the type of a document. */ private Detector detector; // always set in the constructor /** - * Configuration used when initializing a SecureContentHandler - * and the TikaInputStream. + * Configuration used when initializing a SecureContentHandler and the TikaInputStream. */ private AutoDetectParserConfig autoDetectParserConfig; /** - * Creates an auto-detecting parser instance using the default Tika - * configuration. + * Creates an auto-detecting parser instance using the default Tika configuration. */ public AutoDetectParser() { this(TikaConfig.getDefaultConfig()); @@ -73,10 +66,10 @@ public AutoDetectParser(Detector detector) { } /** - * Creates an auto-detecting parser instance using the specified set of parser. - * This allows one to create a Tika configuration where only a subset of the - * available parsers have their 3rd party jars included, as otherwise the - * use of the default TikaConfig will throw various "ClassNotFound" exceptions. 
+ * Creates an auto-detecting parser instance using the specified set of parser. This allows one + * to create a Tika configuration where only a subset of the available parsers have their 3rd + * party jars included, as otherwise the use of the default TikaConfig will throw various + * "ClassNotFound" exceptions. * * @param parsers */ @@ -102,7 +95,7 @@ private static Parser buildFallbackParser(TikaConfig config) { Parser fallback = null; Parser p = config.getParser(); if (p instanceof DefaultParser) { - fallback = ((DefaultParser)p).getFallback(); + fallback = ((DefaultParser) p).getFallback(); } else { fallback = new EmptyParser(); } @@ -111,8 +104,9 @@ private static Parser buildFallbackParser(TikaConfig config) { return fallback; } else { return new DigestingParser(fallback, - config.getAutoDetectParserConfig().getDigesterFactory().build(), - config.getAutoDetectParserConfig().getDigesterFactory().isSkipContainerDocument()); + config.getAutoDetectParserConfig().getDigesterFactory().build(), + config.getAutoDetectParserConfig().getDigesterFactory() + .isSkipContainerDocument()); } } @@ -122,13 +116,13 @@ private static Parser getParser(TikaConfig config) { return config.getParser(); } return new DigestingParser(config.getParser(), - config.getAutoDetectParserConfig().getDigesterFactory().build(), - config.getAutoDetectParserConfig().getDigesterFactory().isSkipContainerDocument()); + config.getAutoDetectParserConfig().getDigesterFactory().build(), + config.getAutoDetectParserConfig().getDigesterFactory() + .isSkipContainerDocument()); } /** - * Returns the type detector used by this parser to auto-detect the type - * of a document. + * Returns the type detector used by this parser to auto-detect the type of a document. * * @return type detector * @since Apache Tika 0.4 @@ -138,8 +132,7 @@ public Detector getDetector() { } /** - * Sets the type detector used by this parser to auto-detect the type - * of a document. 
+ * Sets the type detector used by this parser to auto-detect the type of a document. * * @param detector type detector * @since Apache Tika 0.4 @@ -149,8 +142,8 @@ public void setDetector(Detector detector) { } /** - * Sets the configuration that will be used to create SecureContentHandlers - * that will be used for parsing. + * Sets the configuration that will be used to create SecureContentHandlers that will be used + * for parsing. * * @param autoDetectParserConfig type SecureContentHandlerConfig * @since Apache Tika 2.1.1 @@ -164,26 +157,26 @@ public AutoDetectParserConfig getAutoDetectParserConfig() { } public void parse(InputStream stream, ContentHandler handler, Metadata metadata, - ParseContext context) throws IOException, SAXException, TikaException { + ParseContext context) throws IOException, SAXException, TikaException { if (autoDetectParserConfig.getMetadataWriteFilterFactory() != null) { metadata.setMetadataWriteFilter( - autoDetectParserConfig.getMetadataWriteFilterFactory().newInstance()); + autoDetectParserConfig.getMetadataWriteFilterFactory().newInstance()); } TemporaryResources tmp = new TemporaryResources(); try { TikaInputStream tis = TikaInputStream.get(stream, tmp, metadata); - //figure out if we should spool to disk + // figure out if we should spool to disk maybeSpool(tis, autoDetectParserConfig, metadata); // Automatically detect the MIME type of the document MediaType type = detector.detect(tis, metadata); - //update CONTENT_TYPE as long as it wasn't set by parser override - if (metadata.get(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE) == null || - !metadata.get(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE) - .equals(type.toString())) { + // update CONTENT_TYPE as long as it wasn't set by parser override + if (metadata.get(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE) == null + || !metadata.get(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE) + .equals(type.toString())) { metadata.set(Metadata.CONTENT_TYPE, type.toString()); } 
- //check for zero-byte inputstream + // check for zero-byte inputstream if (tis.getOpenContainer() == null) { if (autoDetectParserConfig.getThrowOnZeroBytes()) { tis.mark(1); @@ -195,8 +188,9 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, } handler = decorateHandler(handler, metadata, context, autoDetectParserConfig); // TIKA-216: Zip bomb prevention - SecureContentHandler sch = handler != null ? - createSecureContentHandler(handler, tis, autoDetectParserConfig) : null; + SecureContentHandler sch = handler != null + ? createSecureContentHandler(handler, tis, autoDetectParserConfig) + : null; initializeEmbeddedDocumentExtractor(metadata, context); try { @@ -213,32 +207,31 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, } private ContentHandler decorateHandler(ContentHandler handler, Metadata metadata, - ParseContext context, - AutoDetectParserConfig autoDetectParserConfig) { + ParseContext context, AutoDetectParserConfig autoDetectParserConfig) { if (context.get(RecursiveParserWrapper.RecursivelySecureContentHandler.class) != null) { - //using the recursiveparserwrapper. we should decorate this handler - return autoDetectParserConfig.getContentHandlerDecoratorFactory() - .decorate(handler, metadata, context); + // using the recursiveparserwrapper. 
we should decorate this handler + return autoDetectParserConfig.getContentHandlerDecoratorFactory().decorate(handler, + metadata, context); } ParseRecord parseRecord = context.get(ParseRecord.class); if (parseRecord == null || parseRecord.getDepth() == 0) { - return autoDetectParserConfig.getContentHandlerDecoratorFactory() - .decorate(handler, metadata, context); + return autoDetectParserConfig.getContentHandlerDecoratorFactory().decorate(handler, + metadata, context); } - //else do not decorate + // else do not decorate return handler; } private void maybeSpool(TikaInputStream tis, AutoDetectParserConfig autoDetectParserConfig, - Metadata metadata) throws IOException { + Metadata metadata) throws IOException { if (tis.hasFile()) { return; } if (autoDetectParserConfig.getSpoolToDisk() == null) { return; } - //whether or not a content-length has been sent in, - //if spoolToDisk == 0, spool it + // whether or not a content-length has been sent in, + // if spoolToDisk == 0, spool it if (autoDetectParserConfig.getSpoolToDisk() == 0) { tis.getPath(); metadata.set(HttpHeaders.CONTENT_LENGTH, Long.toString(tis.getLength())); @@ -253,7 +246,7 @@ private void maybeSpool(TikaInputStream tis, AutoDetectParserConfig autoDetectPa metadata.set(HttpHeaders.CONTENT_LENGTH, Long.toString(tis.getLength())); } } catch (NumberFormatException e) { - //swallow...maybe log? + // swallow...maybe log? } } } @@ -262,14 +255,14 @@ private void initializeEmbeddedDocumentExtractor(Metadata metadata, ParseContext if (context.get(EmbeddedDocumentExtractor.class) != null) { return; } - //pass self to handle embedded documents if - //the caller hasn't specified one. + // pass self to handle embedded documents if + // the caller hasn't specified one. 
Parser p = context.get(Parser.class); if (p == null) { context.set(Parser.class, this); } EmbeddedDocumentExtractorFactory edxf = - autoDetectParserConfig.getEmbeddedDocumentExtractorFactory(); + autoDetectParserConfig.getEmbeddedDocumentExtractorFactory(); if (edxf == null) { edxf = new ParsingEmbeddedDocumentExtractorFactory(); } @@ -278,15 +271,14 @@ private void initializeEmbeddedDocumentExtractor(Metadata metadata, ParseContext } public void parse(InputStream stream, ContentHandler handler, Metadata metadata) - throws IOException, SAXException, TikaException { + throws IOException, SAXException, TikaException { ParseContext context = new ParseContext(); context.set(Parser.class, this); parse(stream, handler, metadata, context); } private SecureContentHandler createSecureContentHandler(ContentHandler handler, - TikaInputStream tis, - AutoDetectParserConfig config) { + TikaInputStream tis, AutoDetectParserConfig config) { SecureContentHandler sch = new SecureContentHandler(handler, tis); if (config == null) { return sch; diff --git a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java index afe65b07ed..7ff0fd2b9d 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java +++ b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java @@ -1,66 +1,61 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. 
See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser; import java.io.IOException; import java.io.Serializable; - -import org.w3c.dom.Element; -import org.xml.sax.ContentHandler; - import org.apache.tika.config.ConfigBase; import org.apache.tika.exception.TikaConfigException; import org.apache.tika.extractor.EmbeddedDocumentExtractorFactory; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.writefilter.MetadataWriteFilterFactory; import org.apache.tika.sax.ContentHandlerDecoratorFactory; +import org.w3c.dom.Element; +import org.xml.sax.ContentHandler; /** - * This config object can be used to tune how conservative we want to be - * when parsing data that is extremely compressible and resembles a ZIP - * bomb. Null values will be ignored and will not affect the default values - * in SecureContentHandler. 
+ * This config object can be used to tune how conservative we want to be when parsing data that is + * extremely compressible and resembles a ZIP bomb. Null values will be ignored and will not affect + * the default values in SecureContentHandler. *

- * See ModifyingContentWithHandlersAndMetadataFilters - * for documentation and examples for configuring this with a tika-config.xml file. + * See ModifyingContentWithHandlersAndMetadataFilters + * for documentation and examples for configuring this with a tika-config.xml file. */ public class AutoDetectParserConfig extends ConfigBase implements Serializable { private static ContentHandlerDecoratorFactory NOOP_CONTENT_HANDLER_DECORATOR_FACTORY = - new ContentHandlerDecoratorFactory() { - @Override - public ContentHandler decorate(ContentHandler contentHandler, Metadata metadata, - ParseContext parseContext) { - return contentHandler; - } - }; + new ContentHandlerDecoratorFactory() { + @Override + public ContentHandler decorate(ContentHandler contentHandler, + Metadata metadata, ParseContext parseContext) { + return contentHandler; + } + }; public static AutoDetectParserConfig DEFAULT = new AutoDetectParserConfig(); public static AutoDetectParserConfig load(Element element) - throws TikaConfigException, IOException { + throws TikaConfigException, IOException { return AutoDetectParserConfig.buildSingle("autoDetectParserConfig", - AutoDetectParserConfig.class, element, AutoDetectParserConfig.DEFAULT); + AutoDetectParserConfig.class, element, AutoDetectParserConfig.DEFAULT); } /** - * If this is not null and greater than -1, the AutoDetectParser - * will spool the stream to disk if the length of the stream is known - * ahead of time. + * If this is not null and greater than -1, the AutoDetectParser will spool the stream to disk + * if the length of the stream is known ahead of time. 
*/ private Long spoolToDisk = null; @@ -89,7 +84,7 @@ public static AutoDetectParserConfig load(Element element) private EmbeddedDocumentExtractorFactory embeddedDocumentExtractorFactory = null; private ContentHandlerDecoratorFactory contentHandlerDecoratorFactory = - NOOP_CONTENT_HANDLER_DECORATOR_FACTORY; + NOOP_CONTENT_HANDLER_DECORATOR_FACTORY; private DigestingParser.DigesterFactory digesterFactory = null; @@ -99,14 +94,14 @@ public static AutoDetectParserConfig load(Element element) * Creates a SecureContentHandlerConfig using the passed in parameters. * * @param spoolToDisk - * @param outputThreshold SecureContentHandler - character output threshold. - * @param maximumCompressionRatio SecureContentHandler - max compression ratio allowed. - * @param maximumDepth SecureContentHandler - maximum XML element nesting level. + * @param outputThreshold SecureContentHandler - character output threshold. + * @param maximumCompressionRatio SecureContentHandler - max compression ratio allowed. + * @param maximumDepth SecureContentHandler - maximum XML element nesting level. * @param maximumPackageEntryDepth SecureContentHandler - maximum package entry nesting level. 
*/ public AutoDetectParserConfig(Long spoolToDisk, Long outputThreshold, - Long maximumCompressionRatio, Integer maximumDepth, - Integer maximumPackageEntryDepth) { + Long maximumCompressionRatio, Integer maximumDepth, + Integer maximumPackageEntryDepth) { this.spoolToDisk = spoolToDisk; this.outputThreshold = outputThreshold; this.maximumCompressionRatio = maximumCompressionRatio; @@ -163,12 +158,12 @@ public MetadataWriteFilterFactory getMetadataWriteFilterFactory() { } public void setMetadataWriteFilterFactory( - MetadataWriteFilterFactory metadataWriteFilterFactory) { + MetadataWriteFilterFactory metadataWriteFilterFactory) { this.metadataWriteFilterFactory = metadataWriteFilterFactory; } public void setEmbeddedDocumentExtractorFactory( - EmbeddedDocumentExtractorFactory embeddedDocumentExtractorFactory) { + EmbeddedDocumentExtractorFactory embeddedDocumentExtractorFactory) { this.embeddedDocumentExtractorFactory = embeddedDocumentExtractorFactory; } @@ -177,7 +172,7 @@ public EmbeddedDocumentExtractorFactory getEmbeddedDocumentExtractorFactory() { } public void setContentHandlerDecoratorFactory( - ContentHandlerDecoratorFactory contentHandlerDecoratorFactory) { + ContentHandlerDecoratorFactory contentHandlerDecoratorFactory) { this.contentHandlerDecoratorFactory = contentHandlerDecoratorFactory; } @@ -203,14 +198,13 @@ public boolean getThrowOnZeroBytes() { @Override public String toString() { - return "AutoDetectParserConfig{" + "spoolToDisk=" + spoolToDisk + ", outputThreshold=" + - outputThreshold + ", maximumCompressionRatio=" + maximumCompressionRatio + - ", maximumDepth=" + maximumDepth + ", maximumPackageEntryDepth=" + - maximumPackageEntryDepth + ", metadataWriteFilterFactory=" + - metadataWriteFilterFactory + ", embeddedDocumentExtractorFactory=" + - embeddedDocumentExtractorFactory + ", contentHandlerDecoratorFactory=" + - contentHandlerDecoratorFactory + ", digesterFactory=" + digesterFactory + - ", throwOnZeroBytes=" + throwOnZeroBytes + '}'; + 
return "AutoDetectParserConfig{" + "spoolToDisk=" + spoolToDisk + ", outputThreshold=" + + outputThreshold + ", maximumCompressionRatio=" + maximumCompressionRatio + + ", maximumDepth=" + maximumDepth + ", maximumPackageEntryDepth=" + + maximumPackageEntryDepth + ", metadataWriteFilterFactory=" + + metadataWriteFilterFactory + ", embeddedDocumentExtractorFactory=" + + embeddedDocumentExtractorFactory + ", contentHandlerDecoratorFactory=" + + contentHandlerDecoratorFactory + ", digesterFactory=" + digesterFactory + + ", throwOnZeroBytes=" + throwOnZeroBytes + '}'; } } - diff --git a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserFactory.java b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserFactory.java index 2365c8943a..c5ce3caf66 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserFactory.java +++ b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserFactory.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser; @@ -22,11 +20,9 @@ import java.nio.file.Files; import java.nio.file.Paths; import java.util.Map; - -import org.xml.sax.SAXException; - import org.apache.tika.config.TikaConfig; import org.apache.tika.exception.TikaException; +import org.xml.sax.SAXException; /** * Factory for an AutoDetectParser @@ -34,8 +30,7 @@ public class AutoDetectParserFactory extends ParserFactory { /** - * Path to a tika-config file. This must be a literal - * file or findable on the classpath. + * Path to a tika-config file. This must be a literal file or findable on the classpath. */ public static final String TIKA_CONFIG_PATH = "tika_config_path"; diff --git a/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java b/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java index 3b50b4da77..ec79d67bf3 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java +++ b/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.parser; @@ -26,10 +24,6 @@ import java.util.List; import java.util.Map; import java.util.Set; - -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.exception.TikaException; import org.apache.tika.exception.WriteLimitReachedException; import org.apache.tika.io.TemporaryResources; @@ -41,12 +35,13 @@ import org.apache.tika.sax.TaggedContentHandler; import org.apache.tika.utils.ExceptionUtils; import org.apache.tika.utils.ParserUtils; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; /** - * Composite parser that delegates parsing tasks to a component parser - * based on the declared content type of the incoming document. A fallback - * parser is defined for cases where a parser for the given content type is - * not available. + * Composite parser that delegates parsing tasks to a component parser based on the declared content + * type of the incoming document. A fallback parser is defined for cases where a parser for the + * given content type is not available. 
*/ public class CompositeParser implements Parser { @@ -71,7 +66,7 @@ public class CompositeParser implements Parser { private Parser fallback = new EmptyParser(); public CompositeParser(MediaTypeRegistry registry, List parsers, - Collection> excludeParsers) { + Collection> excludeParsers) { if (excludeParsers == null || excludeParsers.isEmpty()) { this.parsers = parsers; } else { @@ -108,12 +103,12 @@ public Map getParsers(ParseContext context) { } private boolean isExcluded(Collection> excludeParsers, - Class p) { + Class p) { return excludeParsers.contains(p) || assignableFrom(excludeParsers, p); } private boolean assignableFrom(Collection> excludeParsers, - Class p) { + Class p) { for (Class e : excludeParsers) { if (e.isAssignableFrom(p)) { return true; @@ -123,9 +118,9 @@ private boolean assignableFrom(Collection> excludeParser } /** - * Utility method that goes through all the component parsers and finds - * all media types for which more than one parser declares support. This - * is useful in tracking down conflicting parser definitions. + * Utility method that goes through all the component parsers and finds all media types for + * which more than one parser declares support. This is useful in tracking down conflicting + * parser definitions. * * @param context parsing context * @return media types that are supported by at least two component parsers @@ -175,9 +170,8 @@ public void setMediaTypeRegistry(MediaTypeRegistry registry) { } /** - * Returns all parsers registered with the Composite Parser, - * including ones which may not currently be active. - * This won't include the Fallback Parser, if defined + * Returns all parsers registered with the Composite Parser, including ones which may not + * currently be active. 
This won't include the Fallback Parser, if defined */ public List getAllComponentParsers() { return Collections.unmodifiableList(parsers); @@ -200,8 +194,8 @@ public Map getParsers() { public void setParsers(Map parsers) { this.parsers = new ArrayList<>(parsers.size()); for (Map.Entry entry : parsers.entrySet()) { - this.parsers.add(ParserDecorator - .withTypes(entry.getValue(), Collections.singleton(entry.getKey()))); + this.parsers.add(ParserDecorator.withTypes(entry.getValue(), + Collections.singleton(entry.getKey()))); } } @@ -224,14 +218,12 @@ public void setFallback(Parser fallback) { } /** - * Returns the parser that best matches the given metadata. By default - * looks for a parser that matches the content type metadata property, - * and uses the fallback parser if a better match is not found. The - * type hierarchy information included in the configured media type - * registry is used when looking for a matching parser instance. + * Returns the parser that best matches the given metadata. By default looks for a parser that + * matches the content type metadata property, and uses the fallback parser if a better match is + * not found. The type hierarchy information included in the configured media type registry is + * used when looking for a matching parser instance. *

- * Subclasses can override this method to provide more accurate - * parser resolution. + * Subclasses can override this method to provide more accurate parser resolution. * * @param metadata document metadata * @return matching parser @@ -242,7 +234,7 @@ protected Parser getParser(Metadata metadata) { protected Parser getParser(Metadata metadata, ParseContext context) { Map map = getParsers(context); - //check for parser override first + // check for parser override first String contentTypeString = metadata.get(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE); if (contentTypeString == null) { contentTypeString = metadata.get(Metadata.CONTENT_TYPE); @@ -272,13 +264,12 @@ public Set getSupportedTypes(ParseContext context) { /** * Delegates the call to the matching component parser. *

- * Potential {@link RuntimeException}s, {@link IOException}s and - * {@link SAXException}s unrelated to the given input stream and content - * handler are automatically wrapped into {@link TikaException}s to better - * honor the {@link Parser} contract. + * Potential {@link RuntimeException}s, {@link IOException}s and {@link SAXException}s unrelated + * to the given input stream and content handler are automatically wrapped into + * {@link TikaException}s to better honor the {@link Parser} contract. */ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, - ParseContext context) throws IOException, SAXException, TikaException { + ParseContext context) throws IOException, SAXException, TikaException { Parser parser = getParser(metadata, context); TemporaryResources tmp = new TemporaryResources(); ParseRecord parserRecord = context.get(ParseRecord.class); @@ -289,7 +280,7 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, try { TikaInputStream taggedStream = TikaInputStream.get(stream, tmp, metadata); TaggedContentHandler taggedHandler = - handler != null ? new TaggedContentHandler(handler) : null; + handler != null ? 
new TaggedContentHandler(handler) : null; String parserClassname = ParserUtils.getParserClassname(parser); parserRecord.addParserClass(parserClassname); ParserUtils.recordParserDetails(parserClassname, metadata); @@ -297,7 +288,7 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, try { parser.parse(taggedStream, taggedHandler, metadata, context); } catch (SecurityException e) { - //rethrow security exceptions + // rethrow security exceptions throw e; } catch (IOException e) { taggedStream.throwIfCauseOf(e); @@ -324,7 +315,7 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, private void recordEmbeddedMetadata(Metadata metadata, ParseContext context) { ParseRecord record = context.get(ParseRecord.class); if (record == null) { - //this should never happen + // this should never happen return; } for (Exception e : record.getExceptions()) { diff --git a/tika-core/src/main/java/org/apache/tika/parser/CryptoParser.java b/tika-core/src/main/java/org/apache/tika/parser/CryptoParser.java index 1ffd851db3..e29ee38f1d 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/CryptoParser.java +++ b/tika-core/src/main/java/org/apache/tika/parser/CryptoParser.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. 
The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser; @@ -26,19 +24,17 @@ import java.util.Set; import javax.crypto.Cipher; import javax.crypto.CipherInputStream; - -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.exception.EncryptedDocumentException; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; /** - * Decrypts the incoming document stream and delegates further parsing to - * another parser instance. The decryption key and other settings as well - * as the delegate parser are taken from the parsing context. + * Decrypts the incoming document stream and delegates further parsing to another parser instance. + * The decryption key and other settings as well as the delegate parser are taken from the parsing + * context. 
* * @since Apache Tika 0.10 */ @@ -70,7 +66,7 @@ public Set getSupportedTypes(ParseContext context) { } public void parse(InputStream stream, ContentHandler handler, Metadata metadata, - ParseContext context) throws IOException, SAXException, TikaException { + ParseContext context) throws IOException, SAXException, TikaException { try { Cipher cipher; if (provider != null) { diff --git a/tika-core/src/main/java/org/apache/tika/parser/DefaultParser.java b/tika-core/src/main/java/org/apache/tika/parser/DefaultParser.java index 3205ea81d1..2824e65b9f 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/DefaultParser.java +++ b/tika-core/src/main/java/org/apache/tika/parser/DefaultParser.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
+ * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser; @@ -21,7 +19,6 @@ import java.util.Collections; import java.util.List; import java.util.Map; - import org.apache.tika.config.ServiceLoader; import org.apache.tika.detect.DefaultEncodingDetector; import org.apache.tika.detect.EncodingDetector; @@ -32,8 +29,7 @@ import org.apache.tika.utils.ServiceLoaderUtils; /** - * A composite parser based on all the {@link Parser} implementations - * available through the + * A composite parser based on all the {@link Parser} implementations available through the * {@link javax.imageio.spi.ServiceRegistry service provider mechanism}. * * @since Apache Tika 0.8 @@ -47,28 +43,27 @@ public class DefaultParser extends CompositeParser { private transient final ServiceLoader loader; public DefaultParser(MediaTypeRegistry registry, ServiceLoader loader, - Collection> excludeParsers, - EncodingDetector encodingDetector, Renderer renderer) { + Collection> excludeParsers, + EncodingDetector encodingDetector, Renderer renderer) { super(registry, getDefaultParsers(loader, encodingDetector, renderer, excludeParsers)); this.loader = loader; } public DefaultParser(MediaTypeRegistry registry, ServiceLoader loader, - Collection> excludeParsers) { - super(registry, - getDefaultParsers(loader, new DefaultEncodingDetector(loader), + Collection> excludeParsers) { + super(registry, getDefaultParsers(loader, new DefaultEncodingDetector(loader), new CompositeRenderer(loader), excludeParsers)); this.loader = loader; } public DefaultParser(MediaTypeRegistry registry, ServiceLoader loader, - EncodingDetector encodingDetector, Renderer renderer) { + EncodingDetector encodingDetector, Renderer renderer) { 
this(registry, loader, Collections.EMPTY_SET, encodingDetector, renderer); } public DefaultParser(MediaTypeRegistry registry, ServiceLoader loader) { this(registry, loader, Collections.EMPTY_SET, new DefaultEncodingDetector(loader), - new CompositeRenderer(loader)); + new CompositeRenderer(loader)); } public DefaultParser(MediaTypeRegistry registry, ClassLoader loader) { @@ -88,21 +83,17 @@ public DefaultParser() { } /** - * Finds all statically loadable parsers and sort the list by name, - * rather than discovery order. CompositeParser takes the last - * parser for any given media type, so put the Tika parsers first + * Finds all statically loadable parsers and sort the list by name, rather than discovery order. + * CompositeParser takes the last parser for any given media type, so put the Tika parsers first * so that non-Tika (user supplied) parsers can take precedence. * * @param loader service loader * @return ordered list of statically loadable parsers */ private static List getDefaultParsers(ServiceLoader loader, - EncodingDetector encodingDetector, - Renderer renderer, - Collection> - excludeParsers) { - List parsers = - loader.loadStaticServiceProviders(Parser.class, excludeParsers); + EncodingDetector encodingDetector, Renderer renderer, + Collection> excludeParsers) { + List parsers = loader.loadStaticServiceProviders(Parser.class, excludeParsers); if (encodingDetector != null) { for (Parser p : parsers) { @@ -115,14 +106,14 @@ private static List getDefaultParsers(ServiceLoader loader, } } ServiceLoaderUtils.sortLoadedClasses(parsers); - //reverse the order of parsers so that custom ones come last - //this will prevent them from being overwritten in getParsers(ParseContext ..) + // reverse the order of parsers so that custom ones come last + // this will prevent them from being overwritten in getParsers(ParseContext ..) 
Collections.reverse(parsers); return parsers; } - //recursively go through the parsers and set the encoding detector - //as configured in the config file + // recursively go through the parsers and set the encoding detector + // as configured in the config file private static void setEncodingDetector(Parser p, EncodingDetector encodingDetector) { if (p instanceof AbstractEncodingDetectorParser) { ((AbstractEncodingDetectorParser) p).setEncodingDetector(encodingDetector); diff --git a/tika-core/src/main/java/org/apache/tika/parser/DelegatingParser.java b/tika-core/src/main/java/org/apache/tika/parser/DelegatingParser.java index f2e007cfe9..a3872f5bfb 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/DelegatingParser.java +++ b/tika-core/src/main/java/org/apache/tika/parser/DelegatingParser.java @@ -1,48 +1,42 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser; import java.io.IOException; import java.io.InputStream; import java.util.Set; - -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; /** - * Base class for parser implementations that want to delegate parts of the - * task of parsing an input document to another parser. The delegate parser - * is looked up from the parsing context using the {@link Parser} class as - * the key. + * Base class for parser implementations that want to delegate parts of the task of parsing an input + * document to another parser. The delegate parser is looked up from the parsing context using the + * {@link Parser} class as the key. * * @since Apache Tika 0.4, major changes in Tika 0.5 */ public class DelegatingParser implements Parser { /** - * Returns the parser instance to which parsing tasks should be delegated. - * The default implementation looks up the delegate parser from the given - * parse context, and uses an {@link EmptyParser} instance as a fallback. - * Subclasses can override this method to implement alternative delegation - * strategies. + * Returns the parser instance to which parsing tasks should be delegated. The default + * implementation looks up the delegate parser from the given parse context, and uses an + * {@link EmptyParser} instance as a fallback. 
Subclasses can override this method to implement + * alternative delegation strategies. * * @param context parse context * @return delegate parser @@ -57,17 +51,15 @@ public Set getSupportedTypes(ParseContext context) { } /** - * Looks up the delegate parser from the parsing context and - * delegates the parse operation to it. If a delegate parser is not - * found, then an empty XHTML document is returned. + * Looks up the delegate parser from the parsing context and delegates the parse operation to + * it. If a delegate parser is not found, then an empty XHTML document is returned. *

- * Subclasses should override this method to parse the top level - * structure of the given document stream. Parsed sub-streams can - * be passed to this base class method to be parsed by the configured - * delegate parser. + * Subclasses should override this method to parse the top level structure of the given document + * stream. Parsed sub-streams can be passed to this base class method to be parsed by the + * configured delegate parser. */ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, - ParseContext context) throws SAXException, IOException, TikaException { + ParseContext context) throws SAXException, IOException, TikaException { getDelegateParser(context).parse(stream, handler, metadata, context); } diff --git a/tika-core/src/main/java/org/apache/tika/parser/DigestingParser.java b/tika-core/src/main/java/org/apache/tika/parser/DigestingParser.java index d0bcaa1f95..cd2174af1e 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/DigestingParser.java +++ b/tika-core/src/main/java/org/apache/tika/parser/DigestingParser.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser; @@ -23,10 +21,6 @@ import java.io.OutputStream; import java.nio.file.Files; import java.nio.file.Path; - -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.exception.TikaException; import org.apache.tika.extractor.DefaultEmbeddedStreamTranslator; import org.apache.tika.extractor.EmbeddedStreamTranslator; @@ -34,12 +28,16 @@ import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; public class DigestingParser extends ParserDecorator { - private final EmbeddedStreamTranslator embeddedStreamTranslator = new DefaultEmbeddedStreamTranslator(); + private final EmbeddedStreamTranslator embeddedStreamTranslator = + new DefaultEmbeddedStreamTranslator(); private final Digester digester; private final boolean skipContainerDocument; + /** * Creates a decorator for the given parser. 
* @@ -53,10 +51,10 @@ public DigestingParser(Parser parser, Digester digester, boolean skipContainerDo @Override public void parse(InputStream stream, ContentHandler handler, Metadata metadata, - ParseContext context) throws IOException, SAXException, TikaException { + ParseContext context) throws IOException, SAXException, TikaException { - if (! shouldDigest(metadata)) { + if (!shouldDigest(metadata)) { super.parse(stream, handler, metadata, context); return; } @@ -85,7 +83,7 @@ private boolean shouldDigest(Metadata metadata) { if (digester == null) { return false; } - if (! skipContainerDocument) { + if (!skipContainerDocument) { return true; } Integer parseDepth = metadata.getInt(TikaCoreProperties.EMBEDDED_DEPTH); @@ -96,34 +94,33 @@ private boolean shouldDigest(Metadata metadata) { } /** - * This is used in {@link AutoDetectParserConfig} to (optionally) - * wrap the parser in a digesting parser. + * This is used in {@link AutoDetectParserConfig} to (optionally) wrap the parser in a digesting + * parser. */ public interface DigesterFactory { Digester build(); + void setSkipContainerDocument(boolean skipContainerDocument); + boolean isSkipContainerDocument(); } - /** - * Interface for digester. See - * org.apache.parser.utils.CommonsDigester in tika-parsers for an + /** + * Interface for digester. See org.apache.parser.utils.CommonsDigester in tika-parsers for an * implementation. */ public interface Digester { /** - * Digests an InputStream and sets the appropriate value(s) in the metadata. - * The Digester is also responsible for marking and resetting the stream. + * Digests an InputStream and sets the appropriate value(s) in the metadata. The Digester is + * also responsible for marking and resetting the stream. *

- * The given stream is guaranteed to support the - * {@link InputStream#markSupported() mark feature} and the detector - * is expected to {@link InputStream#mark(int) mark} the stream before - * reading any bytes from it, and to {@link InputStream#reset() reset} - * the stream before returning. The stream must not be closed by the - * detector. + * The given stream is guaranteed to support the {@link InputStream#markSupported() mark + * feature} and the detector is expected to {@link InputStream#mark(int) mark} the stream + * before reading any bytes from it, and to {@link InputStream#reset() reset} the stream + * before returning. The stream must not be closed by the detector. * - * @param is InputStream to digest - * @param m Metadata to set the values for + * @param is InputStream to digest + * @param m Metadata to set the values for * @param parseContext ParseContext * @throws IOException */ diff --git a/tika-core/src/main/java/org/apache/tika/parser/EmptyParser.java b/tika-core/src/main/java/org/apache/tika/parser/EmptyParser.java index 546d0c2a71..7cf974a2d4 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/EmptyParser.java +++ b/tika-core/src/main/java/org/apache/tika/parser/EmptyParser.java @@ -1,36 +1,31 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. 
The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser; import java.io.InputStream; import java.util.Collections; import java.util.Set; - -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; /** - * Dummy parser that always produces an empty XHTML document without even - * attempting to parse the given document stream. Useful as a sentinel parser - * for unknown document types. + * Dummy parser that always produces an empty XHTML document without even attempting to parse the + * given document stream. Useful as a sentinel parser for unknown document types. 
*/ public class EmptyParser implements Parser { /** @@ -47,7 +42,7 @@ public Set getSupportedTypes(ParseContext context) { } public void parse(InputStream stream, ContentHandler handler, Metadata metadata, - ParseContext context) throws SAXException { + ParseContext context) throws SAXException { XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); xhtml.startDocument(); xhtml.endDocument(); diff --git a/tika-core/src/main/java/org/apache/tika/parser/ErrorParser.java b/tika-core/src/main/java/org/apache/tika/parser/ErrorParser.java index b8071cb52f..bc0e8a49dc 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/ErrorParser.java +++ b/tika-core/src/main/java/org/apache/tika/parser/ErrorParser.java @@ -1,35 +1,30 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser; import java.io.InputStream; import java.util.Collections; import java.util.Set; - -import org.xml.sax.ContentHandler; - import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; +import org.xml.sax.ContentHandler; /** - * Dummy parser that always throws a {@link TikaException} without even - * attempting to parse the given document stream. Useful as a sentinel parser - * for unknown document types. + * Dummy parser that always throws a {@link TikaException} without even attempting to parse the + * given document stream. Useful as a sentinel parser for unknown document types. */ public class ErrorParser implements Parser { /** @@ -43,7 +38,7 @@ public Set getSupportedTypes(ParseContext context) { } public void parse(InputStream stream, ContentHandler handler, Metadata metadata, - ParseContext context) throws TikaException { + ParseContext context) throws TikaException { throw new TikaException("Parse error"); } } diff --git a/tika-core/src/main/java/org/apache/tika/parser/NetworkParser.java b/tika-core/src/main/java/org/apache/tika/parser/NetworkParser.java index 822512d7e4..775bd504a7 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/NetworkParser.java +++ b/tika-core/src/main/java/org/apache/tika/parser/NetworkParser.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.parser; @@ -26,14 +24,8 @@ import java.net.URLConnection; import java.util.Collections; import java.util.Set; - import org.apache.commons.io.IOUtils; import org.apache.commons.io.input.CloseShieldInputStream; -import org.xml.sax.Attributes; -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; -import org.xml.sax.helpers.DefaultHandler; - import org.apache.tika.exception.TikaException; import org.apache.tika.io.TemporaryResources; import org.apache.tika.io.TikaInputStream; @@ -42,6 +34,10 @@ import org.apache.tika.sax.TaggedContentHandler; import org.apache.tika.sax.TeeContentHandler; import org.apache.tika.utils.XMLReaderUtils; +import org.xml.sax.Attributes; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; public class NetworkParser implements Parser { @@ -64,7 +60,7 @@ public Set getSupportedTypes(ParseContext context) { } public void parse(InputStream stream, ContentHandler handler, Metadata metadata, - ParseContext context) throws IOException, SAXException, TikaException { + ParseContext context) throws IOException, SAXException, TikaException { TemporaryResources tmp = new TemporaryResources(); try { TikaInputStream tis = TikaInputStream.get(stream, tmp, metadata); @@ -75,7 +71,7 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, } private void parse(TikaInputStream stream, ContentHandler handler, Metadata metadata, - ParseContext context) throws IOException, SAXException, TikaException { + ParseContext context) throws IOException, SAXException, TikaException { if ("telnet".equals(uri.getScheme())) { try (Socket socket = new Socket(uri.getHost(), uri.getPort())) { new ParsingTask(stream, new FilterOutputStream(socket.getOutputStream()) { @@ -91,8 +87,8 @@ public void close() throws IOException { connection.setDoOutput(true); connection.connect(); try (InputStream input = connection.getInputStream()) { - new 
ParsingTask(stream, connection.getOutputStream()) - .parse(CloseShieldInputStream.wrap(input), handler, metadata, context); + new ParsingTask(stream, connection.getOutputStream()).parse( + CloseShieldInputStream.wrap(input), handler, metadata, context); } } @@ -112,16 +108,14 @@ public ParsingTask(TikaInputStream input, OutputStream output) { } public void parse(InputStream stream, ContentHandler handler, Metadata metadata, - ParseContext context) throws IOException, SAXException, TikaException { + ParseContext context) throws IOException, SAXException, TikaException { Thread thread = new Thread(this, "Tika network parser"); thread.start(); - TaggedContentHandler tagged = - new TaggedContentHandler(handler); + TaggedContentHandler tagged = new TaggedContentHandler(handler); try { - XMLReaderUtils - .parseSAX(stream, new TeeContentHandler(tagged, new MetaHandler(metadata)), - context); + XMLReaderUtils.parseSAX(stream, + new TeeContentHandler(tagged, new MetaHandler(metadata)), context); } catch (SAXException e) { tagged.throwIfCauseOf(e); throw new TikaException("Invalid network parser output", e); @@ -141,7 +135,7 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, } } - //---------------------------------------------------------- + // ---------------------------------------------------------- public void run() { try { @@ -167,7 +161,7 @@ public MetaHandler(Metadata metadata) { @Override public void startElement(String uri, String localName, String qName, Attributes attributes) - throws SAXException { + throws SAXException { if ("http://www.w3.org/1999/xhtml".equals(uri) && "meta".equals(localName)) { String name = attributes.getValue("", "name"); String content = attributes.getValue("", "content"); diff --git a/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java b/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java index 25256a77f1..ce7355a228 100644 --- 
a/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java +++ b/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser; @@ -41,10 +39,9 @@ public class ParseContext implements Serializable { private final Map context = new HashMap<>(); /** - * Adds the given value to the context as an implementation of the given - * interface. 
+ * Adds the given value to the context as an implementation of the given interface. * - * @param key the interface implemented by the given value + * @param key the interface implemented by the given value * @param value the value to be added, or null to remove */ public void set(Class key, T value) { @@ -59,8 +56,7 @@ public void set(Class key, T value) { * Returns the object in this context that implements the given interface. * * @param key the interface implemented by the requested object - * @return the object that implements the given interface, - * or null if not found + * @return the object that implements the given interface, or null if not found */ @SuppressWarnings("unchecked") public T get(Class key) { @@ -68,13 +64,13 @@ public T get(Class key) { } /** - * Returns the object in this context that implements the given interface, - * or the given default value if such an object is not found. + * Returns the object in this context that implements the given interface, or the given default + * value if such an object is not found. 
* - * @param key the interface implemented by the requested object + * @param key the interface implemented by the requested object * @param defaultValue value to return if the requested object is not found - * @return the object that implements the given interface, - * or the given default value if not found + * @return the object that implements the given interface, or the given default value if not + * found */ public T get(Class key, T defaultValue) { T value = get(key); @@ -89,10 +85,9 @@ public boolean isEmpty() { return context.size() == 0; } - //this should really only be used for serialization + // this should really only be used for serialization public Set keySet() { - return Collections - .unmodifiableSet(context.keySet()); + return Collections.unmodifiableSet(context.keySet()); } @Override diff --git a/tika-core/src/main/java/org/apache/tika/parser/ParseRecord.java b/tika-core/src/main/java/org/apache/tika/parser/ParseRecord.java index ca0edc567c..875ce475bd 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/ParseRecord.java +++ b/tika-core/src/main/java/org/apache/tika/parser/ParseRecord.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser; @@ -20,18 +18,16 @@ import java.util.LinkedHashSet; import java.util.List; import java.util.Set; - import org.apache.tika.metadata.Metadata; /** - * Use this class to store exceptions, warnings and other information - * during the parse. This information is added to the parent's metadata - * after the parse by the {@link CompositeParser}. + * Use this class to store exceptions, warnings and other information during the parse. This + * information is added to the parent's metadata after the parse by the {@link CompositeParser}. 
*/ public class ParseRecord { - //hard limits so that specially crafted files - //don't cause an OOM + // hard limits so that specially crafted files + // don't cause an OOM private static int MAX_PARSERS = 100; private static final int MAX_EXCEPTIONS = 100; diff --git a/tika-core/src/main/java/org/apache/tika/parser/Parser.java b/tika-core/src/main/java/org/apache/tika/parser/Parser.java index 44882883a4..9a8860791d 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/Parser.java +++ b/tika-core/src/main/java/org/apache/tika/parser/Parser.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
+ * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser; @@ -20,13 +18,11 @@ import java.io.InputStream; import java.io.Serializable; import java.util.Set; - -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; /** * Tika parser interface. @@ -34,8 +30,8 @@ public interface Parser extends Serializable { /** - * Returns the set of media types supported by this parser when used - * with the given parse context. + * Returns the set of media types supported by this parser when used with the given parse + * context. * * @param context parse context * @return immutable set of media types @@ -44,26 +40,25 @@ public interface Parser extends Serializable { Set getSupportedTypes(ParseContext context); /** - * Parses a document stream into a sequence of XHTML SAX events. - * Fills in related document metadata in the given metadata object. + * Parses a document stream into a sequence of XHTML SAX events. Fills in related document + * metadata in the given metadata object. *

- * The given document stream is consumed but not closed by this method. - * The responsibility to close the stream remains on the caller. + * The given document stream is consumed but not closed by this method. The responsibility to + * close the stream remains on the caller. *

- * Information about the parsing context can be passed in the context - * parameter. See the parser implementations for the kinds of context - * information they expect. + * Information about the parsing context can be passed in the context parameter. See the parser + * implementations for the kinds of context information they expect. * - * @param stream the document stream (input) - * @param handler handler for the XHTML SAX events (output) + * @param stream the document stream (input) + * @param handler handler for the XHTML SAX events (output) * @param metadata document metadata (input and output) - * @param context parse context - * @throws IOException if the document stream could not be read - * @throws SAXException if the SAX events could not be processed + * @param context parse context + * @throws IOException if the document stream could not be read + * @throws SAXException if the SAX events could not be processed * @throws TikaException if the document could not be parsed * @since Apache Tika 0.5 */ void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) - throws IOException, SAXException, TikaException; + throws IOException, SAXException, TikaException; } diff --git a/tika-core/src/main/java/org/apache/tika/parser/ParserDecorator.java b/tika-core/src/main/java/org/apache/tika/parser/ParserDecorator.java index 32d6661fb3..3c8a3f1230 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/ParserDecorator.java +++ b/tika-core/src/main/java/org/apache/tika/parser/ParserDecorator.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser; @@ -21,24 +19,23 @@ import java.util.Collection; import java.util.HashSet; import java.util.Set; - -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; import org.apache.tika.mime.MediaTypeRegistry; import org.apache.tika.parser.multiple.AbstractMultipleParser.MetadataPolicy; import org.apache.tika.parser.multiple.FallbackParser; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; /** * Decorator base class for the {@link Parser} interface. - *

This class simply delegates all parsing calls to an underlying decorated - * parser instance. Subclasses can provide extra decoration by overriding the - * parse method. - *

To decorate several different parsers at the same time, wrap them in - * a {@link CompositeParser} instance first. + *

+ * This class simply delegates all parsing calls to an underlying decorated parser instance. + * Subclasses can provide extra decoration by overriding the parse method. + *

+ * To decorate several different parsers at the same time, wrap them in a {@link CompositeParser} + * instance first. */ public class ParserDecorator implements Parser { @@ -61,11 +58,11 @@ public ParserDecorator(Parser parser) { } /** - * Decorates the given parser so that it always claims to support - * parsing of the given media types. + * Decorates the given parser so that it always claims to support parsing of the given media + * types. * * @param parser the parser to be decorated - * @param types supported media types + * @param types supported media types * @return the decorated parser */ public static final Parser withTypes(Parser parser, final Set types) { @@ -85,10 +82,10 @@ public String getDecorationName() { } /** - * Decorates the given parser so that it never claims to support - * parsing of the given media types, but will work for all others. + * Decorates the given parser so that it never claims to support parsing of the given media + * types, but will work for all others. * - * @param parser the parser to be decorated + * @param parser the parser to be decorated * @param excludeTypes excluded/ignored media types * @return the decorated parser */ @@ -99,8 +96,7 @@ public static final Parser withoutTypes(Parser parser, final Set excl @Override public Set getSupportedTypes(ParseContext context) { // Get our own, writable copy of the types the parser supports - Set parserTypes = - new HashSet<>(super.getSupportedTypes(context)); + Set parserTypes = new HashSet<>(super.getSupportedTypes(context)); // Remove anything on our excludes list parserTypes.removeAll(excludeTypes); // Return whatever is left @@ -115,14 +111,14 @@ public String getDecorationName() { } /** - * Decorates the given parsers into a virtual parser, where they'll - * be tried in preference order until one works without error. + * Decorates the given parsers into a virtual parser, where they'll be tried in preference order + * until one works without error. 
* * @deprecated This has been replaced by {@link FallbackParser} */ @Deprecated public static final Parser withFallbacks(final Collection parsers, - final Set types) { + final Set types) { // Delegate to the new FallbackParser for now, until people upgrade // Keep old behaviour on metadata, which was to preseve all MediaTypeRegistry registry = MediaTypeRegistry.getDefaultRegistry(); @@ -135,21 +131,21 @@ public static final Parser withFallbacks(final Collection pars } /** - * Delegates the method call to the decorated parser. Subclasses should - * override this method (and use super.getSupportedTypes() - * to invoke the decorated parser) to implement extra decoration. + * Delegates the method call to the decorated parser. Subclasses should override this method + * (and use super.getSupportedTypes() to invoke the decorated parser) to implement + * extra decoration. */ public Set getSupportedTypes(ParseContext context) { return parser.getSupportedTypes(context); } /** - * Delegates the method call to the decorated parser. Subclasses should - * override this method (and use super.parse() to invoke - * the decorated parser) to implement extra decoration. + * Delegates the method call to the decorated parser. Subclasses should override this method + * (and use super.parse() to invoke the decorated parser) to implement extra + * decoration. 
*/ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, - ParseContext context) throws IOException, SAXException, TikaException { + ParseContext context) throws IOException, SAXException, TikaException { parser.parse(stream, handler, metadata, context); } diff --git a/tika-core/src/main/java/org/apache/tika/parser/ParserFactory.java b/tika-core/src/main/java/org/apache/tika/parser/ParserFactory.java index af541b0582..f06dab711d 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/ParserFactory.java +++ b/tika-core/src/main/java/org/apache/tika/parser/ParserFactory.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
+ * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser; @@ -20,10 +18,8 @@ import java.io.IOException; import java.util.Map; - -import org.xml.sax.SAXException; - import org.apache.tika.exception.TikaException; +import org.xml.sax.SAXException; public abstract class ParserFactory { diff --git a/tika-core/src/main/java/org/apache/tika/parser/ParserPostProcessor.java b/tika-core/src/main/java/org/apache/tika/parser/ParserPostProcessor.java index 308fa7ebc0..ebc307dc4e 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/ParserPostProcessor.java +++ b/tika-core/src/main/java/org/apache/tika/parser/ParserPostProcessor.java @@ -1,38 +1,33 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser; import java.io.IOException; import java.io.InputStream; - -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.sax.BodyContentHandler; import org.apache.tika.sax.TeeContentHandler; import org.apache.tika.utils.RegexUtils; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; /** - * Parser decorator that post-processes the results from a decorated parser. - * The post-processing takes care of filling in the "fulltext", "summary", - * and "outlinks" metadata entries based on the full text content returned by - * the decorated parser. + * Parser decorator that post-processes the results from a decorated parser. The post-processing + * takes care of filling in the "fulltext", "summary", and "outlinks" metadata entries based on the + * full text content returned by the decorated parser. */ public class ParserPostProcessor extends ParserDecorator { @@ -46,11 +41,10 @@ public ParserPostProcessor(Parser parser) { } /** - * Forwards the call to the delegated parser and post-processes the - * results as described above. 
+ * Forwards the call to the delegated parser and post-processes the results as described above. */ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, - ParseContext context) throws IOException, SAXException, TikaException { + ParseContext context) throws IOException, SAXException, TikaException { ContentHandler body = new BodyContentHandler(); ContentHandler tee = new TeeContentHandler(handler, body); super.parse(stream, tee, metadata, context); diff --git a/tika-core/src/main/java/org/apache/tika/parser/ParsingReader.java b/tika-core/src/main/java/org/apache/tika/parser/ParsingReader.java index fe98e746de..4133051ee4 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/ParsingReader.java +++ b/tika-core/src/main/java/org/apache/tika/parser/ParsingReader.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser; @@ -29,21 +27,18 @@ import java.nio.file.Files; import java.nio.file.Path; import java.util.concurrent.Executor; - -import org.xml.sax.ContentHandler; - import org.apache.tika.exception.ZeroByteFileException; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.sax.BodyContentHandler; +import org.xml.sax.ContentHandler; /** - * Reader for the text content from a given binary stream. This class - * uses a background parsing task with a {@link Parser} - * ({@link AutoDetectParser} by default) to parse the text content from - * a given input stream. The {@link BodyContentHandler} class and a pipe - * is used to convert the push-based SAX event stream to the pull-based - * character stream defined by the {@link Reader} interface. + * Reader for the text content from a given binary stream. This class uses a background parsing task + * with a {@link Parser} ({@link AutoDetectParser} by default) to parse the text content from a + * given input stream. The {@link BodyContentHandler} class and a pipe is used to convert the + * push-based SAX event stream to the pull-based character stream defined by the {@link Reader} + * interface. * * @since Apache Tika 0.2 */ @@ -96,11 +91,10 @@ public ParsingReader(InputStream stream) throws IOException { } /** - * Creates a reader for the text content of the given binary stream - * with the given name. + * Creates a reader for the text content of the given binary stream with the given name. 
* * @param stream binary stream - * @param name document name + * @param name document name * @throws IOException if the document can not be parsed */ public ParsingReader(InputStream stream, String name) throws IOException { @@ -113,7 +107,7 @@ public ParsingReader(InputStream stream, String name) throws IOException { * * @param path path * @throws FileNotFoundException if the given file does not exist - * @throws IOException if the document can not be parsed + * @throws IOException if the document can not be parsed */ public ParsingReader(Path path) throws IOException { this(Files.newInputStream(path), path.getFileName().toString()); @@ -124,7 +118,7 @@ public ParsingReader(Path path) throws IOException { * * @param file file * @throws FileNotFoundException if the given file does not exist - * @throws IOException if the document can not be parsed + * @throws IOException if the document can not be parsed * @see #ParsingReader(Path) */ public ParsingReader(File file) throws FileNotFoundException, IOException { @@ -132,21 +126,21 @@ public ParsingReader(File file) throws FileNotFoundException, IOException { } /** - * Creates a reader for the text content of the given binary stream - * with the given document metadata. The given parser is used for - * parsing. A new background thread is started for the parsing task. + * Creates a reader for the text content of the given binary stream with the given document + * metadata. The given parser is used for parsing. A new background thread is started for the + * parsing task. *

- * The created reader will be responsible for closing the given stream. - * The stream and any associated resources will be closed at or before - * the time when the {@link #close()} method is called on this reader. + * The created reader will be responsible for closing the given stream. The stream and any + * associated resources will be closed at or before the time when the {@link #close()} method is + * called on this reader. * - * @param parser parser instance - * @param stream binary stream + * @param parser parser instance + * @param stream binary stream * @param metadata document metadata * @throws IOException if the document can not be parsed */ public ParsingReader(Parser parser, InputStream stream, final Metadata metadata, - ParseContext context) throws IOException { + ParseContext context) throws IOException { this(parser, stream, metadata, context, command -> { String name = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY); if (name != null) { @@ -161,27 +155,26 @@ public ParsingReader(Parser parser, InputStream stream, final Metadata metadata, } /** - * Creates a reader for the text content of the given binary stream - * with the given document metadata. The given parser is used for the - * parsing task that is run with the given executor. The given executor - * must run the parsing task asynchronously in a separate thread, - * since the current thread must return to the caller that can then - * consume the parsed text through the {@link Reader} interface. + * Creates a reader for the text content of the given binary stream with the given document + * metadata. The given parser is used for the parsing task that is run with the given executor. + * The given executor must run the parsing task asynchronously in a separate thread, + * since the current thread must return to the caller that can then consume the parsed text + * through the {@link Reader} interface. *

- * The created reader will be responsible for closing the given stream. - * The stream and any associated resources will be closed at or before - * the time when the {@link #close()} method is called on this reader. + * The created reader will be responsible for closing the given stream. The stream and any + * associated resources will be closed at or before the time when the {@link #close()} method is + * called on this reader. * - * @param parser parser instance - * @param stream binary stream + * @param parser parser instance + * @param stream binary stream * @param metadata document metadata - * @param context parsing context + * @param context parsing context * @param executor executor for the parsing task * @throws IOException if the document can not be parsed * @since Apache Tika 0.4 */ public ParsingReader(Parser parser, InputStream stream, Metadata metadata, ParseContext context, - Executor executor) throws IOException { + Executor executor) throws IOException { this.parser = parser; PipedReader pipedReader = new PipedReader(); this.reader = new BufferedReader(pipedReader); @@ -203,8 +196,7 @@ public ParsingReader(Parser parser, InputStream stream, Metadata metadata, Parse } /** - * Utility method that returns a {@link Metadata} instance - * for a document with the given name. + * Utility method that returns a {@link Metadata} instance for a document with the given name. * * @param name resource name (or null) * @return metadata instance @@ -218,14 +210,14 @@ private static Metadata getMetadata(String name) { } /** - * Reads parsed text from the pipe connected to the parsing thread. - * Fails if the parsing thread has thrown an exception. + * Reads parsed text from the pipe connected to the parsing thread. Fails if the parsing thread + * has thrown an exception. 
* * @param cbuf character buffer - * @param off start offset within the buffer - * @param len maximum number of characters to read - * @throws IOException if the parsing thread has failed or - * if for some reason the pipe does not work properly + * @param off start offset within the buffer + * @param len maximum number of characters to read + * @throws IOException if the parsing thread has failed or if for some reason the pipe does not + * work properly */ @Override public int read(char[] cbuf, int off, int len) throws IOException { @@ -240,9 +232,9 @@ public int read(char[] cbuf, int off, int len) throws IOException { } /** - * Closes the read end of the pipe. If the parsing thread is still - * running, next write to the pipe will fail and cause the thread - * to stop. Thus there is no need to explicitly terminate the thread. + * Closes the read end of the pipe. If the parsing thread is still running, next write to the + * pipe will fail and cause the thread to stop. Thus there is no need to explicitly terminate + * the thread. * * @throws IOException if the pipe can not be closed */ @@ -257,10 +249,9 @@ public void close() throws IOException { private class ParsingTask implements Runnable { /** - * Parses the given binary stream and writes the text content - * to the write end of the pipe. Potential exceptions (including - * the one caused if the read end is closed unexpectedly) are - * stored before the input stream is closed and processing is stopped. + * Parses the given binary stream and writes the text content to the write end of the pipe. + * Potential exceptions (including the one caused if the read end is closed unexpectedly) + * are stored before the input stream is closed and processing is stopped. 
*/ public void run() { try { diff --git a/tika-core/src/main/java/org/apache/tika/parser/PasswordProvider.java b/tika-core/src/main/java/org/apache/tika/parser/PasswordProvider.java index b14baddcae..fc3332918c 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/PasswordProvider.java +++ b/tika-core/src/main/java/org/apache/tika/parser/PasswordProvider.java @@ -1,41 +1,35 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.parser; import org.apache.tika.metadata.Metadata; /** - * Interface for providing a password to a Parser for handling Encrypted - * and Password Protected Documents. - * An implementation of this should be set on the {@link ParseContext} - * supplied to {@link Parser#parse(java.io.InputStream, org.xml.sax.ContentHandler, - * Metadata, ParseContext)} - * to provide a way to get the document password. - * An implementation of this interface defines some specific selection - * or lookup criteria, to be applied against the document metadata passed - * to the {@link #getPassword(Metadata)} method. + * Interface for providing a password to a Parser for handling Encrypted and Password Protected + * Documents. An implementation of this should be set on the {@link ParseContext} supplied to + * {@link Parser#parse(java.io.InputStream, org.xml.sax.ContentHandler, Metadata, ParseContext)} to + * provide a way to get the document password. An implementation of this interface defines some + * specific selection or lookup criteria, to be applied against the document metadata passed to the + * {@link #getPassword(Metadata)} method. * * @since Apache Tika 1.1 */ public interface PasswordProvider { /** - * Looks up the password for a document with the given metadata, - * and returns it for the Parser. If no password is available - * for the document, will return null. + * Looks up the password for a document with the given metadata, and returns it for the Parser. + * If no password is available for the document, will return null. 
* * @param metadata document metadata * @return The document decryption password, or null if not known diff --git a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java index 07eee752ba..c51c12fd38 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java +++ b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. 
See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser; @@ -20,12 +18,7 @@ import java.io.InputStream; import java.util.Set; import java.util.concurrent.atomic.AtomicInteger; - import org.apache.commons.io.input.CloseShieldInputStream; -import org.xml.sax.Attributes; -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.exception.CorruptedFileException; import org.apache.tika.exception.EncryptedDocumentException; import org.apache.tika.exception.TikaException; @@ -45,32 +38,32 @@ import org.apache.tika.sax.WriteLimiter; import org.apache.tika.utils.ExceptionUtils; import org.apache.tika.utils.ParserUtils; +import org.xml.sax.Attributes; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; /** - * This is a helper class that wraps a parser in a recursive handler. - * It takes care of setting the embedded parser in the ParseContext - * and handling the embedded path calculations. + * This is a helper class that wraps a parser in a recursive handler. It takes care of setting the + * embedded parser in the ParseContext and handling the embedded path calculations. *

- * After parsing a document, call getMetadata() to retrieve a list of - * Metadata objects, one for each embedded resource. The first item - * in the list will contain the Metadata for the outer container file. + * After parsing a document, call getMetadata() to retrieve a list of Metadata objects, one for each + * embedded resource. The first item in the list will contain the Metadata for the outer container + * file. *

- * Content can also be extracted and stored in the {@link TikaCoreProperties#TIKA_CONTENT} field - * of a Metadata object. Select the type of content to be stored - * at initialization. + * Content can also be extracted and stored in the {@link TikaCoreProperties#TIKA_CONTENT} field of + * a Metadata object. Select the type of content to be stored at initialization. *

- * If a WriteLimitReachedException is encountered, the wrapper will stop - * processing the current resource, and it will not process - * any of the child resources for the given resource. However, it will try to - * parse as much as it can. If a WLRE is reached in the parent document, - * no child resources will be parsed. + * If a WriteLimitReachedException is encountered, the wrapper will stop processing the current + * resource, and it will not process any of the child resources for the given resource. However, it + * will try to parse as much as it can. If a WLRE is reached in the parent document, no child + * resources will be parsed. *

- * The implementation is based on Jukka's RecursiveMetadataParser - * and Nick's additions. See: - * RecursiveMetadataParser. + * The implementation is based on Jukka's RecursiveMetadataParser and Nick's additions. See: + * RecursiveMetadataParser. *

- * Note that this wrapper holds all data in memory and is not appropriate - * for files with content too large to be held in memory. + * Note that this wrapper holds all data in memory and is not appropriate for files with content too + * large to be held in memory. *

* The unit tests for this class are in the tika-parsers module. *

@@ -86,9 +79,10 @@ public class RecursiveParserWrapper extends ParserDecorator { private final boolean catchEmbeddedExceptions; private final boolean inlineContent = false; + /** - * Initialize the wrapper with {@link #catchEmbeddedExceptions} set - * to true as default. + * Initialize the wrapper with {@link #catchEmbeddedExceptions} set to true as + * default. * * @param wrappedParser parser to use for the container documents and the embedded documents */ @@ -97,12 +91,11 @@ public RecursiveParserWrapper(Parser wrappedParser) { } /** - * @param wrappedParser parser to wrap - * @param catchEmbeddedExceptions whether or not to catch+record embedded exceptions. - * If set to false, embedded exceptions will be - * thrown and the rest of the file will not be parsed. The - * following will not be ignored: - * {@link CorruptedFileException}, {@link RuntimeException} + * @param wrappedParser parser to wrap + * @param catchEmbeddedExceptions whether or not to catch+record embedded exceptions. If set to + * false, embedded exceptions will be thrown and the rest of the file will + * not be parsed. 
The following will not be ignored: {@link CorruptedFileException}, + * {@link RuntimeException} */ public RecursiveParserWrapper(Parser wrappedParser, boolean catchEmbeddedExceptions) { super(wrappedParser); @@ -119,7 +112,7 @@ public Set getSupportedTypes(ParseContext context) { /** * @param stream * @param recursiveParserWrapperHandler -- handler must implement - * {@link RecursiveParserWrapperHandler} + * {@link RecursiveParserWrapperHandler} * @param metadata * @param context * @throws IOException @@ -129,22 +122,22 @@ public Set getSupportedTypes(ParseContext context) { */ @Override public void parse(InputStream stream, ContentHandler recursiveParserWrapperHandler, - Metadata metadata, ParseContext context) - throws IOException, SAXException, TikaException { - //this tracks the state of the parent parser, per call to #parse + Metadata metadata, ParseContext context) + throws IOException, SAXException, TikaException { + // this tracks the state of the parent parser, per call to #parse ParserState parserState; if (recursiveParserWrapperHandler instanceof AbstractRecursiveParserWrapperHandler) { parserState = new ParserState( - (AbstractRecursiveParserWrapperHandler) recursiveParserWrapperHandler); + (AbstractRecursiveParserWrapperHandler) recursiveParserWrapperHandler); } else { throw new IllegalStateException( - "ContentHandler must implement RecursiveParserWrapperHandler"); + "ContentHandler must implement RecursiveParserWrapperHandler"); } EmbeddedParserDecorator decorator = - new EmbeddedParserDecorator(getWrappedParser(), "/", "/", parserState); + new EmbeddedParserDecorator(getWrappedParser(), "/", "/", parserState); context.set(Parser.class, decorator); ContentHandler localHandler = - parserState.recursiveParserWrapperHandler.getNewContentHandler(); + parserState.recursiveParserWrapperHandler.getNewContentHandler(); long started = System.currentTimeMillis(); parserState.recursiveParserWrapperHandler.startDocument(); TemporaryResources tmp = new 
TemporaryResources(); @@ -153,17 +146,19 @@ public void parse(InputStream stream, ContentHandler recursiveParserWrapperHandl if (recursiveParserWrapperHandler instanceof AbstractRecursiveParserWrapperHandler) { ContentHandlerFactory factory = - ((AbstractRecursiveParserWrapperHandler)recursiveParserWrapperHandler).getContentHandlerFactory(); + ((AbstractRecursiveParserWrapperHandler) recursiveParserWrapperHandler) + .getContentHandlerFactory(); if (factory instanceof WriteLimiter) { - writeLimit = ((WriteLimiter)factory).getWriteLimit(); - throwOnWriteLimitReached = ((WriteLimiter)factory).isThrowOnWriteLimitReached(); + writeLimit = ((WriteLimiter) factory).getWriteLimit(); + throwOnWriteLimitReached = ((WriteLimiter) factory).isThrowOnWriteLimitReached(); } } try { TikaInputStream tis = TikaInputStream.get(stream, tmp, metadata); RecursivelySecureContentHandler secureContentHandler = - new RecursivelySecureContentHandler(localHandler, tis, new SecureHandlerCounter(writeLimit), - throwOnWriteLimitReached, context); + new RecursivelySecureContentHandler(localHandler, tis, + new SecureHandlerCounter(writeLimit), + throwOnWriteLimitReached, context); context.set(RecursivelySecureContentHandler.class, secureContentHandler); getWrappedParser().parse(tis, secureContentHandler, metadata, context); } catch (Throwable e) { @@ -198,8 +193,8 @@ public static String getResourceName(Metadata metadata, AtomicInteger counter) { } else { objectName = "embedded-" + counter.incrementAndGet(); } - //make sure that there isn't any path info in the objectName - //some parsers can return paths, not just file names + // make sure that there isn't any path info in the objectName + // some parsers can return paths, not just file names objectName = FilenameUtils.getName(objectName); return objectName; } @@ -214,8 +209,8 @@ private class EmbeddedParserDecorator extends StatefulParser { private String embeddedIdPath = null; - private EmbeddedParserDecorator(Parser parser, String location, - 
String embeddedIdPath, ParserState parseState) { + private EmbeddedParserDecorator(Parser parser, String location, String embeddedIdPath, + ParserState parseState) { super(parser); this.location = location; if (!this.location.endsWith("/")) { @@ -227,9 +222,9 @@ private EmbeddedParserDecorator(Parser parser, String location, @Override public void parse(InputStream stream, ContentHandler ignore, Metadata metadata, - ParseContext context) throws IOException, SAXException, TikaException { + ParseContext context) throws IOException, SAXException, TikaException { - //Test to see if we should avoid parsing + // Test to see if we should avoid parsing if (parserState.recursiveParserWrapperHandler.hasHitMaximumEmbeddedResources()) { return; } @@ -239,25 +234,24 @@ public void parse(InputStream stream, ContentHandler ignore, Metadata metadata, metadata.add(TikaCoreProperties.EMBEDDED_RESOURCE_PATH, objectLocation); - String idPath = - this.embeddedIdPath.equals("/") ? - this.embeddedIdPath + ++parserState.embeddedCount : - this.embeddedIdPath + "/" + ++parserState.embeddedCount; + String idPath = this.embeddedIdPath.equals("/") + ? 
this.embeddedIdPath + ++parserState.embeddedCount + : this.embeddedIdPath + "/" + ++parserState.embeddedCount; metadata.add(TikaCoreProperties.EMBEDDED_ID_PATH, idPath); metadata.set(TikaCoreProperties.EMBEDDED_ID, parserState.embeddedCount); - //get a fresh handler + // get a fresh handler ContentHandler localHandler = - parserState.recursiveParserWrapperHandler.getNewContentHandler(); + parserState.recursiveParserWrapperHandler.getNewContentHandler(); parserState.recursiveParserWrapperHandler.startEmbeddedDocument(localHandler, metadata); Parser preContextParser = context.get(Parser.class); - context.set(Parser.class, - new EmbeddedParserDecorator(getWrappedParser(), objectLocation, - idPath, parserState)); + context.set(Parser.class, new EmbeddedParserDecorator(getWrappedParser(), + objectLocation, idPath, parserState)); long started = System.currentTimeMillis(); - //store the handler that was used before this parse - //so that you can return it back to its state at the end of this parse - RecursivelySecureContentHandler preParseHandler = context.get(RecursivelySecureContentHandler.class); + // store the handler that was used before this parse + // so that you can return it back to its state at the end of this parse + RecursivelySecureContentHandler preParseHandler = + context.get(RecursivelySecureContentHandler.class); ParentContentHandler preParseParentHandler = context.get(ParentContentHandler.class); context.set(ParentContentHandler.class, new ParentContentHandler(preParseHandler)); @@ -267,9 +261,9 @@ public void parse(InputStream stream, ContentHandler ignore, Metadata metadata, tmp = new TemporaryResources(); tis = TikaInputStream.get(CloseShieldInputStream.wrap(stream), tmp, metadata); } - ContentHandler secureContentHandler = - new RecursivelySecureContentHandler(localHandler, tis, preParseHandler.handlerCounter, - preParseHandler.throwOnWriteLimitReached, context); + ContentHandler secureContentHandler = new 
RecursivelySecureContentHandler(localHandler, + tis, preParseHandler.handlerCounter, + preParseHandler.throwOnWriteLimitReached, context); try { tis.setCloseShield(); @@ -291,9 +285,9 @@ public void parse(InputStream stream, ContentHandler ignore, Metadata metadata, if (e instanceof EncryptedDocumentException) { metadata.set(TikaCoreProperties.IS_ENCRYPTED, true); } - if (context.get(ZeroByteFileException.IgnoreZeroByteFileException.class) != null && - e instanceof ZeroByteFileException) { - //do nothing + if (context.get(ZeroByteFileException.IgnoreZeroByteFileException.class) != null + && e instanceof ZeroByteFileException) { + // do nothing } else if (catchEmbeddedExceptions) { ParserUtils.recordParserFailure(this, e, metadata); } else { @@ -306,8 +300,8 @@ public void parse(InputStream stream, ContentHandler ignore, Metadata metadata, context.set(ParentContentHandler.class, preParseParentHandler); long elapsedMillis = System.currentTimeMillis() - started; metadata.set(TikaCoreProperties.PARSE_TIME_MILLIS, Long.toString(elapsedMillis)); - parserState.recursiveParserWrapperHandler - .endEmbeddedDocument(localHandler, metadata); + parserState.recursiveParserWrapperHandler.endEmbeddedDocument(localHandler, + metadata); if (tmp != null) { tis.close(); } @@ -316,13 +310,14 @@ public void parse(InputStream stream, ContentHandler ignore, Metadata metadata, } /** - * This tracks the state of the parse of a single document. - * In future versions, this will allow the RecursiveParserWrapper to be thread safe. + * This tracks the state of the parse of a single document. In future versions, this will allow + * the RecursiveParserWrapper to be thread safe. 
*/ private static class ParserState { private final AbstractRecursiveParserWrapperHandler recursiveParserWrapperHandler; private AtomicInteger unknownCount = new AtomicInteger(0); - private int embeddedCount = 0;//this is effectively 1-indexed + private int embeddedCount = 0;// this is effectively 1-indexed + private ParserState(AbstractRecursiveParserWrapperHandler handler) { this.recursiveParserWrapperHandler = handler; } @@ -331,20 +326,23 @@ private ParserState(AbstractRecursiveParserWrapperHandler handler) { static class SecureHandlerCounter { private final int totalWriteLimit; private boolean writeLimitReached = false; - //total chars written to all handlers + // total chars written to all handlers private int totalChars = 0; private SecureHandlerCounter(int totalWriteLimit) { this.totalWriteLimit = totalWriteLimit; } + /** * Given the requested length, how many characters are actually available + * * @param length * @return */ int getAvailable(int length) { return Math.min(totalWriteLimit - totalChars, length); } + void addChars(int numChars) { totalChars += numChars; } @@ -364,8 +362,8 @@ static class RecursivelySecureContentHandler extends SecureContentHandler { private final int id = COUNTER.getAndIncrement(); public RecursivelySecureContentHandler(ContentHandler handler, TikaInputStream stream, - SecureHandlerCounter handlerCounter, - boolean throwOnWriteLimitReached, ParseContext parseContext) { + SecureHandlerCounter handlerCounter, boolean throwOnWriteLimitReached, + ParseContext parseContext) { super(handler, stream); this.handler = handler; this.handlerCounter = handlerCounter; @@ -376,10 +374,9 @@ public RecursivelySecureContentHandler(ContentHandler handler, TikaInputStream s /** * Bypass the SecureContentHandler... *

- * This handler only looks at zip bomb via zip expansion. - * Users should be protected within entries against nested - * nested xml entities. We don't want to carry - * those stats _across_ entries. + * This handler only looks at zip bomb via zip expansion. Users should be protected within + * entries against nested nested xml entities. We don't want to carry those stats _across_ + * entries. * * @param uri * @param localName @@ -389,7 +386,7 @@ public RecursivelySecureContentHandler(ContentHandler handler, TikaInputStream s */ @Override public void startElement(String uri, String localName, String name, Attributes atts) - throws SAXException { + throws SAXException { this.handler.startElement(uri, localName, name, atts); } diff --git a/tika-core/src/main/java/org/apache/tika/parser/RegexCaptureParser.java b/tika-core/src/main/java/org/apache/tika/parser/RegexCaptureParser.java index 412673b70e..ad5bed1d5c 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/RegexCaptureParser.java +++ b/tika-core/src/main/java/org/apache/tika/parser/RegexCaptureParser.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser; @@ -28,10 +26,6 @@ import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; - -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.config.Field; import org.apache.tika.config.Initializable; import org.apache.tika.config.InitializableProblemHandler; @@ -40,11 +34,13 @@ import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; public class RegexCaptureParser implements Parser, Initializable { private static final Set SUPPORTED_TYPES = - Collections.singleton(MediaType.TEXT_PLAIN); + Collections.singleton(MediaType.TEXT_PLAIN); private Map captureMap = new HashMap<>(); private Map matchMap = new HashMap<>(); @@ -56,7 +52,7 @@ public void initialize(Map params) throws TikaConfigException { @Override public void checkInitialization(InitializableProblemHandler problemHandler) - throws TikaConfigException { + throws TikaConfigException { } @@ -69,9 +65,9 @@ public Set getSupportedTypes(ParseContext context) { @Override public void parse(InputStream 
stream, ContentHandler handler, Metadata metadata, - ParseContext context) throws IOException, SAXException, TikaException { - try (BufferedReader reader = new BufferedReader(new InputStreamReader(stream, - StandardCharsets.UTF_8))) { + ParseContext context) throws IOException, SAXException, TikaException { + try (BufferedReader reader = + new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8))) { String line = reader.readLine(); Map localCaptureMap = new HashMap(); for (Map.Entry e : captureMap.entrySet()) { diff --git a/tika-core/src/main/java/org/apache/tika/parser/RenderingParser.java b/tika-core/src/main/java/org/apache/tika/parser/RenderingParser.java index 0daae6be12..02f8b42233 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/RenderingParser.java +++ b/tika-core/src/main/java/org/apache/tika/parser/RenderingParser.java @@ -1,19 +1,18 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */package org.apache.tika.parser; + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. + */ +package org.apache.tika.parser; import org.apache.tika.renderer.Renderer; diff --git a/tika-core/src/main/java/org/apache/tika/parser/StatefulParser.java b/tika-core/src/main/java/org/apache/tika/parser/StatefulParser.java index 0fb657b4dd..bdc3af59d2 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/StatefulParser.java +++ b/tika-core/src/main/java/org/apache/tika/parser/StatefulParser.java @@ -1,30 +1,25 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. 
The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser; /** - * The RecursiveParserWrapper wraps the parser sent - * into the parsecontext and then uses that parser + * The RecursiveParserWrapper wraps the parser sent into the parsecontext and then uses that parser * to store state (among many other things). *

- * There are some use cases where regular parsers - * want to parse content inline (e.g. OCR), and their - * output should not be treated as coming from an embedded - * object. + * There are some use cases where regular parsers want to parse content inline (e.g. OCR), and their + * output should not be treated as coming from an embedded object. **/ public class StatefulParser extends ParserDecorator { diff --git a/tika-core/src/main/java/org/apache/tika/parser/digest/CompositeDigester.java b/tika-core/src/main/java/org/apache/tika/parser/digest/CompositeDigester.java index ee4dfe233d..4aa8502831 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/digest/CompositeDigester.java +++ b/tika-core/src/main/java/org/apache/tika/parser/digest/CompositeDigester.java @@ -1,25 +1,22 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.digest; import java.io.IOException; import java.io.InputStream; - import org.apache.tika.exception.TikaException; import org.apache.tika.io.TemporaryResources; import org.apache.tika.io.TikaInputStream; diff --git a/tika-core/src/main/java/org/apache/tika/parser/digest/InputStreamDigester.java b/tika-core/src/main/java/org/apache/tika/parser/digest/InputStreamDigester.java index c3e4fde2cb..8f8a9864ff 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/digest/InputStreamDigester.java +++ b/tika-core/src/main/java/org/apache/tika/parser/digest/InputStreamDigester.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.digest; @@ -24,7 +22,6 @@ import java.security.MessageDigest; import java.security.NoSuchAlgorithmException; import java.security.Provider; - import org.apache.tika.exception.TikaException; import org.apache.tika.io.BoundedInputStream; import org.apache.tika.io.TemporaryResources; @@ -47,19 +44,16 @@ public InputStreamDigester(int markLimit, String algorithm, DigestingParser.Enco } /** - * @param markLimit limit in bytes to allow for mark/reset. If the inputstream is longer - * than this limit, the stream will be reset and then spooled to a - * temporary file. - * Throws IllegalArgumentException if < 0. - * @param algorithm name of the digest algorithm to retrieve from the Provider - * @param algorithmKeyName name of the algorithm to store - * as part of the key in the metadata - * when {@link #digest(InputStream, Metadata, ParseContext)} is called - * @param encoder encoder to convert the byte array returned from the digester to a - * string + * @param markLimit limit in bytes to allow for mark/reset. If the inputstream is longer than + * this limit, the stream will be reset and then spooled to a temporary file. Throws + * IllegalArgumentException if < 0. 
+ * @param algorithm name of the digest algorithm to retrieve from the Provider + * @param algorithmKeyName name of the algorithm to store as part of the key in the metadata + * when {@link #digest(InputStream, Metadata, ParseContext)} is called + * @param encoder encoder to convert the byte array returned from the digester to a string */ public InputStreamDigester(int markLimit, String algorithm, String algorithmKeyName, - DigestingParser.Encoder encoder) { + DigestingParser.Encoder encoder) { this.algorithm = algorithm; this.algorithmKeyName = algorithmKeyName; this.encoder = encoder; @@ -73,8 +67,8 @@ public InputStreamDigester(int markLimit, String algorithm, String algorithmKeyN /** * Copied from commons-codec */ - private static MessageDigest updateDigest(MessageDigest digest, InputStream data, Metadata metadata) - throws IOException { + private static MessageDigest updateDigest(MessageDigest digest, InputStream data, + Metadata metadata) throws IOException { byte[] buffer = new byte[1024]; long total = 0; for (int read = data.read(buffer, 0, 1024); read > -1; read = data.read(buffer, 0, 1024)) { @@ -87,7 +81,7 @@ private static MessageDigest updateDigest(MessageDigest digest, InputStream data private static void setContentLength(long length, Metadata metadata) { if (StringUtils.isBlank(metadata.get(Metadata.CONTENT_LENGTH))) { - //only add it if it hasn't been populated already + // only add it if it hasn't been populated already metadata.set(Metadata.CONTENT_LENGTH, Long.toString(length)); } } @@ -106,36 +100,35 @@ private MessageDigest newMessageDigest() { } /** - * When subclassing this, becare to ensure that your provider is - * thread-safe (not likely) or return a new provider with each call. + * When subclassing this, becare to ensure that your provider is thread-safe (not likely) or + * return a new provider with each call. * - * @return provider to use to get the MessageDigest from the algorithm name. - * Default is to return null. 
+ * @return provider to use to get the MessageDigest from the algorithm name. Default is to + * return null. */ protected Provider getProvider() { return null; } /** - * @param is InputStream to digest. Best to use a TikaInputStream because - * of potential need to spool to disk. InputStream must - * support mark/reset. - * @param metadata metadata in which to store the digest information + * @param is InputStream to digest. Best to use a TikaInputStream because of potential need to + * spool to disk. InputStream must support mark/reset. + * @param metadata metadata in which to store the digest information * @param parseContext ParseContext -- not actually used yet, but there for future expansion * @throws IOException on IO problem or IllegalArgumentException if algorithm couldn't be found */ @Override public void digest(InputStream is, Metadata metadata, ParseContext parseContext) - throws IOException { + throws IOException { TikaInputStream tis = TikaInputStream.cast(is); if (tis != null && tis.hasFile()) { long sz = -1; if (tis.hasFile()) { sz = tis.getLength(); } - //if the inputstream has a file, - //and its size is greater than its mark limit, - //just digest the underlying file. + // if the inputstream has a file, + // and its size is greater than its mark limit, + // just digest the underlying file. if (sz > markLimit) { digestFile(tis.getFile(), sz, metadata); return; @@ -143,9 +136,9 @@ public void digest(InputStream is, Metadata metadata, ParseContext parseContext) } - //try the usual mark/reset stuff. - //however, if you actually hit the bound, - //then stop and spool to file via TikaInputStream + // try the usual mark/reset stuff. 
+ // however, if you actually hit the bound, + // then stop and spool to file via TikaInputStream BoundedInputStream bis = new BoundedInputStream(markLimit, is); boolean finishedStream = false; bis.mark(markLimit + 1); @@ -154,8 +147,8 @@ public void digest(InputStream is, Metadata metadata, ParseContext parseContext) if (finishedStream) { return; } - //if the stream wasn't finished -- if the stream was longer than the mark limit -- - //spool to File and digest that. + // if the stream wasn't finished -- if the stream was longer than the mark limit -- + // spool to File and digest that. if (tis != null) { digestFile(tis.getFile(), -1, metadata); } else { @@ -174,12 +167,12 @@ public void digest(InputStream is, Metadata metadata, ParseContext parseContext) } private String getMetadataKey() { - return TikaCoreProperties.TIKA_META_PREFIX + "digest" + - TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + algorithmKeyName; + return TikaCoreProperties.TIKA_META_PREFIX + "digest" + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + algorithmKeyName; } private void digestFile(File f, long sz, Metadata m) throws IOException { - //only add it if it hasn't been populated already + // only add it if it hasn't been populated already if (StringUtils.isBlank(m.get(Metadata.CONTENT_LENGTH))) { if (sz < 0) { sz = f.length(); @@ -192,7 +185,7 @@ private void digestFile(File f, long sz, Metadata m) throws IOException { } /** - * @param is input stream to read from + * @param is input stream to read from * @param metadata metadata for reporting the digest * @return whether or not this finished the input stream * @throws IOException diff --git a/tika-core/src/main/java/org/apache/tika/parser/external/CompositeExternalParser.java b/tika-core/src/main/java/org/apache/tika/parser/external/CompositeExternalParser.java index 53cb7b7eac..f7c6a7393b 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/external/CompositeExternalParser.java +++ 
b/tika-core/src/main/java/org/apache/tika/parser/external/CompositeExternalParser.java @@ -1,34 +1,30 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.parser.external; import java.io.IOException; import java.util.List; - import org.apache.tika.exception.TikaException; import org.apache.tika.mime.MediaTypeRegistry; import org.apache.tika.parser.CompositeParser; import org.apache.tika.parser.Parser; /** - * A Composite Parser that wraps up all the available External Parsers, - * and provides an easy way to access them. - * Parser that uses an external program (like catdoc or pdf2txt) to extract - * text content and metadata from a given document. + * A Composite Parser that wraps up all the available External Parsers, and provides an easy way to + * access them. Parser that uses an external program (like catdoc or pdf2txt) to extract text + * content and metadata from a given document. */ public class CompositeExternalParser extends CompositeParser { private static final long serialVersionUID = 6962436916649024024L; diff --git a/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParser.java b/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParser.java index 429258e461..38ac4bbc23 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParser.java +++ b/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParser.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. 
The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.external; @@ -35,14 +33,8 @@ import java.util.concurrent.TimeoutException; import java.util.regex.Matcher; import java.util.regex.Pattern; - import org.apache.commons.io.IOUtils; import org.apache.commons.io.output.NullOutputStream; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.exception.TikaException; import org.apache.tika.io.TemporaryResources; import org.apache.tika.io.TikaInputStream; @@ -51,29 +43,31 @@ import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.sax.XHTMLContentHandler; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; /** - * Parser that uses an external program (like catdoc or pdf2txt) to extract - * text content and metadata from a given document. 
+ * Parser that uses an external program (like catdoc or pdf2txt) to extract text content and + * metadata from a given document. */ public class ExternalParser implements Parser { private static final Logger LOG = LoggerFactory.getLogger(ExternalParser.class); /** - * The token, which if present in the Command string, will - * be replaced with the input filename. + * The token, which if present in the Command string, will be replaced with the input filename. * Alternately, the input data can be streamed over STDIN. */ public static final String INPUT_FILE_TOKEN = "${INPUT}"; /** - * The token, which if present in the Command string, will - * be replaced with the output filename. + * The token, which if present in the Command string, will be replaced with the output filename. * Alternately, the output data can be collected on STDOUT. */ public static final String OUTPUT_FILE_TOKEN = "${OUTPUT}"; private static final long serialVersionUID = -1079128990650687037L; - //make this parameterizable + // make this parameterizable private final long timeoutMs = 60000; /** * Media types supported by the external program. @@ -81,8 +75,7 @@ public class ExternalParser implements Parser { private Set supportedTypes = Collections.emptySet(); /** - * Regular Expressions to run over STDOUT to - * extract Metadata. + * Regular Expressions to run over STDOUT to extract Metadata. */ private Map metadataPatterns = null; /** @@ -90,18 +83,17 @@ public class ExternalParser implements Parser { * * @see Runtime#exec(String[]) */ - private String[] command = new String[]{"cat"}; + private String[] command = new String[] {"cat"}; /** * A consumer for ignored Lines */ private LineConsumer ignoredLineConsumer = LineConsumer.NULL; /** - * Starts a thread that reads and discards the contents of the - * standard stream of the given process. Potential exceptions - * are ignored, and the stream is closed once fully processed. 
- * Note: calling this starts a new thread and blocks the current(caller) - * thread until the new thread dies + * Starts a thread that reads and discards the contents of the standard stream of the given + * process. Potential exceptions are ignored, and the stream is closed once fully processed. + * Note: calling this starts a new thread and blocks the current(caller) thread until the new + * thread dies * * @param stream stream to be ignored */ @@ -110,13 +102,12 @@ private static void ignoreStream(final InputStream stream) { } /** - * Starts a thread that reads and discards the contents of the - * standard stream of the given process. Potential exceptions - * are ignored, and the stream is closed once fully processed. + * Starts a thread that reads and discards the contents of the standard stream of the given + * process. Potential exceptions are ignored, and the stream is closed once fully processed. * - * @param stream stream to sent to black hole (a k a null) - * @param waitForDeath when {@code true} the caller thread will be - * blocked till the death of new thread. + * @param stream stream to sent to black hole (a k a null) + * @param waitForDeath when {@code true} the caller thread will be blocked till the death of new + * thread. * @return The thread that is created and started */ private static Thread ignoreStream(final InputStream stream, boolean waitForDeath) { @@ -124,7 +115,7 @@ private static Thread ignoreStream(final InputStream stream, boolean waitForDeat try { IOUtils.copy(stream, NullOutputStream.INSTANCE); } catch (IOException e) { - //swallow + // swallow } finally { IOUtils.closeQuietly(stream); } @@ -140,20 +131,19 @@ private static Thread ignoreStream(final InputStream stream, boolean waitForDeat } /** - * Checks to see if the command can be run. Typically used with - * something like "myapp --version" to check to see if "myapp" - * is installed and on the path. + * Checks to see if the command can be run. 
Typically used with something like "myapp --version" + * to check to see if "myapp" is installed and on the path. * - * @param checkCmd The check command to run + * @param checkCmd The check command to run * @param errorValue What is considered an error value? */ public static boolean check(String checkCmd, int... errorValue) { - return check(new String[]{checkCmd}, errorValue); + return check(new String[] {checkCmd}, errorValue); } public static boolean check(String[] checkCmd, int... errorValue) { if (errorValue.length == 0) { - errorValue = new int[]{127}; + errorValue = new int[] {127}; } Process process = null; @@ -163,7 +153,7 @@ public static boolean check(String[] checkCmd, int... errorValue) { Thread stdOutSuckerThread = ignoreStream(process.getInputStream(), false); stdErrSuckerThread.join(); stdOutSuckerThread.join(); - //make the timeout parameterizable + // make the timeout parameterizable boolean finished = process.waitFor(60000, TimeUnit.MILLISECONDS); if (!finished) { throw new TimeoutException(); @@ -184,14 +174,14 @@ public static boolean check(String[] checkCmd, int... errorValue) { // External process execution is banned by the security manager throw se; } catch (Error err) { - if (err.getMessage() != null && (err.getMessage().contains("posix_spawn") || - err.getMessage().contains("UNIXProcess"))) { + if (err.getMessage() != null && (err.getMessage().contains("posix_spawn") + || err.getMessage().contains("UNIXProcess"))) { LOG.debug("(TIKA-1526): exception trying to run: " + checkCmd[0], err); - //"Error forking command due to JVM locale bug - //(see TIKA-1526 and SOLR-6387)" + // "Error forking command due to JVM locale bug + // (see TIKA-1526 and SOLR-6387)" return false; } - //throw if a different kind of error + // throw if a different kind of error throw err; } finally { if (process != null) { @@ -217,9 +207,8 @@ public String[] getCommand() { } /** - * Sets the command to be run. 
This can include either of - * {@link #INPUT_FILE_TOKEN} or {@link #OUTPUT_FILE_TOKEN} - * if the command needs filenames. + * Sets the command to be run. This can include either of {@link #INPUT_FILE_TOKEN} or + * {@link #OUTPUT_FILE_TOKEN} if the command needs filenames. * * @see Runtime#exec(String[]) */ @@ -250,23 +239,20 @@ public Map getMetadataExtractionPatterns() { } /** - * Sets the map of regular expression patterns and Metadata - * keys. Any matching patterns will have the matching - * metadata entries set. - * Set this to null to disable Metadata extraction. + * Sets the map of regular expression patterns and Metadata keys. Any matching patterns will + * have the matching metadata entries set. Set this to null to disable Metadata extraction. */ public void setMetadataExtractionPatterns(Map patterns) { this.metadataPatterns = patterns; } /** - * Executes the configured external command and passes the given document - * stream as a simple XHTML document to the given SAX content handler. - * Metadata is only extracted if {@link #setMetadataExtractionPatterns(Map)} - * has been called to set patterns. + * Executes the configured external command and passes the given document stream as a simple + * XHTML document to the given SAX content handler. Metadata is only extracted if + * {@link #setMetadataExtractionPatterns(Map)} has been called to set patterns. 
*/ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, - ParseContext context) throws IOException, SAXException, TikaException { + ParseContext context) throws IOException, SAXException, TikaException { XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); TemporaryResources tmp = new TemporaryResources(); @@ -278,7 +264,7 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, } private void parse(TikaInputStream stream, XHTMLContentHandler xhtml, Metadata metadata, - TemporaryResources tmp) throws IOException, SAXException, TikaException { + TemporaryResources tmp) throws IOException, SAXException, TikaException { boolean inputToStdIn = true; boolean outputFromStdOut = true; boolean hasPatterns = (metadataPatterns != null && !metadataPatterns.isEmpty()); @@ -360,17 +346,17 @@ private void parse(TikaInputStream stream, XHTMLContentHandler xhtml, Metadata m } /** - * Starts a thread that extracts the contents of the standard output - * stream of the given process to the given XHTML content handler. - * The standard output stream is closed once fully processed. + * Starts a thread that extracts the contents of the standard output stream of the given process + * to the given XHTML content handler. The standard output stream is closed once fully + * processed. 
* * @param stream - * @param xhtml XHTML content handler + * @param xhtml XHTML content handler * @throws SAXException if the XHTML SAX events could not be handled - * @throws IOException if an input error occurred + * @throws IOException if an input error occurred */ private void extractOutput(InputStream stream, XHTMLContentHandler xhtml) - throws SAXException, IOException { + throws SAXException, IOException { try (Reader reader = new InputStreamReader(stream, UTF_8)) { xhtml.startDocument(); xhtml.startElement("p"); @@ -384,14 +370,13 @@ private void extractOutput(InputStream stream, XHTMLContentHandler xhtml) } /** - * Starts a thread that sends the contents of the given input stream - * to the standard input stream of the given process. Potential - * exceptions are ignored, and the standard input stream is closed - * once fully processed. Note that the given input stream is not - * closed by this method. + * Starts a thread that sends the contents of the given input stream to the standard input + * stream of the given process. Potential exceptions are ignored, and the standard input stream + * is closed once fully processed. Note that the given input stream is not closed by + * this method. 
* * @param process process - * @param stream input stream + * @param stream input stream */ private void sendInput(final Process process, final InputStream stream) { Thread t = new Thread(() -> { @@ -399,7 +384,7 @@ private void sendInput(final Process process, final InputStream stream) { try { IOUtils.copy(stream, stdin); } catch (IOException e) { - //swallow + // swallow } }); t.start(); @@ -421,8 +406,7 @@ private void extractMetadata(final InputStream stream, final Metadata metadata) Matcher m = entry.getKey().matcher(line); if (m.find()) { consumed = true; - if (entry.getValue() != null && - !entry.getValue().equals("")) { + if (entry.getValue() != null && !entry.getValue().equals("")) { metadata.add(entry.getValue(), m.group(1)); } else { metadata.add(m.group(1), m.group(2)); diff --git a/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersConfigReader.java b/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersConfigReader.java index 754bcf4454..62ad6826f0 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersConfigReader.java +++ b/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersConfigReader.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. 
The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.external; @@ -27,7 +25,10 @@ import java.util.StringTokenizer; import java.util.regex.Pattern; import javax.xml.parsers.DocumentBuilder; - +import org.apache.tika.exception.TikaException; +import org.apache.tika.mime.MediaType; +import org.apache.tika.mime.MimeTypeException; +import org.apache.tika.utils.XMLReaderUtils; import org.w3c.dom.Document; import org.w3c.dom.Element; import org.w3c.dom.Node; @@ -35,16 +36,9 @@ import org.xml.sax.InputSource; import org.xml.sax.SAXException; -import org.apache.tika.exception.TikaException; -import org.apache.tika.mime.MediaType; -import org.apache.tika.mime.MimeTypeException; -import org.apache.tika.utils.XMLReaderUtils; - /** - * Builds up ExternalParser instances based on XML file(s) - * which define what to run, for what, and how to process - * any output metadata. 
- * Typically used to configure up a series of external programs + * Builds up ExternalParser instances based on XML file(s) which define what to run, for what, and + * how to process any output metadata. Typically used to configure up a series of external programs * (like catdoc or pdf2txt) to extract text content from documents. * *

@@ -86,16 +80,15 @@ public static List read(Element element) throws TikaException, I
             }
         } else {
             throw new MimeTypeException(
-                    "Not a <" + EXTERNAL_PARSERS_TAG + "/> configuration document: " +
-                            (element != null ? element.getTagName() : "n/a"));
+                            "Not a <" + EXTERNAL_PARSERS_TAG + "/> configuration document: "
+                                            + (element != null ? element.getTagName() : "n/a"));
         }
 
         return parsers;
     }
 
     /**
-     * Builds and Returns an ExternalParser, or null if a check
-     * command was given that didn't match.
+     * Builds and returns an ExternalParser, or null if a check command was given that didn't match.
      */
     private static ExternalParser readParser(Element parserDef) throws TikaException {
         ExternalParser parser = new ExternalParser();
@@ -122,7 +115,8 @@ private static ExternalParser readParser(Element parserDef) throws TikaException
                         parser.setMetadataExtractionPatterns(readMetadataPatterns(child));
                         break;
                     default:
-                        throw new IllegalArgumentException("reaction not defined for " + child.getTagName());
+                        throw new IllegalArgumentException(
+                                        "reaction not defined for " + child.getTagName());
                 }
             }
         }
@@ -186,7 +180,7 @@ private static boolean readCheckTagAndCheck(Element checkDef) {
                             String s = st.nextToken();
                             errorVals.add(Integer.parseInt(s));
                         } catch (NumberFormatException e) {
-                            //swallow
+                            // swallow
                         }
                     }
                 }
diff --git a/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersConfigReaderMetKeys.java b/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersConfigReaderMetKeys.java
index 86369c6cd7..1325463b45 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersConfigReaderMetKeys.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersConfigReaderMetKeys.java
@@ -1,18 +1,16 @@
 /*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
+ * Licensed to the Apache Software Foundation (ASF) under one or more contributor license
+ * agreements. See the NOTICE file distributed with this work for additional information regarding
+ * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License. You may obtain a
+ * copy of the License at
  *
- *     http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
  *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
  */
 package org.apache.tika.parser.external;
 
diff --git a/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersFactory.java b/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersFactory.java
index 561cbe7d00..a06f91483d 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersFactory.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersFactory.java
@@ -1,18 +1,16 @@
 /*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
+ * Licensed to the Apache Software Foundation (ASF) under one or more contributor license
+ * agreements. See the NOTICE file distributed with this work for additional information regarding
+ * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License. You may obtain a
+ * copy of the License at
  *
- *     http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
  *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
  */
 package org.apache.tika.parser.external;
 
@@ -24,7 +22,6 @@
 import java.util.Enumeration;
 import java.util.List;
 import java.util.Map;
-
 import org.apache.tika.config.ServiceLoader;
 import org.apache.tika.config.TikaConfig;
 import org.apache.tika.exception.TikaException;
@@ -33,8 +30,7 @@
 import org.apache.tika.parser.Parser;
 
 /**
- * Creates instances of ExternalParser based on XML
- * configuration files.
+ * Creates instances of ExternalParser based on XML configuration files.
  *
  * @see ExternalParsersConfigReader
  */
@@ -45,15 +41,14 @@ public static List create() throws IOException, TikaException {
     }
 
     public static List create(ServiceLoader loader)
-            throws IOException, TikaException {
+                    throws IOException, TikaException {
         return create("tika-external-parsers.xml", loader);
     }
 
     public static List create(String filename, ServiceLoader loader)
-            throws IOException, TikaException {
-        String filepath =
-                ExternalParsersFactory.class.getPackage().getName().replace('.', '/') + "/" +
-                        filename;
+                    throws IOException, TikaException {
+        String filepath = ExternalParsersFactory.class.getPackage().getName().replace('.', '/')
+                        + "/" + filename;
         Enumeration files = loader.findServiceResources(filepath);
         ArrayList list = Collections.list(files);
         URL[] urls = list.toArray(new URL[0]);
diff --git a/tika-core/src/main/java/org/apache/tika/parser/external/package-info.java b/tika-core/src/main/java/org/apache/tika/parser/external/package-info.java
index 4ee27b9d65..243fc0d7e5 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/external/package-info.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/external/package-info.java
@@ -1,18 +1,16 @@
 /*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
+ * Licensed to the Apache Software Foundation (ASF) under one or more contributor license
+ * agreements. See the NOTICE file distributed with this work for additional information regarding
+ * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License. You may obtain a
+ * copy of the License at
  *
- *     http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
  *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
  */
 
 /**
diff --git a/tika-core/src/main/java/org/apache/tika/parser/external2/ExternalParser.java b/tika-core/src/main/java/org/apache/tika/parser/external2/ExternalParser.java
index 5dbea57f04..14489cf849 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/external2/ExternalParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/external2/ExternalParser.java
@@ -1,18 +1,16 @@
 /*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
+ * Licensed to the Apache Software Foundation (ASF) under one or more contributor license
+ * agreements. See the NOTICE file distributed with this work for additional information regarding
+ * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License. You may obtain a
+ * copy of the License at
  *
- *     http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
  *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
  */
 package org.apache.tika.parser.external2;
 
@@ -29,12 +27,6 @@
 import java.util.Set;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
 import org.apache.tika.config.Field;
 import org.apache.tika.config.Initializable;
 import org.apache.tika.config.InitializableProblemHandler;
@@ -54,14 +46,17 @@
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.apache.tika.utils.FileProcessResult;
 import org.apache.tika.utils.ProcessUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
 
 /**
- * This is a next generation external parser that uses some of the more
- * recent additions to Tika. This is an experimental alternative to the
- * {@link org.apache.tika.parser.external.ExternalParser}.
- * Specifically, it relies more on configuration than the SPI model.
- * Further, users can specify a parser to handle the output
- * of the external process.
+ * This is a next generation external parser that uses some of the more recent additions to Tika.
+ * This is an experimental alternative to the
+ * {@link org.apache.tika.parser.external.ExternalParser}. Specifically, it relies more on
+ * configuration than the SPI model. Further, users can specify a parser to handle the output of the
+ * external process.
  */
 public class ExternalParser implements Parser, Initializable {
 
@@ -99,8 +94,8 @@ public Set getSupportedTypes(ParseContext context) {
 
     @Override
     public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
-                      ParseContext context) throws IOException, SAXException, TikaException {
-        //this may remain null, depending on whether the external parser writes to a file
+                    ParseContext context) throws IOException, SAXException, TikaException {
+        // this may remain null, depending on whether the external parser writes to a file
         Path outFile = null;
         try (TemporaryResources tmp = new TemporaryResources()) {
             TikaInputStream tis = TikaInputStream.get(stream, tmp, metadata);
@@ -112,12 +107,12 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
             for (String c : commandLine) {
                 if (inputMatcher.reset(c).find()) {
                     String updated = c.replace(INPUT_FILE_TOKEN,
-                            ProcessUtils.escapeCommandLine(p.toAbsolutePath().toString()));
+                                    ProcessUtils.escapeCommandLine(p.toAbsolutePath().toString()));
                     thisCommandLine.add(updated);
                 } else if (outputMatcher.reset(c).find()) {
                     outFile = Files.createTempFile("tika-external2-", "");
-                    String updated = c.replace(OUTPUT_FILE_TOKEN,
-                            ProcessUtils.escapeCommandLine(outFile.toAbsolutePath().toString()));
+                    String updated = c.replace(OUTPUT_FILE_TOKEN, ProcessUtils
+                                    .escapeCommandLine(outFile.toAbsolutePath().toString()));
                     thisCommandLine.add(updated);
                     outputFileInCommandline = true;
                 } else {
@@ -128,20 +123,18 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
             long localTimeoutMillis = TikaTaskTimeout.getTimeoutMillis(context, timeoutMs);
             if (outputFileInCommandline) {
                 result = ProcessUtils.execute(new ProcessBuilder(thisCommandLine),
-                        localTimeoutMillis, maxStdOut, maxStdErr);
+                                localTimeoutMillis, maxStdOut, maxStdErr);
             } else {
                 outFile = Files.createTempFile("tika-external2-", "");
                 result = ProcessUtils.execute(new ProcessBuilder(thisCommandLine),
-                        localTimeoutMillis, outFile, maxStdErr);
+                                localTimeoutMillis, outFile, maxStdErr);
             }
             metadata.set(ExternalProcess.IS_TIMEOUT, result.isTimeout());
             metadata.set(ExternalProcess.EXIT_VALUE, result.getExitValue());
             metadata.set(ExternalProcess.STD_OUT_LENGTH, result.getStdoutLength());
-            metadata.set(ExternalProcess.STD_OUT_IS_TRUNCATED,
-                    result.isStdoutTruncated());
+            metadata.set(ExternalProcess.STD_OUT_IS_TRUNCATED, result.isStdoutTruncated());
             metadata.set(ExternalProcess.STD_ERR_LENGTH, result.getStderrLength());
-            metadata.set(ExternalProcess.STD_ERR_IS_TRUNCATED,
-                    result.isStderrTruncated());
+            metadata.set(ExternalProcess.STD_ERR_IS_TRUNCATED, result.isStderrTruncated());
 
             if (returnStdout) {
                 metadata.set(ExternalProcess.STD_OUT, result.getStdout());
@@ -160,23 +153,22 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
         }
     }
 
-    private void handleOutput(FileProcessResult result, Path outFile,
-                              XHTMLContentHandler xhtml, Metadata metadata,
-                              ParseContext parseContext) throws SAXException, TikaException,
-            IOException {
+    private void handleOutput(FileProcessResult result, Path outFile, XHTMLContentHandler xhtml,
+                    Metadata metadata, ParseContext parseContext)
+                    throws SAXException, TikaException, IOException {
         if (outputParser == EmptyParser.INSTANCE) {
             if (outFile != null) {
                 try (BufferedReader reader = Files.newBufferedReader(outFile)) {
                     String line = reader.readLine();
                     while (line != null) {
-                        //do we want to wrap this in 

elements? + // do we want to wrap this in

elements? xhtml.characters(line); xhtml.newline(); line = reader.readLine(); } } } else { - //read this in line by line and wrap

elements? + // read this in line by line and wrap

elements? xhtml.characters(result.getStdout()); } } else { @@ -185,8 +177,8 @@ private void handleOutput(FileProcessResult result, Path outFile, outputParser.parse(is, new BodyContentHandler(xhtml), metadata, parseContext); } } else { - try (InputStream is = TikaInputStream.get( - result.getStdout().getBytes(StandardCharsets.UTF_8))) { + try (InputStream is = TikaInputStream + .get(result.getStdout().getBytes(StandardCharsets.UTF_8))) { outputParser.parse(is, new BodyContentHandler(xhtml), metadata, parseContext); } } @@ -195,8 +187,8 @@ private void handleOutput(FileProcessResult result, Path outFile, } /** - * This is set during initialization from a tika-config. - * Any calls after initialization will result in a {@link IllegalStateException}. + * This is set during initialization from a tika-config. Any calls after initialization will + * result in a {@link IllegalStateException}. * * @param supportedTypes */ @@ -226,10 +218,9 @@ public void setMaxStdOut(int maxStdOut) { } /** - * Use this to specify the full commandLine. The commandline must - * include at least {@link ExternalParser#INPUT_FILE_TOKEN}. - * If the external process writes to an output file, specify - * {@link ExternalParser#OUTPUT_FILE_TOKEN}. + * Use this to specify the full commandLine. The commandline must include at least + * {@link ExternalParser#INPUT_FILE_TOKEN}. If the external process writes to an output file, + * specify {@link ExternalParser#OUTPUT_FILE_TOKEN}. * * @param commandLine */ @@ -240,10 +231,9 @@ public void setCommandLine(List commandLine) { /** - * If set to true, this will return the stdout in the metadata - * via {@link org.apache.tika.metadata.ExternalProcess#STD_OUT}. - * Default is false because this should normally - * be handled by the outputParser + * If set to true, this will return the stdout in the metadata via + * {@link org.apache.tika.metadata.ExternalProcess#STD_OUT}. 
Default is false + * because this should normally be handled by the outputParser * * @param returnStdout */ @@ -253,9 +243,9 @@ public void setReturnStdout(boolean returnStdout) { } /** - * If set to true, this will return the stderr in the metadata - * via {@link org.apache.tika.metadata.ExternalProcess#STD_ERR}. - * Default is true + * If set to true, this will return the stderr in the metadata via + * {@link org.apache.tika.metadata.ExternalProcess#STD_ERR}. Default is true + * * @param returnStderr */ @Field @@ -264,10 +254,10 @@ public void setReturnStderr(boolean returnStderr) { } /** - * This parser is called on the output of the process. - * If the process writes to an output file, specified by - * {@link ExternalParser#OUTPUT_FILE_TOKEN}, this parser will parse that file, + * This parser is called on the output of the process. If the process writes to an output file, + * specified by {@link ExternalParser#OUTPUT_FILE_TOKEN}, this parser will parse that file, * otherwise it will parse the UTF-8 encoded bytes from the process' STD_OUT. 
+ * * @param parser */ @Field @@ -281,12 +271,12 @@ public Parser getOutputParser() { @Override public void initialize(Map params) throws TikaConfigException { - //no-op + // no-op } @Override public void checkInitialization(InitializableProblemHandler problemHandler) - throws TikaConfigException { + throws TikaConfigException { if (supportedTypes.size() == 0) { throw new TikaConfigException("supportedTypes size must be > 0"); } @@ -295,8 +285,8 @@ public void checkInitialization(InitializableProblemHandler problemHandler) } if (outputParser == EmptyParser.INSTANCE) { - LOG.debug("no parser selected for the output; contents will be " + - "written to the content handler"); + LOG.debug("no parser selected for the output; contents will be " + + "written to the content handler"); } } diff --git a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java index 9f2ea8a3b2..75e1fac470 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java +++ b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. 
The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.multiple; @@ -30,10 +28,6 @@ import java.util.List; import java.util.Map; import java.util.Set; - -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.config.Param; import org.apache.tika.exception.TikaException; import org.apache.tika.io.TemporaryResources; @@ -46,15 +40,14 @@ import org.apache.tika.parser.ParserDecorator; import org.apache.tika.sax.ContentHandlerFactory; import org.apache.tika.utils.ParserUtils; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; /** - * Abstract base class for parser wrappers which may / will - * process a given stream multiple times, merging the results - * of the various parsers used. - * End users should normally use {@link FallbackParser} or - * {@link SupplementingParser} along with a Strategy. - * Note that unless you give a {@link ContentHandlerFactory}, - * you'll get content from every parser tried mushed together! 
+ * Abstract base class for parser wrappers which may / will process a given stream multiple times, + * merging the results of the various parsers used. End users should normally use + * {@link FallbackParser} or {@link SupplementingParser} along with a Strategy. Note that unless you + * give a {@link ContentHandlerFactory}, you'll get content from every parser tried mushed together! * * @since Apache Tika 1.18 */ @@ -73,9 +66,8 @@ public abstract class AbstractMultipleParser implements Parser { */ private final Collection parsers; /** - * Computed list of Mime Types to offer, which is all - * those in common between the parsers. - * For explicit mimetypes only, use a {@link ParserDecorator} + * Computed list of Mime Types to offer, which is all those in common between the parsers. For + * explicit mimetypes only, use a {@link ParserDecorator} */ private final Set offeredTypes; /** @@ -85,17 +77,17 @@ public abstract class AbstractMultipleParser implements Parser { @SuppressWarnings("rawtypes") public AbstractMultipleParser(MediaTypeRegistry registry, Collection parsers, - Map params) { + Map params) { this(registry, getMetadataPolicy(params), parsers); } public AbstractMultipleParser(MediaTypeRegistry registry, MetadataPolicy policy, - Parser... parsers) { + Parser... 
parsers) { this(registry, policy, Arrays.asList(parsers)); } public AbstractMultipleParser(MediaTypeRegistry registry, MetadataPolicy policy, - Collection parsers) { + Collection parsers) { this.policy = policy; this.parsers = parsers; this.registry = registry; @@ -114,18 +106,18 @@ protected static MetadataPolicy getMetadataPolicy(Map params) { return (MetadataPolicy) params.get(METADATA_POLICY_CONFIG_KEY).getValue(); } throw new IllegalArgumentException( - "Required parameter '" + METADATA_POLICY_CONFIG_KEY + "' not supplied"); + "Required parameter '" + METADATA_POLICY_CONFIG_KEY + "' not supplied"); } protected static Metadata mergeMetadata(Metadata newMetadata, Metadata lastMetadata, - MetadataPolicy policy) { + MetadataPolicy policy) { if (policy == MetadataPolicy.DISCARD_ALL) { return newMetadata; } for (String n : lastMetadata.names()) { // If this is one of the metadata keys we're setting ourselves - // for tracking/errors, then always keep the latest one! + // for tracking/errors, then always keep the latest one! 
if (n.equals(TikaCoreProperties.TIKA_PARSED_BY.getName())) { continue; } @@ -212,55 +204,48 @@ public List getAllParsers() { } /** - * Used to allow implementations to prepare or change things - * before parsing occurs + * Used to allow implementations to prepare or change things before parsing occurs */ - protected void parserPrepare(Parser parser, Metadata metadata, ParseContext context) { - } + protected void parserPrepare(Parser parser, Metadata metadata, ParseContext context) {} /** - * Used to notify implementations that a Parser has Finished - * or Failed, and to allow them to decide to continue or - * abort further parsing + * Used to notify implementations that a Parser has Finished or Failed, and to allow them to + * decide to continue or abort further parsing */ protected abstract boolean parserCompleted(Parser parser, Metadata metadata, - ContentHandler handler, ParseContext context, - Exception exception); + ContentHandler handler, ParseContext context, Exception exception); /** - * Processes the given Stream through one or more parsers, - * resetting things between parsers as requested by policy. - * The actual processing is delegated to one or more {@link Parser}s. + * Processes the given Stream through one or more parsers, resetting things between parsers as + * requested by policy. The actual processing is delegated to one or more {@link Parser}s. *

- * Note that you'll get text from every parser this way, to have - * control of which content is from which parser you need to - * call the method with a {@link ContentHandlerFactory} instead. + * Note that you'll get text from every parser this way, to have control of which content is + * from which parser you need to call the method with a {@link ContentHandlerFactory} instead. */ @Override public void parse(InputStream stream, ContentHandler handler, Metadata metadata, - ParseContext context) throws IOException, SAXException, TikaException { + ParseContext context) throws IOException, SAXException, TikaException { parse(stream, handler, null, metadata, context); } /** - * Processes the given Stream through one or more parsers, - * resetting things between parsers as requested by policy. - * The actual processing is delegated to one or more {@link Parser}s. - * You will get one ContentHandler fetched for each Parser used. - * TODO Do we need to return all the ContentHandler instances we created? + * Processes the given Stream through one or more parsers, resetting things between parsers as + * requested by policy. The actual processing is delegated to one or more {@link Parser}s. You + * will get one ContentHandler fetched for each Parser used. TODO Do we need to return all the + * ContentHandler instances we created? 
* - * @deprecated The {@link ContentHandlerFactory} override is still experimental - * and the method signature is subject to change before Tika 2.0 + * @deprecated The {@link ContentHandlerFactory} override is still experimental and the method + * signature is subject to change before Tika 2.0 */ @Deprecated public void parse(InputStream stream, ContentHandlerFactory handlers, Metadata metadata, - ParseContext context) throws IOException, SAXException, TikaException { + ParseContext context) throws IOException, SAXException, TikaException { parse(stream, null, handlers, metadata, context); } private void parse(InputStream stream, ContentHandler handler, - ContentHandlerFactory handlerFactory, Metadata originalMetadata, - ParseContext context) throws IOException, SAXException, TikaException { + ContentHandlerFactory handlerFactory, Metadata originalMetadata, + ParseContext context) throws IOException, SAXException, TikaException { // Track the metadata between parsers, so we can apply our policy Metadata lastMetadata = cloneMetadata(originalMetadata); Metadata metadata = lastMetadata; @@ -269,13 +254,14 @@ private void parse(InputStream stream, ContentHandler handler, TemporaryResources tmp = new TemporaryResources(); try { // Ensure we'll be able to re-read safely, buffering to disk if so, - // to permit Parsers 2+ to be able to read the same data - InputStream taggedStream = ParserUtils.ensureStreamReReadable(stream, tmp, originalMetadata); + // to permit Parsers 2+ to be able to read the same data + InputStream taggedStream = + ParserUtils.ensureStreamReReadable(stream, tmp, originalMetadata); for (Parser p : parsers) { // Get a new handler for this parser, if we can // If not, the user will get text from every parser - // mushed together onto the one solitary handler... + // mushed together onto the one solitary handler... 
if (handlerFactory != null) { handler = handlerFactory.getNewContentHandler(); } @@ -342,30 +328,25 @@ private void parse(InputStream stream, ContentHandler handler, } /** - * The various strategies for handling metadata emitted by - * multiple parsers. - * Note that not all will be supported by all subclasses. + * The various strategies for handling metadata emitted by multiple parsers. Note that not all + * will be supported by all subclasses. */ public enum MetadataPolicy { /** - * Before moving onto another parser, throw away - * all previously seen metadata + * Before moving onto another parser, throw away all previously seen metadata */ DISCARD_ALL, /** - * The first parser to output a given key wins, - * merge in non-clashing other keys + * The first parser to output a given key wins, merge in non-clashing other keys */ FIRST_WINS, /** - * The last parser to output a given key wins, - * overriding previous parser values for a + * The last parser to output a given key wins, overriding previous parser values for a * clashing key. */ LAST_WINS, /** - * Where multiple parsers output a given key, - * store all their different (unique) values + * Where multiple parsers output a given key, store all their different (unique) values */ KEEP_ALL } diff --git a/tika-core/src/main/java/org/apache/tika/parser/multiple/FallbackParser.java b/tika-core/src/main/java/org/apache/tika/parser/multiple/FallbackParser.java index e538e596a9..a42047b864 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/multiple/FallbackParser.java +++ b/tika-core/src/main/java/org/apache/tika/parser/multiple/FallbackParser.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.multiple; @@ -20,21 +18,18 @@ import java.util.Collection; import java.util.List; import java.util.Map; - -import org.xml.sax.ContentHandler; - import org.apache.tika.config.Param; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaTypeRegistry; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; +import org.xml.sax.ContentHandler; /** * Tries multiple parsers in turn, until one succeeds. *

- * Can optionally keep Metadata from failed parsers when - * trying the next one, depending on the {@link AbstractMultipleParser.MetadataPolicy} - * chosen. + * Can optionally keep Metadata from failed parsers when trying the next one, depending on the + * {@link AbstractMultipleParser.MetadataPolicy} chosen. * * @since Apache Tika 1.18 */ @@ -43,7 +38,7 @@ public class FallbackParser extends AbstractMultipleParser { * The different Metadata Policies we support (all) */ public static final List allowedPolicies = - Arrays.asList(MetadataPolicy.values()); + Arrays.asList(MetadataPolicy.values()); /** * Serial version UID. */ @@ -51,12 +46,12 @@ public class FallbackParser extends AbstractMultipleParser { @SuppressWarnings("rawtypes") public FallbackParser(MediaTypeRegistry registry, Collection parsers, - Map params) { + Map params) { super(registry, parsers, params); } public FallbackParser(MediaTypeRegistry registry, MetadataPolicy policy, - Collection parsers) { + Collection parsers) { super(registry, policy, parsers); } @@ -66,11 +61,10 @@ public FallbackParser(MediaTypeRegistry registry, MetadataPolicy policy, Parser. @Override protected boolean parserCompleted(Parser parser, Metadata metadata, ContentHandler handler, - ParseContext context, Exception exception) { + ParseContext context, Exception exception) { // If there was no exception, abort further parsers return exception != null; // Have the next parser tried } } - diff --git a/tika-core/src/main/java/org/apache/tika/parser/multiple/SupplementingParser.java b/tika-core/src/main/java/org/apache/tika/parser/multiple/SupplementingParser.java index 8cf83c019f..9e519025f6 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/multiple/SupplementingParser.java +++ b/tika-core/src/main/java/org/apache/tika/parser/multiple/SupplementingParser.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.parser.multiple; @@ -20,24 +18,20 @@ import java.util.Collection; import java.util.List; import java.util.Map; - -import org.xml.sax.ContentHandler; - import org.apache.tika.config.Param; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaTypeRegistry; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; +import org.xml.sax.ContentHandler; /** - * Runs the input stream through all available parsers, - * merging the metadata from them based on the + * Runs the input stream through all available parsers, merging the metadata from them based on the * {@link AbstractMultipleParser.MetadataPolicy} chosen. *

- * Warning - currently only one Parser should output - * any Content to the {@link ContentHandler}, the rest - * should only output {@link Metadata}. A solution to - * multiple-content is still being worked on... + * Warning - currently only one Parser should output any Content to the {@link ContentHandler}, the + * rest should only output {@link Metadata}. A solution to multiple-content is still being worked + * on... * * @since Apache Tika 1.18 */ @@ -45,9 +39,8 @@ public class SupplementingParser extends AbstractMultipleParser { /** * The different Metadata Policies we support (not discard) */ - public static final List allowedPolicies = - Arrays.asList(MetadataPolicy.FIRST_WINS, MetadataPolicy.LAST_WINS, - MetadataPolicy.KEEP_ALL); + public static final List allowedPolicies = Arrays.asList( + MetadataPolicy.FIRST_WINS, MetadataPolicy.LAST_WINS, MetadataPolicy.KEEP_ALL); /** * Serial version UID. */ @@ -55,29 +48,29 @@ public class SupplementingParser extends AbstractMultipleParser { @SuppressWarnings("rawtypes") public SupplementingParser(MediaTypeRegistry registry, Collection parsers, - Map params) { + Map params) { super(registry, parsers, params); } public SupplementingParser(MediaTypeRegistry registry, MetadataPolicy policy, - Parser... parsers) { + Parser... 
parsers) { this(registry, policy, Arrays.asList(parsers)); } public SupplementingParser(MediaTypeRegistry registry, MetadataPolicy policy, - Collection parsers) { + Collection parsers) { super(registry, policy, parsers); // Ensure it's a supported policy if (!allowedPolicies.contains(policy)) { throw new IllegalArgumentException( - "Unsupported policy for SupplementingParser: " + policy); + "Unsupported policy for SupplementingParser: " + policy); } } @Override protected boolean parserCompleted(Parser parser, Metadata metadata, ContentHandler handler, - ParseContext context, Exception exception) { + ParseContext context, Exception exception) { // If there was no exception, just carry on to the next if (exception == null) { return true; @@ -87,4 +80,3 @@ protected boolean parserCompleted(Parser parser, Metadata metadata, ContentHandl return true; } } - diff --git a/tika-core/src/main/java/org/apache/tika/parser/package-info.java b/tika-core/src/main/java/org/apache/tika/parser/package-info.java index 10df69e1e6..7ff5e03830 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/package-info.java +++ b/tika-core/src/main/java/org/apache/tika/parser/package-info.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. 
The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ /** diff --git a/tika-core/src/main/java/org/apache/tika/renderer/CompositeRenderer.java b/tika-core/src/main/java/org/apache/tika/renderer/CompositeRenderer.java index a98d39c974..b2dbbaa34a 100644 --- a/tika-core/src/main/java/org/apache/tika/renderer/CompositeRenderer.java +++ b/tika-core/src/main/java/org/apache/tika/renderer/CompositeRenderer.java @@ -1,19 +1,18 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. 
The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */package org.apache.tika.renderer; + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. + */ +package org.apache.tika.renderer; import java.io.IOException; import java.io.InputStream; @@ -23,7 +22,6 @@ import java.util.Map; import java.util.Set; import java.util.concurrent.ConcurrentHashMap; - import org.apache.tika.config.Initializable; import org.apache.tika.config.InitializableProblemHandler; import org.apache.tika.config.Param; @@ -54,6 +52,7 @@ public CompositeRenderer(List renderers) { } rendererMap = Collections.unmodifiableMap(tmp); } + @Override public Set getSupportedTypes(ParseContext context) { return rendererMap.keySet(); @@ -61,7 +60,7 @@ public Set getSupportedTypes(ParseContext context) { @Override public RenderResults render(InputStream is, Metadata metadata, ParseContext parseContext, - RenderRequest... requests) throws IOException, TikaException { + RenderRequest... 
requests) throws IOException, TikaException { String mediaTypeString = metadata.get(TikaCoreProperties.TYPE); if (mediaTypeString == null) { @@ -81,6 +80,7 @@ public RenderResults render(InputStream is, Metadata metadata, ParseContext pars public Renderer getLeafRenderer(MediaType mt) { return rendererMap.get(mt); } + @Override public void initialize(Map params) throws TikaConfigException { @@ -88,13 +88,12 @@ public void initialize(Map params) throws TikaConfigException { @Override public void checkInitialization(InitializableProblemHandler problemHandler) - throws TikaConfigException { + throws TikaConfigException { } private static List getDefaultRenderers(ServiceLoader loader) { - List staticRenderers = - loader.loadStaticServiceProviders(Renderer.class); + List staticRenderers = loader.loadStaticServiceProviders(Renderer.class); ServiceLoaderUtils.sortLoadedClasses(staticRenderers); return staticRenderers; diff --git a/tika-core/src/main/java/org/apache/tika/renderer/PageBasedRenderResults.java b/tika-core/src/main/java/org/apache/tika/renderer/PageBasedRenderResults.java index d80ff7c5c9..5516a9f935 100644 --- a/tika-core/src/main/java/org/apache/tika/renderer/PageBasedRenderResults.java +++ b/tika-core/src/main/java/org/apache/tika/renderer/PageBasedRenderResults.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. 
The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.renderer; @@ -20,7 +18,6 @@ import java.util.HashMap; import java.util.List; import java.util.Map; - import org.apache.tika.io.TemporaryResources; import org.apache.tika.metadata.TikaPagedText; @@ -31,6 +28,7 @@ public class PageBasedRenderResults extends RenderResults { public PageBasedRenderResults(TemporaryResources tmp) { super(tmp); } + public void add(RenderResult result) { Integer page = result.getMetadata().getInt(TikaPagedText.PAGE_NUMBER); if (page != null) { diff --git a/tika-core/src/main/java/org/apache/tika/renderer/PageRangeRequest.java b/tika-core/src/main/java/org/apache/tika/renderer/PageRangeRequest.java index 2534d7032f..66b2db99f7 100644 --- a/tika-core/src/main/java/org/apache/tika/renderer/PageRangeRequest.java +++ b/tika-core/src/main/java/org/apache/tika/renderer/PageRangeRequest.java @@ -1,25 +1,23 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.renderer; import java.util.Objects; /** - * The range of pages to render. These are 1-based, and "to" is inclusive. + * The range of pages to render. These are 1-based, and "to" is inclusive. 
*/ public class PageRangeRequest implements RenderRequest { diff --git a/tika-core/src/main/java/org/apache/tika/renderer/RenderRequest.java b/tika-core/src/main/java/org/apache/tika/renderer/RenderRequest.java index 3277d866af..7b6f1e2e36 100644 --- a/tika-core/src/main/java/org/apache/tika/renderer/RenderRequest.java +++ b/tika-core/src/main/java/org/apache/tika/renderer/RenderRequest.java @@ -1,27 +1,24 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. 
See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.renderer; /** - * Empty interface for requests to a renderer. Different - * file formats and different use cases will have different types of requests. - * For page based, it could be a page range (render the full pages from 2 to 5); - * or it could be a single page with an x-y bounding box. For video files, - * it could be a temporal offset or a temporal offset with an x-y bounding box. + * Empty interface for requests to a renderer. Different file formats and different use cases will + * have different types of requests. For page based, it could be a page range (render the full pages + * from 2 to 5); or it could be a single page with an x-y bounding box. For video files, it could be + * a temporal offset or a temporal offset with an x-y bounding box. */ public interface RenderRequest { } diff --git a/tika-core/src/main/java/org/apache/tika/renderer/RenderResult.java b/tika-core/src/main/java/org/apache/tika/renderer/RenderResult.java index 25588c45bb..b87e8fd00d 100644 --- a/tika-core/src/main/java/org/apache/tika/renderer/RenderResult.java +++ b/tika-core/src/main/java/org/apache/tika/renderer/RenderResult.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. 
The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.renderer; @@ -20,7 +18,6 @@ import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; - import org.apache.tika.io.TemporaryResources; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; @@ -28,18 +25,17 @@ public class RenderResult implements Closeable { public enum STATUS { - SUCCESS, - EXCEPTION, - TIMEOUT + SUCCESS, EXCEPTION, TIMEOUT } + private final STATUS status; private final int id; private final Object result; - //TODO: we're relying on metadata to bring in a bunch of info. - //Might be cleaner to add specific parameters for page number, embedded path, etc.? + // TODO: we're relying on metadata to bring in a bunch of info. + // Might be cleaner to add specific parameters for page number, embedded path, etc.? 
private final Metadata metadata; TemporaryResources tmp = new TemporaryResources(); @@ -53,7 +49,7 @@ public RenderResult(STATUS status, int id, Object result, Metadata metadata) { tmp.addResource(new Closeable() { @Override public void close() throws IOException { - Files.delete((Path)result); + Files.delete((Path) result); } }); } else if (result instanceof Closeable) { @@ -63,7 +59,7 @@ public void close() throws IOException { public TikaInputStream getInputStream() throws IOException { if (result instanceof Path) { - return TikaInputStream.get((Path)result, metadata); + return TikaInputStream.get((Path) result, metadata); } else { TikaInputStream tis = TikaInputStream.get(new byte[0]); tis.setOpenContainer(result); diff --git a/tika-core/src/main/java/org/apache/tika/renderer/RenderResults.java b/tika-core/src/main/java/org/apache/tika/renderer/RenderResults.java index 108c062605..a311f7975a 100644 --- a/tika-core/src/main/java/org/apache/tika/renderer/RenderResults.java +++ b/tika-core/src/main/java/org/apache/tika/renderer/RenderResults.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.renderer; @@ -20,7 +18,6 @@ import java.io.IOException; import java.util.ArrayList; import java.util.List; - import org.apache.tika.io.TemporaryResources; public class RenderResults implements Closeable { @@ -28,9 +25,11 @@ public class RenderResults implements Closeable { private List results = new ArrayList<>(); private final TemporaryResources tmp; + public RenderResults(TemporaryResources tmp) { this.tmp = tmp; } + public void add(RenderResult result) { tmp.addResource(result); results.add(result); diff --git a/tika-core/src/main/java/org/apache/tika/renderer/Renderer.java b/tika-core/src/main/java/org/apache/tika/renderer/Renderer.java index bc4261f521..8e64f173ab 100644 --- a/tika-core/src/main/java/org/apache/tika/renderer/Renderer.java +++ b/tika-core/src/main/java/org/apache/tika/renderer/Renderer.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.renderer; @@ -20,14 +18,13 @@ import java.io.InputStream; import java.io.Serializable; import java.util.Set; - import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.ParseContext; /** - * Interface for a renderer. This should be flexible enough to run on the initial design: PDF pages + * Interface for a renderer. This should be flexible enough to run on the initial design: PDF pages * but also on portions of PDF pages as well as on other document types. 
* */ @@ -36,8 +33,8 @@ public interface Renderer extends Serializable { /** - * Returns the set of media types supported by this renderer when used - * with the given parse context. + * Returns the set of media types supported by this renderer when used with the given parse + * context. * * @param context parse context * @return immutable set of media types @@ -46,17 +43,14 @@ public interface Renderer extends Serializable { Set getSupportedTypes(ParseContext context); RenderResults render(InputStream is, Metadata metadata, ParseContext parseContext, - RenderRequest ... requests) throws IOException, - TikaException; + RenderRequest... requests) throws IOException, TikaException; /* - At some point, we might need/want to add something like this, where for a given - page the requestor or the parser determines that they only want to render e.g. a - box within a page. - - RenderResults render(InputStream is, int page, Coordinates coordinates, Metadata metadata, - ParseContext parseContext) throws IOException, - TikaException; - + * At some point, we might need/want to add something like this, where for a given page the + * requestor or the parser determines that they only want to render e.g. a box within a page. + * + * RenderResults render(InputStream is, int page, Coordinates coordinates, Metadata metadata, + * ParseContext parseContext) throws IOException, TikaException; + * */ } diff --git a/tika-core/src/main/java/org/apache/tika/renderer/RenderingState.java b/tika-core/src/main/java/org/apache/tika/renderer/RenderingState.java index ed82500659..568c91e21b 100644 --- a/tika-core/src/main/java/org/apache/tika/renderer/RenderingState.java +++ b/tika-core/src/main/java/org/apache/tika/renderer/RenderingState.java @@ -1,25 +1,22 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.renderer; /** - * This should be to track state for each file (embedded or otherwise). - * This should be reset in the parseContext at the beginning of a parse - * and then replaced at the end of the parse. + * This should be to track state for each file (embedded or otherwise). This should be reset in the + * parseContext at the beginning of a parse and then replaced at the end of the parse. 
*/ public class RenderingState { diff --git a/tika-core/src/main/java/org/apache/tika/renderer/RenderingTracker.java b/tika-core/src/main/java/org/apache/tika/renderer/RenderingTracker.java index 2e3143261a..8934168245 100644 --- a/tika-core/src/main/java/org/apache/tika/renderer/RenderingTracker.java +++ b/tika-core/src/main/java/org/apache/tika/renderer/RenderingTracker.java @@ -1,28 +1,25 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.renderer; /** - * Use this in the ParseContext to keep track of unique ids for rendered - * images in embedded docs. This should be used for the full parse of - * a main document and its embedded document. + * Use this in the ParseContext to keep track of unique ids for rendered images in embedded docs. + * This should be used for the full parse of a main document and its embedded document. * - * This is different from RenderingState, which is used to track - * rendering per file/per embedded doc. + * This is different from RenderingState, which is used to track rendering per file/per embedded + * doc. */ public class RenderingTracker { diff --git a/tika-core/src/main/java/org/apache/tika/sax/AbstractRecursiveParserWrapperHandler.java b/tika-core/src/main/java/org/apache/tika/sax/AbstractRecursiveParserWrapperHandler.java index 850ceb4147..6a5745b64b 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/AbstractRecursiveParserWrapperHandler.java +++ b/tika-core/src/main/java/org/apache/tika/sax/AbstractRecursiveParserWrapperHandler.java @@ -1,44 +1,41 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.sax; import java.io.OutputStream; import java.io.Serializable; import java.nio.charset.Charset; - -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; -import org.xml.sax.helpers.DefaultHandler; - import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.Property; import org.apache.tika.metadata.TikaCoreProperties; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; /** * This is a special handler to be used only with the - * {@link org.apache.tika.parser.RecursiveParserWrapper}. - * It allows for finer-grained processing of embedded documents than in the legacy handlers. - * Subclasses can choose how to process individual embedded documents. + * {@link org.apache.tika.parser.RecursiveParserWrapper}. It allows for finer-grained processing of + * embedded documents than in the legacy handlers. Subclasses can choose how to process individual + * embedded documents. 
*/ public abstract class AbstractRecursiveParserWrapperHandler extends DefaultHandler - implements Serializable { + implements Serializable { - public final static Property EMBEDDED_RESOURCE_LIMIT_REACHED = Property.internalBoolean( - TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX + "embedded_resource_limit_reached"); + public final static Property EMBEDDED_RESOURCE_LIMIT_REACHED = + Property.internalBoolean(TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX + + "embedded_resource_limit_reached"); private static final int MAX_DEPTH = 100; private final ContentHandlerFactory contentHandlerFactory; private final int maxEmbeddedResources; @@ -50,7 +47,7 @@ public AbstractRecursiveParserWrapperHandler(ContentHandlerFactory contentHandle } public AbstractRecursiveParserWrapperHandler(ContentHandlerFactory contentHandlerFactory, - int maxEmbeddedResources) { + int maxEmbeddedResources) { this.contentHandlerFactory = contentHandlerFactory; this.maxEmbeddedResources = maxEmbeddedResources; } @@ -64,15 +61,15 @@ public ContentHandler getNewContentHandler(OutputStream os, Charset charset) { } /** - * This is called before parsing each embedded document. Override this - * for custom behavior. Make sure to call this in your custom classes - * because this tracks the number of embedded documents. + * This is called before parsing each embedded document. Override this for custom behavior. Make + * sure to call this in your custom classes because this tracks the number of embedded + * documents. 
* * @param contentHandler local handler to be used on this embedded document - * @param metadata embedded document's metadata + * @param metadata embedded document's metadata */ public void startEmbeddedDocument(ContentHandler contentHandler, Metadata metadata) - throws SAXException { + throws SAXException { embeddedResources++; embeddedDepth++; if (embeddedDepth >= MAX_DEPTH) { @@ -82,37 +79,36 @@ public void startEmbeddedDocument(ContentHandler contentHandler, Metadata metada } /** - * This is called after parsing each embedded document. Override this - * for custom behavior. This is currently a no-op aside from tracking embedded depth. + * This is called after parsing each embedded document. Override this for custom behavior. This + * is currently a no-op aside from tracking embedded depth. *

* When overriding, make sure to call {@link #decrementEmbeddedDepth()} * * @param contentHandler content handler that was used on this embedded document - * @param metadata metadata for this embedded document + * @param metadata metadata for this embedded document * @throws SAXException */ public void endEmbeddedDocument(ContentHandler contentHandler, Metadata metadata) - throws SAXException { + throws SAXException { decrementEmbeddedDepth(); } /** - * This is called by {@link #endEmbeddedDocument(ContentHandler, Metadata)}. Users - * overriding {@link #endEmbeddedDocument(ContentHandler, Metadata)} need to call this - * unless they are triggering it via super.endEmbeddedDocument(contentHandler, metadata); + * This is called by {@link #endEmbeddedDocument(ContentHandler, Metadata)}. Users overriding + * {@link #endEmbeddedDocument(ContentHandler, Metadata)} need to call this unless they are + * triggering it via super.endEmbeddedDocument(contentHandler, metadata); */ protected void decrementEmbeddedDepth() { embeddedDepth--; } /** - * This is called after the full parse has completed. Override this - * for custom behavior. Make sure to call this as super.endDocument(...) - * in subclasses because this adds whether or not the embedded resource - * maximum has been hit to the metadata. + * This is called after the full parse has completed. Override this for custom behavior. Make + * sure to call this as super.endDocument(...) in subclasses because this adds + * whether or not the embedded resource maximum has been hit to the metadata. 
* * @param contentHandler content handler that was used on the main document - * @param metadata metadata that was gathered for the main document + * @param metadata metadata that was gathered for the main document * @throws SAXException */ public void endDocument(ContentHandler contentHandler, Metadata metadata) throws SAXException { diff --git a/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java b/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java index 361b7817c7..0a00929b27 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java +++ b/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
+ * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.sax; @@ -22,12 +20,10 @@ import java.io.UnsupportedEncodingException; import java.nio.charset.Charset; import java.util.Locale; - +import org.apache.tika.parser.ParseContext; import org.xml.sax.ContentHandler; import org.xml.sax.helpers.DefaultHandler; -import org.apache.tika.parser.ParseContext; - /** * Basic factory for creating common types of ContentHandlers */ @@ -42,9 +38,10 @@ public class BasicContentHandlerFactory implements ContentHandlerFactory, WriteL /** * Create a BasicContentHandlerFactory with {@link #throwOnWriteLimitReached} is true - * @param type basic type of handler - * @param writeLimit max number of characters to store; if < 0, - * the handler will store all characters + * + * @param type basic type of handler + * @param writeLimit max number of characters to store; if < 0, the handler will store all + * characters */ public BasicContentHandlerFactory(HANDLER_TYPE type, int writeLimit) { this(type, writeLimit, true, null); @@ -55,32 +52,31 @@ public BasicContentHandlerFactory(HANDLER_TYPE type, int writeLimit) { * @param type basic type of handler * @param writeLimit maximum number of characters to store * @param throwOnWriteLimitReached whether or not to throw a - * {@link org.apache.tika.exception.WriteLimitReachedException} - * when the write limit has been reached - * @param parseContext to store the writelimitreached warning if - * throwOnWriteLimitReached is set to false + * {@link org.apache.tika.exception.WriteLimitReachedException} when the write limit has + * been reached + * @param parseContext to store the writelimitreached warning if throwOnWriteLimitReached is set + * to false */ 
public BasicContentHandlerFactory(HANDLER_TYPE type, int writeLimit, - boolean throwOnWriteLimitReached, ParseContext parseContext) { + boolean throwOnWriteLimitReached, ParseContext parseContext) { this.type = type; this.writeLimit = writeLimit; this.throwOnWriteLimitReached = throwOnWriteLimitReached; this.parseContext = parseContext; if (throwOnWriteLimitReached == false && parseContext == null) { - throw new IllegalArgumentException("parse context must not be null if " + - "throwOnWriteLimitReached is false"); + throw new IllegalArgumentException("parse context must not be null if " + + "throwOnWriteLimitReached is false"); } } /** - * Tries to parse string into handler type. Returns default if string is null or - * parse fails. + * Tries to parse string into handler type. Returns default if string is null or parse fails. *

* Options: xml, html, text, body, ignore (no content) * * @param handlerTypeName string to parse - * @param defaultType type to return if parse fails + * @param defaultType type to return if parse fails * @return handler type */ public static HANDLER_TYPE parseHandlerType(String handlerTypeName, HANDLER_TYPE defaultType) { @@ -111,9 +107,8 @@ public static HANDLER_TYPE parseHandlerType(String handlerTypeName, HANDLER_TYPE public ContentHandler getNewContentHandler() { if (type == HANDLER_TYPE.BODY) { - return new BodyContentHandler( - new WriteOutContentHandler(new ToTextContentHandler(), writeLimit, - throwOnWriteLimitReached, parseContext)); + return new BodyContentHandler(new WriteOutContentHandler(new ToTextContentHandler(), + writeLimit, throwOnWriteLimitReached, parseContext)); } else if (type == HANDLER_TYPE.IGNORE) { return new DefaultHandler(); } @@ -122,7 +117,7 @@ public ContentHandler getNewContentHandler() { return formatHandler; } return new WriteOutContentHandler(formatHandler, writeLimit, throwOnWriteLimitReached, - parseContext); + parseContext); } private ContentHandler getFormatHandler() { @@ -149,20 +144,20 @@ public ContentHandler getNewContentHandler(OutputStream os, Charset charset) { switch (type) { case BODY: return new WriteOutContentHandler( - new BodyContentHandler(new OutputStreamWriter(os, charset)), - writeLimit); + new BodyContentHandler(new OutputStreamWriter(os, charset)), + writeLimit); case TEXT: return new WriteOutContentHandler( - new ToTextContentHandler(os, charset.name()), writeLimit); + new ToTextContentHandler(os, charset.name()), writeLimit); case HTML: return new WriteOutContentHandler( - new ToHTMLContentHandler(os, charset.name()), writeLimit); + new ToHTMLContentHandler(os, charset.name()), writeLimit); case XML: return new WriteOutContentHandler( - new ToXMLContentHandler(os, charset.name()), writeLimit); + new ToXMLContentHandler(os, charset.name()), writeLimit); default: return new WriteOutContentHandler( - new 
ToTextContentHandler(os, charset.name()), writeLimit); + new ToTextContentHandler(os, charset.name()), writeLimit); } } else { switch (type) { @@ -195,7 +190,7 @@ public HANDLER_TYPE getType() { * Common handler types for content. */ public enum HANDLER_TYPE { - BODY, IGNORE, //don't store content + BODY, IGNORE, // don't store content TEXT, HTML, XML } diff --git a/tika-core/src/main/java/org/apache/tika/sax/BodyContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/BodyContentHandler.java index dfdecb83c3..3162846309 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/BodyContentHandler.java +++ b/tika-core/src/main/java/org/apache/tika/sax/BodyContentHandler.java @@ -1,34 +1,29 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
+ * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.sax; import java.io.Writer; - -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.sax.xpath.Matcher; import org.apache.tika.sax.xpath.MatchingContentHandler; import org.apache.tika.sax.xpath.XPathParser; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; /** - * Content handler decorator that only passes everything inside - * the XHTML <body/> tag to the underlying handler. Note that - * the <body/> tag itself is not passed on. + * Content handler decorator that only passes everything inside the XHTML <body/> tag to the + * underlying handler. Note that the <body/> tag itself is not passed on. */ public class BodyContentHandler extends ContentHandlerDecorator { @@ -41,11 +36,11 @@ public class BodyContentHandler extends ContentHandlerDecorator { * The XPath matcher used to select the XHTML body contents. */ private static final Matcher MATCHER = - PARSER.parse("/xhtml:html/xhtml:body/descendant::node()"); + PARSER.parse("/xhtml:html/xhtml:body/descendant::node()"); /** - * Creates a content handler that passes all XHTML body events to the - * given underlying content handler. + * Creates a content handler that passes all XHTML body events to the given underlying content + * handler. * * @param handler content handler */ @@ -54,8 +49,7 @@ public BodyContentHandler(ContentHandler handler) { } /** - * Creates a content handler that writes XHTML body character events to - * the given writer. + * Creates a content handler that writes XHTML body character events to the given writer. 
* * @param writer writer */ @@ -64,15 +58,14 @@ public BodyContentHandler(Writer writer) { } /** - * Creates a content handler that writes XHTML body character events to - * an internal string buffer. The contents of the buffer can be retrieved - * using the {@link #toString()} method. + * Creates a content handler that writes XHTML body character events to an internal string + * buffer. The contents of the buffer can be retrieved using the {@link #toString()} method. *

- * The internal string buffer is bounded at the given number of characters. - * If this write limit is reached, then a {@link SAXException} is thrown. + * The internal string buffer is bounded at the given number of characters. If this write limit + * is reached, then a {@link SAXException} is thrown. * - * @param writeLimit maximum number of characters to include in the string, - * or -1 to disable the write limit + * @param writeLimit maximum number of characters to include in the string, or -1 to disable the + * write limit * @since Apache Tika 0.7 */ public BodyContentHandler(int writeLimit) { @@ -80,12 +73,11 @@ public BodyContentHandler(int writeLimit) { } /** - * Creates a content handler that writes XHTML body character events to - * an internal string buffer. The contents of the buffer can be retrieved - * using the {@link #toString()} method. + * Creates a content handler that writes XHTML body character events to an internal string + * buffer. The contents of the buffer can be retrieved using the {@link #toString()} method. *

- * The internal string buffer is bounded at 100k characters. If this write - * limit is reached, then a {@link SAXException} is thrown. + * The internal string buffer is bounded at 100k characters. If this write limit is reached, + * then a {@link SAXException} is thrown. */ public BodyContentHandler() { this(new WriteOutContentHandler()); diff --git a/tika-core/src/main/java/org/apache/tika/sax/CleanPhoneText.java b/tika-core/src/main/java/org/apache/tika/sax/CleanPhoneText.java index 6e6ddcde99..1864d05ec7 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/CleanPhoneText.java +++ b/tika-core/src/main/java/org/apache/tika/sax/CleanPhoneText.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
+ * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.sax; @@ -26,132 +24,151 @@ * Class to help de-obfuscate phone numbers in text. */ public class CleanPhoneText { - public static final String[][][] cleanSubstitutions = - new String[][][]{{{"&#\\d{1,3};", ""}}, // first simply remove numeric entities - {{"th0usand", "thousand"}, // handle common misspellings - {"th1rteen", "thirteen"}, {"f0urteen", "fourteen"}, - {"e1ghteen", "eighteen"}, {"n1neteen", "nineteen"}, - {"f1fteen", "fifteen"}, {"s1xteen", "sixteen"}, {"th1rty", "thirty"}, - {"e1ghty", "eighty"}, {"n1nety", "ninety"}, {"fourty", "forty"}, - {"f0urty", "forty"}, {"e1ght", "eight"}, {"f0rty", "forty"}, - {"f1fty", "fifty"}, {"s1xty", "sixty"}, {"zer0", "zero"}, - {"f0ur", "four"}, {"f1ve", "five"}, {"n1ne", "nine"}, {"0ne", "one"}, - {"tw0", "two"}, {"s1x", "six"}}, + public static final String[][][] cleanSubstitutions = new String[][][] {{{"&#\\d{1,3};", ""}}, // first + // simply + // remove + // numeric + // entities + {{"th0usand", "thousand"}, // handle common misspellings + {"th1rteen", "thirteen"}, {"f0urteen", "fourteen"}, + {"e1ghteen", "eighteen"}, {"n1neteen", "nineteen"}, + {"f1fteen", "fifteen"}, {"s1xteen", "sixteen"}, + {"th1rty", "thirty"}, {"e1ghty", "eighty"}, + {"n1nety", "ninety"}, {"fourty", "forty"}, {"f0urty", "forty"}, + {"e1ght", "eight"}, {"f0rty", "forty"}, {"f1fty", "fifty"}, + {"s1xty", "sixty"}, {"zer0", "zero"}, {"f0ur", "four"}, + {"f1ve", "five"}, {"n1ne", "nine"}, {"0ne", "one"}, + {"tw0", "two"}, {"s1x", "six"}}, // mixed compound numeral words // consider 7teen, etc. 
{{"twenty[\\W_]{0,3}1", "twenty-one"}, {"twenty[\\W_]{0,3}2", "twenty-two"}, - {"twenty[\\W_]{0,3}3", "twenty-three"}, - {"twenty[\\W_]{0,3}4", "twenty-four"}, - {"twenty[\\W_]{0,3}5", "twenty-five"}, - {"twenty[\\W_]{0,3}6", "twenty-six"}, - {"twenty[\\W_]{0,3}7", "twenty-seven"}, - {"twenty[\\W_]{0,3}8", "twenty-eight"}, - {"twenty[\\W_]{0,3}9", "twenty-nine"}, - {"thirty[\\W_]{0,3}1", "thirty-one"}, - {"thirty[\\W_]{0,3}2", "thirty-two"}, - {"thirty[\\W_]{0,3}3", "thirty-three"}, - {"thirty[\\W_]{0,3}4", "thirty-four"}, - {"thirty[\\W_]{0,3}5", "thirty-five"}, - {"thirty[\\W_]{0,3}6", "thirty-six"}, - {"thirty[\\W_]{0,3}7", "thirty-seven"}, - {"thirty[\\W_]{0,3}8", "thirty-eight"}, - {"thirty[\\W_]{0,3}9", "thirty-nine"}, - {"forty[\\W_]{0,3}1", "forty-one"}, {"forty[\\W_]{0,3}2", "forty-two"}, - {"forty[\\W_]{0,3}3", "forty-three"}, - {"forty[\\W_]{0,3}4", "forty-four"}, - {"forty[\\W_]{0,3}5", "forty-five"}, {"forty[\\W_]{0,3}6", "forty-six"}, - {"forty[\\W_]{0,3}7", "forty-seven"}, - {"forty[\\W_]{0,3}8", "forty-eight"}, - {"forty[\\W_]{0,3}9", "forty-nine"}, {"fifty[\\W_]{0,3}1", "fifty-one"}, - {"fifty[\\W_]{0,3}2", "fifty-two"}, - {"fifty[\\W_]{0,3}3", "fifty-three"}, - {"fifty[\\W_]{0,3}4", "fifty-four"}, - {"fifty[\\W_]{0,3}5", "fifty-five"}, {"fifty[\\W_]{0,3}6", "fifty-six"}, - {"fifty[\\W_]{0,3}7", "fifty-seven"}, - {"fifty[\\W_]{0,3}8", "fifty-eight"}, - {"fifty[\\W_]{0,3}9", "fifty-nine"}, {"sixty[\\W_]{0,3}1", "sixty-one"}, - {"sixty[\\W_]{0,3}2", "sixty-two"}, - {"sixty[\\W_]{0,3}3", "sixty-three"}, - {"sixty[\\W_]{0,3}4", "sixty-four"}, - {"sixty[\\W_]{0,3}5", "sixty-five"}, {"sixty[\\W_]{0,3}6", "sixty-six"}, - {"sixty[\\W_]{0,3}7", "sixty-seven"}, - {"sixty[\\W_]{0,3}8", "sixty-eight"}, - {"sixty[\\W_]{0,3}9", "sixty-nine"}, - {"seventy[\\W_]{0,3}1", "seventy-one"}, - {"seventy[\\W_]{0,3}2", "seventy-two"}, - {"seventy[\\W_]{0,3}3", "seventy-three"}, - {"seventy[\\W_]{0,3}4", "seventy-four"}, - {"seventy[\\W_]{0,3}5", "seventy-five"}, - 
{"seventy[\\W_]{0,3}6", "seventy-six"}, - {"seventy[\\W_]{0,3}7", "seventy-seven"}, - {"seventy[\\W_]{0,3}8", "seventy-eight"}, - {"seventy[\\W_]{0,3}9", "seventy-nine"}, - {"eighty[\\W_]{0,3}1", "eighty-one"}, - {"eighty[\\W_]{0,3}2", "eighty-two"}, - {"eighty[\\W_]{0,3}3", "eighty-three"}, - {"eighty[\\W_]{0,3}4", "eighty-four"}, - {"eighty[\\W_]{0,3}5", "eighty-five"}, - {"eighty[\\W_]{0,3}6", "eighty-six"}, - {"eighty[\\W_]{0,3}7", "eighty-seven"}, - {"eighty[\\W_]{0,3}8", "eighty-eight"}, - {"eighty[\\W_]{0,3}9", "eighty-nine"}, - {"ninety[\\W_]{0,3}1", "ninety-one"}, - {"ninety[\\W_]{0,3}2", "ninety-two"}, - {"ninety[\\W_]{0,3}3", "ninety-three"}, - {"ninety[\\W_]{0,3}4", "ninety-four"}, - {"ninety[\\W_]{0,3}5", "ninety-five"}, - {"ninety[\\W_]{0,3}6", "ninety-six"}, - {"ninety[\\W_]{0,3}7", "ninety-seven"}, - {"ninety[\\W_]{0,3}8", "ninety-eight"}, - {"ninety[\\W_]{0,3}9", "ninety-nine"}}, + {"twenty[\\W_]{0,3}3", "twenty-three"}, + {"twenty[\\W_]{0,3}4", "twenty-four"}, + {"twenty[\\W_]{0,3}5", "twenty-five"}, + {"twenty[\\W_]{0,3}6", "twenty-six"}, + {"twenty[\\W_]{0,3}7", "twenty-seven"}, + {"twenty[\\W_]{0,3}8", "twenty-eight"}, + {"twenty[\\W_]{0,3}9", "twenty-nine"}, + {"thirty[\\W_]{0,3}1", "thirty-one"}, + {"thirty[\\W_]{0,3}2", "thirty-two"}, + {"thirty[\\W_]{0,3}3", "thirty-three"}, + {"thirty[\\W_]{0,3}4", "thirty-four"}, + {"thirty[\\W_]{0,3}5", "thirty-five"}, + {"thirty[\\W_]{0,3}6", "thirty-six"}, + {"thirty[\\W_]{0,3}7", "thirty-seven"}, + {"thirty[\\W_]{0,3}8", "thirty-eight"}, + {"thirty[\\W_]{0,3}9", "thirty-nine"}, + {"forty[\\W_]{0,3}1", "forty-one"}, + {"forty[\\W_]{0,3}2", "forty-two"}, + {"forty[\\W_]{0,3}3", "forty-three"}, + {"forty[\\W_]{0,3}4", "forty-four"}, + {"forty[\\W_]{0,3}5", "forty-five"}, + {"forty[\\W_]{0,3}6", "forty-six"}, + {"forty[\\W_]{0,3}7", "forty-seven"}, + {"forty[\\W_]{0,3}8", "forty-eight"}, + {"forty[\\W_]{0,3}9", "forty-nine"}, + {"fifty[\\W_]{0,3}1", "fifty-one"}, + {"fifty[\\W_]{0,3}2", "fifty-two"}, + 
{"fifty[\\W_]{0,3}3", "fifty-three"}, + {"fifty[\\W_]{0,3}4", "fifty-four"}, + {"fifty[\\W_]{0,3}5", "fifty-five"}, + {"fifty[\\W_]{0,3}6", "fifty-six"}, + {"fifty[\\W_]{0,3}7", "fifty-seven"}, + {"fifty[\\W_]{0,3}8", "fifty-eight"}, + {"fifty[\\W_]{0,3}9", "fifty-nine"}, + {"sixty[\\W_]{0,3}1", "sixty-one"}, + {"sixty[\\W_]{0,3}2", "sixty-two"}, + {"sixty[\\W_]{0,3}3", "sixty-three"}, + {"sixty[\\W_]{0,3}4", "sixty-four"}, + {"sixty[\\W_]{0,3}5", "sixty-five"}, + {"sixty[\\W_]{0,3}6", "sixty-six"}, + {"sixty[\\W_]{0,3}7", "sixty-seven"}, + {"sixty[\\W_]{0,3}8", "sixty-eight"}, + {"sixty[\\W_]{0,3}9", "sixty-nine"}, + {"seventy[\\W_]{0,3}1", "seventy-one"}, + {"seventy[\\W_]{0,3}2", "seventy-two"}, + {"seventy[\\W_]{0,3}3", "seventy-three"}, + {"seventy[\\W_]{0,3}4", "seventy-four"}, + {"seventy[\\W_]{0,3}5", "seventy-five"}, + {"seventy[\\W_]{0,3}6", "seventy-six"}, + {"seventy[\\W_]{0,3}7", "seventy-seven"}, + {"seventy[\\W_]{0,3}8", "seventy-eight"}, + {"seventy[\\W_]{0,3}9", "seventy-nine"}, + {"eighty[\\W_]{0,3}1", "eighty-one"}, + {"eighty[\\W_]{0,3}2", "eighty-two"}, + {"eighty[\\W_]{0,3}3", "eighty-three"}, + {"eighty[\\W_]{0,3}4", "eighty-four"}, + {"eighty[\\W_]{0,3}5", "eighty-five"}, + {"eighty[\\W_]{0,3}6", "eighty-six"}, + {"eighty[\\W_]{0,3}7", "eighty-seven"}, + {"eighty[\\W_]{0,3}8", "eighty-eight"}, + {"eighty[\\W_]{0,3}9", "eighty-nine"}, + {"ninety[\\W_]{0,3}1", "ninety-one"}, + {"ninety[\\W_]{0,3}2", "ninety-two"}, + {"ninety[\\W_]{0,3}3", "ninety-three"}, + {"ninety[\\W_]{0,3}4", "ninety-four"}, + {"ninety[\\W_]{0,3}5", "ninety-five"}, + {"ninety[\\W_]{0,3}6", "ninety-six"}, + {"ninety[\\W_]{0,3}7", "ninety-seven"}, + {"ninety[\\W_]{0,3}8", "ninety-eight"}, + {"ninety[\\W_]{0,3}9", "ninety-nine"}}, // now resolve compound numeral words {{"twenty-one", "21"}, {"twenty-two", "22"}, {"twenty-three", "23"}, - {"twenty-four", "24"}, {"twenty-five", "25"}, {"twenty-six", "26"}, - {"twenty-seven", "27"}, {"twenty-eight", "28"}, {"twenty-nine", "29"}, 
- {"thirty-one", "31"}, {"thirty-two", "32"}, {"thirty-three", "33"}, - {"thirty-four", "34"}, {"thirty-five", "35"}, {"thirty-six", "36"}, - {"thirty-seven", "37"}, {"thirty-eight", "38"}, {"thirty-nine", "39"}, - {"forty-one", "41"}, {"forty-two", "42"}, {"forty-three", "43"}, - {"forty-four", "44"}, {"forty-five", "45"}, {"forty-six", "46"}, - {"forty-seven", "47"}, {"forty-eight", "48"}, {"forty-nine", "49"}, - {"fifty-one", "51"}, {"fifty-two", "52"}, {"fifty-three", "53"}, - {"fifty-four", "54"}, {"fifty-five", "55"}, {"fifty-six", "56"}, - {"fifty-seven", "57"}, {"fifty-eight", "58"}, {"fifty-nine", "59"}, - {"sixty-one", "61"}, {"sixty-two", "62"}, {"sixty-three", "63"}, - {"sixty-four", "64"}, {"sixty-five", "65"}, {"sixty-six", "66"}, - {"sixty-seven", "67"}, {"sixty-eight", "68"}, {"sixty-nine", "69"}, - {"seventy-one", "71"}, {"seventy-two", "72"}, {"seventy-three", "73"}, - {"seventy-four", "74"}, {"seventy-five", "75"}, {"seventy-six", "76"}, - {"seventy-seven", "77"}, {"seventy-eight", "78"}, - {"seventy-nine", "79"}, {"eighty-one", "81"}, {"eighty-two", "82"}, - {"eighty-three", "83"}, {"eighty-four", "84"}, {"eighty-five", "85"}, - {"eighty-six", "86"}, {"eighty-seven", "87"}, {"eighty-eight", "88"}, - {"eighty-nine", "89"}, {"ninety-one", "91"}, {"ninety-two", "92"}, - {"ninety-three", "93"}, {"ninety-four", "94"}, {"ninety-five", "95"}, - {"ninety-six", "96"}, {"ninety-seven", "97"}, {"ninety-eight", "98"}, - {"ninety-nine", "99"}}, + {"twenty-four", "24"}, {"twenty-five", "25"}, + {"twenty-six", "26"}, {"twenty-seven", "27"}, + {"twenty-eight", "28"}, {"twenty-nine", "29"}, + {"thirty-one", "31"}, {"thirty-two", "32"}, + {"thirty-three", "33"}, {"thirty-four", "34"}, + {"thirty-five", "35"}, {"thirty-six", "36"}, + {"thirty-seven", "37"}, {"thirty-eight", "38"}, + {"thirty-nine", "39"}, {"forty-one", "41"}, {"forty-two", "42"}, + {"forty-three", "43"}, {"forty-four", "44"}, + {"forty-five", "45"}, {"forty-six", "46"}, + {"forty-seven", "47"}, 
{"forty-eight", "48"}, + {"forty-nine", "49"}, {"fifty-one", "51"}, {"fifty-two", "52"}, + {"fifty-three", "53"}, {"fifty-four", "54"}, + {"fifty-five", "55"}, {"fifty-six", "56"}, + {"fifty-seven", "57"}, {"fifty-eight", "58"}, + {"fifty-nine", "59"}, {"sixty-one", "61"}, {"sixty-two", "62"}, + {"sixty-three", "63"}, {"sixty-four", "64"}, + {"sixty-five", "65"}, {"sixty-six", "66"}, + {"sixty-seven", "67"}, {"sixty-eight", "68"}, + {"sixty-nine", "69"}, {"seventy-one", "71"}, + {"seventy-two", "72"}, {"seventy-three", "73"}, + {"seventy-four", "74"}, {"seventy-five", "75"}, + {"seventy-six", "76"}, {"seventy-seven", "77"}, + {"seventy-eight", "78"}, {"seventy-nine", "79"}, + {"eighty-one", "81"}, {"eighty-two", "82"}, + {"eighty-three", "83"}, {"eighty-four", "84"}, + {"eighty-five", "85"}, {"eighty-six", "86"}, + {"eighty-seven", "87"}, {"eighty-eight", "88"}, + {"eighty-nine", "89"}, {"ninety-one", "91"}, + {"ninety-two", "92"}, {"ninety-three", "93"}, + {"ninety-four", "94"}, {"ninety-five", "95"}, + {"ninety-six", "96"}, {"ninety-seven", "97"}, + {"ninety-eight", "98"}, {"ninety-nine", "99"}}, // larger units function as suffixes now // assume never have three hundred four, three hundred and four {{"hundred", "00"}, {"thousand", "000"}}, // single numeral words now // some would have been ambiguous {{"seventeen", "17"}, {"thirteen", "13"}, {"fourteen", "14"}, - {"eighteen", "18"}, {"nineteen", "19"}, {"fifteen", "15"}, - {"sixteen", "16"}, {"seventy", "70"}, {"eleven", "11"}, - {"twelve", "12"}, {"twenty", "20"}, {"thirty", "30"}, {"eighty", "80"}, - {"ninety", "90"}, {"three", "3"}, {"seven", "7"}, {"eight", "8"}, - {"forty", "40"}, {"fifty", "50"}, {"sixty", "60"}, {"zero", "0"}, - {"four", "4"}, {"five", "5"}, {"nine", "9"}, {"one", "1"}, {"two", "2"}, - {"six", "6"}, {"ten", "10"}}, + {"eighteen", "18"}, {"nineteen", "19"}, {"fifteen", "15"}, + {"sixteen", "16"}, {"seventy", "70"}, {"eleven", "11"}, + {"twelve", "12"}, {"twenty", "20"}, {"thirty", "30"}, + 
{"eighty", "80"}, {"ninety", "90"}, {"three", "3"}, + {"seven", "7"}, {"eight", "8"}, {"forty", "40"}, + {"fifty", "50"}, {"sixty", "60"}, {"zero", "0"}, {"four", "4"}, + {"five", "5"}, {"nine", "9"}, {"one", "1"}, {"two", "2"}, + {"six", "6"}, {"ten", "10"}}, // now do letter for digit substitutions {{"oh", "0"}, {"o", "0"}, {"i", "1"}, {"l", "1"}}}; // Regex to identify a phone number static final String cleanPhoneRegex = "([2-9]\\d{2}[2-9]\\d{6})"; // Regex which attempts to ignore punctuation and other distractions. - static final String phoneRegex = - "([{(<]{0,3}[2-9][\\W_]{0,3}\\d[\\W_]{0,3}\\d[\\W_]{0,6}" + - "[2-9][\\W_]{0,3}\\d[\\W_]{0,3}\\d[\\W_]{0,6}\\d" + - "[\\W_]{0,3}\\d[\\W_]{0,3}\\d[\\W_]{0,3}\\d)"; + static final String phoneRegex = "([{(<]{0,3}[2-9][\\W_]{0,3}\\d[\\W_]{0,3}\\d[\\W_]{0,6}" + + "[2-9][\\W_]{0,3}\\d[\\W_]{0,3}\\d[\\W_]{0,6}\\d" + + "[\\W_]{0,3}\\d[\\W_]{0,3}\\d[\\W_]{0,3}\\d)"; public static ArrayList extractPhoneNumbers(String text) { text = clean(text); diff --git a/tika-core/src/main/java/org/apache/tika/sax/ContentHandlerDecorator.java b/tika-core/src/main/java/org/apache/tika/sax/ContentHandlerDecorator.java index b7ce5c77a1..a59ee97357 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/ContentHandlerDecorator.java +++ b/tika-core/src/main/java/org/apache/tika/sax/ContentHandlerDecorator.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. 
See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.sax; @@ -25,10 +23,9 @@ import org.xml.sax.helpers.DefaultHandler; /** - * Decorator base class for the {@link ContentHandler} interface. This class - * simply delegates all SAX events calls to an underlying decorated handler - * instance. Subclasses can provide extra decoration by overriding one or more - * of the SAX event methods. + * Decorator base class for the {@link ContentHandler} interface. This class simply delegates all + * SAX events calls to an underlying decorated handler instance. Subclasses can provide extra + * decoration by overriding one or more of the SAX event methods. */ public class ContentHandlerDecorator extends DefaultHandler { @@ -48,18 +45,18 @@ public ContentHandlerDecorator(ContentHandler handler) { } /** - * Creates a decorator that by default forwards incoming SAX events to - * a dummy content handler that simply ignores all the events. 
Subclasses - * should use the {@link #setContentHandler(ContentHandler)} method to - * switch to a more usable underlying content handler. + * Creates a decorator that by default forwards incoming SAX events to a dummy content handler + * that simply ignores all the events. Subclasses should use the + * {@link #setContentHandler(ContentHandler)} method to switch to a more usable underlying + * content handler. */ protected ContentHandlerDecorator() { this(new DefaultHandler()); } /** - * Sets the underlying content handler. All future SAX events will be - * directed to this handler instead of the one that was previously used. + * Sets the underlying content handler. All future SAX events will be directed to this handler + * instead of the one that was previously used. * * @param handler content handler */ @@ -120,7 +117,7 @@ public void endDocument() throws SAXException { @Override public void startElement(String uri, String localName, String name, Attributes atts) - throws SAXException { + throws SAXException { try { handler.startElement(uri, localName, name, atts); } catch (SAXException e) { @@ -170,48 +167,46 @@ public String toString() { } /** - * Handle any exceptions thrown by methods in this class. This method - * provides a single place to implement custom exception handling. The - * default behaviour is simply to re-throw the given exception, but - * subclasses can also provide alternative ways of handling the situation. + * Handle any exceptions thrown by methods in this class. This method provides a single place to + * implement custom exception handling. The default behaviour is simply to re-throw the given + * exception, but subclasses can also provide alternative ways of handling the situation. 
* - * If the wrapped handler is itself a ContentHandlerDecorator, the call - * is delegated to the wrapped handler's {@link ContentHandlerDecorator#handleException(SAXException)} + * If the wrapped handler is itself a ContentHandlerDecorator, the call is delegated to the + * wrapped handler's {@link ContentHandlerDecorator#handleException(SAXException)} * * @param exception the exception that was thrown * @throws SAXException the exception (if any) thrown to the client */ protected void handleException(SAXException exception) throws SAXException { if (handler instanceof ContentHandlerDecorator) { - ((ContentHandlerDecorator)handler).handleException(exception); + ((ContentHandlerDecorator) handler).handleException(exception); } else { throw exception; } } @Override - public void warning (SAXParseException exception) throws SAXException { + public void warning(SAXParseException exception) throws SAXException { if (handler instanceof ErrorHandler) { - ((ErrorHandler)handler).warning(exception); + ((ErrorHandler) handler).warning(exception); } else { super.warning(exception); } } @Override - public void error (SAXParseException exception) throws SAXException { + public void error(SAXParseException exception) throws SAXException { if (handler instanceof ErrorHandler) { - ((ErrorHandler)handler).error(exception); + ((ErrorHandler) handler).error(exception); } else { super.error(exception); } } @Override - public void fatalError (SAXParseException exception) - throws SAXException { + public void fatalError(SAXParseException exception) throws SAXException { if (handler instanceof ErrorHandler) { - ((ErrorHandler)handler).fatalError(exception); + ((ErrorHandler) handler).fatalError(exception); } else { super.fatalError(exception); } diff --git a/tika-core/src/main/java/org/apache/tika/sax/ContentHandlerDecoratorFactory.java b/tika-core/src/main/java/org/apache/tika/sax/ContentHandlerDecoratorFactory.java index 967e186ebd..597553f38f 100644 --- 
a/tika-core/src/main/java/org/apache/tika/sax/ContentHandlerDecoratorFactory.java +++ b/tika-core/src/main/java/org/apache/tika/sax/ContentHandlerDecoratorFactory.java @@ -1,30 +1,26 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.sax; import java.io.Serializable; - -import org.xml.sax.ContentHandler; - import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; +import org.xml.sax.ContentHandler; public interface ContentHandlerDecoratorFactory extends Serializable { ContentHandler decorate(ContentHandler contentHandler, Metadata metadata, - ParseContext parseContext); + ParseContext parseContext); } diff --git a/tika-core/src/main/java/org/apache/tika/sax/ContentHandlerFactory.java b/tika-core/src/main/java/org/apache/tika/sax/ContentHandlerFactory.java index dc2f3384fc..494f5adfcb 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/ContentHandlerFactory.java +++ b/tika-core/src/main/java/org/apache/tika/sax/ContentHandlerFactory.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.sax; @@ -20,7 +18,6 @@ import java.io.OutputStream; import java.io.Serializable; import java.nio.charset.Charset; - import org.xml.sax.ContentHandler; /** diff --git a/tika-core/src/main/java/org/apache/tika/sax/DIFContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/DIFContentHandler.java index f8f088d2af..0eaf701497 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/DIFContentHandler.java +++ b/tika-core/src/main/java/org/apache/tika/sax/DIFContentHandler.java @@ -1,35 +1,31 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.sax; import java.util.Stack; - +import org.apache.tika.metadata.Metadata; import org.xml.sax.Attributes; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; import org.xml.sax.helpers.AttributesImpl; import org.xml.sax.helpers.DefaultHandler; -import org.apache.tika.metadata.Metadata; - public class DIFContentHandler extends DefaultHandler { - private static final char[] NEWLINE = new char[]{'\n'}; - private static final char[] TABSPACE = new char[]{'\t'}; + private static final char[] NEWLINE = new char[] {'\n'}; + private static final char[] TABSPACE = new char[] {'\t'}; private static final Attributes EMPTY_ATTRIBUTES = new AttributesImpl(); private final Stack treeStack; @@ -65,10 +61,10 @@ public void characters(char[] ch, int start, int length) throws SAXException { this.delegate.characters(title.toCharArray(), 0, title.length()); this.delegate.endElement("", "h3", "h3"); } - if (this.treeStack.peek().equals("Southernmost_Latitude") || - this.treeStack.peek().equals("Northernmost_Latitude") || - this.treeStack.peek().equals("Westernmost_Longitude") || - this.treeStack.peek().equals("Easternmost_Longitude")) { + if 
(this.treeStack.peek().equals("Southernmost_Latitude") + || this.treeStack.peek().equals("Northernmost_Latitude") + || this.treeStack.peek().equals("Westernmost_Longitude") + || this.treeStack.peek().equals("Easternmost_Longitude")) { this.delegate.characters(NEWLINE, 0, NEWLINE.length); this.delegate.characters(TABSPACE, 0, TABSPACE.length); this.delegate.characters(TABSPACE, 0, TABSPACE.length); @@ -91,7 +87,7 @@ public void ignorableWhitespace(char[] ch, int start, int length) throws SAXExce @Override public void startElement(String uri, String localName, String qName, Attributes attributes) - throws SAXException { + throws SAXException { this.isLeaf = true; if (localName.equals("Spatial_Coverage")) { this.delegate.characters(NEWLINE, 0, NEWLINE.length); diff --git a/tika-core/src/main/java/org/apache/tika/sax/ElementMappingContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/ElementMappingContentHandler.java index 4f9d30c920..a33a554db6 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/ElementMappingContentHandler.java +++ b/tika-core/src/main/java/org/apache/tika/sax/ElementMappingContentHandler.java @@ -1,44 +1,39 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.sax; import java.util.Collections; import java.util.Map; import javax.xml.namespace.QName; - import org.xml.sax.Attributes; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; import org.xml.sax.helpers.AttributesImpl; /** - * Content handler decorator that maps element QNames using - * a Map. Not mappable elements are not forwarded. - * Attributes may also be mapped (for each element different using - * a Map for attributes), not mappable attributes are not - * forwarded. The default is to not map any attributes and therefore do - * not forward any of them. + * Content handler decorator that maps element QNames using a Map. Not + * mappable elements are not forwarded. Attributes may also be mapped (for each element different + * using a Map for attributes), not mappable attributes are not forwarded. The default + * is to not map any attributes and therefore do not forward any of them. 
*/ public class ElementMappingContentHandler extends ContentHandlerDecorator { private final Map mappings; public ElementMappingContentHandler(ContentHandler handler, - Map mappings) { + Map mappings) { super(handler); this.mappings = mappings; } @@ -54,18 +49,18 @@ protected static final String getQNameAsString(QName qname) { @Override public void startElement(String namespaceURI, String localName, String qName, Attributes atts) - throws SAXException { + throws SAXException { TargetElement mapping = mappings.get(new QName(namespaceURI, localName)); if (mapping != null) { QName tag = mapping.getMappedTagName(); super.startElement(tag.getNamespaceURI(), tag.getLocalPart(), getQNameAsString(tag), - mapping.mapAttributes(atts)); + mapping.mapAttributes(atts)); } } @Override public void endElement(String namespaceURI, String localName, String qName) - throws SAXException { + throws SAXException { TargetElement mapping = mappings.get(new QName(namespaceURI, localName)); if (mapping != null) { QName tag = mapping.getMappedTagName(); @@ -79,8 +74,7 @@ public static class TargetElement { private final Map attributesMapping; /** - * Creates an TargetElement, attributes of this element will - * be mapped as specified + * Creates an TargetElement, attributes of this element will be mapped as specified */ public TargetElement(QName mappedTagName, Map attributesMapping) { this.mappedTagName = mappedTagName; @@ -91,13 +85,13 @@ public TargetElement(QName mappedTagName, Map attributesMapping) { * A shortcut that automatically creates the QName object */ public TargetElement(String mappedTagURI, String mappedTagLocalName, - Map attributesMapping) { + Map attributesMapping) { this(new QName(mappedTagURI, mappedTagLocalName), attributesMapping); } /** - * Creates an TargetElement with no attributes, all attributes - * will be deleted from SAX stream + * Creates an TargetElement with no attributes, all attributes will be deleted from SAX + * stream */ public TargetElement(QName 
mappedTagName) { this(mappedTagName, Collections.emptyMap()); @@ -124,7 +118,7 @@ public Attributes mapAttributes(final Attributes atts) { QName name = attributesMapping.get(new QName(atts.getURI(i), atts.getLocalName(i))); if (name != null) { natts.addAttribute(name.getNamespaceURI(), name.getLocalPart(), - getQNameAsString(name), atts.getType(i), atts.getValue(i)); + getQNameAsString(name), atts.getType(i), atts.getValue(i)); } } return natts; diff --git a/tika-core/src/main/java/org/apache/tika/sax/EmbeddedContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/EmbeddedContentHandler.java index 38afb0ca73..dabc0d3011 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/EmbeddedContentHandler.java +++ b/tika-core/src/main/java/org/apache/tika/sax/EmbeddedContentHandler.java @@ -1,37 +1,32 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.sax; import org.xml.sax.ContentHandler; /** - * Content handler decorator that prevents the {@link #startDocument()} - * and {@link #endDocument()} events from reaching the decorated handler. - * This is useful when you want to direct the results of parsing multiple - * different XML documents into a single target document without worrying - * about the {@link #startDocument()} and {@link #endDocument()} methods - * being called more than once. + * Content handler decorator that prevents the {@link #startDocument()} and {@link #endDocument()} + * events from reaching the decorated handler. This is useful when you want to direct the results of + * parsing multiple different XML documents into a single target document without worrying about the + * {@link #startDocument()} and {@link #endDocument()} methods being called more than once. */ public class EmbeddedContentHandler extends ContentHandlerDecorator { /** - * Created a decorator that prevents the given handler from - * receiving {@link #startDocument()} and {@link #endDocument()} - * events. + * Created a decorator that prevents the given handler from receiving {@link #startDocument()} + * and {@link #endDocument()} events. * * @param handler the content handler to be decorated */ @@ -43,14 +38,12 @@ public EmbeddedContentHandler(ContentHandler handler) { * Ignored. */ @Override - public void startDocument() { - } + public void startDocument() {} /** * Ignored. 
*/ @Override - public void endDocument() { - } + public void endDocument() {} } diff --git a/tika-core/src/main/java/org/apache/tika/sax/EndDocumentShieldingContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/EndDocumentShieldingContentHandler.java index 544db0d264..a0a1dd5b53 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/EndDocumentShieldingContentHandler.java +++ b/tika-core/src/main/java/org/apache/tika/sax/EndDocumentShieldingContentHandler.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. 
See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.sax; @@ -20,10 +18,9 @@ import org.xml.sax.SAXException; /** - * A wrapper around a {@link ContentHandler} which will ignore normal - * SAX calls to {@link #endDocument()}, and only fire them later. - * This is typically used to ensure that we can output the metadata - * before ending the document + * A wrapper around a {@link ContentHandler} which will ignore normal SAX calls to + * {@link #endDocument()}, and only fire them later. This is typically used to ensure that we can + * output the metadata before ending the document */ public class EndDocumentShieldingContentHandler extends ContentHandlerDecorator { private boolean endDocumentCalled; diff --git a/tika-core/src/main/java/org/apache/tika/sax/ExpandedTitleContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/ExpandedTitleContentHandler.java index e1fa733705..082505b75d 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/ExpandedTitleContentHandler.java +++ b/tika-core/src/main/java/org/apache/tika/sax/ExpandedTitleContentHandler.java @@ -1,38 +1,35 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.sax; import javax.xml.transform.sax.TransformerHandler; - import org.xml.sax.Attributes; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; /** - * Content handler decorator which wraps a {@link TransformerHandler} in order to - * allow the TITLE tag to render as <title></title> - * rather than <title/> which is accomplished - * by calling the {@link TransformerHandler#characters(char[], int, int)} method - * with a length of 1 but a zero length char array. + * Content handler decorator which wraps a {@link TransformerHandler} in order to allow the + * TITLE tag to render as <title></title> rather than + * <title/> which is accomplished by calling the + * {@link TransformerHandler#characters(char[], int, int)} method with a length of 1 + * but a zero length char array. *

- * This workaround is an unfortunate circumstance of the limitations imposed by the - * implementation of the XML serialization code in the JDK brought over from - * the xalan project which no longer allows for the specification of an - * alternate content-handler via xslt templates or other means. + * This workaround is an unfortunate circumstance of the limitations imposed by the implementation + * of the XML serialization code in the JDK brought over from the xalan project which no longer + * allows for the specification of an alternate content-handler via xslt templates or + * other means. * * @see TIKA-725 */ @@ -57,7 +54,7 @@ public void startDocument() throws SAXException { @Override public void startElement(String uri, String localName, String qName, Attributes atts) - throws SAXException { + throws SAXException { super.startElement(uri, localName, qName, atts); if (TITLE_TAG.equalsIgnoreCase(localName) && XHTMLContentHandler.XHTML.equals(uri)) { isTitleTagOpen = true; diff --git a/tika-core/src/main/java/org/apache/tika/sax/Link.java b/tika-core/src/main/java/org/apache/tika/sax/Link.java index cf3c25d688..63af3f29af 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/Link.java +++ b/tika-core/src/main/java/org/apache/tika/sax/Link.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. 
The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.sax; diff --git a/tika-core/src/main/java/org/apache/tika/sax/LinkBuilder.java b/tika-core/src/main/java/org/apache/tika/sax/LinkBuilder.java index f022da776c..f1fadec181 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/LinkBuilder.java +++ b/tika-core/src/main/java/org/apache/tika/sax/LinkBuilder.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. 
The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.sax; diff --git a/tika-core/src/main/java/org/apache/tika/sax/LinkContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/LinkContentHandler.java index 310a183287..0c4bbb6789 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/LinkContentHandler.java +++ b/tika-core/src/main/java/org/apache/tika/sax/LinkContentHandler.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. 
The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.sax; @@ -21,7 +19,6 @@ import java.util.ArrayList; import java.util.LinkedList; import java.util.List; - import org.xml.sax.Attributes; import org.xml.sax.helpers.DefaultHandler; @@ -31,11 +28,10 @@ public class LinkContentHandler extends DefaultHandler { /** - * Stack of link builders, one for each level of nested links currently - * being processed. A usual case of a nested link would be a hyperlinked - * image (&a href="..."><img src="..."><>), - * but it's possible (though unlikely) for also other kinds of nesting - * to occur. + * Stack of link builders, one for each level of nested links currently being processed. A usual + * case of a nested link would be a hyperlinked image + * (&a href="..."><img src="..."><>), but it's possible (though + * unlikely) for also other kinds of nesting to occur. 
*/ private final LinkedList builderStack = new LinkedList<>(); @@ -76,7 +72,7 @@ public List getLinks() { return links; } - //-------------------------------------------------------< ContentHandler> + // -------------------------------------------------------< ContentHandler> @Override public void startElement(String uri, String local, String name, Attributes attributes) { @@ -133,8 +129,8 @@ public void ignorableWhitespace(char[] ch, int start, int length) { @Override public void endElement(String uri, String local, String name) { if (!builderStack.isEmpty() && XHTML.equals(uri)) { - if ("a".equals(local) || "img".equals(local) || "link".equals(local) || - "script".equals(local) || "iframe".equals(local)) { + if ("a".equals(local) || "img".equals(local) || "link".equals(local) + || "script".equals(local) || "iframe".equals(local)) { // ensure this is the correct builder. not all tags correspond // to a LinkBuilder, e.g. for embedded scripts if (builderStack.getFirst().getType().equals(local)) { diff --git a/tika-core/src/main/java/org/apache/tika/sax/OfflineContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/OfflineContentHandler.java index 6461e0946a..afde1fc6a8 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/OfflineContentHandler.java +++ b/tika-core/src/main/java/org/apache/tika/sax/OfflineContentHandler.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. 
See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.sax; @@ -22,8 +20,8 @@ /** * Content handler decorator that always returns an empty stream from the - * {@link #resolveEntity(String, String)} method to prevent potential - * network or other external resources from being accessed by an XML parser. + * {@link #resolveEntity(String, String)} method to prevent potential network or other external + * resources from being accessed by an XML parser. * * @see TIKA-185 */ @@ -34,8 +32,7 @@ public OfflineContentHandler(ContentHandler handler) { } /** - * Returns an empty stream. This will make an XML parser silently - * ignore any external entities. + * Returns an empty stream. This will make an XML parser silently ignore any external entities. 
*/ @Override public InputSource resolveEntity(String publicId, String systemId) { diff --git a/tika-core/src/main/java/org/apache/tika/sax/PhoneExtractingContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/PhoneExtractingContentHandler.java index 981174021a..168ee51b7d 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/PhoneExtractingContentHandler.java +++ b/tika-core/src/main/java/org/apache/tika/sax/PhoneExtractingContentHandler.java @@ -1,52 +1,45 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. 
See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.sax; import java.util.Arrays; import java.util.List; - +import org.apache.tika.metadata.Metadata; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; import org.xml.sax.helpers.DefaultHandler; -import org.apache.tika.metadata.Metadata; - /** * Class used to extract phone numbers while parsing. *

- * Every time a document is parsed in Tika, the content is split into SAX events. - * Those SAX events are handled by a ContentHandler. You can think of these events - * as marking a tag in an HTML file. Once you're finished parsing, you can call - * handler.toString(), for example, to get the text contents of the file. On the other - * hand, any of the metadata of the file will be added to the Metadata object passed - * in during the parse() call. So, the Parser class sends metadata to the Metadata - * object and content to the ContentHandler. + * Every time a document is parsed in Tika, the content is split into SAX events. Those SAX events + * are handled by a ContentHandler. You can think of these events as marking a tag in an HTML file. + * Once you're finished parsing, you can call handler.toString(), for example, to get the text + * contents of the file. On the other hand, any of the metadata of the file will be added to the + * Metadata object passed in during the parse() call. So, the Parser class sends metadata to the + * Metadata object and content to the ContentHandler. *

- * This class is an example of how to combine a ContentHandler and a Metadata. - * As content is passed to the handler, we first check to see if it matches a - * textual pattern for a phone number. If the extracted content is a phone number, - * we add it to the metadata under the key "phonenumbers". So, if you used this - * ContentHandler when you parsed a document, then called - * metadata.getValues("phonenumbers"), you would get an array of Strings of phone - * numbers found in the document. + * This class is an example of how to combine a ContentHandler and a Metadata. As content is passed + * to the handler, we first check to see if it matches a textual pattern for a phone number. If the + * extracted content is a phone number, we add it to the metadata under the key "phonenumbers". So, + * if you used this ContentHandler when you parsed a document, then called + * metadata.getValues("phonenumbers"), you would get an array of Strings of phone numbers found in + * the document. *

- * Please see the PhoneExtractingContentHandlerTest for an example of how to use - * this class. + * Please see the PhoneExtractingContentHandlerTest for an example of how to use this class. */ public class PhoneExtractingContentHandler extends ContentHandlerDecorator { private static final String PHONE_NUMBERS = "phonenumbers"; @@ -65,22 +58,20 @@ public PhoneExtractingContentHandler(ContentHandler handler, Metadata metadata) } /** - * Creates a decorator that by default forwards incoming SAX events to - * a dummy content handler that simply ignores all the events. Subclasses - * should use the {@link #setContentHandler(ContentHandler)} method to - * switch to a more usable underlying content handler. - * Also creates a dummy Metadata object to store phone numbers in. + * Creates a decorator that by default forwards incoming SAX events to a dummy content handler + * that simply ignores all the events. Subclasses should use the + * {@link #setContentHandler(ContentHandler)} method to switch to a more usable underlying + * content handler. Also creates a dummy Metadata object to store phone numbers in. */ protected PhoneExtractingContentHandler() { this(new DefaultHandler(), new Metadata()); } /** - * The characters method is called whenever a Parser wants to pass raw... - * characters to the ContentHandler. But, sometimes, phone numbers are split - * accross different calls to characters, depending on the specific Parser - * used. So, we simply add all characters to a StringBuilder and analyze it - * once the document is finished. + * The characters method is called whenever a Parser wants to pass raw... characters to the + * ContentHandler. But, sometimes, phone numbers are split across different calls to + * characters, depending on the specific Parser used. So, we simply add all characters to a + * StringBuilder and analyze it once the document is finished. 
*/ @Override public void characters(char[] ch, int start, int length) throws SAXException { @@ -95,8 +86,8 @@ public void characters(char[] ch, int start, int length) throws SAXException { /** - * This method is called whenever the Parser is done parsing the file. So, - * we check the output for any phone numbers. + * This method is called whenever the Parser is done parsing the file. So, we check the output + * for any phone numbers. */ @Override public void endDocument() throws SAXException { diff --git a/tika-core/src/main/java/org/apache/tika/sax/RecursiveParserWrapperHandler.java b/tika-core/src/main/java/org/apache/tika/sax/RecursiveParserWrapperHandler.java index 0d8671e944..d4fa31c385 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/RecursiveParserWrapperHandler.java +++ b/tika-core/src/main/java/org/apache/tika/sax/RecursiveParserWrapperHandler.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.sax; @@ -21,11 +19,6 @@ import java.util.List; import java.util.Map; import java.util.concurrent.atomic.AtomicInteger; - -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; -import org.xml.sax.helpers.DefaultHandler; - import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; @@ -33,19 +26,21 @@ import org.apache.tika.metadata.filter.NoOpFilter; import org.apache.tika.parser.RecursiveParserWrapper; import org.apache.tika.utils.ParserUtils; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; /** - * This is the default implementation of {@link AbstractRecursiveParserWrapperHandler}. - * See its documentation for more details. + * This is the default implementation of {@link AbstractRecursiveParserWrapperHandler}. See its + * documentation for more details. *

- * This caches the a metadata object for each embedded file and for the container file. - * It places the extracted content in the metadata object, with this key: - * {@link TikaCoreProperties#TIKA_CONTENT} - * If memory is a concern, subclass AbstractRecursiveParserWrapperHandler to handle each - * embedded document. + * This caches a metadata object for each embedded file and for the container file. It places + * the extracted content in the metadata object, with this key: + * {@link TikaCoreProperties#TIKA_CONTENT} If memory is a concern, subclass + * AbstractRecursiveParserWrapperHandler to handle each embedded document. *

- * NOTE: This handler must only be used with the {@link - * org.apache.tika.parser.RecursiveParserWrapper} + * NOTE: This handler must only be used with the + * {@link org.apache.tika.parser.RecursiveParserWrapper} *

*/ public class RecursiveParserWrapperHandler extends AbstractRecursiveParserWrapperHandler { @@ -61,18 +56,17 @@ public RecursiveParserWrapperHandler(ContentHandlerFactory contentHandlerFactory } /** - * Create a handler that limits the number of embedded resources that will be - * parsed + * Create a handler that limits the number of embedded resources that will be parsed * * @param maxEmbeddedResources number of embedded resources that will be parsed */ public RecursiveParserWrapperHandler(ContentHandlerFactory contentHandlerFactory, - int maxEmbeddedResources) { + int maxEmbeddedResources) { this(contentHandlerFactory, maxEmbeddedResources, NoOpFilter.NOOP_FILTER); } public RecursiveParserWrapperHandler(ContentHandlerFactory contentHandlerFactory, - int maxEmbeddedResources, MetadataFilter metadataFilter) { + int maxEmbeddedResources, MetadataFilter metadataFilter) { super(contentHandlerFactory, maxEmbeddedResources); this.metadataFilter = metadataFilter; } @@ -81,12 +75,12 @@ public RecursiveParserWrapperHandler(ContentHandlerFactory contentHandlerFactory * This is called before parsing an embedded document * * @param contentHandler - local content handler to use on the embedded document - * @param metadata metadata to use for the embedded document + * @param metadata metadata to use for the embedded document * @throws SAXException */ @Override public void startEmbeddedDocument(ContentHandler contentHandler, Metadata metadata) - throws SAXException { + throws SAXException { super.startEmbeddedDocument(contentHandler, metadata); } @@ -94,12 +88,12 @@ public void startEmbeddedDocument(ContentHandler contentHandler, Metadata metada * This is called after parsing an embedded document. 
* * @param contentHandler local contenthandler used on the embedded document - * @param metadata metadata from the embedded document + * @param metadata metadata from the embedded document * @throws SAXException */ @Override public void endEmbeddedDocument(ContentHandler contentHandler, Metadata metadata) - throws SAXException { + throws SAXException { super.endEmbeddedDocument(contentHandler, metadata); addContent(contentHandler, metadata); try { @@ -115,7 +109,7 @@ public void endEmbeddedDocument(ContentHandler contentHandler, Metadata metadata /** * @param contentHandler content handler used on the main document - * @param metadata metadata from the main document + * @param metadata metadata from the main document * @throws SAXException */ @Override @@ -134,9 +128,9 @@ public void endDocument(ContentHandler contentHandler, Metadata metadata) throws } private void writeFinalEmbeddedPaths() { - //for some file types, the file's "name" is not known before - //their attachments are parsed. This goes through the id paths - //and regenerates the path for the "final embedded resource path" + // for some file types, the file's "name" is not known before + // their attachments are parsed. This goes through the id paths + // and regenerates the path for the "final embedded resource path" Map idToName = new HashMap<>(); AtomicInteger unknownCount = new AtomicInteger(0); for (Metadata metadata : metadataList) { @@ -166,7 +160,7 @@ private void writeFinalEmbeddedPaths() { /** * @return a list of Metadata objects, one for the main document and one for each embedded - * document + * document */ public List getMetadataList() { return metadataList; @@ -175,15 +169,15 @@ public List getMetadataList() { void addContent(ContentHandler handler, Metadata metadata) { if (handler.getClass().equals(DefaultHandler.class)) { - //no-op: we can't rely on just testing for - //empty content because DefaultHandler's toString() - //returns e.g. 
"org.xml.sax.helpers.DefaultHandler@6c8b1edd" + // no-op: we can't rely on just testing for + // empty content because DefaultHandler's toString() + // returns e.g. "org.xml.sax.helpers.DefaultHandler@6c8b1edd" } else { String content = handler.toString(); if (content != null && !content.isBlank()) { metadata.add(TikaCoreProperties.TIKA_CONTENT, content); metadata.add(TikaCoreProperties.TIKA_CONTENT_HANDLER, - handler.getClass().getSimpleName()); + handler.getClass().getSimpleName()); } } } diff --git a/tika-core/src/main/java/org/apache/tika/sax/RichTextContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/RichTextContentHandler.java index c250fa28f1..6b4b9ca8d5 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/RichTextContentHandler.java +++ b/tika-core/src/main/java/org/apache/tika/sax/RichTextContentHandler.java @@ -1,37 +1,32 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.sax; import java.io.Writer; - import org.xml.sax.Attributes; import org.xml.sax.SAXException; /** - * Content handler for Rich Text, it will extract XHTML <img/> - * tag <alt/> attribute and XHTML <a/> tag <name/> - * attribute into the output. + * Content handler for Rich Text, it will extract XHTML <img/> tag <alt/> attribute and + * XHTML <a/> tag <name/> attribute into the output. */ public class RichTextContentHandler extends WriteOutContentHandler { /** - * Creates a content handler that writes XHTML body character events to - * the given writer. + * Creates a content handler that writes XHTML body character events to the given writer. 
* * @param writer writer */ @@ -41,7 +36,7 @@ public RichTextContentHandler(Writer writer) { @Override public void startElement(String uri, String localName, String qName, Attributes attributes) - throws SAXException { + throws SAXException { super.startElement(uri, localName, qName, attributes); if ("img".equals(localName) && attributes.getValue("alt") != null) { diff --git a/tika-core/src/main/java/org/apache/tika/sax/SafeContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/SafeContentHandler.java index b04c327683..c81f9799e9 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/SafeContentHandler.java +++ b/tika-core/src/main/java/org/apache/tika/sax/SafeContentHandler.java @@ -1,25 +1,22 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
+ * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.sax; /* -import java.util.ArrayList; -import java.util.List; -*/ + * import java.util.ArrayList; import java.util.List; + */ import org.xml.sax.Attributes; import org.xml.sax.ContentHandler; @@ -28,37 +25,35 @@ /** * Content handler decorator that makes sure that the character events - * ({@link #characters(char[], int, int)} or - * {@link #ignorableWhitespace(char[], int, int)}) passed to the decorated - * content handler contain only valid XML characters. All invalid characters - * are replaced with the Unicode replacement character U+FFFD (though a - * subclass may change this by overriding the {@link #writeReplacement(Output)} method). + * ({@link #characters(char[], int, int)} or {@link #ignorableWhitespace(char[], int, int)}) passed + * to the decorated content handler contain only valid XML characters. All invalid characters are + * replaced with the Unicode replacement character U+FFFD (though a subclass may change this by + * overriding the {@link #writeReplacement(Output)} method). *

- * The XML standard defines the following Unicode character ranges as - * valid XML characters: + * The XML standard defines the following Unicode character ranges as valid XML characters: + * *

  * #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
  * 
*

- * Note that currently this class only detects those invalid characters whose - * UTF-16 representation fits a single char. Also, this class does not ensure - * that the UTF-16 encoding of incoming characters is correct. + * Note that currently this class only detects those invalid characters whose UTF-16 representation + * fits a single char. Also, this class does not ensure that the UTF-16 encoding of incoming + * characters is correct. */ public class SafeContentHandler extends ContentHandlerDecorator { /** * Replacement for invalid characters. */ - private static final char[] REPLACEMENT = new char[]{'\ufffd'}; + private static final char[] REPLACEMENT = new char[] {'\ufffd'}; /** - * Output through the {@link ContentHandler#characters(char[], int, int)} - * method of the decorated content handler. + * Output through the {@link ContentHandler#characters(char[], int, int)} method of the + * decorated content handler. */ private final Output charactersOutput = SafeContentHandler.super::characters; /** - * Output through the - * {@link ContentHandler#ignorableWhitespace(char[], int, int)} - * method of the decorated content handler. + * Output through the {@link ContentHandler#ignorableWhitespace(char[], int, int)} method of the + * decorated content handler. */ private final Output ignorableWhitespaceOutput = SafeContentHandler.super::ignorableWhitespace; @@ -67,13 +62,12 @@ public SafeContentHandler(ContentHandler handler) { } /** - * Filters and outputs the contents of the given input buffer. Any - * invalid characters in the input buffer area handled by sending a - * replacement (a space character) to the given output. Any sequences - * of valid characters are passed as-is to the given output. + * Filters and outputs the contents of the given input buffer. Any invalid characters in the + * input buffer area handled by sending a replacement (a space character) to the given output. + * Any sequences of valid characters are passed as-is to the given output. 
* - * @param ch input buffer - * @param start start offset within the buffer + * @param ch input buffer + * @param start start offset within the buffer * @param length number of characters to read from the buffer * @param output output channel * @throws SAXException if the filtered characters could not be written out @@ -110,8 +104,8 @@ private void filter(char[] ch, int start, int length, Output output) throws SAXE * Checks if the given string contains any invalid XML characters. * * @param value string to be checked - * @return true if the string contains invalid XML characters, - * false otherwise + * @return true if the string contains invalid XML characters, false + * otherwise */ private boolean isInvalid(String value) { char[] ch = value.toCharArray(); @@ -129,17 +123,17 @@ private boolean isInvalid(String value) { } /** - * Checks whether the given Unicode character is an invalid XML character - * and should be replaced for output. Subclasses can override this method - * to use an alternative definition of which characters should be replaced - * in the XML output. The default definition from the XML specification is: + * Checks whether the given Unicode character is an invalid XML character and should be replaced + * for output. Subclasses can override this method to use an alternative definition of which + * characters should be replaced in the XML output. The default definition from the XML + * specification is: + * *

      * Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
      * 
* * @param ch character - * @return true if the character should be replaced, - * false otherwise + * @return true if the character should be replaced, false otherwise */ protected boolean isInvalid(int ch) { if (ch < 0x20) { @@ -154,8 +148,8 @@ protected boolean isInvalid(int ch) { } /** - * Outputs the replacement for an invalid character. Subclasses can - * override this method to use a custom replacement. + * Outputs the replacement for an invalid character. Subclasses can override this method to use + * a custom replacement. * * @param output where the replacement is written to * @throws SAXException if the replacement could not be written @@ -166,10 +160,10 @@ protected void writeReplacement(Output output) throws SAXException { @Override public void startElement(String uri, String localName, String name, Attributes atts) - throws SAXException { + throws SAXException { // TODO: enable this, but some parsers currently // trip it - //assert verifyStartElement(name); + // assert verifyStartElement(name); // Look for any invalid characters in attribute values. 
for (int i = 0; i < atts.getLength(); i++) { if (isInvalid(atts.getValue(i))) { @@ -184,7 +178,7 @@ public void startElement(String uri, String localName, String name, Attributes a value = buffer.toString(); } filtered.addAttribute(atts.getURI(j), atts.getLocalName(j), atts.getQName(j), - atts.getType(j), value); + atts.getType(j), value); } atts = filtered; break; @@ -197,51 +191,37 @@ public void startElement(String uri, String localName, String name, Attributes a public void endElement(String uri, String localName, String name) throws SAXException { // TODO: enable this, but some parsers currently // trip it - //assert verifyEndElement(name); + // assert verifyEndElement(name); super.endElement(uri, localName, name); } /* - private final List elements = new ArrayList(); - - // Called only from assert - private boolean verifyStartElement(String name) { - // TODO: we could strengthen this to do full - // XTHML validation, eg you shouldn't start p inside - // another p (but ODF parser, at least, seems to - // violate this): - //if (name.equals("p")) { - //assert elements.size() == 0 || !elements.get(elements.size()-1).equals("p"); - //} - elements.add(name); - return true; - } - - // Called only from assert - private boolean verifyEndElement(String name) { - assert elements.size() > 0: "end tag=" + name + " with no startElement"; - final String currentElement = elements.get(elements.size()-1); - assert currentElement.equals(name): "mismatched elements open=" + - currentElement + " close=" + name; - elements.remove(elements.size()-1); - return true; - } - - // Called only from assert - private boolean verifyEndDocument() { - assert elements.size() == 0; - return true; - } - */ + * private final List elements = new ArrayList(); + * + * // Called only from assert private boolean verifyStartElement(String name) { // TODO: we + * could strengthen this to do full // XTHML validation, eg you shouldn't start p inside // + * another p (but ODF parser, at least, seems to // 
violate this): //if (name.equals("p")) { + * //assert elements.size() == 0 || !elements.get(elements.size()-1).equals("p"); //} + * elements.add(name); return true; } + * + * // Called only from assert private boolean verifyEndElement(String name) { assert + * elements.size() > 0: "end tag=" + name + " with no startElement"; final String currentElement + * = elements.get(elements.size()-1); assert currentElement.equals(name): + * "mismatched elements open=" + currentElement + " close=" + name; + * elements.remove(elements.size()-1); return true; } + * + * // Called only from assert private boolean verifyEndDocument() { assert elements.size() == 0; + * return true; } + */ - //------------------------------------------------------< ContentHandler > + // ------------------------------------------------------< ContentHandler > @Override public void endDocument() throws SAXException { // TODO: enable this, but some parsers currently // trip it - //assert verifyEndDocument(); + // assert verifyEndDocument(); super.endDocument(); } @@ -256,8 +236,8 @@ public void ignorableWhitespace(char[] ch, int start, int length) throws SAXExce } /** - * Internal interface that allows both character and - * ignorable whitespace content to be filtered the same way. + * Internal interface that allows both character and ignorable whitespace content to be filtered + * the same way. */ protected interface Output { void write(char[] ch, int start, int length) throws SAXException; diff --git a/tika-core/src/main/java/org/apache/tika/sax/SecureContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/SecureContentHandler.java index fa9d682e89..15f674f4f6 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/SecureContentHandler.java +++ b/tika-core/src/main/java/org/apache/tika/sax/SecureContentHandler.java @@ -1,39 +1,34 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.sax; import java.io.IOException; import java.util.LinkedList; - +import org.apache.tika.exception.TikaException; +import org.apache.tika.io.TikaInputStream; import org.xml.sax.Attributes; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; -import org.apache.tika.exception.TikaException; -import org.apache.tika.io.TikaInputStream; - /** - * Content handler decorator that attempts to prevent denial of service - * attacks against Tika parsers. 
+ * Content handler decorator that attempts to prevent denial of service attacks against Tika + * parsers. *

- * Currently this class simply compares the number of output characters - * to to the number of input bytes and keeps track of the XML nesting levels. - * An exception gets thrown if the output seems excessive compared to the - * input document. This is a strong indication of a zip bomb. + * Currently this class simply compares the number of output characters to the number of input + * bytes and keeps track of the XML nesting levels. An exception gets thrown if the output seems + * excessive compared to the input document. This is a strong indication of a zip bomb. * * @see TIKA-216 * @since Apache Tika 0.4 @@ -77,13 +72,12 @@ public class SecureContentHandler extends ContentHandlerDecorator { private int maxPackageEntryDepth = 10; /** - * Decorates the given content handler with zip bomb prevention based - * on the count of bytes read from the given counting input stream. - * The resulting decorator can be passed to a Tika parser along with - * the given counting input stream. + * Decorates the given content handler with zip bomb prevention based on the count of bytes read + * from the given counting input stream. The resulting decorator can be passed to a Tika parser + * along with the given counting input stream. * * @param handler the content handler to be decorated - * @param stream the input stream to be parsed + * @param stream the input stream to be parsed */ public SecureContentHandler(ContentHandler handler, TikaInputStream stream) { super(handler); @@ -101,10 +95,9 @@ public long getOutputThreshold() { /** - * Sets the threshold for output characters before the zip bomb prevention - * is activated. This avoids false positives in cases where an otherwise - * normal document for some reason starts with a highly compressible - * sequence of bytes. + * Sets the threshold for output characters before the zip bomb prevention is activated. 
This + * avoids false positives in cases where an otherwise normal document for some reason starts + * with a highly compressible sequence of bytes. * * @param threshold new output threshold */ @@ -124,9 +117,8 @@ public long getMaximumCompressionRatio() { /** - * Sets the ratio between output characters and input bytes. If this - * ratio is exceeded (after the output threshold has been reached) then - * an exception gets thrown. + * Sets the ratio between output characters and input bytes. If this ratio is exceeded (after + * the output threshold has been reached) then an exception gets thrown. * * @param ratio new maximum compression ratio */ @@ -144,8 +136,8 @@ public int getMaximumDepth() { } /** - * Sets the maximum XML element nesting level. If this depth level is - * exceeded then an exception gets thrown. + * Sets the maximum XML element nesting level. If this depth level is exceeded then an exception + * gets thrown. * * @param depth maximum XML element nesting level */ @@ -163,8 +155,8 @@ public int getMaximumPackageEntryDepth() { } /** - * Sets the maximum package entry nesting level. If this depth level is - * exceeded then an exception gets thrown. + * Sets the maximum package entry nesting level. If this depth level is exceeded then an + * exception gets thrown. * * @param depth maximum package entry nesting level */ @@ -173,9 +165,8 @@ public void setMaximumPackageEntryDepth(int depth) { } /** - * Converts the given {@link SAXException} to a corresponding - * {@link TikaException} if it's caused by this instance detecting - * a zip bomb. + * Converts the given {@link SAXException} to a corresponding {@link TikaException} if it's + * caused by this instance detecting a zip bomb. * * @param e SAX exception * @throws TikaException zip bomb exception @@ -199,9 +190,9 @@ private long getByteCount() throws SAXException { } /** - * Records the given number of output characters (or more accurately - * UTF-16 code units). 
Throws an exception if the recorded number of - * characters highly exceeds the number of input bytes read. + * Records the given number of output characters (or more accurately UTF-16 code units). Throws + * an exception if the recorded number of characters highly exceeds the number of input bytes + * read. * * @param length number of new output characters produced * @throws SAXException if a zip bomb is detected @@ -211,25 +202,26 @@ protected void advance(int length) throws SAXException { if (characterCount > threshold) { long byteCount = getByteCount(); if (characterCount > byteCount * ratio) { - throw new SecureSAXException("Suspected zip bomb: " + byteCount + " input bytes produced " + characterCount + " output characters"); + throw new SecureSAXException("Suspected zip bomb: " + byteCount + + " input bytes produced " + characterCount + " output characters"); } } } @Override public void startElement(String uri, String localName, String name, Attributes atts) - throws SAXException { + throws SAXException { currentDepth++; if (currentDepth >= maxDepth) { - throw new SecureSAXException( - "Suspected zip bomb: " + currentDepth + " levels of XML element nesting"); + throw new SecureSAXException("Suspected zip bomb: " + currentDepth + + " levels of XML element nesting"); } if ("div".equals(name) && "package-entry".equals(atts.getValue("class"))) { packageEntryDepths.addLast(currentDepth); if (packageEntryDepths.size() >= maxPackageEntryDepth) { - throw new SecureSAXException("Suspected zip bomb: " + packageEntryDepths.size() + - " levels of package entry nesting"); + throw new SecureSAXException("Suspected zip bomb: " + packageEntryDepths.size() + + " levels of package entry nesting"); } } diff --git a/tika-core/src/main/java/org/apache/tika/sax/StandardOrganizations.java b/tika-core/src/main/java/org/apache/tika/sax/StandardOrganizations.java index c8e89a06ab..fdbec4eae9 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/StandardOrganizations.java +++ 
b/tika-core/src/main/java/org/apache/tika/sax/StandardOrganizations.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.sax; @@ -21,12 +19,12 @@ import java.util.TreeMap; /** - * This class provides a collection of the most important technical standard organizations. 
- * The collection of standard organizations has been obtained from + * This class provides a collection of the most important technical standard organizations. The + * collection of standard organizations has been obtained from * Wikipedia. - * Currently, the list is composed of the most important international standard organizations, - * the regional standard organizations (i.e., Africa, Americas, Asia Pacific, Europe, and Middle - * East), and British and American standard organizations among the national-based ones. + * Currently, the list is composed of the most important international standard organizations, the + * regional standard organizations (i.e., Africa, Americas, Asia Pacific, Europe, and Middle East), + * and British and American standard organizations among the national-based ones. */ public class StandardOrganizations { @@ -34,28 +32,28 @@ public class StandardOrganizations { static { organizations = new TreeMap<>(); - //International standard organizations + // International standard organizations organizations.put("3GPP", "3rd Generation Partnership Project"); organizations.put("3GPP2", "3rd Generation Partnership Project 2"); organizations.put("Accellera", "Accellera Organization"); organizations.put("A4L", - "Access for Learning Community (formerly known as the Schools Interoperability " + - "Framework)"); + "Access for Learning Community (formerly known as the Schools Interoperability " + + "Framework)"); organizations.put("AES", "Audio Engineering Society"); organizations.put("AIIM", "Association for Information and Image Management"); organizations.put("ASAM", - "Association for Automation and Measuring Systems - Automotive technology"); + "Association for Automation and Measuring Systems - Automotive technology"); organizations.put("ASHRAE", - "American Society of Heating, Refrigerating and Air-Conditioning Engineers " + - "(ASHRAE is an international organization, despite its name)"); + "American Society of Heating, Refrigerating and 
Air-Conditioning Engineers " + + "(ASHRAE is an international organization, despite its name)"); organizations.put("ASME", "formerly The American Society of Mechanical Engineers"); - organizations - .put("ASTM", "ASTM (American Society for Testing and Materials) International"); + organizations.put("ASTM", + "ASTM (American Society for Testing and Materials) International"); organizations.put("ATIS", "Alliance for Telecommunications Industry Solutions"); organizations.put("AUTOSAR", "Automotive technology"); organizations.put("BIPM, CGPM, and CIPM", - "Bureau International des Poids et Mesures and the related organizations " + - "established under the Metre Convention of 1875."); + "Bureau International des Poids et Mesures and the related organizations " + + "established under the Metre Convention of 1875."); organizations.put("CableLabs", "Cable Television Laboratories"); organizations.put("CCSDS", "Consultative Committee for Space Data Sciences"); organizations.put("CISPR", "International Special Committee on Radio Interference"); @@ -67,8 +65,8 @@ public class StandardOrganizations { organizations.put("FAI", "Fédération Aéronautique Internationale"); organizations.put("GlobalPlatform", "Secure element and TEE standards"); organizations.put("GS1", - "Global supply chain standards (identification numbers, barcodes, electronic " + - "commerce transactions, RFID)"); + "Global supply chain standards (identification numbers, barcodes, electronic " + + "commerce transactions, RFID)"); organizations.put("HGI", "Home Gateway Initiative"); organizations.put("HFSB", "Hedge Fund Standards Board"); organizations.put("IATA", "International Air Transport Association"); @@ -87,27 +85,27 @@ public class StandardOrganizations { organizations.put("ITU", "The International Telecommunication Union"); organizations.put("ITU-R", "ITU Radiocommunications Sector (formerly known as CCIR)"); organizations.put("CCIR", - "Comité Consultatif International pour la Radio, a forerunner of the 
ITU-R"); + "Comité Consultatif International pour la Radio, a forerunner of the ITU-R"); organizations.put("ITU-T", "ITU Telecommunications Sector (formerly known as CCITT)"); organizations.put("CCITT", - "Comité Consultatif International Téléphonique et Télégraphique, renamed ITU-T in" + - " 1993"); + "Comité Consultatif International Téléphonique et Télégraphique, renamed ITU-T in" + + " 1993"); organizations.put("ITU-D", "ITU Telecom Development (formerly known as BDT)"); organizations.put("BDT", "Bureau de développement des télécommunications, renamed ITU-D"); organizations.put("IUPAC", "International Union of Pure and Applied Chemistry"); organizations.put("Liberty Alliance", "Liberty Alliance"); organizations.put("Media Grid", "Media Grid Standards Organization"); organizations.put("NACE International", - "Formerly known as National Association of Corrosion Engineers"); + "Formerly known as National Association of Corrosion Engineers"); organizations.put("OASIS", - "Organization for the Advancement of Structured Information Standards"); + "Organization for the Advancement of Structured Information Standards"); organizations.put("OGC", "Open Geospatial Consortium"); organizations.put("OHICC", "Organization of Hotel Industry Classification & Certification"); organizations.put("OMA", "Open Mobile Alliance"); organizations.put("OMG", "Object Management Group"); organizations.put("OGF", - "Open Grid Forum (merger of Global Grid Forum (GGF) and Enterprise Grid Alliance " + - "(EGA))"); + "Open Grid Forum (merger of Global Grid Forum (GGF) and Enterprise Grid Alliance " + + "(EGA))"); organizations.put("GGF", "Global Grid Forum"); organizations.put("EGA", "Enterprise Grid Alliance"); organizations.put("OpenTravel Alliance", "OpenTravel Alliance (previously known as OTA)"); @@ -131,37 +129,37 @@ public class StandardOrganizations { organizations.put("WHO", "World Health Organization"); organizations.put("XSF", "The XMPP Standards Foundation"); 
organizations.put("FAO", "Food and Agriculture Organization"); - //Regional standards organizations - //Africa + // Regional standards organizations + // Africa organizations.put("ARSO", "African Regional Organization for Standarization"); organizations.put("SADCSTAN", - "Southern African Development Community (SADC) Cooperation in Standarization"); - //Americas + "Southern African Development Community (SADC) Cooperation in Standarization"); + // Americas organizations.put("COPANT", "Pan American Standards Commission"); organizations.put("AMN", "MERCOSUR Standardization Association"); organizations.put("CROSQ", "CARICOM Regional Organization for Standards and Quality"); organizations.put("AAQG", "America's Aerospace Quality Group"); - //Asia Pacific + // Asia Pacific organizations.put("PASC", "Pacific Area Standards Congress"); organizations.put("ACCSQ", "ASEAN Consultative Committee for Standards and Quality"); - //Europe + // Europe organizations.put("RoyalCert", "RoyalCert International Registrars"); organizations.put("CEN", "European Committee for Standardization"); organizations.put("CENELEC", "European Committee for Electrotechnical Standardization"); organizations.put("URS", "United Registrar of Systems, UK"); organizations.put("ETSI", "European Telecommunications Standards Institute"); - organizations - .put("EASC", "Euro-Asian Council for Standardization, Metrology and Certification"); - organizations - .put("IRMM", "Institute for Reference Materials and Measurements (European Union)"); - //Middle East + organizations.put("EASC", + "Euro-Asian Council for Standardization, Metrology and Certification"); + organizations.put("IRMM", + "Institute for Reference Materials and Measurements (European Union)"); + // Middle East organizations.put("AIDMO", "Arab Industrial Development and Mining Organization"); organizations.put("IAU", "International Arabic Union"); - //Nationally-based standards organizations - //United Kingdom + // Nationally-based standards 
organizations + // United Kingdom organizations.put("BSI", "British Standards Institution aka BSI Group"); organizations.put("DStan", "UK Defence Standardization"); - //United States of America + // United States of America organizations.put("ANSI", "American National Standards Institute"); organizations.put("ACI", "American Concrete Institute"); organizations.put("NIST", "National Institute of Standards and Technology"); @@ -172,7 +170,7 @@ public class StandardOrganizations { * organizations. * * @return the map containing the collection of the most important technical standard - * organizations. + * organizations. */ public static Map getOrganizations() { return organizations; @@ -183,7 +181,7 @@ public static Map getOrganizations() { * organizations. * * @return the regular expression containing the most important technical standard - * organizations. + * organizations. */ public static String getOrganzationsRegex() { diff --git a/tika-core/src/main/java/org/apache/tika/sax/StandardReference.java b/tika-core/src/main/java/org/apache/tika/sax/StandardReference.java index 243a031c95..b0e5c4d356 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/StandardReference.java +++ b/tika-core/src/main/java/org/apache/tika/sax/StandardReference.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. 
The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.sax; @@ -28,7 +26,7 @@ public class StandardReference { private double score; private StandardReference(String mainOrganizationAcronym, String separator, - String secondOrganizationAcronym, String identifier, double score) { + String secondOrganizationAcronym, String identifier, double score) { super(); this.mainOrganization = mainOrganizationAcronym; this.separator = separator; @@ -106,7 +104,7 @@ public StandardReferenceBuilder(String mainOrganization, String identifier) { } public StandardReferenceBuilder setSecondOrganization(String separator, - String secondOrganization) { + String secondOrganization) { this.separator = separator; this.secondOrganization = secondOrganization; return this; @@ -119,7 +117,7 @@ public StandardReferenceBuilder setScore(double score) { public StandardReference build() { return new StandardReference(mainOrganization, separator, secondOrganization, - identifier, score); + identifier, score); } } } diff --git 
a/tika-core/src/main/java/org/apache/tika/sax/StandardsExtractingContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/StandardsExtractingContentHandler.java index 006034a01d..c616421b90 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/StandardsExtractingContentHandler.java +++ b/tika-core/src/main/java/org/apache/tika/sax/StandardsExtractingContentHandler.java @@ -1,37 +1,32 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.sax; import java.util.Arrays; import java.util.List; - +import org.apache.tika.metadata.Metadata; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; import org.xml.sax.helpers.DefaultHandler; -import org.apache.tika.metadata.Metadata; - /** - * StandardsExtractingContentHandler is a Content Handler used to extract - * standard references while parsing. + * StandardsExtractingContentHandler is a Content Handler used to extract standard references while + * parsing. *

- * This handler relies on complex regular expressions which can be slow on some types of - * input data. + * This handler relies on complex regular expressions which can be slow on some types of input data. */ public class StandardsExtractingContentHandler extends ContentHandlerDecorator { public static final String STANDARD_REFERENCES = "standard_references"; @@ -44,7 +39,7 @@ public class StandardsExtractingContentHandler extends ContentHandlerDecorator { /** * Creates a decorator for the given SAX event handler and Metadata object. * - * @param handler SAX event handler to be decorated. + * @param handler SAX event handler to be decorated. * @param metadata {@link Metadata} object. */ public StandardsExtractingContentHandler(ContentHandler handler, Metadata metadata) { @@ -54,22 +49,21 @@ public StandardsExtractingContentHandler(ContentHandler handler, Metadata metada } /** - * Creates a decorator that by default forwards incoming SAX events to a - * dummy content handler that simply ignores all the events. Subclasses - * should use the {@link #setContentHandler(ContentHandler)} method to - * switch to a more usable underlying content handler. Also creates a dummy - * Metadata object to store phone numbers in. + * Creates a decorator that by default forwards incoming SAX events to a dummy content handler + * that simply ignores all the events. Subclasses should use the + * {@link #setContentHandler(ContentHandler)} method to switch to a more usable underlying + * content handler. Also creates a dummy Metadata object to store standard references in. */ protected StandardsExtractingContentHandler() { this(new DefaultHandler(), new Metadata()); } /** - * Gets the threshold to be used for selecting the standard references found - * within the text based on their score. + * Gets the threshold to be used for selecting the standard references found within the text + * based on their score. 
* - * @return the threshold to be used for selecting the standard references - * found within the text based on their score. + * @return the threshold to be used for selecting the standard references found within the text + * based on their score. */ public double getThreshold() { return threshold; @@ -85,11 +79,10 @@ public void setThreshold(double score) { } /** - * The characters method is called whenever a Parser wants to pass raw - * characters to the ContentHandler. However, standard references are often - * split across different calls to characters, depending on the specific - * Parser used. Therefore, we simply add all characters to a StringBuilder - * and analyze it once the document is finished. + * The characters method is called whenever a Parser wants to pass raw characters to the + * ContentHandler. However, standard references are often split across different calls to + * characters, depending on the specific Parser used. Therefore, we simply add all characters to + * a StringBuilder and analyze it once the document is finished. */ @Override public void characters(char[] ch, int start, int length) throws SAXException { @@ -107,14 +100,14 @@ public void characters(char[] ch, int start, int length) throws SAXException { } /** - * This method is called whenever the Parser is done parsing the file. So, - * we check the output for any standard references. + * This method is called whenever the Parser is done parsing the file. So, we check the output + * for any standard references. 
*/ @Override public void endDocument() throws SAXException { super.endDocument(); - List standards = - StandardsText.extractStandardReferences(stringBuilder.toString(), threshold); + List standards = StandardsText + .extractStandardReferences(stringBuilder.toString(), threshold); for (StandardReference standardReference : standards) { metadata.add(STANDARD_REFERENCES, standardReference.toString()); } @@ -124,8 +117,8 @@ public void endDocument() throws SAXException { /** * The number of characters to store in memory for checking for standards. * - * If this is unbounded, the complex regular expressions can take a long time - * to process some types of data. Only increase this limit with great caution. + * If this is unbounded, the complex regular expressions can take a long time to process some + * types of data. Only increase this limit with great caution. */ public void setMaxBufferLength(int maxBufferLength) { this.maxBufferLength = maxBufferLength; diff --git a/tika-core/src/main/java/org/apache/tika/sax/StandardsText.java b/tika-core/src/main/java/org/apache/tika/sax/StandardsText.java index 697eedee91..80f8d1ae3d 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/StandardsText.java +++ b/tika-core/src/main/java/org/apache/tika/sax/StandardsText.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. 
The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.sax; @@ -24,28 +22,24 @@ import java.util.TreeMap; import java.util.regex.Matcher; import java.util.regex.Pattern; - import org.apache.tika.sax.StandardReference.StandardReferenceBuilder; /** - * StandardText relies on regular expressions to extract standard references - * from text. + * StandardText relies on regular expressions to extract standard references from text. * *

- * This class helps to find the standard references from text by performing the - * following steps: + * This class helps to find the standard references from text by performing the following steps: *

    *
  1. searches for headers;
  2. - *
  3. searches for patterns that are supposed to be standard references - * (basically, every string mostly composed of uppercase letters followed by an - * alphanumeric characters);
  4. + *
  5. searches for patterns that are supposed to be standard references (basically, every string + * mostly composed of uppercase letters followed by an alphanumeric characters);
  6. *
  7. each potential standard reference starts with score equal to 0.25;
  8. - *
  9. increases by 0.25 the score of references which include the name of a - * known standard organization ({@link StandardOrganizations});
  10. - *
  11. increases by 0.25 the score of references which include the word - * Publication or Standard;
  12. - *
  13. increases by 0.25 the score of references which have been found within - * "Applicable Documents" and equivalent sections;
  14. + *
  15. increases by 0.25 the score of references which include the name of a known standard + * organization ({@link StandardOrganizations});
  16. + *
  17. increases by 0.25 the score of references which include the word Publication or + * Standard;
  18. + *
  19. increases by 0.25 the score of references which have been found within "Applicable Documents" + * and equivalent sections;
  20. *
  21. returns the standard references along with scores.
  22. *
*

@@ -53,18 +47,18 @@ public class StandardsText { // Regular expression to match uppercase headers private static final String REGEX_HEADER = - "(\\d{1,10}+\\.(\\d{1,10}+\\.?){0,10}+)\\p{Blank}+([A-Z]{1,64}+(\\s[A-Z]{1,64}+){0," + - "256}+){5,10}+"; + "(\\d{1,10}+\\.(\\d{1,10}+\\.?){0,10}+)\\p{Blank}+([A-Z]{1,64}+(\\s[A-Z]{1,64}+){0," + + "256}+){5,10}+"; // Regular expression to match the "APPLICABLE DOCUMENTS" and equivalent // sections private static final String REGEX_APPLICABLE_DOCUMENTS = - "(?i:.*APPLICABLE\\sDOCUMENTS|REFERENCE|STANDARD|REQUIREMENT|GUIDELINE|COMPLIANCE.*)"; + "(?i:.*APPLICABLE\\sDOCUMENTS|REFERENCE|STANDARD|REQUIREMENT|GUIDELINE|COMPLIANCE.*)"; // Regular expression to match the alphanumeric identifier of the standard private static final String REGEX_IDENTIFIER = - "(?([0-9]{3,64}+|([A-Z]{1,64}+(-|_|\\.)?[0-9]{2,64}+))((-|_|\\.)" + - "?[A-Z0-9]{1,64}+){0,64}+)"; + "(?([0-9]{3,64}+|([A-Z]{1,64}+(-|_|\\.)?[0-9]{2,64}+))((-|_|\\.)" + + "?[A-Z0-9]{1,64}+){0,64}+)"; // Regular expression to match the standard organization private static final String REGEX_ORGANIZATION = StandardOrganizations.getOrganzationsRegex(); @@ -75,29 +69,28 @@ public class StandardsText { // Regular expression to match a string that is supposed to be a standard // reference - private static final String REGEX_FALLBACK = "\\(?" + "(?[A-Z]\\w{1,64}+)" + - "\\)?((\\s?(?\\/)\\s?)(\\w{1,64}+\\s)*\\(?" + "(?[A-Z" + - "]\\w{1,64}+)" + - "\\)?)?" + REGEX_STANDARD_TYPE + "?" + "(-|\\s)?" + REGEX_IDENTIFIER; + private static final String REGEX_FALLBACK = "\\(?" + "(?[A-Z]\\w{1,64}+)" + + "\\)?((\\s?(?\\/)\\s?)(\\w{1,64}+\\s)*\\(?" + + "(?[A-Z" + "]\\w{1,64}+)" + "\\)?)?" + REGEX_STANDARD_TYPE + + "?" + "(-|\\s)?" 
+ REGEX_IDENTIFIER; // Regular expression to match the standard organization within a string // that is supposed to be a standard reference private static final String REGEX_STANDARD = - ".*" + REGEX_ORGANIZATION + ".+" + REGEX_ORGANIZATION + "?.*"; + ".*" + REGEX_ORGANIZATION + ".+" + REGEX_ORGANIZATION + "?.*"; /** * Extracts the standard references found within the given text. * - * @param text the text from which the standard references are extracted. - * @param threshold the lower bound limit to be used in order to select only the - * standard references with score greater than or equal to the - * threshold. For instance, using a threshold of 0.75 means that - * only the patterns with score greater than or equal to 0.75 - * will be returned. + * @param text the text from which the standard references are extracted. + * @param threshold the lower bound limit to be used in order to select only the standard + * references with score greater than or equal to the threshold. For instance, using a + * threshold of 0.75 means that only the patterns with score greater than or equal to + * 0.75 will be returned. * @return the list of standard references extracted from the given text. */ public static ArrayList extractStandardReferences(String text, - double threshold) { + double threshold) { Map headers = findHeaders(text); return findStandards(text, headers, threshold); @@ -125,16 +118,14 @@ private static Map findHeaders(String text) { /** * This method helps to find the standard references within the given text. * - * @param text the text from which the standards references are extracted. - * @param headers the list of headers found within the given text. - * @param threshold the lower bound limit to be used in order to select only the - * standard references with score greater than or equal to the - * threshold. + * @param text the text from which the standards references are extracted. + * @param headers the list of headers found within the given text. 
+ * @param threshold the lower bound limit to be used in order to select only the standard + * references with score greater than or equal to the threshold. * @return the list of standard references extracted from the given text. */ private static ArrayList findStandards(String text, - Map headers, - double threshold) { + Map headers, double threshold) { ArrayList standards = new ArrayList<>(); double score = 0; @@ -143,9 +134,9 @@ private static ArrayList findStandards(String text, while (matcher.find()) { StandardReferenceBuilder builder = new StandardReference.StandardReferenceBuilder( - matcher.group("mainOrganization"), matcher.group("identifier")) - .setSecondOrganization(matcher.group("separator"), - matcher.group("secondOrganization")); + matcher.group("mainOrganization"), matcher.group("identifier")) + .setSecondOrganization(matcher.group("separator"), + matcher.group("secondOrganization")); score = 0.25; // increases by 0.25 the score of references which include the name of a known diff --git a/tika-core/src/main/java/org/apache/tika/sax/StoppingEarlyException.java b/tika-core/src/main/java/org/apache/tika/sax/StoppingEarlyException.java index c79dd80a7a..c9e2ae1067 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/StoppingEarlyException.java +++ b/tika-core/src/main/java/org/apache/tika/sax/StoppingEarlyException.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. 
See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.sax; @@ -20,9 +18,8 @@ import org.xml.sax.SAXException; /** - * Sentinel exception to stop parsing xml once target is found - * while SAX parsing. This should be used when the parse - * can be stopped and the exception ignored. + * Sentinel exception to stop parsing xml once target is found while SAX parsing. This should be + * used when the parse can be stopped and the exception ignored. 
*/ public class StoppingEarlyException extends SAXException { diff --git a/tika-core/src/main/java/org/apache/tika/sax/TaggedContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/TaggedContentHandler.java index fea0b83890..f0b7d6af74 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/TaggedContentHandler.java +++ b/tika-core/src/main/java/org/apache/tika/sax/TaggedContentHandler.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. 
See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.sax; @@ -20,10 +18,10 @@ import org.xml.sax.SAXException; /** - * A content handler decorator that tags potential exceptions so that the - * handler that caused the exception can easily be identified. This is - * done by using the {@link TaggedSAXException} class to wrap all thrown - * {@link SAXException}s. See below for an example of using this class. + * A content handler decorator that tags potential exceptions so that the handler that caused the + * exception can easily be identified. This is done by using the {@link TaggedSAXException} class to + * wrap all thrown {@link SAXException}s. See below for an example of using this class. + * *
  * TaggedContentHandler handler = new TaggedContentHandler(...);
  * try {
@@ -40,10 +38,10 @@
  * }
  * 
*

- * Alternatively, the {@link #throwIfCauseOf(Exception)} method can be - * used to let higher levels of code handle the exception caused by this - * stream while other processing errors are being taken care of at this - * lower level. + * Alternatively, the {@link #throwIfCauseOf(Exception)} method can be used to let higher levels of + * code handle the exception caused by this stream while other processing errors are being taken + * care of at this lower level. + * *

  * TaggedContentHandler handler = new TaggedContentHandler(...);
  * try {
@@ -71,8 +69,8 @@ public TaggedContentHandler(ContentHandler proxy) {
      * Tests if the given exception was caused by this handler.
      *
      * @param exception an exception
-     * @return true if the exception was thrown by this handler,
-     * false otherwise
+     * @return true if the exception was thrown by this handler, false
+     *         otherwise
      */
     public boolean isCauseOf(SAXException exception) {
         if (exception instanceof TaggedSAXException) {
@@ -84,11 +82,10 @@ public boolean isCauseOf(SAXException exception) {
     }
 
     /**
-     * Re-throws the original exception thrown by this handler. This method
-     * first checks whether the given exception is a {@link TaggedSAXException}
-     * wrapper created by this decorator, and then unwraps and throws the
-     * original wrapped exception. Returns normally if the exception was
-     * not thrown by this handler.
+     * Re-throws the original exception thrown by this handler. This method first checks whether the
+     * given exception is a {@link TaggedSAXException} wrapper created by this decorator, and then
+     * unwraps and throws the original wrapped exception. Returns normally if the exception was not
+     * thrown by this handler.
      *
      * @param exception an exception
      * @throws SAXException original exception, if any, thrown by this handler
diff --git a/tika-core/src/main/java/org/apache/tika/sax/TaggedSAXException.java b/tika-core/src/main/java/org/apache/tika/sax/TaggedSAXException.java
index 7697cc6ea0..41ff1f9aae 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/TaggedSAXException.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/TaggedSAXException.java
@@ -1,27 +1,25 @@
 /*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
+ * Licensed to the Apache Software Foundation (ASF) under one or more contributor license
+ * agreements. See the NOTICE file distributed with this work for additional information regarding
+ * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License. You may obtain a
+ * copy of the License at
  *
- *      http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
  *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
  */
 package org.apache.tika.sax;
 
 import org.xml.sax.SAXException;
 
 /**
- * A {@link SAXException} wrapper that tags the wrapped exception with
- * a given object reference. Both the tag and the wrapped original exception
- * can be used to determine further processing when this exception is caught.
+ * A {@link SAXException} wrapper that tags the wrapped exception with a given object reference.
+ * Both the tag and the wrapped original exception can be used to determine further processing when
+ * this exception is caught.
  */
 public class TaggedSAXException extends SAXException {
 
@@ -34,7 +32,7 @@ public class TaggedSAXException extends SAXException {
      * Creates a tagged wrapper for the given exception.
      *
      * @param original the exception to be tagged
-     * @param tag      tag object
+     * @param tag tag object
      */
     public TaggedSAXException(SAXException original, Object tag) {
         super(original.getMessage(), original);
diff --git a/tika-core/src/main/java/org/apache/tika/sax/TeeContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/TeeContentHandler.java
index c54e04fa5b..7fba4ba072 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/TeeContentHandler.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/TeeContentHandler.java
@@ -1,18 +1,16 @@
 /*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
+ * Licensed to the Apache Software Foundation (ASF) under one or more contributor license
+ * agreements. See the NOTICE file distributed with this work for additional information regarding
+ * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License. You may obtain a
+ * copy of the License at
  *
- *     http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
  *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
  */
 package org.apache.tika.sax;
 
@@ -23,8 +21,8 @@
 import org.xml.sax.helpers.DefaultHandler;
 
 /**
- * Content handler proxy that forwards the received SAX events to zero or
- * more underlying content handlers.
+ * Content handler proxy that forwards the received SAX events to zero or more underlying content
+ * handlers.
  */
 public class TeeContentHandler extends DefaultHandler {
 
@@ -78,7 +76,7 @@ public void endDocument() throws SAXException {
 
     @Override
     public void startElement(String uri, String localName, String name, Attributes atts)
-            throws SAXException {
+                    throws SAXException {
         for (ContentHandler handler : handlers) {
             handler.startElement(uri, localName, name, atts);
         }
diff --git a/tika-core/src/main/java/org/apache/tika/sax/TextAndAttributeContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/TextAndAttributeContentHandler.java
index ff20829dc8..971d96e8bc 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/TextAndAttributeContentHandler.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/TextAndAttributeContentHandler.java
@@ -1,18 +1,16 @@
 /*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
+ * Licensed to the Apache Software Foundation (ASF) under one or more contributor license
+ * agreements. See the NOTICE file distributed with this work for additional information regarding
+ * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License. You may obtain a
+ * copy of the License at
  *
- *     http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
  *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
  */
 package org.apache.tika.sax;
 
@@ -30,13 +28,13 @@ public TextAndAttributeContentHandler(ContentHandler delegate) {
     }
 
     public TextAndAttributeContentHandler(ContentHandler delegate,
-                                          boolean addSpaceBetweenElements) {
+                    boolean addSpaceBetweenElements) {
         super(delegate, addSpaceBetweenElements);
     }
 
     @Override
     public void startElement(String uri, String localName, String qName, Attributes attributes)
-            throws SAXException {
+                    throws SAXException {
         super.startElement(uri, localName, qName, attributes);
 
         // output element name and attributes if attributes length larger than 0.
diff --git a/tika-core/src/main/java/org/apache/tika/sax/TextContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/TextContentHandler.java
index a510baf82d..78a9e410f4 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/TextContentHandler.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/TextContentHandler.java
@@ -1,18 +1,16 @@
 /*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
+ * Licensed to the Apache Software Foundation (ASF) under one or more contributor license
+ * agreements. See the NOTICE file distributed with this work for additional information regarding
+ * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License. You may obtain a
+ * copy of the License at
  *
- *     http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
  *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
  */
 package org.apache.tika.sax;
 
@@ -22,15 +20,13 @@
 import org.xml.sax.helpers.DefaultHandler;
 
 /**
- * Content handler decorator that only passes the
- * {@link #characters(char[], int, int)} and
- * (@link {@link #ignorableWhitespace(char[], int, int)}
- * (plus {@link #startDocument()} and {@link #endDocument()} events to
- * the decorated content handler.
+ * Content handler decorator that only passes the {@link #characters(char[], int, int)} and (@link
+ * {@link #ignorableWhitespace(char[], int, int)} (plus {@link #startDocument()} and
+ * {@link #endDocument()} events to the decorated content handler.
  */
 public class TextContentHandler extends DefaultHandler {
 
-    private static final char[] SPACE = new char[]{' '};
+    private static final char[] SPACE = new char[] {' '};
 
     private final ContentHandler delegate;
     private final boolean addSpaceBetweenElements;
@@ -61,7 +57,7 @@ public void ignorableWhitespace(char[] ch, int start, int length) throws SAXExce
 
     @Override
     public void startElement(String uri, String localName, String qName, Attributes attributes)
-            throws SAXException {
+                    throws SAXException {
         if (addSpaceBetweenElements) {
             delegate.characters(SPACE, 0, SPACE.length);
         }
diff --git a/tika-core/src/main/java/org/apache/tika/sax/ToHTMLContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/ToHTMLContentHandler.java
index 268edb12cf..8a9b6fd8ab 100755
--- a/tika-core/src/main/java/org/apache/tika/sax/ToHTMLContentHandler.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/ToHTMLContentHandler.java
@@ -1,18 +1,16 @@
 /*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
+ * Licensed to the Apache Software Foundation (ASF) under one or more contributor license
+ * agreements. See the NOTICE file distributed with this work for additional information regarding
+ * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License. You may obtain a
+ * copy of the License at
  *
- *     http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
  *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
  */
 package org.apache.tika.sax;
 
@@ -21,24 +19,22 @@
 import java.util.Arrays;
 import java.util.HashSet;
 import java.util.Set;
-
 import org.xml.sax.SAXException;
 
 /**
- * SAX event handler that serializes the HTML document to a character stream.
- * The incoming SAX events are expected to be well-formed (properly nested,
- * etc.) and valid HTML.
+ * SAX event handler that serializes the HTML document to a character stream. The incoming SAX
+ * events are expected to be well-formed (properly nested, etc.) and valid HTML.
  *
  * @since Apache Tika 0.10
  */
 public class ToHTMLContentHandler extends ToXMLContentHandler {
 
-    private static final Set EMPTY_ELEMENTS = new HashSet<>(
-            Arrays.asList("area", "base", "basefont", "br", "col", "frame", "hr", "img", "input",
-                    "isindex", "link", "meta", "param"));
+    private static final Set EMPTY_ELEMENTS =
+                    new HashSet<>(Arrays.asList("area", "base", "basefont", "br", "col", "frame",
+                                    "hr", "img", "input", "isindex", "link", "meta", "param"));
 
     public ToHTMLContentHandler(OutputStream stream, String encoding)
-            throws UnsupportedEncodingException {
+                    throws UnsupportedEncodingException {
         super(stream, encoding);
     }
 
@@ -47,8 +43,7 @@ public ToHTMLContentHandler() {
     }
 
     @Override
-    public void startDocument() throws SAXException {
-    }
+    public void startDocument() throws SAXException {}
 
     @Override
     public void endElement(String uri, String localName, String qName) throws SAXException {
diff --git a/tika-core/src/main/java/org/apache/tika/sax/ToTextContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/ToTextContentHandler.java
index 868a3bc369..5e7635a8cf 100755
--- a/tika-core/src/main/java/org/apache/tika/sax/ToTextContentHandler.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/ToTextContentHandler.java
@@ -1,18 +1,16 @@
 /*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
+ * Licensed to the Apache Software Foundation (ASF) under one or more contributor license
+ * agreements. See the NOTICE file distributed with this work for additional information regarding
+ * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License. You may obtain a
+ * copy of the License at
  *
- *     http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
  *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
  */
 package org.apache.tika.sax;
 
@@ -23,18 +21,15 @@
 import java.io.UnsupportedEncodingException;
 import java.io.Writer;
 import java.util.Locale;
-
 import org.xml.sax.Attributes;
 import org.xml.sax.SAXException;
 import org.xml.sax.helpers.DefaultHandler;
 
 /**
- * SAX event handler that writes all character content out to a character
- * stream. No escaping or other transformations are made on the character
- * content.
+ * SAX event handler that writes all character content out to a character stream. No escaping or
+ * other transformations are made on the character content.
  * 

- * As of Tika 1.20, this handler ignores content within <script> and - * <style> tags. + * As of Tika 1.20, this handler ignores content within <script> and <style> tags. *

* * @since Apache Tika 0.10 @@ -51,8 +46,7 @@ public class ToTextContentHandler extends DefaultHandler { private int scriptDepth = 0; /** - * Creates a content handler that writes character events to - * the given writer. + * Creates a content handler that writes character events to the given writer. * * @param writer writer */ @@ -61,22 +55,21 @@ public ToTextContentHandler(Writer writer) { } /** - * Creates a content handler that writes character events to - * the given output stream using the given encoding. + * Creates a content handler that writes character events to the given output stream using the + * given encoding. * - * @param stream output stream + * @param stream output stream * @param encoding output encoding * @throws UnsupportedEncodingException if the encoding is unsupported */ public ToTextContentHandler(OutputStream stream, String encoding) - throws UnsupportedEncodingException { + throws UnsupportedEncodingException { this(new OutputStreamWriter(stream, encoding)); } /** - * Creates a content handler that writes character events - * to an internal string buffer. Use the {@link #toString()} - * method to access the collected character content. + * Creates a content handler that writes character events to an internal string buffer. Use the + * {@link #toString()} method to access the collected character content. */ public ToTextContentHandler() { this(new StringWriter()); @@ -101,9 +94,8 @@ public void characters(char[] ch, int start, int length) throws SAXException { /** - * Writes the given ignorable characters to the given character stream. - * The default implementation simply forwards the call to the - * {@link #characters(char[], int, int)} method. + * Writes the given ignorable characters to the given character stream. The default + * implementation simply forwards the call to the {@link #characters(char[], int, int)} method. 
*/ @Override public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException { @@ -111,8 +103,7 @@ public void ignorableWhitespace(char[] ch, int start, int length) throws SAXExce } /** - * Flushes the character stream so that no characters are forgotten - * in internal buffers. + * Flushes the character stream so that no characters are forgotten in internal buffers. * * @throws SAXException if the stream can not be flushed * @see TIKA-179 @@ -128,7 +119,7 @@ public void endDocument() throws SAXException { @Override public void startElement(String uri, String localName, String qName, Attributes atts) - throws SAXException { + throws SAXException { String uc = (qName == null) ? "" : qName.toUpperCase(Locale.ENGLISH); if (uc.equals(STYLE)) { styleDepth++; @@ -150,11 +141,9 @@ public void endElement(String uri, String localName, String qName) throws SAXExc } /** - * Returns the contents of the internal string buffer where - * all the received characters have been collected. Only works - * when this object was constructed using the empty default - * constructor or by passing a {@link StringWriter} to the - * other constructor. + * Returns the contents of the internal string buffer where all the received characters have + * been collected. Only works when this object was constructed using the empty default + * constructor or by passing a {@link StringWriter} to the other constructor. */ @Override public String toString() { diff --git a/tika-core/src/main/java/org/apache/tika/sax/ToXMLContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/ToXMLContentHandler.java index 0a09f14b43..745f22919b 100755 --- a/tika-core/src/main/java/org/apache/tika/sax/ToXMLContentHandler.java +++ b/tika-core/src/main/java/org/apache/tika/sax/ToXMLContentHandler.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.sax; @@ -21,15 +19,13 @@ import java.util.Collections; import java.util.HashMap; import java.util.Map; - import org.xml.sax.Attributes; import org.xml.sax.SAXException; /** - * SAX event handler that serializes the XML document to a character stream. - * The incoming SAX events are expected to be well-formed (properly nested, - * etc.) and to explicitly include namespace declaration attributes and - * corresponding namespace prefixes in element and attribute names. 
+ * SAX event handler that serializes the XML document to a character stream. The incoming SAX events + * are expected to be well-formed (properly nested, etc.) and to explicitly include namespace + * declaration attributes and corresponding namespace prefixes in element and attribute names. * * @since Apache Tika 0.10 */ @@ -42,15 +38,15 @@ public class ToXMLContentHandler extends ToTextContentHandler { private ElementInfo currentElement; /** - * Creates an XML serializer that writes to the given byte stream - * using the given character encoding. + * Creates an XML serializer that writes to the given byte stream using the given character + * encoding. * - * @param stream output stream + * @param stream output stream * @param encoding output encoding * @throws UnsupportedEncodingException if the encoding is unsupported */ public ToXMLContentHandler(OutputStream stream, String encoding) - throws UnsupportedEncodingException { + throws UnsupportedEncodingException { super(stream, encoding); this.encoding = encoding; } @@ -93,7 +89,7 @@ public void startPrefixMapping(String prefix, String uri) throws SAXException { @Override public void startElement(String uri, String localName, String qName, Attributes atts) - throws SAXException { + throws SAXException { lazyCloseStartElement(); currentElement = new ElementInfo(currentElement, namespaces); @@ -168,7 +164,7 @@ private void lazyCloseStartElement() throws SAXException { * @throws SAXException if the character could not be written */ protected void write(char ch) throws SAXException { - super.characters(new char[]{ch}, 0, 1); + super.characters(new char[] {ch}, 0, 1); } /** @@ -184,16 +180,15 @@ protected void write(String string) throws SAXException { /** * Writes the given characters as-is followed by the given entity. 
* - * @param ch character array - * @param from start position in the array - * @param to end position in the array + * @param ch character array + * @param from start position in the array + * @param to end position in the array * @param entity entity code - * @return next position in the array, - * after the characters plus one entity + * @return next position in the array, after the characters plus one entity * @throws SAXException if the characters could not be written */ private int writeCharsAndEntity(char[] ch, int from, int to, String entity) - throws SAXException { + throws SAXException { super.characters(ch, from, to - from); write('&'); write(entity); @@ -204,11 +199,11 @@ private int writeCharsAndEntity(char[] ch, int from, int to, String entity) /** * Writes the given characters with XML meta characters escaped. * - * @param ch character array - * @param from start position in the array - * @param to end position in the array - * @param attribute whether the characters should be escaped as - * an attribute value or normal character content + * @param ch character array + * @param from start position in the array + * @param to end position in the array + * @param attribute whether the characters should be escaped as an attribute value or normal + * character content * @throws SAXException if the characters could not be written */ private void writeEscaped(char[] ch, int from, int to, boolean attribute) throws SAXException { diff --git a/tika-core/src/main/java/org/apache/tika/sax/WriteLimiter.java b/tika-core/src/main/java/org/apache/tika/sax/WriteLimiter.java index d82895a1ba..5455bf6b2d 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/WriteLimiter.java +++ b/tika-core/src/main/java/org/apache/tika/sax/WriteLimiter.java @@ -1,22 +1,21 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.sax; public interface WriteLimiter { int getWriteLimit(); + boolean isThrowOnWriteLimitReached(); } diff --git a/tika-core/src/main/java/org/apache/tika/sax/WriteOutContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/WriteOutContentHandler.java index 22d69f0a4d..65155c2b97 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/WriteOutContentHandler.java +++ b/tika-core/src/main/java/org/apache/tika/sax/WriteOutContentHandler.java @@ -1,41 +1,36 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. 
See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.sax; import java.io.StringWriter; import java.io.Writer; - -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.exception.WriteLimitReachedException; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.ParseRecord; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; /** - * SAX event handler that writes content up to an optional write - * limit out to a character stream or other decorated handler. + * SAX event handler that writes content up to an optional write limit out to a character stream or + * other decorated handler. */ public class WriteOutContentHandler extends ContentHandlerDecorator { /** - * The maximum number of characters to write to the character stream. - * Set to -1 for no limit. + * The maximum number of characters to write to the character stream. Set to -1 for no limit. */ private final int writeLimit; @@ -51,10 +46,10 @@ public class WriteOutContentHandler extends ContentHandlerDecorator { private boolean writeLimitReached; /** - * Creates a content handler that writes content up to the given - * write limit to the given content handler. + * Creates a content handler that writes content up to the given write limit to the given + * content handler. * - * @param handler content handler to be decorated + * @param handler content handler to be decorated * @param writeLimit write limit * @since Apache Tika 0.10 */ @@ -64,10 +59,10 @@ public WriteOutContentHandler(ContentHandler handler, int writeLimit) { } /** - * Creates a content handler that writes content up to the given - * write limit to the given character stream. + * Creates a content handler that writes content up to the given write limit to the given + * character stream. 
* - * @param writer character stream + * @param writer character stream * @param writeLimit write limit * @since Apache Tika 0.10 */ @@ -76,8 +71,7 @@ public WriteOutContentHandler(Writer writer, int writeLimit) { } /** - * Creates a content handler that writes character events to - * the given writer. + * Creates a content handler that writes character events to the given writer. * * @param writer writer */ @@ -86,17 +80,16 @@ public WriteOutContentHandler(Writer writer) { } /** - * Creates a content handler that writes character events - * to an internal string buffer. Use the {@link #toString()} - * method to access the collected character content. + * Creates a content handler that writes character events to an internal string buffer. Use the + * {@link #toString()} method to access the collected character content. *

- * The internal string buffer is bounded at the given number of characters. - * If this write limit is reached, then a {@link SAXException} is thrown. - * The {@link WriteLimitReachedException#isWriteLimitReached(Throwable)} method can be used to + * The internal string buffer is bounded at the given number of characters. If this write limit + * is reached, then a {@link SAXException} is thrown. The + * {@link WriteLimitReachedException#isWriteLimitReached(Throwable)} method can be used to * detect this case. * - * @param writeLimit maximum number of characters to include in the string, - * or -1 to disable the write limit + * @param writeLimit maximum number of characters to include in the string, or -1 to disable the + * write limit * @since Apache Tika 0.7 */ public WriteOutContentHandler(int writeLimit) { @@ -104,12 +97,11 @@ public WriteOutContentHandler(int writeLimit) { } /** - * Creates a content handler that writes character events - * to an internal string buffer. Use the {@link #toString()} - * method to access the collected character content. + * Creates a content handler that writes character events to an internal string buffer. Use the + * {@link #toString()} method to access the collected character content. *

- * The internal string buffer is bounded at 100k characters. If this - * write limit is reached, then a {@link SAXException} is thrown. The + * The internal string buffer is bounded at 100k characters. If this write limit is reached, + * then a {@link SAXException} is thrown. The * {@link WriteLimitReachedException#isWriteLimitReached(Throwable)} method can be used to * detect this case. */ @@ -119,14 +111,14 @@ public WriteOutContentHandler() { /** * The default is to throw a {@link WriteLimitReachedException} + * * @param handler * @param writeLimit * @param throwOnWriteLimitReached * @param parseContext */ - public WriteOutContentHandler(ContentHandler handler, - int writeLimit, boolean throwOnWriteLimitReached, - ParseContext parseContext) { + public WriteOutContentHandler(ContentHandler handler, int writeLimit, + boolean throwOnWriteLimitReached, ParseContext parseContext) { super(handler); this.writeLimit = writeLimit; this.throwOnWriteLimitReached = throwOnWriteLimitReached; diff --git a/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java index 6ba4232205..e4d2e8f342 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java +++ b/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. 
The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.sax; @@ -20,18 +18,16 @@ import java.util.Collections; import java.util.HashSet; import java.util.Set; - +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; import org.xml.sax.Attributes; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; import org.xml.sax.helpers.AttributesImpl; -import org.apache.tika.metadata.Metadata; -import org.apache.tika.metadata.TikaCoreProperties; - /** - * Content handler decorator that simplifies the task of producing XHTML - * events for Tika content parsers. + * Content handler decorator that simplifies the task of producing XHTML events for Tika content + * parsers. */ public class XHTMLContentHandler extends SafeContentHandler { @@ -42,37 +38,36 @@ public class XHTMLContentHandler extends SafeContentHandler { /** * The elements that get appended with the {@link #NL} character. 
*/ - public static final Set ENDLINE = - unmodifiableSet("p", "h1", "h2", "h3", "h4", "h5", "h6", "div", "ul", "ol", "dl", "pre", - "hr", "blockquote", "address", "fieldset", "table", "form", "noscript", "li", - "dt", "dd", "noframes", "br", "tr", "select", "option", "link", "script"); + public static final Set ENDLINE = unmodifiableSet("p", "h1", "h2", "h3", "h4", "h5", + "h6", "div", "ul", "ol", "dl", "pre", "hr", "blockquote", "address", "fieldset", + "table", "form", "noscript", "li", "dt", "dd", "noframes", "br", "tr", "select", + "option", "link", "script"); /** * The newline character that gets inserted after block elements. */ - private static final char[] NL = new char[]{'\n'}; + private static final char[] NL = new char[] {'\n'}; /** * The tab character gets inserted before table cells and list items. */ - private static final char[] TAB = new char[]{'\t'}; + private static final char[] TAB = new char[] {'\t'}; /** * The elements that are in the section. */ private static final Set HEAD = - unmodifiableSet("title", "link", "base", "meta", "script"); + unmodifiableSet("title", "link", "base", "meta", "script"); /** - * The elements that are automatically emitted by lazyStartHead, so - * skip them if they get sent to startElement/endElement by mistake. + * The elements that are automatically emitted by lazyStartHead, so skip them if they get sent + * to startElement/endElement by mistake. */ private static final Set AUTO = unmodifiableSet("head", "frameset"); /** * The elements that get prepended with the {@link #TAB} character. */ private static final Set INDENT = - unmodifiableSet("li", "dd", "dt", "td", "th", "frame"); + unmodifiableSet("li", "dd", "dt", "td", "th", "frame"); private static final Attributes EMPTY_ATTRIBUTES = new AttributesImpl(); /** - * Metadata associated with the document. Used to fill in the - * <head/> section. + * Metadata associated with the document. Used to fill in the <head/> section. 
*/ private final Metadata metadata; /** @@ -85,6 +80,7 @@ public class XHTMLContentHandler extends SafeContentHandler { private boolean headStarted = false; private boolean headEnded = false; private boolean useFrameset = false; + public XHTMLContentHandler(ContentHandler handler, Metadata metadata) { super(handler); this.metadata = metadata; @@ -95,10 +91,8 @@ private static Set unmodifiableSet(String... elements) { } /** - * Starts an XHTML document by setting up the namespace mappings - * when called for the first time. - * The standard XHTML prefix is generated lazily when the first - * element is started. + * Starts an XHTML document by setting up the namespace mappings when called for the first time. + * The standard XHTML prefix is generated lazily when the first element is started. */ @Override public void startDocument() throws SAXException { @@ -111,6 +105,7 @@ public void startDocument() throws SAXException { /** * Generates the following XHTML prefix when called for the first time: + * *

      * <html>
      *   <head>
@@ -139,6 +134,7 @@ private void lazyStartHead() throws SAXException {
 
     /**
      * Generates the following XHTML prefix when called for the first time:
+     * 
      * 
      * <html>
      *   <head>
@@ -199,8 +195,8 @@ private void lazyEndHead(boolean isFrameset) throws SAXException {
     }
 
     /**
-     * Ends the XHTML document by writing the following footer and
-     * clearing the namespace mappings:
+     * Ends the XHTML document by writing the following footer and clearing the namespace mappings:
+     * 
      * 
      *   </body>
      * </html>
@@ -223,12 +219,12 @@ public void endDocument() throws SAXException {
     }
 
     /**
-     * Starts the given element. Table cells and list items are automatically
-     * indented by emitting a tab character as ignorable whitespace.
+     * Starts the given element. Table cells and list items are automatically indented by emitting a
+     * tab character as ignorable whitespace.
      */
     @Override
     public void startElement(String uri, String local, String name, Attributes attributes)
-            throws SAXException {
+                    throws SAXException {
 
         if (name.equals("frameset")) {
             lazyEndHead(true);
@@ -248,8 +244,7 @@ public void startElement(String uri, String local, String name, Attributes attri
     }
 
     /**
-     * Ends the given element. Block elements are automatically followed
-     * by a newline character.
+     * Ends the given element. Block elements are automatically followed by a newline character.
      */
     @Override
     public void endElement(String uri, String local, String name) throws SAXException {
@@ -270,7 +265,7 @@ public void characters(char[] ch, int start, int length) throws SAXException {
         super.characters(ch, start, length);
     }
 
-    //------------------------------------------< public convenience methods >
+    // ------------------------------------------< public convenience methods >
 
     public void startElement(String name) throws SAXException {
         startElement(XHTML, name, name, EMPTY_ATTRIBUTES);
@@ -301,10 +296,10 @@ public void newline() throws SAXException {
     }
 
     /**
-     * Emits an XHTML element with the given text content. If the given
-     * text value is null or empty, then the element is not written.
+     * Emits an XHTML element with the given text content. If the given text value is null or empty,
+     * then the element is not written.
      *
-     * @param name  XHTML element name
+     * @param name XHTML element name
      * @param value element value, possibly null
      * @throws SAXException if the content element could not be written
      */
@@ -321,7 +316,7 @@ protected boolean isInvalid(int ch) {
         if (super.isInvalid(ch)) {
             return true;
         }
-        // These control chars are  invalid in XHTML.
+        // These control chars are invalid in XHTML.
         return 0x7F <= ch && ch <= 0x9F;
     }
 
diff --git a/tika-core/src/main/java/org/apache/tika/sax/XMPContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/XMPContentHandler.java
index 953ad6a446..56e74fd29d 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/XMPContentHandler.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/XMPContentHandler.java
@@ -1,29 +1,26 @@
 /*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
+ * Licensed to the Apache Software Foundation (ASF) under one or more contributor license
+ * agreements. See the NOTICE file distributed with this work for additional information regarding
+ * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License. You may obtain a
+ * copy of the License at
  *
- *     http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
  *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
  */
 package org.apache.tika.sax;
 
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
 import org.xml.sax.Attributes;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 import org.xml.sax.helpers.AttributesImpl;
 
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.Property;
-
 /**
  * Content handler decorator that simplifies the task of producing XMP output.
  *
@@ -49,11 +46,12 @@ public XMPContentHandler(ContentHandler handler) {
         super(handler);
     }
 
-    //------------------------------------------< public convenience methods >
+    // ------------------------------------------< public convenience methods >
 
     /**
-     * Starts an XMP document by setting up the namespace mappings and
-     * writing out the following header:
+     * Starts an XMP document by setting up the namespace mappings and writing out the following
+     * header:
+     * 
      * 
      * <rdf:RDF>
      * 
@@ -69,8 +67,8 @@ public void startDocument() throws SAXException { } /** - * Ends the XMP document by writing the following footer and - * clearing the namespace mappings: + * Ends the XMP document by writing the following footer and clearing the namespace mappings: + * *
      * </rdf:RDF>
      * 
diff --git a/tika-core/src/main/java/org/apache/tika/sax/package-info.java b/tika-core/src/main/java/org/apache/tika/sax/package-info.java index 3c0b4ba48d..22aeb8bd7d 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/package-info.java +++ b/tika-core/src/main/java/org/apache/tika/sax/package-info.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ /** diff --git a/tika-core/src/main/java/org/apache/tika/sax/xpath/AttributeMatcher.java b/tika-core/src/main/java/org/apache/tika/sax/xpath/AttributeMatcher.java index 7b1693d57a..c8e0dad433 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/xpath/AttributeMatcher.java +++ b/tika-core/src/main/java/org/apache/tika/sax/xpath/AttributeMatcher.java @@ -1,24 +1,22 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.sax.xpath; /** - * Final evaluation state of a .../@* XPath expression. - * Matches all attributes of the current element. + * Final evaluation state of a .../@* XPath expression. Matches all attributes of the + * current element. */ public class AttributeMatcher extends Matcher { diff --git a/tika-core/src/main/java/org/apache/tika/sax/xpath/ChildMatcher.java b/tika-core/src/main/java/org/apache/tika/sax/xpath/ChildMatcher.java index b95983c1b6..e38667eed2 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/xpath/ChildMatcher.java +++ b/tika-core/src/main/java/org/apache/tika/sax/xpath/ChildMatcher.java @@ -1,24 +1,22 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
+ * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.sax.xpath; /** - * Intermediate evaluation state of a .../*... XPath expression. - * Matches nothing, but specifies the evaluation state for all child elements. + * Intermediate evaluation state of a .../*... XPath expression. Matches nothing, but + * specifies the evaluation state for all child elements. */ public class ChildMatcher extends Matcher { diff --git a/tika-core/src/main/java/org/apache/tika/sax/xpath/CompositeMatcher.java b/tika-core/src/main/java/org/apache/tika/sax/xpath/CompositeMatcher.java index b0ef5110a1..bfec17928e 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/xpath/CompositeMatcher.java +++ b/tika-core/src/main/java/org/apache/tika/sax/xpath/CompositeMatcher.java @@ -1,24 +1,22 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.sax.xpath; /** - * Composite XPath evaluation state. Used when XPath evaluation results - * in two or more branches of independent evaluation states. + * Composite XPath evaluation state. Used when XPath evaluation results in two or more branches of + * independent evaluation states. */ public class CompositeMatcher extends Matcher { diff --git a/tika-core/src/main/java/org/apache/tika/sax/xpath/ElementMatcher.java b/tika-core/src/main/java/org/apache/tika/sax/xpath/ElementMatcher.java index 164e08aa29..51cc05a3fd 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/xpath/ElementMatcher.java +++ b/tika-core/src/main/java/org/apache/tika/sax/xpath/ElementMatcher.java @@ -1,24 +1,22 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.sax.xpath; /** - * Final evaluation state of an XPath expression that targets an element. - * Matches the current element. + * Final evaluation state of an XPath expression that targets an element. Matches the current + * element. */ public class ElementMatcher extends Matcher { diff --git a/tika-core/src/main/java/org/apache/tika/sax/xpath/Matcher.java b/tika-core/src/main/java/org/apache/tika/sax/xpath/Matcher.java index ab9d21c385..36ec028009 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/xpath/Matcher.java +++ b/tika-core/src/main/java/org/apache/tika/sax/xpath/Matcher.java @@ -1,40 +1,36 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.sax.xpath; /** - * XPath element matcher. A matcher instance encapsulates a specific - * state in XPath evaluation. + * XPath element matcher. A matcher instance encapsulates a specific state in XPath evaluation. */ public class Matcher { /** - * State of a failed XPath evaluation, where nothing is matched. - * This matcher instance is used as a sentinel object whenever an - * XPath evaluation branch fails. + * State of a failed XPath evaluation, where nothing is matched. 
This matcher instance is used + * as a sentinel object whenever an XPath evaluation branch fails. */ public static final Matcher FAIL = new Matcher(); /** - * Returns the XPath evaluation state that results from descending - * to a child element with the given name. + * Returns the XPath evaluation state that results from descending to a child element with the + * given name. * * @param namespace element namespace or null - * @param name element name + * @param name element name * @return next XPath evaluation state */ public Matcher descend(String namespace, String name) { @@ -42,8 +38,8 @@ public Matcher descend(String namespace, String name) { } /** - * Returns true if the XPath expression matches - * the element associated with this evaluation state. + * Returns true if the XPath expression matches the element associated with this + * evaluation state. * * @return XPath evaluation state for this element */ @@ -52,11 +48,11 @@ public boolean matchesElement() { } /** - * Returns true if the XPath expression matches the named - * attribute of the element associated with this evaluation state. + * Returns true if the XPath expression matches the named attribute of the element + * associated with this evaluation state. * * @param namespace attribute namespace or null - * @param name attribute name + * @param name attribute name * @return XPath evaluation state for named attribute of this element */ public boolean matchesAttribute(String namespace, String name) { @@ -64,9 +60,8 @@ public boolean matchesAttribute(String namespace, String name) { } /** - * Returns true if the XPath expression matches all text - * nodes whose parent is the element associated with this evaluation - * state. + * Returns true if the XPath expression matches all text nodes whose parent is the + * element associated with this evaluation state. 
* * @return XPath evaluation state for text children of this element */ diff --git a/tika-core/src/main/java/org/apache/tika/sax/xpath/MatchingContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/xpath/MatchingContentHandler.java index 831611c063..2770f4f1ca 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/xpath/MatchingContentHandler.java +++ b/tika-core/src/main/java/org/apache/tika/sax/xpath/MatchingContentHandler.java @@ -1,33 +1,29 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. 
See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.sax.xpath; import java.util.LinkedList; - +import org.apache.tika.sax.ContentHandlerDecorator; import org.xml.sax.Attributes; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; import org.xml.sax.helpers.AttributesImpl; -import org.apache.tika.sax.ContentHandlerDecorator; - /** - * Content handler decorator that only passes the elements, attributes, - * and text nodes that match the given XPath expression. + * Content handler decorator that only passes the elements, attributes, and text nodes that match + * the given XPath expression. */ public class MatchingContentHandler extends ContentHandlerDecorator { @@ -41,7 +37,7 @@ public MatchingContentHandler(ContentHandler delegate, Matcher matcher) { } public void startElement(String uri, String localName, String name, Attributes attributes) - throws SAXException { + throws SAXException { matchers.addFirst(matcher); matcher = matcher.descend(uri, localName); @@ -51,7 +47,7 @@ public void startElement(String uri, String localName, String name, Attributes a String attributeName = attributes.getLocalName(i); if (matcher.matchesAttribute(attributeURI, attributeName)) { matches.addAttribute(attributeURI, attributeName, attributes.getQName(i), - attributes.getType(i), attributes.getValue(i)); + attributes.getType(i), attributes.getValue(i)); } } diff --git a/tika-core/src/main/java/org/apache/tika/sax/xpath/NamedAttributeMatcher.java b/tika-core/src/main/java/org/apache/tika/sax/xpath/NamedAttributeMatcher.java index 46b65a4da0..f233c5e762 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/xpath/NamedAttributeMatcher.java +++ b/tika-core/src/main/java/org/apache/tika/sax/xpath/NamedAttributeMatcher.java @@ -1,26 +1,24 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.sax.xpath; import java.util.Objects; /** - * Final evaluation state of a .../@name XPath expression. - * Matches the named attributes of the current element. + * Final evaluation state of a .../@name XPath expression. Matches the named attributes + * of the current element. 
*/ public class NamedAttributeMatcher extends Matcher { diff --git a/tika-core/src/main/java/org/apache/tika/sax/xpath/NamedElementMatcher.java b/tika-core/src/main/java/org/apache/tika/sax/xpath/NamedElementMatcher.java index e304789c09..f4eea9ca48 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/xpath/NamedElementMatcher.java +++ b/tika-core/src/main/java/org/apache/tika/sax/xpath/NamedElementMatcher.java @@ -1,27 +1,24 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. 
See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.sax.xpath; import java.util.Objects; /** - * Intermediate evaluation state of a .../name... XPath - * expression. Matches nothing, but specifies the evaluation state - * for the child elements with the given name. + * Intermediate evaluation state of a .../name... XPath expression. Matches nothing, + * but specifies the evaluation state for the child elements with the given name. */ public class NamedElementMatcher extends ChildMatcher { diff --git a/tika-core/src/main/java/org/apache/tika/sax/xpath/NodeMatcher.java b/tika-core/src/main/java/org/apache/tika/sax/xpath/NodeMatcher.java index 8c2e45cadd..43d285dbf3 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/xpath/NodeMatcher.java +++ b/tika-core/src/main/java/org/apache/tika/sax/xpath/NodeMatcher.java @@ -1,24 +1,22 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.sax.xpath; /** - * Final evaluation state of a .../node() XPath expression. - * Matches all elements, attributes, and text. + * Final evaluation state of a .../node() XPath expression. Matches all elements, + * attributes, and text. */ public class NodeMatcher extends Matcher { diff --git a/tika-core/src/main/java/org/apache/tika/sax/xpath/SubtreeMatcher.java b/tika-core/src/main/java/org/apache/tika/sax/xpath/SubtreeMatcher.java index 1915dfc8d8..e1e6a868f8 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/xpath/SubtreeMatcher.java +++ b/tika-core/src/main/java/org/apache/tika/sax/xpath/SubtreeMatcher.java @@ -1,24 +1,22 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. 
See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.sax.xpath; /** - * Evaluation state of a ...//... XPath expression. Applies the - * contained evaluation state to the current element and all its descendants. + * Evaluation state of a ...//... XPath expression. Applies the contained evaluation + * state to the current element and all its descendants. */ public class SubtreeMatcher extends Matcher { diff --git a/tika-core/src/main/java/org/apache/tika/sax/xpath/TextMatcher.java b/tika-core/src/main/java/org/apache/tika/sax/xpath/TextMatcher.java index caf82f4883..0b474aa377 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/xpath/TextMatcher.java +++ b/tika-core/src/main/java/org/apache/tika/sax/xpath/TextMatcher.java @@ -1,24 +1,22 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.sax.xpath; /** - * Final evaluation state of a .../text() XPath expression. - * Matches all text children of the current element. + * Final evaluation state of a .../text() XPath expression. Matches all text children + * of the current element. 
*/ public class TextMatcher extends Matcher { diff --git a/tika-core/src/main/java/org/apache/tika/sax/xpath/XPathParser.java b/tika-core/src/main/java/org/apache/tika/sax/xpath/XPathParser.java index c169d5600d..97afea8628 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/xpath/XPathParser.java +++ b/tika-core/src/main/java/org/apache/tika/sax/xpath/XPathParser.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.sax.xpath; @@ -20,29 +18,28 @@ import java.util.Map; /** - * Parser for a very simple XPath subset. Only the following XPath constructs - * (with namespaces) are supported: + * Parser for a very simple XPath subset. Only the following XPath constructs (with namespaces) are + * supported: *
    - *
  • .../node()
  • - *
  • .../text()
  • - *
  • .../@*
  • - *
  • .../@name
  • - *
  • .../*...
  • - *
  • .../name...
  • - *
  • ...//*...
  • - *
  • ...//name...
  • + *
  • .../node()
  • + *
  • .../text()
  • + *
  • .../@*
  • + *
  • .../@name
  • + *
  • .../*...
  • + *
  • .../name...
  • + *
  • ...//*...
  • + *
  • ...//name...
  • *
*

- * In addition the non-abbreviated .../descendant::node() - * construct can be used for cases where the descendant-or-self axis - * used by the ...//node() construct is not appropriate. + * In addition the non-abbreviated .../descendant::node() construct can be used for + * cases where the descendant-or-self axis used by the ...//node() construct is not + * appropriate. */ public class XPathParser { private final Map prefixes = new HashMap<>(); - public XPathParser() { - } + public XPathParser() {} public XPathParser(String prefix, String namespace) { addPrefix(prefix, namespace); @@ -53,9 +50,9 @@ public void addPrefix(String prefix, String namespace) { } /** - * Parses the given simple XPath expression to an evaluation state - * initialized at the document node. Invalid expressions are not flagged - * as errors, they just result in a failing evaluation state. + * Parses the given simple XPath expression to an evaluation state initialized at the document + * node. Invalid expressions are not flagged as errors, they just result in a failing evaluation + * state. 
* * @param xpath simple XPath expression * @return XPath evaluation state @@ -65,10 +62,10 @@ public Matcher parse(String xpath) { return TextMatcher.INSTANCE; } else if (xpath.equals("/node()")) { return NodeMatcher.INSTANCE; - } else if (xpath.equals("/descendant::node()") || - xpath.equals("/descendant:node()")) { // for compatibility + } else if (xpath.equals("/descendant::node()") || xpath.equals("/descendant:node()")) { // for + // compatibility return new CompositeMatcher(TextMatcher.INSTANCE, - new ChildMatcher(new SubtreeMatcher(NodeMatcher.INSTANCE))); + new ChildMatcher(new SubtreeMatcher(NodeMatcher.INSTANCE))); } else if (xpath.equals("/@*")) { return AttributeMatcher.INSTANCE; } else if (xpath.isEmpty()) { @@ -106,7 +103,7 @@ public Matcher parse(String xpath) { } if (prefixes.containsKey(prefix)) { return new NamedElementMatcher(prefixes.get(prefix), name, - parse(xpath.substring(slash))); + parse(xpath.substring(slash))); } else { return Matcher.FAIL; } diff --git a/tika-core/src/main/java/org/apache/tika/sax/xpath/package-info.java b/tika-core/src/main/java/org/apache/tika/sax/xpath/package-info.java index f9c1801bd6..3847eeac86 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/xpath/package-info.java +++ b/tika-core/src/main/java/org/apache/tika/sax/xpath/package-info.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. 
The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ /** diff --git a/tika-core/src/main/java/org/apache/tika/utils/AnnotationUtils.java b/tika-core/src/main/java/org/apache/tika/utils/AnnotationUtils.java index b3b8264b41..923c56cba1 100644 --- a/tika-core/src/main/java/org/apache/tika/utils/AnnotationUtils.java +++ b/tika-core/src/main/java/org/apache/tika/utils/AnnotationUtils.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. 
The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.utils; @@ -25,15 +23,13 @@ import java.util.List; import java.util.Locale; import java.util.Map; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - import org.apache.tika.config.Field; import org.apache.tika.config.Param; import org.apache.tika.config.ParamField; import org.apache.tika.config.TikaConfig; import org.apache.tika.exception.TikaConfigException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * This class contains utilities for dealing with tika annotations @@ -51,17 +47,17 @@ public class AnnotationUtils { /** * Collects all the fields and methods for an annotation * - * @param clazz bean class with annotations + * @param clazz bean class with annotations * @param annotation annotation class * @return list of accessible objects such as fields and methods */ private static List collectInfo(Class clazz, - Class annotation) { + Class annotation) { Class superClazz = clazz; List members = new ArrayList<>(); List annotatedMembers = new ArrayList<>(); - //walk through the 
inheritance chain + // walk through the inheritance chain while (superClazz != null && superClazz != Object.class) { members.addAll(Arrays.asList(superClazz.getDeclaredFields())); members.addAll(Arrays.asList(superClazz.getDeclaredMethods())); @@ -83,13 +79,13 @@ private static List collectInfo(Class clazz, * @throws TikaConfigException when an error occurs while assigning params */ public static void assignFieldParams(Object bean, Map params) - throws TikaConfigException { + throws TikaConfigException { Class beanClass = bean.getClass(); if (!PARAM_INFO.containsKey(beanClass)) { synchronized (TikaConfig.class) { if (!PARAM_INFO.containsKey(beanClass)) { List aObjs = - collectInfo(beanClass, org.apache.tika.config.Field.class); + collectInfo(beanClass, org.apache.tika.config.Field.class); List fields = new ArrayList<>(aObjs.size()); for (AccessibleObject aObj : aObjs) { @@ -109,26 +105,27 @@ public static void assignFieldParams(Object bean, Map params) try { field.assignValue(bean, param.getValue()); } catch (InvocationTargetException e) { - LOG.error("Error assigning value '{}' to '{}'", param.getValue(), param.getName()); + LOG.error("Error assigning value '{}' to '{}'", param.getValue(), + param.getName()); final Throwable cause = e.getCause() == null ? 
e : e.getCause(); throw new TikaConfigException(cause.getMessage(), cause); } catch (IllegalAccessException e) { - LOG.error("Error assigning value '{}' to '{}'", param.getValue(), param.getName()); + LOG.error("Error assigning value '{}' to '{}'", param.getValue(), + param.getName()); throw new TikaConfigException(e.getMessage(), e); } } else { - String msg = String.format(Locale.ROOT, - "Value '%s' of type '%s' can't be" + - " assigned to field '%s' of defined type '%s'", - param.getValue(), - param.getValue().getClass(), field.getName(), field.getType()); + String msg = String.format(Locale.ROOT, "Value '%s' of type '%s' can't be" + + " assigned to field '%s' of defined type '%s'", + param.getValue(), param.getValue().getClass(), field.getName(), + field.getType()); throw new TikaConfigException(msg); } } else if (field.isRequired()) { - //param not supplied but field is declared as required? + // param not supplied but field is declared as required? String msg = String.format(Locale.ROOT, - "Param %s is required for %s," + " but it is not given in config.", - field.getName(), bean.getClass().getName()); + "Param %s is required for %s," + " but it is not given in config.", + field.getName(), bean.getClass().getName()); throw new TikaConfigException(msg); } else { LOG.debug("Param not supplied, field is not mandatory"); diff --git a/tika-core/src/main/java/org/apache/tika/utils/CharsetUtils.java b/tika-core/src/main/java/org/apache/tika/utils/CharsetUtils.java index 5177752100..3ed0930796 100644 --- a/tika-core/src/main/java/org/apache/tika/utils/CharsetUtils.java +++ b/tika-core/src/main/java/org/apache/tika/utils/CharsetUtils.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.utils; @@ -45,11 +43,11 @@ public class CharsetUtils { static { initCommonCharsets("Big5", "EUC-JP", "EUC-KR", "x-EUC-TW", "GB18030", "IBM855", "IBM866", - "ISO-2022-CN", "ISO-2022-JP", "ISO-2022-KR", "ISO-8859-1", "ISO-8859-2", - "ISO-8859-3", "ISO-8859-4", "ISO-8859-5", "ISO-8859-6", "ISO-8859-7", "ISO-8859-8", - "ISO-8859-9", "ISO-8859-11", "ISO-8859-13", "ISO-8859-15", "KOI8-R", - "x-MacCyrillic", "SHIFT_JIS", "UTF-8", "UTF-16BE", "UTF-16LE", "windows-1251", - "windows-1252", "windows-1253", "windows-1255"); + "ISO-2022-CN", "ISO-2022-JP", "ISO-2022-KR", "ISO-8859-1", "ISO-8859-2", + "ISO-8859-3", "ISO-8859-4", "ISO-8859-5", "ISO-8859-6", "ISO-8859-7", + "ISO-8859-8", "ISO-8859-9", "ISO-8859-11", "ISO-8859-13", "ISO-8859-15", + "KOI8-R", "x-MacCyrillic", "SHIFT_JIS", "UTF-8", "UTF-16BE", "UTF-16LE", + "windows-1251", "windows-1252", "windows-1253", "windows-1255"); // Common aliases/typos not included in standard charset definitions COMMON_CHARSETS.put("iso-8851-1", COMMON_CHARSETS.get("iso-8859-1")); @@ -59,10 +57,10 @@ public class CharsetUtils { // See if we can load the icu4j CharsetICU class Class icuCharset = null; try { - icuCharset = - CharsetUtils.class.getClassLoader().loadClass("com.ibm.icu.charset.CharsetICU"); + icuCharset = CharsetUtils.class.getClassLoader() + .loadClass("com.ibm.icu.charset.CharsetICU"); } catch (ClassNotFoundException e) { - //swallow + // swallow } if (icuCharset != null) { try { @@ -73,7 +71,7 @@ public class CharsetUtils { try { isSupportedICU = icuCharset.getMethod("isSupported", String.class); } catch (Throwable t) { - //swallow + // swallow } // TODO: would be nice to somehow log that we // successfully found ICU @@ -120,8 +118,8 @@ public static boolean isSupported(String charsetName) { } /** - * Handle various common charset name errors, and return something - * that will be considered valid (and is normalized) + * Handle various common charset name errors, and return something that will be 
considered valid + * (and is normalized) * * @param charsetName name of charset to process * @return potentially remapped/cleaned up version of charset name @@ -135,10 +133,9 @@ public static String clean(String charsetName) { } /** - * Returns Charset impl, if one exists. This method - * optionally uses ICU4J's CharsetICU.forNameICU, - * if it is found on the classpath, else only uses - * JDK's builtin Charset.forName. + * Returns Charset impl, if one exists. This method optionally uses ICU4J's + * CharsetICU.forNameICU, if it is found on the classpath, else only uses JDK's builtin + * Charset.forName. */ public static Charset forName(String name) { if (name == null) { @@ -186,9 +183,9 @@ public static Charset forName(String name) { if (cs != null) { return cs; } - } catch (IllegalArgumentException | IllegalAccessException | - InvocationTargetException e) { - //ignore + } catch (IllegalArgumentException | IllegalAccessException + | InvocationTargetException e) { + // ignore } } diff --git a/tika-core/src/main/java/org/apache/tika/utils/CompareUtils.java b/tika-core/src/main/java/org/apache/tika/utils/CompareUtils.java index a4da7772a5..48a7b5ea08 100644 --- a/tika-core/src/main/java/org/apache/tika/utils/CompareUtils.java +++ b/tika-core/src/main/java/org/apache/tika/utils/CompareUtils.java @@ -1,28 +1,25 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. 
The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.utils; public class CompareUtils { /** - * Compare two classes by class names. - * If both classes are Tika's or both are not Tika's class, compare by name String. - * Otherwise one of these two class is Tika's class. - * Then the non-Tika's class comes before Tika's class. + * Compare two classes by class names. If both classes are Tika's or both are not Tika's class, + * compare by name String. Otherwise one of these two class is Tika's class. Then the non-Tika's + * class comes before Tika's class. 
* * @param o1 the object 1 to be compared * @param o2 the object 2 to be compared diff --git a/tika-core/src/main/java/org/apache/tika/utils/ConcurrentUtils.java b/tika-core/src/main/java/org/apache/tika/utils/ConcurrentUtils.java index 8720e74a9f..4feda36c63 100644 --- a/tika-core/src/main/java/org/apache/tika/utils/ConcurrentUtils.java +++ b/tika-core/src/main/java/org/apache/tika/utils/ConcurrentUtils.java @@ -1,25 +1,22 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. 
See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.utils; import java.util.concurrent.ExecutorService; import java.util.concurrent.Future; import java.util.concurrent.FutureTask; - import org.apache.tika.parser.ParseContext; /** @@ -30,8 +27,8 @@ public class ConcurrentUtils { /** - * Execute a runnable using an ExecutorService from the ParseContext if possible. - * Otherwise fallback to individual threads. + * Execute a runnable using an ExecutorService from the ParseContext if possible. Otherwise + * fallback to individual threads. * * @param context * @param runnable diff --git a/tika-core/src/main/java/org/apache/tika/utils/DateUtils.java b/tika-core/src/main/java/org/apache/tika/utils/DateUtils.java index a6a68fef6e..ad10c96ce9 100644 --- a/tika-core/src/main/java/org/apache/tika/utils/DateUtils.java +++ b/tika-core/src/main/java/org/apache/tika/utils/DateUtils.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.utils; @@ -34,27 +32,23 @@ public class DateUtils { /** - * The UTC time zone. Not sure if {@link TimeZone#getTimeZone(String)} - * understands "UTC" in all environments, but it'll fall back to GMT - * in such cases, which is in practice equivalent to UTC. + * The UTC time zone. Not sure if {@link TimeZone#getTimeZone(String)} understands "UTC" in all + * environments, but it'll fall back to GMT in such cases, which is in practice equivalent to + * UTC. */ public static final TimeZone UTC = TimeZone.getTimeZone("UTC"); /** - * Custom time zone used to interpret date values without a time - * component in a way that most likely falls within the same day - * regardless of in which time zone it is later interpreted. For - * example, the "2012-02-17" date would map to "2012-02-17T12:00:00Z" - * (instead of the default "2012-02-17T00:00:00Z"), which would still - * map to "2012-02-17" if interpreted in say Pacific time (while the - * default mapping would result in "2012-02-16" for UTC-8). 
+ * Custom time zone used to interpret date values without a time component in a way that most + * likely falls within the same day regardless of in which time zone it is later interpreted. + * For example, the "2012-02-17" date would map to "2012-02-17T12:00:00Z" (instead of the + * default "2012-02-17T00:00:00Z"), which would still map to "2012-02-17" if interpreted in say + * Pacific time (while the default mapping would result in "2012-02-16" for UTC-8). */ public static final TimeZone MIDDAY = TimeZone.getTimeZone("GMT-12:00"); /** - * So we can return Date objects for these, this is the - * list (in preference order) of the various ISO-8601 - * variants that we try when processing a date based - * property. + * So we can return Date objects for these, this is the list (in preference order) of the + * various ISO-8601 variants that we try when processing a date based property. */ private final List iso8601InputFormats = loadDateFormats(); @@ -67,8 +61,8 @@ private static DateFormat createDateFormat(String format, TimeZone timezone) { } /** - * Returns a ISO 8601 representation of the given date in UTC, - * truncated to the seconds unit. This method is thread safe and non-blocking. + * Returns a ISO 8601 representation of the given date in UTC, truncated to the seconds unit. + * This method is thread safe and non-blocking. * * @param date given date * @return ISO 8601 date string in UTC, truncated to the seconds unit @@ -81,8 +75,8 @@ public static String formatDate(Date date) { } /** - * Returns a ISO 8601 representation of the given date in UTC, - * truncated to the seconds unit. This method is thread safe and non-blocking. + * Returns a ISO 8601 representation of the given date in UTC, truncated to the seconds unit. + * This method is thread safe and non-blocking. 
* * @param date given Calendar * @return ISO 8601 date string in UTC, truncated to the seconds unit @@ -91,9 +85,10 @@ public static String formatDate(Date date) { public static String formatDate(Calendar date) { return doFormatDate(date); } + /** - * Returns a ISO 8601 representation of the given date in UTC, - * truncated to the seconds unit. This method is thread safe and non-blocking. + * Returns a ISO 8601 representation of the given date in UTC, truncated to the seconds unit. + * This method is thread safe and non-blocking. * * @param date given date * @return ISO 8601 date string in UTC, truncated to the seconds unit @@ -113,6 +108,7 @@ public static String formatDateUnknownTimezone(Date date) { /** * Returns ISO-8601 formatted time converted to UTC, truncated to the seconds place + * * @param calendar * @return */ @@ -123,17 +119,16 @@ private static String doFormatDate(Calendar calendar) { private List loadDateFormats() { List dateFormats = new ArrayList<>(); // yyyy-mm-ddThh... - dateFormats.add(createDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'", UTC)); // UTC/Zulu - dateFormats.add(createDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", null)); // With timezone - dateFormats.add(createDateFormat("yyyy-MM-dd'T'HH:mm:ss", null)); // Without timezone + dateFormats.add(createDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'", UTC)); // UTC/Zulu + dateFormats.add(createDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", null)); // With timezone + dateFormats.add(createDateFormat("yyyy-MM-dd'T'HH:mm:ss", null)); // Without timezone // yyyy-mm-dd hh... 
- dateFormats.add(createDateFormat("yyyy-MM-dd' 'HH:mm:ss'Z'", UTC)); // UTC/Zulu - dateFormats.add(createDateFormat("yyyy-MM-dd' 'HH:mm:ssZ", null)); // With timezone - dateFormats.add(createDateFormat("yyyy-MM-dd' 'HH:mm:ss", null)); // Without timezone + dateFormats.add(createDateFormat("yyyy-MM-dd' 'HH:mm:ss'Z'", UTC)); // UTC/Zulu + dateFormats.add(createDateFormat("yyyy-MM-dd' 'HH:mm:ssZ", null)); // With timezone + dateFormats.add(createDateFormat("yyyy-MM-dd' 'HH:mm:ss", null)); // Without timezone // Date without time, set to Midday UTC - dateFormats.add(createDateFormat("yyyy-MM-dd", MIDDAY)); // Normal date format - dateFormats.add(createDateFormat("yyyy:MM:dd", - MIDDAY)); // Image (IPTC/EXIF) format + dateFormats.add(createDateFormat("yyyy-MM-dd", MIDDAY)); // Normal date format + dateFormats.add(createDateFormat("yyyy:MM:dd", MIDDAY)); // Image (IPTC/EXIF) format return dateFormats; } @@ -141,8 +136,7 @@ private List loadDateFormats() { /** * Tries to parse the date string; returns null if no parse was possible. *

- * This is not thread safe! Wrap in synchronized or create new {@link DateUtils} - * for each class. + * This is not thread safe! Wrap in synchronized or create new {@link DateUtils} for each class. * * @param dateString * @return @@ -151,8 +145,8 @@ public Date tryToParse(String dateString) { // Java doesn't like timezones in the form ss+hh:mm // It only likes the hhmm form, without the colon int n = dateString.length(); - if (dateString.charAt(n - 3) == ':' && - (dateString.charAt(n - 6) == '+' || dateString.charAt(n - 6) == '-')) { + if (dateString.charAt(n - 3) == ':' + && (dateString.charAt(n - 6) == '+' || dateString.charAt(n - 6) == '-')) { dateString = dateString.substring(0, n - 3) + dateString.substring(n - 2); } @@ -160,7 +154,7 @@ public Date tryToParse(String dateString) { try { return df.parse(dateString); } catch (java.text.ParseException e) { - //swallow + // swallow } } return null; diff --git a/tika-core/src/main/java/org/apache/tika/utils/DurationFormatUtils.java b/tika-core/src/main/java/org/apache/tika/utils/DurationFormatUtils.java index 8dafe87cd9..12c58ffcf5 100644 --- a/tika-core/src/main/java/org/apache/tika/utils/DurationFormatUtils.java +++ b/tika-core/src/main/java/org/apache/tika/utils/DurationFormatUtils.java @@ -1,24 +1,22 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. 
The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.utils; /** - * Functionality and naming conventions (roughly) copied from org.apache.commons.lang3 - * so that we didn't have to add another dependency. + * Functionality and naming conventions (roughly) copied from org.apache.commons.lang3 so that we + * didn't have to add another dependency. 
*/ public class DurationFormatUtils { @@ -30,7 +28,7 @@ public static String formatMillis(long duration) { int hrs = (int) ((duration / (1000 * 60 * 60)) % 24); int days = (int) ((duration / (1000 * 60 * 60 * 24)) % 7); - //sb.append(millis + " milliseconds"); + // sb.append(millis + " milliseconds"); addUnitString(sb, days, "day"); addUnitString(sb, hrs, "hour"); addUnitString(sb, mins, "minute"); @@ -43,7 +41,7 @@ public static String formatMillis(long duration) { } private static void addUnitString(StringBuilder sb, long unit, String unitString) { - //only add unit if >= 1 + // only add unit if >= 1 if (unit == 1) { addComma(sb); sb.append("1 "); diff --git a/tika-core/src/main/java/org/apache/tika/utils/ExceptionUtils.java b/tika-core/src/main/java/org/apache/tika/utils/ExceptionUtils.java index 8f071e2569..d77c447f6e 100644 --- a/tika-core/src/main/java/org/apache/tika/utils/ExceptionUtils.java +++ b/tika-core/src/main/java/org/apache/tika/utils/ExceptionUtils.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.utils; @@ -23,7 +21,6 @@ import java.io.Writer; import java.util.regex.Matcher; import java.util.regex.Pattern; - import org.apache.tika.exception.TikaException; public class ExceptionUtils { @@ -35,8 +32,8 @@ public class ExceptionUtils { *

* This will unwrap a TikaException and return the cause if not null *

- * NOTE: If your stacktraces are truncated, make sure to start your jvm - * with: -XX:-OmitStackTraceInFastThrow + * NOTE: If your stacktraces are truncated, make sure to start your jvm with: + * -XX:-OmitStackTraceInFastThrow * * @param t throwable * @return @@ -66,17 +63,16 @@ public static String getStackTrace(Throwable t) { writer.close(); result.close(); } catch (IOException e) { - //swallow + // swallow } return result.toString(); } /** - * Utility method to trim the message from a stack trace - * string. + * Utility method to trim the message from a stack trace string. *

- * E.g. java.lang.IllegalStateException: Potential loop detected - * will be trimmed to java.lang.IllegalStateException + * E.g. java.lang.IllegalStateException: Potential loop detected will be trimmed + * to java.lang.IllegalStateException * * @param trace string view of stack trace * @return trimmed stack trace diff --git a/tika-core/src/main/java/org/apache/tika/utils/FileProcessResult.java b/tika-core/src/main/java/org/apache/tika/utils/FileProcessResult.java index f08ca472c8..e77ad77386 100644 --- a/tika-core/src/main/java/org/apache/tika/utils/FileProcessResult.java +++ b/tika-core/src/main/java/org/apache/tika/utils/FileProcessResult.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
+ * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.utils; @@ -102,16 +100,10 @@ public void setStdoutTruncated(boolean stdoutTruncated) { @Override public String toString() { - return "FileProcessResult{" + - "stderr='" + stderr + '\'' + - ", stdout='" + stdout + '\'' + - ", exitValue=" + exitValue + - ", processTimeMillis=" + processTimeMillis + - ", isTimeout=" + isTimeout + - ", stdoutLength=" + stdoutLength + - ", stderrLength=" + stderrLength + - ", stderrTruncated=" + stderrTruncated + - ", stdoutTruncated=" + stdoutTruncated + - '}'; + return "FileProcessResult{" + "stderr='" + stderr + '\'' + ", stdout='" + stdout + '\'' + + ", exitValue=" + exitValue + ", processTimeMillis=" + processTimeMillis + + ", isTimeout=" + isTimeout + ", stdoutLength=" + stdoutLength + + ", stderrLength=" + stderrLength + ", stderrTruncated=" + stderrTruncated + + ", stdoutTruncated=" + stdoutTruncated + '}'; } } diff --git a/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java b/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java index 837f762955..e8a40fe736 100644 --- a/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java +++ b/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.utils; @@ -21,7 +19,6 @@ import java.io.IOException; import java.io.InputStream; import java.util.Arrays; - import org.apache.tika.io.TemporaryResources; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; @@ -36,7 +33,7 @@ public class ParserUtils { public final static Property EMBEDDED_PARSER = Property.internalText( - TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX + "embedded_parser"); + TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX + "embedded_parser"); /** @@ -59,8 +56,8 @@ public static Metadata cloneMetadata(Metadata m) { } /** - * Identifies the real class name of the {@link Parser}, unwrapping - * any {@link ParserDecorator} decorations on top of it. 
+ * Identifies the real class name of the {@link Parser}, unwrapping any {@link ParserDecorator} + * decorations on top of it. */ public static String getParserClassname(Parser parser) { if (parser instanceof ParserDecorator) { @@ -71,9 +68,8 @@ public static String getParserClassname(Parser parser) { } /** - * Records details of the {@link Parser} used to the {@link Metadata}, - * typically wanted where multiple parsers could be picked between - * or used. + * Records details of the {@link Parser} used to the {@link Metadata}, typically wanted where + * multiple parsers could be picked between or used. */ public static void recordParserDetails(Parser parser, Metadata metadata) { String className = getParserClassname(parser); @@ -81,24 +77,22 @@ public static void recordParserDetails(Parser parser, Metadata metadata) { } /** - * Records details of the {@link Parser} used to the {@link Metadata}, - * typically wanted where multiple parsers could be picked between - * or used. + * Records details of the {@link Parser} used to the {@link Metadata}, typically wanted where + * multiple parsers could be picked between or used. 
*/ public static void recordParserDetails(String parserClassName, Metadata metadata) { String[] parsedBys = metadata.getValues(TikaCoreProperties.TIKA_PARSED_BY); if (parsedBys == null || parsedBys.length == 0) { metadata.add(TikaCoreProperties.TIKA_PARSED_BY, parserClassName); } else if (Arrays.stream(parsedBys).noneMatch(parserClassName::equals)) { - //only add parser once + // only add parser once metadata.add(TikaCoreProperties.TIKA_PARSED_BY, parserClassName); } } /** - * Records details of a {@link Parser}'s failure to the - * {@link Metadata}, so you can check what went wrong even if the - * {@link Exception} wasn't immediately thrown (eg when several different + * Records details of a {@link Parser}'s failure to the {@link Metadata}, so you can check what + * went wrong even if the {@link Exception} wasn't immediately thrown (eg when several different * Parsers are used) */ public static void recordParserFailure(Parser parser, Throwable failure, Metadata metadata) { @@ -108,14 +102,12 @@ public static void recordParserFailure(Parser parser, Throwable failure, Metadat } /** - * Ensures that the Stream will be able to be re-read, by buffering to - * a temporary file if required. - * Streams that are automatically OK include {@link TikaInputStream}s - * created from Files or InputStreamFactories, and {@link RereadableInputStream}. + * Ensures that the Stream will be able to be re-read, by buffering to a temporary file if + * required. Streams that are automatically OK include {@link TikaInputStream}s created from + * Files or InputStreamFactories, and {@link RereadableInputStream}. 
*/ public static InputStream ensureStreamReReadable(InputStream stream, TemporaryResources tmp, - Metadata metadata) - throws IOException { + Metadata metadata) throws IOException { // If it's re-readable, we're done if (stream instanceof RereadableInputStream) { return stream; @@ -141,11 +133,11 @@ public static InputStream ensureStreamReReadable(InputStream stream, TemporaryRe /** * Resets the given {@link TikaInputStream} (checked by - * {@link #ensureStreamReReadable(InputStream, TemporaryResources, Metadata)}) - * so that it can be re-read again. + * {@link #ensureStreamReReadable(InputStream, TemporaryResources, Metadata)}) so that it can be + * re-read again. */ public static InputStream streamResetForReRead(InputStream stream, TemporaryResources tmp) - throws IOException { + throws IOException { // If re-readable, rewind to start if (stream instanceof RereadableInputStream) { ((RereadableInputStream) stream).rewind(); diff --git a/tika-core/src/main/java/org/apache/tika/utils/ProcessUtils.java b/tika-core/src/main/java/org/apache/tika/utils/ProcessUtils.java index 5ee5865fe1..06eb7000e2 100644 --- a/tika-core/src/main/java/org/apache/tika/utils/ProcessUtils.java +++ b/tika-core/src/main/java/org/apache/tika/utils/ProcessUtils.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. 
The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.utils; @@ -46,9 +44,8 @@ private static Process release(String id) { } /** - * This should correctly put double-quotes around an argument if - * ProcessBuilder doesn't seem to work (as it doesn't - * on paths with spaces on Windows) + * This should correctly put double-quotes around an argument if ProcessBuilder doesn't seem to + * work (as it doesn't on paths with spaces on Windows) * * @param arg * @return @@ -57,18 +54,18 @@ public static String escapeCommandLine(String arg) { if (arg == null) { return arg; } - //need to test for " " on windows, can't just add double quotes - //across platforms. - if (arg.contains(" ") && SystemUtils.IS_OS_WINDOWS && - (!arg.startsWith("\"") && !arg.endsWith("\""))) { + // need to test for " " on windows, can't just add double quotes + // across platforms. 
+ if (arg.contains(" ") && SystemUtils.IS_OS_WINDOWS + && (!arg.startsWith("\"") && !arg.endsWith("\""))) { arg = "\"" + arg + "\""; } return arg; } public static String unescapeCommandLine(String arg) { - if (arg.contains(" ") && SystemUtils.IS_OS_WINDOWS && - (arg.startsWith("\"") && arg.endsWith("\""))) { + if (arg.contains(" ") && SystemUtils.IS_OS_WINDOWS + && (arg.startsWith("\"") && arg.endsWith("\""))) { arg = arg.substring(1, arg.length() - 1); } return arg; @@ -84,10 +81,8 @@ public static String unescapeCommandLine(String arg) { * @return * @throws IOException */ - public static FileProcessResult execute(ProcessBuilder pb, - long timeoutMillis, - int maxStdoutBuffer, int maxStdErrBuffer) - throws IOException { + public static FileProcessResult execute(ProcessBuilder pb, long timeoutMillis, + int maxStdoutBuffer, int maxStdErrBuffer) throws IOException { Process p = null; String id = null; try { @@ -121,7 +116,7 @@ public static FileProcessResult execute(ProcessBuilder pb, try { exitValue = p.exitValue(); } catch (IllegalThreadStateException e) { - //not finished! + // not finished! } } } @@ -135,7 +130,7 @@ public static FileProcessResult execute(ProcessBuilder pb, result.processTimeMillis = elapsed; result.stderrLength = errGobbler.getStreamLength(); result.stdoutLength = outGobbler.getStreamLength(); - result.isTimeout = ! 
complete; + result.isTimeout = !complete; result.exitValue = exitValue; result.stdout = StringUtils.joinWith("\n", outGobbler.getLines()); result.stderr = StringUtils.joinWith("\n", errGobbler.getLines()); @@ -162,9 +157,8 @@ public static FileProcessResult execute(ProcessBuilder pb, * @return * @throws IOException */ - public static FileProcessResult execute(ProcessBuilder pb, - long timeoutMillis, - Path stdoutRedirect, int maxStdErrBuffer) throws IOException { + public static FileProcessResult execute(ProcessBuilder pb, long timeoutMillis, + Path stdoutRedirect, int maxStdErrBuffer) throws IOException { if (!Files.isDirectory(stdoutRedirect.getParent())) { Files.createDirectories(stdoutRedirect.getParent()); diff --git a/tika-core/src/main/java/org/apache/tika/utils/RegexUtils.java b/tika-core/src/main/java/org/apache/tika/utils/RegexUtils.java index 9c0736381c..9a1daddd11 100644 --- a/tika-core/src/main/java/org/apache/tika/utils/RegexUtils.java +++ b/tika-core/src/main/java/org/apache/tika/utils/RegexUtils.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.utils; @@ -23,24 +21,23 @@ import java.util.regex.Pattern; /** - * Inspired from Nutch code class OutlinkExtractor. Apply regex to extract - * content + * Inspired from Nutch code class OutlinkExtractor. Apply regex to extract content */ public class RegexUtils { /** * Regex pattern to get URLs within a plain text. * - * @see http://www.truerwords.net/articles/ut/urlactivation.html - * + * @see http://www.truerwords.net/articles/ut/urlactivation.html + * */ - private static final String LINKS_REGEX = "([A-Za-z][A-Za-z0-9+.-]{1,120}:" + - "[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2}){1,333}" + - "(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]{0,1000}))?)"; + private static final String LINKS_REGEX = "([A-Za-z][A-Za-z0-9+.-]{1,120}:" + + "[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2}){1,333}" + + "(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]{0,1000}))?)"; private static final Pattern LINKS_PATTERN = - Pattern.compile(LINKS_REGEX, Pattern.CASE_INSENSITIVE + Pattern.MULTILINE); + Pattern.compile(LINKS_REGEX, Pattern.CASE_INSENSITIVE + Pattern.MULTILINE); /** * Extract urls from plain text. 
diff --git a/tika-core/src/main/java/org/apache/tika/utils/RereadableInputStream.java b/tika-core/src/main/java/org/apache/tika/utils/RereadableInputStream.java index e2fdba1ab9..19c96634a6 100644 --- a/tika-core/src/main/java/org/apache/tika/utils/RereadableInputStream.java +++ b/tika-core/src/main/java/org/apache/tika/utils/RereadableInputStream.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.utils; @@ -25,14 +23,12 @@ import java.io.InputStream; import java.io.OutputStream; import java.nio.file.Files; - import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream; /** - * Wraps an input stream, reading it only once, but making it available - * for rereading an arbitrary number of times. The stream's bytes are - * stored in memory up to a user specified maximum, and then stored in a - * temporary file which is deleted when this class's close() method is called. + * Wraps an input stream, reading it only once, but making it available for rereading an arbitrary + * number of times. The stream's bytes are stored in memory up to a user specified maximum, and then + * stored in a temporary file which is deleted when this class's close() method is called. */ public class RereadableInputStream extends InputStream { @@ -48,31 +44,29 @@ public class RereadableInputStream extends InputStream { private final InputStream originalInputStream; /** - * The inputStream currently being used by this object to read contents; - * may be the original stream passed in, or a stream that reads - * the saved copy from a memory buffer or file. + * The inputStream currently being used by this object to read contents; may be the original + * stream passed in, or a stream that reads the saved copy from a memory buffer or file. */ private InputStream inputStream; /** - * Maximum number of bytes that can be stored in memory before - * storage will be moved to a temporary file. + * Maximum number of bytes that can be stored in memory before storage will be moved to a + * temporary file. */ private final int maxBytesInMemory; /** - * Whether or not we are currently reading from the byte buffer in memory - * Bytes are read until we've exhausted the buffered bytes and then we proceed to read from - * the original input stream. 
If the numbers of bytes read from the original stream - * eventually exceed maxBytesInMemory, then we'll switch to reading from a file. + * Whether or not we are currently reading from the byte buffer in memory Bytes are read until + * we've exhausted the buffered bytes and then we proceed to read from the original input + * stream. If the numbers of bytes read from the original stream eventually exceed + * maxBytesInMemory, then we'll switch to reading from a file. */ private boolean readingFromBuffer; /** - * The buffer used to store the stream's content; this storage is moved - * to a file when the stored data's size exceeds maxBytesInMemory. - * Set to null once we start writing to a file. + * The buffer used to store the stream's content; this storage is moved to a file when the + * stored data's size exceeds maxBytesInMemory. Set to null once we start writing to a file. */ private byte[] byteBuffer; @@ -87,8 +81,8 @@ public class RereadableInputStream extends InputStream { private int bufferHighWaterMark; /** - * File used to store the stream's contents; is null until the stored - * content's size exceeds maxBytesInMemory. + * File used to store the stream's contents; is null until the stored content's size exceeds + * maxBytesInMemory. */ private File storeFile; @@ -98,23 +92,22 @@ public class RereadableInputStream extends InputStream { private boolean closed; /** - * OutputStream used to save the content of the input stream in a - * temporary file. + * OutputStream used to save the content of the input stream in a temporary file. */ private OutputStream storeOutputStream; /** - * Specifies whether or not to close the original input stream - * when close() is called. Defaults to true. + * Specifies whether or not to close the original input stream when close() is called. Defaults + * to true. 
*/ private final boolean closeOriginalStreamOnClose; /** - * Creates a rereadable input stream with defaults of 512*1024*1024 bytes (500M) for - * maxBytesInMemory and both readToEndOfStreamOnFirstRewind and closeOriginalStreamOnClose - * set to true + * Creates a rereadable input stream with defaults of 512*1024*1024 bytes (500M) for + * maxBytesInMemory and both readToEndOfStreamOnFirstRewind and closeOriginalStreamOnClose set + * to true * * @param inputStream stream containing the source of data */ @@ -133,16 +126,14 @@ public RereadableInputStream(InputStream inputStream, boolean closeOriginalStrea } /** - * Creates a rereadable input stream with closeOriginalStreamOnClose set to true + * Creates a rereadable input stream with closeOriginalStreamOnClose set to true * - * @param inputStream stream containing the source of data - * @param maxBytesInMemory maximum number of bytes to use to store - * the stream's contents in memory before switching to disk; note that - * the instance will preallocate a byte array whose size is - * maxBytesInMemory. This byte array will be made available for - * garbage collection (i.e. its reference set to null) when the - * content size exceeds the array's size, when close() is called, or - * when there are no more references to the instance. + * @param inputStream stream containing the source of data + * @param maxBytesInMemory maximum number of bytes to use to store the stream's contents in + * memory before switching to disk; note that the instance will preallocate a byte array + * whose size is maxBytesInMemory. This byte array will be made available for garbage + * collection (i.e. its reference set to null) when the content size exceeds the array's + * size, when close() is called, or when there are no more references to the instance. 
*/ public RereadableInputStream(InputStream inputStream, int maxBytesInMemory) { this(inputStream, maxBytesInMemory, true); @@ -151,17 +142,15 @@ public RereadableInputStream(InputStream inputStream, int maxBytesInMemory) { /** * Creates a rereadable input stream. * - * @param inputStream stream containing the source of data - * @param maxBytesInMemory maximum number of bytes to use to store - * the stream's contents in memory before switching to disk; note that - * the instance will preallocate a byte array whose size is - * maxBytesInMemory. This byte array will be made available for - * garbage collection (i.e. its reference set to null) when the - * content size exceeds the array's size, when close() is called, or - * when there are no more references to the instance. + * @param inputStream stream containing the source of data + * @param maxBytesInMemory maximum number of bytes to use to store the stream's contents in + * memory before switching to disk; note that the instance will preallocate a byte array + * whose size is maxBytesInMemory. This byte array will be made available for garbage + * collection (i.e. its reference set to null) when the content size exceeds the array's + * size, when close() is called, or when there are no more references to the instance. */ public RereadableInputStream(InputStream inputStream, int maxBytesInMemory, - boolean closeOriginalStreamOnClose) { + boolean closeOriginalStreamOnClose) { this.inputStream = inputStream; this.originalInputStream = inputStream; this.maxBytesInMemory = maxBytesInMemory; @@ -170,9 +159,8 @@ public RereadableInputStream(InputStream inputStream, int maxBytesInMemory, } /** - * Reads a byte from the stream, saving it in the store if it is being - * read from the original stream. Implements the abstract - * InputStream.read(). + * Reads a byte from the stream, saving it in the store if it is being read from the original + * stream. Implements the abstract InputStream.read(). 
* * @return the read byte, or -1 on end of stream. * @throws IOException @@ -188,9 +176,9 @@ public int read() throws IOException { // the next byte from there instead if (readingFromBuffer) { readingFromBuffer = false; - inputStream.close(); // Close the input byte stream + inputStream.close(); // Close the input byte stream } else { - inputStream.close(); // Close the input file stream + inputStream.close(); // Close the input file stream // start appending to the file storeOutputStream = new BufferedOutputStream(new FileOutputStream(storeFile, true)); } @@ -257,8 +245,8 @@ public void rewind() throws IOException { // If we have a buffer, then we'll read from it if (byteBuffer != null) { readingFromBuffer = true; - inputStream = UnsynchronizedByteArrayInputStream.builder(). - setByteArray(byteBuffer).setOffset(0).setLength(bufferHighWaterMark).get(); + inputStream = UnsynchronizedByteArrayInputStream.builder().setByteArray(byteBuffer) + .setOffset(0).setLength(bufferHighWaterMark).get(); } else { // No buffer, which means we've switched to a file inputStream = new BufferedInputStream(new FileInputStream(storeFile)); @@ -269,8 +257,8 @@ public void rewind() throws IOException { } /** - * Closes the input stream currently used for reading (may either be - * the original stream or a memory or file stream after the first pass). + * Closes the input stream currently used for reading (may either be the original stream or a + * memory or file stream after the first pass). * * @throws IOException */ @@ -286,8 +274,7 @@ private void closeStream() throws IOException { } /** - * Closes the input stream and removes the temporary file if one was - * created. + * Closes the input stream and removes the temporary file if one was created. 
* * @throws IOException */ diff --git a/tika-core/src/main/java/org/apache/tika/utils/ServiceLoaderUtils.java b/tika-core/src/main/java/org/apache/tika/utils/ServiceLoaderUtils.java index 1e61c97ae0..cd3614357f 100644 --- a/tika-core/src/main/java/org/apache/tika/utils/ServiceLoaderUtils.java +++ b/tika-core/src/main/java/org/apache/tika/utils/ServiceLoaderUtils.java @@ -1,25 +1,22 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.utils; import java.lang.reflect.Constructor; import java.lang.reflect.InvocationTargetException; import java.util.List; - import org.apache.tika.config.ServiceLoader; /** @@ -27,8 +24,8 @@ */ public class ServiceLoaderUtils { /** - * Sorts a list of loaded classes, so that non-Tika ones come - * before Tika ones, and otherwise in reverse alphabetical order + * Sorts a list of loaded classes, so that non-Tika ones come before Tika ones, and otherwise in + * reverse alphabetical order */ public static void sortLoadedClasses(List loaded) { loaded.sort(CompareUtils::compareClassName); @@ -38,7 +35,7 @@ public static void sortLoadedClasses(List loaded) { * Loads a class and instantiates it * * @param className service class name - * @param service type + * @param service type * @return instance of service */ public static T newInstance(String className) { @@ -49,27 +46,27 @@ public static T newInstance(String className) { * Loads a class and instantiates it * * @param className service class name - * @param loader class loader - * @param service type + * @param loader class loader + * @param service type * @return instance of service */ public static T newInstance(String className, ClassLoader loader) { try { - return ((Class) Class.forName(className, true, loader)).getDeclaredConstructor().newInstance(); - } catch (ClassNotFoundException | InstantiationException | IllegalAccessException | - NoSuchMethodException | InvocationTargetException e) { + return ((Class) Class.forName(className, true, loader)).getDeclaredConstructor() + .newInstance(); + } catch (ClassNotFoundException | InstantiationException | IllegalAccessException + | NoSuchMethodException | InvocationTargetException e) { throw new RuntimeException(e); } } /** - * Loads a class and instantiates it. If the class can be initialized - * with a ServiceLoader, the ServiceLoader constructor is used. - * Otherwise, a zero arg newInstance() is called. 
+ * Loads a class and instantiates it. If the class can be initialized with a ServiceLoader, the + * ServiceLoader constructor is used. Otherwise, a zero arg newInstance() is called. * - * @param klass class to build - * @param loader service loader - * @param service type + * @param klass class to build + * @param loader service loader + * @param service type * @return instance of service */ public static T newInstance(Class klass, ServiceLoader loader) { @@ -78,12 +75,12 @@ public static T newInstance(Class klass, ServiceLoader loader) { Constructor constructor = klass.getDeclaredConstructor(ServiceLoader.class); return constructor.newInstance(loader); } catch (NoSuchMethodException e) { - return (T)klass.getDeclaredConstructor().newInstance(); + return (T) klass.getDeclaredConstructor().newInstance(); } catch (InvocationTargetException e) { throw new RuntimeException(e); } - } catch (InstantiationException | IllegalAccessException | NoSuchMethodException | - InvocationTargetException e) { + } catch (InstantiationException | IllegalAccessException | NoSuchMethodException + | InvocationTargetException e) { throw new RuntimeException(e); } } diff --git a/tika-core/src/main/java/org/apache/tika/utils/StreamGobbler.java b/tika-core/src/main/java/org/apache/tika/utils/StreamGobbler.java index effbeb2c90..e8a32307e9 100644 --- a/tika-core/src/main/java/org/apache/tika/utils/StreamGobbler.java +++ b/tika-core/src/main/java/org/apache/tika/utils/StreamGobbler.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.utils; @@ -43,8 +41,8 @@ public StreamGobbler(InputStream is, int maxBufferLength) { @Override public void run() { - try (BufferedReader r = new BufferedReader( - new InputStreamReader(is, StandardCharsets.UTF_8))) { + try (BufferedReader r = + new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8))) { String line = r.readLine(); while (line != null) { if (maxBufferLength >= 0) { diff --git a/tika-core/src/main/java/org/apache/tika/utils/StringUtils.java b/tika-core/src/main/java/org/apache/tika/utils/StringUtils.java index b09963d461..9baa84dd30 100644 --- a/tika-core/src/main/java/org/apache/tika/utils/StringUtils.java +++ b/tika-core/src/main/java/org/apache/tika/utils/StringUtils.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.utils; @@ -45,9 +43,13 @@ public static boolean isBlank(final String s) { } /** - *

Left pad a String with a specified String.

+ *

+ * Left pad a String with a specified String. + *

* - *

Pad to a size of {@code size}.

+ *

+ * Pad to a size of {@code size}. + *

* *
      * StringUtils.leftPad(null, *, *)      = null
@@ -61,11 +63,11 @@ public static boolean isBlank(final String s) {
      * StringUtils.leftPad("bat", 5, "")    = "  bat"
      * 
* - * @param str the String to pad out, may be null - * @param size the size to pad to + * @param str the String to pad out, may be null + * @param size the size to pad to * @param padStr the String to pad with, null or empty treated as single space - * @return left padded String or original String if no padding is necessary, - * {@code null} if null String input + * @return left padded String or original String if no padding is necessary, {@code null} if + * null String input */ public static String leftPad(final String str, final int size, String padStr) { if (str == null) { @@ -114,8 +116,9 @@ public static String leftPad(final String str, final int size, final char padCha } /** - *

Returns padding using the specified delimiter repeated - * to a given length.

+ *

+ * Returns padding using the specified delimiter repeated to a given length. + *

* *
      * StringUtils.repeat('e', 0)  = ""
@@ -123,14 +126,15 @@ public static String leftPad(final String str, final int size, final char padCha
      * StringUtils.repeat('e', -2) = ""
      * 
* - *

Note: this method does not support padding with - * Unicode Supplementary Characters - * as they require a pair of {@code char}s to be represented. - * If you are needing to support full I18N of your applications - * consider using {@link #repeat(String, int)} instead. + *

+ * Note: this method does not support padding with + * Unicode Supplementary + * Characters as they require a pair of {@code char}s to be represented. If you are needing + * to support full I18N of your applications consider using {@link #repeat(String, int)} + * instead. *

* - * @param ch character to repeat + * @param ch character to repeat * @param repeat number of times to repeat char, negative treated as zero * @return String with repeated character * @see #repeat(String, int) @@ -147,11 +151,12 @@ public static String repeat(final char ch, final int repeat) { } // Padding - //----------------------------------------------------------------------- + // ----------------------------------------------------------------------- /** - *

Repeat a String {@code repeat} times to form a - * new String.

+ *

+ * Repeat a String {@code repeat} times to form a new String. + *

* *
      * StringUtils.repeat(null, 2) = null
@@ -162,10 +167,10 @@ public static String repeat(final char ch, final int repeat) {
      * StringUtils.repeat("a", -2) = ""
      * 
* - * @param str the String to repeat, may be null + * @param str the String to repeat, may be null * @param repeat number of times to repeat str, negative treated as zero - * @return a new String consisting of the original String repeated, - * {@code null} if null String input + * @return a new String consisting of the original String repeated, {@code null} if null String + * input */ public static String repeat(final String str, final int repeat) { // Performance tuned for 2.0 (JDK1.4) diff --git a/tika-core/src/main/java/org/apache/tika/utils/SystemUtils.java b/tika-core/src/main/java/org/apache/tika/utils/SystemUtils.java index 027b677fcc..42ee906264 100644 --- a/tika-core/src/main/java/org/apache/tika/utils/SystemUtils.java +++ b/tika-core/src/main/java/org/apache/tika/utils/SystemUtils.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.utils; @@ -27,7 +25,7 @@ public class SystemUtils { public static final boolean IS_OS_HP_UX = getOSMatchesName("HP-UX"); public static final boolean IS_OS_IRIX = getOSMatchesName("Irix"); public static final boolean IS_OS_LINUX = - getOSMatchesName("Linux") || getOSMatchesName("LINUX"); + getOSMatchesName("Linux") || getOSMatchesName("LINUX"); public static final boolean IS_OS_MAC = getOSMatchesName("Mac"); public static final boolean IS_OS_MAC_OSX = getOSMatchesName("Mac OS X"); public static final boolean IS_OS_OS2 = getOSMatchesName("OS/2"); @@ -40,8 +38,8 @@ public class SystemUtils { private static final String OS_VERSION_WSL = "WSL"; static { - IS_OS_UNIX = IS_OS_AIX || IS_OS_HP_UX || IS_OS_IRIX || IS_OS_LINUX || IS_OS_MAC_OSX || - IS_OS_SOLARIS || IS_OS_SUN_OS; + IS_OS_UNIX = IS_OS_AIX || IS_OS_HP_UX || IS_OS_IRIX || IS_OS_LINUX || IS_OS_MAC_OSX + || IS_OS_SOLARIS || IS_OS_SUN_OS; IS_OS_WINDOWS = getOSMatchesName(OS_NAME_WINDOWS_PREFIX); IS_OS_VERSION_WSL = getOSContainsVersion(OS_VERSION_WSL); } diff --git a/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java b/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java index fe57f04eea..302cef2b7f 100644 --- a/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java +++ b/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.utils; @@ -43,8 +41,10 @@ import javax.xml.transform.TransformerFactory; import javax.xml.transform.TransformerFactoryConfigurationError; import javax.xml.transform.sax.SAXTransformerFactory; - import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream; +import org.apache.tika.exception.TikaException; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.OfflineContentHandler; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.w3c.dom.Document; @@ -59,10 +59,6 @@ import org.xml.sax.XMLReader; import org.xml.sax.helpers.DefaultHandler; -import org.apache.tika.exception.TikaException; -import org.apache.tika.parser.ParseContext; -import org.apache.tika.sax.OfflineContentHandler; - /** * Utility functions for reading XML. @@ -70,8 +66,7 @@ public class XMLReaderUtils implements Serializable { /** - * Default size for the pool of SAX Parsers - * and the pool of DOM builders + * Default size for the pool of SAX Parsers and the pool of DOM builders */ public static final int DEFAULT_POOL_SIZE = 10; public static final int DEFAULT_MAX_ENTITY_EXPANSIONS = 20; @@ -83,20 +78,20 @@ public class XMLReaderUtils implements Serializable { private static final Logger LOG = LoggerFactory.getLogger(XMLReaderUtils.class); private static final String XERCES_SECURITY_MANAGER = "org.apache.xerces.util.SecurityManager"; private static final String XERCES_SECURITY_MANAGER_PROPERTY = - "http://apache.org/xml/properties/security-manager"; + "http://apache.org/xml/properties/security-manager"; private static final AtomicBoolean HAS_WARNED_STAX = new AtomicBoolean(false); private static final ContentHandler IGNORING_CONTENT_HANDLER = new DefaultHandler(); private static final DTDHandler IGNORING_DTD_HANDLER = new DTDHandler() { @Override public void notationDecl(String name, String publicId, String systemId) - throws SAXException { + throws SAXException { } @Override public void unparsedEntityDecl(String name, String 
publicId, String systemId, - String notationName) throws SAXException { + String notationName) throws SAXException { } }; @@ -117,19 +112,19 @@ public void fatalError(SAXParseException exception) throws SAXException { } }; private static final String JAXP_ENTITY_EXPANSION_LIMIT_KEY = "jdk.xml.entityExpansionLimit"; - //TODO: figure out if the rw lock is any better than a simple lock - //these lock the pool arrayblocking queues so that there isn't a race condition - //of trying to acquire a parser while the pool is being resized + // TODO: figure out if the rw lock is any better than a simple lock + // these lock the pool arrayblocking queues so that there isn't a race condition + // of trying to acquire a parser while the pool is being resized private static final ReentrantReadWriteLock SAX_POOL_LOCK = new ReentrantReadWriteLock(); private static final ReentrantReadWriteLock DOM_POOL_LOCK = new ReentrantReadWriteLock(); private static final AtomicInteger POOL_GENERATION = new AtomicInteger(); private static final EntityResolver IGNORING_SAX_ENTITY_RESOLVER = - (publicId, systemId) -> new InputSource(new StringReader("")); + (publicId, systemId) -> new InputSource(new StringReader("")); - //BE CAREFUL with the return type. Some parsers will silently ignore an unexpected return type: CVE-2025-54988 - private static final XMLResolver IGNORING_STAX_ENTITY_RESOLVER = - (publicID, systemID, baseURI, namespace) -> - UnsynchronizedByteArrayInputStream.nullInputStream(); + // BE CAREFUL with the return type. 
Some parsers will silently ignore an unexpected return type: + // CVE-2025-54988 + private static final XMLResolver IGNORING_STAX_ENTITY_RESOLVER = (publicID, systemID, baseURI, + namespace) -> UnsynchronizedByteArrayInputStream.nullInputStream(); /** * Parser pool size */ @@ -138,9 +133,9 @@ public void fatalError(SAXParseException exception) throws SAXException { private static long LAST_LOG = -1; private static volatile int MAX_ENTITY_EXPANSIONS = determineMaxEntityExpansions(); private static ArrayBlockingQueue SAX_PARSERS = - new ArrayBlockingQueue<>(POOL_SIZE); + new ArrayBlockingQueue<>(POOL_SIZE); private static ArrayBlockingQueue DOM_BUILDERS = - new ArrayBlockingQueue<>(POOL_SIZE); + new ArrayBlockingQueue<>(POOL_SIZE); static { try { @@ -156,19 +151,17 @@ private static int determineMaxEntityExpansions() { try { return Integer.parseInt(expansionLimit); } catch (NumberFormatException e) { - LOG.warn( - "Couldn't parse an integer for the entity expansion limit: {}; " + - "backing off to default: {}", - expansionLimit, DEFAULT_MAX_ENTITY_EXPANSIONS); + LOG.warn("Couldn't parse an integer for the entity expansion limit: {}; " + + "backing off to default: {}", expansionLimit, + DEFAULT_MAX_ENTITY_EXPANSIONS); } } return DEFAULT_MAX_ENTITY_EXPANSIONS; } /** - * Returns the XMLReader specified in this parsing context. If a reader - * is not explicitly specified, then one is created using the specified - * or the default SAX parser. + * Returns the XMLReader specified in this parsing context. If a reader is not explicitly + * specified, then one is created using the specified or the default SAX parser. * * @return XMLReader * @throws TikaException @@ -187,12 +180,11 @@ public static XMLReader getXMLReader() throws TikaException { } /** - * Returns the SAX parser specified in this parsing context. If a parser - * is not explicitly specified, then one is created using the specified - * or the default SAX parser factory. 
+ * Returns the SAX parser specified in this parsing context. If a parser is not explicitly + * specified, then one is created using the specified or the default SAX parser factory. *

- * If you call reset() on the parser, make sure to replace the - * SecurityManager which will be cleared by xerces2 on reset(). + * If you call reset() on the parser, make sure to replace the SecurityManager which will be + * cleared by xerces2 on reset(). *

* * @return SAX parser @@ -213,10 +205,9 @@ public static SAXParser getSAXParser() throws TikaException { } /** - * Returns the SAX parser factory specified in this parsing context. - * If a factory is not explicitly specified, then a default factory - * instance is created and returned. The default factory instance is - * configured to be namespace-aware, not validating, and to use + * Returns the SAX parser factory specified in this parsing context. If a factory is not + * explicitly specified, then a default factory instance is created and returned. The default + * factory instance is configured to be namespace-aware, not validating, and to use * {@link XMLConstants#FEATURE_SECURE_PROCESSING secure XML processing}. * * @return SAX parser factory @@ -233,25 +224,24 @@ public static SAXParserFactory getSAXParserFactory() { trySetSAXFeature(factory, "http://xml.org/sax/features/external-general-entities", false); trySetSAXFeature(factory, "http://xml.org/sax/features/external-parameter-entities", false); trySetSAXFeature(factory, "http://apache.org/xml/features/nonvalidating/load-external-dtd", - false); + false); trySetSAXFeature(factory, "http://apache.org/xml/features/nonvalidating/load-dtd-grammar", - false); + false); return factory; } /** - * Returns the DOM builder factory specified in this parsing context. - * If a factory is not explicitly specified, then a default factory - * instance is created and returned. The default factory instance is - * configured to be namespace-aware and to apply reasonable security + * Returns the DOM builder factory specified in this parsing context. If a factory is not + * explicitly specified, then a default factory instance is created and returned. The default + * factory instance is configured to be namespace-aware and to apply reasonable security * features. 
* * @return DOM parser factory * @since Apache Tika 1.13 */ public static DocumentBuilderFactory getDocumentBuilderFactory() { - //borrowed from Apache POI + // borrowed from Apache POI DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); if (LOG.isDebugEnabled()) { LOG.debug("DocumentBuilderFactory class {}", factory.getClass()); @@ -265,19 +255,18 @@ public static DocumentBuilderFactory getDocumentBuilderFactory() { trySetSAXFeature(factory, "http://xml.org/sax/features/external-general-entities", false); trySetSAXFeature(factory, "http://xml.org/sax/features/external-parameter-entities", false); trySetSAXFeature(factory, "http://apache.org/xml/features/nonvalidating/load-external-dtd", - false); + false); trySetSAXFeature(factory, "http://apache.org/xml/features/nonvalidating/load-dtd-grammar", - false); + false); trySetXercesSecurityManager(factory); return factory; } /** - * Returns the DOM builder specified in this parsing context. - * If a builder is not explicitly specified, then a builder - * instance is created and returned. The builder instance is - * configured to apply an {@link #IGNORING_SAX_ENTITY_RESOLVER}, - * and it sets the ErrorHandler to null. + * Returns the DOM builder specified in this parsing context. If a builder is not explicitly + * specified, then a builder instance is created and returned. The builder instance is + * configured to apply an {@link #IGNORING_SAX_ENTITY_RESOLVER}, and it sets the ErrorHandler to + * null. * * @return DOM Builder * @since Apache Tika 1.13 @@ -295,10 +284,9 @@ public static DocumentBuilder getDocumentBuilder() throws TikaException { } /** - * Returns the StAX input factory specified in this parsing context. - * If a factory is not explicitly specified, then a default factory - * instance is created and returned. The default factory instance is - * configured to be namespace-aware and to apply reasonable security + * Returns the StAX input factory specified in this parsing context. 
If a factory is not + * explicitly specified, then a default factory instance is created and returned. The default + * factory instance is configured to be namespace-aware and to apply reasonable security * precautions. * * @return StAX input factory @@ -312,20 +300,20 @@ public static XMLInputFactory getXMLInputFactory() { tryToSetStaxProperty(factory, XMLInputFactory.IS_NAMESPACE_AWARE, true); - //try to configure secure processing + // try to configure secure processing tryToSetStaxProperty(factory, XMLConstants.ACCESS_EXTERNAL_DTD, ""); tryToSetStaxProperty(factory, XMLInputFactory.IS_VALIDATING, false); tryToSetStaxProperty(factory, XMLInputFactory.SUPPORT_DTD, false); tryToSetStaxProperty(factory, XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, false); - //defense in depth + // defense in depth factory.setXMLResolver(IGNORING_STAX_ENTITY_RESOLVER); trySetStaxSecurityManager(factory); return factory; } private static void trySetTransformerAttribute(TransformerFactory transformerFactory, - String attribute, String value) { + String attribute, String value) { try { transformerFactory.setAttribute(attribute, value); } catch (SecurityException e) { @@ -333,14 +321,13 @@ private static void trySetTransformerAttribute(TransformerFactory transformerFac } catch (Exception e) { LOG.warn("Transformer Attribute unsupported: {}", attribute, e); } catch (AbstractMethodError ame) { - LOG.warn( - "Cannot set Transformer attribute because outdated XML parser in classpath: {}", - attribute, ame); + LOG.warn("Cannot set Transformer attribute because outdated XML parser in classpath: {}", + attribute, ame); } } private static void trySetSAXFeature(SAXParserFactory saxParserFactory, String feature, - boolean enabled) { + boolean enabled) { try { saxParserFactory.setFeature(feature, enabled); } catch (SecurityException e) { @@ -349,19 +336,19 @@ private static void trySetSAXFeature(SAXParserFactory saxParserFactory, String f LOG.warn("SAX Feature unsupported: {}", feature, e); } 
catch (AbstractMethodError ame) { LOG.warn("Cannot set SAX feature because outdated XML parser in classpath: {}", feature, - ame); + ame); } } private static void trySetSAXFeature(DocumentBuilderFactory documentBuilderFactory, - String feature, boolean enabled) { + String feature, boolean enabled) { try { documentBuilderFactory.setFeature(feature, enabled); } catch (Exception e) { LOG.warn("SAX Feature unsupported: {}", feature, e); } catch (AbstractMethodError ame) { LOG.warn("Cannot set SAX feature because outdated XML parser in classpath: {}", feature, - ame); + ame); } } @@ -402,8 +389,8 @@ public static Transformer getTransformer() throws TikaException { /** * Returns a TransformerFactory. The factory is configured with - * {@link XMLConstants#FEATURE_SECURE_PROCESSING secure XML processing} and other - * settings to prevent XXE. + * {@link XMLConstants#FEATURE_SECURE_PROCESSING secure XML processing} and other settings to + * prevent XXE. * * @return TransformerFactory * @throws TikaException @@ -414,7 +401,8 @@ public static TransformerFactory getTransformerFactory() throws TikaException { TransformerFactory transformerFactory = TransformerFactory.newInstance(); transformerFactory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true); trySetTransformerAttribute(transformerFactory, XMLConstants.ACCESS_EXTERNAL_DTD, ""); - trySetTransformerAttribute(transformerFactory, XMLConstants.ACCESS_EXTERNAL_STYLESHEET, ""); + trySetTransformerAttribute(transformerFactory, XMLConstants.ACCESS_EXTERNAL_STYLESHEET, + ""); return transformerFactory; } catch (TransformerConfigurationException | TransformerFactoryConfigurationError e) { throw new TikaException("Transformer not available", e); @@ -423,8 +411,8 @@ public static TransformerFactory getTransformerFactory() throws TikaException { /** * Returns a SAXTransformerFactory. The factory is configured with - * {@link XMLConstants#FEATURE_SECURE_PROCESSING secure XML processing} and other - * settings to prevent XXE. 
+ * {@link XMLConstants#FEATURE_SECURE_PROCESSING secure XML processing} and other settings to + * prevent XXE. * * @return TransformerFactory * @throws TikaException @@ -432,10 +420,12 @@ public static TransformerFactory getTransformerFactory() throws TikaException { public static SAXTransformerFactory getSAXTransformerFactory() throws TikaException { try { - SAXTransformerFactory transformerFactory = (SAXTransformerFactory) SAXTransformerFactory.newInstance(); + SAXTransformerFactory transformerFactory = + (SAXTransformerFactory) SAXTransformerFactory.newInstance(); transformerFactory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true); trySetTransformerAttribute(transformerFactory, XMLConstants.ACCESS_EXTERNAL_DTD, ""); - trySetTransformerAttribute(transformerFactory, XMLConstants.ACCESS_EXTERNAL_STYLESHEET, ""); + trySetTransformerAttribute(transformerFactory, XMLConstants.ACCESS_EXTERNAL_STYLESHEET, + ""); return transformerFactory; } catch (TransformerConfigurationException | TransformerFactoryConfigurationError e) { throw new TikaException("Transformer not available", e); @@ -443,10 +433,10 @@ public static SAXTransformerFactory getSAXTransformerFactory() throws TikaExcept } /** - * This checks context for a user specified {@link DocumentBuilder}. - * If one is not found, this reuses a DocumentBuilder from the pool. + * This checks context for a user specified {@link DocumentBuilder}. If one is not found, this + * reuses a DocumentBuilder from the pool. 
* - * @param is InputStream to parse + * @param is InputStream to parse * @param context context to use * @return a document * @throws TikaException @@ -455,7 +445,7 @@ public static SAXTransformerFactory getSAXTransformerFactory() throws TikaExcept * @since Apache Tika 1.19 */ public static Document buildDOM(InputStream is, ParseContext context) - throws TikaException, IOException, SAXException { + throws TikaException, IOException, SAXException { DocumentBuilder builder = context.get(DocumentBuilder.class); PoolDOMBuilder poolBuilder = null; if (builder == null) { @@ -479,10 +469,10 @@ public static Document buildDOM(InputStream is, ParseContext context) } /** - * This checks context for a user specified {@link DocumentBuilder}. - * If one is not found, this reuses a DocumentBuilder from the pool. + * This checks context for a user specified {@link DocumentBuilder}. If one is not found, this + * reuses a DocumentBuilder from the pool. * - * @param reader reader (character stream) to parse + * @param reader reader (character stream) to parse * @param context context to use * @return a document * @throws TikaException @@ -491,7 +481,7 @@ public static Document buildDOM(InputStream is, ParseContext context) * @since Apache Tika 2.5 */ public static Document buildDOM(Reader reader, ParseContext context) - throws TikaException, IOException, SAXException { + throws TikaException, IOException, SAXException { DocumentBuilder builder = context.get(DocumentBuilder.class); PoolDOMBuilder poolBuilder = null; if (builder == null) { @@ -541,7 +531,7 @@ public static Document buildDOM(Path path) throws TikaException, IOException, SA * @since Apache Tika 1.19.1 */ public static Document buildDOM(String uriString) - throws TikaException, IOException, SAXException { + throws TikaException, IOException, SAXException { PoolDOMBuilder poolBuilder = null; DocumentBuilder builder = null; if (POOL_SIZE == 0) { @@ -572,7 +562,7 @@ public static Document buildDOM(String uriString) * 
@since Apache Tika 1.19.1 */ public static Document buildDOM(InputStream is) - throws TikaException, IOException, SAXException { + throws TikaException, IOException, SAXException { PoolDOMBuilder poolBuilder = null; DocumentBuilder builder = null; if (POOL_SIZE == 0) { @@ -594,14 +584,13 @@ public static Document buildDOM(InputStream is) } /** - * This checks context for a user specified {@link SAXParser}. - * If one is not found, this reuses a SAXParser from the pool. + * This checks context for a user specified {@link SAXParser}. If one is not found, this reuses + * a SAXParser from the pool. * - * @param is InputStream to parse - * @param contentHandler handler to use; this wraps a {@link OfflineContentHandler} - * to the content handler as an extra layer of defense against - * external entity vulnerabilities - * @param context context to use + * @param is InputStream to parse + * @param contentHandler handler to use; this wraps a {@link OfflineContentHandler} to the + * content handler as an extra layer of defense against external entity vulnerabilities + * @param context context to use * @return * @throws TikaException * @throws IOException @@ -609,7 +598,7 @@ public static Document buildDOM(InputStream is) * @since Apache Tika 1.19 */ public static void parseSAX(InputStream is, ContentHandler contentHandler, ParseContext context) - throws TikaException, IOException, SAXException { + throws TikaException, IOException, SAXException { SAXParser saxParser = context.get(SAXParser.class); PoolSAXParser poolSAXParser = null; if (saxParser == null) { @@ -632,14 +621,13 @@ public static void parseSAX(InputStream is, ContentHandler contentHandler, Parse } /** - * This checks context for a user specified {@link SAXParser}. - * If one is not found, this reuses a SAXParser from the pool. + * This checks context for a user specified {@link SAXParser}. If one is not found, this reuses + * a SAXParser from the pool. 
* - * @param reader reader (character stream) to parse - * @param contentHandler handler to use; this wraps a {@link OfflineContentHandler} - * to the content handler as an extra layer of defense against - * external entity vulnerabilities - * @param context context to use + * @param reader reader (character stream) to parse + * @param contentHandler handler to use; this wraps a {@link OfflineContentHandler} to the + * content handler as an extra layer of defense against external entity vulnerabilities + * @param context context to use * @return * @throws TikaException * @throws IOException @@ -647,7 +635,7 @@ public static void parseSAX(InputStream is, ContentHandler contentHandler, Parse * @since Apache Tika 2.5 */ public static void parseSAX(Reader reader, ContentHandler contentHandler, ParseContext context) - throws TikaException, IOException, SAXException { + throws TikaException, IOException, SAXException { SAXParser saxParser = context.get(SAXParser.class); PoolSAXParser poolSAXParser = null; if (saxParser == null) { @@ -670,9 +658,8 @@ public static void parseSAX(Reader reader, ContentHandler contentHandler, ParseC } /** - * Acquire a DOMBuilder from the pool. Make sure to - * {@link #releaseDOMBuilder(PoolDOMBuilder)} in - * a finally block every time you call this. + * Acquire a DOMBuilder from the pool. Make sure to {@link #releaseDOMBuilder(PoolDOMBuilder)} + * in a finally block every time you call this. 
* * @return a DocumentBuilder or null if no DOMBuilders are available * @throws TikaException @@ -680,19 +667,15 @@ public static void parseSAX(Reader reader, ContentHandler contentHandler, ParseC private static PoolDOMBuilder acquireDOMBuilder() throws TikaException { PoolDOMBuilder builder = null; - DOM_POOL_LOCK - .readLock() - .lock(); + DOM_POOL_LOCK.readLock().lock(); try { builder = DOM_BUILDERS.poll(); } finally { - DOM_POOL_LOCK - .readLock() - .unlock(); + DOM_POOL_LOCK.readLock().unlock(); } if (builder == null) { - LOG.warn("Contention waiting for a DOMBuilder. " + - "Consider increasing the XMLReaderUtils.POOL_SIZE"); + LOG.warn("Contention waiting for a DOMBuilder. " + + "Consider increasing the XMLReaderUtils.POOL_SIZE"); } return builder; @@ -713,39 +696,36 @@ private static void releaseDOMBuilder(PoolDOMBuilder builder) { try { builder.reset(); } catch (UnsupportedOperationException e) { - //ignore + // ignore } - DOM_POOL_LOCK - .readLock().lock(); + DOM_POOL_LOCK.readLock().lock(); builder.incrementUses(); if (builder.numUses >= MAX_NUM_REUSES) { try { - builder = new PoolDOMBuilder(builder.getPoolGeneration(), getDocumentBuilderFactory().newDocumentBuilder()); + builder = new PoolDOMBuilder(builder.getPoolGeneration(), + getDocumentBuilderFactory().newDocumentBuilder()); } catch (ParserConfigurationException e) { LOG.warn("Exception trying to configure a new dom builder?!", e); return; } } try { - //if there are extra parsers (e.g. after a reset of the pool to a smaller size), + // if there are extra parsers (e.g. after a reset of the pool to a smaller size), // this parser will not be added and will then be gc'd boolean success = DOM_BUILDERS.offer(builder); if (!success) { - LOG.warn( - "DocumentBuilder not taken back into pool. If you haven't resized the " + - "pool, this could be a sign that there are more calls to " + - "'acquire' than to 'release'"); + LOG.warn("DocumentBuilder not taken back into pool. 
If you haven't resized the " + + "pool, this could be a sign that there are more calls to " + + "'acquire' than to 'release'"); } } finally { - DOM_POOL_LOCK - .readLock().unlock(); + DOM_POOL_LOCK.readLock().unlock(); } } /** - * Acquire a SAXParser from the pool. Make sure to - * {@link #releaseParser(PoolSAXParser)} in - * a finally block every time you call this. + * Acquire a SAXParser from the pool. Make sure to {@link #releaseParser(PoolSAXParser)} in a + * finally block every time you call this. * * @return a SAXParser or null if a parser is not available * @throws TikaException @@ -753,21 +733,17 @@ private static void releaseDOMBuilder(PoolDOMBuilder builder) { private static PoolSAXParser acquireSAXParser() throws TikaException { PoolSAXParser parser = null; - //this locks around the pool so that there's - //no race condition with it being resized - SAX_POOL_LOCK - .readLock() - .lock(); + // this locks around the pool so that there's + // no race condition with it being resized + SAX_POOL_LOCK.readLock().lock(); try { parser = SAX_PARSERS.poll(); } finally { - SAX_POOL_LOCK - .readLock() - .unlock(); + SAX_POOL_LOCK.readLock().unlock(); } if (parser == null) { - LOG.warn("Contention waiting for a SAXParser. " + - "Consider increasing the XMLReaderUtils.POOL_SIZE"); + LOG.warn("Contention waiting for a SAXParser. " + + "Consider increasing the XMLReaderUtils.POOL_SIZE"); } return parser; } @@ -784,64 +760,59 @@ private static void releaseParser(PoolSAXParser parser) { try { parser.reset(); } catch (UnsupportedOperationException e) { - //TIKA-3009 -- we really shouldn't have to do this... :( + // TIKA-3009 -- we really shouldn't have to do this... 
:( } - //if this is a different generation, don't put it back - //in the pool + // if this is a different generation, don't put it back + // in the pool if (parser.getGeneration() != POOL_GENERATION.get()) { return; } - SAX_POOL_LOCK - .readLock().lock(); + SAX_POOL_LOCK.readLock().lock(); try { parser.incrementUses(); if (parser.numUses >= MAX_NUM_REUSES) { try { - parser = buildPoolParser(parser.getGeneration(), getSAXParserFactory().newSAXParser()); + parser = buildPoolParser(parser.getGeneration(), + getSAXParserFactory().newSAXParser()); } catch (SAXException | ParserConfigurationException e) { LOG.warn("Couldn't build new SAXParser after hitting max reuses", e); return; } } - //if there are extra parsers (e.g. after a reset of the pool to a smaller size), + // if there are extra parsers (e.g. after a reset of the pool to a smaller size), // this parser will not be added and will then be gc'd boolean success = SAX_PARSERS.offer(parser); if (!success) { - LOG.warn( - "SAXParser not taken back into pool. If you haven't resized the pool " + - "this could be a sign that there are more calls to 'acquire' " + - "than to 'release'"); + LOG.warn("SAXParser not taken back into pool. 
If you haven't resized the pool " + + "this could be a sign that there are more calls to 'acquire' " + + "than to 'release'"); } } finally { - SAX_POOL_LOCK - .readLock().unlock(); + SAX_POOL_LOCK.readLock().unlock(); } } private static void trySetXercesSecurityManager(DocumentBuilderFactory factory) { - //from POI + // from POI // Try built-in JVM one first, standalone if not - for (String securityManagerClassName : new String[]{ - //"com.sun.org.apache.xerces.internal.util.SecurityManager", - XERCES_SECURITY_MANAGER}) { + for (String securityManagerClassName : new String[] { + // "com.sun.org.apache.xerces.internal.util.SecurityManager", + XERCES_SECURITY_MANAGER}) { try { - Object mgr = - Class.forName(securityManagerClassName).getDeclaredConstructor().newInstance(); - Method setLimit = mgr.getClass().getMethod("setEntityExpansionLimit", - Integer.TYPE); + Object mgr = Class.forName(securityManagerClassName).getDeclaredConstructor() + .newInstance(); + Method setLimit = mgr.getClass().getMethod("setEntityExpansionLimit", Integer.TYPE); setLimit.invoke(mgr, MAX_ENTITY_EXPANSIONS); factory.setAttribute(XERCES_SECURITY_MANAGER_PROPERTY, mgr); // Stop once one can be setup without error return; } catch (ClassNotFoundException e) { // continue without log, this is expected in some setups - } catch (Throwable e) { // NOSONAR - also catch things like NoClassDefError here + } catch (Throwable e) { // NOSONAR - also catch things like NoClassDefError here // throttle the log somewhat as it can spam the log otherwise if (System.currentTimeMillis() > LAST_LOG + TimeUnit.MINUTES.toMillis(5)) { - LOG.warn( - "SAX Security Manager could not be setup [log suppressed for 5 " + - "minutes]", - e); + LOG.warn("SAX Security Manager could not be setup [log suppressed for 5 " + + "minutes]", e); LAST_LOG = System.currentTimeMillis(); } } @@ -850,27 +821,27 @@ private static void trySetXercesSecurityManager(DocumentBuilderFactory factory) // separate old version of Xerces not found => 
use the builtin way of setting the property try { factory.setAttribute("http://www.oracle.com/xml/jaxp/properties/entityExpansionLimit", - MAX_ENTITY_EXPANSIONS); + MAX_ENTITY_EXPANSIONS); } catch (IllegalArgumentException e) { // NOSONAR - also catch things like NoClassDefError here // throttle the log somewhat as it can spam the log otherwise if (System.currentTimeMillis() > LAST_LOG + TimeUnit.MINUTES.toMillis(5)) { LOG.warn("SAX Security Manager could not be setup [log suppressed for 5 minutes]", - e); + e); LAST_LOG = System.currentTimeMillis(); } } } private static void trySetXercesSecurityManager(SAXParser parser) { - //from POI + // from POI // Try built-in JVM one first, standalone if not - for (String securityManagerClassName : new String[]{ - //"com.sun.org.apache.xerces.internal.util.SecurityManager", - XERCES_SECURITY_MANAGER}) { + for (String securityManagerClassName : new String[] { + // "com.sun.org.apache.xerces.internal.util.SecurityManager", + XERCES_SECURITY_MANAGER}) { try { - Object mgr = - Class.forName(securityManagerClassName).getDeclaredConstructor().newInstance(); + Object mgr = Class.forName(securityManagerClassName).getDeclaredConstructor() + .newInstance(); Method setLimit = mgr.getClass().getMethod("setEntityExpansionLimit", Integer.TYPE); setLimit.invoke(mgr, MAX_ENTITY_EXPANSIONS); @@ -883,10 +854,8 @@ private static void trySetXercesSecurityManager(SAXParser parser) { // NOSONAR - also catch things like NoClassDefError here // throttle the log somewhat as it can spam the log otherwise if (System.currentTimeMillis() > LAST_LOG + TimeUnit.MINUTES.toMillis(5)) { - LOG.warn( - "SAX Security Manager could not be setup [log suppressed for 5 " + - "minutes]", - e); + LOG.warn("SAX Security Manager could not be setup [log suppressed for 5 " + + "minutes]", e); LAST_LOG = System.currentTimeMillis(); } } @@ -895,28 +864,30 @@ private static void trySetXercesSecurityManager(SAXParser parser) { // separate old version of Xerces not found => 
use the builtin way of setting the property try { parser.setProperty("http://www.oracle.com/xml/jaxp/properties/entityExpansionLimit", - MAX_ENTITY_EXPANSIONS); - } catch (SAXException e) { // NOSONAR - also catch things like NoClassDefError here + MAX_ENTITY_EXPANSIONS); + } catch (SAXException e) { // NOSONAR - also catch things like NoClassDefError here // throttle the log somewhat as it can spam the log otherwise if (System.currentTimeMillis() > LAST_LOG + TimeUnit.MINUTES.toMillis(5)) { LOG.warn("SAX Security Manager could not be setup [log suppressed for 5 minutes]", - e); + e); LAST_LOG = System.currentTimeMillis(); } } } private static void trySetStaxSecurityManager(XMLInputFactory inputFactory) { - //try default java entity expansion, then fallback to woodstox, then warn...once. + // try default java entity expansion, then fallback to woodstox, then warn...once. try { - inputFactory.setProperty("http://www.oracle.com/xml/jaxp/properties/entityExpansionLimit", - MAX_ENTITY_EXPANSIONS); + inputFactory.setProperty( + "http://www.oracle.com/xml/jaxp/properties/entityExpansionLimit", + MAX_ENTITY_EXPANSIONS); } catch (IllegalArgumentException e) { try { inputFactory.setProperty("com.ctc.wstx.maxEntityCount", MAX_ENTITY_EXPANSIONS); } catch (IllegalArgumentException e2) { if (HAS_WARNED_STAX.getAndSet(true) == false) { - LOG.warn("Could not set limit on maximum entity expansions for: " + inputFactory.getClass()); + LOG.warn("Could not set limit on maximum entity expansions for: " + + inputFactory.getClass()); } } @@ -941,9 +912,9 @@ public static int getPoolSize() { } /** - * Set the pool size for cached XML parsers. This has a side - * effect of locking the pool, and rebuilding the pool from - * scratch with the most recent settings, such as {@link #MAX_ENTITY_EXPANSIONS} + * Set the pool size for cached XML parsers. 
This has a side effect of locking the pool, and + * rebuilding the pool from scratch with the most recent settings, such as + * {@link #MAX_ENTITY_EXPANSIONS} * * As of Tika 3.2.1, if a value of 0 is passed in, no SAXParsers or DOMBuilders * will be pooled, and a new parser/builder will be built for each parse. @@ -955,16 +926,15 @@ public static void setPoolSize(int poolSize) throws TikaException { if (poolSize < 0) { throw new IllegalArgumentException("PoolSize must be >= 0"); } - //stop the world with a write lock. - //parsers that are currently in use will be offered later (once the lock is released), - //but not accepted and will be gc'd. We have to do this locking and - //the read locking in case one thread resizes the pool when the - //parsers have already started. We could have an NPE on SAX_PARSERS - //if we didn't lock. - SAX_POOL_LOCK - .writeLock().lock(); + // stop the world with a write lock. + // parsers that are currently in use will be offered later (once the lock is released), + // but not accepted and will be gc'd. We have to do this locking and + // the read locking in case one thread resizes the pool when the + // parsers have already started. We could have an NPE on SAX_PARSERS + // if we didn't lock. 
+ SAX_POOL_LOCK.writeLock().lock(); try { - //free up any resources before emptying SAX_PARSERS + // free up any resources before emptying SAX_PARSERS for (PoolSAXParser parser : SAX_PARSERS) { parser.reset(); } @@ -974,30 +944,29 @@ public static void setPoolSize(int poolSize) throws TikaException { int generation = POOL_GENERATION.incrementAndGet(); for (int i = 0; i < poolSize; i++) { try { - SAX_PARSERS.offer(buildPoolParser(generation, getSAXParserFactory().newSAXParser())); + SAX_PARSERS.offer(buildPoolParser(generation, + getSAXParserFactory().newSAXParser())); } catch (SAXException | ParserConfigurationException e) { throw new TikaException("problem creating sax parser", e); } } } } finally { - SAX_POOL_LOCK - .writeLock().unlock(); + SAX_POOL_LOCK.writeLock().unlock(); } - DOM_POOL_LOCK - .writeLock().lock(); + DOM_POOL_LOCK.writeLock().lock(); try { DOM_BUILDERS.clear(); if (poolSize > 0) { DOM_BUILDERS = new ArrayBlockingQueue<>(poolSize); for (int i = 0; i < poolSize; i++) { - DOM_BUILDERS.offer(new PoolDOMBuilder(POOL_GENERATION.get(), getDocumentBuilder())); + DOM_BUILDERS.offer(new PoolDOMBuilder(POOL_GENERATION.get(), + getDocumentBuilder())); } } } finally { - DOM_POOL_LOCK - .writeLock().unlock(); + DOM_POOL_LOCK.writeLock().unlock(); } POOL_SIZE = poolSize; } @@ -1007,14 +976,13 @@ public static int getMaxEntityExpansions() { } /** - * Set the maximum number of entity expansions allowable in SAX/DOM/StAX parsing. - * NOTE:A value less than or equal to zero indicates no limit. - * This will override the system property {@link #JAXP_ENTITY_EXPANSION_LIMIT_KEY} - * and the {@link #DEFAULT_MAX_ENTITY_EXPANSIONS} value for allowable entity expansions + * Set the maximum number of entity expansions allowable in SAX/DOM/StAX parsing. NOTE:A + * value less than or equal to zero indicates no limit. 
This will override the system property + * {@link #JAXP_ENTITY_EXPANSION_LIMIT_KEY} and the {@link #DEFAULT_MAX_ENTITY_EXPANSIONS} value + * for allowable entity expansions *

- * NOTE: To trigger a rebuild of the pool of parsers with this setting, - * the client must call {@link #setPoolSize(int)} to rebuild the SAX and DOM parsers - * with this setting. + * NOTE: To trigger a rebuild of the pool of parsers with this setting, the client must + * call {@link #setPoolSize(int)} to rebuild the SAX and DOM parsers with this setting. *

* * @param maxEntityExpansions -- maximum number of allowable entity expansions @@ -1048,15 +1016,15 @@ private static PoolSAXParser buildPoolParser(int generation, SAXParser parser) { } boolean hasSecurityManager = false; try { - Object mgr = - Class.forName(XERCES_SECURITY_MANAGER).getDeclaredConstructor().newInstance(); + Object mgr = Class.forName(XERCES_SECURITY_MANAGER).getDeclaredConstructor() + .newInstance(); Method setLimit = mgr.getClass().getMethod("setEntityExpansionLimit", Integer.TYPE); setLimit.invoke(mgr, MAX_ENTITY_EXPANSIONS); parser.setProperty(XERCES_SECURITY_MANAGER_PROPERTY, mgr); hasSecurityManager = true; } catch (SecurityException e) { - //don't swallow security exceptions + // don't swallow security exceptions throw e; } catch (ClassNotFoundException e) { // continue without log, this is expected in some setups @@ -1065,7 +1033,7 @@ private static PoolSAXParser buildPoolParser(int generation, SAXParser parser) { // throttle the log somewhat as it can spam the log otherwise if (System.currentTimeMillis() > LAST_LOG + TimeUnit.MINUTES.toMillis(5)) { LOG.warn("SAX Security Manager could not be setup [log suppressed for 5 minutes]", - e); + e); LAST_LOG = System.currentTimeMillis(); } } @@ -1075,15 +1043,13 @@ private static PoolSAXParser buildPoolParser(int generation, SAXParser parser) { // use the builtin way of setting the property try { parser.setProperty("http://www.oracle.com/xml/jaxp/properties/entityExpansionLimit", - MAX_ENTITY_EXPANSIONS); + MAX_ENTITY_EXPANSIONS); canSetJaxPEntity = true; - } catch (SAXException e) { // NOSONAR - also catch things like NoClassDefError here + } catch (SAXException e) { // NOSONAR - also catch things like NoClassDefError here // throttle the log somewhat as it can spam the log otherwise if (System.currentTimeMillis() > LAST_LOG + TimeUnit.MINUTES.toMillis(5)) { - LOG.warn( - "SAX Security Manager could not be setup [log suppressed for 5 " + - "minutes]", - e); + LOG.warn("SAX Security Manager could 
not be setup [log suppressed for 5 " + + "minutes]", e); LAST_LOG = System.currentTimeMillis(); } } @@ -1144,6 +1110,7 @@ private abstract static class PoolSAXParser { final int poolGeneration; final SAXParser saxParser; int numUses = 0; + PoolSAXParser(int poolGeneration, SAXParser saxParser) { this.poolGeneration = poolGeneration; this.saxParser = saxParser; @@ -1172,12 +1139,12 @@ public XercesPoolSAXParser(int generation, SAXParser parser) { @Override public void reset() { - //don't do anything + // don't do anything try { XMLReader reader = saxParser.getXMLReader(); clearReader(reader); } catch (SAXException e) { - //swallow + // swallow } } } @@ -1223,8 +1190,8 @@ void reset() { } private static class UnrecognizedPoolSAXParser extends PoolSAXParser { - //if unrecognized, try to set all protections - //and try to reset every time + // if unrecognized, try to set all protections + // and try to reset every time public UnrecognizedPoolSAXParser(int generation, SAXParser parser) { super(generation, parser); } @@ -1247,13 +1214,12 @@ void reset() { } /** - * Returns the DOM builder specified in this parsing context. - * If a builder is not explicitly specified, then a builder - * instance is created and returned. The builder instance is - * configured to apply an {@link XMLReaderUtils#IGNORING_SAX_ENTITY_RESOLVER}, - * and it sets the ErrorHandler to null. - * Consider using {@link XMLReaderUtils#buildDOM(InputStream, ParseContext)} - * instead for more efficient reuse of document builders. + * Returns the DOM builder specified in this parsing context. If a builder is not explicitly + * specified, then a builder instance is created and returned. The builder instance is + * configured to apply an {@link XMLReaderUtils#IGNORING_SAX_ENTITY_RESOLVER}, and it sets the + * ErrorHandler to null. Consider using + * {@link XMLReaderUtils#buildDOM(InputStream, ParseContext)} instead for more efficient reuse + * of document builders. 
* * @return DOM Builder */ @@ -1267,10 +1233,9 @@ public static DocumentBuilder getDocumentBuilder(ParseContext context) throws Ti } /** - * Returns the StAX input factory specified in this parsing context. - * If a factory is not explicitly specified, then a default factory - * instance is created and returned. The default factory instance is - * configured to be namespace-aware and to apply reasonable security + * Returns the StAX input factory specified in this parsing context. If a factory is not + * explicitly specified, then a default factory instance is created and returned. The default + * factory instance is configured to be namespace-aware and to apply reasonable security * precautions. * * @return StAX input factory @@ -1287,9 +1252,8 @@ public static XMLInputFactory getXMLInputFactory(ParseContext context) { /** * Returns the transformer specified in this parsing context. *

- * If a transformer is not explicitly specified, then a default transformer - * instance is created and returned. The default transformer instance is - * configured to to use + * If a transformer is not explicitly specified, then a default transformer instance is created + * and returned. The default transformer instance is configured to to use * {@link XMLConstants#FEATURE_SECURE_PROCESSING secure XML processing}. * * @return Transformer diff --git a/tika-core/src/main/java/org/apache/tika/utils/package-info.java b/tika-core/src/main/java/org/apache/tika/utils/package-info.java index 04ea52e5cf..f9d114acff 100644 --- a/tika-core/src/main/java/org/apache/tika/utils/package-info.java +++ b/tika-core/src/main/java/org/apache/tika/utils/package-info.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ /** diff --git a/tika-core/src/test/java/org/apache/custom/detect/MyCustomDetector.java b/tika-core/src/test/java/org/apache/custom/detect/MyCustomDetector.java index 7237d11766..07349dcef6 100644 --- a/tika-core/src/test/java/org/apache/custom/detect/MyCustomDetector.java +++ b/tika-core/src/test/java/org/apache/custom/detect/MyCustomDetector.java @@ -1,24 +1,21 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.custom.detect; import java.io.IOException; import java.io.InputStream; - import org.apache.tika.detect.Detector; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; diff --git a/tika-core/src/test/java/org/apache/tika/MultiThreadedTikaTest.java b/tika-core/src/test/java/org/apache/tika/MultiThreadedTikaTest.java index ee87f9bf72..70dfcea85b 100644 --- a/tika-core/src/test/java/org/apache/tika/MultiThreadedTikaTest.java +++ b/tika-core/src/test/java/org/apache/tika/MultiThreadedTikaTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika; @@ -42,7 +40,6 @@ import java.util.concurrent.Future; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; - import org.apache.tika.detect.Detector; import org.apache.tika.detect.XmlRootExtractor; import org.apache.tika.exception.TikaException; @@ -57,18 +54,18 @@ import org.apache.tika.utils.XMLReaderUtils; public class MultiThreadedTikaTest extends TikaTest { - //TODO: figure out how to make failures reproducible a la Lucene/Solr with a seed - //TODO: Consider randomizing the Locale and timezone, like Lucene/Solr... + // TODO: figure out how to make failures reproducible a la Lucene/Solr with a seed + // TODO: Consider randomizing the Locale and timezone, like Lucene/Solr... 
XmlRootExtractor ex = new XmlRootExtractor(); public static Path[] getTestFiles(final FileFilter fileFilter) - throws URISyntaxException, IOException { + throws URISyntaxException, IOException { Path root = Paths.get(MultiThreadedTikaTest.class.getResource("/test-documents").toURI()); final List files = new ArrayList<>(); Files.walkFileTree(root, new SimpleFileVisitor() { @Override public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) - throws IOException { + throws IOException { if (fileFilter != null && !fileFilter.accept(file.toFile())) { return FileVisitResult.CONTINUE; } @@ -82,7 +79,7 @@ public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) } private static ConcurrentHashMap getBaselineDetection(Detector detector, - Path[] files) { + Path[] files) { ConcurrentHashMap baseline = new ConcurrentHashMap<>(); XmlRootExtractor extractor = new XmlRootExtractor(); @@ -99,7 +96,7 @@ private static ConcurrentHashMap getBaselineDetection(Detector } private static ConcurrentHashMap getBaseline(Parser parser, Path[] files, - ParseContext parseContext) { + ParseContext parseContext) { ConcurrentHashMap baseline = new ConcurrentHashMap<>(); for (Path f : files) { @@ -109,69 +106,68 @@ private static ConcurrentHashMap getBaseline(Parser parser, Path[ baseline.put(f, new Extract(metadataList)); } catch (Exception e) { - //swallow + // swallow } } return baseline; } private static List getRecursiveMetadata(InputStream is, Parser parser, - ParseContext parseContext) throws Exception { - //different from parent TikaTest in that this extracts text. - //can't extract xhtml because "tmp" file names wind up in - //content's metadata and they'll differ by file. + ParseContext parseContext) throws Exception { + // different from parent TikaTest in that this extracts text. + // can't extract xhtml because "tmp" file names wind up in + // content's metadata and they'll differ by file. 
parseContext = new ParseContext(); - RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler( - new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1), - -1); + RecursiveParserWrapperHandler handler = + new RecursiveParserWrapperHandler(new BasicContentHandlerFactory( + BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1), -1); parser.parse(is, handler, new Metadata(), parseContext); return handler.getMetadataList(); } private static void assertExtractEquals(Extract extractA, Extract extractB) { - //this currently only checks the basics - //might want to add more checks + // this currently only checks the basics + // might want to add more checks assertEquals(extractA.metadataList.size(), extractB.metadataList.size(), - "number of embedded files"); + "number of embedded files"); for (int i = 0; i < extractA.metadataList.size(); i++) { assertEquals(extractA.metadataList.get(i).size(), extractB.metadataList.get(i).size(), - "number of metadata elements in attachment: " + i); + "number of metadata elements in attachment: " + i); assertEquals(extractA.metadataList.get(i).get(TikaCoreProperties.TIKA_CONTENT), - extractB.metadataList.get(i).get(TikaCoreProperties.TIKA_CONTENT), - "content in attachment: " + i); + extractB.metadataList.get(i).get(TikaCoreProperties.TIKA_CONTENT), + "content in attachment: " + i); } } /** - * This calls {@link #testEach(Parser parser, Path[], ParseContext[], int, int)} and - * then {@link #testAll(Parser parser, Path[], ParseContext[], int, int)} + * This calls {@link #testEach(Parser parser, Path[], ParseContext[], int, int)} and then + * {@link #testAll(Parser parser, Path[], ParseContext[], int, int)} * - * @param numThreads number of threads to use + * @param numThreads number of threads to use * @param numIterations number of iterations per thread - * @param filter file filter to select files from "/test-documents"; if - * null, - * all files will be used + * @param filter file filter to select 
files from "/test-documents"; if null, all + * files will be used * @throws Exception */ protected void testMultiThreaded(Parser parser, ParseContext[] parseContext, int numThreads, - int numIterations, FileFilter filter) throws Exception { + int numIterations, FileFilter filter) throws Exception { Path[] allFiles = getTestFiles(filter); testEach(parser, allFiles, parseContext, numThreads, numIterations); testAll(parser, allFiles, parseContext, numThreads, numIterations); } public void testDetector(Detector detector, int numThreads, int numIterations, - FileFilter filter, int randomlyResizeSAXPool) throws Exception { + FileFilter filter, int randomlyResizeSAXPool) throws Exception { Path[] files = getTestFiles(filter); testDetectorEach(detector, files, numThreads, numIterations, randomlyResizeSAXPool); testDetectorOnAll(detector, files, numThreads, numIterations, randomlyResizeSAXPool); } void testDetectorEach(Detector detector, Path[] files, int numThreads, int numIterations, - int randomlyResizeSAXPool) { + int randomlyResizeSAXPool) { for (Path p : files) { Path[] toTest = new Path[1]; toTest[0] = p; @@ -180,13 +176,13 @@ void testDetectorEach(Detector detector, Path[] files, int numThreads, int numIt } private void testDetectorOnAll(Detector detector, Path[] toTest, int numThreads, - int numIterations, int randomlyResizeSAXPool) { + int numIterations, int randomlyResizeSAXPool) { Map truth = getBaselineDetection(detector, toTest); - //if all files caused an exception + // if all files caused an exception if (truth.size() == 0) { return; } - //only those that parsed without exception + // only those that parsed without exception Path[] testFiles = new Path[truth.size()]; int j = 0; for (Path testFile : truth.keySet()) { @@ -196,7 +192,7 @@ private void testDetectorOnAll(Detector detector, Path[] toTest, int numThreads, ExecutorService ex = Executors.newFixedThreadPool(actualThreadCount); try { _testDetectorOnAll(detector, testFiles, numThreads, numIterations, 
truth, ex, - randomlyResizeSAXPool); + randomlyResizeSAXPool); } finally { ex.shutdown(); ex.shutdownNow(); @@ -204,26 +200,26 @@ private void testDetectorOnAll(Detector detector, Path[] toTest, int numThreads, } private void _testDetectorOnAll(Detector detector, Path[] testFiles, int numThreads, - int numIterations, Map truth, - ExecutorService ex, int randomlyResizeSAXPool) { + int numIterations, Map truth, ExecutorService ex, + int randomlyResizeSAXPool) { ExecutorCompletionService executorCompletionService = - new ExecutorCompletionService<>(ex); + new ExecutorCompletionService<>(ex); executorCompletionService.submit(new SAXPoolResizer(randomlyResizeSAXPool)); for (int i = 0; i < numThreads; i++) { - executorCompletionService - .submit(new TikaDetectorRunner(detector, numIterations, testFiles, truth)); + executorCompletionService.submit( + new TikaDetectorRunner(detector, numIterations, testFiles, truth)); } int completed = 0; while (completed < numThreads) { - //TODO: add a maximum timeout threshold + // TODO: add a maximum timeout threshold Future future = null; try { future = executorCompletionService.poll(1000, TimeUnit.MILLISECONDS); if (future != null) { - future.get();//trigger exceptions from thread + future.get();// trigger exceptions from thread completed++; } } catch (InterruptedException | ExecutionException e) { @@ -235,21 +231,19 @@ private void _testDetectorOnAll(Detector detector, Path[] testFiles, int numThre } /** - * Test each file, one at a time in multiple threads. - * This was required to test TIKA-2519 in a reasonable - * amount of time. This forced the parser to use the - * same underlying memory structures because it was the same file. - * This is stricter than I think our agreement with clients is - * because this run tests on literally the same file and - * not a copy of the file per thread. Let's leave this as is - * unless there's a good reason to create a separate copy per thread. 
+ * Test each file, one at a time in multiple threads. This was required to test TIKA-2519 in a + * reasonable amount of time. This forced the parser to use the same underlying memory + * structures because it was the same file. This is stricter than I think our agreement with + * clients is because this run tests on literally the same file and not a copy of the file per + * thread. Let's leave this as is unless there's a good reason to create a separate copy per + * thread. * - * @param files files to test, one at a time - * @param numThreads number of threads to use + * @param files files to test, one at a time + * @param numThreads number of threads to use * @param numIterations number of iterations per thread */ protected void testEach(Parser parser, Path[] files, ParseContext[] parseContext, - int numThreads, int numIterations) { + int numThreads, int numIterations) { for (Path p : files) { Path[] toTest = new Path[1]; toTest[0] = p; @@ -258,27 +252,25 @@ protected void testEach(Parser parser, Path[] files, ParseContext[] parseContext } /** - * This tests all files together. Each parser randomly selects - * a file from the array. Two parsers could wind up parsing the - * same file at the same time. Good. + * This tests all files together. Each parser randomly selects a file from the array. Two + * parsers could wind up parsing the same file at the same time. Good. *

- * In the current implementation, this gets ground truth only - * from files that do not throw exceptions. This will ignore - * files that cause exceptions. + * In the current implementation, this gets ground truth only from files that do not throw + * exceptions. This will ignore files that cause exceptions. * - * @param files files to parse - * @param numThreads number of parser threads + * @param files files to parse + * @param numThreads number of parser threads * @param numIterations number of iterations per parser */ protected void testAll(Parser parser, Path[] files, ParseContext[] parseContext, int numThreads, - int numIterations) { + int numIterations) { Map truth = getBaseline(parser, files, parseContext[0]); - //if all files caused an exception + // if all files caused an exception if (truth.size() == 0) { - //return; + // return; } - //only those that parsed without exception + // only those that parsed without exception Path[] testFiles = new Path[truth.size()]; int j = 0; for (Path testFile : truth.keySet()) { @@ -295,28 +287,27 @@ protected void testAll(Parser parser, Path[] files, ParseContext[] parseContext, } private void _testAll(Parser parser, Path[] testFiles, ParseContext[] parseContext, - int numThreads, int numIterations, Map truth, - ExecutorService ex) { + int numThreads, int numIterations, Map truth, + ExecutorService ex) { ExecutorCompletionService executorCompletionService = - new ExecutorCompletionService<>(ex); + new ExecutorCompletionService<>(ex); - //use the same parser in all threads + // use the same parser in all threads for (int i = 0; i < numThreads; i++) { - executorCompletionService - .submit(new TikaRunner(parser, parseContext[i], numIterations, testFiles, - truth)); + executorCompletionService.submit(new TikaRunner(parser, parseContext[i], numIterations, + testFiles, truth)); } int completed = 0; while (completed < numThreads) { - //TODO: add a maximum timeout threshold + // TODO: add a maximum timeout threshold Future 
future = null; try { future = executorCompletionService.poll(1000, TimeUnit.MILLISECONDS); if (future != null) { - future.get();//trigger exceptions from thread + future.get();// trigger exceptions from thread completed++; } } catch (InterruptedException | ExecutionException e) { @@ -325,7 +316,7 @@ private void _testAll(Parser parser, Path[] testFiles, ParseContext[] parseConte } } - //TODO: make this return something useful besides an integer + // TODO: make this return something useful besides an integer private static class TikaRunner implements Callable { private static final AtomicInteger threadCount = new AtomicInteger(0); private final Parser parser; @@ -337,7 +328,7 @@ private static class TikaRunner implements Callable { private final int threadNumber; private TikaRunner(Parser parser, ParseContext parseContext, int iterations, Path[] files, - Map truth) { + Map truth) { this.parser = parser; this.iterations = iterations; this.files = files; @@ -357,8 +348,8 @@ public Integer call() throws Exception { metadataList = getRecursiveMetadata(is, parser, new ParseContext()); success = true; } catch (Exception e) { - //swallow - //throw new RuntimeException(testFile + " triggered this exception", e); + // swallow + // throw new RuntimeException(testFile + " triggered this exception", e); } if (success) { assertExtractEquals(truth.get(testFile), new Extract(metadataList)); @@ -411,7 +402,7 @@ private static class TikaDetectorRunner implements Callable { private final Random random = new Random(); private TikaDetectorRunner(Detector detector, int iterations, Path[] files, - Map truth) { + Map truth) { this.detector = detector; this.iterations = iterations; this.files = files; @@ -427,7 +418,7 @@ public Integer call() throws Exception { try (TikaInputStream tis = TikaInputStream.get(testFile, metadata)) { MediaType mediaType = detector.detect(tis, metadata); assertEquals(truth.get(testFile), mediaType, - "failed on: " + testFile.getFileName()); + "failed on: " + 
testFile.getFileName()); } } return 1; diff --git a/tika-core/src/test/java/org/apache/tika/ResourceLoggingClassLoader.java b/tika-core/src/test/java/org/apache/tika/ResourceLoggingClassLoader.java index 1a6d454d32..63725be674 100644 --- a/tika-core/src/test/java/org/apache/tika/ResourceLoggingClassLoader.java +++ b/tika-core/src/test/java/org/apache/tika/ResourceLoggingClassLoader.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika; @@ -26,10 +24,8 @@ import java.util.Map; /** - * A wrapper around a {@link ClassLoader} that logs all - * the Resources loaded through it. - * Used to check that a specific ClassLoader was used - * when unit testing + * A wrapper around a {@link ClassLoader} that logs all the Resources loaded through it. Used to + * check that a specific ClassLoader was used when unit testing */ public class ResourceLoggingClassLoader extends ClassLoader { private final Map> loadedResources = new HashMap<>(); diff --git a/tika-core/src/test/java/org/apache/tika/TestRereadableInputStream.java b/tika-core/src/test/java/org/apache/tika/TestRereadableInputStream.java index 05fdb53f4d..7546483b2d 100644 --- a/tika-core/src/test/java/org/apache/tika/TestRereadableInputStream.java +++ b/tika-core/src/test/java/org/apache/tika/TestRereadableInputStream.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika; @@ -25,12 +23,10 @@ import java.io.OutputStream; import java.nio.file.Files; import java.nio.file.Path; - +import org.apache.tika.utils.RereadableInputStream; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.io.TempDir; -import org.apache.tika.utils.RereadableInputStream; - public class TestRereadableInputStream { private final int DEFAULT_TEST_SIZE = 3; @@ -56,25 +52,25 @@ public void testInMemory() throws IOException { readEntireStream((TEST_SIZE_MEMORY)); } -// @Test -// public void testInFile() throws IOException { -// readData(TEST_SIZE_FILE); -// } -// -// @Test -// public void testMemoryThreshold() throws IOException { -// readData(TEST_SIZE_MAX); -// } -// -// @Test -// public void testInMemory2() throws IOException { -// readData2((TEST_SIZE_MEMORY)); -// } -// -// @Test -// public void testInFile2() throws IOException { -// readData2(TEST_SIZE_FILE); -// } + // @Test + // public void testInFile() throws IOException { + // readData(TEST_SIZE_FILE); + // } + // + // @Test + // public void testMemoryThreshold() throws IOException { + // readData(TEST_SIZE_MAX); + // } + // + // @Test + // public void testInMemory2() throws IOException { + // 
readData2((TEST_SIZE_MEMORY)); + // } + // + // @Test + // public void testInFile2() throws IOException { + // readData2(TEST_SIZE_FILE); + // } @Test public void testMemoryThreshold2() throws IOException { @@ -90,21 +86,20 @@ private void readEntireStream(int testSize) throws IOException { for (int pass = 0; pass < NUM_PASSES; pass++) { for (int byteNum = 0; byteNum < testSize; byteNum++) { int byteRead = ris.read(); - assertEquals(byteNum, byteRead, - "Pass = " + pass + ", byte num should be " + byteNum + " but is " + - byteRead + "."); + assertEquals(byteNum, byteRead, "Pass = " + pass + ", byte num should be " + + byteNum + " but is " + byteRead + "."); } int eof = ris.read(); - assertEquals(-1, eof, - "Pass = " + pass + ", byte num should be " + -1 + " but is " + eof + "."); + assertEquals(-1, eof, "Pass = " + pass + ", byte num should be " + -1 + " but is " + + eof + "."); ris.rewind(); } } } /** - * Read increasingly more of the stream, but not all, with each pass before rewinding to - * make sure we pick up at the correct point + * Read increasingly more of the stream, but not all, with each pass before rewinding to make + * sure we pick up at the correct point */ private void readPartialStream(int testSize) throws IOException { InputStream is = createTestInputStream(20); @@ -114,8 +109,8 @@ private void readPartialStream(int testSize) throws IOException { for (int pass = 0; pass < NUM_PASSES; pass++) { for (int byteNum = 0; byteNum < iterations; byteNum++) { int byteRead = ris.read(); - assertEquals(byteNum, byteRead, - "Pass = " + pass + ", byte num should be " + byteNum + " but is " + byteRead + "."); + assertEquals(byteNum, byteRead, "Pass = " + pass + ", byte num should be " + + byteNum + " but is " + byteRead + "."); } ris.rewind(); iterations++; @@ -128,10 +123,11 @@ private void readPartialStream(int testSize) throws IOException { public void testRewind() throws IOException { InputStream is = createTestInputStream(DEFAULT_TEST_SIZE); try 
(RereadableInputStream ris = new RereadableInputStream(is, MEMORY_THRESHOLD, true)) { - ris.rewind(); // rewind before we've done anything + ris.rewind(); // rewind before we've done anything for (int byteNum = 0; byteNum < 1; byteNum++) { int byteRead = ris.read(); - assertEquals(byteNum, byteRead, "Byte num should be " + byteNum + " but is " + byteRead + "."); + assertEquals(byteNum, byteRead, + "Byte num should be " + byteNum + " but is " + byteRead + "."); } } } @@ -139,7 +135,7 @@ public void testRewind() throws IOException { private TestInputStream createTestInputStream(int testSize) throws IOException { return new TestInputStream( - new BufferedInputStream(Files.newInputStream(createTestFile(testSize)))); + new BufferedInputStream(Files.newInputStream(createTestFile(testSize)))); } private Path createTestFile(int testSize) throws IOException { diff --git a/tika-core/src/test/java/org/apache/tika/TikaDetectionTest.java b/tika-core/src/test/java/org/apache/tika/TikaDetectionTest.java index f52482c8d7..85b3530ee6 100644 --- a/tika-core/src/test/java/org/apache/tika/TikaDetectionTest.java +++ b/tika-core/src/test/java/org/apache/tika/TikaDetectionTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika; @@ -26,9 +24,10 @@ public class TikaDetectionTest { private final Tika tika = new Tika(); /** - * This test checks that Tika correctly detects all the file extensions - * defined in the mime.types file (revision 819245) of the Apache HTTP - * Server project. The tests were created with: + * This test checks that Tika correctly detects all the file extensions defined in the + * mime.types file (revision 819245) of the Apache HTTP Server project. The tests were created + * with: + * *

      * cat docs/conf/mime.types | grep -v '#' | perl -lne '/\S\s+\S/ and do {
      *     my ($type, @ext) = split /\s+/;
@@ -92,11 +91,11 @@ public void testHttpServerFileExtensions() {
         assertEquals("application/oebps-package+xml", tika.detect("x.opf"));
         assertEquals("application/ogg", tika.detect("x.ogx"));
         // Differ from httpd - We have subtypes they lack
-        //assertEquals("application/onenote", tika.detect("x.one"));
-        //assertEquals("application/onenote", tika.detect("x.onetoc"));
-        //assertEquals("application/onenote", tika.detect("x.onetoc2"));
-        //assertEquals("application/onenote", tika.detect("x.onetmp"));
-        //assertEquals("application/onenote", tika.detect("x.onepkg"));
+        // assertEquals("application/onenote", tika.detect("x.one"));
+        // assertEquals("application/onenote", tika.detect("x.onetoc"));
+        // assertEquals("application/onenote", tika.detect("x.onetoc2"));
+        // assertEquals("application/onenote", tika.detect("x.onetmp"));
+        // assertEquals("application/onenote", tika.detect("x.onepkg"));
         assertEquals("application/patch-ops-error+xml", tika.detect("x.xer"));
         assertEquals("application/pdf", tika.detect("x.pdf"));
         assertEquals("application/pgp-encrypted", tika.detect("x.pgp"));
@@ -155,7 +154,7 @@ public void testHttpServerFileExtensions() {
         assertEquals("application/vnd.acucorp", tika.detect("x.atc"));
         assertEquals("application/vnd.acucorp", tika.detect("x.acutc"));
         assertEquals("application/vnd.adobe.air-application-installer-package+zip",
-                tika.detect("x.air"));
+                        tika.detect("x.air"));
         assertEquals("application/vnd.adobe.xdp+xml", tika.detect("x.xdp"));
         assertEquals("application/vnd.adobe.xfdf", tika.detect("x.xfdf"));
         assertEquals("application/vnd.airzip.filesecure.azf", tika.detect("x.azf"));
@@ -165,13 +164,13 @@ public void testHttpServerFileExtensions() {
         assertEquals("application/vnd.amiga.ami", tika.detect("x.ami"));
         assertEquals("application/vnd.android.package-archive", tika.detect("x.apk"));
         assertEquals("application/vnd.anser-web-certificate-issue-initiation",
-                tika.detect("x.cii"));
+                        tika.detect("x.cii"));
         assertEquals("application/vnd.anser-web-funds-transfer-initiation", tika.detect("x.fti"));
         assertEquals("application/vnd.antix.game-component", tika.detect("x.atx"));
         assertEquals("application/vnd.apple.installer+xml", tika.detect("x.mpkg"));
         assertEquals("application/vnd.arastra.swi", tika.detect("x.swi"));
         // Differ from httpd - Adobe After Effects is a much more common user of .AEP these days
-        //assertEquals("application/vnd.audiograph", tika.detect("x.aep"));
+        // assertEquals("application/vnd.audiograph", tika.detect("x.aep"));
         assertEquals("application/vnd.blueice.multipass", tika.detect("x.mpm"));
         assertEquals("application/vnd.bmi", tika.detect("x.bmi"));
         assertEquals("application/vnd.businessobjects", tika.detect("x.rep"));
@@ -310,7 +309,7 @@ public void testHttpServerFileExtensions() {
         assertEquals("application/vnd.kodak-descriptor", tika.detect("x.sse"));
         assertEquals("application/vnd.llamagraphics.life-balance.desktop", tika.detect("x.lbd"));
         assertEquals("application/vnd.llamagraphics.life-balance.exchange+xml",
-                tika.detect("x.lbe"));
+                        tika.detect("x.lbe"));
         assertEquals("application/vnd.lotus-1-2-3", tika.detect("x.123"));
         assertEquals("application/vnd.lotus-approach", tika.detect("x.apr"));
         assertEquals("application/vnd.lotus-freelance", tika.detect("x.pre"));
@@ -347,7 +346,7 @@ public void testHttpServerFileExtensions() {
         assertEquals("application/vnd.ms-excel", tika.detect("x.xlw"));
         assertEquals("application/vnd.ms-excel.addin.macroenabled.12", tika.detect("x.xlam"));
         assertEquals("application/vnd.ms-excel.sheet.binary.macroenabled.12",
-                tika.detect("x.xlsb"));
+                        tika.detect("x.xlsb"));
         assertEquals("application/vnd.ms-excel.sheet.macroenabled.12", tika.detect("x.xlsm"));
         assertEquals("application/vnd.ms-excel.template.macroenabled.12", tika.detect("x.xltm"));
         assertEquals("application/vnd.ms-fontobject", tika.detect("x.eot"));
@@ -361,12 +360,12 @@ public void testHttpServerFileExtensions() {
         assertEquals("application/vnd.ms-powerpoint", tika.detect("x.pot"));
         assertEquals("application/vnd.ms-powerpoint.addin.macroenabled.12", tika.detect("x.ppam"));
         assertEquals("application/vnd.ms-powerpoint.presentation.macroenabled.12",
-                tika.detect("x.pptm"));
+                        tika.detect("x.pptm"));
         assertEquals("application/vnd.ms-powerpoint.slide.macroenabled.12", tika.detect("x.sldm"));
         assertEquals("application/vnd.ms-powerpoint.slideshow.macroenabled.12",
-                tika.detect("x.ppsm"));
+                        tika.detect("x.ppsm"));
         assertEquals("application/vnd.ms-powerpoint.template.macroenabled.12",
-                tika.detect("x.potm"));
+                        tika.detect("x.potm"));
         assertEquals("application/vnd.ms-project", tika.detect("x.mpp"));
         assertEquals("application/vnd.ms-project", tika.detect("x.mpt"));
         assertEquals("application/vnd.ms-word.document.macroenabled.12", tika.detect("x.docm"));
@@ -394,7 +393,7 @@ public void testHttpServerFileExtensions() {
         assertEquals("application/vnd.oasis.opendocument.chart", tika.detect("x.odc"));
         assertEquals("application/vnd.oasis.opendocument.chart-template", tika.detect("x.otc"));
         // Differ from httpd - Mimetype embedded in file is .base not .database
-        //assertEquals("application/vnd.oasis.opendocument.database", tika.detect("x.odb"));
+        // assertEquals("application/vnd.oasis.opendocument.database", tika.detect("x.odb"));
         assertEquals("application/vnd.oasis.opendocument.formula", tika.detect("x.odf"));
         assertEquals("application/vnd.oasis.opendocument.formula-template", tika.detect("x.odft"));
         assertEquals("application/vnd.oasis.opendocument.graphics", tika.detect("x.odg"));
@@ -403,10 +402,10 @@ public void testHttpServerFileExtensions() {
         assertEquals("application/vnd.oasis.opendocument.image-template", tika.detect("x.oti"));
         assertEquals("application/vnd.oasis.opendocument.presentation", tika.detect("x.odp"));
         assertEquals("application/vnd.oasis.opendocument.presentation-template",
-                tika.detect("x.otp"));
+                        tika.detect("x.otp"));
         assertEquals("application/vnd.oasis.opendocument.spreadsheet", tika.detect("x.ods"));
         assertEquals("application/vnd.oasis.opendocument.spreadsheet-template",
-                tika.detect("x.ots"));
+                        tika.detect("x.ots"));
         assertEquals("application/vnd.oasis.opendocument.text", tika.detect("x.odt"));
         assertEquals("application/vnd.oasis.opendocument.text-master", tika.detect("x.otm"));
         assertEquals("application/vnd.oasis.opendocument.text-template", tika.detect("x.ott"));
@@ -415,21 +414,21 @@ public void testHttpServerFileExtensions() {
         assertEquals("application/vnd.oma.dd2+xml", tika.detect("x.dd2"));
         assertEquals("application/vnd.openofficeorg.extension", tika.detect("x.oxt"));
         assertEquals("application/vnd.openxmlformats-officedocument.presentationml.presentation",
-                tika.detect("x.pptx"));
+                        tika.detect("x.pptx"));
         assertEquals("application/vnd.openxmlformats-officedocument.presentationml.slide",
-                tika.detect("x.sldx"));
+                        tika.detect("x.sldx"));
         assertEquals("application/vnd.openxmlformats-officedocument.presentationml.slideshow",
-                tika.detect("x.ppsx"));
+                        tika.detect("x.ppsx"));
         assertEquals("application/vnd.openxmlformats-officedocument.presentationml.template",
-                tika.detect("x.potx"));
+                        tika.detect("x.potx"));
         assertEquals("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
-                tika.detect("x.xlsx"));
+                        tika.detect("x.xlsx"));
         assertEquals("application/vnd.openxmlformats-officedocument.spreadsheetml.template",
-                tika.detect("x.xltx"));
+                        tika.detect("x.xltx"));
         assertEquals("application/vnd.openxmlformats-officedocument.wordprocessingml.document",
-                tika.detect("x.docx"));
+                        tika.detect("x.docx"));
         assertEquals("application/vnd.openxmlformats-officedocument.wordprocessingml.template",
-                tika.detect("x.dotx"));
+                        tika.detect("x.dotx"));
         assertEquals("application/vnd.osgi.dp", tika.detect("x.dp"));
         assertEquals("chemical/x-pdb", tika.detect("x.pdb"));
         assertEquals("application/vnd.palm", tika.detect("x.pqa"));
@@ -582,8 +581,8 @@ public void testHttpServerFileExtensions() {
         assertEquals("application/x-font-type1", tika.detect("x.pfa"));
         assertEquals("application/x-font-type1", tika.detect("x.pfb"));
         // TODO Get these fixed upstream too
-        //assertEquals("application/x-font-type1", tika.detect("x.pfm"));
-        //assertEquals("application/x-font-type1", tika.detect("x.afm"));
+        // assertEquals("application/x-font-type1", tika.detect("x.pfm"));
+        // assertEquals("application/x-font-type1", tika.detect("x.afm"));
         assertEquals("application/x-font-printer-metric", tika.detect("x.pfm"));
         assertEquals("application/x-font-adobe-metric", tika.detect("x.afm"));
         assertEquals("application/x-futuresplash", tika.detect("x.spl"));
@@ -606,14 +605,14 @@ public void testHttpServerFileExtensions() {
         assertEquals("application/x-msdownload", tika.detect("x.dll"));
         assertEquals("application/x-msdownload", tika.detect("x.com"));
         // Differ from httpd - BAT is different from normal windows executables
-        //assertEquals("application/x-msdownload", tika.detect("x.bat"));
+        // assertEquals("application/x-msdownload", tika.detect("x.bat"));
         // Differ from httpd - MSI is different from normal windows executables
-        //assertEquals("application/x-msdownload", tika.detect("x.msi"));
+        // assertEquals("application/x-msdownload", tika.detect("x.msi"));
         assertEquals("application/x-msmediaview", tika.detect("x.mvb"));
         assertEquals("application/x-msmediaview", tika.detect("x.m13"));
         assertEquals("application/x-msmediaview", tika.detect("x.m14"));
         // Differ from httpd - wmf was properly registered in RFC 7903
-        //assertEquals("application/x-msmetafile", tika.detect("x.wmf"));
+        // assertEquals("application/x-msmetafile", tika.detect("x.wmf"));
         assertEquals("application/x-msmoney", tika.detect("x.mny"));
         assertEquals("application/x-mspublisher", tika.detect("x.pub"));
         assertEquals("application/x-msschedule", tika.detect("x.scd"));
@@ -644,8 +643,8 @@ public void testHttpServerFileExtensions() {
         assertEquals("application/x-ustar", tika.detect("x.ustar"));
         assertEquals("application/x-wais-source", tika.detect("x.src"));
         // Differ from httpd - use a common parent for CA and User certs
-        //assertEquals("application/x-x509-ca-cert", tika.detect("x.der"));
-        //assertEquals("application/x-x509-ca-cert", tika.detect("x.crt"));
+        // assertEquals("application/x-x509-ca-cert", tika.detect("x.der"));
+        // assertEquals("application/x-x509-ca-cert", tika.detect("x.crt"));
         assertEquals("application/x-xfig", tika.detect("x.fig"));
         assertEquals("application/x-xpinstall", tika.detect("x.xpi"));
         assertEquals("application/xenc+xml", tika.detect("x.xenc"));
@@ -678,9 +677,9 @@ public void testHttpServerFileExtensions() {
         assertEquals("audio/mpeg", tika.detect("x.m3a"));
         assertEquals("audio/ogg", tika.detect("x.oga"));
         // Differ from httpd - Use a dedicated mimetype of Vorbis
-        //assertEquals("audio/ogg", tika.detect("x.ogg"));
+        // assertEquals("audio/ogg", tika.detect("x.ogg"));
         // Differ from httpd - Speex more commonly uses its own mimetype
-        //assertEquals("audio/ogg", tika.detect("x.spx"));
+        // assertEquals("audio/ogg", tika.detect("x.spx"));
         assertEquals("audio/vnd.digital-winds", tika.detect("x.eol"));
         assertEquals("audio/vnd.dts", tika.detect("x.dts"));
         assertEquals("audio/vnd.dts.hd", tika.detect("x.dtshd"));
@@ -700,7 +699,7 @@ public void testHttpServerFileExtensions() {
         assertEquals("audio/x-pn-realaudio", tika.detect("x.ra"));
         assertEquals("audio/x-pn-realaudio-plugin", tika.detect("x.rmp"));
         // Differ from httpd - wav was properly registered in RFC 2361
-        //assertEquals("audio/x-wav", tika.detect("x.wav"));
+        // assertEquals("audio/x-wav", tika.detect("x.wav"));
         assertEquals("chemical/x-cdx", tika.detect("x.cdx"));
         assertEquals("chemical/x-cif", tika.detect("x.cif"));
         assertEquals("chemical/x-cmdf", tika.detect("x.cmdf"));
@@ -708,7 +707,7 @@ public void testHttpServerFileExtensions() {
         assertEquals("chemical/x-csml", tika.detect("x.csml"));
         assertEquals("chemical/x-xyz", tika.detect("x.xyz"));
         // Differ from httpd - bmp was properly registered in RFC 7903
-        //assertEquals("image/x-ms-bmp", tika.detect("x.bmp"));
+        // assertEquals("image/x-ms-bmp", tika.detect("x.bmp"));
         assertEquals("image/cgm", tika.detect("x.cgm"));
         assertEquals("image/g3fax", tika.detect("x.g3"));
         assertEquals("image/gif", tika.detect("x.gif"));
@@ -746,11 +745,11 @@ public void testHttpServerFileExtensions() {
         assertEquals("image/x-freehand", tika.detect("x.fh5"));
         assertEquals("image/x-freehand", tika.detect("x.fh7"));
         // Differ from httpd - An official mimetype has subsequently been issued
-        //  favicon.ico +friends should now be image/vnd.microsoft.icon
-        //assertEquals("image/x-icon", tika.detect("x.ico"));
+        // favicon.ico +friends should now be image/vnd.microsoft.icon
+        // assertEquals("image/x-icon", tika.detect("x.ico"));
         // Differ from httpd - An official mimetype has subsequently been issued
-        //  pcx PiCture eXchange files should now be image/vnd.zbrush.pcx
-        //assertEquals("image/x-pcx", tika.detect("x.pcx"));
+        // pcx PiCture eXchange files should now be image/vnd.zbrush.pcx
+        // assertEquals("image/x-pcx", tika.detect("x.pcx"));
         assertEquals("image/x-pict", tika.detect("x.pic"));
         assertEquals("image/x-pict", tika.detect("x.pct"));
         assertEquals("image/x-portable-anymap", tika.detect("x.pnm"));
@@ -784,7 +783,7 @@ public void testHttpServerFileExtensions() {
         assertEquals("text/plain", tika.detect("x.txt"));
         assertEquals("text/plain", tika.detect("x.text"));
         // Differ from httpd - Use a dedicated mimetype for Config files
-        //assertEquals("text/plain", tika.detect("x.conf"));
+        // assertEquals("text/plain", tika.detect("x.conf"));
         assertEquals("text/plain", tika.detect("x.def"));
         assertEquals("text/plain", tika.detect("x.list"));
         assertEquals("text/x-log", tika.detect("x.log"));
diff --git a/tika-core/src/test/java/org/apache/tika/TikaIT.java b/tika-core/src/test/java/org/apache/tika/TikaIT.java
index 17b617eab4..887010f2e7 100644
--- a/tika-core/src/test/java/org/apache/tika/TikaIT.java
+++ b/tika-core/src/test/java/org/apache/tika/TikaIT.java
@@ -1,18 +1,16 @@
 /*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
+ * Licensed to the Apache Software Foundation (ASF) under one or more contributor license
+ * agreements. See the NOTICE file distributed with this work for additional information regarding
+ * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License. You may obtain a
+ * copy of the License at
  *
- *     http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
  *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
  */
 package org.apache.tika;
 
@@ -27,8 +25,8 @@ public class TikaIT {
     public void testToString() {
         String version = new Tika().toString();
         assertNotNull(version);
-        assertTrue(
-                version.matches("Apache Tika \\d+\\.\\d+\\.\\d+(-(?:ALPHA|BETA)\\d*)?(?:-SNAPSHOT)?"));
+        assertTrue(version.matches(
+                        "Apache Tika \\d+\\.\\d+\\.\\d+(-(?:ALPHA|BETA)\\d*)?(?:-SNAPSHOT)?"));
     }
 
 }
diff --git a/tika-core/src/test/java/org/apache/tika/TikaTest.java b/tika-core/src/test/java/org/apache/tika/TikaTest.java
index 817f907d94..73a032ae71 100644
--- a/tika-core/src/test/java/org/apache/tika/TikaTest.java
+++ b/tika-core/src/test/java/org/apache/tika/TikaTest.java
@@ -1,18 +1,16 @@
 /*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
+ * Licensed to the Apache Software Foundation (ASF) under one or more contributor license
+ * agreements. See the NOTICE file distributed with this work for additional information regarding
+ * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License. You may obtain a
+ * copy of the License at
  *
- *     http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
  *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
  */
 package org.apache.tika;
 
@@ -39,11 +37,7 @@
 import java.util.HashSet;
 import java.util.List;
 import java.util.Set;
-
 import org.apache.commons.io.IOUtils;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
 import org.apache.tika.config.TikaConfig;
 import org.apache.tika.exception.WriteLimitReachedException;
 import org.apache.tika.extractor.EmbeddedResourceHandler;
@@ -61,6 +55,8 @@
 import org.apache.tika.sax.BodyContentHandler;
 import org.apache.tika.sax.RecursiveParserWrapperHandler;
 import org.apache.tika.sax.ToXMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
 
 /**
  * Parent class of Tika tests
@@ -77,6 +73,7 @@ public abstract class TikaTest {
             throw new RuntimeException(e);
         }
     }
+
     public static void assertContainsCount(String needle, String haystack, int targetCount) {
         int i = haystack.indexOf(needle);
         int count = 0;
@@ -85,7 +82,7 @@ public static void assertContainsCount(String needle, String haystack, int targe
             i = haystack.indexOf(needle, i + 1);
         }
         assertEquals(targetCount, count,
-                "found " + count + " but should have found: " + targetCount);
+                        "found " + count + " but should have found: " + targetCount);
     }
 
     public static void assertContains(String needle, String haystack) {
@@ -105,8 +102,7 @@ public static  void assertNotContained(T needle, Collection hays
     }
 
     public static void assertMetadataListEquals(List metadataListA,
-                                          List metadataListB,
-                                    Set fieldsToIgnore) {
+                    List metadataListB, Set fieldsToIgnore) {
         assertEquals(metadataListA.size(), metadataListB.size(), "different sizes");
         for (int i = 0; i < metadataListA.size(); i++) {
             Metadata mA = metadataListA.get(i);
@@ -117,8 +113,8 @@ public static void assertMetadataListEquals(List metadataListA,
                     continue;
                 }
                 mAFields.add(n);
-                assertArrayEquals(mA.getValues(n), mB.getValues(n), "problem with " + n +
-                        " in metadata index=" + i);
+                assertArrayEquals(mA.getValues(n), mB.getValues(n),
+                                "problem with " + n + " in metadata index=" + i);
             }
             Set mBFields = new HashSet<>();
             for (String n : mB.names()) {
@@ -132,14 +128,13 @@ public static void assertMetadataListEquals(List metadataListA,
     }
 
     /**
-     * Test that in at least one item in metadataList, all keys and values
-     * in minExpected are contained.
+     * Test that in at least one item in metadataList, all keys and values in minExpected are
+     * contained.
      * 

- * The values in minExpected are tested for whether they are contained - * within a value in the target. If minExpected=&dquot;text/vbasic&dquot; and - * what was actually found in the target within metadatalist is - * &dquot;text/vbasic; charset=windows-1252&dquot;, - * that is counted as a hit. + * The values in minExpected are tested for whether they are contained within a value in the + * target. If minExpected=&dquot;text/vbasic&dquot; and what was actually found in the target + * within metadatalist is &dquot;text/vbasic; charset=windows-1252&dquot;, that is counted as a + * hit. * * @param minExpected * @param metadataList @@ -162,11 +157,11 @@ public static void assertContainsAtLeast(Metadata minExpected, List me } } if (foundPropertyCount == minExpected.names().length) { - //found everything! + // found everything! return; } } - //TODO: figure out how to have more informative error message + // TODO: figure out how to have more informative error message fail("Couldn't find everything within a single metadata item"); } @@ -223,8 +218,8 @@ public URL getResourceAsUrl(String name) { * * @param name name of the desired resource * @return A {@link java.net.URI} object or null - * @throws URISyntaxException if this URL is not formatted strictly according to - * RFC2396 and cannot be converted to a URI. + * @throws URISyntaxException if this URL is not formatted strictly according to RFC2396 and + * cannot be converted to a URI. */ public URI getResourceAsUri(String name) throws URISyntaxException { URL url = getResourceAsUrl(name); @@ -235,13 +230,12 @@ public URI getResourceAsUri(String name) throws URISyntaxException { } /** - * This method will give you back the filename incl. the absolute path name - * to the resource. If the resource does not exist it will give you back the - * resource name incl. the path. + * This method will give you back the filename incl. the absolute path name to the resource. 
If + * the resource does not exist it will give you back the resource name incl. the path. * * @param name The named resource to search for. - * @return an absolute path incl. the name which is in the same directory as - * the the class you've called it from. + * @return an absolute path incl. the name which is in the same directory as the the class + * you've called it from. */ public File getResourceAsFile(String name) throws URISyntaxException { URI uri = getResourceAsUri(name); @@ -268,9 +262,9 @@ public InputStream getResourceAsStream(String name) { } protected XMLResult getXML(String filePath, Parser parser, ParseContext context) - throws Exception { + throws Exception { return getXML(getResourceAsStream("/test-documents/" + filePath), parser, new Metadata(), - context); + context); } protected XMLResult getXML(String filePath, Parser parser, Metadata metadata) throws Exception { @@ -282,21 +276,20 @@ protected XMLResult getXML(String filePath, ParseContext parseContext) throws Ex } protected XMLResult getXML(String filePath, Parser parser, Metadata metadata, - ParseContext parseContext) - throws Exception { - return getXML(getResourceAsStream("/test-documents/" + filePath), parser, - metadata, parseContext); + ParseContext parseContext) throws Exception { + return getXML(getResourceAsStream("/test-documents/" + filePath), parser, metadata, + parseContext); } protected XMLResult getXML(String filePath, Metadata metadata, ParseContext parseContext) - throws Exception { + throws Exception { return getXML(getResourceAsStream("/test-documents/" + filePath), AUTO_DETECT_PARSER, - metadata, parseContext); + metadata, parseContext); } protected XMLResult getXML(String filePath, Metadata metadata) throws Exception { return getXML(getResourceAsStream("/test-documents/" + filePath), AUTO_DETECT_PARSER, - metadata, null); + metadata, null); } protected XMLResult getXML(String filePath, Parser parser) throws Exception { @@ -307,16 +300,16 @@ protected XMLResult 
getXML(String filePath, Parser parser) throws Exception { protected XMLResult getXML(String filePath) throws Exception { return getXML(getResourceAsStream("/test-documents/" + filePath), AUTO_DETECT_PARSER, - new Metadata(), null); + new Metadata(), null); } protected XMLResult getXML(InputStream input, Parser parser, Metadata metadata) - throws Exception { + throws Exception { return getXML(input, parser, metadata, null); } protected XMLResult getXML(InputStream input, Parser parser, Metadata metadata, - ParseContext context) throws Exception { + ParseContext context) throws Exception { if (context == null) { context = new ParseContext(); } @@ -335,13 +328,13 @@ protected List getRecursiveMetadataFromFullPath(String path) throws Ex } protected List getRecursiveMetadata(String filePath, boolean suppressException) - throws Exception { + throws Exception { return getRecursiveMetadata(filePath, new Metadata(), new ParseContext(), - suppressException); + suppressException); } protected List getRecursiveMetadata(String filePath, ParseContext parseContext, - boolean suppressException) throws Exception { + boolean suppressException) throws Exception { return getRecursiveMetadata(filePath, new Metadata(), parseContext, suppressException); } @@ -350,77 +343,72 @@ protected List getRecursiveMetadata(String filePath) throws Exception } protected List getRecursiveMetadata(String filePath, - BasicContentHandlerFactory.HANDLER_TYPE handlerType) - throws Exception { + BasicContentHandlerFactory.HANDLER_TYPE handlerType) throws Exception { return getRecursiveMetadata(filePath, TikaTest.AUTO_DETECT_PARSER, new Metadata(), - new ParseContext(), true, - handlerType); + new ParseContext(), true, handlerType); } protected List getRecursiveMetadata(String filePath, Metadata metadata) - throws Exception { + throws Exception { return getRecursiveMetadata(filePath, metadata, new ParseContext()); } protected List getRecursiveMetadata(String filePath, Metadata metadata, - ParseContext 
context) throws Exception { + ParseContext context) throws Exception { return getRecursiveMetadata(filePath, metadata, context, false); } protected List getRecursiveMetadata(String filePath, Metadata metadata, - ParseContext context, boolean suppressException) - throws Exception { + ParseContext context, boolean suppressException) throws Exception { return getRecursiveMetadata(filePath, AUTO_DETECT_PARSER, metadata, context, - suppressException); + suppressException); } protected List getRecursiveMetadata(String filePath, Parser wrapped, - Metadata metadata, ParseContext context, - boolean suppressException) throws Exception { + Metadata metadata, ParseContext context, boolean suppressException) + throws Exception { try (InputStream is = getResourceAsStream("/test-documents/" + filePath)) { return getRecursiveMetadata(is, wrapped, metadata, context, suppressException); } } protected List getRecursiveMetadata(String filePath, Parser wrapped, - Metadata metadata, ParseContext context, - boolean suppressException, - BasicContentHandlerFactory.HANDLER_TYPE handlerType) - throws Exception { + Metadata metadata, ParseContext context, boolean suppressException, + BasicContentHandlerFactory.HANDLER_TYPE handlerType) throws Exception { try (InputStream is = getResourceAsStream("/test-documents/" + filePath)) { - return getRecursiveMetadata(is, wrapped, metadata, context, suppressException, handlerType); + return getRecursiveMetadata(is, wrapped, metadata, context, suppressException, + handlerType); } } protected List getRecursiveMetadata(Path path, ParseContext context, - boolean suppressException) throws Exception { + boolean suppressException) throws Exception { Metadata metadata = new Metadata(); try (TikaInputStream tis = TikaInputStream.get(path, metadata)) { return getRecursiveMetadata(tis, AUTO_DETECT_PARSER, metadata, context, - suppressException); + suppressException); } } - protected List getRecursiveMetadata(Path path, Parser parser, ParseContext parseContext, - 
boolean suppressException) throws Exception { + protected List getRecursiveMetadata(Path path, Parser parser, + ParseContext parseContext, boolean suppressException) throws Exception { Metadata metadata = new Metadata(); try (TikaInputStream tis = TikaInputStream.get(path, metadata)) { - return getRecursiveMetadata(tis, parser, metadata, parseContext, - suppressException); + return getRecursiveMetadata(tis, parser, metadata, parseContext, suppressException); } } protected List getRecursiveMetadata(Path path, Parser parser, - boolean suppressException) throws Exception { + boolean suppressException) throws Exception { Metadata metadata = new Metadata(); try (TikaInputStream tis = TikaInputStream.get(path, metadata)) { return getRecursiveMetadata(tis, parser, metadata, new ParseContext(), - suppressException); + suppressException); } } protected List getRecursiveMetadata(Path p, boolean suppressException) - throws Exception { + throws Exception { Metadata metadata = new Metadata(); try (TikaInputStream tis = TikaInputStream.get(p, metadata)) { return getRecursiveMetadata(tis, metadata, new ParseContext(), suppressException); @@ -435,28 +423,27 @@ protected List getRecursiveMetadata(Path filePath) throws Exception { } protected List getRecursiveMetadata(InputStream is, boolean suppressException) - throws Exception { + throws Exception { return getRecursiveMetadata(is, new Metadata(), new ParseContext(), suppressException); } protected List getRecursiveMetadata(InputStream is, Parser parser, - boolean suppressException) throws Exception { + boolean suppressException) throws Exception { return getRecursiveMetadata(is, parser, new Metadata(), new ParseContext(), - suppressException); + suppressException); } protected List getRecursiveMetadata(InputStream is, Metadata metadata, - ParseContext context, boolean suppressException) - throws Exception { + ParseContext context, boolean suppressException) throws Exception { return getRecursiveMetadata(is, AUTO_DETECT_PARSER, 
metadata, context, suppressException); } protected List getRecursiveMetadata(InputStream is, Parser p, Metadata metadata, - ParseContext context, boolean suppressException) - throws Exception { + ParseContext context, boolean suppressException) throws Exception { RecursiveParserWrapper wrapper = new RecursiveParserWrapper(p); - RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler( - new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1)); + RecursiveParserWrapperHandler handler = + new RecursiveParserWrapperHandler(new BasicContentHandlerFactory( + BasicContentHandlerFactory.HANDLER_TYPE.XML, -1)); try { wrapper.parse(is, handler, metadata, context); } catch (Exception e) { @@ -468,12 +455,11 @@ protected List getRecursiveMetadata(InputStream is, Parser p, Metadata } protected List getRecursiveMetadata(InputStream is, Parser p, Metadata metadata, - ParseContext context, boolean suppressException, - BasicContentHandlerFactory.HANDLER_TYPE handlerType) - throws Exception { + ParseContext context, boolean suppressException, + BasicContentHandlerFactory.HANDLER_TYPE handlerType) throws Exception { RecursiveParserWrapper wrapper = new RecursiveParserWrapper(p); RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler( - new BasicContentHandlerFactory(handlerType, -1)); + new BasicContentHandlerFactory(handlerType, -1)); try { wrapper.parse(is, handler, metadata, context); } catch (Exception e) { @@ -485,11 +471,12 @@ protected List getRecursiveMetadata(InputStream is, Parser p, Metadata } protected List getRecursiveMetadata(String filePath, ParseContext context) - throws Exception { + throws Exception { RecursiveParserWrapper wrapper = new RecursiveParserWrapper(AUTO_DETECT_PARSER); - RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler( - new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1)); + RecursiveParserWrapperHandler handler = + new 
RecursiveParserWrapperHandler(new BasicContentHandlerFactory( + BasicContentHandlerFactory.HANDLER_TYPE.XML, -1)); try (InputStream is = getResourceAsStream("/test-documents/" + filePath)) { wrapper.parse(is, handler, new Metadata(), context); } @@ -497,25 +484,22 @@ protected List getRecursiveMetadata(String filePath, ParseContext cont } protected List getRecursiveMetadata(String filePath, Parser parserToWrap) - throws Exception { + throws Exception { return getRecursiveMetadata(filePath, parserToWrap, - BasicContentHandlerFactory.HANDLER_TYPE.XML); + BasicContentHandlerFactory.HANDLER_TYPE.XML); } protected List getRecursiveMetadata(String filePath, Parser parserToWrap, - BasicContentHandlerFactory.HANDLER_TYPE - handlerType) - throws Exception { + BasicContentHandlerFactory.HANDLER_TYPE handlerType) throws Exception { return getRecursiveMetadata(filePath, parserToWrap, handlerType, new ParseContext()); } protected List getRecursiveMetadata(String filePath, Parser parserToWrap, - BasicContentHandlerFactory.HANDLER_TYPE - handlerType, - ParseContext context) throws Exception { + BasicContentHandlerFactory.HANDLER_TYPE handlerType, ParseContext context) + throws Exception { RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parserToWrap); - RecursiveParserWrapperHandler handler = - new RecursiveParserWrapperHandler(new BasicContentHandlerFactory(handlerType, -1)); + RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler( + new BasicContentHandlerFactory(handlerType, -1)); Metadata metadata = new Metadata(); metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, FilenameUtils.getName(filePath)); try (InputStream is = getResourceAsStream("/test-documents/" + filePath)) { @@ -525,10 +509,11 @@ protected List getRecursiveMetadata(String filePath, Parser parserToWr } protected List getRecursiveMetadata(String filePath, Parser parserToWrap, - ParseContext parseContext) throws Exception { + ParseContext parseContext) throws Exception { 
RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parserToWrap); - RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler( - new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1)); + RecursiveParserWrapperHandler handler = + new RecursiveParserWrapperHandler(new BasicContentHandlerFactory( + BasicContentHandlerFactory.HANDLER_TYPE.XML, -1)); try (InputStream is = getResourceAsStream("/test-documents/" + filePath)) { wrapper.parse(is, handler, new Metadata(), parseContext); @@ -553,14 +538,14 @@ protected String getText(String filePath, Metadata metadata) throws Exception { } protected String getText(String filePath, Metadata metadata, ParseContext parseContext) - throws Exception { + throws Exception { return getText(filePath, AUTO_DETECT_PARSER, metadata, parseContext); } protected String getText(String filePath, Parser parser, Metadata metadata, - ParseContext parseContext) throws Exception { + ParseContext parseContext) throws Exception { return getText(getResourceAsStream("/test-documents/" + filePath), parser, parseContext, - metadata); + metadata); } /** @@ -569,7 +554,7 @@ protected String getText(String filePath, Parser parser, Metadata metadata, * Tries to close input stream after processing. 
*/ public String getText(InputStream is, Parser parser, ParseContext context, Metadata metadata) - throws Exception { + throws Exception { ContentHandler handler = new BodyContentHandler(1000000); try (is) { parser.parse(is, handler, metadata, context); @@ -600,7 +585,7 @@ public InputStream truncate(String testFileName, int truncatedLength) throws IOE } if (truncatedLength > bos.toByteArray().length) { throw new EOFException( - "Can't truncate beyond file length: " + bos.toByteArray().length); + "Can't truncate beyond file length: " + bos.toByteArray().length); } byte[] truncated = new byte[truncatedLength]; System.arraycopy(bos.toByteArray(), 0, truncated, 0, truncatedLength); @@ -608,8 +593,8 @@ public InputStream truncate(String testFileName, int truncatedLength) throws IOE } public List getAllTestFiles() { - //for now, just get main files - //TODO: fix this to be recursive + // for now, just get main files + // TODO: fix this to be recursive try { File[] pathArray = Paths.get(getResourceAsUri("/test-documents")).toFile().listFiles(); List paths = new ArrayList<>(); @@ -677,7 +662,7 @@ public void handle(String filename, MediaType mediaType, InputStream stream) { bytes.add(os.toByteArray()); stream.reset(); } catch (IOException e) { - //swallow + // swallow } } } diff --git a/tika-core/src/test/java/org/apache/tika/TypeDetectionBenchmark.java b/tika-core/src/test/java/org/apache/tika/TypeDetectionBenchmark.java index 9ff104c708..22a3535860 100644 --- a/tika-core/src/test/java/org/apache/tika/TypeDetectionBenchmark.java +++ b/tika-core/src/test/java/org/apache/tika/TypeDetectionBenchmark.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika; @@ -21,7 +19,6 @@ import java.io.FileInputStream; import java.io.InputStream; import java.util.Locale; - import org.apache.commons.io.IOUtils; public class TypeDetectionBenchmark { @@ -52,7 +49,7 @@ private static void benchmark(File file) throws Exception { tika.detect(new ByteArrayInputStream(content)); } System.out.printf(Locale.ROOT, "%6dns per Tika.detect(%s) = %s%n", - System.currentTimeMillis() - start, file, type); + System.currentTimeMillis() - start, file, type); } } else if (file.isDirectory()) { for (File child : file.listFiles()) { diff --git a/tika-core/src/test/java/org/apache/tika/config/AbstractTikaConfigTest.java b/tika-core/src/test/java/org/apache/tika/config/AbstractTikaConfigTest.java index 763cb43fdd..1576a5f663 100644 --- a/tika-core/src/test/java/org/apache/tika/config/AbstractTikaConfigTest.java +++ b/tika-core/src/test/java/org/apache/tika/config/AbstractTikaConfigTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.config; @@ -21,17 +19,14 @@ import java.net.URL; import java.nio.file.Path; import java.nio.file.Paths; - -import org.junit.jupiter.api.AfterEach; - import org.apache.tika.TikaTest; import org.apache.tika.parser.ParseContext; +import org.junit.jupiter.api.AfterEach; /** - * Parent of Junit test classes for {@link TikaConfig}, including - * Tika Core based ones, and ones in Tika Parsers that do things - * that {@link TikaConfigTest} can't, do due to a need for the - * full set of "real" classes of parsers / detectors + * Parent of Junit test classes for {@link TikaConfig}, including Tika Core based ones, and ones in + * Tika Parsers that do things that {@link TikaConfigTest} can't, do due to a need for the full set + * of "real" classes of parsers / detectors */ public abstract class AbstractTikaConfigTest extends TikaTest { protected static ParseContext context = new ParseContext(); diff --git a/tika-core/src/test/java/org/apache/tika/config/DummyExecutor.java b/tika-core/src/test/java/org/apache/tika/config/DummyExecutor.java index 185387cffe..518ef549ea 100644 --- a/tika-core/src/test/java/org/apache/tika/config/DummyExecutor.java +++ 
b/tika-core/src/test/java/org/apache/tika/config/DummyExecutor.java @@ -1,25 +1,22 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.config; import java.util.concurrent.LinkedBlockingQueue; import java.util.concurrent.ThreadPoolExecutor; import java.util.concurrent.TimeUnit; - import org.apache.tika.concurrent.ConfigurableThreadPoolExecutor; public class DummyExecutor extends ThreadPoolExecutor implements ConfigurableThreadPoolExecutor { diff --git a/tika-core/src/test/java/org/apache/tika/config/DummyParser.java b/tika-core/src/test/java/org/apache/tika/config/DummyParser.java index cea6c2f498..a63f048eba 100644 --- a/tika-core/src/test/java/org/apache/tika/config/DummyParser.java +++ b/tika-core/src/test/java/org/apache/tika/config/DummyParser.java @@ -1,23 +1,20 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
+ * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.config; import java.util.Collection; - import org.apache.tika.mime.MediaTypeRegistry; import org.apache.tika.parser.CompositeParser; import org.apache.tika.parser.Parser; @@ -28,7 +25,7 @@ public class DummyParser extends CompositeParser implements Parser { private final ServiceLoader loader; public DummyParser(MediaTypeRegistry registry, ServiceLoader loader, - Collection> excludeParsers) { + Collection> excludeParsers) { this.loader = loader; } diff --git a/tika-core/src/test/java/org/apache/tika/config/MockConfigTest.java b/tika-core/src/test/java/org/apache/tika/config/MockConfigTest.java index 9bbe8ede99..a280bee3f3 100644 --- a/tika-core/src/test/java/org/apache/tika/config/MockConfigTest.java +++ b/tika-core/src/test/java/org/apache/tika/config/MockConfigTest.java @@ -1,18 +1,16 @@ /** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at *

* http://www.apache.org/licenses/LICENSE-2.0 *

- * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.config; @@ -22,10 +20,8 @@ import java.io.InputStream; import java.util.List; import java.util.Map; - -import org.junit.jupiter.api.Test; - import org.apache.tika.exception.TikaException; +import org.junit.jupiter.api.Test; public class MockConfigTest { diff --git a/tika-core/src/test/java/org/apache/tika/config/ParamTest.java b/tika-core/src/test/java/org/apache/tika/config/ParamTest.java index d4d53b667d..01cb72f865 100644 --- a/tika-core/src/test/java/org/apache/tika/config/ParamTest.java +++ b/tika-core/src/test/java/org/apache/tika/config/ParamTest.java @@ -1,18 +1,16 @@ /** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. 
The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at *

* http://www.apache.org/licenses/LICENSE-2.0 *

- * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.config; @@ -25,7 +23,6 @@ import java.net.URI; import java.util.ArrayList; import java.util.List; - import org.junit.jupiter.api.Test; public class ParamTest { @@ -38,11 +35,13 @@ public void testSaveAndLoad() throws Exception { list.add("brown"); list.add("fox"); Object[] objects = - new Object[]{list, Integer.MAX_VALUE, 2.5f, 4000.57576, true, false, Long.MAX_VALUE, - "Hello this is a boring string", new URI("http://apache.org").toURL(), - new URI("tika://org.apache.tika.ner.parser?impl=xyz"), - new BigInteger(Long.MAX_VALUE + "").add( - new BigInteger(Long.MAX_VALUE + "")), new File("."),}; + new Object[] {list, Integer.MAX_VALUE, 2.5f, 4000.57576, true, false, + Long.MAX_VALUE, "Hello this is a boring string", + new URI("http://apache.org").toURL(), + new URI("tika://org.apache.tika.ner.parser?impl=xyz"), + new BigInteger(Long.MAX_VALUE + "") + .add(new BigInteger(Long.MAX_VALUE + "")), + new File("."),}; for (Object object : objects) { String name = "name" + System.currentTimeMillis(); diff --git a/tika-core/src/test/java/org/apache/tika/config/TikaConfigSerializerTest.java b/tika-core/src/test/java/org/apache/tika/config/TikaConfigSerializerTest.java index dafdd641f0..f6a77ea253 100644 --- a/tika-core/src/test/java/org/apache/tika/config/TikaConfigSerializerTest.java +++ 
b/tika-core/src/test/java/org/apache/tika/config/TikaConfigSerializerTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.config; @@ -25,49 +23,43 @@ import java.io.InputStream; import java.io.StringWriter; import java.nio.charset.StandardCharsets; - -import org.junit.jupiter.api.Disabled; -import org.junit.jupiter.api.Test; - import org.apache.tika.parser.CompositeParser; import org.apache.tika.parser.Parser; import org.apache.tika.parser.ParserDecorator; import org.apache.tika.parser.mock.MockParser; import org.apache.tika.parser.multiple.FallbackParser; +import org.junit.jupiter.api.Disabled; +import org.junit.jupiter.api.Test; public class TikaConfigSerializerTest extends TikaConfigTest { /** - * TIKA-1445 It should be possible to exclude DefaultParser from - * certain types, so another parser explicitly listed will take them + * TIKA-1445 It should be possible to exclude DefaultParser from certain types, so another + * parser explicitly listed will take them */ @Test public void defaultParserWithExcludes() throws Exception { - String xml = - loadAndSerialize("TIKA-1445-default-except.xml", TikaConfigSerializer.Mode.STATIC); - assertContains( - "" + " fail/world" + - " " + - "", xml); + String xml = loadAndSerialize("TIKA-1445-default-except.xml", + TikaConfigSerializer.Mode.STATIC); + assertContains("" + " fail/world" + + " " + "", xml); } @Test public void testEncodingDetectors() throws Exception { String xml = loadAndSerialize("TIKA-1762-executors.xml", TikaConfigSerializer.Mode.STATIC); - assertContains(" " + - " " + - "", xml); + assertContains(" " + " " + "", xml); } @Test public void testMultipleWithFallback() throws Exception { TikaConfig config = getConfig("TIKA-1509-multiple-fallback.xml"); StringWriter writer = new StringWriter(); - TikaConfigSerializer.serialize(config, - TikaConfigSerializer.Mode.STATIC_FULL, writer, StandardCharsets.UTF_8); - try (InputStream is = - new ByteArrayInputStream(writer.toString().getBytes(StandardCharsets.UTF_8))) { + TikaConfigSerializer.serialize(config, TikaConfigSerializer.Mode.STATIC_FULL, writer, + 
StandardCharsets.UTF_8); + try (InputStream is = new ByteArrayInputStream( + writer.toString().getBytes(StandardCharsets.UTF_8))) { config = new TikaConfig(is); } @@ -90,9 +82,9 @@ public void testMultipleWithFallback() throws Exception { @Disabled("TODO: executor-service info needs to be stored in TikaConfig for serialization") public void testExecutors() throws Exception { String xml = loadAndSerialize("TIKA-1762-executors.xml", TikaConfigSerializer.Mode.STATIC); - assertContains("" + - " 3" + " 10" + - "", xml); + assertContains("" + + " 3" + " 10" + + "", xml); } String loadAndSerialize(String configFile, TikaConfigSerializer.Mode mode) throws Exception { diff --git a/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java b/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java index 97ff3f376f..a6af4475e9 100644 --- a/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java +++ b/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.config; @@ -31,9 +29,6 @@ import java.util.List; import java.util.Map; import java.util.concurrent.ThreadPoolExecutor; - -import org.junit.jupiter.api.Test; - import org.apache.tika.ResourceLoggingClassLoader; import org.apache.tika.exception.TikaConfigException; import org.apache.tika.exception.TikaException; @@ -51,18 +46,17 @@ import org.apache.tika.parser.mock.MockParser; import org.apache.tika.parser.multiple.FallbackParser; import org.apache.tika.utils.XMLReaderUtils; +import org.junit.jupiter.api.Test; /** - * Tests for the Tika Config, which don't require real parsers / - * detectors / etc. - * There's also {@link TikaParserConfigTest} and {@link TikaDetectorConfigTest} - * over in the Tika Parsers project, which do further Tika Config - * testing using real parsers and detectors. + * Tests for the Tika Config, which don't require real parsers / detectors / etc. There's also + * {@link TikaParserConfigTest} and {@link TikaDetectorConfigTest} over in the Tika Parsers project, + * which do further Tika Config testing using real parsers and detectors. 
*/ public class TikaConfigTest extends AbstractTikaConfigTest { /** - * Make sure that a configuration file can't reference the - * {@link AutoDetectParser} class a <parser> configuration element. + * Make sure that a configuration file can't reference the {@link AutoDetectParser} class a + * <parser> configuration element. * * @see TIKA-866 */ @@ -76,18 +70,17 @@ public void withInvalidParser() throws Exception { } /** - * Make sure that with a service loader given, we can - * get different configurable behaviour on parser classes - * which can't be found. + * Make sure that with a service loader given, we can get different configurable behaviour on + * parser classes which can't be found. */ @Test public void testUnknownParser() throws Exception { ServiceLoader ignoreLoader = - new ServiceLoader(getClass().getClassLoader(), LoadErrorHandler.IGNORE); + new ServiceLoader(getClass().getClassLoader(), LoadErrorHandler.IGNORE); ServiceLoader warnLoader = - new ServiceLoader(getClass().getClassLoader(), LoadErrorHandler.WARN); + new ServiceLoader(getClass().getClassLoader(), LoadErrorHandler.WARN); ServiceLoader throwLoader = - new ServiceLoader(getClass().getClassLoader(), LoadErrorHandler.THROW); + new ServiceLoader(getClass().getClassLoader(), LoadErrorHandler.THROW); Path configPath = Paths.get(new URI(getConfigPath("TIKA-1700-unknown-parser.xml"))); TikaConfig ignore = new TikaConfig(configPath, ignoreLoader); @@ -108,9 +101,8 @@ public void testUnknownParser() throws Exception { } /** - * Make sure that a configuration file can reference also a composite - * parser class like {@link DefaultParser} in a <parser> - * configuration element. + * Make sure that a configuration file can reference also a composite parser class like + * {@link DefaultParser} in a <parser> configuration element. 
* * @see TIKA-866 */ @@ -124,8 +116,8 @@ public void asCompositeParser() throws Exception { } /** - * Make sure that a valid configuration file without mimetypes or - * detector entries can be loaded without problems. + * Make sure that a valid configuration file without mimetypes or detector entries can be loaded + * without problems. * * @see TIKA-866 */ @@ -139,14 +131,13 @@ public void onlyValidParser() throws Exception { } /** - * TIKA-1145 If the TikaConfig has a ClassLoader set on it, - * that should be used when loading the mimetypes and when - * discovering services + * TIKA-1145 If the TikaConfig has a ClassLoader set on it, that should be used when loading the + * mimetypes and when discovering services */ @Test public void ensureClassLoaderUsedEverywhere() throws Exception { ResourceLoggingClassLoader customLoader = - new ResourceLoggingClassLoader(getClass().getClassLoader()); + new ResourceLoggingClassLoader(getClass().getClassLoader()); TikaConfig config; // Without a classloader set, normal one will be used @@ -163,13 +154,13 @@ public void ensureClassLoaderUsedEverywhere() throws Exception { Map> resources = customLoader.getLoadedResources(); int resourcesCount = resources.size(); assertTrue(resourcesCount > 3, - "Not enough things used the classloader, found only " + resourcesCount); + "Not enough things used the classloader, found only " + resourcesCount); // Ensure everything that should do, did use it // - Parsers assertNotNull(resources.get("META-INF/services/org.apache.tika.parser.Parser")); // - Detectors - //assertNotNull(resources.get("META-INF/services/org.apache.tika.detect.Detector")); + // assertNotNull(resources.get("META-INF/services/org.apache.tika.detect.Detector")); // - Built-In Mimetypes assertNotNull(resources.get("org/apache/tika/mime/tika-mimetypes.xml")); // - Custom Mimetypes @@ -179,12 +170,13 @@ public void ensureClassLoaderUsedEverywhere() throws Exception { /** * TIKA-4485: try a relative path with an empty constructor. 
* - * @throws Exception + * @throws Exception */ @Test void testEmptyConstructor() throws Exception { // file that exists - System.setProperty("tika.config", "src/test/resources/org/apache/tika/config/TIKA-1445-default-except.xml"); + System.setProperty("tika.config", + "src/test/resources/org/apache/tika/config/TIKA-1445-default-except.xml"); TikaConfig tikaConfig = new TikaConfig(); // code from the TIKA-1445 test CompositeParser cp = (CompositeParser) tikaConfig.getParser(); @@ -194,12 +186,13 @@ void testEmptyConstructor() throws Exception { // file that doesn't exist System.setProperty("tika.config", "doesntexist.xml"); TikaException ex = assertThrows(TikaException.class, () -> new TikaConfig()); - assertTrue(ex.getMessage().contains("Specified Tika configuration not found: doesntexist.xml")); + assertTrue(ex.getMessage() + .contains("Specified Tika configuration not found: doesntexist.xml")); } /** - * TIKA-1445 It should be possible to exclude DefaultParser from - * certain types, so another parser explicitly listed will take them + * TIKA-1445 It should be possible to exclude DefaultParser from certain types, so another + * parser explicitly listed will take them */ @Test public void defaultParserWithExcludes() throws Exception { @@ -214,13 +207,13 @@ public void defaultParserWithExcludes() throws Exception { assertEquals(3, parsers.size()); // Should have a wrapped DefaultParser, not the main DefaultParser, - // as it is excluded from handling certain classes + // as it is excluded from handling certain classes p = parsers.get(0); assertTrue(p instanceof ParserDecorator, p.toString()); assertEquals(DefaultParser.class, ((ParserDecorator) p).getWrappedParser().getClass()); // Should have two others which claim things, which they wouldn't - // otherwise handle + // otherwise handle p = parsers.get(1); assertTrue(p instanceof ParserDecorator, p.toString()); assertEquals(EmptyParser.class, ((ParserDecorator) p).getWrappedParser().getClass()); @@ -236,8 +229,8 
@@ public void defaultParserWithExcludes() throws Exception { } /** - * TIKA-1653 If one parser has child parsers, those child parsers shouldn't - * show up at the top level as well + * TIKA-1653 If one parser has child parsers, those child parsers shouldn't show up at the top + * level as well */ @Test public void parserWithChildParsers() throws Exception { @@ -252,7 +245,7 @@ public void parserWithChildParsers() throws Exception { assertEquals(2, parsers.size()); // Should have a CompositeParser with 2 child ones, and - // and a wrapped empty parser + // and a wrapped empty parser p = parsers.get(0); assertTrue(p instanceof CompositeParser, p.toString()); assertEquals(2, ((CompositeParser) p).getAllComponentParsers().size()); @@ -290,7 +283,7 @@ public void testTikaExecutorServiceFromConfig() throws Exception { assertTrue((executorService instanceof DummyExecutor), "Should use Dummy Executor"); assertEquals(3, executorService.getCorePoolSize(), "Should have configured Core Threads"); assertEquals(10, executorService.getMaximumPoolSize(), - "Should have configured Max Threads"); + "Should have configured Max Threads"); } @Test @@ -317,13 +310,13 @@ public void testInitializerServiceLoaderThrow() throws Exception { @Test public void testInitializerServiceLoaderThrowButOverridden() throws Exception { - //TODO: test that this was logged at INFO level + // TODO: test that this was logged at INFO level TikaConfig config = getConfig("TIKA-2389-throw-default-overridden.xml"); } @Test public void testInitializerPerParserWarn() throws Exception { - //TODO: test that this was logged at WARN level + // TODO: test that this was logged at WARN level TikaConfig config = getConfig("TIKA-2389-warn-per-parser.xml"); } @@ -348,20 +341,21 @@ public void testMultipleWithFallback() throws Exception { @Test public void testXMLReaderUtils() throws Exception { - //pool size may have been reset already by an - //earlier test. Can't test for default here. 
+ // pool size may have been reset already by an + // earlier test. Can't test for default here. assertEquals(XMLReaderUtils.DEFAULT_MAX_ENTITY_EXPANSIONS, - XMLReaderUtils.getMaxEntityExpansions()); - //make sure that detection on this file actually works with - //default expansions + XMLReaderUtils.getMaxEntityExpansions()); + // make sure that detection on this file actually works with + // default expansions assertEquals("application/rdf+xml", - detect("test-difficult-rdf1.xml", TikaConfig.getDefaultConfig()).toString()); + detect("test-difficult-rdf1.xml", TikaConfig.getDefaultConfig()) + .toString()); TikaConfig tikaConfig = getConfig("TIKA-2732-xmlreaderutils.xml"); try { assertEquals(33, XMLReaderUtils.getPoolSize()); assertEquals(5, XMLReaderUtils.getMaxEntityExpansions()); - //make sure that there's actually a change in behavior + // make sure that there's actually a change in behavior assertEquals("text/plain", detect("test-difficult-rdf1.xml", tikaConfig).toString()); } finally { XMLReaderUtils.setMaxEntityExpansions(XMLReaderUtils.DEFAULT_MAX_ENTITY_EXPANSIONS); @@ -371,12 +365,13 @@ public void testXMLReaderUtils() throws Exception { @Test public void testXMLReaderUtilsReuse() throws Exception { - //this just tests that there's no exception thrown + // this just tests that there's no exception thrown try { XMLReaderUtils.setPoolSize(10); TikaConfig tikaConfig = TikaConfig.getDefaultConfig(); for (int i = 0; i < 500; i++) { - assertEquals("application/rdf+xml", detect("test-difficult-rdf1.xml", tikaConfig).toString()); + assertEquals("application/rdf+xml", + detect("test-difficult-rdf1.xml", tikaConfig).toString()); } } finally { XMLReaderUtils.setMaxEntityExpansions(XMLReaderUtils.DEFAULT_MAX_ENTITY_EXPANSIONS); @@ -391,8 +386,9 @@ public void testXMLReaderUtilsConfigReuse() throws Exception { assertEquals(11, XMLReaderUtils.getPoolSize()); assertEquals(5000, XMLReaderUtils.getMaxEntityExpansions()); assertEquals(10000, 
XMLReaderUtils.getMaxNumReuses()); - //make sure that there's actually a change in behavior - assertEquals("application/rdf+xml", detect("test-difficult-rdf1.xml", tikaConfig).toString()); + // make sure that there's actually a change in behavior + assertEquals("application/rdf+xml", + detect("test-difficult-rdf1.xml", tikaConfig).toString()); } finally { XMLReaderUtils.setMaxEntityExpansions(XMLReaderUtils.DEFAULT_MAX_ENTITY_EXPANSIONS); XMLReaderUtils.setPoolSize(XMLReaderUtils.DEFAULT_POOL_SIZE); @@ -401,20 +397,21 @@ public void testXMLReaderUtilsConfigReuse() throws Exception { @Test public void testXMLReaderUtilsNoPool() throws Exception { - //pool size may have been reset already by an - //earlier test. Can't test for default here. + // pool size may have been reset already by an + // earlier test. Can't test for default here. assertEquals(XMLReaderUtils.DEFAULT_MAX_ENTITY_EXPANSIONS, - XMLReaderUtils.getMaxEntityExpansions()); - //make sure that detection on this file actually works with - //default expansions + XMLReaderUtils.getMaxEntityExpansions()); + // make sure that detection on this file actually works with + // default expansions assertEquals("application/rdf+xml", - detect("test-difficult-rdf1.xml", TikaConfig.getDefaultConfig()).toString()); + detect("test-difficult-rdf1.xml", TikaConfig.getDefaultConfig()) + .toString()); TikaConfig tikaConfig = getConfig("TIKA-4427-no-sax-pool.xml"); try { assertEquals(0, XMLReaderUtils.getPoolSize()); assertEquals(5, XMLReaderUtils.getMaxEntityExpansions()); - //make sure that there's actually a change in behavior + // make sure that there's actually a change in behavior assertEquals("text/plain", detect("test-difficult-rdf1.xml", tikaConfig).toString()); } finally { XMLReaderUtils.setMaxEntityExpansions(XMLReaderUtils.DEFAULT_MAX_ENTITY_EXPANSIONS); @@ -438,7 +435,8 @@ public void testXMLReaderUtilsException() throws Exception { @Test public void testXMLReaderUtilsUnspecifiedAttribute() throws Exception { 
TikaConfig tikaConfig = getConfig("TIKA-3551-xmlreaderutils.xml"); - assertEquals(XMLReaderUtils.DEFAULT_MAX_ENTITY_EXPANSIONS, XMLReaderUtils.getMaxEntityExpansions()); + assertEquals(XMLReaderUtils.DEFAULT_MAX_ENTITY_EXPANSIONS, + XMLReaderUtils.getMaxEntityExpansions()); } @Test @@ -450,17 +448,17 @@ public void testBadExclude() throws Exception { @Test public void testTimesInitiated() throws Exception { - //this prevents multi-threading tests, but we aren't doing that now... + // this prevents multi-threading tests, but we aren't doing that now... MockParser.resetTimesInitiated(); - TikaConfig tikaConfig = - new TikaConfig(TikaConfigTest.class.getResourceAsStream("mock-exclude.xml")); + TikaConfig tikaConfig = new TikaConfig( + TikaConfigTest.class.getResourceAsStream("mock-exclude.xml")); assertEquals(1, MockParser.getTimesInitiated()); } @Test public void testAutoDetectParserConfig() throws Exception { TikaConfig tikaConfig = - new TikaConfig(TikaConfigTest.class.getResourceAsStream("TIKA-3594.xml")); + new TikaConfig(TikaConfigTest.class.getResourceAsStream("TIKA-3594.xml")); AutoDetectParserConfig config = tikaConfig.getAutoDetectParserConfig(); assertEquals(12345, config.getSpoolToDisk()); assertEquals(6789, config.getOutputThreshold()); diff --git a/tika-core/src/test/java/org/apache/tika/detect/FileCommandDetectorTest.java b/tika-core/src/test/java/org/apache/tika/detect/FileCommandDetectorTest.java index dbb8220e74..3421681f77 100644 --- a/tika-core/src/test/java/org/apache/tika/detect/FileCommandDetectorTest.java +++ b/tika-core/src/test/java/org/apache/tika/detect/FileCommandDetectorTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.detect; @@ -20,14 +18,12 @@ import static org.junit.jupiter.api.Assumptions.assumeTrue; import java.io.InputStream; - -import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.api.Test; - import org.apache.tika.config.TikaConfig; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; public class FileCommandDetectorTest { @@ -44,28 +40,27 @@ public static void setUp() throws Exception { public void testBasic() throws Exception { assumeTrue(FileCommandDetector.checkHasFile()); - try (InputStream is = getClass() - .getResourceAsStream("/test-documents/basic_embedded.xml")) { - //run more than once to ensure that the input stream is reset + try (InputStream is = + getClass().getResourceAsStream("/test-documents/basic_embedded.xml")) { + // run more than once to ensure that the input stream is reset for (int i = 0; i < 2; i++) { Metadata metadata = new Metadata(); MediaType answer = DETECTOR.detect(is, metadata); String fileMime = metadata.get(FileCommandDetector.FILE_MIME); - assertTrue(MediaType.text("xml").equals(answer) || - MediaType.application("xml").equals(answer)); - assertTrue("application/xml".equals(fileMime) || - "text/xml".equals(fileMime)); + assertTrue(MediaType.text("xml").equals(answer) + || MediaType.application("xml").equals(answer)); + assertTrue("application/xml".equals(fileMime) || "text/xml".equals(fileMime)); } } - //now try with TikaInputStream - try (InputStream is = TikaInputStream - .get(getClass().getResourceAsStream("/test-documents/basic_embedded.xml"))) { - //run more than once to ensure that the input stream is reset + // now try with TikaInputStream + try (InputStream is = TikaInputStream.get( + getClass().getResourceAsStream("/test-documents/basic_embedded.xml"))) { + // run more than once to ensure that the input stream is reset for (int i = 0; i < 2; 
i++) { MediaType answer = DETECTOR.detect(is, new Metadata()); - assertTrue(MediaType.text("xml").equals(answer) || - MediaType.application("xml").equals(answer)); + assertTrue(MediaType.text("xml").equals(answer) + || MediaType.application("xml").equals(answer)); } } } diff --git a/tika-core/src/test/java/org/apache/tika/detect/MagicDetectorTest.java b/tika-core/src/test/java/org/apache/tika/detect/MagicDetectorTest.java index 3a86a53b36..5aa6cf0b20 100644 --- a/tika-core/src/test/java/org/apache/tika/detect/MagicDetectorTest.java +++ b/tika-core/src/test/java/org/apache/tika/detect/MagicDetectorTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
+ * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.detect; @@ -26,13 +24,11 @@ import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; - import org.apache.commons.io.IOUtils; -import org.junit.jupiter.api.Test; - import org.apache.tika.config.TikaConfig; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; +import org.junit.jupiter.api.Test; /** * Test cases for the {@link MagicDetector} class. @@ -73,10 +69,11 @@ public void testDetectOffsetRange() throws Exception { assertDetect(detector, MediaType.OCTET_STREAM, " html"); assertDetect(detector, MediaType.OCTET_STREAM, ""); @@ -96,10 +93,11 @@ public void testDetectMask() throws Exception { assertDetect(detector, MediaType.OCTET_STREAM, "" + - "XHTML test document"); + assertDetect(detector, xhtml, "" + + "XHTML test document"); } @Test public void testDetectRegExOptions() throws Exception { - String pattern = "(?s)\\A.{0,1024}\\x3c\\!(?:DOCTYPE|doctype) (?:HTML|html) " + - "(?:PUBLIC|public) \"-//.{1,16}//(?:DTD|dtd) .{0,64}" + "(?:HTML|html) 4\\.01"; + String pattern = "(?s)\\A.{0,1024}\\x3c\\!(?:DOCTYPE|doctype) (?:HTML|html) " + + "(?:PUBLIC|public) \"-//.{1,16}//(?:DTD|dtd) .{0,64}" + + "(?:HTML|html) 4\\.01"; - String data = "" + - "HTML document" + "

Hello world!"; + String data = "" + + "HTML document" + + "

Hello world!"; - String data1 = "" + - "HTML document" + "

Hello world!"; + String data1 = "" + + "HTML document" + + "

Hello world!"; - String data2 = "" + - "HTML document" + "

Hello world!"; + String data2 = "" + + "HTML document" + + "

Hello world!"; MediaType html = new MediaType("text", "html"); Detector detector = new MagicDetector(html, pattern.getBytes(US_ASCII), null, true, 0, 0); @@ -172,7 +174,7 @@ public void testDetectStreamReadProblems() throws Exception { @Test public void testDetectApplicationEnviHdr() throws Exception { InputStream iStream = MagicDetectorTest.class - .getResourceAsStream("/test-documents/ang20150420t182050_corr_v1e_img.hdr"); + .getResourceAsStream("/test-documents/ang20150420t182050_corr_v1e_img.hdr"); byte[] data = IOUtils.toByteArray(iStream); MediaType testMT = new MediaType("application", "envi.hdr"); Detector detector = new MagicDetector(testMT, data, null, false, 0, 0); @@ -226,8 +228,7 @@ private void assertDetect(Detector detector, MediaType type, byte[] bytes) { } /** - * InputStream class that does not read in all available bytes in - * one go. + * InputStream class that does not read in all available bytes in one go. */ private static class RestrictiveInputStream extends ByteArrayInputStream { public RestrictiveInputStream(byte[] buf) { @@ -235,8 +236,7 @@ public RestrictiveInputStream(byte[] buf) { } /** - * Prevent reading the entire len of bytes if requesting more - * than 10 bytes. + * Prevent reading the entire len of bytes if requesting more than 10 bytes. 
*/ public int read(byte[] b, int off, int len) { if (len > 10) { @@ -250,18 +250,16 @@ public int read(byte[] b, int off, int len) { @Test public void testBZ2Detection() throws Exception { Detector detector = new TikaConfig().getDetector(); - for (String bz2 : new String[]{"bzip2-8-file.txt.bz2", - "empty-file.txt.bz2", "lbzip2-8-file.txt.bz2", - "small-file.txt.bz2", "test-file-1.csv.bz2", - "test-file-2.csv.bz2"}) { + for (String bz2 : new String[] {"bzip2-8-file.txt.bz2", "empty-file.txt.bz2", + "lbzip2-8-file.txt.bz2", "small-file.txt.bz2", "test-file-1.csv.bz2", + "test-file-2.csv.bz2"}) { assertEquals("application/x-bzip2", detect(detector, bz2)); } } - private String detect(Detector detector, String bz2Name) throws IOException { + private String detect(Detector detector, String bz2Name) throws IOException { try (InputStream is = new BufferedInputStream( - this.getClass().getResourceAsStream( - "/test-documents/bz2/" + bz2Name))) { + this.getClass().getResourceAsStream("/test-documents/bz2/" + bz2Name))) { return detector.detect(is, new Metadata()).toString(); } } diff --git a/tika-core/src/test/java/org/apache/tika/detect/MimeDetectionWithNNTest.java b/tika-core/src/test/java/org/apache/tika/detect/MimeDetectionWithNNTest.java index 293f423d2c..08ab32bce9 100644 --- a/tika-core/src/test/java/org/apache/tika/detect/MimeDetectionWithNNTest.java +++ b/tika-core/src/test/java/org/apache/tika/detect/MimeDetectionWithNNTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.detect; @@ -22,13 +20,11 @@ import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; - -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; - import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; import org.apache.tika.mime.MimeDetectionTest; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; public class MimeDetectionWithNNTest { @@ -43,10 +39,10 @@ public void setUp() { } /** - * The test case only works on the detector that only has grb model as - * currently the grb model is used as an example; if more models are added - * in the TrainedModelDetector, the following tests will need to modified to reflect - * the corresponding type instead of test-equal with the "OCTET_STREAM"; + * The test case only works on the detector that only has grb model as currently the grb model + * is used as an example; if more models are added in the TrainedModelDetector, the following + * tests will need to modified to reflect the corresponding type instead of test-equal with the + * "OCTET_STREAM"; * * @throws Exception */ @@ -70,7 +66,7 @@ public void testDetection() throws Exception { testFile(octetStream_str, "test-long-comment.xml"); testFile(octetStream_str, "stylesheet.xsl"); testUrl(octetStream_str, "http://www.ai.sri.com/daml/services/owl-s/1.2/Process.owl", - "test-difficult-rdf1.xml"); + "test-difficult-rdf1.xml"); testUrl(octetStream_str, "http://www.w3.org/2002/07/owl#", "test-difficult-rdf2.xml"); // add evil test from TIKA-327 testFile(octetStream_str, "test-tika-327.html"); @@ -100,7 +96,7 @@ private void testFile(String expected, String filename) throws IOException { } private void testStream(String expected, String urlOrFileName, InputStream in) - throws IOException { + throws IOException { assertNotNull(in, "Test stream: [" + urlOrFileName + "] is null!"); if (!in.markSupported()) { in = new java.io.BufferedInputStream(in); 
@@ -108,14 +104,13 @@ private void testStream(String expected, String urlOrFileName, InputStream in) try { Metadata metadata = new Metadata(); String mime = this.detector.detect(in, metadata).toString(); - assertEquals(expected, mime, - urlOrFileName + " is not properly detected: detected."); + assertEquals(expected, mime, urlOrFileName + " is not properly detected: detected."); // Add resource name and test again // metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, urlOrFileName); mime = this.detector.detect(in, metadata).toString(); - assertEquals(expected, mime, - urlOrFileName + " is not properly detected after adding resource name."); + assertEquals(expected, mime, urlOrFileName + + " is not properly detected after adding resource name."); } finally { in.close(); } @@ -127,7 +122,7 @@ private void testStream(String expected, String urlOrFileName, InputStream in) @Test public void testEmptyDocument() throws IOException { assertEquals(MediaType.OCTET_STREAM, - detector.detect(new ByteArrayInputStream(new byte[0]), new Metadata())); + detector.detect(new ByteArrayInputStream(new byte[0]), new Metadata())); } diff --git a/tika-core/src/test/java/org/apache/tika/detect/NameDetectorTest.java b/tika-core/src/test/java/org/apache/tika/detect/NameDetectorTest.java index dc15299afc..51a734ab0b 100644 --- a/tika-core/src/test/java/org/apache/tika/detect/NameDetectorTest.java +++ b/tika-core/src/test/java/org/apache/tika/detect/NameDetectorTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.detect; @@ -23,13 +21,11 @@ import java.util.HashMap; import java.util.Map; import java.util.regex.Pattern; - -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; - import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.mime.MediaType; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; /** * Test cases for the {@link NameDetector} class. 
@@ -50,25 +46,25 @@ public void setUp() { @Test public void testDetect() { assertDetect(MediaType.TEXT_PLAIN, "text.txt"); - assertDetect(MediaType.TEXT_PLAIN, "text.txt "); // trailing space - assertDetect(MediaType.TEXT_PLAIN, "text.txt\n"); // trailing newline + assertDetect(MediaType.TEXT_PLAIN, "text.txt "); // trailing space + assertDetect(MediaType.TEXT_PLAIN, "text.txt\n"); // trailing newline assertDetect(MediaType.TEXT_PLAIN, "text.txt?a=b"); // URL query assertDetect(MediaType.TEXT_PLAIN, "text.txt#abc"); // URL fragment - assertDetect(MediaType.TEXT_PLAIN, "text%2Etxt"); // URL encoded - assertDetect(MediaType.TEXT_PLAIN, "text.TXT"); // case insensitive + assertDetect(MediaType.TEXT_PLAIN, "text%2Etxt"); // URL encoded + assertDetect(MediaType.TEXT_PLAIN, "text.TXT"); // case insensitive assertDetect(MediaType.OCTET_STREAM, "text.txt.gz"); assertDetect(MediaType.TEXT_PLAIN, "README"); - assertDetect(MediaType.TEXT_PLAIN, " README "); // space around - assertDetect(MediaType.TEXT_PLAIN, "\tREADME\n"); // other whitespace - assertDetect(MediaType.TEXT_PLAIN, "/a/README"); // leading path - assertDetect(MediaType.TEXT_PLAIN, "\\b\\README"); // windows path - assertDetect(MediaType.OCTET_STREAM, "ReadMe"); // case sensitive + assertDetect(MediaType.TEXT_PLAIN, " README "); // space around + assertDetect(MediaType.TEXT_PLAIN, "\tREADME\n"); // other whitespace + assertDetect(MediaType.TEXT_PLAIN, "/a/README"); // leading path + assertDetect(MediaType.TEXT_PLAIN, "\\b\\README"); // windows path + assertDetect(MediaType.OCTET_STREAM, "ReadMe"); // case sensitive assertDetect(MediaType.OCTET_STREAM, "README.NOW"); // TIKA-1928 # in the filename assertDetect(MediaType.TEXT_PLAIN, "text.txt"); - assertDetect(MediaType.TEXT_PLAIN, "text#.txt"); // # before extension + assertDetect(MediaType.TEXT_PLAIN, "text#.txt"); // # before extension assertDetect(MediaType.TEXT_PLAIN, "text#123.txt");// # before extension assertDetect(MediaType.TEXT_PLAIN, "text.txt#pdf");// 
# after extension @@ -82,7 +78,7 @@ public void testDetect() { // tough one assertDetect(MediaType.TEXT_PLAIN, " See http://www.example.com:1234/README.txt?a=b#c \n"); assertDetect(MediaType.TEXT_PLAIN, "See README.txt"); // even this! - assertDetect(MediaType.OCTET_STREAM, "See README"); // but not this + assertDetect(MediaType.OCTET_STREAM, "See README"); // but not this assertDetect(MediaType.application("envi.hdr"), "ang20150420t182050_corr_v1e_img.hdr"); diff --git a/tika-core/src/test/java/org/apache/tika/detect/TextDetectorTest.java b/tika-core/src/test/java/org/apache/tika/detect/TextDetectorTest.java index 1870033d9e..4ee32633b2 100644 --- a/tika-core/src/test/java/org/apache/tika/detect/TextDetectorTest.java +++ b/tika-core/src/test/java/org/apache/tika/detect/TextDetectorTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.detect; @@ -24,11 +22,9 @@ import java.io.IOException; import java.io.InputStream; import java.util.Arrays; - -import org.junit.jupiter.api.Test; - import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; +import org.junit.jupiter.api.Test; /** * Test cases for the {@link TextDetector} class. @@ -56,9 +52,9 @@ public void testDetectEmpty() throws Exception { public void testDetectText() throws Exception { assertText("Hello, World!".getBytes(UTF_8)); assertText(" \t\r\n".getBytes(UTF_8)); - assertNotText(new byte[]{-1, -2, -3, 0x09, 0x0A, 0x0C, 0x0D, 0x1B}); - assertNotText(new byte[]{0}); - assertNotText(new byte[]{'H', 'e', 'l', 'l', 'o', 0}); + assertNotText(new byte[] {-1, -2, -3, 0x09, 0x0A, 0x0C, 0x0D, 0x1B}); + assertNotText(new byte[] {0}); + assertNotText(new byte[] {'H', 'e', 'l', 'l', 'o', 0}); byte[] data = new byte[512]; Arrays.fill(data, (byte) '.'); @@ -100,7 +96,7 @@ private void assertText(byte[] data) { private void assertNotText(byte[] data) { try { assertEquals(MediaType.OCTET_STREAM, - detector.detect(new ByteArrayInputStream(data), new Metadata())); + detector.detect(new ByteArrayInputStream(data), new Metadata())); } catch (IOException e) { fail("Unexpected exception from TextDetector"); } diff --git a/tika-core/src/test/java/org/apache/tika/detect/TypeDetectorTest.java b/tika-core/src/test/java/org/apache/tika/detect/TypeDetectorTest.java index d79e9b7e50..0695f6a981 100644 --- a/tika-core/src/test/java/org/apache/tika/detect/TypeDetectorTest.java +++ 
b/tika-core/src/test/java/org/apache/tika/detect/TypeDetectorTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.detect; @@ -22,11 +20,9 @@ import java.io.IOException; import java.util.Map; import java.util.TreeMap; - -import org.junit.jupiter.api.Test; - import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; +import org.junit.jupiter.api.Test; /** * Test cases for the {@link TypeDetector} class. diff --git a/tika-core/src/test/java/org/apache/tika/detect/ZeroSizeFileDetectorTest.java b/tika-core/src/test/java/org/apache/tika/detect/ZeroSizeFileDetectorTest.java index 852711671d..bf2bf4bfa1 100644 --- a/tika-core/src/test/java/org/apache/tika/detect/ZeroSizeFileDetectorTest.java +++ b/tika-core/src/test/java/org/apache/tika/detect/ZeroSizeFileDetectorTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
+ * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.detect; @@ -23,12 +21,10 @@ import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; - -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; - import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; public class ZeroSizeFileDetectorTest { diff --git a/tika-core/src/test/java/org/apache/tika/fork/ForkParserTest.java b/tika-core/src/test/java/org/apache/tika/fork/ForkParserTest.java index f6ac5335be..b4d77cdab0 100644 --- a/tika-core/src/test/java/org/apache/tika/fork/ForkParserTest.java +++ b/tika-core/src/test/java/org/apache/tika/fork/ForkParserTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.fork; @@ -45,14 +43,7 @@ import java.util.concurrent.Executors; import java.util.concurrent.Semaphore; import java.util.concurrent.TimeUnit; - import org.apache.commons.io.IOUtils; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.io.TempDir; -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; -import org.xml.sax.helpers.DefaultHandler; - import org.apache.tika.TikaTest; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; @@ -67,6 +58,11 @@ import org.apache.tika.sax.BodyContentHandler; import org.apache.tika.sax.ContentHandlerFactory; import org.apache.tika.sax.RecursiveParserWrapperHandler; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; public class ForkParserTest extends TikaTest { @@ -76,7 +72,7 @@ public class ForkParserTest extends TikaTest { @Test public void testHelloWorld() throws Exception { try (ForkParser parser = new ForkParser(ForkParserTest.class.getClassLoader(), - new ForkTestParser())) { + new ForkTestParser())) { Metadata metadata = 
new Metadata(); ContentHandler output = new BodyContentHandler(); InputStream stream = new ByteArrayInputStream(new byte[0]); @@ -90,7 +86,7 @@ public void testHelloWorld() throws Exception { @Test public void testSerialParsing() throws Exception { try (ForkParser parser = new ForkParser(ForkParserTest.class.getClassLoader(), - new ForkTestParser())) { + new ForkTestParser())) { ParseContext context = new ParseContext(); for (int i = 0; i < 10; i++) { ContentHandler output = new BodyContentHandler(); @@ -104,7 +100,7 @@ public void testSerialParsing() throws Exception { @Test public void testParallelParsing() throws Exception { try (ForkParser parser = new ForkParser(ForkParserTest.class.getClassLoader(), - new ForkTestParser())) { + new ForkTestParser())) { final ParseContext context = new ParseContext(); Thread[] threads = new Thread[10]; @@ -133,7 +129,7 @@ public void testParallelParsing() throws Exception { @Test public void testPoolSizeReached() throws Exception { try (ForkParser parser = new ForkParser(ForkParserTest.class.getClassLoader(), - new ForkTestParser())) { + new ForkTestParser())) { final Semaphore barrier = new Semaphore(0); Thread[] threads = new Thread[parser.getPoolSize()]; @@ -196,44 +192,44 @@ public synchronized int read() throws IOException { public void testPulseAndTimeouts() throws Exception { ForkParser forkParser = - new ForkParser(ForkParserTest.class.getClassLoader(), new MockParser()); + new ForkParser(ForkParserTest.class.getClassLoader(), new MockParser()); forkParser.setServerPulseMillis(500); forkParser.setServerParseTimeoutMillis(5000); forkParser.setServerWaitTimeoutMillis(60000); - String sleepCommand = "\n" + " Hello, World!\n" + - " \n" + - ""; + String sleepCommand = "\n" + " Hello, World!\n" + + " \n" + + ""; ContentHandler o = new BodyContentHandler(-1); Metadata m = new Metadata(); ParseContext c = new ParseContext(); try { - forkParser - .parse(new ByteArrayInputStream(sleepCommand.getBytes(StandardCharsets.UTF_8)), + 
forkParser.parse( + new ByteArrayInputStream(sleepCommand.getBytes(StandardCharsets.UTF_8)), o, m, c); fail("should have thrown IOException"); } catch (TikaException e) { - //failed to communicate with forked parser process" + // failed to communicate with forked parser process" } finally { forkParser.close(); } - //test setting very short pulse (10 ms) and a parser that takes at least 1000 ms + // test setting very short pulse (10 ms) and a parser that takes at least 1000 ms forkParser = new ForkParser(ForkParserTest.class.getClassLoader(), new MockParser()); forkParser.setServerPulseMillis(10); forkParser.setServerParseTimeoutMillis(100); - sleepCommand = "\n" + " Hello, World!\n" + - " \n" + - ""; + sleepCommand = "\n" + " Hello, World!\n" + + " \n" + + ""; o = new BodyContentHandler(-1); m = new Metadata(); c = new ParseContext(); try { - forkParser - .parse(new ByteArrayInputStream(sleepCommand.getBytes(StandardCharsets.UTF_8)), + forkParser.parse( + new ByteArrayInputStream(sleepCommand.getBytes(StandardCharsets.UTF_8)), o, m, c); fail("Should have thrown exception"); } catch (IOException | TikaException e) { - //"should have thrown IOException lost connection" + // "should have thrown IOException lost connection" } finally { forkParser.close(); } @@ -242,7 +238,7 @@ public void testPulseAndTimeouts() throws Exception { @Test public void testPackageCanBeAccessed() throws Exception { try (ForkParser parser = new ForkParser(ForkParserTest.class.getClassLoader(), - new ForkTestParser.ForkTestParserAccessingPackage())) { + new ForkTestParser.ForkTestParserAccessingPackage())) { Metadata metadata = new Metadata(); ContentHandler output = new BodyContentHandler(); InputStream stream = new ByteArrayInputStream(new byte[0]); @@ -257,11 +253,12 @@ public void testPackageCanBeAccessed() throws Exception { public void testRecursiveParserWrapper() throws Exception { Parser parser = new AutoDetectParser(); RecursiveParserWrapper wrapper = new 
RecursiveParserWrapper(parser); - RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler( - new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, - 20000)); + RecursiveParserWrapperHandler handler = + new RecursiveParserWrapperHandler(new BasicContentHandlerFactory( + BasicContentHandlerFactory.HANDLER_TYPE.TEXT, 20000)); try (ForkParser fork = new ForkParser(ForkParserTest.class.getClassLoader(), wrapper); - InputStream is = getResourceAsStream("/test-documents/basic_embedded.xml")) { + InputStream is = getResourceAsStream( + "/test-documents/basic_embedded.xml")) { Metadata metadata = new Metadata(); ParseContext context = new ParseContext(); fork.parse(is, handler, metadata, context); @@ -282,11 +279,12 @@ public void testRecursiveParserWrapper() throws Exception { public void testRecursiveParserWrapperMassiveEmbedded() throws Exception { Parser parser = new AutoDetectParser(); RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parser); - RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler( - new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, - 20000)); + RecursiveParserWrapperHandler handler = + new RecursiveParserWrapperHandler(new BasicContentHandlerFactory( + BasicContentHandlerFactory.HANDLER_TYPE.TEXT, 20000)); try (ForkParser fork = new ForkParser(ForkParserTest.class.getClassLoader(), wrapper); - InputStream is = getResourceAsStream("/test-documents/massive_embedded.xml")) { + InputStream is = getResourceAsStream( + "/test-documents/massive_embedded.xml")) { Metadata metadata = new Metadata(); ParseContext context = new ParseContext(); fork.parse(is, handler, metadata, context); @@ -311,11 +309,12 @@ public void testRecursiveParserWrapperMassiveEmbedded() throws Exception { public void testRPWWithEmbeddedNPE() throws Exception { Parser parser = new AutoDetectParser(); RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parser); - 
RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler( - new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, - 20000)); + RecursiveParserWrapperHandler handler = + new RecursiveParserWrapperHandler(new BasicContentHandlerFactory( + BasicContentHandlerFactory.HANDLER_TYPE.TEXT, 20000)); try (ForkParser fork = new ForkParser(ForkParserTest.class.getClassLoader(), wrapper); - InputStream is = getResourceAsStream("/test-documents/embedded_with_npe.xml")) { + InputStream is = getResourceAsStream( + "/test-documents/embedded_with_npe.xml")) { Metadata metadata = new Metadata(); ParseContext context = new ParseContext(); fork.parse(is, handler, metadata, context); @@ -331,18 +330,19 @@ public void testRPWWithEmbeddedNPE() throws Exception { assertContains("some_embedded_content", m1.get(TikaCoreProperties.TIKA_CONTENT)); assertEquals("/embed1.xml", m1.get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH)); assertContains("another null pointer exception", - m1.get(TikaCoreProperties.EMBEDDED_EXCEPTION)); + m1.get(TikaCoreProperties.EMBEDDED_EXCEPTION)); } @Test public void testRPWWithMainDocNPE() throws Exception { Parser parser = new AutoDetectParser(); RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parser); - RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler( - new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, - 20000)); + RecursiveParserWrapperHandler handler = + new RecursiveParserWrapperHandler(new BasicContentHandlerFactory( + BasicContentHandlerFactory.HANDLER_TYPE.TEXT, 20000)); try (ForkParser fork = new ForkParser(ForkParserTest.class.getClassLoader(), wrapper); - InputStream is = getResourceAsStream("/test-documents/embedded_then_npe.xml")) { + InputStream is = getResourceAsStream( + "/test-documents/embedded_then_npe.xml")) { Metadata metadata = new Metadata(); ParseContext context = new ParseContext(); fork.parse(is, handler, metadata, context); @@ -365,15 
+365,15 @@ public void testRPWWithMainDocNPE() throws Exception { @Test public void testToFileHandler() throws Exception { - //test that a server-side write-to-file works without proxying back the - //AbstractContentHandlerFactory + // test that a server-side write-to-file works without proxying back the + // AbstractContentHandlerFactory Path target = Files.createTempFile(tempDir, "fork-to-file-handler-", ".txt"); try (InputStream is = getResourceAsStream("/test-documents/basic_embedded.xml")) { RecursiveParserWrapper wrapper = new RecursiveParserWrapper(new AutoDetectParser()); ToFileHandler toFileHandler = - new ToFileHandler(new SBContentHandlerFactory(), target.toFile()); - try (ForkParser forkParser = new ForkParser(ForkParserTest.class.getClassLoader(), - wrapper)) { + new ToFileHandler(new SBContentHandlerFactory(), target.toFile()); + try (ForkParser forkParser = + new ForkParser(ForkParserTest.class.getClassLoader(), wrapper)) { Metadata m = new Metadata(); ParseContext context = new ParseContext(); forkParser.parse(is, toFileHandler, m, context); @@ -384,10 +384,10 @@ public void testToFileHandler() throws Exception { try (Reader reader = Files.newBufferedReader(target, StandardCharsets.UTF_8)) { contents = IOUtils.toString(reader); } - assertContainsCount(TikaCoreProperties.TIKA_PARSED_BY.getName() + - " : org.apache.tika.parser.DefaultParser", contents, 2); - assertContainsCount(TikaCoreProperties.TIKA_PARSED_BY.getName() + - " : org.apache.tika.parser.mock.MockParser", contents, 2); + assertContainsCount(TikaCoreProperties.TIKA_PARSED_BY.getName() + + " : org.apache.tika.parser.DefaultParser", contents, 2); + assertContainsCount(TikaCoreProperties.TIKA_PARSED_BY.getName() + + " : org.apache.tika.parser.mock.MockParser", contents, 2); assertContains("Nikolai Lobachevsky", contents); assertContains("embeddedAuthor", contents); assertContains("main_content", contents); @@ -398,12 +398,13 @@ public void testToFileHandler() throws Exception { @Test public 
void testRecursiveParserWrapperWithProxyingContentHandlersAndMetadata() - throws Exception { + throws Exception { Parser parser = new AutoDetectParser(); RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parser); BufferingHandler handler = new BufferingHandler(new SBContentHandlerFactory()); try (ForkParser fork = new ForkParser(ForkParserTest.class.getClassLoader(), wrapper); - InputStream is = getResourceAsStream("/test-documents/basic_embedded.xml")) { + InputStream is = getResourceAsStream( + "/test-documents/basic_embedded.xml")) { Metadata metadata = new Metadata(); ParseContext context = new ParseContext(); fork.parse(is, handler, metadata, context); @@ -429,9 +430,10 @@ public void testRPWWithNonSerializableContentHandler() throws Exception { Parser parser = new AutoDetectParser(); RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parser); RecursiveParserWrapperHandler handler = - new RecursiveParserWrapperHandler(new NonSerializableHandlerFactory()); + new RecursiveParserWrapperHandler(new NonSerializableHandlerFactory()); try (ForkParser fork = new ForkParser(ForkParserTest.class.getClassLoader(), wrapper); - InputStream is = getResourceAsStream("/test-documents/embedded_then_npe.xml")) { + InputStream is = getResourceAsStream( + "/test-documents/embedded_then_npe.xml")) { Metadata metadata = new Metadata(); ParseContext context = new ParseContext(); fork.parse(is, handler, metadata, context); @@ -470,7 +472,7 @@ public void testForkParserDoesntPreventShutdown() throws Exception { CountDownLatch cdl = new CountDownLatch(1); service.submit(() -> { try (ForkParser parser = new ForkParser(ForkParserTest.class.getClassLoader(), - new ForkTestParser.ForkTestParserWaiting())) { + new ForkTestParser.ForkTestParserWaiting())) { Metadata metadata = new Metadata(); ContentHandler output = new BodyContentHandler(); InputStream stream = new ByteArrayInputStream(new byte[0]); @@ -493,14 +495,15 @@ public void testForkParserDoesntPreventShutdown() 
throws Exception { service.shutdownNow(); service.awaitTermination(15, TimeUnit.SECONDS); long secondsSinceShutdown = ChronoUnit.SECONDS.between(requestShutdown, Instant.now()); - assertTrue(secondsSinceShutdown < 5, "Should have shutdown the service in less than 5 seconds"); + assertTrue(secondsSinceShutdown < 5, + "Should have shutdown the service in less than 5 seconds"); } - //use this to test that the wrapper handler is acted upon by the server but not proxied back + // use this to test that the wrapper handler is acted upon by the server but not proxied back private static class ToFileHandler extends AbstractRecursiveParserWrapperHandler { - //this needs to be a file because a File is serializable + // this needs to be a file because a File is serializable private final File file; private OutputStream os; @@ -530,7 +533,7 @@ public void endDocument() throws SAXException { @Override public void endEmbeddedDocument(ContentHandler contentHandler, Metadata metadata) - throws SAXException { + throws SAXException { try { byte[] bytes = toString(contentHandler, metadata); os.write(bytes, 0, bytes.length); @@ -541,7 +544,7 @@ public void endEmbeddedDocument(ContentHandler contentHandler, Metadata metadata @Override public void endDocument(ContentHandler contentHandler, Metadata metadata) - throws SAXException { + throws SAXException { try { byte[] bytes = toString(contentHandler, metadata); os.write(bytes, 0, bytes.length); @@ -607,9 +610,9 @@ public ContentHandler getNewContentHandler(OutputStream os, Charset charset) { } private static class LyingNonSerializableContentHandler extends DefaultHandler - implements Serializable { - //StringWriter makes this class not actually Serializable - //as is. + implements Serializable { + // StringWriter makes this class not actually Serializable + // as is. 
StringWriter writer = new StringWriter(); @Override @@ -623,9 +626,9 @@ public String toString() { } } - //use this to test that a handler that extends RecursiveParserWrapperHandler - //does have both contenthandlers and metadata objects proxied back from the - //server. + // use this to test that a handler that extends RecursiveParserWrapperHandler + // does have both contenthandlers and metadata objects proxied back from the + // server. private static class BufferingHandler extends RecursiveParserWrapperHandler { List contentHandlers = new ArrayList<>(); @@ -636,14 +639,14 @@ public BufferingHandler(ContentHandlerFactory contentHandlerFactory) { @Override public void endEmbeddedDocument(ContentHandler contentHandler, Metadata metadata) - throws SAXException { + throws SAXException { contentHandlers.add(contentHandler); metadataList.add(metadata); } @Override public void endDocument(ContentHandler contentHandler, Metadata metadata) - throws SAXException { + throws SAXException { contentHandlers.add(0, contentHandler); metadataList.add(0, metadata); } diff --git a/tika-core/src/test/java/org/apache/tika/fork/ForkParserTikaBinTest.java b/tika-core/src/test/java/org/apache/tika/fork/ForkParserTikaBinTest.java index 4756f00abe..9f04d58d22 100644 --- a/tika-core/src/test/java/org/apache/tika/fork/ForkParserTikaBinTest.java +++ b/tika-core/src/test/java/org/apache/tika/fork/ForkParserTikaBinTest.java @@ -1,23 +1,22 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. 
See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.fork; import static org.junit.jupiter.api.Assertions.assertEquals; +import com.google.common.reflect.ClassPath; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; @@ -32,15 +31,7 @@ import java.util.function.Predicate; import java.util.jar.JarEntry; import java.util.jar.JarOutputStream; - -import com.google.common.reflect.ClassPath; import org.apache.commons.io.IOUtils; -import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.io.TempDir; -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.TikaTest; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; @@ -48,6 +39,11 @@ import org.apache.tika.parser.AutoDetectParserFactory; import org.apache.tika.parser.ParseContext; import org.apache.tika.sax.ToXMLContentHandler; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; public class ForkParserTikaBinTest extends TikaTest { private static final String JAR_FILE_NAME = "mock-tika-app.jar"; @@ -65,48 +61,49 @@ public static void bootstrapJar() throws Exception { ClassLoader loader = ForkServer.class.getClassLoader(); ClassPath classPath = ClassPath.from(loader); addClasses(jarOs, classPath, ci -> ci.getPackageName().startsWith("org.slf4j")); - addClasses(jarOs, classPath, ci -> ci.getPackageName().startsWith("org.apache.logging")); addClasses(jarOs, classPath, - ci -> ci.getPackageName().startsWith("org.apache.commons.io")); - //exclude TypeDetectionBenchmark because it is not serializable - //exclude UpperCasingContentHandler because we want to test that - //we can serialize it from the parent process into the forked process - addClasses(jarOs, classPath, ci -> ci.getPackageName().startsWith("org.apache.tika") && - 
(!ci.getName().contains("TypeDetectionBenchmark")) && - (!ci.getName().contains("UpperCasingContentHandler"))); - - try (InputStream input = ForkParserTikaBinTest.class - .getResourceAsStream("/org/apache/tika/config/TIKA-2653-vowel-parser-ae.xml")) { - jarOs.putNextEntry( - new JarEntry("org/apache/tika/parser/TIKA-2653-vowel-parser-ae.xml")); + ci -> ci.getPackageName().startsWith("org.apache.logging")); + addClasses(jarOs, classPath, + ci -> ci.getPackageName().startsWith("org.apache.commons.io")); + // exclude TypeDetectionBenchmark because it is not serializable + // exclude UpperCasingContentHandler because we want to test that + // we can serialize it from the parent process into the forked process + addClasses(jarOs, classPath, ci -> ci.getPackageName().startsWith("org.apache.tika") + && (!ci.getName().contains("TypeDetectionBenchmark")) + && (!ci.getName().contains("UpperCasingContentHandler"))); + + try (InputStream input = ForkParserTikaBinTest.class.getResourceAsStream( + "/org/apache/tika/config/TIKA-2653-vowel-parser-ae.xml")) { + jarOs.putNextEntry(new JarEntry( + "org/apache/tika/parser/TIKA-2653-vowel-parser-ae.xml")); IOUtils.copy(input, jarOs); } try (InputStream input = ForkParserTikaBinTest.class - .getResourceAsStream("/org/apache/tika/mime/tika-mimetypes.xml")) { + .getResourceAsStream("/org/apache/tika/mime/tika-mimetypes.xml")) { jarOs.putNextEntry(new JarEntry("org/apache/tika/mime/tika-mimetypes.xml")); IOUtils.copy(input, jarOs); } try (InputStream input = ForkParserTikaBinTest.class - .getResourceAsStream("/custom-mimetypes.xml")) { + .getResourceAsStream("/custom-mimetypes.xml")) { jarOs.putNextEntry(new JarEntry("custom-mimetypes.xml")); IOUtils.copy(input, jarOs); } jarOs.putNextEntry(new JarEntry("META-INF/services/org.apache.tika.parser.Parser")); - jarOs.write( - "org.apache.tika.parser.mock.VowelParser\n".getBytes(StandardCharsets.UTF_8)); + jarOs.write("org.apache.tika.parser.mock.VowelParser\n" + 
.getBytes(StandardCharsets.UTF_8)); } Path tikaConfigVowelParser = JAR_DIR.resolve("TIKA_2653-iou.xml"); - try (InputStream is = ForkServer.class - .getResourceAsStream("/org/apache/tika/config/TIKA-2653-vowel-parser-iou.xml"); - OutputStream os = Files.newOutputStream(tikaConfigVowelParser)) { + try (InputStream is = ForkServer.class.getResourceAsStream( + "/org/apache/tika/config/TIKA-2653-vowel-parser-iou.xml"); + OutputStream os = Files.newOutputStream(tikaConfigVowelParser)) { IOUtils.copy(is, os); } } private static void addClasses(JarOutputStream jarOs, ClassPath classPath, - Predicate predicate) throws IOException { + Predicate predicate) throws IOException { for (ClassPath.ClassInfo classInfo : classPath.getAllClasses()) { if (predicate.test(classInfo)) { jarOs.putNextEntry(new JarEntry(classInfo.getResourceName())); @@ -117,18 +114,16 @@ private static void addClasses(JarOutputStream jarOs, ClassPath classPath, @Test public void testExplicitParserFactory() throws Exception { - XMLResult xmlResult = - getXML(new ParserFactoryFactory("org.apache.tika.parser.mock.MockParserFactory", - EMPTY_MAP)); + XMLResult xmlResult = getXML(new ParserFactoryFactory( + "org.apache.tika.parser.mock.MockParserFactory", EMPTY_MAP)); assertContains("hello world!", xmlResult.xml); assertEquals("Nikolai Lobachevsky", xmlResult.metadata.get(TikaCoreProperties.CREATOR)); } @Test public void testVowelParserAsDefault() throws Exception { - ParserFactoryFactory pff = - new ParserFactoryFactory("org.apache.tika.parser.AutoDetectParserFactory", - EMPTY_MAP); + ParserFactoryFactory pff = new ParserFactoryFactory( + "org.apache.tika.parser.AutoDetectParserFactory", EMPTY_MAP); XMLResult xmlResult = getXML(pff); assertContains("eooeuiooueoeeao", xmlResult.xml); assertEquals("Nikolai Lobachevsky", xmlResult.metadata.get(TikaCoreProperties.CREATOR)); @@ -139,21 +134,19 @@ public void testVowelParserInClassPath() throws Exception { Map args = new HashMap<>(); 
args.put(AutoDetectParserFactory.TIKA_CONFIG_PATH, "TIKA-2653-vowel-parser-ae.xml"); ParserFactoryFactory pff = new ParserFactoryFactory( - "org.apache.tika.parser.AutoDetectParserFactory", - args); + "org.apache.tika.parser.AutoDetectParserFactory", args); XMLResult xmlResult = getXML(pff); assertContains("eeeeea", xmlResult.xml); - assertEquals("Nikolai Lobachevsky", - xmlResult.metadata.get(TikaCoreProperties.CREATOR)); + assertEquals("Nikolai Lobachevsky", xmlResult.metadata.get(TikaCoreProperties.CREATOR)); } @Test public void testVowelParserFromDirectory() throws Exception { Map args = new HashMap<>(); args.put(AutoDetectParserFactory.TIKA_CONFIG_PATH, - JAR_DIR.resolve("TIKA_2653-iou.xml").toAbsolutePath().toString()); - ParserFactoryFactory pff = - new ParserFactoryFactory("org.apache.tika.parser.AutoDetectParserFactory", args); + JAR_DIR.resolve("TIKA_2653-iou.xml").toAbsolutePath().toString()); + ParserFactoryFactory pff = new ParserFactoryFactory( + "org.apache.tika.parser.AutoDetectParserFactory", args); XMLResult xmlResult = getXML(pff); assertContains("oouioouoo", xmlResult.xml); assertEquals("Nikolai Lobachevsky", xmlResult.metadata.get(TikaCoreProperties.CREATOR)); @@ -161,27 +154,25 @@ public void testVowelParserFromDirectory() throws Exception { @Test public void testPFFWithClassLoaderFromParentProcess() throws Exception { - //The UpperCasingContentHandler is not sent to the bootstrap test jar file in @BeforeClass. - //this tests that the content handler was loaded from the parent process. - - ParserFactoryFactory pff = - new ParserFactoryFactory("org.apache.tika.parser.AutoDetectParserFactory", - EMPTY_MAP); - XMLResult xmlResult = - getXML(pff, this.getClass().getClassLoader(), new UpperCasingContentHandler()); + // The UpperCasingContentHandler is not sent to the bootstrap test jar file in @BeforeClass. + // this tests that the content handler was loaded from the parent process. 
+ + ParserFactoryFactory pff = new ParserFactoryFactory( + "org.apache.tika.parser.AutoDetectParserFactory", EMPTY_MAP); + XMLResult xmlResult = getXML(pff, this.getClass().getClassLoader(), + new UpperCasingContentHandler()); assertContains("EOOEUIOOUEOEEAO", xmlResult.xml); assertEquals("Nikolai Lobachevsky", xmlResult.metadata.get(TikaCoreProperties.CREATOR)); } private XMLResult getXML(ParserFactoryFactory pff) - throws TikaException, SAXException, IOException { + throws TikaException, SAXException, IOException { return getXML(pff, null, null); } private XMLResult getXML(ParserFactoryFactory pff, ClassLoader classloader, - ContentHandler contentHandler) - throws TikaException, SAXException, IOException { + ContentHandler contentHandler) throws TikaException, SAXException, IOException { List java = new ArrayList<>(); java.add("java"); @@ -196,7 +187,7 @@ private XMLResult getXML(ParserFactoryFactory pff, ClassLoader classloader, parser.setServerPulseMillis(10000); ContentHandler handler = - (contentHandler == null) ? new ToXMLContentHandler() : contentHandler; + (contentHandler == null) ? new ToXMLContentHandler() : contentHandler; Metadata m = new Metadata(); try (InputStream is = getResourceAsStream("/test-documents/example.xml")) { parser.parse(is, handler, m, new ParseContext()); diff --git a/tika-core/src/test/java/org/apache/tika/fork/ForkTestParser.java b/tika-core/src/test/java/org/apache/tika/fork/ForkTestParser.java index e9c6949fa2..64100d9ca3 100644 --- a/tika-core/src/test/java/org/apache/tika/fork/ForkTestParser.java +++ b/tika-core/src/test/java/org/apache/tika/fork/ForkTestParser.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.fork; @@ -22,10 +20,6 @@ import java.io.InputStream; import java.util.Collections; import java.util.Set; - -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.exception.TikaException; import org.apache.tika.fork.unusedpackage.ClassInUnusedPackage; import org.apache.tika.metadata.Metadata; @@ -33,6 +27,8 @@ import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; class ForkTestParser implements Parser { @@ -46,7 +42,7 @@ public Set getSupportedTypes(ParseContext context) { } public void parse(InputStream stream, ContentHandler handler, Metadata metadata, - ParseContext context) throws IOException, SAXException, TikaException { + ParseContext context) throws IOException, SAXException, TikaException { stream.read(); metadata.set(Metadata.CONTENT_TYPE, "text/plain"); @@ -61,7 +57,7 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, static class ForkTestParserAccessingPackage extends ForkTestParser { @Override public void parse(InputStream stream, ContentHandler handler, Metadata metadata, - ParseContext context) throws IOException, SAXException, TikaException { + ParseContext context) throws IOException, SAXException, TikaException { assertNotNull(ClassInUnusedPackage.class.getPackage()); super.parse(stream, handler, metadata, context); } @@ -70,7 +66,7 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, static class ForkTestParserWaiting extends ForkTestParser { @Override public void parse(InputStream stream, ContentHandler handler, Metadata metadata, - ParseContext context) throws IOException, SAXException, TikaException { + ParseContext context) throws IOException, SAXException, TikaException { try { Thread.sleep(10_000); } catch (InterruptedException e) { diff --git 
a/tika-core/src/test/java/org/apache/tika/fork/UpperCasingContentHandler.java b/tika-core/src/test/java/org/apache/tika/fork/UpperCasingContentHandler.java index 3ca513f1a4..5924b8ea81 100644 --- a/tika-core/src/test/java/org/apache/tika/fork/UpperCasingContentHandler.java +++ b/tika-core/src/test/java/org/apache/tika/fork/UpperCasingContentHandler.java @@ -1,23 +1,20 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.fork; import java.util.Locale; - import org.xml.sax.SAXException; import org.xml.sax.helpers.DefaultHandler; diff --git a/tika-core/src/test/java/org/apache/tika/fork/unusedpackage/ClassInUnusedPackage.java b/tika-core/src/test/java/org/apache/tika/fork/unusedpackage/ClassInUnusedPackage.java index 1de4c45496..ef1f4738b8 100644 --- a/tika-core/src/test/java/org/apache/tika/fork/unusedpackage/ClassInUnusedPackage.java +++ b/tika-core/src/test/java/org/apache/tika/fork/unusedpackage/ClassInUnusedPackage.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
+ * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.fork.unusedpackage; diff --git a/tika-core/src/test/java/org/apache/tika/io/EndianUtilsTest.java b/tika-core/src/test/java/org/apache/tika/io/EndianUtilsTest.java index 906870e730..afce171e90 100644 --- a/tika-core/src/test/java/org/apache/tika/io/EndianUtilsTest.java +++ b/tika-core/src/test/java/org/apache/tika/io/EndianUtilsTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
+ * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.io; @@ -21,7 +19,6 @@ import static org.junit.jupiter.api.Assertions.fail; import java.io.ByteArrayInputStream; - import org.junit.jupiter.api.Test; public class EndianUtilsTest { @@ -29,65 +26,65 @@ public class EndianUtilsTest { public void testReadUE7() throws Exception { byte[] data; - data = new byte[]{0x08}; + data = new byte[] {0x08}; assertEquals(8, EndianUtils.readUE7(new ByteArrayInputStream(data))); - data = new byte[]{(byte) 0x84, 0x1e}; + data = new byte[] {(byte) 0x84, 0x1e}; assertEquals(542, EndianUtils.readUE7(new ByteArrayInputStream(data))); - data = new byte[]{(byte) 0xac, (byte) 0xbe, 0x17}; + data = new byte[] {(byte) 0xac, (byte) 0xbe, 0x17}; assertEquals(728855, EndianUtils.readUE7(new ByteArrayInputStream(data))); } @Test public void testReadUIntLE() throws Exception { - byte[] data = new byte[]{(byte) 0x08, (byte) 0x00, (byte) 0x00, (byte) 0x00}; + byte[] data = new byte[] {(byte) 0x08, (byte) 0x00, (byte) 0x00, (byte) 0x00}; assertEquals(8, EndianUtils.readUIntLE(new ByteArrayInputStream(data))); - data = new byte[]{(byte) 0xF0, (byte) 0xFF, (byte) 0xFF, (byte) 0xFF}; + data = new byte[] {(byte) 0xF0, (byte) 0xFF, (byte) 0xFF, (byte) 0xFF}; assertEquals(4294967280L, EndianUtils.readUIntLE(new ByteArrayInputStream(data))); - data = new byte[]{(byte) 0xFF, (byte) 0xFF, (byte) 0xFF}; + data = new byte[] {(byte) 0xFF, (byte) 0xFF, (byte) 0xFF}; try { EndianUtils.readUIntLE(new ByteArrayInputStream(data)); fail("Should have thrown exception"); } catch (EndianUtils.BufferUnderrunException e) { - //swallow + // swallow } } @Test public void testReadUIntBE() throws Exception { - byte[] data = new 
byte[]{(byte) 0x00, (byte) 0x00, (byte) 0x00, (byte) 0x08}; + byte[] data = new byte[] {(byte) 0x00, (byte) 0x00, (byte) 0x00, (byte) 0x08}; assertEquals(8, EndianUtils.readUIntBE(new ByteArrayInputStream(data))); - data = new byte[]{(byte) 0xFF, (byte) 0xFF, (byte) 0xFF, (byte) 0xF0}; + data = new byte[] {(byte) 0xFF, (byte) 0xFF, (byte) 0xFF, (byte) 0xF0}; assertEquals(4294967280L, EndianUtils.readUIntBE(new ByteArrayInputStream(data))); - data = new byte[]{(byte) 0xFF, (byte) 0xFF, (byte) 0xFF}; + data = new byte[] {(byte) 0xFF, (byte) 0xFF, (byte) 0xFF}; try { EndianUtils.readUIntLE(new ByteArrayInputStream(data)); fail("Should have thrown exception"); } catch (EndianUtils.BufferUnderrunException e) { - //swallow + // swallow } } @Test public void testReadIntME() throws Exception { - // Example from https://yamm.finance/wiki/Endianness.html#mwAiw - byte[] data = new byte[]{(byte) 0x0b, (byte) 0x0a, (byte) 0x0d, (byte) 0x0c}; + // Example from https://yamm.finance/wiki/Endianness.html#mwAiw + byte[] data = new byte[] {(byte) 0x0b, (byte) 0x0a, (byte) 0x0d, (byte) 0x0c}; assertEquals(0x0a0b0c0d, EndianUtils.readIntME(new ByteArrayInputStream(data))); - data = new byte[]{(byte) 0xFE, (byte) 0xFF, (byte) 0xFC, (byte) 0xFD}; + data = new byte[] {(byte) 0xFE, (byte) 0xFF, (byte) 0xFC, (byte) 0xFD}; assertEquals(0xfffefdfc, EndianUtils.readIntME(new ByteArrayInputStream(data))); - data = new byte[]{(byte) 0xFF, (byte) 0xFF, (byte) 0xFF}; + data = new byte[] {(byte) 0xFF, (byte) 0xFF, (byte) 0xFF}; try { EndianUtils.readIntME(new ByteArrayInputStream(data)); fail("Should have thrown exception"); } catch (EndianUtils.BufferUnderrunException e) { - //swallow + // swallow } } } diff --git a/tika-core/src/test/java/org/apache/tika/io/FilenameUtilsTest.java b/tika-core/src/test/java/org/apache/tika/io/FilenameUtilsTest.java index bbcfe08894..5b6dfd5025 100644 --- a/tika-core/src/test/java/org/apache/tika/io/FilenameUtilsTest.java +++ 
b/tika-core/src/test/java/org/apache/tika/io/FilenameUtilsTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.io; @@ -22,24 +20,22 @@ import static org.junit.jupiter.api.Assertions.assertTrue; import static org.junit.jupiter.api.Assertions.fail; -import org.junit.jupiter.api.Test; - import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.utils.StringUtils; +import org.junit.jupiter.api.Test; public class FilenameUtilsTest { /** - * Different filesystems and operating systems have different restrictions - * on the name that can be used for files and directories. - * FilenameUtils.normalize() returns a cross platform file name that turns - * special characters in a HEX based code convention. This is %. - * For example why?.zip will be converted into why%3F.zip + * Different filesystems and operating systems have different restrictions on the name that can + * be used for files and directories. FilenameUtils.normalize() returns a cross platform file + * name that turns special characters in a HEX based code convention. This is %. For + * example why?.zip will be converted into why%3F.zip * * @see http://en.wikipedia.org/wiki/Filename#Comparison_of_filename_limitations - *

- * Reserved chars are the ones in FilenameUtils.RESERVED_FILENAME_CHARACTERS: + *

+ * Reserved chars are the ones in FilenameUtils.RESERVED_FILENAME_CHARACTERS: */ @Test public void normalizeNothingTodo() throws Exception { @@ -63,10 +59,10 @@ public void normalizeWithNull() throws Exception { public void normalizeWithReservedChar() throws Exception { final String[] TEST_NAMES = {"test?.txt", "?test.txt", "test.txt?", "?test?txt?"}; final String[] EXPECTED_NAMES = - {"test%3F.txt", "%3Ftest.txt", "test.txt%3F", "%3Ftest%3Ftxt%3F"}; + {"test%3F.txt", "%3Ftest.txt", "test.txt%3F", "%3Ftest%3Ftxt%3F"}; for (int i = 0; i < TEST_NAMES.length; ++i) { - //System.out.println("checking " + TEST_NAMES[i]); + // System.out.println("checking " + TEST_NAMES[i]); assertEquals(EXPECTED_NAMES[i], FilenameUtils.normalize(TEST_NAMES[i])); } } @@ -81,11 +77,11 @@ public void normalizeWithReservedChars() throws Exception { @Test public void normalizeWithNotPrintableChars() throws Exception { - final String TEST_NAME = new String( - new char[]{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, '.', 16, 17, 18, - 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}); - final String EXPECTED_NAME = "%00%01%02%03%04%05%06%07%08%09%0A%0B%0C%0D%0E%0F" + "." + - "%10%11%12%13%14%15%16%17%18%19%1A%1B%1C%1D%1E%1F"; + final String TEST_NAME = new String(new char[] {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, + 13, 14, 15, '.', 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, + 31}); + final String EXPECTED_NAME = "%00%01%02%03%04%05%06%07%08%09%0A%0B%0C%0D%0E%0F" + "." 
+ + "%10%11%12%13%14%15%16%17%18%19%1A%1B%1C%1D%1E%1F"; assertEquals(EXPECTED_NAME, FilenameUtils.normalize(TEST_NAME)); } @@ -121,11 +117,11 @@ public void testEmbeddedFileNames() throws Exception { String n = "the quick brown fox.docx"; assertEquals(n, sanitizeFilename(n)); assertEquals(n, sanitizeFilename(n.substring(0, n.length() - 5), - "application/vnd.openxmlformats-officedocument.wordprocessingml.document")); + "application/vnd.openxmlformats-officedocument.wordprocessingml.document")); assertEquals(n, sanitizeFilename("the quick\u0000brown fox.docx")); assertEquals(n, sanitizeFilename(n.substring(0, n.length() - 5), - "application/vnd.openxmlformats-officedocument.wordprocessingml.document")); + "application/vnd.openxmlformats-officedocument.wordprocessingml.document")); assertEquals("the quick brown fox.bin", sanitizeFilename(n.substring(0, n.length() - 5))); assertEquals("brown fox.docx", sanitizeFilename("the quick..\\brown fox.docx")); @@ -139,18 +135,23 @@ public void testEmbeddedFileNames() throws Exception { assertEquals("_brown fox.docx", sanitizeFilename("....brown fox.docx")); assertEquals("_brown fox.docx", sanitizeFilename(".brown fox.docx")); assertEquals("abcdefghijklmnopqrstuvwxyz_abcdefghijklmno....docx", sanitizeFilename( - "abcdefghijklmnopqrstuvwxyz_abcdefghijklmnopqrstuvwxyz_abcdefghijklmnopqrstuvwxyz.docx")); + "abcdefghijklmnopqrstuvwxyz_abcdefghijklmnopqrstuvwxyz_abcdefghijklmnopqrstuvwxyz.docx")); assertEquals("the quick brown fox.xlsx", sanitizeFilename("C:\\the quick brown fox.xlsx")); assertEquals("the quick brown fox.xlsx", sanitizeFilename("/the quick brown fox.xlsx")); assertEquals("the quick brown fox.xlsx", sanitizeFilename("~/the quick brown fox.xlsx")); - assertEquals("the quick brown fox.xlsx", sanitizeFilename("https://the quick brown fox.xlsx")); - assertEquals("the quick brown fox.xlsx", sanitizeFilename("https://tika.apache.org/the quick brown fox.xlsx")); - assertEquals("the quick brown fox.xlsx", 
sanitizeFilename("file:///tika.apache.org/the quick brown fox.xlsx")); + assertEquals("the quick brown fox.xlsx", + sanitizeFilename("https://the quick brown fox.xlsx")); + assertEquals("the quick brown fox.xlsx", + sanitizeFilename("https://tika.apache.org/the quick brown fox.xlsx")); + assertEquals("the quick brown fox.xlsx", + sanitizeFilename("file:///tika.apache.org/the quick brown fox.xlsx")); assertEquals("brown fox.xlsx", sanitizeFilename("a:/the quick:brown fox.xlsx")); - assertEquals("_the quick brown fox.xlsx", sanitizeFilename("C:\\a/b/c/..the quick brown fox.xlsx")); - assertEquals("_the quick brown fox.xlsx", sanitizeFilename("~/a/b/c/.the quick brown fox.xlsx")); + assertEquals("_the quick brown fox.xlsx", + sanitizeFilename("C:\\a/b/c/..the quick brown fox.xlsx")); + assertEquals("_the quick brown fox.xlsx", + sanitizeFilename("~/a/b/c/.the quick brown fox.xlsx")); assertEquals("the quick%3Ebrown fox.xlsx", sanitizeFilename("the quick>brown fox.xlsx")); assertEquals("the quick%22brown fox.xlsx", sanitizeFilename("the quick\"brown fox.xlsx")); assertEquals("the quick brown fox.xlsx", sanitizeFilename("\"the quick brown fox.xlsx\"")); @@ -173,7 +174,7 @@ public void testEmbeddedFilePaths() throws Exception { String n = "the quick brown fox.docx"; assertEquals(n, sanitizePath(n)); assertEquals(n, sanitizePath(n.substring(0, n.length() - 5), - "application/vnd.openxmlformats-officedocument.wordprocessingml.document")); + "application/vnd.openxmlformats-officedocument.wordprocessingml.document")); assertEquals(n, sanitizeFilename("the quick\u0000brown fox.docx")); assertEquals("the quick brown fox.bin", sanitizePath(n.substring(0, n.length() - 5))); @@ -188,18 +189,22 @@ public void testEmbeddedFilePaths() throws Exception { assertEquals("_brown fox.docx", sanitizePath("....brown fox.docx")); assertEquals("_brown fox.docx", sanitizePath(".brown fox.docx")); assertEquals("abcdefghijklmnopqrstuvwxyz_abcdefghijklmno....docx", sanitizePath( - 
"abcdefghijklmnopqrstuvwxyz_abcdefghijklmnopqrstuvwxyz_abcdefghijklmnopqrstuvwxyz.docx")); + "abcdefghijklmnopqrstuvwxyz_abcdefghijklmnopqrstuvwxyz_abcdefghijklmnopqrstuvwxyz.docx")); assertEquals("the quick brown fox.xlsx", sanitizePath("C:\\the quick brown fox.xlsx")); assertEquals("the quick brown fox.xlsx", sanitizePath("/the quick brown fox.xlsx")); assertEquals("the quick brown fox.xlsx", sanitizePath("~/the quick brown fox.xlsx")); assertEquals("the quick brown fox.xlsx", sanitizePath("https://the quick brown fox.xlsx")); - assertEquals("tika.apache.org/the quick brown fox.xlsx", sanitizePath("https://tika.apache.org/the quick brown fox.xlsx")); - assertEquals("tika.apache.org/the quick brown fox.xlsx", sanitizePath("file:///tika.apache.org/the quick brown fox.xlsx")); + assertEquals("tika.apache.org/the quick brown fox.xlsx", + sanitizePath("https://tika.apache.org/the quick brown fox.xlsx")); + assertEquals("tika.apache.org/the quick brown fox.xlsx", + sanitizePath("file:///tika.apache.org/the quick brown fox.xlsx")); assertEquals("the quick/brown fox.xlsx", sanitizePath("a:/the quick:brown fox.xlsx")); - assertEquals("a/b/c/_the quick brown fox.xlsx", sanitizePath("C:\\a/b/c/..the quick brown fox.xlsx")); - assertEquals("a/b/c/_the quick brown fox.xlsx", sanitizePath("~/a/b/c/.the quick brown fox.xlsx")); + assertEquals("a/b/c/_the quick brown fox.xlsx", + sanitizePath("C:\\a/b/c/..the quick brown fox.xlsx")); + assertEquals("a/b/c/_the quick brown fox.xlsx", + sanitizePath("~/a/b/c/.the quick brown fox.xlsx")); assertEquals(".docx", sanitizePath("..................docx")); assertEquals(".docx", sanitizePath("..docx")); diff --git a/tika-core/src/test/java/org/apache/tika/io/LookaheadInputStreamTest.java b/tika-core/src/test/java/org/apache/tika/io/LookaheadInputStreamTest.java index b6237b3600..6f11d3c40c 100644 --- a/tika-core/src/test/java/org/apache/tika/io/LookaheadInputStreamTest.java +++ 
b/tika-core/src/test/java/org/apache/tika/io/LookaheadInputStreamTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.io; @@ -21,7 +19,6 @@ import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; - import org.junit.jupiter.api.Test; /** @@ -46,7 +43,7 @@ public void testEmptyStream() throws IOException { @Test public void testBasicLookahead() throws IOException { - InputStream stream = new ByteArrayInputStream(new byte[]{'a', 'b', 'c'}); + InputStream stream = new ByteArrayInputStream(new byte[] {'a', 'b', 'c'}); InputStream lookahead = new LookaheadInputStream(stream, 2); assertEquals('a', lookahead.read()); assertEquals('b', lookahead.read()); @@ -60,7 +57,7 @@ public void testBasicLookahead() throws IOException { @Test public void testZeroLookahead() throws IOException { - InputStream stream = new ByteArrayInputStream(new byte[]{'a', 'b', 'c'}); + InputStream stream = new ByteArrayInputStream(new byte[] {'a', 'b', 'c'}); InputStream lookahead = new LookaheadInputStream(stream, 0); assertEquals(-1, lookahead.read()); lookahead.close(); @@ -72,7 +69,7 @@ public void testZeroLookahead() throws IOException { @Test public void testMarkLookahead() throws IOException { - InputStream stream = new ByteArrayInputStream(new byte[]{'a', 'b', 'c'}); + InputStream stream = new ByteArrayInputStream(new byte[] {'a', 'b', 'c'}); InputStream lookahead = new LookaheadInputStream(stream, 2); lookahead.mark(1); assertEquals('a', lookahead.read()); @@ -93,7 +90,7 @@ public void testMarkLookahead() throws IOException { @Test public void testSkipLookahead() throws IOException { - InputStream stream = new ByteArrayInputStream(new byte[]{'a', 'b', 'c'}); + InputStream stream = new ByteArrayInputStream(new byte[] {'a', 'b', 'c'}); InputStream lookahead = new LookaheadInputStream(stream, 2); assertEquals(1, lookahead.skip(1)); assertEquals('b', lookahead.read()); diff --git a/tika-core/src/test/java/org/apache/tika/io/TailStreamTest.java b/tika-core/src/test/java/org/apache/tika/io/TailStreamTest.java index cfe0c15f05..a2deca1548 100644 --- 
a/tika-core/src/test/java/org/apache/tika/io/TailStreamTest.java +++ b/tika-core/src/test/java/org/apache/tika/io/TailStreamTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.io; @@ -24,7 +22,6 @@ import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStream; - import org.junit.jupiter.api.Test; /** @@ -34,16 +31,17 @@ public class TailStreamTest { /** * Constant for generating test text. */ - private static final String TEXT = "Lorem ipsum dolor sit amet, consetetur " + - "sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut " + - "labore et dolore magna aliquyam erat, sed diam voluptua. At vero" + - " eos et accusam et justo duo dolores et ea rebum. Stet clita " + - "kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor " + "sit amet."; + private static final String TEXT = "Lorem ipsum dolor sit amet, consetetur " + + "sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut " + + "labore et dolore magna aliquyam erat, sed diam voluptua. At vero" + + " eos et accusam et justo duo dolores et ea rebum. Stet clita " + + "kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor " + + "sit amet."; /** * Generates a test text using the specified parameters. * - * @param from the start index of the text + * @param from the start index of the text * @param length the length of the text * @return the generated test text */ @@ -59,7 +57,7 @@ private static String generateText(int from, int length) { /** * Generates a stream which contains a test text. 
* - * @param from the start index of the text + * @param from the start index of the text * @param length the length of the generated stream * @return the stream with the test text */ @@ -114,7 +112,7 @@ public void testTailSingleByteReads() throws IOException { TailStream stream = new TailStream(generateStream(0, 2 * count), count); readStream(stream); assertEquals(generateText(count, count), new String(stream.getTail(), UTF_8), - "Wrong buffer"); + "Wrong buffer"); } /** @@ -132,8 +130,8 @@ public void testTailChunkReads() throws IOException { while (read != -1) { read = stream.read(buf); } - assertEquals(generateText(count - tailSize, tailSize), - new String(stream.getTail(), UTF_8), "Wrong buffer"); + assertEquals(generateText(count - tailSize, tailSize), new String(stream.getTail(), UTF_8), + "Wrong buffer"); stream.close(); } @@ -150,8 +148,8 @@ public void testReadWithMarkAndReset() throws IOException { stream.read(buf); stream.reset(); readStream(stream); - assertEquals(generateText(tailSize, tailSize), - new String(stream.getTail(), UTF_8), "Wrong buffer"); + assertEquals(generateText(tailSize, tailSize), new String(stream.getTail(), UTF_8), + "Wrong buffer"); } /** @@ -165,8 +163,8 @@ public void testResetWithoutMark() throws IOException { stream.reset(); byte[] buf = new byte[count]; stream.read(buf); - assertEquals(generateText(count - tailSize, tailSize), - new String(stream.getTail(), UTF_8), "Wrong buffer"); + assertEquals(generateText(count - tailSize, tailSize), new String(stream.getTail(), UTF_8), + "Wrong buffer"); stream.close(); } @@ -181,7 +179,7 @@ public void testSkip() throws IOException { TailStream stream = new TailStream(generateStream(0, count), tailSize); assertEquals(skipCount, stream.skip(skipCount), "Wrong skip result"); assertEquals(generateText(skipCount - tailSize, tailSize), - new String(stream.getTail(), UTF_8), "Wrong buffer"); + new String(stream.getTail(), UTF_8), "Wrong buffer"); stream.close(); } @@ -193,8 +191,7 @@ public 
void testSkipEOS() throws IOException { final int count = 128; TailStream stream = new TailStream(generateStream(0, count), 2 * count); assertEquals(count, stream.skip(2 * count), "Wrong skip result"); - assertEquals(generateText(0, count), new String(stream.getTail(), UTF_8), - "Wrong buffer"); + assertEquals(generateText(0, count), new String(stream.getTail(), UTF_8), "Wrong buffer"); stream.close(); } diff --git a/tika-core/src/test/java/org/apache/tika/io/TemporaryResourcesTest.java b/tika-core/src/test/java/org/apache/tika/io/TemporaryResourcesTest.java index fffb3f3778..86b16bf468 100644 --- a/tika-core/src/test/java/org/apache/tika/io/TemporaryResourcesTest.java +++ b/tika-core/src/test/java/org/apache/tika/io/TemporaryResourcesTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.io; @@ -21,7 +19,6 @@ import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; - import org.junit.jupiter.api.Test; public class TemporaryResourcesTest { @@ -31,10 +28,11 @@ public void testFileDeletion() throws IOException { Path tempFile; try (TemporaryResources tempResources = new TemporaryResources()) { tempFile = tempResources.createTempFile(); - assertTrue(Files.exists(tempFile), "Temp file should exist while TempResources is used"); + assertTrue(Files.exists(tempFile), + "Temp file should exist while TempResources is used"); } assertTrue(Files.notExists(tempFile), - "Temp file should not exist after TempResources is closed"); + "Temp file should not exist after TempResources is closed"); } } diff --git a/tika-core/src/test/java/org/apache/tika/io/TikaInputStreamTest.java b/tika-core/src/test/java/org/apache/tika/io/TikaInputStreamTest.java index 1f8943e688..15f1db4be4 100644 --- a/tika-core/src/test/java/org/apache/tika/io/TikaInputStreamTest.java +++ b/tika-core/src/test/java/org/apache/tika/io/TikaInputStreamTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.io; @@ -29,13 +27,11 @@ import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; - import org.apache.commons.io.IOUtils; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.io.TempDir; - import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; public class TikaInputStreamTest { @@ -51,17 +47,16 @@ public void testFileBased() throws IOException { assertNull(stream.getInputStreamFactory()); assertEquals(path, TikaInputStream.get(stream).getPath(), - "The file returned by the getFile() method should" + - " be the file used to instantiate a TikaInputStream"); + "The file returned by the getFile() method should" + + " be the file used to instantiate a TikaInputStream"); assertEquals("Hello, World!", readStream(stream), - "The contents of the TikaInputStream should equal the" + - " contents of the underlying file"); + "The contents of the TikaInputStream should equal the" + + " contents of the underlying file"); stream.close(); - assertTrue(Files.exists(path), - "The close() method must not remove the file used to" + - " instantiate a TikaInputStream"); + assertTrue(Files.exists(path), "The close() method must not remove the file used to" + + " instantiate a TikaInputStream"); } @@ -80,28 +75,29 @@ public void testStreamBased() throws IOException { assertNull(stream.getInputStreamFactory()); assertEquals("Hello, World!", readFile(file), - "The contents of the file returned by the getFile method" + - " should equal the contents of the TikaInputStream"); + "The contents of the file returned by the getFile method" + + " should equal the contents of the TikaInputStream"); assertEquals("Hello, World!", readStream(stream), - "The contents of the TikaInputStream should not get modified" + - " by reading the file first"); + "The contents of the TikaInputStream should not get modified" + + " by 
reading the file first"); stream.close(); assertFalse(Files.exists(file), - "The close() method must remove the temporary file created by a TikaInputStream"); + "The close() method must remove the temporary file created by a TikaInputStream"); } @Test public void testInputStreamFactoryBased() throws IOException { - TikaInputStream stream = TikaInputStream.get(() -> IOUtils.toInputStream("Hello, World!", UTF_8)); + TikaInputStream stream = + TikaInputStream.get(() -> IOUtils.toInputStream("Hello, World!", UTF_8)); assertFalse(stream.hasFile()); assertNull(stream.getOpenContainer()); assertNotNull(stream.getInputStreamFactory()); assertEquals("Hello, World!", readStream(stream), - "The contents of the TikaInputStream should not get modified" + - " by reading the file first"); + "The contents of the TikaInputStream should not get modified" + + " by reading the file first"); stream.close(); } @@ -126,7 +122,7 @@ public void testGetMetadata() throws Exception { TikaInputStream.get(url, metadata).close(); assertEquals("test.txt", metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY)); assertEquals(Long.toString(Files.size(Paths.get(url.toURI()))), - metadata.get(Metadata.CONTENT_LENGTH)); + metadata.get(Metadata.CONTENT_LENGTH)); } } diff --git a/tika-core/src/test/java/org/apache/tika/language/detect/LanguageNamesTest.java b/tika-core/src/test/java/org/apache/tika/language/detect/LanguageNamesTest.java index a035574281..6bbbb953cd 100644 --- a/tika-core/src/test/java/org/apache/tika/language/detect/LanguageNamesTest.java +++ b/tika-core/src/test/java/org/apache/tika/language/detect/LanguageNamesTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.language.detect; diff --git a/tika-core/src/test/java/org/apache/tika/metadata/TestMetadata.java b/tika-core/src/test/java/org/apache/tika/metadata/TestMetadata.java index 2ce1b8b6c6..1965fce44c 100644 --- a/tika-core/src/test/java/org/apache/tika/metadata/TestMetadata.java +++ b/tika-core/src/test/java/org/apache/tika/metadata/TestMetadata.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.metadata; @@ -35,13 +33,11 @@ import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.Future; - -import org.junit.jupiter.api.Test; - import org.apache.tika.TikaTest; import org.apache.tika.utils.DateUtils; +import org.junit.jupiter.api.Test; -//Junit imports +// Junit imports /** * JUnit based tests of class {@link org.apache.tika.metadata.Metadata}. 
@@ -87,7 +83,7 @@ public void testAdd() { meta.add(nonMultiValued, "value2"); fail("add should fail on the second call of a non-multi valued item"); } catch (PropertyTypeException e) { - //swallow + // swallow } } @@ -248,8 +244,7 @@ public void testObject() { } /** - * Tests for getting and setting integer - * based properties + * Tests for getting and setting integer based properties */ @Test public void testGetSetInt() { @@ -264,13 +259,13 @@ public void testGetSetInt() { meta.set(Metadata.BITS_PER_SAMPLE, 1); fail("Shouldn't be able to set a multi valued property as an int"); } catch (PropertyTypeException e) { - //swallow + // swallow } try { meta.set(TikaCoreProperties.CREATED, 1); fail("Shouldn't be able to set a date property as an int"); } catch (PropertyTypeException e) { - //swallow + // swallow } // Can set it and retrieve it @@ -291,8 +286,7 @@ public void testGetSetInt() { } /** - * Tests for getting and setting date - * based properties + * Tests for getting and setting date based properties */ @Test public void testGetSetDate() { @@ -308,13 +302,13 @@ public void testGetSetDate() { meta.set(Metadata.BITS_PER_SAMPLE, new Date(1000)); fail("Shouldn't be able to set a multi valued property as a date"); } catch (PropertyTypeException e) { - //swallow + // swallow } try { meta.set(Metadata.IMAGE_WIDTH, new Date(1000)); fail("Shouldn't be able to set an int property as an date"); } catch (PropertyTypeException e) { - //swallow + // swallow } // Can set it and retrieve it @@ -334,7 +328,7 @@ public void testGetSetDate() { assertEquals(null, meta.getInt(TikaCoreProperties.CREATED)); // Our format doesn't include milliseconds - // This means things get rounded + // This means things get rounded meta.set(TikaCoreProperties.CREATED, new Date(1050)); assertEquals("1970-01-01T00:00:01Z", meta.get(TikaCoreProperties.CREATED)); assertEquals(1000, meta.getDate(TikaCoreProperties.CREATED).getTime()); @@ -367,8 +361,8 @@ public void testGetSetDate() { } /** - * Some 
documents, like jpegs, might have date in unspecified time zone - * which should be handled like strings but verified to have parseable ISO 8601 format + * Some documents, like jpegs, might have date in unspecified time zone which should be handled + * like strings but verified to have parseable ISO 8601 format */ @Test public void testGetSetDateUnspecifiedTimezone() { @@ -377,25 +371,25 @@ public void testGetSetDateUnspecifiedTimezone() { // Set explictly without a timezone meta.set(TikaCoreProperties.CREATED, "1970-01-01T00:00:01"); assertEquals("1970-01-01T00:00:01", meta.get(TikaCoreProperties.CREATED), - "should return string without time zone specifier because zone is not known"); + "should return string without time zone specifier because zone is not known"); // Now ask DateUtils to format for us without one meta.set(TikaCoreProperties.CREATED, DateUtils.formatDateUnknownTimezone(new Date(1000))); assertEquals("1970-01-01T00:00:01", meta.get(TikaCoreProperties.CREATED), - "should return string without time zone specifier because zone is not known"); + "should return string without time zone specifier because zone is not known"); } /** - * Defines a composite property, then checks that when set as the - * composite the value can be retrieved with the property or the aliases + * Defines a composite property, then checks that when set as the composite the value can be + * retrieved with the property or the aliases */ @SuppressWarnings("deprecation") @Test public void testCompositeProperty() { Metadata meta = new Metadata(); Property compositeProperty = Property.composite(DublinCore.DESCRIPTION, - new Property[]{TikaCoreProperties.DESCRIPTION, - Property.internalText("testDescriptionAlt")}); + new Property[] {TikaCoreProperties.DESCRIPTION, + Property.internalText("testDescriptionAlt")}); String message = "composite description"; meta.set(compositeProperty, message); @@ -412,7 +406,7 @@ public void testMultithreadedDates() throws Exception { int numThreads = 10; 
ExecutorService executorService = Executors.newFixedThreadPool(numThreads); ExecutorCompletionService executorCompletionService = - new ExecutorCompletionService<>(executorService); + new ExecutorCompletionService<>(executorService); for (int i = 0; i < numThreads; i++) { executorCompletionService.submit(new MetadataDateAdder()); } @@ -505,9 +499,8 @@ public Integer call() throws Exception { DateFormat df = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'", Locale.US); df.setTimeZone(TimeZone.getTimeZone("UTC")); m.set(TikaCoreProperties.CREATED, df.format(now)); - assertTrue( - Math.abs(now.getTime() - m.getDate(TikaCoreProperties.CREATED).getTime()) < - 2000); + assertTrue(Math.abs(now.getTime() + - m.getDate(TikaCoreProperties.CREATED).getTime()) < 2000); } return 1; diff --git a/tika-core/src/test/java/org/apache/tika/metadata/filter/MockUpperCaseFilter.java b/tika-core/src/test/java/org/apache/tika/metadata/filter/MockUpperCaseFilter.java index ac64734c20..3e82f82a86 100644 --- a/tika-core/src/test/java/org/apache/tika/metadata/filter/MockUpperCaseFilter.java +++ b/tika-core/src/test/java/org/apache/tika/metadata/filter/MockUpperCaseFilter.java @@ -1,23 +1,20 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.metadata.filter; import java.util.Locale; - import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; diff --git a/tika-core/src/test/java/org/apache/tika/metadata/filter/TestMetadataFilter.java b/tika-core/src/test/java/org/apache/tika/metadata/filter/TestMetadataFilter.java index 91e4bd3bed..f282abc2b9 100644 --- a/tika-core/src/test/java/org/apache/tika/metadata/filter/TestMetadataFilter.java +++ b/tika-core/src/test/java/org/apache/tika/metadata/filter/TestMetadataFilter.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. 
The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.metadata.filter; @@ -24,14 +22,12 @@ import java.util.Arrays; import java.util.HashSet; import java.util.Set; - -import org.junit.jupiter.api.Test; - import org.apache.tika.config.AbstractTikaConfigTest; import org.apache.tika.config.TikaConfig; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.mime.MediaType; +import org.junit.jupiter.api.Test; public class TestMetadataFilter extends AbstractTikaConfigTest { @@ -111,7 +107,7 @@ public void testConfigExcludeFilter() throws Exception { @Test public void testConfigIncludeAndUCFilter() throws Exception { TikaConfig config = getConfig("TIKA-3137-include-uc.xml"); - String[] expectedTitles = new String[]{"TITLE1", "TITLE2", "TITLE3"}; + String[] expectedTitles = new String[] {"TITLE1", "TITLE2", "TITLE3"}; Metadata metadata = new Metadata(); metadata.add("title", "title1"); metadata.add("title", "title2"); @@ -182,8 +178,8 @@ public void testFieldNameMapping() throws Exception { @Test public void 
testDateNormalizingFilter() throws Exception { - //test that a Date lacking a timezone, if interpreted as Los Angeles, for example, - //yields a UTC string that is properly +7 hours. + // test that a Date lacking a timezone, if interpreted as Los Angeles, for example, + // yields a UTC string that is properly +7 hours. Metadata m = new Metadata(); m.set(TikaCoreProperties.CREATED, "2021-07-23T01:02:24"); DateNormalizingMetadataFilter filter = new DateNormalizingMetadataFilter(); @@ -248,7 +244,8 @@ public void testCaptureGroupOverwrite() throws Exception { public void testAttachmentTypeMetadataFilter() throws Exception { TikaConfig config = getConfig("TIKA-4261-clear-by-embedded-type.xml"); Metadata metadata = new Metadata(); - metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.INLINE.name()); + metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, + TikaCoreProperties.EmbeddedResourceType.INLINE.name()); metadata.set(Metadata.CONTENT_TYPE, "text/html; charset=UTF-8"); MetadataFilter filter = config.getMetadataFilter(); @@ -256,8 +253,8 @@ public void testAttachmentTypeMetadataFilter() throws Exception { assertEquals(0, metadata.names().length); metadata = new Metadata(); - metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.ALTERNATE_FORMAT_CHUNK - .name()); + metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, + TikaCoreProperties.EmbeddedResourceType.ALTERNATE_FORMAT_CHUNK.name()); metadata.set(Metadata.CONTENT_TYPE, "text/html; charset=UTF-8"); filter.filter(metadata); assertEquals(2, metadata.names().length); diff --git a/tika-core/src/test/java/org/apache/tika/metadata/listfilter/AttachmentCountingListFilter.java b/tika-core/src/test/java/org/apache/tika/metadata/listfilter/AttachmentCountingListFilter.java index daa68c9280..74e28a63f6 100644 --- a/tika-core/src/test/java/org/apache/tika/metadata/listfilter/AttachmentCountingListFilter.java +++ 
b/tika-core/src/test/java/org/apache/tika/metadata/listfilter/AttachmentCountingListFilter.java @@ -1,23 +1,20 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.metadata.listfilter; import java.util.List; - import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; @@ -27,7 +24,8 @@ public List filter(List metadataList) throws TikaException { if (metadataList == null || metadataList.isEmpty()) { return metadataList; } - metadataList.get(0).set("X-TIKA:attachment_count", Integer.toString(metadataList.size() - 1)); + metadataList.get(0).set("X-TIKA:attachment_count", + Integer.toString(metadataList.size() - 1)); return metadataList; } } diff --git a/tika-core/src/test/java/org/apache/tika/metadata/listfilter/MetadataListFilterTest.java b/tika-core/src/test/java/org/apache/tika/metadata/listfilter/MetadataListFilterTest.java index ad5aa1a19e..3d8881180e 100644 --- a/tika-core/src/test/java/org/apache/tika/metadata/listfilter/MetadataListFilterTest.java +++ b/tika-core/src/test/java/org/apache/tika/metadata/listfilter/MetadataListFilterTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.metadata.listfilter; @@ -20,22 +18,22 @@ import static org.junit.jupiter.api.Assertions.assertTrue; import java.io.InputStream; - -import org.junit.jupiter.api.Test; - import org.apache.tika.config.TikaConfig; +import org.junit.jupiter.api.Test; public class MetadataListFilterTest { @Test public void testBasic() throws Exception { TikaConfig tikaConfig; - try (InputStream is = MetadataListFilterTest.class.getResourceAsStream( - "metadatalistfilter-config.xml")) { + try (InputStream is = MetadataListFilterTest.class + .getResourceAsStream("metadatalistfilter-config.xml")) { tikaConfig = new TikaConfig(is); } - CompositeMetadataListFilter compositeMetadataListFilter = (CompositeMetadataListFilter) tikaConfig.getMetadataListFilter(); + CompositeMetadataListFilter compositeMetadataListFilter = + (CompositeMetadataListFilter) tikaConfig.getMetadataListFilter(); assertEquals(1, compositeMetadataListFilter.getFilters().size()); - assertTrue(compositeMetadataListFilter.getFilters().get(0) instanceof AttachmentCountingListFilter); + assertTrue(compositeMetadataListFilter.getFilters() + .get(0) instanceof AttachmentCountingListFilter); } } diff --git 
a/tika-core/src/test/java/org/apache/tika/metadata/writefilter/StandardWriteFilterTest.java b/tika-core/src/test/java/org/apache/tika/metadata/writefilter/StandardWriteFilterTest.java index 4fb98cffa7..ec7880bca6 100644 --- a/tika-core/src/test/java/org/apache/tika/metadata/writefilter/StandardWriteFilterTest.java +++ b/tika-core/src/test/java/org/apache/tika/metadata/writefilter/StandardWriteFilterTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. 
See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.metadata.writefilter; @@ -26,9 +24,6 @@ import java.util.Collections; import java.util.List; import java.util.Set; - -import org.junit.jupiter.api.Test; - import org.apache.tika.TikaTest; import org.apache.tika.config.TikaConfig; import org.apache.tika.config.TikaConfigTest; @@ -42,6 +37,7 @@ import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.AutoDetectParserConfig; import org.apache.tika.parser.ParseContext; +import org.junit.jupiter.api.Test; public class StandardWriteFilterTest extends TikaTest { @@ -49,22 +45,21 @@ public class StandardWriteFilterTest extends TikaTest { @Test public void testMetadataFactoryConfig() throws Exception { TikaConfig tikaConfig = - new TikaConfig(TikaConfigTest.class.getResourceAsStream("TIKA-3695.xml")); + new TikaConfig(TikaConfigTest.class.getResourceAsStream("TIKA-3695.xml")); AutoDetectParserConfig config = tikaConfig.getAutoDetectParserConfig(); MetadataWriteFilterFactory factory = config.getMetadataWriteFilterFactory(); assertEquals(350, ((StandardWriteFilterFactory) factory).getMaxTotalEstimatedBytes()); AutoDetectParser parser = new AutoDetectParser(tikaConfig); - String mock = "" + - ""; + String mock = "" + ""; for (int i = 0; i < 20; i++) { mock += "01234567890123456789"; } mock += " hello \n"; mock += ""; Metadata metadata = new Metadata(); - List metadataList = - getRecursiveMetadata(new ByteArrayInputStream(mock.getBytes(StandardCharsets.UTF_8)), - parser, metadata, new ParseContext(), true); + List metadataList = getRecursiveMetadata( + new ByteArrayInputStream(mock.getBytes(StandardCharsets.UTF_8)), parser, + metadata, new ParseContext(), true); assertEquals(1, metadataList.size()); metadata = metadataList.get(0); @@ -77,16 +72,15 @@ public void testMetadataFactoryConfig() throws Exception { @Test public void testMetadataFactoryFieldsConfig() throws Exception { - 
TikaConfig tikaConfig = - new TikaConfig(TikaConfigTest.class.getResourceAsStream("TIKA-3695-fields.xml")); + TikaConfig tikaConfig = new TikaConfig( + TikaConfigTest.class.getResourceAsStream("TIKA-3695-fields.xml")); AutoDetectParserConfig config = tikaConfig.getAutoDetectParserConfig(); MetadataWriteFilterFactory factory = config.getMetadataWriteFilterFactory(); assertEquals(241, ((StandardWriteFilterFactory) factory).getMaxTotalEstimatedBytes()); assertEquals(999, ((StandardWriteFilterFactory) factory).getMaxKeySize()); assertEquals(10001, ((StandardWriteFilterFactory) factory).getMaxFieldSize()); AutoDetectParser parser = new AutoDetectParser(tikaConfig); - String mock = "" + - ""; + String mock = "" + ""; mock += "this is not a title"; mock += "this is a title"; for (int i = 0; i < 20; i++) { @@ -97,20 +91,20 @@ public void testMetadataFactoryFieldsConfig() throws Exception { Metadata metadata = new Metadata(); metadata.add("dc:creator", "abcdefghijabcdefghij"); metadata.add("not-allowed", "not-allowed"); - List metadataList = - getRecursiveMetadata(new ByteArrayInputStream(mock.getBytes(StandardCharsets.UTF_8)), - parser, metadata, new ParseContext(), true); + List metadataList = getRecursiveMetadata( + new ByteArrayInputStream(mock.getBytes(StandardCharsets.UTF_8)), parser, + metadata, new ParseContext(), true); assertEquals(1, metadataList.size()); metadata = metadataList.get(0); - //test that this was removed during the filter existing stage + // test that this was removed during the filter existing stage assertNull(metadata.get("not-allowed")); - //test that this was not allowed because it isn't in the "include" list + // test that this was not allowed because it isn't in the "include" list assertNull(metadata.get("dc:subject")); String[] creators = metadata.getValues("dc:creator"); assertEquals("abcdefghijabcdefghij", creators[0]); - //this gets more than the other test because this is filtering out some fields + // this gets more than the other test 
because this is filtering out some fields assertEquals(3, creators.length); assertEquals("012345678901234", creators[2]); assertContainsCount(" hello ", metadata.get(TikaCoreProperties.TIKA_CONTENT), 30); @@ -119,16 +113,16 @@ public void testMetadataFactoryFieldsConfig() throws Exception { @Test public void testKeySizeFilter() throws Exception { - Metadata metadata = filter(10, 1000, 10000, 100, - Collections.EMPTY_SET, Collections.EMPTY_SET, true); - //test that must add keys are not truncated + Metadata metadata = filter(10, 1000, 10000, 100, Collections.EMPTY_SET, + Collections.EMPTY_SET, true); + // test that must add keys are not truncated metadata.add(TikaCoreProperties.TIKA_PARSED_BY, "some-long-parser1"); metadata.add(TikaCoreProperties.TIKA_PARSED_BY, "some-long-parser2"); metadata.add(TikaCoreProperties.TIKA_PARSED_BY, "some-long-parser3"); assertEquals(3, metadata.getValues(TikaCoreProperties.TIKA_PARSED_BY).length); metadata.add(OfficeOpenXMLExtended.DOC_SECURITY_STRING, "some doc-security-string"); - //truncated to 10 bytes in UTF-16 = 5 characters + // truncated to 10 bytes in UTF-16 = 5 characters assertEquals("some doc-security-string", metadata.getValues("exten")[0]); assertTruncated(metadata); @@ -139,16 +133,15 @@ public void testKeySizeFilter() throws Exception { @Test public void testAfterMaxHit() throws Exception { - String k = "dc:creator";//20 bytes - //key is > maxTotalBytes, so the value isn't even added - Metadata metadata = filter(100, 10000, 10, - 100, Collections.EMPTY_SET, Collections.EMPTY_SET, false); + String k = "dc:creator";// 20 bytes + // key is > maxTotalBytes, so the value isn't even added + Metadata metadata = filter(100, 10000, 10, 100, Collections.EMPTY_SET, + Collections.EMPTY_SET, false); metadata.set(k, "ab"); assertEquals(1, metadata.names().length); assertEquals("true", metadata.get(TikaCoreProperties.TRUNCATED_METADATA)); - metadata = filter(100, 10000, 50, 100, - Collections.EMPTY_SET, Collections.EMPTY_SET, 
false); + metadata = filter(100, 10000, 50, 100, Collections.EMPTY_SET, Collections.EMPTY_SET, false); for (int i = 0; i < 10; i++) { metadata.set(k, "abcde"); } @@ -157,10 +150,10 @@ public void testAfterMaxHit() throws Exception { assertEquals("abcde", metadata.getValues(k)[0]); assertNull(metadata.get(TikaCoreProperties.TRUNCATED_METADATA)); - metadata.add(k, "abcde");//40 - metadata.add(k, "abc");//46 - metadata.add(k, "abcde");//only the first character is taken from this - metadata.add(k, "abcde");//this shouldn't even be countenanced + metadata.add(k, "abcde");// 40 + metadata.add(k, "abc");// 46 + metadata.add(k, "abcde");// only the first character is taken from this + metadata.add(k, "abcde");// this shouldn't even be countenanced assertEquals(2, metadata.names().length); assertEquals(4, metadata.getValues(k).length); @@ -170,8 +163,8 @@ public void testAfterMaxHit() throws Exception { assertEquals("a", metadata.getValues(k)[3]); assertEquals("true", metadata.get(TikaCoreProperties.TRUNCATED_METADATA)); - //this will force a reset of the total max bytes because - //this is a set, not an add. This should get truncated at 15 chars = 30 bytes + // this will force a reset of the total max bytes because + // this is a set, not an add. 
This should get truncated at 15 chars = 30 bytes metadata.set(k, "abcdefghijklmnopqrstuvwx"); assertEquals(2, metadata.names().length); assertEquals(1, metadata.getValues(k).length); @@ -181,15 +174,15 @@ public void testAfterMaxHit() throws Exception { @Test public void testMinSizeForAlwaysInclude() throws Exception { - //test that mimes don't get truncated - Metadata metadata = filter(100, 10, 10000, 100, - Collections.EMPTY_SET, Collections.EMPTY_SET, true); + // test that mimes don't get truncated + Metadata metadata = filter(100, 10, 10000, 100, Collections.EMPTY_SET, + Collections.EMPTY_SET, true); String mime = getLongestMime().toString(); metadata.set(Metadata.CONTENT_TYPE, mime); assertEquals(mime, metadata.get(Metadata.CONTENT_TYPE)); - //test that other fields are truncated + // test that other fields are truncated metadata.set("dc:title", "abcdefghij"); assertEquals("abcde", metadata.get("dc:title")); assertTruncated(metadata); @@ -197,8 +190,8 @@ public void testMinSizeForAlwaysInclude() throws Exception { @Test public void testMaxFieldValues() throws Exception { - Metadata metadata = filter(100, 10000, 10000, 3, - Collections.EMPTY_SET, Collections.EMPTY_SET, true); + Metadata metadata = filter(100, 10000, 10000, 3, Collections.EMPTY_SET, + Collections.EMPTY_SET, true); for (int i = 0; i < 10; i++) { metadata.add(TikaCoreProperties.SUBJECT, "ab"); } @@ -207,19 +200,21 @@ public void testMaxFieldValues() throws Exception { @Test public void testAddOrder() throws Exception { - StandardWriteFilter standardWriteFilter = new StandardWriteFilter(100, 1000, 100000, 10, Set.of(), Set.of(), true); + StandardWriteFilter standardWriteFilter = + new StandardWriteFilter(100, 1000, 100000, 10, Set.of(), Set.of(), true); Metadata m = new Metadata(); m.setMetadataWriteFilter(standardWriteFilter); m.add("test", "foo"); m.add("test", "bar"); m.add("test", "baz"); - assertArrayEquals(new String[]{"foo", "bar", "baz"}, m.getValues("test")); + assertArrayEquals(new 
String[] {"foo", "bar", "baz"}, m.getValues("test")); } @Test public void testNullValues() throws Exception { - StandardWriteFilter standardWriteFilter = new StandardWriteFilter(100, 1000, 100000, 10, Set.of(), Set.of(), true); + StandardWriteFilter standardWriteFilter = + new StandardWriteFilter(100, 1000, 100000, 10, Set.of(), Set.of(), true); Metadata m = new Metadata(); m.set("test", "foo"); m.setMetadataWriteFilter(standardWriteFilter); @@ -228,15 +223,16 @@ public void testNullValues() throws Exception { assertEquals(0, m.names().length); assertNull(m.get("test")); - //now test adding + // now test adding m = new Metadata(); m.add("test", "foo"); m.add("test", null); - //Not sure this is the behavior we want, but it is what we're currently doing. - assertArrayEquals(new String[]{"foo"}, m.getValues("test")); + // Not sure this is the behavior we want, but it is what we're currently doing. + assertArrayEquals(new String[] {"foo"}, m.getValues("test")); - //now check when empty not allowed - standardWriteFilter = new StandardWriteFilter(100, 1000, 100000, 10, Set.of(), Set.of(), false); + // now check when empty not allowed + standardWriteFilter = + new StandardWriteFilter(100, 1000, 100000, 10, Set.of(), Set.of(), false); m = new Metadata(); m.set("test", "foo"); m.setMetadataWriteFilter(standardWriteFilter); @@ -256,7 +252,8 @@ public void testNullValues() throws Exception { @Test public void testNullKeys() { - StandardWriteFilter standardWriteFilter = new StandardWriteFilter(100, 1000, 100000, 10, Set.of(), Set.of(), true); + StandardWriteFilter standardWriteFilter = + new StandardWriteFilter(100, 1000, 100000, 10, Set.of(), Set.of(), true); Metadata m = new Metadata(); m.setMetadataWriteFilter(standardWriteFilter); Exception ex = assertThrows(NullPointerException.class, () -> { @@ -278,20 +275,19 @@ public void testNullKeys() { @Test public void testExclude() throws Exception { - TikaConfig tikaConfig = - new 
TikaConfig(TikaConfigTest.class.getResourceAsStream("TIKA-3695-exclude.xml")); + TikaConfig tikaConfig = new TikaConfig( + TikaConfigTest.class.getResourceAsStream("TIKA-3695-exclude.xml")); AutoDetectParser parser = new AutoDetectParser(tikaConfig); - String mock = "" + - ""; + String mock = "" + ""; mock += "01234567890123456789"; mock += "01234567890123456789"; mock += "01234567890123456789"; mock += " hello \n"; mock += ""; Metadata metadata = new Metadata(); - List metadataList = - getRecursiveMetadata(new ByteArrayInputStream(mock.getBytes(StandardCharsets.UTF_8)), - parser, metadata, new ParseContext(), true); + List metadataList = getRecursiveMetadata( + new ByteArrayInputStream(mock.getBytes(StandardCharsets.UTF_8)), parser, + metadata, new ParseContext(), true); assertEquals(1, metadataList.size()); metadata = metadataList.get(0); assertEquals(9, metadata.names().length); @@ -304,11 +300,13 @@ public void testExclude() throws Exception { private void assertTruncated(Metadata metadata) { assertEquals("true", metadata.get(TikaCoreProperties.TRUNCATED_METADATA)); } + private Metadata filter(int maxKeySize, int maxFieldSize, int maxTotalBytes, - int maxValuesPerField, - Set includeFields, Set excludeFields, boolean includeEmpty) { + int maxValuesPerField, Set includeFields, Set excludeFields, + boolean includeEmpty) { MetadataWriteFilter filter = new StandardWriteFilter(maxKeySize, maxFieldSize, - maxTotalBytes, maxValuesPerField, includeFields, excludeFields, includeEmpty); + maxTotalBytes, maxValuesPerField, includeFields, excludeFields, + includeEmpty); Metadata metadata = new Metadata(); metadata.setMetadataWriteFilter(filter); return metadata; diff --git a/tika-core/src/test/java/org/apache/tika/mime/CustomReaderTest.java b/tika-core/src/test/java/org/apache/tika/mime/CustomReaderTest.java index 6c57740873..96bec70a1a 100644 --- a/tika-core/src/test/java/org/apache/tika/mime/CustomReaderTest.java +++ 
b/tika-core/src/test/java/org/apache/tika/mime/CustomReaderTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.mime; @@ -24,7 +22,6 @@ import java.util.HashMap; import java.util.List; import java.util.Map; - import org.junit.jupiter.api.Test; import org.xml.sax.Attributes; import org.xml.sax.SAXException; @@ -55,7 +52,7 @@ public void testCustomReader() throws Exception { assertEquals("kittens", reader.values.get(key)); assertEquals(1, reader.ignorePatterns.size()); assertEquals(another.toString() + ">>*" + hello.getExtension(), - reader.ignorePatterns.get(0)); + reader.ignorePatterns.get(0)); assertTrue(another.isInterpreted(), "Server-side script type not detected"); } @@ -70,7 +67,7 @@ static class CustomMimeTypesReader extends MimeTypesReader { @Override public void startElement(String uri, String localName, String qName, Attributes attributes) - throws SAXException { + throws SAXException { super.startElement(uri, localName, qName, attributes); if ("hello".equals(qName)) { characters = new StringBuilder(); @@ -90,7 +87,7 @@ public void endElement(String uri, String localName, String qName) { @Override protected void handleGlobError(MimeType type, String pattern, MimeTypeException ex, - String qName, Attributes attributes) throws SAXException { + String qName, Attributes attributes) throws SAXException { ignorePatterns.add(type.toString() + ">>" + pattern); } } diff --git a/tika-core/src/test/java/org/apache/tika/mime/MediaTypeTest.java b/tika-core/src/test/java/org/apache/tika/mime/MediaTypeTest.java index 64a2bebf82..9f97962296 100644 --- a/tika-core/src/test/java/org/apache/tika/mime/MediaTypeTest.java +++ b/tika-core/src/test/java/org/apache/tika/mime/MediaTypeTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.mime; @@ -23,7 +21,6 @@ import java.util.HashMap; import java.util.Map; - import org.junit.jupiter.api.Test; public class MediaTypeTest { @@ -31,7 +28,7 @@ public class MediaTypeTest { @Test public void testBasics() { assertEquals("application/octet-stream", - new MediaType("application", "octet-stream").toString()); + new MediaType("application", "octet-stream").toString()); assertEquals("text/plain", new MediaType("text", "plain").toString()); @@ -40,11 +37,11 @@ public void testBasics() { parameters.put("charset", "UTF-8"); assertEquals("text/plain; charset=UTF-8", - new MediaType("text", "plain", parameters).toString()); + new MediaType("text", "plain", parameters).toString()); parameters.put("x-eol-style", "crlf"); assertEquals("text/plain; charset=UTF-8; x-eol-style=crlf", - new MediaType("text", "plain", parameters).toString()); + new MediaType("text", "plain", parameters).toString()); } @Test @@ -57,11 +54,11 @@ public void testLowerCase() { parameters.put("CHARSET", "UTF-8"); assertEquals("text/plain; charset=UTF-8", - new MediaType("TEXT", "plain", parameters).toString()); + new MediaType("TEXT", "plain", parameters).toString()); parameters.put("X-Eol-Style", "crlf"); assertEquals("text/plain; charset=UTF-8; x-eol-style=crlf", - new MediaType("TeXt", "PlAiN", parameters).toString()); + new MediaType("TeXt", "PlAiN", parameters).toString()); } @Test @@ -74,11 +71,11 @@ public void testTrim() { parameters.put(" charset", "UTF-8"); assertEquals("text/plain; charset=UTF-8", - new MediaType("\n\ntext", "plain \r", parameters).toString()); + new MediaType("\n\ntext", "plain \r", parameters).toString()); parameters.put("\r\n\tx-eol-style \t", "crlf"); assertEquals("text/plain; charset=UTF-8; x-eol-style=crlf", - new MediaType(" text", "\tplain ", parameters).toString()); + new MediaType(" text", "\tplain ", parameters).toString()); } @Test @@ -87,9 +84,9 @@ public void testQuote() { parameters.put("a", " value with spaces "); 
parameters.put("b", "text/plain"); parameters.put("c", "()<>@,;:\\\"/[]?="); - assertEquals("text/plain; a=\" value with spaces \"; b=\"text\\/plain\"" + - "; c=\"\\(\\)\\<\\>\\@\\,\\;\\:\\\\\\\"\\/\\[\\]\\?\\=\"", - new MediaType("text", "plain", parameters).toString()); + assertEquals("text/plain; a=\" value with spaces \"; b=\"text\\/plain\"" + + "; c=\"\\(\\)\\<\\>\\@\\,\\;\\:\\\\\\\"\\/\\[\\]\\?\\=\"", + new MediaType("text", "plain", parameters).toString()); } /** @@ -177,13 +174,13 @@ public void testParseNoParamsWithSemi() { @Test public void testOddParameters() { assertEquals("text/html; charset=UTF-8", - MediaType.parse("text/html;; charset=UTF-8").toString()); + MediaType.parse("text/html;; charset=UTF-8").toString()); assertEquals("text/html; charset=UTF-8", - MediaType.parse("text/html;; charset=UTF-8").toString()); + MediaType.parse("text/html;; charset=UTF-8").toString()); assertEquals("text/html; charset=UTF-8", - MediaType.parse("text/html;; charset=\"UTF-8\"").toString()); + MediaType.parse("text/html;; charset=\"UTF-8\"").toString()); assertEquals("text/html; charset=UTF-8", - MediaType.parse("text/html;; charset=\"UTF-8").toString()); + MediaType.parse("text/html;; charset=\"UTF-8").toString()); } } diff --git a/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java b/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java index 690290cbda..b2c082a0e6 100644 --- a/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java +++ b/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.mime; @@ -27,15 +25,13 @@ import java.io.IOException; import java.io.InputStream; import java.net.URL; - import org.apache.commons.io.ByteOrderMark; import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream; -import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.api.Test; - import org.apache.tika.config.TikaConfig; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; public class MimeDetectionTest { @@ -66,7 +62,7 @@ public void testDetection() throws Exception { testFile("application/xml", "test-long-comment.xml"); testFile("application/xslt+xml", "stylesheet.xsl"); testUrl("application/rdf+xml", "http://www.ai.sri.com/daml/services/owl-s/1.2/Process.owl", - "test-difficult-rdf1.xml"); + "test-difficult-rdf1.xml"); testUrl("application/rdf+xml", "http://www.w3.org/2002/07/owl#", "test-difficult-rdf2.xml"); // add evil test from TIKA-327 testFile("text/html", "test-tika-327.html"); @@ -81,7 +77,7 @@ public void testDetection() throws Exception { // test HTML detection of malformed file, previously identified as image/cgm (TIKA-1170) testFile("text/html", "test-malformed-header.html.bin"); - //test GCMD Directory Interchange Format (.dif) TIKA-1561 + // test GCMD Directory Interchange Format (.dif) TIKA-1561 testFile("application/dif+xml", "brwNIMS_2014.dif"); // truncated xml should still be detected as xml, See TIKA-3596 @@ -105,62 +101,65 @@ public void testDetectionWithoutContent() throws IOException { @Test public void testByteOrderMark() throws Exception { - assertEquals(MediaType.TEXT_PLAIN, MIME_TYPES - .detect(new ByteArrayInputStream("\ufefftest".getBytes(UTF_16LE)), new Metadata())); - assertEquals(MediaType.TEXT_PLAIN, MIME_TYPES - .detect(new ByteArrayInputStream("\ufefftest".getBytes(UTF_16BE)), new Metadata())); - assertEquals(MediaType.TEXT_PLAIN, MIME_TYPES - .detect(new 
ByteArrayInputStream("\ufefftest".getBytes(UTF_8)), new Metadata())); + assertEquals(MediaType.TEXT_PLAIN, MIME_TYPES.detect( + new ByteArrayInputStream("\ufefftest".getBytes(UTF_16LE)), new Metadata())); + assertEquals(MediaType.TEXT_PLAIN, MIME_TYPES.detect( + new ByteArrayInputStream("\ufefftest".getBytes(UTF_16BE)), new Metadata())); + assertEquals(MediaType.TEXT_PLAIN, MIME_TYPES.detect( + new ByteArrayInputStream("\ufefftest".getBytes(UTF_8)), new Metadata())); } @Test public void testRFC822WithBOM() throws Exception { - String header = "From: blah \r\n" + "Received: Friday, January 24, 2020 3:24 PM\r\n" + - "To: someone@somewhere.com\r\n" + "Cc: someone-else@other.com\r\n" + - "Subject: Received\r\n"; + String header = "From: blah \r\n" + + "Received: Friday, January 24, 2020 3:24 PM\r\n" + + "To: someone@somewhere.com\r\n" + "Cc: someone-else@other.com\r\n" + + "Subject: Received\r\n"; MediaType rfc822 = MediaType.parse("message/rfc822"); - assertEquals(rfc822, MIME_TYPES.detect(UnsynchronizedByteArrayInputStream - .builder() - .setByteArray(header.getBytes(UTF_8)) - .get(), new Metadata())); + assertEquals(rfc822, + MIME_TYPES.detect( + UnsynchronizedByteArrayInputStream.builder() + .setByteArray(header.getBytes(UTF_8)).get(), + new Metadata())); int utfLength = ByteOrderMark.UTF_8.length(); byte[] bytes = new byte[header.getBytes(UTF_8).length + utfLength]; System.arraycopy(ByteOrderMark.UTF_8.getBytes(), 0, bytes, 0, utfLength); System.arraycopy(header.getBytes(UTF_8), 0, bytes, 3, header.getBytes(UTF_8).length); - assertEquals(rfc822, MIME_TYPES.detect(UnsynchronizedByteArrayInputStream - .builder() - .setByteArray(bytes) - .get(), new Metadata())); + assertEquals(rfc822, + MIME_TYPES.detect( + UnsynchronizedByteArrayInputStream.builder() + .setByteArray(bytes).get(), + new Metadata())); } @Test public void testSuperTypes() { assertTrue(REGISTRY.isSpecializationOf(MediaType.parse("text/something; charset=UTF-8"), - MediaType.parse("text/something"))); + 
MediaType.parse("text/something"))); assertTrue(REGISTRY.isSpecializationOf(MediaType.parse("text/something; charset=UTF-8"), - MediaType.TEXT_PLAIN)); + MediaType.TEXT_PLAIN)); assertTrue(REGISTRY.isSpecializationOf(MediaType.parse("text/something; charset=UTF-8"), - MediaType.OCTET_STREAM)); + MediaType.OCTET_STREAM)); assertTrue(REGISTRY.isSpecializationOf(MediaType.parse("text/something"), - MediaType.TEXT_PLAIN)); + MediaType.TEXT_PLAIN)); assertTrue(REGISTRY.isSpecializationOf(MediaType.parse("application/something+xml"), - MediaType.APPLICATION_XML)); + MediaType.APPLICATION_XML)); assertTrue(REGISTRY.isSpecializationOf(MediaType.parse("application/something+zip"), - MediaType.APPLICATION_ZIP)); + MediaType.APPLICATION_ZIP)); assertTrue(REGISTRY.isSpecializationOf(MediaType.APPLICATION_XML, MediaType.TEXT_PLAIN)); assertTrue(REGISTRY.isSpecializationOf(MediaType.parse("application/vnd.apple.iwork"), - MediaType.APPLICATION_ZIP)); + MediaType.APPLICATION_ZIP)); assertTrue(REGISTRY.isSpecializationOf(MediaType.parse("application/json"), - MediaType.TEXT_PLAIN)); + MediaType.TEXT_PLAIN)); } @SuppressWarnings("unused") @@ -173,8 +172,7 @@ private void testUrlWithoutContent(String expected, String url) throws IOExcepti Metadata metadata = new Metadata(); metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, url); String mime = this.MIME_TYPES.detect(null, metadata).toString(); - assertEquals(expected, mime, - url + " is not properly detected using only resource name"); + assertEquals(expected, mime, url + " is not properly detected using only resource name"); } private void testUrl(String expected, String url, String file) throws IOException { @@ -190,7 +188,7 @@ private void testFile(String expected, String filename) throws IOException { } private void testStream(String expected, String urlOrFileName, InputStream in) - throws IOException { + throws IOException { assertNotNull(in, "Test stream: [" + urlOrFileName + "] is null!"); if (!in.markSupported()) { in = new 
java.io.BufferedInputStream(in); @@ -198,14 +196,13 @@ private void testStream(String expected, String urlOrFileName, InputStream in) try { Metadata metadata = new Metadata(); String mime = this.MIME_TYPES.detect(in, metadata).toString(); - assertEquals(expected, mime, - urlOrFileName + " is not properly detected: detected."); + assertEquals(expected, mime, urlOrFileName + " is not properly detected: detected."); - //Add resource name and test again + // Add resource name and test again metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, urlOrFileName); mime = this.MIME_TYPES.detect(in, metadata).toString(); - assertEquals(expected, mime, - urlOrFileName + " is not properly detected after adding resource name."); + assertEquals(expected, mime, urlOrFileName + + " is not properly detected after adding resource name."); } finally { in.close(); } @@ -219,36 +216,35 @@ private void testStream(String expected, String urlOrFileName, InputStream in) @Test public void testEmptyDocument() throws IOException { assertEquals(MediaType.OCTET_STREAM, - MIME_TYPES.detect(new ByteArrayInputStream(new byte[0]), new Metadata())); + MIME_TYPES.detect(new ByteArrayInputStream(new byte[0]), new Metadata())); Metadata namehint = new Metadata(); namehint.set(TikaCoreProperties.RESOURCE_NAME_KEY, "test.txt"); assertEquals(MediaType.TEXT_PLAIN, - MIME_TYPES.detect(new ByteArrayInputStream(new byte[0]), namehint)); + MIME_TYPES.detect(new ByteArrayInputStream(new byte[0]), namehint)); Metadata typehint = new Metadata(); typehint.set(Metadata.CONTENT_TYPE, "text/plain"); assertEquals(MediaType.TEXT_PLAIN, - MIME_TYPES.detect(new ByteArrayInputStream(new byte[0]), typehint)); + MIME_TYPES.detect(new ByteArrayInputStream(new byte[0]), typehint)); } /** - * Test for things like javascript files whose content is enclosed in XML - * comment delimiters, but that aren't actually XML. 
+ * Test for things like javascript files whose content is enclosed in XML comment delimiters, + * but that aren't actually XML. * * @see TIKA-426 */ @Test public void testNotXML() throws IOException { - assertEquals(MediaType.TEXT_PLAIN, MIME_TYPES - .detect(new ByteArrayInputStream("".getBytes(UTF_8)), new Metadata())); + assertEquals(MediaType.TEXT_PLAIN, MIME_TYPES.detect( + new ByteArrayInputStream("".getBytes(UTF_8)), new Metadata())); } /** - * Tests that when we repeatedly test the detection of a document - * that can be detected with Mime Magic, that we consistently - * detect it correctly. See TIKA-391 for more details. + * Tests that when we repeatedly test the detection of a document that can be detected with Mime + * Magic, that we consistently detect it correctly. See TIKA-391 for more details. */ @Test public void testMimeMagicStability() throws IOException { @@ -258,10 +254,9 @@ public void testMimeMagicStability() throws IOException { } /** - * Tests that when two magic matches both apply, and both - * have the same priority, we use the name to pick the - * right one based on the glob, or the first one we - * come across if not. See TIKA-1292 for more details. + * Tests that when two magic matches both apply, and both have the same priority, we use the + * name to pick the right one based on the glob, or the first one we come across if not. See + * TIKA-1292 for more details. */ @Test public void testMimeMagicClashSamePriority() throws IOException { diff --git a/tika-core/src/test/java/org/apache/tika/mime/MimeTypesReaderTest.java b/tika-core/src/test/java/org/apache/tika/mime/MimeTypesReaderTest.java index 846b757c3f..cbc333088f 100644 --- a/tika-core/src/test/java/org/apache/tika/mime/MimeTypesReaderTest.java +++ b/tika-core/src/test/java/org/apache/tika/mime/MimeTypesReaderTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.mime; @@ -32,27 +30,21 @@ import java.util.Set; import java.util.concurrent.Executors; import java.util.stream.Collectors; - -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; - import org.apache.tika.config.TikaConfig; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; /** - * These tests try to ensure that the MimeTypesReader - * has correctly processed the mime-types.xml file. - * To do this, it tests that various aspects of the - * mime-types.xml file have ended up correctly as - * globs, matches, magics etc. + * These tests try to ensure that the MimeTypesReader has correctly processed the mime-types.xml + * file. To do this, it tests that various aspects of the mime-types.xml file have ended up + * correctly as globs, matches, magics etc. *

- * If you make updates to mime-types.xml, then the - * checks in this test may no longer hold true. - * As such, if tests here start failing after your - * changes, please review the test details, and - * update it to match the new state of the file! + * If you make updates to mime-types.xml, then the checks in this test may no longer hold true. As + * such, if tests here start failing after your changes, please review the test details, and update + * it to match the new state of the file! */ public class MimeTypesReaderTest { @@ -64,23 +56,22 @@ public class MimeTypesReaderTest { private String customMimeTypes; private static String getTypeAsString(MimeTypes mimeTypes, String text, Metadata metadata) - throws IOException { - return mimeTypes - .detect(new ByteArrayInputStream(text.getBytes(StandardCharsets.UTF_8)), metadata) - .toString(); + throws IOException { + return mimeTypes.detect(new ByteArrayInputStream(text.getBytes(StandardCharsets.UTF_8)), + metadata).toString(); } @SuppressWarnings("unchecked") @BeforeEach public void setUp() throws NoSuchFieldException, SecurityException, IllegalArgumentException, - IllegalAccessException { + IllegalAccessException { this.mimeTypes = TikaConfig.getDefaultConfig().getMimeRepository(); Field magicsField = mimeTypes.getClass().getDeclaredField("magics"); magicsField.setAccessible(true); magics = (List) magicsField.get(mimeTypes); - //ensure reset of custom mimes path + // ensure reset of custom mimes path customMimeTypes = System.getProperty(MimeTypesFactory.CUSTOM_MIMES_SYS_PROP); } @@ -100,9 +91,8 @@ public void testHtmlMatches() throws Exception { // Check on the type MimeType html = mimeTypes.forName("text/html"); assertTrue(html.hasMagic()); - assertTrue(html.getMagics().size() >= minMatches, - "There should be at least " + minMatches + " HTML matches, found " + - html.getMagics().size()); + assertTrue(html.getMagics().size() >= minMatches, "There should be at least " + minMatches + + " HTML matches, found " + 
html.getMagics().size()); // Check on the overall magics List htmlMagics = new ArrayList<>(); @@ -112,8 +102,8 @@ public void testHtmlMatches() throws Exception { } } - assertTrue(htmlMagics.size() >= minMatches, - "There should be at least " + minMatches + " HTML matches, found " + htmlMagics.size()); + assertTrue(htmlMagics.size() >= minMatches, "There should be at least " + minMatches + + " HTML matches, found " + htmlMagics.size()); } @Test @@ -123,9 +113,8 @@ public void testExcelMatches() throws Exception { // Check on the type MimeType excel = mimeTypes.forName("application/vnd.ms-excel"); assertTrue(excel.hasMagic()); - assertTrue(excel.getMagics().size() >= minMatches, - "There should be at least " + minMatches + " Excel matches, found " + - excel.getMagics().size()); + assertTrue(excel.getMagics().size() >= minMatches, "There should be at least " + minMatches + + " Excel matches, found " + excel.getMagics().size()); // Check on the overall magics List excelMagics = new ArrayList<>(); @@ -135,9 +124,8 @@ public void testExcelMatches() throws Exception { } } - assertTrue(excel.getMagics().size() >= minMatches, - "There should be at least " + minMatches + " Excel matches, found " + - excelMagics.size()); + assertTrue(excel.getMagics().size() >= minMatches, "There should be at least " + minMatches + + " Excel matches, found " + excelMagics.size()); } /** @@ -161,7 +149,7 @@ public void testReadExtendedMetadata() throws Exception { assertEquals("BMP", mime.getAcronym()); assertEquals("com.microsoft.bmp", mime.getUniformTypeIdentifier()); assertEquals("http://en.wikipedia.org/wiki/BMP_file_format", - mime.getLinks().get(0).toString()); + mime.getLinks().get(0).toString()); mime = this.mimeTypes.forName("application/xml"); assertEquals("XML", mime.getAcronym()); @@ -172,7 +160,7 @@ public void testReadExtendedMetadata() throws Exception { @Test public void testReadParameterHierarchy() throws Exception { MimeType mimeBTree4 = - 
this.mimeTypes.forName("application/x-berkeley-db;format=btree;version=4"); + this.mimeTypes.forName("application/x-berkeley-db;format=btree;version=4"); MediaType mtBTree4 = mimeBTree4.getType(); // Canonicalised with spaces @@ -185,7 +173,7 @@ public void testReadParameterHierarchy() throws Exception { // Parent has several children, for versions 2 through 4 Set mtBTreeChildren = - this.mimeTypes.getMediaTypeRegistry().getChildTypes(mtBTree); + this.mimeTypes.getMediaTypeRegistry().getChildTypes(mtBTree); assertTrue(mtBTreeChildren.size() >= 3, mtBTreeChildren.toString()); assertTrue(mtBTreeChildren.contains(mtBTree4), mtBTreeChildren.toString()); @@ -194,15 +182,14 @@ public void testReadParameterHierarchy() throws Exception { assertEquals("application/x-berkeley-db", mtBD.toString()); // If we use one with parameters not known in the media registry, - // getting the parent will return the non-parameter version + // getting the parent will return the non-parameter version MediaType mtAlt = MediaType.application("x-berkeley-db; format=unknown; version=42"); MediaType mtAltP = this.mimeTypes.getMediaTypeRegistry().getSupertype(mtAlt); assertEquals("application/x-berkeley-db", mtAltP.toString()); } /** - * TIKA-746 Ensures that the custom mimetype maps were also - * loaded and used + * TIKA-746 Ensures that the custom mimetype maps were also loaded and used */ @Test public void testCustomMimeTypes() { @@ -265,7 +252,7 @@ public void testCustomMimeTypes() { @Test public void testExternalMimeTypes() throws Exception { System.setProperty(MimeTypesFactory.CUSTOM_MIMES_SYS_PROP, - "src/test/resources/org/apache/tika/mime/external-mimetypes.xml"); + "src/test/resources/org/apache/tika/mime/external-mimetypes.xml"); MimeTypes mimeTypes = MimeTypes.getDefaultMimeTypes(new CustomClassLoader()); Metadata m = new Metadata(); m.add(TikaCoreProperties.RESOURCE_NAME_KEY, "test.external.mime.type"); @@ -287,37 +274,34 @@ public void testGetExtensionForJavaScript() throws 
Exception { assertEquals(List.of(".js", ".mjs"), mt.getExtensions()); } - + @Test public void testMSAccessByName() { MimeTypes mimeTypes = MimeTypes.getDefaultMimeTypes(); MediaType result = mimeTypes.getMimeType("testfile1.accdb").getType(); assertEquals("application/x-msaccess", result.toString()); - } - - + } + + @Test public void testZipXFiles() { MimeTypes mimeTypes = MimeTypes.getDefaultMimeTypes(); MediaType result = mimeTypes.getMimeType("testfile1.zipx").getType(); assertEquals("application/zip", result.toString()); } - - + + @Test public void testGetAliasForJavaScript() throws Exception { MimeType mt = this.mimeTypes.forName("text/javascript"); - Set aliases = mimeTypes.getMediaTypeRegistry() - .getAliases(mt.getType()) - .stream() - .map(MediaType::toString) - .collect(Collectors.toSet()); + Set aliases = mimeTypes.getMediaTypeRegistry().getAliases(mt.getType()).stream() + .map(MediaType::toString).collect(Collectors.toSet()); assertEquals(Set.of("application/javascript", "application/x-javascript"), aliases); } @Test public void testGetRegisteredMimesWithParameters() throws Exception { - //TIKA-1692 + // TIKA-1692 // Media Type always keeps details / parameters String name = "application/xml; charset=UTF-8"; @@ -359,60 +343,64 @@ public void testMultiThreaded() throws Exception { @Test public void testMinShouldMatch() throws Exception { System.setProperty(MimeTypesFactory.CUSTOM_MIMES_SYS_PROP, - "src/test/resources/org/apache/tika/mime/custom-mimetypes-minShouldMatch.xml"); + "src/test/resources/org/apache/tika/mime/custom-mimetypes-minShouldMatch.xml"); MimeTypes mimeTypes = MimeTypes.getDefaultMimeTypes(new CustomClassLoader()); - //matches one + // matches one assertEquals("hello/world-min-file", - getTypeAsString(mimeTypes, "Hello World!", new Metadata())); + getTypeAsString(mimeTypes, "Hello World!", new Metadata())); - //matches two + // matches two assertEquals("hello/world-min-file", - getTypeAsString(mimeTypes, "Hello Welt!", new Metadata())); 
+ getTypeAsString(mimeTypes, "Hello Welt!", new Metadata())); - //matches two + // matches two assertEquals("hello/world-min-file", - getTypeAsString(mimeTypes, "Hallo Welt!", new Metadata())); + getTypeAsString(mimeTypes, "Hallo Welt!", new Metadata())); - //missing ! + // missing ! assertEquals("text/plain", getTypeAsString(mimeTypes, "Hello World", new Metadata())); - //Hello requires world, welt or hallo; monde requires bonjour le + // Hello requires world, welt or hallo; monde requires bonjour le assertEquals("text/plain", getTypeAsString(mimeTypes, "Hello Monde", new Metadata())); - //this matcher is treated as "or" with minshouldmatch clause + // this matcher is treated as "or" with minshouldmatch clause assertEquals("hello/world-min-file", - getTypeAsString(mimeTypes, "Bonjour le Monde!", new Metadata())); + getTypeAsString(mimeTypes, "Bonjour le Monde!", new Metadata())); } @Test public void testBadMinShouldMatch1() { System.setProperty(MimeTypesFactory.CUSTOM_MIMES_SYS_PROP, - "src/test/resources/org/apache/tika/mime/custom-mimetypes-badMinShouldMatch1.xml"); + "src/test/resources/org/apache/tika/mime/custom-mimetypes-badMinShouldMatch1.xml"); - assertThrows(IllegalArgumentException.class, () -> MimeTypes.getDefaultMimeTypes(new CustomClassLoader())); + assertThrows(IllegalArgumentException.class, + () -> MimeTypes.getDefaultMimeTypes(new CustomClassLoader())); } @Test public void testBadMinShouldMatch2() { System.setProperty(MimeTypesFactory.CUSTOM_MIMES_SYS_PROP, - "src/test/resources/org/apache/tika/mime/custom-mimetypes-badMinShouldMatch2.xml"); - assertThrows(IllegalArgumentException.class, () -> MimeTypes.getDefaultMimeTypes(new CustomClassLoader())); + "src/test/resources/org/apache/tika/mime/custom-mimetypes-badMinShouldMatch2.xml"); + assertThrows(IllegalArgumentException.class, + () -> MimeTypes.getDefaultMimeTypes(new CustomClassLoader())); } @Test public void testBadMinShouldMatch3() { System.setProperty(MimeTypesFactory.CUSTOM_MIMES_SYS_PROP, 
- "src/test/resources/org/apache/tika/mime/custom-mimetypes-badMinShouldMatch3.xml"); - assertThrows(IllegalArgumentException.class, () -> MimeTypes.getDefaultMimeTypes(new CustomClassLoader())); + "src/test/resources/org/apache/tika/mime/custom-mimetypes-badMinShouldMatch3.xml"); + assertThrows(IllegalArgumentException.class, + () -> MimeTypes.getDefaultMimeTypes(new CustomClassLoader())); } @Test public void testBadMinShouldMatch4() { System.setProperty(MimeTypesFactory.CUSTOM_MIMES_SYS_PROP, - "src/test/resources/org/apache/tika/mime/custom-mimetypes-badMinShouldMatch4.xml"); - assertThrows(IllegalArgumentException.class, () -> MimeTypes.getDefaultMimeTypes(new CustomClassLoader())); + "src/test/resources/org/apache/tika/mime/custom-mimetypes-badMinShouldMatch4.xml"); + assertThrows(IllegalArgumentException.class, + () -> MimeTypes.getDefaultMimeTypes(new CustomClassLoader())); } private static class CustomClassLoader extends ClassLoader { diff --git a/tika-core/src/test/java/org/apache/tika/mime/PatternsTest.java b/tika-core/src/test/java/org/apache/tika/mime/PatternsTest.java index 25721b15fd..7941a3aa82 100644 --- a/tika-core/src/test/java/org/apache/tika/mime/PatternsTest.java +++ b/tika-core/src/test/java/org/apache/tika/mime/PatternsTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. 
The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.mime; @@ -21,7 +19,6 @@ import static org.junit.jupiter.api.Assertions.fail; import java.util.List; - import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; diff --git a/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTest.java b/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTest.java index c9d0073c21..f42d14c201 100644 --- a/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTest.java +++ b/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.mime; @@ -28,12 +26,10 @@ import java.io.InputStream; import java.net.URL; import java.nio.charset.StandardCharsets; - -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; - import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; public class ProbabilisticMimeDetectionTest { @@ -64,7 +60,7 @@ public void testDetection() throws Exception { testFile("application/xml", "test-long-comment.xml"); testFile("application/xslt+xml", "stylesheet.xsl"); testUrl("application/rdf+xml", "http://www.ai.sri.com/daml/services/owl-s/1.2/Process.owl", - "test-difficult-rdf1.xml"); + "test-difficult-rdf1.xml"); testUrl("application/rdf+xml", "http://www.w3.org/2002/07/owl#", "test-difficult-rdf2.xml"); // add evil test from TIKA-327 testFile("text/html", "test-tika-327.html"); @@ -81,39 +77,39 @@ public void testDetection() throws Exception { @Test public void testByteOrderMark() throws Exception { - assertEquals(MediaType.TEXT_PLAIN, proDetector - .detect(new ByteArrayInputStream("\ufefftest".getBytes(UTF_16LE)), new Metadata())); - assertEquals(MediaType.TEXT_PLAIN, proDetector - .detect(new ByteArrayInputStream("\ufefftest".getBytes(UTF_16BE)), new Metadata())); + assertEquals(MediaType.TEXT_PLAIN, proDetector.detect( + new ByteArrayInputStream("\ufefftest".getBytes(UTF_16LE)), new Metadata())); + assertEquals(MediaType.TEXT_PLAIN, proDetector.detect( + new ByteArrayInputStream("\ufefftest".getBytes(UTF_16BE)), new Metadata())); - assertEquals(MediaType.TEXT_PLAIN, proDetector - .detect(new ByteArrayInputStream("\ufefftest".getBytes(UTF_8)), new Metadata())); + assertEquals(MediaType.TEXT_PLAIN, proDetector.detect( + new ByteArrayInputStream("\ufefftest".getBytes(UTF_8)), new Metadata())); } @Test public void testSuperTypes() { assertTrue(registry.isSpecializationOf(MediaType.parse("text/something; charset=UTF-8"), - 
MediaType.parse("text/something"))); + MediaType.parse("text/something"))); assertTrue(registry.isSpecializationOf(MediaType.parse("text/something; charset=UTF-8"), - MediaType.TEXT_PLAIN)); + MediaType.TEXT_PLAIN)); assertTrue(registry.isSpecializationOf(MediaType.parse("text/something; charset=UTF-8"), - MediaType.OCTET_STREAM)); + MediaType.OCTET_STREAM)); assertTrue(registry.isSpecializationOf(MediaType.parse("text/something"), - MediaType.TEXT_PLAIN)); + MediaType.TEXT_PLAIN)); assertTrue(registry.isSpecializationOf(MediaType.parse("application/something+xml"), - MediaType.APPLICATION_XML)); + MediaType.APPLICATION_XML)); assertTrue(registry.isSpecializationOf(MediaType.parse("application/something+zip"), - MediaType.APPLICATION_ZIP)); + MediaType.APPLICATION_ZIP)); assertTrue(registry.isSpecializationOf(MediaType.APPLICATION_XML, MediaType.TEXT_PLAIN)); assertTrue(registry.isSpecializationOf(MediaType.parse("application/vnd.apple.iwork"), - MediaType.APPLICATION_ZIP)); + MediaType.APPLICATION_ZIP)); } @SuppressWarnings("unused") @@ -135,7 +131,7 @@ private void testFile(String expected, String filename) throws IOException { } private void testStream(String expected, String urlOrFileName, InputStream in) - throws IOException { + throws IOException { assertNotNull(in, "Test stream: [" + urlOrFileName + "] is null!"); if (!in.markSupported()) { in = new java.io.BufferedInputStream(in); @@ -143,14 +139,13 @@ private void testStream(String expected, String urlOrFileName, InputStream in) try { Metadata metadata = new Metadata(); String mime = this.proDetector.detect(in, metadata).toString(); - assertEquals(expected, mime, - urlOrFileName + " is not properly detected: detected."); + assertEquals(expected, mime, urlOrFileName + " is not properly detected: detected."); // Add resource name and test again metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, urlOrFileName); mime = this.proDetector.detect(in, metadata).toString(); - assertEquals(expected, mime, - 
urlOrFileName + " is not properly detected after adding resource name."); + assertEquals(expected, mime, urlOrFileName + + " is not properly detected after adding resource name."); } finally { in.close(); } @@ -159,43 +154,40 @@ private void testStream(String expected, String urlOrFileName, InputStream in) /** * Test for type detection of empty documents. * - * @see TIKA-483 + * @see TIKA-483 */ @Test public void testEmptyDocument() throws IOException { assertEquals(MediaType.OCTET_STREAM, - proDetector.detect(new ByteArrayInputStream(new byte[0]), new Metadata())); + proDetector.detect(new ByteArrayInputStream(new byte[0]), new Metadata())); Metadata namehint = new Metadata(); namehint.set(TikaCoreProperties.RESOURCE_NAME_KEY, "test.txt"); assertEquals(MediaType.TEXT_PLAIN, - proDetector.detect(new ByteArrayInputStream(new byte[0]), namehint)); + proDetector.detect(new ByteArrayInputStream(new byte[0]), namehint)); Metadata typehint = new Metadata(); typehint.set(Metadata.CONTENT_TYPE, "text/plain"); assertEquals(MediaType.TEXT_PLAIN, - proDetector.detect(new ByteArrayInputStream(new byte[0]), typehint)); + proDetector.detect(new ByteArrayInputStream(new byte[0]), typehint)); } /** - * Test for things like javascript files whose content is enclosed in XML - * comment delimiters, but that aren't actually XML. + * Test for things like javascript files whose content is enclosed in XML comment delimiters, + * but that aren't actually XML. * - * @see TIKA-426 + * @see TIKA-426 */ @Test public void testNotXML() throws IOException { - assertEquals(MediaType.TEXT_PLAIN, proDetector - .detect(new ByteArrayInputStream("".getBytes(UTF_8)), new Metadata())); + assertEquals(MediaType.TEXT_PLAIN, proDetector.detect( + new ByteArrayInputStream("".getBytes(UTF_8)), new Metadata())); } /** - * Tests that when we repeatedly test the detection of a document that can - * be detected with Mime Magic, that we consistently detect it correctly. - * See TIKA-391 for more details. 
+ * Tests that when we repeatedly test the detection of a document that can be detected with Mime + * Magic, that we consistently detect it correctly. See TIKA-391 for more details. */ @Test public void testMimeMagicStability() throws IOException { @@ -205,9 +197,9 @@ public void testMimeMagicStability() throws IOException { } /** - * Tests that when two magic matches both apply, and both have the same - * priority, we use the name to pick the right one based on the glob, or the - * first one we come across if not. See TIKA-1292 for more details. + * Tests that when two magic matches both apply, and both have the same priority, we use the + * name to pick the right one based on the glob, or the first one we come across if not. See + * TIKA-1292 for more details. */ @Test public void testMimeMagicClashSamePriority() throws IOException { @@ -224,13 +216,13 @@ public void testMimeMagicClashSamePriority() throws IOException { metadata = new Metadata(); metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, "test.x-hello-world"); assertEquals(helloXType, - proDetector.detect(new ByteArrayInputStream(helloWorld), metadata)); + proDetector.detect(new ByteArrayInputStream(helloWorld), metadata)); // Without, goes for the one that sorts last metadata = new Metadata(); metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, "testingTESTINGtesting"); assertEquals(helloXType, - proDetector.detect(new ByteArrayInputStream(helloWorld), metadata)); + proDetector.detect(new ByteArrayInputStream(helloWorld), metadata)); } @Test @@ -238,9 +230,9 @@ public void testTIKA2237() throws IOException { Metadata metadata = new Metadata(); metadata.add(Metadata.CONTENT_TYPE, MediaType.text("javascript").toString()); InputStream input = new ByteArrayInputStream( - ("function() {};\n" + "try {\n" + " window.location = 'index.html';\n" + - "} catch (e) {\n" + " console.log(e);\n" + "}") - .getBytes(StandardCharsets.UTF_8)); + ("function() {};\n" + "try {\n" + " window.location = 'index.html';\n" + + "} 
catch (e) {\n" + " console.log(e);\n" + "}") + .getBytes(StandardCharsets.UTF_8)); MediaType detect = new ProbabilisticMimeDetectionSelector().detect(input, metadata); assertEquals(MediaType.text("javascript"), detect); } diff --git a/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTestWithTika.java b/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTestWithTika.java index 69ef03ad51..e488a5ed30 100644 --- a/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTestWithTika.java +++ b/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTestWithTika.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
+ * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.mime; @@ -27,16 +25,14 @@ import java.io.IOException; import java.io.InputStream; import java.net.URL; - -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; - import org.apache.tika.Tika; import org.apache.tika.config.ServiceLoader; import org.apache.tika.detect.DefaultProbDetector; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.mime.ProbabilisticMimeDetectionSelector.Builder; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; public class ProbabilisticMimeDetectionTestWithTika { @@ -54,13 +50,11 @@ public void setUp() { registry = types.getMediaTypeRegistry(); /* - * here is an example with the use of the builder to - * instantiate the object. + * here is an example with the use of the builder to instantiate the object. 
*/ Builder builder = new ProbabilisticMimeDetectionSelector.Builder(); - proSelector = new ProbabilisticMimeDetectionSelector(types, - builder.priorMagicFileType(0.5f).priorExtensionFileType(0.5f) - .priorMetaFileType(0.5f)); + proSelector = new ProbabilisticMimeDetectionSelector(types, builder.priorMagicFileType(0.5f) + .priorExtensionFileType(0.5f).priorMetaFileType(0.5f)); DefaultProbDetector detector = new DefaultProbDetector(proSelector, loader); // Use a default Tika, except for our different detector @@ -81,7 +75,7 @@ public void testDetection() throws Exception { testFile("application/xml", "test-long-comment.xml"); testFile("application/xslt+xml", "stylesheet.xsl"); testUrl("application/rdf+xml", "http://www.ai.sri.com/daml/services/owl-s/1.2/Process.owl", - "test-difficult-rdf1.xml"); + "test-difficult-rdf1.xml"); testUrl("application/rdf+xml", "http://www.w3.org/2002/07/owl#", "test-difficult-rdf2.xml"); // add evil test from TIKA-327 testFile("text/html", "test-tika-327.html"); @@ -98,43 +92,40 @@ public void testDetection() throws Exception { @Test public void testByteOrderMark() throws Exception { - assertEquals(MediaType.TEXT_PLAIN.toString(), - tika.detect(new ByteArrayInputStream("\ufefftest".getBytes(UTF_16LE)), - new Metadata())); - assertEquals(MediaType.TEXT_PLAIN.toString(), - tika.detect(new ByteArrayInputStream("\ufefftest".getBytes(UTF_16BE)), - new Metadata())); + assertEquals(MediaType.TEXT_PLAIN.toString(), tika.detect( + new ByteArrayInputStream("\ufefftest".getBytes(UTF_16LE)), new Metadata())); + assertEquals(MediaType.TEXT_PLAIN.toString(), tika.detect( + new ByteArrayInputStream("\ufefftest".getBytes(UTF_16BE)), new Metadata())); - assertEquals(MediaType.TEXT_PLAIN.toString(), - tika.detect(new ByteArrayInputStream("\ufefftest".getBytes(UTF_8)), - new Metadata())); + assertEquals(MediaType.TEXT_PLAIN.toString(), tika.detect( + new ByteArrayInputStream("\ufefftest".getBytes(UTF_8)), new Metadata())); } @Test public void 
testSuperTypes() { assertTrue(registry.isSpecializationOf(MediaType.parse("text/something; charset=UTF-8"), - MediaType.parse("text/something"))); + MediaType.parse("text/something"))); assertTrue(registry.isSpecializationOf(MediaType.parse("text/something; charset=UTF-8"), - MediaType.TEXT_PLAIN)); + MediaType.TEXT_PLAIN)); assertTrue(registry.isSpecializationOf(MediaType.parse("text/something; charset=UTF-8"), - MediaType.OCTET_STREAM)); + MediaType.OCTET_STREAM)); assertTrue(registry.isSpecializationOf(MediaType.parse("text/something"), - MediaType.TEXT_PLAIN)); + MediaType.TEXT_PLAIN)); assertTrue(registry.isSpecializationOf(MediaType.parse("application/something+xml"), - MediaType.APPLICATION_XML)); + MediaType.APPLICATION_XML)); assertTrue(registry.isSpecializationOf(MediaType.parse("application/something+zip"), - MediaType.APPLICATION_ZIP)); + MediaType.APPLICATION_ZIP)); assertTrue(registry.isSpecializationOf(MediaType.APPLICATION_XML, MediaType.TEXT_PLAIN)); assertTrue(registry.isSpecializationOf(MediaType.parse("application/vnd.apple.iwork"), - MediaType.APPLICATION_ZIP)); + MediaType.APPLICATION_ZIP)); } @SuppressWarnings("unused") @@ -156,7 +147,7 @@ private void testFile(String expected, String filename) throws IOException { } private void testStream(String expected, String urlOrFileName, InputStream in) - throws IOException { + throws IOException { assertNotNull(in, "Test stream: [" + urlOrFileName + "] is null!"); if (!in.markSupported()) { in = new java.io.BufferedInputStream(in); @@ -165,15 +156,14 @@ private void testStream(String expected, String urlOrFileName, InputStream in) Metadata metadata = new Metadata(); // String mime = this.proDetector.detect(in, metadata).toString(); String mime = tika.detect(in, metadata); - assertEquals(expected, mime, - urlOrFileName + " is not properly detected: detected."); + assertEquals(expected, mime, urlOrFileName + " is not properly detected: detected."); // Add resource name and test again 
metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, urlOrFileName); // mime = this.proDetector.detect(in, metadata).toString(); mime = tika.detect(in, metadata); - assertEquals(expected, mime, - urlOrFileName + " is not properly detected after adding resource name."); + assertEquals(expected, mime, urlOrFileName + + " is not properly detected after adding resource name."); } finally { in.close(); } @@ -182,44 +172,40 @@ private void testStream(String expected, String urlOrFileName, InputStream in) /** * Test for type detection of empty documents. * - * @see TIKA-483 + * @see TIKA-483 */ @Test public void testEmptyDocument() throws IOException { assertEquals(MediaType.OCTET_STREAM.toString(), - tika.detect(new ByteArrayInputStream(new byte[0]), new Metadata())); + tika.detect(new ByteArrayInputStream(new byte[0]), new Metadata())); Metadata namehint = new Metadata(); namehint.set(TikaCoreProperties.RESOURCE_NAME_KEY, "test.txt"); assertEquals(MediaType.TEXT_PLAIN.toString(), - tika.detect(new ByteArrayInputStream(new byte[0]), namehint)); + tika.detect(new ByteArrayInputStream(new byte[0]), namehint)); Metadata typehint = new Metadata(); typehint.set(Metadata.CONTENT_TYPE, "text/plain"); assertEquals(MediaType.TEXT_PLAIN.toString(), - tika.detect(new ByteArrayInputStream(new byte[0]), typehint)); + tika.detect(new ByteArrayInputStream(new byte[0]), typehint)); } /** - * Test for things like javascript files whose content is enclosed in XML - * comment delimiters, but that aren't actually XML. + * Test for things like javascript files whose content is enclosed in XML comment delimiters, + * but that aren't actually XML. 
* - * @see TIKA-426 + * @see TIKA-426 */ @Test public void testNotXML() throws IOException { - assertEquals(MediaType.TEXT_PLAIN.toString(), - tika.detect(new ByteArrayInputStream("".getBytes(UTF_8)), - new Metadata())); + assertEquals(MediaType.TEXT_PLAIN.toString(), tika.detect( + new ByteArrayInputStream("".getBytes(UTF_8)), new Metadata())); } /** - * Tests that when we repeatedly test the detection of a document that can - * be detected with Mime Magic, that we consistently detect it correctly. - * See TIKA-391 for more details. + * Tests that when we repeatedly test the detection of a document that can be detected with Mime + * Magic, that we consistently detect it correctly. See TIKA-391 for more details. */ @Test public void testMimeMagicStability() throws IOException { @@ -229,9 +215,9 @@ public void testMimeMagicStability() throws IOException { } /** - * Tests that when two magic matches both apply, and both have the same - * priority, we use the name to pick the right one based on the glob, or the - * first one we come across if not. See TIKA-1292 for more details. + * Tests that when two magic matches both apply, and both have the same priority, we use the + * name to pick the right one based on the glob, or the first one we come across if not. See + * TIKA-1292 for more details. 
*/ @Test public void testMimeMagicClashSamePriority() throws IOException { @@ -244,17 +230,17 @@ public void testMimeMagicClashSamePriority() throws IOException { metadata = new Metadata(); metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, "test.hello.world"); assertEquals(helloType.toString(), - tika.detect(new ByteArrayInputStream(helloWorld), metadata)); + tika.detect(new ByteArrayInputStream(helloWorld), metadata)); metadata = new Metadata(); metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, "test.x-hello-world"); assertEquals(helloXType.toString(), - tika.detect(new ByteArrayInputStream(helloWorld), metadata)); + tika.detect(new ByteArrayInputStream(helloWorld), metadata)); // Without, goes for the one that sorts last metadata = new Metadata(); metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, "testingTESTINGtesting"); assertEquals(helloXType.toString(), - tika.detect(new ByteArrayInputStream(helloWorld), metadata)); + tika.detect(new ByteArrayInputStream(helloWorld), metadata)); } } diff --git a/tika-core/src/test/java/org/apache/tika/mime/RFC822DetectionTest.java b/tika-core/src/test/java/org/apache/tika/mime/RFC822DetectionTest.java index 7340e06a28..81e0fd2fef 100644 --- a/tika-core/src/test/java/org/apache/tika/mime/RFC822DetectionTest.java +++ b/tika-core/src/test/java/org/apache/tika/mime/RFC822DetectionTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. 
See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.mime; @@ -20,12 +18,10 @@ import java.io.IOException; import java.nio.charset.StandardCharsets; - import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream; -import org.junit.jupiter.api.Test; - import org.apache.tika.config.TikaConfig; import org.apache.tika.metadata.Metadata; +import org.junit.jupiter.api.Test; public class RFC822DetectionTest { @@ -33,47 +29,44 @@ public class RFC822DetectionTest { @Test public void testBasic() throws Exception { - for (String txt : new String[]{ - "Date: blah\nSent: someone\r\nthis is a test", - "date: blah\nSent: someone\r\nthis is a test", - "date: blah\nDelivered-To: someone\r\nthis is a test" - }) { + for (String txt : new String[] {"Date: blah\nSent: someone\r\nthis is a test", + "date: blah\nSent: someone\r\nthis is a test", + "date: blah\nDelivered-To: someone\r\nthis is a test"}) { assertMime("message/rfc822", txt); } - for (String txt : new String[]{ - //test missing colon - "Date blah\nSent: someone\r\nthis is a test", - //test precursor junk - "some precursor junk Date: blah\nSent: someone\r\nthis is a test", - "some precursor junk\nDate: blah\nSent: someone\r\nthis is a test", - "some precursor junk:\nDate: blah\nSent: someone\r\nthis is a test", - //confirm that date is case-insensitive, but delivered-to is case-sensitive - "date: blah\ndelivered-To: someone\r\nthis is a test", - //test that a file that starts only with "Subject:" and no other header is - //detected as text/plain - "Subject: this is a subject\nand there's some other text", - "To: someone\nand there's some other text", - "To: someone or other" - }) { + for (String txt : new String[] { + // test missing colon + "Date blah\nSent: someone\r\nthis is a test", + // test precursor junk + "some precursor junk Date: blah\nSent: someone\r\nthis is a test", + "some precursor junk\nDate: blah\nSent: someone\r\nthis is a test", + "some precursor junk:\nDate: blah\nSent: someone\r\nthis is a test", + // confirm that date is 
case-insensitive, but delivered-to is case-sensitive + "date: blah\ndelivered-To: someone\r\nthis is a test", + // test that a file that starts only with "Subject:" and no other header is + // detected as text/plain + "Subject: this is a subject\nand there's some other text", + "To: someone\nand there's some other text", "To: someone or other"}) { assertMime("text/plain", txt); } - //TIKA-4153, specifically - String txt = "Some text here 1.\n" + "Some text here 2.\n" + "Some text here 3.\n" + - "Original Message-----\n" + "From: some_mail@abc.com\n" + - "Sent: Thursday, October 31, 2019 9:52 AM\n" + - "To: Some person, (The XYZ group)\n" + - "Subject: RE: Mr. Random person phone call: MESSAGE\n" + "Hi,\n" + - "I am available now to receive the call.\n" + "Some text here 4.\n" + - "Some text here 5.\n" + "Some text here 6."; + // TIKA-4153, specifically + String txt = "Some text here 1.\n" + "Some text here 2.\n" + "Some text here 3.\n" + + "Original Message-----\n" + "From: some_mail@abc.com\n" + + "Sent: Thursday, October 31, 2019 9:52 AM\n" + + "To: Some person, (The XYZ group)\n" + + "Subject: RE: Mr. 
Random person phone call: MESSAGE\n" + "Hi,\n" + + "I am available now to receive the call.\n" + "Some text here 4.\n" + + "Some text here 5.\n" + "Some text here 6."; assertMime("text/plain", txt); } private void assertMime(String expected, String txt) throws IOException { - MediaType mediaType = - MIME_TYPES.detect(UnsynchronizedByteArrayInputStream.builder() - .setByteArray(txt.getBytes(StandardCharsets.UTF_8)).get(), new Metadata()); + MediaType mediaType = MIME_TYPES.detect( + UnsynchronizedByteArrayInputStream.builder() + .setByteArray(txt.getBytes(StandardCharsets.UTF_8)).get(), + new Metadata()); assertEquals(expected, mediaType.toString(), txt); } } diff --git a/tika-core/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java b/tika-core/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java index 62b061d98e..d1cebfd972 100644 --- a/tika-core/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java +++ b/tika-core/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser; @@ -20,9 +18,6 @@ import static org.junit.jupiter.api.Assertions.assertTrue; import java.io.InputStream; - -import org.junit.jupiter.api.Test; - import org.apache.tika.config.TikaConfig; import org.apache.tika.extractor.EmbeddedBytesSelector; import org.apache.tika.extractor.RUnpackExtractor; @@ -30,19 +25,20 @@ import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.utils.StringUtils; +import org.junit.jupiter.api.Test; public class AutoDetectParserConfigTest { @Test public void testEmbeddedBytesSelector() throws Exception { TikaConfig config; - try (InputStream is = TikaConfig.class.getResourceAsStream( - "TIKA-4207-embedded-bytes-config.xml")) { + try (InputStream is = TikaConfig.class + .getResourceAsStream("TIKA-4207-embedded-bytes-config.xml")) { config = new TikaConfig(is); } AutoDetectParserConfig c = config.getAutoDetectParserConfig(); RUnpackExtractorFactory f = - (RUnpackExtractorFactory) c.getEmbeddedDocumentExtractorFactory(); + (RUnpackExtractorFactory) c.getEmbeddedDocumentExtractorFactory(); Metadata metadata = new Metadata(); ParseContext parseContext = new ParseContext(); diff 
--git a/tika-core/src/test/java/org/apache/tika/parser/CompositeParserTest.java b/tika-core/src/test/java/org/apache/tika/parser/CompositeParserTest.java index 5519dce675..455612b291 100644 --- a/tika-core/src/test/java/org/apache/tika/parser/CompositeParserTest.java +++ b/tika-core/src/test/java/org/apache/tika/parser/CompositeParserTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.parser; @@ -26,15 +24,13 @@ import java.util.List; import java.util.Map; import java.util.Set; - -import org.junit.jupiter.api.Test; -import org.xml.sax.ContentHandler; - import org.apache.tika.config.TikaConfig; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; import org.apache.tika.mime.MediaTypeRegistry; import org.apache.tika.sax.BodyContentHandler; +import org.junit.jupiter.api.Test; +import org.xml.sax.ContentHandler; public class CompositeParserTest { @@ -58,9 +54,9 @@ public Set getSupportedTypes(ParseContext context) { }; CompositeParser composite = - new CompositeParser(MediaTypeRegistry.getDefaultRegistry(), a, b, c); + new CompositeParser(MediaTypeRegistry.getDefaultRegistry(), a, b, c); Map> duplicates = - composite.findDuplicateParsers(new ParseContext()); + composite.findDuplicateParsers(new ParseContext()); assertEquals(1, duplicates.size()); List parsers = duplicates.get(MediaType.TEXT_PLAIN); assertNotNull(parsers); @@ -86,24 +82,22 @@ public void testMimeTypeAliases() throws Exception { bmpCanonicalMetadata.put("BMP", "True"); bmpCanonicalMetadata.put("Canonical", "True"); Parser bmpCanonicalParser = - new DummyParser(new HashSet<>(Collections.singletonList(bmpCanonical)), - bmpCanonicalMetadata, null); + new DummyParser(new HashSet<>(Collections.singletonList(bmpCanonical)), + bmpCanonicalMetadata, null); MediaType bmpAlias = MediaType.image("x-ms-bmp"); Map bmpAliasMetadata = new HashMap<>(); bmpAliasMetadata.put("BMP", "True"); bmpAliasMetadata.put("Alias", "True"); - Parser bmpAliasParser = - new DummyParser(new HashSet<>(Collections.singletonList(bmpAlias)), bmpAliasMetadata, - null); + Parser bmpAliasParser = new DummyParser(new HashSet<>(Collections.singletonList(bmpAlias)), + bmpAliasMetadata, null); TikaConfig config = TikaConfig.getDefaultConfig(); CompositeParser canonical = - new CompositeParser(config.getMediaTypeRegistry(), bmpCanonicalParser); + new 
CompositeParser(config.getMediaTypeRegistry(), bmpCanonicalParser); CompositeParser alias = new CompositeParser(config.getMediaTypeRegistry(), bmpAliasParser); - CompositeParser both = - new CompositeParser(config.getMediaTypeRegistry(), bmpCanonicalParser, - bmpAliasParser); + CompositeParser both = new CompositeParser(config.getMediaTypeRegistry(), + bmpCanonicalParser, bmpAliasParser); ContentHandler handler = new BodyContentHandler(); Metadata metadata; @@ -112,7 +106,7 @@ public void testMimeTypeAliases() throws Exception { metadata = new Metadata(); metadata.add(Metadata.CONTENT_TYPE, bmpCanonical.toString()); canonical.parse(new ByteArrayInputStream(new byte[0]), handler, metadata, - new ParseContext()); + new ParseContext()); assertEquals("True", metadata.get("BMP")); assertEquals("True", metadata.get("Canonical")); @@ -129,7 +123,7 @@ public void testMimeTypeAliases() throws Exception { metadata = new Metadata(); metadata.add(Metadata.CONTENT_TYPE, bmpAlias.toString()); canonical.parse(new ByteArrayInputStream(new byte[0]), handler, metadata, - new ParseContext()); + new ParseContext()); assertEquals("True", metadata.get("BMP")); assertEquals("True", metadata.get("Canonical")); @@ -143,7 +137,7 @@ public void testMimeTypeAliases() throws Exception { // And when both are there, will go for the last one - // to be registered (which is the alias one) + // to be registered (which is the alias one) metadata = new Metadata(); metadata.add(Metadata.CONTENT_TYPE, bmpCanonical.toString()); both.parse(new ByteArrayInputStream(new byte[0]), handler, metadata, new ParseContext()); diff --git a/tika-core/src/test/java/org/apache/tika/parser/DummyInitializableParser.java b/tika-core/src/test/java/org/apache/tika/parser/DummyInitializableParser.java index 7b329faeef..1f2e9c284a 100644 --- a/tika-core/src/test/java/org/apache/tika/parser/DummyInitializableParser.java +++ b/tika-core/src/test/java/org/apache/tika/parser/DummyInitializableParser.java @@ -1,18 +1,16 @@ /* - 
* Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.parser; @@ -22,10 +20,6 @@ import java.util.HashSet; import java.util.Map; import java.util.Set; - -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.config.Field; import org.apache.tika.config.Initializable; import org.apache.tika.config.InitializableProblemHandler; @@ -34,10 +28,11 @@ import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; /** - * This tests that initialize() is called after adding the parameters - * configured via TikaConfig + * This tests that initialize() is called after adding the parameters configured via TikaConfig */ public class DummyInitializableParser implements Parser, Initializable { @@ -61,7 +56,7 @@ public Set getSupportedTypes(ParseContext context) { @Override public void parse(InputStream stream, ContentHandler handler, Metadata metadata, - ParseContext context) throws IOException, SAXException, TikaException { + ParseContext context) throws IOException, SAXException, TikaException { metadata.set(SUM_FIELD, Integer.toString(sum)); } @@ -74,11 +69,11 @@ public void initialize(Map params) throws TikaConfigException { @Override public void checkInitialization(InitializableProblemHandler handler) - throws TikaConfigException { - //completely arbitrary + throws TikaConfigException { + // completely arbitrary if (sum > 1000) { handler.handleInitializableProblem("DummyInitializableParser", - "sum cannot be > 1000: " + sum); + "sum cannot be > 1000: " + sum); } } } diff --git a/tika-core/src/test/java/org/apache/tika/parser/DummyParameterizedParser.java b/tika-core/src/test/java/org/apache/tika/parser/DummyParameterizedParser.java index c88c238e0b..ad4af030ef 100644 --- a/tika-core/src/test/java/org/apache/tika/parser/DummyParameterizedParser.java +++ b/tika-core/src/test/java/org/apache/tika/parser/DummyParameterizedParser.java 
@@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.parser; @@ -26,14 +24,12 @@ import java.net.URL; import java.util.HashSet; import java.util.Set; - -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.config.Field; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; /** * A test Parsers to test {@link Field} @@ -114,7 +110,7 @@ public Set getSupportedTypes(ParseContext context) { @Override public void parse(InputStream stream, ContentHandler handler, Metadata metadata, - ParseContext context) throws IOException, SAXException, TikaException { + ParseContext context) throws IOException, SAXException, TikaException { metadata.add("testparam", testParam); metadata.add("xshort", xshort + ""); diff --git a/tika-core/src/test/java/org/apache/tika/parser/DummyParser.java b/tika-core/src/test/java/org/apache/tika/parser/DummyParser.java index 9b1ffcc4e1..b909225b9d 100644 --- a/tika-core/src/test/java/org/apache/tika/parser/DummyParser.java +++ b/tika-core/src/test/java/org/apache/tika/parser/DummyParser.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser; @@ -21,14 +19,12 @@ import java.util.Map; import java.util.Map.Entry; import java.util.Set; - -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; /** * A Dummy Parser for use with unit tests. 
@@ -51,7 +47,7 @@ public Set getSupportedTypes(ParseContext context) { } public void parse(InputStream stream, ContentHandler handler, Metadata metadata, - ParseContext context) throws IOException, SAXException, TikaException { + ParseContext context) throws IOException, SAXException, TikaException { for (Entry m : this.metadata.entrySet()) { metadata.add(m.getKey(), m.getValue()); } diff --git a/tika-core/src/test/java/org/apache/tika/parser/InitializableParserTest.java b/tika-core/src/test/java/org/apache/tika/parser/InitializableParserTest.java index 9571ab2527..8bcc9e375d 100644 --- a/tika-core/src/test/java/org/apache/tika/parser/InitializableParserTest.java +++ b/tika-core/src/test/java/org/apache/tika/parser/InitializableParserTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser; @@ -20,13 +18,11 @@ import java.net.URL; import java.nio.charset.StandardCharsets; - -import org.junit.jupiter.api.Test; - import org.apache.tika.Tika; import org.apache.tika.config.TikaConfig; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; +import org.junit.jupiter.api.Test; public class InitializableParserTest { diff --git a/tika-core/src/test/java/org/apache/tika/parser/ParameterizedParserTest.java b/tika-core/src/test/java/org/apache/tika/parser/ParameterizedParserTest.java index 7ad198f521..6f47d5f7dc 100644 --- a/tika-core/src/test/java/org/apache/tika/parser/ParameterizedParserTest.java +++ b/tika-core/src/test/java/org/apache/tika/parser/ParameterizedParserTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. 
The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser; @@ -24,15 +22,13 @@ import java.net.URL; import java.util.HashMap; import java.util.Map; - -import org.junit.jupiter.api.Test; -import org.xml.sax.SAXException; - import org.apache.tika.Tika; import org.apache.tika.config.TikaConfig; import org.apache.tika.exception.TikaConfigException; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; +import org.junit.jupiter.api.Test; +import org.xml.sax.SAXException; public class ParameterizedParserTest { @@ -60,8 +56,8 @@ public class ParameterizedParserTest { public void testConfigurableParserTypes() throws Exception { Metadata md = getMetadata("TIKA-1986-parameterized.xml"); for (Map.Entry entry : expcted.entrySet()) { - assertEquals(entry.getValue(), - md.get(entry.getKey()), "mismatch for " + entry.getKey()); + assertEquals(entry.getValue(), md.get(entry.getKey()), + "mismatch for " + entry.getKey()); } } @@ -69,16 +65,16 @@ public void testConfigurableParserTypes() throws Exception { public void 
testConfigurableParserTypesDecorated() throws Exception { Metadata md = getMetadata("TIKA-1986-parameterized-decorated.xml"); for (Map.Entry entry : expcted.entrySet()) { - assertEquals(entry.getValue(), - md.get(entry.getKey()), "mismatch for " + entry.getKey()); + assertEquals(entry.getValue(), md.get(entry.getKey()), + "mismatch for " + entry.getKey()); } } @Test public void testSomeParams() throws Exception { - //test that a parameterized parser can read a config file - //with only some changes to the initial values + // test that a parameterized parser can read a config file + // with only some changes to the initial values Metadata md = getMetadata("TIKA-1986-some-parameters.xml"); assertEquals("-6.0", md.get("xdouble")); assertEquals("testparamval", md.get("testparam")); @@ -99,8 +95,8 @@ public void testBadType() throws Exception { }); } - //TODO later -- add a test for a parser that isn't configurable - //but that has params in the config file + // TODO later -- add a test for a parser that isn't configurable + // but that has params in the config file private Metadata getMetadata(String name) throws TikaException, IOException, SAXException { URL url = this.getClass().getResource("/org/apache/tika/config/" + name); diff --git a/tika-core/src/test/java/org/apache/tika/parser/ParserDecoratorTest.java b/tika-core/src/test/java/org/apache/tika/parser/ParserDecoratorTest.java index 141c058af8..8b7b339e4e 100644 --- a/tika-core/src/test/java/org/apache/tika/parser/ParserDecoratorTest.java +++ b/tika-core/src/test/java/org/apache/tika/parser/ParserDecoratorTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.parser; @@ -25,12 +23,10 @@ import java.util.HashMap; import java.util.HashSet; import java.util.Set; - -import org.junit.jupiter.api.Test; - import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; import org.apache.tika.sax.BodyContentHandler; +import org.junit.jupiter.api.Test; public class ParserDecoratorTest { @@ -54,8 +50,7 @@ public void withAndWithoutTypes() { assertTrue(types.contains(MediaType.TEXT_PLAIN), types.toString()); // With a parser with other types, still just the decorated type - p = ParserDecorator - .withTypes(new DummyParser(onlyOct, new HashMap<>(), ""), onlyTxt); + p = ParserDecorator.withTypes(new DummyParser(onlyOct, new HashMap<>(), ""), onlyTxt); types = p.getSupportedTypes(context); assertEquals(1, types.size()); assertTrue(types.contains(MediaType.TEXT_PLAIN), types.toString()); @@ -66,14 +61,12 @@ public void withAndWithoutTypes() { types = p.getSupportedTypes(context); assertEquals(0, types.size()); - p = ParserDecorator - .withoutTypes(new DummyParser(onlyOct, new HashMap<>(), ""), onlyTxt); + p = ParserDecorator.withoutTypes(new DummyParser(onlyOct, new HashMap<>(), ""), onlyTxt); types = p.getSupportedTypes(context); assertEquals(1, types.size()); assertTrue(types.contains(MediaType.OCTET_STREAM), types.toString()); - p = ParserDecorator - .withoutTypes(new DummyParser(both, new HashMap<>(), ""), onlyTxt); + p = ParserDecorator.withoutTypes(new DummyParser(both, new HashMap<>(), ""), onlyTxt); types = p.getSupportedTypes(context); assertEquals(1, types.size()); assertTrue(types.contains(MediaType.OCTET_STREAM), types.toString()); @@ -86,7 +79,7 @@ public void withAndWithoutTypes() { public void withFallback() throws Exception { Set onlyOct = Collections.singleton(MediaType.OCTET_STREAM); Set octAndText = - new HashSet<>(Arrays.asList(MediaType.OCTET_STREAM, MediaType.TEXT_PLAIN)); + new HashSet<>(Arrays.asList(MediaType.OCTET_STREAM, MediaType.TEXT_PLAIN)); ParseContext context 
= new ParseContext(); BodyContentHandler handler; @@ -97,8 +90,8 @@ public void withFallback() throws Exception { EmptyParser pNothing = new EmptyParser(); // Create a combination which will fail first - @SuppressWarnings("deprecation") Parser p = - ParserDecorator.withFallbacks(Arrays.asList(pFail, pWork), octAndText); + @SuppressWarnings("deprecation") + Parser p = ParserDecorator.withFallbacks(Arrays.asList(pFail, pWork), octAndText); // Will claim to support the types given, not those on the child parsers Set types = p.getSupportedTypes(context); @@ -109,7 +102,7 @@ public void withFallback() throws Exception { // Parsing will make it to the second one metadata = new Metadata(); handler = new BodyContentHandler(); - p.parse(new ByteArrayInputStream(new byte[]{0, 1, 2, 3, 4}), handler, metadata, context); + p.parse(new ByteArrayInputStream(new byte[] {0, 1, 2, 3, 4}), handler, metadata, context); assertEquals("Fell back!", handler.toString()); @@ -117,7 +110,7 @@ public void withFallback() throws Exception { p = ParserDecorator.withFallbacks(Arrays.asList(pNothing, pWork), octAndText); metadata = new Metadata(); handler = new BodyContentHandler(); - p.parse(new ByteArrayInputStream(new byte[]{0, 1, 2, 3, 4}), handler, metadata, context); + p.parse(new ByteArrayInputStream(new byte[] {0, 1, 2, 3, 4}), handler, metadata, context); assertEquals("", handler.toString()); } } diff --git a/tika-core/src/test/java/org/apache/tika/parser/RegexCaptureParserTest.java b/tika-core/src/test/java/org/apache/tika/parser/RegexCaptureParserTest.java index 13c5eada73..6dfd97e715 100644 --- a/tika-core/src/test/java/org/apache/tika/parser/RegexCaptureParserTest.java +++ b/tika-core/src/test/java/org/apache/tika/parser/RegexCaptureParserTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.parser; @@ -22,31 +20,26 @@ import java.nio.charset.StandardCharsets; import java.util.HashMap; import java.util.Map; - +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; import org.junit.jupiter.api.Test; import org.xml.sax.ContentHandler; import org.xml.sax.helpers.DefaultHandler; -import org.apache.tika.io.TikaInputStream; -import org.apache.tika.metadata.Metadata; - public class RegexCaptureParserTest { @Test public void testBasic() throws Exception { Metadata m = new Metadata(); ContentHandler contentHandler = new DefaultHandler(); - String output = "Something\n" + - "Title: the quick brown fox\n" + - "Author: jumped over\n" + - "Created: 10/20/2024"; + String output = "Something\n" + "Title: the quick brown fox\n" + "Author: jumped over\n" + + "Created: 10/20/2024"; RegexCaptureParser parser = new RegexCaptureParser(); Map regexes = new HashMap<>(); regexes.put("title", "^Title: ([^\r\n]+)"); parser.setCaptureMap(regexes); - try (InputStream stream = - TikaInputStream.get(output.getBytes(StandardCharsets.UTF_8))) { + try (InputStream stream = TikaInputStream.get(output.getBytes(StandardCharsets.UTF_8))) { parser.parse(stream, contentHandler, m, new ParseContext()); } assertEquals("the quick brown fox", m.get("title")); diff --git a/tika-core/src/test/java/org/apache/tika/parser/external2/ExternalParserTest.java b/tika-core/src/test/java/org/apache/tika/parser/external2/ExternalParserTest.java index d4c3899550..1d720bdd33 100644 --- a/tika-core/src/test/java/org/apache/tika/parser/external2/ExternalParserTest.java +++ b/tika-core/src/test/java/org/apache/tika/parser/external2/ExternalParserTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.parser.external2; @@ -22,11 +20,6 @@ import java.io.InputStream; import java.nio.charset.StandardCharsets; import java.util.List; - -import org.junit.jupiter.api.Test; -import org.xml.sax.ContentHandler; -import org.xml.sax.helpers.DefaultHandler; - import org.apache.tika.TikaTest; import org.apache.tika.config.TikaConfig; import org.apache.tika.io.TikaInputStream; @@ -36,14 +29,16 @@ import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.parser.RegexCaptureParser; +import org.junit.jupiter.api.Test; +import org.xml.sax.ContentHandler; +import org.xml.sax.helpers.DefaultHandler; public class ExternalParserTest extends TikaTest { @Test public void testConfigRegexCaptureParser() throws Exception { - assumeTrue(org.apache.tika.parser.external.ExternalParser.check(new String[]{ - "file", "--version" - })); + assumeTrue(org.apache.tika.parser.external.ExternalParser + .check(new String[] {"file", "--version"})); try (InputStream is = TikaConfig.class.getResourceAsStream("TIKA-3557.xml")) { TikaConfig config = new TikaConfig(is); @@ -56,12 +51,10 @@ public void testConfigRegexCaptureParser() throws Exception { Metadata m = new Metadata(); ContentHandler contentHandler = new DefaultHandler(); - String output = "Something\n" + - "Title: the quick brown fox\n" + - "Author: jumped over\n" + - "Created: 10/20/2024"; + String output = "Something\n" + "Title: the quick brown fox\n" + "Author: jumped over\n" + + "Created: 10/20/2024"; try (InputStream stream = - TikaInputStream.get(output.getBytes(StandardCharsets.UTF_8))) { + TikaInputStream.get(output.getBytes(StandardCharsets.UTF_8))) { outputParser.parse(stream, contentHandler, m, new ParseContext()); } assertEquals("the quick brown fox", m.get("title")); @@ -70,8 +63,10 @@ public void testConfigRegexCaptureParser() throws Exception { @Test public void testConfigBasic() throws Exception { - 
assumeTrue(org.apache.tika.parser.external.ExternalParser.check(new String[]{"file", "--version"})); - try (InputStream is = TikaConfig.class.getResourceAsStream("TIKA-3557-no-output-parser.xml")) { + assumeTrue(org.apache.tika.parser.external.ExternalParser + .check(new String[] {"file", "--version"})); + try (InputStream is = + TikaConfig.class.getResourceAsStream("TIKA-3557-no-output-parser.xml")) { TikaConfig config = new TikaConfig(is); CompositeParser p = (CompositeParser) config.getParser(); assertEquals(1, p.getAllComponentParsers().size()); @@ -84,13 +79,13 @@ public void testConfigBasic() throws Exception { @Test public void testExifTool() throws Exception { - assumeTrue(org.apache.tika.parser.external.ExternalParser.check(new String[]{"exiftool", - "-ver"})); + assumeTrue(org.apache.tika.parser.external.ExternalParser + .check(new String[] {"exiftool", "-ver"})); try (InputStream is = - TikaConfig.class.getResourceAsStream("TIKA-3557-exiftool-example.xml")) { + TikaConfig.class.getResourceAsStream("TIKA-3557-exiftool-example.xml")) { TikaConfig config = new TikaConfig(is); Parser p = new AutoDetectParser(config); - //this was the smallest pdf we had + // this was the smallest pdf we had List metadataList = getRecursiveMetadata("testOverlappingText.pdf", p); assertEquals(1, metadataList.size()); Metadata m = metadataList.get(0); diff --git a/tika-core/src/test/java/org/apache/tika/parser/mock/MockParser.java b/tika-core/src/test/java/org/apache/tika/parser/mock/MockParser.java index 84c9b7ab1f..94b86b9c39 100644 --- a/tika-core/src/test/java/org/apache/tika/parser/mock/MockParser.java +++ b/tika-core/src/test/java/org/apache/tika/parser/mock/MockParser.java @@ -1,24 +1,27 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.parser.mock; import static java.nio.charset.StandardCharsets.UTF_8; +import com.martensigwart.fakeload.FakeLoad; +import com.martensigwart.fakeload.FakeLoadBuilder; +import com.martensigwart.fakeload.FakeLoadExecutor; +import com.martensigwart.fakeload.FakeLoadExecutors; +import com.martensigwart.fakeload.MemoryUnit; import java.io.IOException; import java.io.InputStream; import java.io.PrintStream; @@ -38,20 +41,7 @@ import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; import javax.xml.parsers.DocumentBuilder; - -import com.martensigwart.fakeload.FakeLoad; -import com.martensigwart.fakeload.FakeLoadBuilder; -import com.martensigwart.fakeload.FakeLoadExecutor; -import com.martensigwart.fakeload.FakeLoadExecutors; -import com.martensigwart.fakeload.MemoryUnit; import org.apache.commons.io.input.CloseShieldInputStream; -import org.w3c.dom.Document; -import org.w3c.dom.NamedNodeMap; -import org.w3c.dom.Node; -import org.w3c.dom.NodeList; -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.exception.TikaException; import org.apache.tika.extractor.EmbeddedDocumentExtractor; import org.apache.tika.extractor.EmbeddedDocumentUtil; @@ -65,13 +55,18 @@ import org.apache.tika.sax.EmbeddedContentHandler; import org.apache.tika.sax.XHTMLContentHandler; import org.apache.tika.utils.XMLReaderUtils; +import org.w3c.dom.Document; +import org.w3c.dom.NamedNodeMap; +import org.w3c.dom.Node; +import org.w3c.dom.NodeList; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; /** - * This class enables mocking of parser behavior for use in testing - * wrappers and drivers of parsers. + * This class enables mocking of parser behavior for use in testing wrappers and drivers of parsers. *

- * See resources/test-documents/mock/example.xml in tika-parsers/test for the documentation - * of all the options for this MockParser. + * See resources/test-documents/mock/example.xml in tika-parsers/test for the documentation of all + * the options for this MockParser. *

* Tests for this class are in tika-parsers. *

@@ -115,7 +110,7 @@ public Set getSupportedTypes(ParseContext context) { @Override public void parse(InputStream stream, ContentHandler handler, Metadata metadata, - ParseContext context) throws IOException, SAXException, TikaException { + ParseContext context) throws IOException, SAXException, TikaException { if (Thread.currentThread().isInterrupted()) { throw new TikaException("interrupted", new InterruptedException()); } @@ -124,7 +119,7 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, DocumentBuilder docBuilder = XMLReaderUtils.getDocumentBuilder(context); doc = docBuilder.parse(CloseShieldInputStream.wrap(stream)); } catch (SAXException e) { - //to distinguish between SAX on read vs SAX while writing + // to distinguish between SAX on read vs SAX while writing throw new IOException(e); } Node root = doc.getDocumentElement(); @@ -138,8 +133,7 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, } private void executeAction(Node action, Metadata metadata, ParseContext context, - XHTMLContentHandler xhtml) - throws SAXException, IOException, TikaException { + XHTMLContentHandler xhtml) throws SAXException, IOException, TikaException { if (action.getNodeType() != 1) { return; @@ -183,21 +177,21 @@ private void parentMetadata(Node action, ParseContext context) { } private void fakeload(Node action) { - //https://github.com/msigwart/fakeload - //with this version of fakeload, you should only need one thread to hit - //the cpu targets; on Linux with Java 8 at least, two or more threads did - //not increase the overall CPU over a single thread + // https://github.com/msigwart/fakeload + // with this version of fakeload, you should only need one thread to hit + // the cpu targets; on Linux with Java 8 at least, two or more threads did + // not increase the overall CPU over a single thread int numThreads = 1; NamedNodeMap attrs = action.getAttributes(); if (attrs == null) { - throw new 
IllegalArgumentException("Must specify details...no attributes for " + - "fakeload?!"); + throw new IllegalArgumentException( + "Must specify details...no attributes for " + "fakeload?!"); } - if (attrs.getNamedItem("millis") == null || attrs.getNamedItem("cpu") == null || - attrs.getNamedItem("mb") == null) { - throw new IllegalArgumentException("must specify 'millis' (time to process), " + - "'cpu' (% cpu as an integer, e.g. 50% would be '50'), " + - "and 'mb' (megabytes as an integer)"); + if (attrs.getNamedItem("millis") == null || attrs.getNamedItem("cpu") == null + || attrs.getNamedItem("mb") == null) { + throw new IllegalArgumentException("must specify 'millis' (time to process), " + + "'cpu' (% cpu as an integer, e.g. 50% would be '50'), " + + "and 'mb' (megabytes as an integer)"); } Node n = attrs.getNamedItem("numThreads"); if (n != null) { @@ -209,12 +203,11 @@ private void fakeload(Node action) { ExecutorService executorService = Executors.newFixedThreadPool(numThreads); ExecutorCompletionService executorCompletionService = - new ExecutorCompletionService<>(executorService); + new ExecutorCompletionService<>(executorService); for (int i = 0; i < numThreads; i++) { executorCompletionService.submit(() -> { - FakeLoad fakeload = - new FakeLoadBuilder().lasting(millis, TimeUnit.MILLISECONDS) + FakeLoad fakeload = new FakeLoadBuilder().lasting(millis, TimeUnit.MILLISECONDS) .withCpu(cpu).withMemory(mb, MemoryUnit.MB).build(); FakeLoadExecutor executor = FakeLoadExecutors.newDefaultExecutor(); executor.execute(fakeload); @@ -244,7 +237,7 @@ private void throwIllegalChars() throws IOException { } private void handleEmbedded(Node action, XHTMLContentHandler handler, ParseContext context) - throws TikaException, SAXException, IOException { + throws TikaException, SAXException, IOException { String fileName = ""; String contentType = ""; NamedNodeMap attrs = action.getAttributes(); @@ -260,7 +253,8 @@ private void handleEmbedded(Node action, XHTMLContentHandler 
handler, ParseConte } String embeddedText = action.getTextContent(); - EmbeddedDocumentExtractor extractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context); + EmbeddedDocumentExtractor extractor = + EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context); Metadata m = new Metadata(); m.set(TikaCoreProperties.RESOURCE_NAME_KEY, fileName); @@ -339,7 +333,7 @@ private void hang(Node action) { Node pNode = attrs.getNamedItem("pulse_millis"); if (pNode == null) { throw new RuntimeException( - "Must specify attribute \"pulse_millis\" if the hang is \"heavy\""); + "Must specify attribute \"pulse_millis\" if the hang is \"heavy\""); } String pulseMillisString = mNode.getNodeValue(); try { @@ -364,7 +358,7 @@ private void throwIt(Node action) throws IOException, SAXException, TikaExceptio private void metadata(Node action, Metadata metadata) { NamedNodeMap attrs = action.getAttributes(); - //throws npe unless there is a name + // throws npe unless there is a name String name = attrs.getNamedItem("name").getNodeValue(); String value = action.getTextContent(); Node actionType = attrs.getNamedItem("action"); @@ -401,7 +395,7 @@ protected void write(Node action, XHTMLContentHandler xhtml) throws SAXException private void throwIt(String className, String msg) - throws IOException, SAXException, TikaException { + throws IOException, SAXException, TikaException { Throwable t = null; if (msg == null || msg.equals("")) { try { @@ -429,7 +423,7 @@ private void throwIt(String className, String msg) } else if (t instanceof RuntimeException) { throw (RuntimeException) t; } else { - //wrap the throwable in a RuntimeException + // wrap the throwable in a RuntimeException throw new RuntimeException(t); } } @@ -444,11 +438,11 @@ private void kabOOM() { } private void hangHeavy(long maxMillis, long pulseCheckMillis, boolean interruptible) { - //do some heavy computation and occasionally check for - //whether time has exceeded maxMillis (see TIKA-1132 for inspiration) - //or 
whether the thread was interrupted. - //By creating a new Date in the inner loop, we're also intentionally - //triggering the gc most likely. + // do some heavy computation and occasionally check for + // whether time has exceeded maxMillis (see TIKA-1132 for inspiration) + // or whether the thread was interrupted. + // By creating a new Date in the inner loop, we're also intentionally + // triggering the gc most likely. long start = new Date().getTime(); long lastChecked = start; while (true) { diff --git a/tika-core/src/test/java/org/apache/tika/parser/mock/MockParserFactory.java b/tika-core/src/test/java/org/apache/tika/parser/mock/MockParserFactory.java index c7716946f5..5c30a25ded 100644 --- a/tika-core/src/test/java/org/apache/tika/parser/mock/MockParserFactory.java +++ b/tika-core/src/test/java/org/apache/tika/parser/mock/MockParserFactory.java @@ -1,24 +1,21 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.mock; import java.io.IOException; import java.util.Map; - import org.apache.tika.exception.TikaException; import org.apache.tika.parser.Parser; import org.apache.tika.parser.ParserFactory; diff --git a/tika-core/src/test/java/org/apache/tika/parser/mock/MockParserTest.java b/tika-core/src/test/java/org/apache/tika/parser/mock/MockParserTest.java index 1902b08d8c..45924b6791 100644 --- a/tika-core/src/test/java/org/apache/tika/parser/mock/MockParserTest.java +++ b/tika-core/src/test/java/org/apache/tika/parser/mock/MockParserTest.java @@ -1,41 +1,36 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. 
See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.parser.mock; import java.util.List; - -import org.junit.jupiter.api.Test; - import org.apache.tika.TikaTest; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; +import org.junit.jupiter.api.Test; public class MockParserTest extends TikaTest { @Test public void testFakeload() throws Exception { - //just make sure there aren't any exceptions + // just make sure there aren't any exceptions getRecursiveMetadata("mock_fakeload.xml"); } @Test public void testTimes() throws Exception { List metadataList = getRecursiveMetadata("mock_times.xml"); - assertContainsCount("hello", - metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT), 30); + assertContainsCount("hello", metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT), 30); } } diff --git a/tika-core/src/test/java/org/apache/tika/parser/mock/VowelParser.java b/tika-core/src/test/java/org/apache/tika/parser/mock/VowelParser.java index 61a0473608..f328f9f70d 100644 --- a/tika-core/src/test/java/org/apache/tika/parser/mock/VowelParser.java +++ b/tika-core/src/test/java/org/apache/tika/parser/mock/VowelParser.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.mock; @@ -20,14 +18,12 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; - +import org.apache.tika.config.Field; +import org.apache.tika.sax.XHTMLContentHandler; import org.w3c.dom.NamedNodeMap; import org.w3c.dom.Node; import org.xml.sax.SAXException; -import org.apache.tika.config.Field; -import org.apache.tika.sax.XHTMLContentHandler; - /** * only parses vowels as specified in "vowel" field. */ diff --git a/tika-core/src/test/java/org/apache/tika/parser/multiple/MultipleParserTest.java b/tika-core/src/test/java/org/apache/tika/parser/multiple/MultipleParserTest.java index 9462d0cb2e..70eec12e8d 100644 --- a/tika-core/src/test/java/org/apache/tika/parser/multiple/MultipleParserTest.java +++ b/tika-core/src/test/java/org/apache/tika/parser/multiple/MultipleParserTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.parser.multiple; @@ -27,9 +25,6 @@ import java.util.HashSet; import java.util.Map; import java.util.Set; - -import org.junit.jupiter.api.Test; - import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.mime.MediaType; @@ -41,11 +36,12 @@ import org.apache.tika.parser.multiple.AbstractMultipleParser.MetadataPolicy; import org.apache.tika.sax.BodyContentHandler; import org.apache.tika.utils.ParserUtils; +import org.junit.jupiter.api.Test; public class MultipleParserTest { /** - * Tests how {@link AbstractMultipleParser} works out which - * mime types to offer, based on the types of the parsers + * Tests how {@link AbstractMultipleParser} works out which mime types to offer, based on the + * types of the parsers */ @Test public void testMimeTypeSupported() { @@ -53,7 +49,7 @@ public void testMimeTypeSupported() { // Some media types Set onlyOct = Collections.singleton(MediaType.OCTET_STREAM); Set octAndText = - new HashSet<>(Arrays.asList(MediaType.OCTET_STREAM, MediaType.TEXT_PLAIN)); + new HashSet<>(Arrays.asList(MediaType.OCTET_STREAM, MediaType.TEXT_PLAIN)); // TODO One with a subtype } @@ -73,8 +69,7 @@ public void testFallback() throws Exception { // Some parsers ErrorParser pFail = new ErrorParser(); - DummyParser pContent = - new DummyParser(onlyOct, new HashMap<>(), "Fell back!"); + DummyParser pContent = new DummyParser(onlyOct, new HashMap<>(), "Fell back!"); EmptyParser pNothing = new EmptyParser(); @@ -83,7 +78,7 @@ public void testFallback() throws Exception { metadata = new Metadata(); handler = new BodyContentHandler(); - p.parse(new ByteArrayInputStream(new byte[]{0, 1, 2, 3, 4}), handler, metadata, context); + p.parse(new ByteArrayInputStream(new byte[] {0, 1, 2, 3, 4}), handler, metadata, context); assertEquals("Fell back!", handler.toString()); usedParsers = metadata.getValues(TikaCoreProperties.TIKA_PARSED_BY); @@ -96,7 +91,7 @@ public void testFallback() throws 
Exception { metadata = new Metadata(); handler = new BodyContentHandler(); - p.parse(new ByteArrayInputStream(new byte[]{0, 1, 2, 3, 4}), handler, metadata, context); + p.parse(new ByteArrayInputStream(new byte[] {0, 1, 2, 3, 4}), handler, metadata, context); assertEquals("Fell back!", handler.toString()); usedParsers = metadata.getValues(TikaCoreProperties.TIKA_PARSED_BY); @@ -115,7 +110,7 @@ public void testFallback() throws Exception { metadata = new Metadata(); handler = new BodyContentHandler(); - p.parse(new ByteArrayInputStream(new byte[]{0, 1, 2, 3, 4}), handler, metadata, context); + p.parse(new ByteArrayInputStream(new byte[] {0, 1, 2, 3, 4}), handler, metadata, context); assertEquals("Fell back!", handler.toString()); usedParsers = metadata.getValues(TikaCoreProperties.TIKA_PARSED_BY); @@ -161,7 +156,7 @@ public void testSupplemental() throws Exception { new SupplementingParser(null, MetadataPolicy.DISCARD_ALL); fail("Discard shouldn't be supported"); } catch (IllegalArgumentException e) { - //swallow + // swallow } @@ -170,7 +165,7 @@ public void testSupplemental() throws Exception { metadata = new Metadata(); handler = new BodyContentHandler(); - p.parse(new ByteArrayInputStream(new byte[]{0, 1, 2, 3, 4}), handler, metadata, context); + p.parse(new ByteArrayInputStream(new byte[] {0, 1, 2, 3, 4}), handler, metadata, context); assertEquals("Fell back 1!", handler.toString()); assertEquals("Test1", metadata.get("T1")); @@ -184,11 +179,11 @@ public void testSupplemental() throws Exception { // Check the First, Last and All policies: // First Wins p = new SupplementingParser(null, MetadataPolicy.FIRST_WINS, pFail, pContent1, pContent2, - pNothing); + pNothing); metadata = new Metadata(); handler = new BodyContentHandler(); - p.parse(new ByteArrayInputStream(new byte[]{0, 1, 2, 3, 4}), handler, metadata, context); + p.parse(new ByteArrayInputStream(new byte[] {0, 1, 2, 3, 4}), handler, metadata, context); assertEquals("Fell back 1!Fell back 2!", 
handler.toString()); assertEquals("Test1", metadata.get("T1")); @@ -204,11 +199,11 @@ public void testSupplemental() throws Exception { // Last Wins p = new SupplementingParser(null, MetadataPolicy.LAST_WINS, pFail, pContent1, pContent2, - pNothing); + pNothing); metadata = new Metadata(); handler = new BodyContentHandler(); - p.parse(new ByteArrayInputStream(new byte[]{0, 1, 2, 3, 4}), handler, metadata, context); + p.parse(new ByteArrayInputStream(new byte[] {0, 1, 2, 3, 4}), handler, metadata, context); assertEquals("Fell back 1!Fell back 2!", handler.toString()); assertEquals("Test1", metadata.get("T1")); @@ -224,11 +219,11 @@ public void testSupplemental() throws Exception { // Merge p = new SupplementingParser(null, MetadataPolicy.KEEP_ALL, pFail, pContent1, pContent2, - pNothing); + pNothing); metadata = new Metadata(); handler = new BodyContentHandler(); - p.parse(new ByteArrayInputStream(new byte[]{0, 1, 2, 3, 4}), handler, metadata, context); + p.parse(new ByteArrayInputStream(new byte[] {0, 1, 2, 3, 4}), handler, metadata, context); assertEquals("Fell back 1!Fell back 2!", handler.toString()); assertEquals("Test1", metadata.get("T1")); diff --git a/tika-core/src/test/java/org/apache/tika/sax/BasicContentHandlerFactoryTest.java b/tika-core/src/test/java/org/apache/tika/sax/BasicContentHandlerFactoryTest.java index 111d2ea3cf..1ea7059d4e 100644 --- a/tika-core/src/test/java/org/apache/tika/sax/BasicContentHandlerFactoryTest.java +++ b/tika-core/src/test/java/org/apache/tika/sax/BasicContentHandlerFactoryTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.sax; @@ -26,36 +24,34 @@ import java.io.InputStream; import java.io.UnsupportedEncodingException; import java.util.Set; - -import org.junit.jupiter.api.Test; -import org.xml.sax.Attributes; -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; -import org.xml.sax.helpers.AttributesImpl; -import org.xml.sax.helpers.DefaultHandler; - import org.apache.tika.exception.TikaException; import org.apache.tika.exception.WriteLimitReachedException; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; +import org.junit.jupiter.api.Test; +import org.xml.sax.Attributes; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.AttributesImpl; +import org.xml.sax.helpers.DefaultHandler; /** * Test cases for the {@link org.apache.tika.sax.BodyContentHandler} class. */ public class BasicContentHandlerFactoryTest { - //default max char len (at least in WriteOutContentHandler is 100k) + // default max char len (at least in WriteOutContentHandler is 100k) private static final int OVER_DEFAULT = 120000; - //copied from TikaTest in tika-parsers package + // copied from TikaTest in tika-parsers package public static void assertNotContains(String needle, String haystack) { assertFalse(haystack.contains(needle), needle + " found in:\n" + haystack); } public static void assertNotContains(String needle, byte[] hayStack) - throws UnsupportedEncodingException { + throws UnsupportedEncodingException { assertNotContains(needle, new String(hayStack, UTF_8)); } @@ -64,25 +60,24 @@ public static void assertContains(String needle, String haystack) { } public static void assertContains(String needle, byte[] hayStack) - throws UnsupportedEncodingException { + throws UnsupportedEncodingException { assertContains(needle, new String(hayStack, UTF_8)); } @Test public void testIgnore() throws Exception { Parser p = 
new MockParser(OVER_DEFAULT); - ContentHandler handler = - new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1) - .getNewContentHandler(); + ContentHandler handler = new BasicContentHandlerFactory( + BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1).getNewContentHandler(); assertTrue(handler instanceof DefaultHandler); p.parse(null, handler, null, null); - //unfortunatley, the DefaultHandler does not return "", + // unfortunatley, the DefaultHandler does not return "", assertContains("org.xml.sax.helpers.DefaultHandler", handler.toString()); - //tests that no write limit exception is thrown + // tests that no write limit exception is thrown p = new MockParser(100); handler = new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, 5) - .getNewContentHandler(); + .getNewContentHandler(); assertTrue(handler instanceof DefaultHandler); p.parse(null, handler, null, null); assertContains("org.xml.sax.helpers.DefaultHandler", handler.toString()); @@ -102,7 +97,7 @@ public void testText() throws Exception { assertNotContains(" 110000); - //now test write limit + // now test write limit p = new MockParser(10); handler = new BasicContentHandlerFactory(type, 5).getNewContentHandler(); assertTrue(handler instanceof WriteOutContentHandler); @@ -111,7 +106,7 @@ public void testText() throws Exception { assertContains("This ", extracted); assertNotContains("aaaa", extracted); - //now test outputstream call + // now test outputstream call p = new MockParser(OVER_DEFAULT); ByteArrayOutputStream os = new ByteArrayOutputStream(); handler = new BasicContentHandlerFactory(type, -1).getNewContentHandler(os, UTF_8); @@ -128,8 +123,8 @@ public void testText() throws Exception { handler = new BasicContentHandlerFactory(type, 5).getNewContentHandler(os, UTF_8); assertTrue(handler instanceof WriteOutContentHandler); assertWriteLimitReached(p, (WriteOutContentHandler) handler); - //When writing to an OutputStream and a write limit is 
reached, - //currently, nothing is written. + // When writing to an OutputStream and a write limit is reached, + // currently, nothing is written. assertEquals(0, os.toByteArray().length); } @@ -146,7 +141,7 @@ public void testHTML() throws Exception { assertContains("aaaaaaaaaa", extracted); assertTrue(extracted.length() > 110000); - //now test write limit + // now test write limit p = new MockParser(10); handler = new BasicContentHandlerFactory(type, 5).getNewContentHandler(); assertTrue(handler instanceof WriteOutContentHandler); @@ -155,7 +150,7 @@ public void testHTML() throws Exception { assertContains("This ", extracted); assertNotContains("aaaa", extracted); - //now test outputstream call + // now test outputstream call p = new MockParser(OVER_DEFAULT); ByteArrayOutputStream os = new ByteArrayOutputStream(); handler = new BasicContentHandlerFactory(type, -1).getNewContentHandler(os, UTF_8); @@ -189,7 +184,7 @@ public void testXML() throws Exception { assertContains("aaaaaaaaaa", extracted); assertTrue(handler.toString().length() > 110000); - //now test write limit + // now test write limit p = new MockParser(10); handler = new BasicContentHandlerFactory(type, 5).getNewContentHandler(); assertTrue(handler instanceof WriteOutContentHandler); @@ -198,7 +193,7 @@ public void testXML() throws Exception { assertContains("This ", extracted); assertNotContains("aaaa", extracted); - //now test outputstream call + // now test outputstream call p = new MockParser(OVER_DEFAULT); ByteArrayOutputStream os = new ByteArrayOutputStream(); handler = new BasicContentHandlerFactory(type, -1).getNewContentHandler(os, UTF_8); @@ -234,7 +229,7 @@ public void testBody() throws Exception { assertContains("aaaaaaaaaa", extracted); assertTrue(extracted.length() > 110000); - //now test write limit + // now test write limit p = new MockParser(10); handler = new BasicContentHandlerFactory(type, 5).getNewContentHandler(); assertTrue(handler instanceof BodyContentHandler); @@ -243,7 
+238,7 @@ public void testBody() throws Exception { assertNotContains("This ", extracted); assertContains("aaaa", extracted); - //now test outputstream call + // now test outputstream call p = new MockParser(OVER_DEFAULT); ByteArrayOutputStream os = new ByteArrayOutputStream(); handler = new BasicContentHandlerFactory(type, -1).getNewContentHandler(os, UTF_8); @@ -264,7 +259,7 @@ public void testBody() throws Exception { } private void assertWriteLimitReached(Parser p, WriteOutContentHandler handler) - throws Exception { + throws Exception { boolean wlr = false; try { p.parse(null, handler, null, null); @@ -277,13 +272,13 @@ private void assertWriteLimitReached(Parser p, WriteOutContentHandler handler) assertTrue(wlr, "WriteLimitReached"); } - //TODO: is there a better way than to repeat this with diff signature? + // TODO: is there a better way than to repeat this with diff signature? private void assertWriteLimitReached(Parser p, BodyContentHandler handler) throws Exception { boolean wlr = false; try { p.parse(null, handler, null, null); } catch (SAXException e) { - if (! 
WriteLimitReachedException.isWriteLimitReached(e)) { + if (!WriteLimitReachedException.isWriteLimitReached(e)) { throw e; } @@ -292,8 +287,8 @@ private void assertWriteLimitReached(Parser p, BodyContentHandler handler) throw assertTrue(wlr, "WriteLimitReached"); } - //Simple mockparser that writes a title - //and charsToWrite number of 'a' + // Simple mockparser that writes a title + // and charsToWrite number of 'a' private static class MockParser implements Parser { private final String XHTML = "http://www.w3.org/1999/xhtml"; private final Attributes EMPTY_ATTRIBUTES = new AttributesImpl(); @@ -312,7 +307,7 @@ public Set getSupportedTypes(ParseContext context) { @Override public void parse(InputStream stream, ContentHandler handler, Metadata metadata, - ParseContext context) throws IOException, SAXException, TikaException { + ParseContext context) throws IOException, SAXException, TikaException { handler.startDocument(); handler.startPrefixMapping("", XHTML); handler.startElement(XHTML, "html", "html", EMPTY_ATTRIBUTES); diff --git a/tika-core/src/test/java/org/apache/tika/sax/BodyContentHandlerTest.java b/tika-core/src/test/java/org/apache/tika/sax/BodyContentHandlerTest.java index 19bf853783..a72da0ae85 100644 --- a/tika-core/src/test/java/org/apache/tika/sax/BodyContentHandlerTest.java +++ b/tika-core/src/test/java/org/apache/tika/sax/BodyContentHandlerTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. 
See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.sax; @@ -23,15 +21,13 @@ import java.io.InputStream; import java.io.OutputStream; import java.io.OutputStreamWriter; - -import org.junit.jupiter.api.Test; - import org.apache.tika.TikaTest; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.parser.mock.MockParser; +import org.junit.jupiter.api.Test; /** * Test cases for the {@link BodyContentHandler} class. @@ -39,8 +35,8 @@ public class BodyContentHandlerTest extends TikaTest { /** - * Test that the conversion to an {@link OutputStream} doesn't leave - * characters unflushed in an internal buffer. + * Test that the conversion to an {@link OutputStream} doesn't leave characters unflushed in an + * internal buffer. 
* * @see TIKA-179 */ @@ -48,9 +44,8 @@ public class BodyContentHandlerTest extends TikaTest { public void testOutputStream() throws Exception { ByteArrayOutputStream buffer = new ByteArrayOutputStream(); - XHTMLContentHandler xhtml = - new XHTMLContentHandler(new BodyContentHandler( - new OutputStreamWriter(buffer, UTF_8)), + XHTMLContentHandler xhtml = new XHTMLContentHandler( + new BodyContentHandler(new OutputStreamWriter(buffer, UTF_8)), new Metadata()); xhtml.startDocument(); xhtml.element("p", "Test text"); @@ -61,7 +56,7 @@ public void testOutputStream() throws Exception { @Test public void testLimit() throws Exception { - //TIKA-2668 - java 11-ea + // TIKA-2668 - java 11-ea Parser p = new MockParser(); WriteOutContentHandler handler = new WriteOutContentHandler(15); Metadata metadata = new Metadata(); diff --git a/tika-core/src/test/java/org/apache/tika/sax/CustomErrorHandlerTest.java b/tika-core/src/test/java/org/apache/tika/sax/CustomErrorHandlerTest.java index 88643147f5..e1abba182d 100644 --- a/tika-core/src/test/java/org/apache/tika/sax/CustomErrorHandlerTest.java +++ b/tika-core/src/test/java/org/apache/tika/sax/CustomErrorHandlerTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.sax; @@ -23,19 +21,17 @@ import java.io.OutputStream; import java.nio.charset.StandardCharsets; import javax.xml.parsers.ParserConfigurationException; - import org.apache.commons.io.output.ByteArrayOutputStream; +import org.apache.tika.TikaTest; +import org.apache.tika.exception.TikaException; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.utils.XMLReaderUtils; import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; import org.xml.sax.SAXException; -import org.apache.tika.TikaTest; -import org.apache.tika.exception.TikaException; -import org.apache.tika.parser.ParseContext; -import org.apache.tika.utils.XMLReaderUtils; - /** * Test that validates a custom {@link ContentHandlerDecorator} can handle errors during XML parsing * @@ -46,12 +42,13 @@ public class CustomErrorHandlerTest extends TikaTest { private static String DEFAULT_SAX_PARSER_FACTORY; private static String SAX_PARSER_FACTORY_KEY = "javax.xml.parsers.SAXParserFactory"; + @BeforeAll public static void setUp() throws TikaException { DEFAULT_SAX_PARSER_FACTORY = 
System.getProperty(SAX_PARSER_FACTORY_KEY); System.setProperty(SAX_PARSER_FACTORY_KEY, - "org.apache.tika.sax.ErrorResistantSAXParserFactory"); - //forces re-initialization + "org.apache.tika.sax.ErrorResistantSAXParserFactory"); + // forces re-initialization XMLReaderUtils.setPoolSize(10); } @@ -62,15 +59,16 @@ public static void tearDown() throws TikaException { } else { System.setProperty(SAX_PARSER_FACTORY_KEY, DEFAULT_SAX_PARSER_FACTORY); } - //forces re-initialization + // forces re-initialization XMLReaderUtils.setPoolSize(10); } + private void extractXml(InputStream blobStream, OutputStream textStream) - throws IOException, SAXException, TikaException, ParserConfigurationException { + throws IOException, SAXException, TikaException, ParserConfigurationException { try { ToXMLContentHandler contentHandler = - new ToXMLContentHandler(textStream, StandardCharsets.UTF_8.toString()); + new ToXMLContentHandler(textStream, StandardCharsets.UTF_8.toString()); NonValidatingContentHandler handler = new NonValidatingContentHandler(contentHandler); XMLReaderUtils.parseSAX(blobStream, handler, new ParseContext()); } finally { @@ -79,7 +77,7 @@ private void extractXml(InputStream blobStream, OutputStream textStream) } private String extractTestData(String name) - throws IOException, SAXException, TikaException, ParserConfigurationException { + throws IOException, SAXException, TikaException, ParserConfigurationException { try (InputStream is = getResourceAsStream("/test-documents/" + name)) { ByteArrayOutputStream out = new ByteArrayOutputStream(); extractXml(is, out); @@ -92,7 +90,8 @@ void testUndeclaredEntityXML() throws Exception { try { String content = extractTestData("undeclared_entity.xml"); assertContains("START", content); - //This assertion passes only if custom error handler is called to handle fatal exception + // This assertion passes only if custom error handler is called to handle fatal + // exception assertContains("END", content); } catch (SAXException 
e) { fail("Exception returned from parser and not handled in error handler " + e); diff --git a/tika-core/src/test/java/org/apache/tika/sax/LinkContentHandlerTest.java b/tika-core/src/test/java/org/apache/tika/sax/LinkContentHandlerTest.java index 3ad297811b..bc030011a1 100644 --- a/tika-core/src/test/java/org/apache/tika/sax/LinkContentHandlerTest.java +++ b/tika-core/src/test/java/org/apache/tika/sax/LinkContentHandlerTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. 
See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.sax; @@ -34,9 +32,8 @@ public void testWhitespaceCollapsing() throws Exception { LinkContentHandler linkContentHandler = new LinkContentHandler(true); linkContentHandler.startElement(XHTMLContentHandler.XHTML, "a", "", new AttributesImpl()); - char[] anchorText = - {'\n', 'N', 'o', ' ', 'w', 'h', 'i', 't', 'e', '\n', '\t', '\t', 's', 'p', 'a', 'c', - 'e'}; + char[] anchorText = {'\n', 'N', 'o', ' ', 'w', 'h', 'i', 't', 'e', '\n', '\t', '\t', 's', + 'p', 'a', 'c', 'e'}; linkContentHandler.characters(anchorText, 1, anchorText.length - 1); linkContentHandler.endElement(XHTMLContentHandler.XHTML, "a", ""); @@ -73,7 +70,7 @@ public void testLinkTag() throws Exception { linkContentHandler.endElement(XHTMLContentHandler.XHTML, "link", ""); assertEquals("http://tika.apache.org/stylesheet.css", - linkContentHandler.getLinks().get(0).getUri()); + linkContentHandler.getLinks().get(0).getUri()); assertEquals("stylesheet", linkContentHandler.getLinks().get(0).getRel()); } @@ -91,7 +88,7 @@ public void testIframeTag() throws Exception { linkContentHandler.endElement(XHTMLContentHandler.XHTML, "iframe", ""); assertEquals("http://tika.apache.org/iframe.html", - linkContentHandler.getLinks().get(0).getUri()); + linkContentHandler.getLinks().get(0).getUri()); } /** @@ -108,7 +105,7 @@ public void testScriptTag() throws Exception { linkContentHandler.endElement(XHTMLContentHandler.XHTML, "script", ""); assertEquals("http://tika.apache.org/script.js", - linkContentHandler.getLinks().get(0).getUri()); + linkContentHandler.getLinks().get(0).getUri()); } /** diff --git a/tika-core/src/test/java/org/apache/tika/sax/NonValidatingContentHandler.java b/tika-core/src/test/java/org/apache/tika/sax/NonValidatingContentHandler.java index d903a4d632..2858e849b8 100644 --- a/tika-core/src/test/java/org/apache/tika/sax/NonValidatingContentHandler.java +++ 
b/tika-core/src/test/java/org/apache/tika/sax/NonValidatingContentHandler.java @@ -1,24 +1,21 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.sax; import java.io.IOException; import java.io.InputStream; - import org.xml.sax.ContentHandler; import org.xml.sax.InputSource; import org.xml.sax.SAXException; @@ -46,22 +43,22 @@ public NonValidatingContentHandler(ContentHandler handler) { @Override public void warning(SAXParseException e) throws SAXException { - //NO-OP + // NO-OP } @Override public void error(SAXParseException e) throws SAXException { - //NO-OP + // NO-OP } @Override public void fatalError(SAXParseException e) throws SAXException { - //NO-OP + // NO-OP } @Override public InputSource resolveEntity(String publicId, String systemId) - throws SAXException, IOException { + throws SAXException, IOException { return new InputSource(new ClosedInputStream()); } diff --git a/tika-core/src/test/java/org/apache/tika/sax/OfflineContentHandlerTest.java b/tika-core/src/test/java/org/apache/tika/sax/OfflineContentHandlerTest.java index 6c7e94513e..107dccdb3e 100644 --- a/tika-core/src/test/java/org/apache/tika/sax/OfflineContentHandlerTest.java +++ b/tika-core/src/test/java/org/apache/tika/sax/OfflineContentHandlerTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.sax; @@ -22,7 +20,6 @@ import java.net.ConnectException; import javax.xml.parsers.SAXParser; import javax.xml.parsers.SAXParserFactory; - import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.xml.sax.InputSource; @@ -55,9 +52,8 @@ public void testExternalDTD() throws Exception { @Test public void testExternalEntity() throws Exception { - String xml = - "" + - " ]>&bar;"; + String xml = "" + + " ]>&bar;"; try { parser.parse(new InputSource(new StringReader(xml)), offline); } catch (ConnectException e) { diff --git a/tika-core/src/test/java/org/apache/tika/sax/RichTextContentHandlerTest.java b/tika-core/src/test/java/org/apache/tika/sax/RichTextContentHandlerTest.java index 47918a9b56..2f51199b78 100644 --- a/tika-core/src/test/java/org/apache/tika/sax/RichTextContentHandlerTest.java +++ b/tika-core/src/test/java/org/apache/tika/sax/RichTextContentHandlerTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.sax; @@ -21,12 +19,10 @@ import java.io.ByteArrayOutputStream; import java.io.OutputStreamWriter; - +import org.apache.tika.metadata.Metadata; import org.junit.jupiter.api.Test; import org.xml.sax.helpers.AttributesImpl; -import org.apache.tika.metadata.Metadata; - /** * Test cases for the {@link RichTextContentHandler} class. 
*/ @@ -39,8 +35,9 @@ public class RichTextContentHandlerTest { public void aTagTest() throws Exception { ByteArrayOutputStream buffer = new ByteArrayOutputStream(); - XHTMLContentHandler xhtml = new XHTMLContentHandler(new RichTextContentHandler( - new OutputStreamWriter(buffer, UTF_8)), new Metadata()); + XHTMLContentHandler xhtml = new XHTMLContentHandler( + new RichTextContentHandler(new OutputStreamWriter(buffer, UTF_8)), + new Metadata()); xhtml.startDocument(); AttributesImpl attributes = new AttributesImpl(); attributes.addAttribute("", "", "name", "", "value"); @@ -57,8 +54,9 @@ public void aTagTest() throws Exception { public void imgTagTest() throws Exception { ByteArrayOutputStream buffer = new ByteArrayOutputStream(); - XHTMLContentHandler xhtml = new XHTMLContentHandler(new RichTextContentHandler( - new OutputStreamWriter(buffer, UTF_8)), new Metadata()); + XHTMLContentHandler xhtml = new XHTMLContentHandler( + new RichTextContentHandler(new OutputStreamWriter(buffer, UTF_8)), + new Metadata()); xhtml.startDocument(); AttributesImpl attributes = new AttributesImpl(); attributes.addAttribute("", "", "alt", "", "value"); diff --git a/tika-core/src/test/java/org/apache/tika/sax/SafeContentHandlerTest.java b/tika-core/src/test/java/org/apache/tika/sax/SafeContentHandlerTest.java index 80d1bfd9a1..58195c2a23 100644 --- a/tika-core/src/test/java/org/apache/tika/sax/SafeContentHandlerTest.java +++ b/tika-core/src/test/java/org/apache/tika/sax/SafeContentHandlerTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.sax; diff --git a/tika-core/src/test/java/org/apache/tika/sax/SecureContentHandlerTest.java b/tika-core/src/test/java/org/apache/tika/sax/SecureContentHandlerTest.java index 421e6c2ecb..555bbb3564 100644 --- a/tika-core/src/test/java/org/apache/tika/sax/SecureContentHandlerTest.java +++ b/tika-core/src/test/java/org/apache/tika/sax/SecureContentHandlerTest.java @@ -1,35 +1,31 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.sax; import static org.junit.jupiter.api.Assertions.fail; import java.io.IOException; - import org.apache.commons.io.input.NullInputStream; +import org.apache.tika.exception.TikaException; +import org.apache.tika.io.TikaInputStream; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.xml.sax.SAXException; import org.xml.sax.helpers.AttributesImpl; import org.xml.sax.helpers.DefaultHandler; -import org.apache.tika.exception.TikaException; -import org.apache.tika.io.TikaInputStream; - /** * Tests for the {@link SecureContentHandler} class. 
*/ @@ -50,7 +46,7 @@ public void setUp() { @Test public void testZeroCharactersPerByte() throws IOException { try { - char[] ch = new char[]{'x'}; + char[] ch = new char[] {'x'}; for (int i = 0; i < MANY_BYTES; i++) { stream.read(); } diff --git a/tika-core/src/test/java/org/apache/tika/sax/SerializerTest.java b/tika-core/src/test/java/org/apache/tika/sax/SerializerTest.java index 361b88dc76..0c1796b62a 100755 --- a/tika-core/src/test/java/org/apache/tika/sax/SerializerTest.java +++ b/tika-core/src/test/java/org/apache/tika/sax/SerializerTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
+ * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.sax; @@ -41,15 +39,15 @@ public void testToTextContentHandler() throws Exception { public void testToXMLContentHandler() throws Exception { assertStartDocument("", new ToXMLContentHandler()); assertStartDocument("\n", - new ToXMLContentHandler("UTF-8")); + new ToXMLContentHandler("UTF-8")); assertCharacters("content", new ToXMLContentHandler()); assertCharacterEscaping("<&\">", new ToXMLContentHandler()); assertIgnorableWhitespace(" \t\r\n", new ToXMLContentHandler()); assertEmptyElement("
", new ToXMLContentHandler()); assertEmptyElementWithAttributes("", - new ToXMLContentHandler()); + new ToXMLContentHandler()); assertEmptyElementWithAttributeEscaping("

", - new ToXMLContentHandler()); + new ToXMLContentHandler()); assertElement("

content

", new ToXMLContentHandler()); assertElementWithAttributes("

content

", new ToXMLContentHandler()); } @@ -62,9 +60,9 @@ public void testToHTMLContentHandler() throws Exception { assertIgnorableWhitespace(" \t\r\n", new ToHTMLContentHandler()); assertEmptyElement("
", new ToHTMLContentHandler()); assertEmptyElementWithAttributes("", - new ToHTMLContentHandler()); + new ToHTMLContentHandler()); assertEmptyElementWithAttributeEscaping("

", - new ToHTMLContentHandler()); + new ToHTMLContentHandler()); assertElement("

content

", new ToHTMLContentHandler()); assertElementWithAttributes("

content

", new ToHTMLContentHandler()); } @@ -85,7 +83,7 @@ private void assertCharacterEscaping(String expected, ContentHandler handler) th } private void assertIgnorableWhitespace(String expected, ContentHandler handler) - throws Exception { + throws Exception { handler.ignorableWhitespace(" \t\r\n".toCharArray(), 0, 4); assertEquals(expected, handler.toString()); } @@ -98,7 +96,7 @@ private void assertEmptyElement(String expected, ContentHandler handler) throws } private void assertEmptyElementWithAttributes(String expected, ContentHandler handler) - throws Exception { + throws Exception { AttributesImpl attributes = new AttributesImpl(); attributes.addAttribute("", "name", "name", "CDATA", "foo"); attributes.addAttribute("", "value", "value", "CDATA", "bar"); @@ -108,7 +106,7 @@ private void assertEmptyElementWithAttributes(String expected, ContentHandler ha } private void assertEmptyElementWithAttributeEscaping(String expected, ContentHandler handler) - throws Exception { + throws Exception { AttributesImpl attributes = new AttributesImpl(); attributes.addAttribute("", "class", "class", "CDATA", "<&\">"); handler.startElement("", "p", "p", attributes); @@ -125,7 +123,7 @@ private void assertElement(String expected, ContentHandler handler) throws Excep } private void assertElementWithAttributes(String expected, ContentHandler handler) - throws Exception { + throws Exception { AttributesImpl attributes = new AttributesImpl(); attributes.addAttribute("", "class", "class", "CDATA", "test"); handler.startElement("", "p", "p", attributes); diff --git a/tika-core/src/test/java/org/apache/tika/sax/XHTMLContentHandlerTest.java b/tika-core/src/test/java/org/apache/tika/sax/XHTMLContentHandlerTest.java index 136c62b0c0..722d894176 100644 --- a/tika-core/src/test/java/org/apache/tika/sax/XHTMLContentHandlerTest.java +++ b/tika-core/src/test/java/org/apache/tika/sax/XHTMLContentHandlerTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or 
more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.sax; @@ -21,15 +19,13 @@ import java.util.ArrayList; import java.util.List; - +import org.apache.tika.metadata.Metadata; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; import org.xml.sax.helpers.AttributesImpl; -import org.apache.tika.metadata.Metadata; - /** * Unit tests for the {@link XHTMLContentHandler} class. 
*/ @@ -40,8 +36,8 @@ public class XHTMLContentHandlerTest { private XHTMLContentHandler xhtml; /** - * Return array of non-zerolength words. Splitting on whitespace will get us - * empty words for emptylines. + * Return array of non-zerolength words. Splitting on whitespace will get us empty words for + * emptylines. * * @param string some mix of newlines and real words * @return array of real words. @@ -65,8 +61,7 @@ public void setUp() { } /** - * Test that content in block elements are properly separated in text - * output. + * Test that content in block elements are properly separated in text output. * * @see TIKA-188 */ @@ -104,8 +99,7 @@ public void testExtraWhitespace() throws SAXException { } /** - * Test that content in option elements are properly separated in text - * output. + * Test that content in option elements are properly separated in text output. * * @see TIKA-394 */ @@ -145,12 +139,12 @@ public void testWhitespaceWithMenus() throws Exception { public void testAttributesOnBody() throws Exception { ToHTMLContentHandler toHTMLContentHandler = new ToHTMLContentHandler(); XHTMLContentHandler xhtmlContentHandler = - new XHTMLContentHandler(toHTMLContentHandler, new Metadata()); + new XHTMLContentHandler(toHTMLContentHandler, new Metadata()); AttributesImpl attributes = new AttributesImpl(); attributes.addAttribute(XHTMLContentHandler.XHTML, "itemscope", "itemscope", "", ""); attributes.addAttribute(XHTMLContentHandler.XHTML, "itemtype", "itemtype", "", - "http://schema.org/Event"); + "http://schema.org/Event"); xhtmlContentHandler.startDocument(); xhtmlContentHandler.startElement(XHTMLContentHandler.XHTML, "body", "body", attributes); @@ -164,12 +158,12 @@ public void testAttributesOnBody() throws Exception { public void testAttributesOnHtml() throws Exception { ToHTMLContentHandler toHTMLContentHandler = new ToHTMLContentHandler(); XHTMLContentHandler xhtmlContentHandler = - new XHTMLContentHandler(toHTMLContentHandler, new Metadata()); + new 
XHTMLContentHandler(toHTMLContentHandler, new Metadata()); AttributesImpl attributes = new AttributesImpl(); attributes.addAttribute(XHTMLContentHandler.XHTML, "itemscope", "itemscope", "", ""); attributes.addAttribute(XHTMLContentHandler.XHTML, "itemtype", "itemtype", "", - "http://schema.org/Event"); + "http://schema.org/Event"); xhtmlContentHandler.startDocument(); xhtmlContentHandler.startElement(XHTMLContentHandler.XHTML, "html", "html", attributes); diff --git a/tika-core/src/test/java/org/apache/tika/sax/xpath/XPathParserTest.java b/tika-core/src/test/java/org/apache/tika/sax/xpath/XPathParserTest.java index 2a3f1d4be0..6689c30b2a 100644 --- a/tika-core/src/test/java/org/apache/tika/sax/xpath/XPathParserTest.java +++ b/tika-core/src/test/java/org/apache/tika/sax/xpath/XPathParserTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.sax.xpath; diff --git a/tika-core/src/test/java/org/apache/tika/utils/AnnotationUtilsTest.java b/tika-core/src/test/java/org/apache/tika/utils/AnnotationUtilsTest.java index 1f05631e5d..6e5deae60e 100644 --- a/tika-core/src/test/java/org/apache/tika/utils/AnnotationUtilsTest.java +++ b/tika-core/src/test/java/org/apache/tika/utils/AnnotationUtilsTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.utils; @@ -20,16 +18,14 @@ import static org.junit.jupiter.api.Assertions.assertTrue; import static org.junit.jupiter.api.Assertions.fail; +import aQute.bnd.annotation.metatype.Configurable; import java.util.Date; import java.util.HashMap; import java.util.Map; - -import aQute.bnd.annotation.metatype.Configurable; -import org.junit.jupiter.api.Test; - import org.apache.tika.config.Field; import org.apache.tika.config.Param; import org.apache.tika.exception.TikaConfigException; +import org.junit.jupiter.api.Test; /** @@ -63,7 +59,7 @@ class MyParser extends Configurable { AnnotationUtils.assignFieldParams(new MyParser(), params); fail("Exception expected"); } catch (TikaConfigException e) { - //expected + // expected } } @@ -117,7 +113,7 @@ class MyParser extends Configurable { AnnotationUtils.assignFieldParams(new MyParser(), params); fail("Exception expected"); } catch (TikaConfigException e) { - //expected + // expected } } @@ -162,7 +158,7 @@ class Child extends Parent { AnnotationUtils.assignFieldParams(new Child(), params); fail("Exception expected, parent class field not set"); } catch (TikaConfigException e) { - //expected + // expected } } @@ -193,7 +189,7 @@ class Bean { AnnotationUtils.assignFieldParams(parser, params); fail("Exception expected, Date is not assignable to CharSequence."); } catch (TikaConfigException e) { - //expected + // expected } diff --git a/tika-core/src/test/java/org/apache/tika/utils/CharsetUtilsTest.java b/tika-core/src/test/java/org/apache/tika/utils/CharsetUtilsTest.java index 
8a6574ae0c..00bb59e59c 100644 --- a/tika-core/src/test/java/org/apache/tika/utils/CharsetUtilsTest.java +++ b/tika-core/src/test/java/org/apache/tika/utils/CharsetUtilsTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.utils; diff --git a/tika-core/src/test/java/org/apache/tika/utils/ConcurrentUtilsTest.java b/tika-core/src/test/java/org/apache/tika/utils/ConcurrentUtilsTest.java index a689a5778e..229d2cf2ea 100644 --- a/tika-core/src/test/java/org/apache/tika/utils/ConcurrentUtilsTest.java +++ b/tika-core/src/test/java/org/apache/tika/utils/ConcurrentUtilsTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.utils; @@ -20,20 +18,17 @@ import java.util.concurrent.ExecutorService; import java.util.concurrent.Future; - -import org.junit.jupiter.api.Test; - import org.apache.tika.config.TikaConfig; import org.apache.tika.parser.ParseContext; +import org.junit.jupiter.api.Test; public class ConcurrentUtilsTest { @Test public void testExecuteThread() throws Exception { ParseContext context = new ParseContext(); - Future result = ConcurrentUtils.execute(context, () -> - { - //Do nothing + Future result = ConcurrentUtils.execute(context, () -> { + // Do nothing }); assertNull(result.get()); @@ -44,9 +39,8 @@ public void testExecuteExecutor() throws Exception { TikaConfig config = TikaConfig.getDefaultConfig(); ParseContext context = new ParseContext(); context.set(ExecutorService.class, config.getExecutorService()); - Future result = ConcurrentUtils.execute(context, () -> - { - //Do nothing + Future result = ConcurrentUtils.execute(context, () -> { + // Do nothing }); assertNull(result.get()); diff --git a/tika-core/src/test/java/org/apache/tika/utils/RegexUtilsTest.java b/tika-core/src/test/java/org/apache/tika/utils/RegexUtilsTest.java index 030836ed5e..608e1c9048 100644 --- a/tika-core/src/test/java/org/apache/tika/utils/RegexUtilsTest.java +++ b/tika-core/src/test/java/org/apache/tika/utils/RegexUtilsTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. 
See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.utils; @@ -21,7 +19,6 @@ import static org.junit.jupiter.api.Assertions.assertTrue; import java.util.List; - import org.junit.jupiter.api.Test; /** @@ -58,16 +55,15 @@ public void testExtractLinksNone() { */ @Test public void testExtractLinksHttp() { - List links = RegexUtils.extractLinks( - "Test with http://www.nutch.org/index.html is it found? " + - "What about www.google.com at http://www.google.de " + - "A longer URL could be http://www.sybit.com/solutions/portals.html"); + List links = RegexUtils + .extractLinks("Test with http://www.nutch.org/index.html is it found? 
" + + "What about www.google.com at http://www.google.de " + + "A longer URL could be http://www.sybit.com/solutions/portals.html"); assertTrue(links.size() == 3, "Url not found!"); assertEquals("http://www.nutch.org/index.html", links.get(0), "Wrong URL"); assertEquals("http://www.google.de", links.get(1), "Wrong URL"); - assertEquals("http://www.sybit.com/solutions/portals.html", links.get(2), - "Wrong URL"); + assertEquals("http://www.sybit.com/solutions/portals.html", links.get(2), "Wrong URL"); } /** @@ -75,8 +71,8 @@ public void testExtractLinksHttp() { */ @Test public void testExtractLinksFtp() { - List links = RegexUtils.extractLinks("Test with ftp://www.nutch.org is it found? " + - "What about www.google.com at ftp://www.google.de"); + List links = RegexUtils.extractLinks("Test with ftp://www.nutch.org is it found? " + + "What about www.google.com at ftp://www.google.de"); assertTrue(links.size() == 2, "Url not found!"); assertEquals("ftp://www.nutch.org", links.get(0), "Wrong URL"); diff --git a/tika-core/src/test/java/org/apache/tika/utils/ServiceLoaderUtilsTest.java b/tika-core/src/test/java/org/apache/tika/utils/ServiceLoaderUtilsTest.java index 199c0031fa..d675c6e5fe 100644 --- a/tika-core/src/test/java/org/apache/tika/utils/ServiceLoaderUtilsTest.java +++ b/tika-core/src/test/java/org/apache/tika/utils/ServiceLoaderUtilsTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. 
See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.utils; @@ -23,26 +21,25 @@ import java.util.Collections; import java.util.List; import java.util.Random; - import org.apache.custom.detect.MyCustomDetector; -import org.junit.jupiter.api.Test; - import org.apache.tika.detect.Detector; import org.apache.tika.detect.EmptyDetector; import org.apache.tika.detect.FileCommandDetector; import org.apache.tika.detect.OverrideDetector; import org.apache.tika.detect.ZeroSizeFileDetector; +import org.junit.jupiter.api.Test; public class ServiceLoaderUtilsTest { @Test public void testSort() throws Exception { - //OverrideDetector is moved to index 0 - //by the private service loading in DefaultDetector. 
- //This tests that a custom detector always comes first - //and then reverse alphabetical order - Detector[] detectors = new Detector[]{new MyCustomDetector(), new EmptyDetector(), - new FileCommandDetector(), new OverrideDetector(), new ZeroSizeFileDetector()}; + // OverrideDetector is moved to index 0 + // by the private service loading in DefaultDetector. + // This tests that a custom detector always comes first + // and then reverse alphabetical order + Detector[] detectors = new Detector[] {new MyCustomDetector(), new EmptyDetector(), + new FileCommandDetector(), new OverrideDetector(), + new ZeroSizeFileDetector()}; List expected = Arrays.asList(detectors); List shuffled = new ArrayList<>(expected); Random random = new Random(42); diff --git a/tika-core/src/test/java/org/apache/tika/utils/XMLReaderUtilsTest.java b/tika-core/src/test/java/org/apache/tika/utils/XMLReaderUtilsTest.java index 310a8b158e..92b3f275ad 100644 --- a/tika-core/src/test/java/org/apache/tika/utils/XMLReaderUtilsTest.java +++ b/tika-core/src/test/java/org/apache/tika/utils/XMLReaderUtilsTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.utils; @@ -27,7 +25,8 @@ import javax.xml.stream.XMLEventReader; import javax.xml.stream.XMLInputFactory; import javax.xml.stream.XMLStreamException; - +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.ToTextContentHandler; import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.Test; import org.w3c.dom.Document; @@ -35,13 +34,11 @@ import org.w3c.dom.NodeList; import org.xml.sax.SAXException; -import org.apache.tika.parser.ParseContext; -import org.apache.tika.sax.ToTextContentHandler; - /** * Class to test that XMLReaderUtils defends against xxe and billion laughs. *

- * Different versions and different implementations vary. This is not a fully comprehensive set of tests. + * Different versions and different implementations vary. This is not a fully comprehensive set of + * tests. *

* Please add more. *

@@ -51,27 +48,34 @@ public class XMLReaderUtilsTest { private static final Locale defaultLocale = Locale.getDefault(); static { - //tests on content of Exception msgs require specifying locale. - //even this, though is not sufficient for the billion laughs tests ?! + // tests on content of Exception msgs require specifying locale. + // even this, though is not sufficient for the billion laughs tests ?! Locale.setDefault(Locale.US); } - private static final String EXTERNAL_DTD_SIMPLE_FILE = ""; - private static final String EXTERNAL_DTD_SIMPLE_URL = ""; - private static final String EXTERNAL_ENTITY = "" + - " ]>&bar;"; - private static final String EXTERNAL_LOCAL_DTD = "" + - "%local_dtd;]>"; - - private static final String BILLION_LAUGHS_CLASSICAL = "\n" + "\n" + " \n" + - " \n" + " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + "]>\n" + "&lol9;"; + private static final String EXTERNAL_DTD_SIMPLE_FILE = + ""; + private static final String EXTERNAL_DTD_SIMPLE_URL = + ""; + private static final String EXTERNAL_ENTITY = + "" + + " ]>&bar;"; + private static final String EXTERNAL_LOCAL_DTD = "" + + "%local_dtd;]>"; + + private static final String BILLION_LAUGHS_CLASSICAL = "\n" + + "\n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + "]>\n" + "&lol9;"; private static String BILLION_LAUGHS_VARIANT; @@ -91,23 +95,25 @@ public class XMLReaderUtilsTest { BILLION_LAUGHS_VARIANT = xml.toString(); } - private static final String[] EXTERNAL_ENTITY_XMLS = new String[]{ EXTERNAL_DTD_SIMPLE_FILE, EXTERNAL_DTD_SIMPLE_URL, - EXTERNAL_ENTITY, EXTERNAL_LOCAL_DTD }; + private static final String[] EXTERNAL_ENTITY_XMLS = new String[] {EXTERNAL_DTD_SIMPLE_FILE, + EXTERNAL_DTD_SIMPLE_URL, EXTERNAL_ENTITY, EXTERNAL_LOCAL_DTD}; - private static final String[] BILLION_LAUGHS = new String[]{ BILLION_LAUGHS_CLASSICAL, BILLION_LAUGHS_VARIANT }; + private static final String[] BILLION_LAUGHS = + new String[] 
{BILLION_LAUGHS_CLASSICAL, BILLION_LAUGHS_VARIANT}; @AfterAll public static void tearDown() { Locale.setDefault(defaultLocale); } - //make sure that parseSAX actually defends against external entities + // make sure that parseSAX actually defends against external entities @Test public void testSAX() throws Exception { for (String xml : EXTERNAL_ENTITY_XMLS) { try { - XMLReaderUtils.parseSAX(new ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)), - new ToTextContentHandler(), new ParseContext()); + XMLReaderUtils.parseSAX( + new ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)), + new ToTextContentHandler(), new ParseContext()); } catch (ConnectException e) { fail("Parser tried to access resource: " + xml, e); } @@ -118,7 +124,9 @@ public void testSAX() throws Exception { public void testDOM() throws Exception { for (String xml : EXTERNAL_ENTITY_XMLS) { try { - XMLReaderUtils.buildDOM(new ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)), new ParseContext()); + XMLReaderUtils.buildDOM( + new ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)), + new ParseContext()); } catch (ConnectException e) { fail("Parser tried to access resource: " + xml, e); } @@ -129,8 +137,10 @@ public void testDOM() throws Exception { public void testStax() throws Exception { for (String xml : EXTERNAL_ENTITY_XMLS) { try { - XMLInputFactory xmlInputFactory = XMLReaderUtils.getXMLInputFactory(new ParseContext()); - XMLEventReader reader = xmlInputFactory.createXMLEventReader(new ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8))); + XMLInputFactory xmlInputFactory = + XMLReaderUtils.getXMLInputFactory(new ParseContext()); + XMLEventReader reader = xmlInputFactory.createXMLEventReader( + new ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8))); StringBuilder sb = new StringBuilder(); while (reader.hasNext()) { sb.append(reader.next()); @@ -156,8 +166,9 @@ public void testStax() throws Exception { public void testSAXBillionLaughs() throws 
Exception { for (String xml : BILLION_LAUGHS) { try { - XMLReaderUtils.parseSAX(new ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)), - new ToTextContentHandler(), new ParseContext()); + XMLReaderUtils.parseSAX( + new ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)), + new ToTextContentHandler(), new ParseContext()); } catch (SAXException e) { limitCheck(e); } @@ -166,15 +177,18 @@ public void testSAXBillionLaughs() throws Exception { @Test public void testDOMBillionLaughs() throws Exception { - //confirm that ExpandEntityReferences has been set to false. + // confirm that ExpandEntityReferences has been set to false. - //some implementations ignore the expandEntityReferences=false, and we are still - //protected by the "The parser has encountered more than "20" entity expansions" SAXException. - //We need to check for either: empty content and no exception, or this SAXException + // some implementations ignore the expandEntityReferences=false, and we are still + // protected by the "The parser has encountered more than "20" entity expansions" + // SAXException. 
+ // We need to check for either: empty content and no exception, or this SAXException for (String xml : BILLION_LAUGHS) { Document doc = null; try { - doc = XMLReaderUtils.buildDOM(new ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)), new ParseContext()); + doc = XMLReaderUtils.buildDOM( + new ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)), + new ParseContext()); } catch (SAXException e) { limitCheck(e); continue; @@ -182,10 +196,7 @@ public void testDOMBillionLaughs() throws Exception { NodeList nodeList = doc.getChildNodes(); StringBuilder sb = new StringBuilder(); dumpChildren(nodeList, sb); - assertEquals(0, sb - .toString() - .trim() - .length(), sb.toString()); + assertEquals(0, sb.toString().trim().length(), sb.toString()); } } @@ -202,22 +213,24 @@ private void dumpChildren(NodeList nodeList, StringBuilder sb) { @Test public void testStaxBillionLaughs() throws Exception { /* - Turning off dtd support of the XMLInputFactory in XMLReaderUtils turns off entity expansions and - causes a "NoSuchElementException" with the "'lol9' was referenced but not declared" message with this line: - tryToSetStaxProperty(factory, XMLInputFactory.SUPPORT_DTD, false); - If that line doesn't exist, then we get a - NoSuchElementException with: "The parser has encountered more than "20" entity expansions in this document; this is the limit imposed by the JDK." + * Turning off dtd support of the XMLInputFactory in XMLReaderUtils turns off entity + * expansions and causes a "NoSuchElementException" with the + * "'lol9' was referenced but not declared" message with this line: + * tryToSetStaxProperty(factory, XMLInputFactory.SUPPORT_DTD, false); If that line doesn't + * exist, then we get a NoSuchElementException with: "The parser has encountered more than " + * 20" entity expansions in this document; this is the limit imposed by the JDK." 
*/ for (String xml : BILLION_LAUGHS) { XMLInputFactory xmlInputFactory = XMLReaderUtils.getXMLInputFactory(new ParseContext()); - XMLEventReader reader = xmlInputFactory.createXMLEventReader(new ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8))); + XMLEventReader reader = xmlInputFactory.createXMLEventReader( + new ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8))); try { while (reader.hasNext()) { reader.next(); } } catch (NoSuchElementException e) { - //full message on temurin-17: The entity "lol9" was referenced, but not declared. + // full message on temurin-17: The entity "lol9" was referenced, but not declared. String msg = e.getLocalizedMessage(); if (msg != null) { @@ -239,14 +252,14 @@ private void limitCheck(SAXException e) throws SAXException { throw e; } - //depending on the flavor/version of the jdk, entity expansions may be triggered + // depending on the flavor/version of the jdk, entity expansions may be triggered // OR entitySizeLimit may be triggered - //See TIKA-4471 - if (msg.contains("JAXP00010001") || //entity expansions - msg.contains("JAXP00010003") || //max entity size limit - msg.contains("JAXP00010004") || //TotalEntitySizeLimit - msg.contains("entity expansions") || - e.getMessage().contains("maxGeneralEntitySizeLimit")) { + // See TIKA-4471 + if (msg.contains("JAXP00010001") || // entity expansions + msg.contains("JAXP00010003") || // max entity size limit + msg.contains("JAXP00010004") || // TotalEntitySizeLimit + msg.contains("entity expansions") + || e.getMessage().contains("maxGeneralEntitySizeLimit")) { return; } throw e; diff --git a/tika-detectors/tika-detector-magika/src/main/java/org/apache/tika/detect/magika/MagikaDetector.java b/tika-detectors/tika-detector-magika/src/main/java/org/apache/tika/detect/magika/MagikaDetector.java index a96ad20a47..c3f67d909e 100644 --- a/tika-detectors/tika-detector-magika/src/main/java/org/apache/tika/detect/magika/MagikaDetector.java +++ 
b/tika-detectors/tika-detector-magika/src/main/java/org/apache/tika/detect/magika/MagikaDetector.java @@ -1,36 +1,30 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.detect.magika; import static java.nio.file.StandardCopyOption.REPLACE_EXISTING; +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; import java.io.IOException; import java.io.InputStream; import java.nio.file.Files; import java.nio.file.Path; import java.util.regex.Matcher; import java.util.regex.Pattern; - -import com.fasterxml.jackson.core.JsonProcessingException; -import com.fasterxml.jackson.databind.JsonNode; -import com.fasterxml.jackson.databind.ObjectMapper; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - import org.apache.tika.config.Field; import org.apache.tika.detect.Detector; import org.apache.tika.io.BoundedInputStream; @@ -43,12 +37,13 @@ import org.apache.tika.utils.FileProcessResult; import org.apache.tika.utils.ProcessUtils; import org.apache.tika.utils.StringUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** - * Simple wrapper around Google's magika: https://github.com/google/magika - * The tool must be installed on the host where Tika is running. - * The default behavior is to run detection, report the results in the - * metadata and then return null so that other detectors will be used. + * Simple wrapper around Google's magika: https://github.com/google/magika The tool must be + * installed on the host where Tika is running. The default behavior is to run detection, report the + * results in the metadata and then return null so that other detectors will be used. 
*/ public class MagikaDetector implements Detector { @@ -60,31 +55,25 @@ enum STATUS { public static Property MAGIKA_STATUS = Property.externalText(MAGIKA_PREFIX + "status"); public static Property MAGIKA_DESCRIPTION = - Property.externalText(MAGIKA_PREFIX + "description"); - public static Property MAGIKA_SCORE = - Property.externalReal(MAGIKA_PREFIX + "score"); - public static Property MAGIKA_GROUP = - Property.externalText(MAGIKA_PREFIX + "group"); - public static Property MAGIKA_LABEL = - Property.externalText(MAGIKA_PREFIX + "label"); - public static Property MAGIKA_MIME = - Property.externalText(MAGIKA_PREFIX + "mime_type"); - public static Property MAGIKA_IS_TEXT = - Property.externalBoolean(MAGIKA_PREFIX + "is_text"); - - public static Property MAGIKA_ERRORS = - Property.externalTextBag(MAGIKA_PREFIX + "errors"); + Property.externalText(MAGIKA_PREFIX + "description"); + public static Property MAGIKA_SCORE = Property.externalReal(MAGIKA_PREFIX + "score"); + public static Property MAGIKA_GROUP = Property.externalText(MAGIKA_PREFIX + "group"); + public static Property MAGIKA_LABEL = Property.externalText(MAGIKA_PREFIX + "label"); + public static Property MAGIKA_MIME = Property.externalText(MAGIKA_PREFIX + "mime_type"); + public static Property MAGIKA_IS_TEXT = Property.externalBoolean(MAGIKA_PREFIX + "is_text"); + + public static Property MAGIKA_ERRORS = Property.externalTextBag(MAGIKA_PREFIX + "errors"); public static Property MAGIKA_VERSION = Property.externalText(MAGIKA_PREFIX + "version"); - //TODO -- grab errors and warnings + // TODO -- grab errors and warnings private static final Logger LOGGER = LoggerFactory.getLogger(MagikaDetector.class); private static final long DEFAULT_TIMEOUT_MS = 60000; private static final String DEFAULT_MAGIKA_PATH = "magika"; - //we set this during the initial check. - //we assume that a new version is not installed during the lifecycle of the MagikaDetector + // we set this during the initial check. 
+ // we assume that a new version is not installed during the lifecycle of the MagikaDetector private static String MAGIKA_VERSION_STRING = ""; private static ObjectMapper OBJECT_MAPPER = new ObjectMapper(); @@ -97,11 +86,10 @@ enum STATUS { private boolean useMime = false; public static boolean checkHasMagika(String magikaCommandPath) { - String[] commandline = new String[]{magikaCommandPath, "--version"}; + String[] commandline = new String[] {magikaCommandPath, "--version"}; FileProcessResult result = null; try { - result = ProcessUtils.execute(new ProcessBuilder(commandline), - 1000, 1000, 1000); + result = ProcessUtils.execute(new ProcessBuilder(commandline), 1000, 1000, 1000); } catch (IOException e) { LOGGER.debug("problem with magika"); return false; @@ -110,14 +98,12 @@ public static boolean checkHasMagika(String magikaCommandPath) { if (result.getExitValue() != 0) { return false; } - /* python - Matcher m = Pattern - .compile("Magika version:\\s+(.{4,50})").matcher(""); - - */ - //rust - Matcher m = Pattern - .compile("magika ([^\\s]{4,50})").matcher(""); + /* + * python Matcher m = Pattern .compile("Magika version:\\s+(.{4,50})").matcher(""); + * + */ + // rust + Matcher m = Pattern.compile("magika ([^\\s]{4,50})").matcher(""); for (String line : result.getStdout().split("[\r\n]+")) { if (m.reset(line).find()) { MAGIKA_VERSION_STRING = m.group(1); @@ -128,7 +114,7 @@ public static boolean checkHasMagika(String magikaCommandPath) { } /** - * @param input document input stream, or null + * @param input document input stream, or null * @param metadata input metadata for the document * @return mime as identified by the file command or application/octet-stream otherwise * @throws IOException @@ -147,8 +133,8 @@ public MediaType detect(InputStream input, Metadata metadata) throws IOException } TikaInputStream tis = TikaInputStream.cast(input); if (tis != null) { - //spool the full file to disk, if called with a TikaInputStream - //and there is no underlying 
file + // spool the full file to disk, if called with a TikaInputStream + // and there is no underlying file return detectOnPath(tis.getPath(), metadata); } @@ -163,12 +149,10 @@ public MediaType detect(InputStream input, Metadata metadata) throws IOException } /** - * As default behavior, Tika runs magika to add its detection - * to the metadata, but NOT to use detection in determining parsers - * etc. If this is set to true, this detector - * will return the first mime detected by magika and that - * mime will be used by the AutoDetectParser to select the appropriate - * parser. + * As default behavior, Tika runs magika to add its detection to the metadata, but NOT to use + * detection in determining parsers etc. If this is set to true, this detector will + * return the first mime detected by magika and that mime will be used by the AutoDetectParser + * to select the appropriate parser. * * @param useMime */ @@ -183,18 +167,15 @@ public boolean isUseMime() { private MediaType detectOnPath(Path path, Metadata metadata) throws IOException { - String[] args = new String[]{ - ProcessUtils.escapeCommandLine(magikaPath), - ProcessUtils.escapeCommandLine(path.toAbsolutePath().toString()), - "--json" - }; + String[] args = new String[] {ProcessUtils.escapeCommandLine(magikaPath), + ProcessUtils.escapeCommandLine(path.toAbsolutePath().toString()), "--json"}; ProcessBuilder builder = new ProcessBuilder(args); FileProcessResult result = ProcessUtils.execute(builder, timeoutMs, 10000000, 1000); return processResult(result, metadata, useMime); } protected static MediaType processResult(FileProcessResult result, Metadata metadata, - boolean returnMime) { + boolean returnMime) { metadata.set(ExternalProcess.EXIT_VALUE, result.getExitValue()); metadata.set(ExternalProcess.IS_TIMEOUT, result.isTimeout()); @@ -213,13 +194,13 @@ protected static MediaType processResult(FileProcessResult result, Metadata meta metadata.set(MAGIKA_STATUS, STATUS.JSON_PARSE_EXCEPTION.name()); return 
MediaType.OCTET_STREAM; } - if (! rootArray.isArray() || rootArray.isEmpty()) { - //something went wrong + if (!rootArray.isArray() || rootArray.isEmpty()) { + // something went wrong return MediaType.OCTET_STREAM; } - //for now just take the first value + // for now just take the first value JsonNode root = rootArray.get(0); - //this is the more modern version + // this is the more modern version if (root.has("result")) { return processNewer(root.get("result"), metadata, returnMime); } else { @@ -229,9 +210,9 @@ protected static MediaType processResult(FileProcessResult result, Metadata meta private static MediaType processOlder(JsonNode root, Metadata metadata, boolean returnMime) { metadata.set(MAGIKA_STATUS, "ok"); - //TODO -- should we get values in "dl" instead or in addition? - if (! root.has("output")) { - //do something else + // TODO -- should we get values in "dl" instead or in addition? + if (!root.has("output")) { + // do something else return MediaType.OCTET_STREAM; } JsonNode mOutput = root.get("output"); @@ -244,7 +225,7 @@ private static MediaType processOlder(JsonNode root, Metadata metadata, boolean addString(mOutput, "ct_label", MAGIKA_LABEL, metadata); addString(mOutput, "mime_type", MAGIKA_MIME, metadata); metadata.set(MAGIKA_VERSION, MAGIKA_VERSION_STRING); - if (returnMime && ! StringUtils.isBlank(metadata.get(MAGIKA_MIME))) { + if (returnMime && !StringUtils.isBlank(metadata.get(MAGIKA_MIME))) { return MediaType.parse(metadata.get(MAGIKA_MIME)); } @@ -254,16 +235,16 @@ private static MediaType processOlder(JsonNode root, Metadata metadata, boolean private static MediaType processNewer(JsonNode result, Metadata metadata, boolean returnMime) { metadata.set(MAGIKA_STATUS, "ok"); - //TODO -- should we get values in "dl" instead or in addition? + // TODO -- should we get values in "dl" instead or in addition? addString(result, "status", MAGIKA_STATUS, metadata); - if (! 
result.has("value")) { + if (!result.has("value")) { return MediaType.OCTET_STREAM; } JsonNode mValue = result.get("value"); - if (! mValue.has("output")) { - //do something else + if (!mValue.has("output")) { + // do something else return MediaType.OCTET_STREAM; } @@ -283,7 +264,7 @@ private static MediaType processNewer(JsonNode result, Metadata metadata, boolea addString(mOutput, "mime_type", MAGIKA_MIME, metadata); setBoolean(mOutput, "is_text", MAGIKA_IS_TEXT, metadata); metadata.set(MAGIKA_VERSION, MAGIKA_VERSION_STRING); - if (returnMime && ! StringUtils.isBlank(metadata.get(MAGIKA_MIME))) { + if (returnMime && !StringUtils.isBlank(metadata.get(MAGIKA_MIME))) { return MediaType.parse(metadata.get(MAGIKA_MIME)); } @@ -292,12 +273,12 @@ private static MediaType processNewer(JsonNode result, Metadata metadata, boolea } private static void setBoolean(JsonNode node, String jsonKey, Property property, - Metadata metadata) { - if (! node.has(jsonKey)) { + Metadata metadata) { + if (!node.has(jsonKey)) { return; } - if (! node.get(jsonKey).isBoolean()) { - //log? + if (!node.get(jsonKey).isBoolean()) { + // log? return; } metadata.set(property, node.get(jsonKey).booleanValue()); @@ -305,20 +286,17 @@ private static void setBoolean(JsonNode node, String jsonKey, Property property, } private static void addString(JsonNode node, String jsonKey, Property property, - Metadata metadata) { + Metadata metadata) { if (node.has(jsonKey)) { if (node.get(jsonKey).isArray()) { for (JsonNode child : node.get(jsonKey)) { - String val = child - .asText(StringUtils.EMPTY); - if (! 
StringUtils.isBlank(val)) { + String val = child.asText(StringUtils.EMPTY); + if (!StringUtils.isBlank(val)) { metadata.add(property, val); } } } else { - String val = node - .get(jsonKey) - .asText(StringUtils.EMPTY); + String val = node.get(jsonKey).asText(StringUtils.EMPTY); if (StringUtils.isBlank(val)) { return; } @@ -329,16 +307,15 @@ private static void addString(JsonNode node, String jsonKey, Property property, @Field public void setMagikaPath(String fileCommandPath) { - //this opens up a potential command vulnerability. - //Don't ever let an untrusted user set this. + // this opens up a potential command vulnerability. + // Don't ever let an untrusted user set this. this.magikaPath = fileCommandPath; checkHasMagika(this.magikaPath); } /** - * If this is not called on a TikaInputStream, this detector - * will spool up to this many bytes to a file to be detected - * by the 'file' command. + * If this is not called on a TikaInputStream, this detector will spool up to this many bytes to + * a file to be detected by the 'file' command. * * @param maxBytes */ diff --git a/tika-detectors/tika-detector-magika/src/test/java/org/apache/tika/detect/magika/TestMagikaIntegration.java b/tika-detectors/tika-detector-magika/src/test/java/org/apache/tika/detect/magika/TestMagikaIntegration.java index 9b190d77cb..00437918e2 100644 --- a/tika-detectors/tika-detector-magika/src/test/java/org/apache/tika/detect/magika/TestMagikaIntegration.java +++ b/tika-detectors/tika-detector-magika/src/test/java/org/apache/tika/detect/magika/TestMagikaIntegration.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.detect.magika; @@ -22,15 +20,13 @@ import java.nio.file.Path; import java.nio.file.Paths; import java.util.List; - -import org.junit.jupiter.api.Disabled; -import org.junit.jupiter.api.Test; - import org.apache.tika.TikaTest; import org.apache.tika.config.TikaConfig; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.Parser; +import org.junit.jupiter.api.Disabled; +import org.junit.jupiter.api.Test; @Disabled("need to have magika on the path") public class TestMagikaIntegration extends TikaTest { @@ -52,8 +48,7 @@ public void testIntegration() throws Exception { } private Path getConfig(String configName) throws URISyntaxException { - return Paths.get( - getClass().getResource("/configs/" + configName).toURI()); + return Paths.get(getClass().getResource("/configs/" + configName).toURI()); } } diff --git a/tika-detectors/tika-detector-magika/src/test/java/org/apache/tika/detect/magika/TestMagikaJsonParsing.java b/tika-detectors/tika-detector-magika/src/test/java/org/apache/tika/detect/magika/TestMagikaJsonParsing.java index 26b2900650..715a727951 100644 --- a/tika-detectors/tika-detector-magika/src/test/java/org/apache/tika/detect/magika/TestMagikaJsonParsing.java +++ b/tika-detectors/tika-detector-magika/src/test/java/org/apache/tika/detect/magika/TestMagikaJsonParsing.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. 
See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.detect.magika; @@ -21,27 +19,26 @@ import java.io.IOException; import java.nio.charset.StandardCharsets; - import org.apache.commons.io.IOUtils; -import org.junit.jupiter.api.Test; - import org.apache.tika.TikaTest; import org.apache.tika.metadata.Metadata; import org.apache.tika.utils.FileProcessResult; +import org.junit.jupiter.api.Test; public class TestMagikaJsonParsing extends TikaTest { - //TODO -- add testcontainers unit test with dockerized magika + // TODO -- add testcontainers unit test with dockerized magika @Test public void testPython0_5_1() throws Exception { - //this is the older python package available at the time of development from pypi + // this is the older python package available at the time of development from pypi FileProcessResult fileProcessResult = load("test-basic-0.5.1.json"); Metadata metadata = new Metadata(); MagikaDetector.processResult(fileProcessResult, metadata, false); assertEquals("ok", metadata.get(MagikaDetector.MAGIKA_STATUS)); assertEquals("Python source", metadata.get(MagikaDetector.MAGIKA_DESCRIPTION)); - assertEquals(0.999987125396, Double.parseDouble(metadata.get(MagikaDetector.MAGIKA_SCORE)), 0.0000001); + assertEquals(0.999987125396, Double.parseDouble(metadata.get(MagikaDetector.MAGIKA_SCORE)), + 0.0000001); assertEquals("code", metadata.get(MagikaDetector.MAGIKA_GROUP)); assertEquals("python", metadata.get(MagikaDetector.MAGIKA_LABEL)); assertEquals("text/x-python", metadata.get(MagikaDetector.MAGIKA_MIME)); @@ -50,59 +47,50 @@ public void testPython0_5_1() throws Exception { @Test public void testRust0_1_0_rc1() throws Exception { - //this is the way of the future -- rust-based + // this is the way of the future -- rust-based FileProcessResult fileProcessResult = load("test-basic.json"); Metadata metadata = new Metadata(); MagikaDetector.processResult(fileProcessResult, metadata, false); assertEquals("ok", metadata.get(MagikaDetector.MAGIKA_STATUS)); assertEquals("Python source", 
metadata.get(MagikaDetector.MAGIKA_DESCRIPTION)); - assertEquals(0.753000020980835, Double.parseDouble(metadata.get(MagikaDetector.MAGIKA_SCORE)), 0.0000001); + assertEquals(0.753000020980835, + Double.parseDouble(metadata.get(MagikaDetector.MAGIKA_SCORE)), 0.0000001); assertEquals("code", metadata.get(MagikaDetector.MAGIKA_GROUP)); assertEquals("python", metadata.get(MagikaDetector.MAGIKA_LABEL)); assertEquals("text/x-python", metadata.get(MagikaDetector.MAGIKA_MIME)); assertEquals(true, Boolean.parseBoolean(metadata.get(MagikaDetector.MAGIKA_IS_TEXT))); } -/* - @Test - public void testErrors() throws Exception { - FileProcessResult fileProcessResult = load("test-errors.json"); - Metadata metadata = new Metadata(); - SiegfriedDetector.processResult(fileProcessResult, metadata, false); - //debug(metadata); - assertEquals("1.9.5", metadata.get(SiegfriedDetector.SIEGFRIED_VERSION)); - assertEquals("default.sig", metadata.get(SiegfriedDetector.SIEGFRIED_SIGNATURE)); - assertEquals("x-fmt/111", metadata.get("sf:pronom:id")); - assertEquals("extension match txt", metadata.get("sf:pronom:basis")); - assertEquals("Plain Text File", metadata.get("sf:pronom:format")); - assertEquals("text/plain", metadata.get("sf:pronom:mime")); - assertNull(metadata.get("sf:pronom:version")); - assertEquals("empty source", metadata.get(SiegfriedDetector.SIEGFRIED_ERRORS)); - } - - @Test - public void testWarnings() throws Exception { - FileProcessResult fileProcessResult = load("test-warnings.json"); - Metadata metadata = new Metadata(); - SiegfriedDetector.processResult(fileProcessResult, metadata, false); - assertEquals("1.9.5", metadata.get(SiegfriedDetector.SIEGFRIED_VERSION)); - assertEquals("default.sig", metadata.get(SiegfriedDetector.SIEGFRIED_SIGNATURE)); - assertEquals("UNKNOWN", metadata.get("sf:pronom:id")); - assertNull(metadata.get("sf:pronom:basis")); - assertNull(metadata.get("sf:pronom:format")); - assertNull(metadata.get("sf:pronom:mime")); - 
assertNull(metadata.get("sf:pronom:version")); - assertTrue(metadata.get("sf:pronom:warning") - .startsWith("no match; possibilities based on extension are fmt/14, fmt/15, fmt/16, " + - "fmt/17, fmt/18, fmt/19")); - } - - -*/ + /* + * @Test public void testErrors() throws Exception { FileProcessResult fileProcessResult = + * load("test-errors.json"); Metadata metadata = new Metadata(); + * SiegfriedDetector.processResult(fileProcessResult, metadata, false); //debug(metadata); + * assertEquals("1.9.5", metadata.get(SiegfriedDetector.SIEGFRIED_VERSION)); + * assertEquals("default.sig", metadata.get(SiegfriedDetector.SIEGFRIED_SIGNATURE)); + * assertEquals("x-fmt/111", metadata.get("sf:pronom:id")); assertEquals("extension match txt", + * metadata.get("sf:pronom:basis")); assertEquals("Plain Text File", + * metadata.get("sf:pronom:format")); assertEquals("text/plain", + * metadata.get("sf:pronom:mime")); assertNull(metadata.get("sf:pronom:version")); + * assertEquals("empty source", metadata.get(SiegfriedDetector.SIEGFRIED_ERRORS)); } + * + * @Test public void testWarnings() throws Exception { FileProcessResult fileProcessResult = + * load("test-warnings.json"); Metadata metadata = new Metadata(); + * SiegfriedDetector.processResult(fileProcessResult, metadata, false); assertEquals("1.9.5", + * metadata.get(SiegfriedDetector.SIEGFRIED_VERSION)); assertEquals("default.sig", + * metadata.get(SiegfriedDetector.SIEGFRIED_SIGNATURE)); assertEquals("UNKNOWN", + * metadata.get("sf:pronom:id")); assertNull(metadata.get("sf:pronom:basis")); + * assertNull(metadata.get("sf:pronom:format")); assertNull(metadata.get("sf:pronom:mime")); + * assertNull(metadata.get("sf:pronom:version")); assertTrue(metadata.get("sf:pronom:warning") + * .startsWith("no match; possibilities based on extension are fmt/14, fmt/15, fmt/16, " + + * "fmt/17, fmt/18, fmt/19")); } + * + * + */ private FileProcessResult load(String jsonFileName) throws IOException { - String jsonString = IOUtils.toString( - 
getClass().getResourceAsStream("/json/" + jsonFileName), StandardCharsets.UTF_8); + String jsonString = + IOUtils.toString(getClass().getResourceAsStream("/json/" + jsonFileName), + StandardCharsets.UTF_8); FileProcessResult r = new FileProcessResult(); r.setStdout(jsonString); r.setExitValue(0); diff --git a/tika-detectors/tika-detector-siegfried/src/main/java/org/apache/tika/detect/siegfried/SiegfriedDetector.java b/tika-detectors/tika-detector-siegfried/src/main/java/org/apache/tika/detect/siegfried/SiegfriedDetector.java index 7629a066a6..7a64ca6833 100644 --- a/tika-detectors/tika-detector-siegfried/src/main/java/org/apache/tika/detect/siegfried/SiegfriedDetector.java +++ b/tika-detectors/tika-detector-siegfried/src/main/java/org/apache/tika/detect/siegfried/SiegfriedDetector.java @@ -1,34 +1,28 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.detect.siegfried; import static java.nio.file.StandardCopyOption.REPLACE_EXISTING; +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; import java.io.IOException; import java.io.InputStream; import java.nio.file.Files; import java.nio.file.Path; - -import com.fasterxml.jackson.core.JsonProcessingException; -import com.fasterxml.jackson.databind.JsonNode; -import com.fasterxml.jackson.databind.ObjectMapper; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - import org.apache.tika.config.Field; import org.apache.tika.detect.Detector; import org.apache.tika.io.BoundedInputStream; @@ -42,11 +36,13 @@ import org.apache.tika.utils.FileProcessResult; import org.apache.tika.utils.ProcessUtils; import org.apache.tika.utils.StringUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** - * Simple wrapper around Siegfried https://github.com/richardlehane/siegfried - * The default behavior is to run detection, report the results in the - * metadata and then return null so that other detectors will be used. + * Simple wrapper around Siegfried https://github.com/richardlehane/siegfried The default behavior + * is to run detection, report the results in the metadata and then return null so that other + * detectors will be used. 
*/ public class SiegfriedDetector implements Detector { @@ -58,21 +54,20 @@ enum STATUS { public static Property SIEGFRIED_STATUS = Property.externalText(SIEGFRIED_PREFIX + "status"); public static Property SIEGFRIED_VERSION = - Property.externalText(SIEGFRIED_PREFIX + "sf_version"); + Property.externalText(SIEGFRIED_PREFIX + "sf_version"); public static Property SIEGFRIED_SIGNATURE = - Property.externalText(SIEGFRIED_PREFIX + "signature"); + Property.externalText(SIEGFRIED_PREFIX + "signature"); public static Property SIEGFRIED_IDENTIFIERS_NAME = - Property.externalTextBag(SIEGFRIED_PREFIX + "identifiers_name"); + Property.externalTextBag(SIEGFRIED_PREFIX + "identifiers_name"); public static Property SIEGFRIED_IDENTIFIERS_DETAILS = - Property.externalTextBag(SIEGFRIED_PREFIX + "identifiers_details"); + Property.externalTextBag(SIEGFRIED_PREFIX + "identifiers_details"); - public static Property SIEGFRIED_ERRORS = - Property.externalTextBag(SIEGFRIED_PREFIX + "errors"); + public static Property SIEGFRIED_ERRORS = Property.externalTextBag(SIEGFRIED_PREFIX + "errors"); - //TODO -- grab errors and warnings + // TODO -- grab errors and warnings public static String ID = "id"; public static String FORMAT = "format"; @@ -97,12 +92,12 @@ enum STATUS { private boolean useMime = false; public static boolean checkHasSiegfried(String siegfriedCommandPath) { - String[] commandline = new String[]{siegfriedCommandPath, "-version"}; + String[] commandline = new String[] {siegfriedCommandPath, "-version"}; return ExternalParser.check(commandline); } /** - * @param input document input stream, or null + * @param input document input stream, or null * @param metadata input metadata for the document * @return mime as identified by the file command or application/octet-stream otherwise * @throws IOException @@ -121,8 +116,8 @@ public MediaType detect(InputStream input, Metadata metadata) throws IOException } TikaInputStream tis = TikaInputStream.cast(input); if (tis != null) { - 
//spool the full file to disk, if called with a TikaInputStream - //and there is no underlying file + // spool the full file to disk, if called with a TikaInputStream + // and there is no underlying file return detectOnPath(tis.getPath(), metadata); } @@ -137,12 +132,10 @@ public MediaType detect(InputStream input, Metadata metadata) throws IOException } /** - * As default behavior, Tika runs Siegfried to add its detection - * to the metadata, but NOT to use detection in determining parsers - * etc. If this is set to true, this detector - * will return the first mime detected by Siegfried and that - * mime will be used by the AutoDetectParser to select the appropriate - * parser. + * As default behavior, Tika runs Siegfried to add its detection to the metadata, but NOT to use + * detection in determining parsers etc. If this is set to true, this detector will + * return the first mime detected by Siegfried and that mime will be used by the + * AutoDetectParser to select the appropriate parser. 
* * @param useMime */ @@ -157,15 +150,15 @@ public boolean isUseMime() { private MediaType detectOnPath(Path path, Metadata metadata) throws IOException { - String[] args = new String[]{ProcessUtils.escapeCommandLine(siegfriedPath), "-json", - ProcessUtils.escapeCommandLine(path.toAbsolutePath().toString())}; + String[] args = new String[] {ProcessUtils.escapeCommandLine(siegfriedPath), "-json", + ProcessUtils.escapeCommandLine(path.toAbsolutePath().toString())}; ProcessBuilder builder = new ProcessBuilder(args); FileProcessResult result = ProcessUtils.execute(builder, timeoutMs, 1000000, 1000); return processResult(result, metadata, useMime); } protected static MediaType processResult(FileProcessResult result, Metadata metadata, - boolean returnMime) { + boolean returnMime) { metadata.set(ExternalProcess.EXIT_VALUE, result.getExitValue()); metadata.set(ExternalProcess.IS_TIMEOUT, result.isTimeout()); @@ -214,15 +207,15 @@ protected static MediaType processResult(FileProcessResult result, Metadata meta if (errors.isTextual()) { metadata.add(SIEGFRIED_ERRORS, file.get(ERRORS).asText()); } else if (errors.isArray()) { - //is this even possible?! + // is this even possible?! for (JsonNode e : errors) { metadata.add(SIEGFRIED_ERRORS, e.asText()); } } } for (JsonNode match : file.get("matches")) { - String ns = match.has("ns") ? match.get("ns").asText(StringUtils.EMPTY) : - StringUtils.EMPTY; + String ns = match.has("ns") ? 
match.get("ns").asText(StringUtils.EMPTY) + : StringUtils.EMPTY; addNotBlank(match, "basis", metadata, SIEGFRIED_PREFIX + ns + ":" + BASIS); addNotBlank(match, "format", metadata, SIEGFRIED_PREFIX + ns + ":" + FORMAT); addNotBlank(match, "id", metadata, SIEGFRIED_PREFIX + ns + ":" + ID); @@ -230,7 +223,7 @@ protected static MediaType processResult(FileProcessResult result, Metadata meta addNotBlank(match, "version", metadata, SIEGFRIED_PREFIX + ns + ":" + VERSION); addNotBlank(match, "warning", metadata, SIEGFRIED_PREFIX + ns + ":" + WARNING); - //take the first non-octet-stream + // take the first non-octet-stream if (returnMime && mt.equals(MediaType.OCTET_STREAM)) { if (match.has("mime")) { String mimeString = match.get("mime").asText(StringUtils.EMPTY); @@ -247,7 +240,7 @@ protected static MediaType processResult(FileProcessResult result, Metadata meta } private static void addNotBlank(JsonNode node, String jsonKey, Metadata metadata, - String metadataKey) { + String metadataKey) { if (node.has(jsonKey)) { String val = node.get(jsonKey).asText(StringUtils.EMPTY); if (StringUtils.isBlank(val)) { @@ -259,16 +252,15 @@ private static void addNotBlank(JsonNode node, String jsonKey, Metadata metadata @Field public void setSiegfriedPath(String fileCommandPath) { - //this opens up a potential command vulnerability. - //Don't ever let an untrusted user set this. + // this opens up a potential command vulnerability. + // Don't ever let an untrusted user set this. this.siegfriedPath = fileCommandPath; checkHasSiegfried(this.siegfriedPath); } /** - * If this is not called on a TikaInputStream, this detector - * will spool up to this many bytes to a file to be detected - * by the 'file' command. + * If this is not called on a TikaInputStream, this detector will spool up to this many bytes to + * a file to be detected by the 'file' command. 
* * @param maxBytes */ diff --git a/tika-detectors/tika-detector-siegfried/src/test/java/org/apache/tika/detect/siegfried/TestSiegfriedIntegration.java b/tika-detectors/tika-detector-siegfried/src/test/java/org/apache/tika/detect/siegfried/TestSiegfriedIntegration.java index e96eff0fa8..fd0108409a 100644 --- a/tika-detectors/tika-detector-siegfried/src/test/java/org/apache/tika/detect/siegfried/TestSiegfriedIntegration.java +++ b/tika-detectors/tika-detector-siegfried/src/test/java/org/apache/tika/detect/siegfried/TestSiegfriedIntegration.java @@ -1,32 +1,28 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
+ * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.detect.siegfried; import java.net.URISyntaxException; import java.nio.file.Path; import java.nio.file.Paths; - -import org.junit.jupiter.api.Disabled; -import org.junit.jupiter.api.Test; - import org.apache.tika.TikaTest; import org.apache.tika.config.TikaConfig; import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.Parser; +import org.junit.jupiter.api.Disabled; +import org.junit.jupiter.api.Test; @Disabled("need to have siegfried on the path") public class TestSiegfriedIntegration extends TikaTest { @@ -39,8 +35,7 @@ public void testIntegration() throws Exception { } private Path getConfig(String configName) throws URISyntaxException { - return Paths.get( - getClass().getResource("/configs/" + configName).toURI()); + return Paths.get(getClass().getResource("/configs/" + configName).toURI()); } } diff --git a/tika-detectors/tika-detector-siegfried/src/test/java/org/apache/tika/detect/siegfried/TestSiegfriedJsonParsing.java b/tika-detectors/tika-detector-siegfried/src/test/java/org/apache/tika/detect/siegfried/TestSiegfriedJsonParsing.java index 237eab51f7..ea53b2dcf9 100644 --- a/tika-detectors/tika-detector-siegfried/src/test/java/org/apache/tika/detect/siegfried/TestSiegfriedJsonParsing.java +++ b/tika-detectors/tika-detector-siegfried/src/test/java/org/apache/tika/detect/siegfried/TestSiegfriedJsonParsing.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.detect.siegfried; @@ -22,17 +20,15 @@ import java.io.IOException; import java.nio.charset.StandardCharsets; - import org.apache.commons.io.IOUtils; -import org.junit.jupiter.api.Test; - import org.apache.tika.TikaTest; import org.apache.tika.metadata.Metadata; import org.apache.tika.utils.FileProcessResult; +import org.junit.jupiter.api.Test; public class TestSiegfriedJsonParsing extends TikaTest { - //TODO -- add testcontainers unit test with dockerized siegfried + // TODO -- add testcontainers unit test with dockerized siegfried @Test public void testBasic() throws Exception { @@ -42,8 +38,10 @@ public void testBasic() throws Exception { assertEquals("1.9.5", metadata.get(SiegfriedDetector.SIEGFRIED_VERSION)); assertEquals("default.sig", metadata.get(SiegfriedDetector.SIEGFRIED_SIGNATURE)); assertEquals("fmt/19", metadata.get("sf:pronom:id")); - assertEquals("extension match pdf; byte match at [[0 8] [810818 5]]", metadata.get("sf:pronom:basis")); - assertEquals("Acrobat PDF 1.5 - Portable Document Format", metadata.get("sf:pronom:format")); + assertEquals("extension match pdf; byte match at [[0 8] [810818 5]]", + metadata.get("sf:pronom:basis")); + assertEquals("Acrobat PDF 1.5 - Portable Document Format", + metadata.get("sf:pronom:format")); assertEquals("application/pdf", metadata.get("sf:pronom:mime")); assertEquals("1.5", metadata.get("sf:pronom:version")); @@ -54,7 +52,7 @@ public void testErrors() throws Exception { FileProcessResult fileProcessResult = load("test-errors.json"); Metadata metadata = new Metadata(); SiegfriedDetector.processResult(fileProcessResult, metadata, false); - //debug(metadata); + // debug(metadata); assertEquals("1.9.5", metadata.get(SiegfriedDetector.SIEGFRIED_VERSION)); assertEquals("default.sig", metadata.get(SiegfriedDetector.SIEGFRIED_SIGNATURE)); assertEquals("x-fmt/111", metadata.get("sf:pronom:id")); @@ -77,17 +75,17 @@ public void testWarnings() throws Exception { 
assertNull(metadata.get("sf:pronom:format")); assertNull(metadata.get("sf:pronom:mime")); assertNull(metadata.get("sf:pronom:version")); - assertTrue(metadata.get("sf:pronom:warning") - .startsWith("no match; possibilities based on extension are fmt/14, fmt/15, fmt/16, " + - "fmt/17, fmt/18, fmt/19")); + assertTrue(metadata.get("sf:pronom:warning").startsWith( + "no match; possibilities based on extension are fmt/14, fmt/15, fmt/16, " + + "fmt/17, fmt/18, fmt/19")); } - private FileProcessResult load(String jsonFileName) throws IOException { - String jsonString = IOUtils.toString( - getClass().getResourceAsStream("/json/" + jsonFileName), StandardCharsets.UTF_8); + String jsonString = + IOUtils.toString(getClass().getResourceAsStream("/json/" + jsonFileName), + StandardCharsets.UTF_8); FileProcessResult r = new FileProcessResult(); r.setStdout(jsonString); r.setExitValue(0); diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/EvalConfig.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/EvalConfig.java index fc0d72f0a6..5723ae2086 100644 --- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/EvalConfig.java +++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/EvalConfig.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. 
The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.eval.app; @@ -81,9 +79,12 @@ public Path getErrorLogFile() { @Override public String toString() { - return "EvalConfig{" + "minExtractLength=" + minExtractLength + ", maxExtractLength=" + maxExtractLength + ", jdbcString='" + jdbcString + '\'' + ", jdbcDriverClass='" + - jdbcDriverClass + '\'' + ", forceDrop=" + forceDrop + ", maxFilesToAdd=" + maxFilesToAdd + ", maxTokens=" + maxTokens + ", maxContentLength=" + maxContentLength + - ", numThreads=" + numWorkers + ", errorLogFile=" + errorLogFile + '}'; + return "EvalConfig{" + "minExtractLength=" + minExtractLength + ", maxExtractLength=" + + maxExtractLength + ", jdbcString='" + jdbcString + '\'' + + ", jdbcDriverClass='" + jdbcDriverClass + '\'' + ", forceDrop=" + + forceDrop + ", maxFilesToAdd=" + maxFilesToAdd + ", maxTokens=" + + maxTokens + ", maxContentLength=" + maxContentLength + ", numThreads=" + + numWorkers + ", errorLogFile=" + errorLogFile + '}'; } public void setNumWorkers(int n) { diff --git 
a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/EvalFilePaths.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/EvalFilePaths.java index aa54cca2f0..96aabb61de 100644 --- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/EvalFilePaths.java +++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/EvalFilePaths.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.eval.app; @@ -25,9 +23,8 @@ import java.util.Objects; /** - * Simple struct to keep track of relative path of source file ( - * original binary file, e.g. /subdir/document1.doc) - * and the extract file (e.g. /subdir/document1.doc.json). + * Simple struct to keep track of relative path of source file ( original binary file, e.g. + * /subdir/document1.doc) and the extract file (e.g. /subdir/document1.doc.json). */ class EvalFilePaths { @@ -48,7 +45,7 @@ public EvalFilePaths(Path relativeSourceFilePath, Path extractFile) { try { extractFileLength = Files.size(extractFile); } catch (IOException e) { - //swallow ? + // swallow ? } } this.relativeSourceFilePath = relativeSourceFilePath; @@ -59,12 +56,12 @@ public Path getRelativeSourceFilePath() { return relativeSourceFilePath; } - //this path may or may not exist and it could be null! + // this path may or may not exist and it could be null! public Path getExtractFile() { return extractFile; } - //if it doesn't exist, it'll be -1l. + // if it doesn't exist, it'll be -1l. 
public long getSourceFileLength() { return sourceFileLength; } @@ -108,7 +105,8 @@ public int hashCode() { @Override public String toString() { - return "EvalFilePaths{" + "relativeSourceFilePath=" + relativeSourceFilePath + ", extractFile=" + extractFile + ", sourceFileLength=" + sourceFileLength + - ", extractFileLength=" + extractFileLength + '}'; + return "EvalFilePaths{" + "relativeSourceFilePath=" + relativeSourceFilePath + + ", extractFile=" + extractFile + ", sourceFileLength=" + sourceFileLength + + ", extractFileLength=" + extractFileLength + '}'; } } diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractComparer.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractComparer.java index 48cea521ae..2d55065cf3 100644 --- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractComparer.java +++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractComparer.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.eval.app; @@ -28,7 +26,6 @@ import org.apache.commons.cli.Options; import org.apache.commons.cli.help.HelpFormatter; import org.apache.commons.io.FilenameUtils; - import org.apache.tika.eval.app.db.ColInfo; import org.apache.tika.eval.app.db.Cols; import org.apache.tika.eval.app.db.TableInfo; @@ -47,43 +44,65 @@ public class ExtractComparer extends ProfilerBase { - private static final String DIGEST_KEY_PREFIX = TikaCoreProperties.TIKA_META_PREFIX + "digest" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER; + private static final String DIGEST_KEY_PREFIX = TikaCoreProperties.TIKA_META_PREFIX + "digest" + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER; private final static String FIELD_A = "fa"; private final static String FIELD_B = "fb"; - public static TableInfo REF_PAIR_NAMES = new TableInfo("pair_names", new ColInfo(Cols.DIR_NAME_A, Types.VARCHAR, 128), new ColInfo(Cols.DIR_NAME_B, Types.VARCHAR, 128)); - public static TableInfo COMPARISON_CONTAINERS = - new TableInfo("containers", new ColInfo(Cols.CONTAINER_ID, Types.INTEGER, "PRIMARY KEY"), new ColInfo(Cols.FILE_PATH, Types.VARCHAR, FILE_PATH_MAX_LEN), - new ColInfo(Cols.FILE_EXTENSION, Types.VARCHAR, 12), 
new ColInfo(Cols.LENGTH, Types.BIGINT), new ColInfo(Cols.EXTRACT_FILE_LENGTH_A, Types.BIGINT), + public static TableInfo REF_PAIR_NAMES = + new TableInfo("pair_names", new ColInfo(Cols.DIR_NAME_A, Types.VARCHAR, 128), + new ColInfo(Cols.DIR_NAME_B, Types.VARCHAR, 128)); + public static TableInfo COMPARISON_CONTAINERS = new TableInfo("containers", + new ColInfo(Cols.CONTAINER_ID, Types.INTEGER, "PRIMARY KEY"), + new ColInfo(Cols.FILE_PATH, Types.VARCHAR, FILE_PATH_MAX_LEN), + new ColInfo(Cols.FILE_EXTENSION, Types.VARCHAR, 12), + new ColInfo(Cols.LENGTH, Types.BIGINT), + new ColInfo(Cols.EXTRACT_FILE_LENGTH_A, Types.BIGINT), new ColInfo(Cols.EXTRACT_FILE_LENGTH_B, Types.BIGINT)); - public static TableInfo CONTENT_COMPARISONS = - new TableInfo("content_comparisons", new ColInfo(Cols.ID, Types.INTEGER, "PRIMARY KEY"), new ColInfo(Cols.TOP_10_UNIQUE_TOKEN_DIFFS_A, Types.VARCHAR, 1024), - new ColInfo(Cols.TOP_10_UNIQUE_TOKEN_DIFFS_B, Types.VARCHAR, 1024), new ColInfo(Cols.TOP_10_MORE_IN_A, Types.VARCHAR, 1024), - new ColInfo(Cols.TOP_10_MORE_IN_B, Types.VARCHAR, 1024), new ColInfo(Cols.DICE_COEFFICIENT, Types.FLOAT), new ColInfo(Cols.OVERLAP, Types.FLOAT)); - public static TableInfo PROFILES_A = new TableInfo("profiles_a", ExtractProfiler.PROFILE_TABLE.getColInfos()); - public static TableInfo PROFILES_B = new TableInfo("profiles_b", ExtractProfiler.PROFILE_TABLE.getColInfos()); - public static TableInfo EMBEDDED_FILE_PATH_TABLE_A = new TableInfo("emb_path_a", ExtractProfiler.EMBEDDED_FILE_PATH_TABLE.getColInfos()); - public static TableInfo EMBEDDED_FILE_PATH_TABLE_B = new TableInfo("emb_path_b", ExtractProfiler.EMBEDDED_FILE_PATH_TABLE.getColInfos()); - public static TableInfo CONTENTS_TABLE_A = new TableInfo("contents_a", ExtractProfiler.CONTENTS_TABLE.getColInfos()); - public static TableInfo CONTENTS_TABLE_B = new TableInfo("contents_b", ExtractProfiler.CONTENTS_TABLE.getColInfos()); - public static TableInfo TAGS_TABLE_A = new TableInfo("tags_a", 
ExtractProfiler.TAGS_TABLE.getColInfos()); - public static TableInfo TAGS_TABLE_B = new TableInfo("tags_b", ExtractProfiler.TAGS_TABLE.getColInfos()); - public static TableInfo EXCEPTION_TABLE_A = new TableInfo("exceptions_a", ExtractProfiler.EXCEPTION_TABLE.getColInfos()); - public static TableInfo EXCEPTION_TABLE_B = new TableInfo("exceptions_b", ExtractProfiler.EXCEPTION_TABLE.getColInfos()); - public static TableInfo EXTRACT_EXCEPTION_TABLE_A = new TableInfo("extract_exceptions_a", ExtractProfiler.EXTRACT_EXCEPTION_TABLE.getColInfos()); - public static TableInfo EXTRACT_EXCEPTION_TABLE_B = new TableInfo("extract_exceptions_b", ExtractProfiler.EXTRACT_EXCEPTION_TABLE.getColInfos()); + public static TableInfo CONTENT_COMPARISONS = new TableInfo("content_comparisons", + new ColInfo(Cols.ID, Types.INTEGER, "PRIMARY KEY"), + new ColInfo(Cols.TOP_10_UNIQUE_TOKEN_DIFFS_A, Types.VARCHAR, 1024), + new ColInfo(Cols.TOP_10_UNIQUE_TOKEN_DIFFS_B, Types.VARCHAR, 1024), + new ColInfo(Cols.TOP_10_MORE_IN_A, Types.VARCHAR, 1024), + new ColInfo(Cols.TOP_10_MORE_IN_B, Types.VARCHAR, 1024), + new ColInfo(Cols.DICE_COEFFICIENT, Types.FLOAT), + new ColInfo(Cols.OVERLAP, Types.FLOAT)); + public static TableInfo PROFILES_A = + new TableInfo("profiles_a", ExtractProfiler.PROFILE_TABLE.getColInfos()); + public static TableInfo PROFILES_B = + new TableInfo("profiles_b", ExtractProfiler.PROFILE_TABLE.getColInfos()); + public static TableInfo EMBEDDED_FILE_PATH_TABLE_A = new TableInfo("emb_path_a", + ExtractProfiler.EMBEDDED_FILE_PATH_TABLE.getColInfos()); + public static TableInfo EMBEDDED_FILE_PATH_TABLE_B = new TableInfo("emb_path_b", + ExtractProfiler.EMBEDDED_FILE_PATH_TABLE.getColInfos()); + public static TableInfo CONTENTS_TABLE_A = + new TableInfo("contents_a", ExtractProfiler.CONTENTS_TABLE.getColInfos()); + public static TableInfo CONTENTS_TABLE_B = + new TableInfo("contents_b", ExtractProfiler.CONTENTS_TABLE.getColInfos()); + public static TableInfo TAGS_TABLE_A = + new 
TableInfo("tags_a", ExtractProfiler.TAGS_TABLE.getColInfos()); + public static TableInfo TAGS_TABLE_B = + new TableInfo("tags_b", ExtractProfiler.TAGS_TABLE.getColInfos()); + public static TableInfo EXCEPTION_TABLE_A = + new TableInfo("exceptions_a", ExtractProfiler.EXCEPTION_TABLE.getColInfos()); + public static TableInfo EXCEPTION_TABLE_B = + new TableInfo("exceptions_b", ExtractProfiler.EXCEPTION_TABLE.getColInfos()); + public static TableInfo EXTRACT_EXCEPTION_TABLE_A = new TableInfo("extract_exceptions_a", + ExtractProfiler.EXTRACT_EXCEPTION_TABLE.getColInfos()); + public static TableInfo EXTRACT_EXCEPTION_TABLE_B = new TableInfo("extract_exceptions_b", + ExtractProfiler.EXTRACT_EXCEPTION_TABLE.getColInfos()); static Options OPTIONS; - - //need to parameterize? + // need to parameterize? private final Path inputDir; private final Path extractsA; private final Path extractsB; private final TokenContraster tokenContraster = new TokenContraster(); private final ExtractReader extractReader; - public ExtractComparer(Path inputDir, Path extractsA, Path extractsB, ExtractReader extractReader, IDBWriter writer) { + public ExtractComparer(Path inputDir, Path extractsA, Path extractsB, + ExtractReader extractReader, IDBWriter writer) { super(writer); this.inputDir = inputDir; this.extractsA = extractsA; @@ -93,9 +112,11 @@ public ExtractComparer(Path inputDir, Path extractsA, Path extractsB, ExtractRea public static void USAGE() throws IOException { HelpFormatter helpFormatter = HelpFormatter.builder().get(); - helpFormatter.printHelp("java -jar tika-eval-x.y.jar Compare -extractsA extractsA -extractsB extractsB -db mydb", - "Tool: Compare", ExtractComparer.OPTIONS, - "Note: for the default h2 db, do not include the .mv.db at the end of the db name.", true); + helpFormatter.printHelp( + "java -jar tika-eval-x.y.jar Compare -extractsA extractsA -extractsB extractsB -db mydb", + "Tool: Compare", ExtractComparer.OPTIONS, + "Note: for the default h2 db, do not include the 
.mv.db at the end of the db name.", + true); } @Override @@ -104,7 +125,7 @@ public boolean processFileResource(FetchKey fetchKey) { EvalFilePaths fpsB = null; if (inputDir != null && (inputDir.equals(extractsA) || inputDir.equals(extractsB))) { - //crawling an extract dir + // crawling an extract dir fpsA = getPathsFromExtractCrawl(fetchKey, extractsA); fpsB = getPathsFromExtractCrawl(fetchKey, extractsB); @@ -116,13 +137,13 @@ public boolean processFileResource(FetchKey fetchKey) { try { compareFiles(fpsA, fpsB); } catch (Throwable e) { - //this should be cataclysmic... + // this should be cataclysmic... throw new RuntimeException("Exception while working on: " + fetchKey.getFetchKey(), e); } return true; } - //protected for testing, should find better way so that this can be private! + // protected for testing, should find better way so that this can be private! protected void compareFiles(EvalFilePaths fpsA, EvalFilePaths fpsB) throws IOException { ExtractReaderException.TYPE extractExceptionA = null; @@ -145,39 +166,41 @@ protected void compareFiles(EvalFilePaths fpsA, EvalFilePaths fpsB) throws IOExc extractExceptionB = e.getType(); } - //array indices for those metadata items handled in B + // array indices for those metadata items handled in B Set handledB = new HashSet<>(); String containerID = Integer.toString(ID.getAndIncrement()); - //container table + // container table Map contData = new HashMap<>(); contData.put(Cols.CONTAINER_ID, containerID); - contData.put(Cols.FILE_PATH, fpsA - .getRelativeSourceFilePath() - .toString()); + contData.put(Cols.FILE_PATH, fpsA.getRelativeSourceFilePath().toString()); long srcFileLength = getSourceFileLength(metadataListA, metadataListB); - contData.put(Cols.LENGTH, srcFileLength > NON_EXISTENT_FILE_LENGTH ? 
Long.toString(srcFileLength) : ""); - contData.put(Cols.FILE_EXTENSION, FilenameUtils.getExtension(fpsA - .getRelativeSourceFilePath() - .getFileName() - .toString())); + contData.put(Cols.LENGTH, + srcFileLength > NON_EXISTENT_FILE_LENGTH ? Long.toString(srcFileLength) + : ""); + contData.put(Cols.FILE_EXTENSION, FilenameUtils + .getExtension(fpsA.getRelativeSourceFilePath().getFileName().toString())); long extractFileLengthA = getFileLength(fpsA.getExtractFile()); - contData.put(Cols.EXTRACT_FILE_LENGTH_A, extractFileLengthA > NON_EXISTENT_FILE_LENGTH ? Long.toString(extractFileLengthA) : ""); + contData.put(Cols.EXTRACT_FILE_LENGTH_A, + extractFileLengthA > NON_EXISTENT_FILE_LENGTH + ? Long.toString(extractFileLengthA) + : ""); long extractFileLengthB = getFileLength(fpsB.getExtractFile()); - contData.put(Cols.EXTRACT_FILE_LENGTH_B, extractFileLengthB > NON_EXISTENT_FILE_LENGTH ? Long.toString(extractFileLengthB) : ""); + contData.put(Cols.EXTRACT_FILE_LENGTH_B, + extractFileLengthB > NON_EXISTENT_FILE_LENGTH + ? 
Long.toString(extractFileLengthB) + : ""); writer.writeRow(COMPARISON_CONTAINERS, contData); if (extractExceptionA != null) { - writeExtractException(EXTRACT_EXCEPTION_TABLE_A, containerID, fpsA - .getRelativeSourceFilePath() - .toString(), extractExceptionA); + writeExtractException(EXTRACT_EXCEPTION_TABLE_A, containerID, + fpsA.getRelativeSourceFilePath().toString(), extractExceptionA); } if (extractExceptionB != null) { - writeExtractException(EXTRACT_EXCEPTION_TABLE_B, containerID, fpsB - .getRelativeSourceFilePath() - .toString(), extractExceptionB); + writeExtractException(EXTRACT_EXCEPTION_TABLE_B, containerID, + fpsB.getRelativeSourceFilePath().toString(), extractExceptionB); } if (metadataListA == null && metadataListB == null) { @@ -189,22 +212,24 @@ protected void compareFiles(EvalFilePaths fpsA, EvalFilePaths fpsB) throws IOExc String sharedDigestKey = findSharedDigestKey(metadataListA, metadataListB); Map tokenStatsA = null; Map tokenStatsB = null; - //now get that metadata + // now get that metadata if (metadataListA != null) { for (int i = 0; i < metadataListA.size(); i++) { - //the first file should have the same id as the container id + // the first file should have the same id as the container id String fileId = (i == 0) ? containerID : Integer.toString(ID.getAndIncrement()); Metadata metadataA = metadataListA.get(i); ContentTags contentTagsA = getContent(fpsA, metadataA); ContentTags contentTagsB = ContentTags.EMPTY_CONTENT_TAGS; Metadata metadataB = null; - //TODO: shouldn't be fileA!!!! + // TODO: shouldn't be fileA!!!! 
writeTagData(fileId, contentTagsA, TAGS_TABLE_A); - writeProfileData(fpsA, i, contentTagsA, metadataA, fileId, containerID, numAttachmentsA, PROFILES_A); + writeProfileData(fpsA, i, contentTagsA, metadataA, fileId, containerID, + numAttachmentsA, PROFILES_A); writeExceptionData(fileId, metadataA, EXCEPTION_TABLE_A); - int matchIndex = getMatch(i, sharedDigestKey, handledB, metadataListA, metadataListB); + int matchIndex = getMatch(i, sharedDigestKey, handledB, metadataListA, + metadataListB); if (matchIndex > -1 && !handledB.contains(matchIndex)) { metadataB = metadataListB.get(matchIndex); @@ -213,11 +238,12 @@ protected void compareFiles(EvalFilePaths fpsA, EvalFilePaths fpsB) throws IOExc if (metadataB != null) { contentTagsB = getContent(fpsB, metadataB); writeTagData(fileId, contentTagsB, TAGS_TABLE_B); - writeProfileData(fpsB, i, contentTagsB, metadataB, fileId, containerID, numAttachmentsB, PROFILES_B); + writeProfileData(fpsB, i, contentTagsB, metadataB, fileId, containerID, + numAttachmentsB, PROFILES_B); writeExceptionData(fileId, metadataB, EXCEPTION_TABLE_B); } writeEmbeddedFilePathData(i, fileId, metadataA, metadataB); - //write content + // write content try { tokenStatsA = calcTextStats(contentTagsA); writeContentData(fileId, tokenStatsA, CONTENTS_TABLE_A); @@ -229,16 +255,19 @@ protected void compareFiles(EvalFilePaths fpsA, EvalFilePaths fpsB) throws IOExc throw new RuntimeException(e); } if (metadataB != null) { - TokenCounts tokenCountsA = (TokenCounts) tokenStatsA.get(BasicTokenCountStatsCalculator.class); - TokenCounts tokenCountsB = (TokenCounts) tokenStatsB.get(BasicTokenCountStatsCalculator.class); - //arbitrary decision...only run the comparisons if there are > 10 tokens total - //We may want to bump that value a bit higher? 
- //now run comparisons + TokenCounts tokenCountsA = (TokenCounts) tokenStatsA + .get(BasicTokenCountStatsCalculator.class); + TokenCounts tokenCountsB = (TokenCounts) tokenStatsB + .get(BasicTokenCountStatsCalculator.class); + // arbitrary decision...only run the comparisons if there are > 10 tokens total + // We may want to bump that value a bit higher? + // now run comparisons if (tokenCountsA.getTotalTokens() + tokenCountsB.getTotalTokens() > 10) { Map data = new HashMap<>(); data.put(Cols.ID, fileId); - ContrastStatistics contrastStatistics = tokenContraster.calculateContrastStatistics(tokenCountsA, tokenCountsB); + ContrastStatistics contrastStatistics = tokenContraster + .calculateContrastStatistics(tokenCountsA, tokenCountsB); writeContrasts(data, contrastStatistics); writer.writeRow(CONTENT_COMPARISONS, data); @@ -246,8 +275,8 @@ protected void compareFiles(EvalFilePaths fpsA, EvalFilePaths fpsB) throws IOExc } } } - //now try to get any Metadata objects in B - //that haven't yet been handled. + // now try to get any Metadata objects in B + // that haven't yet been handled. if (metadataListB != null) { for (int i = 0; i < metadataListB.size(); i++) { if (handledB.contains(i)) { @@ -255,14 +284,15 @@ protected void compareFiles(EvalFilePaths fpsA, EvalFilePaths fpsB) throws IOExc } Metadata metadataB = metadataListB.get(i); ContentTags contentTagsB = getContent(fpsB, metadataB); - //the first file should have the same id as the container id + // the first file should have the same id as the container id String fileId = (i == 0) ? 
containerID : Integer.toString(ID.getAndIncrement()); writeTagData(fileId, contentTagsB, TAGS_TABLE_B); - writeProfileData(fpsB, i, contentTagsB, metadataB, fileId, containerID, numAttachmentsB, PROFILES_B); + writeProfileData(fpsB, i, contentTagsB, metadataB, fileId, containerID, + numAttachmentsB, PROFILES_B); writeEmbeddedFilePathData(i, fileId, null, metadataB); writeExceptionData(fileId, metadataB, EXCEPTION_TABLE_B); - //write content + // write content try { tokenStatsB = calcTextStats(contentTagsB); writeContentData(fileId, tokenStatsB, CONTENTS_TABLE_B); @@ -274,8 +304,8 @@ protected void compareFiles(EvalFilePaths fpsA, EvalFilePaths fpsB) throws IOExc } /** - * Checks only the first item in each list. Returns the first - * digest key shared by both, if it exists, null otherwise. + * Checks only the first item in each list. Returns the first digest key shared by both, if it + * exists, null otherwise. * * @param metadataListA * @param metadataListB @@ -287,9 +317,7 @@ private String findSharedDigestKey(List metadataListA, List } Set digestA = new HashSet<>(); if (metadataListA != null && !metadataListA.isEmpty()) { - for (String n : metadataListA - .get(0) - .names()) { + for (String n : metadataListA.get(0).names()) { if (n.startsWith(DIGEST_KEY_PREFIX)) { digestA.add(n); } @@ -305,7 +333,7 @@ private String findSharedDigestKey(List metadataListA, List } private void writeEmbeddedFilePathData(int i, String fileId, Metadata mA, Metadata mB) { - //container file, don't write anything + // container file, don't write anything if (i == 0) { return; } @@ -349,52 +377,54 @@ private long getSourceFileLength(List metadataListA, List me /** - * Try to find the matching metadata based on the AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH - * If you can't find it, return -1; + * Try to find the matching metadata based on the + * AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH If you can't find it, return -1; * - * @param aIndex index for 
match in metadataListA + * @param aIndex index for match in metadataListA * @param metadataListA * @param metadataListB * @return */ - private int getMatch(int aIndex, String sharedDigestKey, Set handledB, List metadataListA, List metadataListB) { - //TODO: could make this more robust + private int getMatch(int aIndex, String sharedDigestKey, Set handledB, + List metadataListA, List metadataListB) { + // TODO: could make this more robust if (metadataListB == null || metadataListB.size() == 0) { return -1; } - //assume first is always the container file + // assume first is always the container file if (aIndex == 0) { return 0; } if (sharedDigestKey != null) { - //first try to find matching digests - return findMatchingDigests(sharedDigestKey, handledB, metadataListA.get(aIndex), metadataListB); + // first try to find matching digests + return findMatchingDigests(sharedDigestKey, handledB, metadataListA.get(aIndex), + metadataListB); } - //assume same embedded resource path. Not always true! + // assume same embedded resource path. Not always true! 
Metadata thisMetadata = metadataListA.get(aIndex); String embeddedPath = thisMetadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH); if (embeddedPath != null) { for (int j = 0; j < metadataListB.size(); j++) { - String thatEmbeddedPath = metadataListB - .get(j) - .get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH); + String thatEmbeddedPath = + metadataListB.get(j).get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH); if (embeddedPath.equals(thatEmbeddedPath)) { return j; } } } - //last resort, if lists are same size, guess the same index + // last resort, if lists are same size, guess the same index if (metadataListA.size() == metadataListB.size()) { - //assume no rearrangments if lists are the same size + // assume no rearrangments if lists are the same size return aIndex; } return -1; } - private int findMatchingDigests(String sharedDigestKey, Set handledB, Metadata metadata, List metadataListB) { + private int findMatchingDigests(String sharedDigestKey, Set handledB, + Metadata metadata, List metadataListB) { String digestA = metadata.get(sharedDigestKey); if (digestA == null) { return -1; @@ -410,7 +440,8 @@ private int findMatchingDigests(String sharedDigestKey, Set handledB, M String digestB = mB.get(sharedDigestKey); if (digestA.equalsIgnoreCase(digestB)) { cand = i; - if (resourceName != null && resourceName.equals(mB.get(TikaCoreProperties.FINAL_EMBEDDED_RESOURCE_PATH))) { + if (resourceName != null && resourceName + .equals(mB.get(TikaCoreProperties.FINAL_EMBEDDED_RESOURCE_PATH))) { return i; } } @@ -421,14 +452,17 @@ private int findMatchingDigests(String sharedDigestKey, Set handledB, M private void writeContrasts(Map data, ContrastStatistics contrastStatistics) { writeContrastString(data, Cols.TOP_10_MORE_IN_A, contrastStatistics.getTopNMoreA()); writeContrastString(data, Cols.TOP_10_MORE_IN_B, contrastStatistics.getTopNMoreB()); - writeContrastString(data, Cols.TOP_10_UNIQUE_TOKEN_DIFFS_A, contrastStatistics.getTopNUniqueA()); - writeContrastString(data, 
Cols.TOP_10_UNIQUE_TOKEN_DIFFS_B, contrastStatistics.getTopNUniqueB()); + writeContrastString(data, Cols.TOP_10_UNIQUE_TOKEN_DIFFS_A, + contrastStatistics.getTopNUniqueA()); + writeContrastString(data, Cols.TOP_10_UNIQUE_TOKEN_DIFFS_B, + contrastStatistics.getTopNUniqueB()); data.put(Cols.OVERLAP, Double.toString(contrastStatistics.getOverlap())); data.put(Cols.DICE_COEFFICIENT, Double.toString(contrastStatistics.getDiceCoefficient())); } - private void writeContrastString(Map data, Cols col, TokenIntPair[] tokenIntPairs) { + private void writeContrastString(Map data, Cols col, + TokenIntPair[] tokenIntPairs) { int i = 0; StringBuilder sb = new StringBuilder(); @@ -436,10 +470,7 @@ private void writeContrastString(Map data, Cols col, TokenIntPair[ if (i++ > 0) { sb.append(" | "); } - sb - .append(p.getToken()) - .append(": ") - .append(p.getValue()); + sb.append(p.getToken()).append(": ").append(p.getValue()); } data.put(col, sb.toString()); } diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractComparerRunner.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractComparerRunner.java index 0ab120c815..e5de5fe22b 100644 --- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractComparerRunner.java +++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractComparerRunner.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. 
See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.eval.app; @@ -43,9 +41,6 @@ import org.apache.commons.cli.Option; import org.apache.commons.cli.Options; import org.apache.commons.cli.help.HelpFormatter; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - import org.apache.tika.eval.app.db.Cols; import org.apache.tika.eval.app.db.JDBCUtil; import org.apache.tika.eval.app.db.MimeBuffer; @@ -59,6 +54,8 @@ import org.apache.tika.pipes.core.pipesiterator.CallablePipesIterator; import org.apache.tika.pipes.core.pipesiterator.PipesIterator; import org.apache.tika.pipes.pipesiterator.fs.FileSystemPipesIterator; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; public class ExtractComparerRunner { @@ -71,25 +68,38 @@ public class ExtractComparerRunner { static { OPTIONS = new Options() - .addOption(Option.builder("a").longOpt("extractsA").hasArg().desc("required: directory of 'A' extracts").get()) - .addOption(Option.builder("b").longOpt("extractsB").hasArg().desc("required: directory of 'B' extracts").get()) - .addOption(Option.builder("i").longOpt("inputDir").hasArg().desc("optional: directory for original binary input documents." - + " If not specified, -extracts is crawled as is.").get()) - .addOption(Option.builder("d").longOpt("db").hasArg().desc("optional: db path").get()) - .addOption(Option.builder("c").longOpt("config").hasArg().desc("tika-eval json config file").get()) - .addOption(Option.builder("n").longOpt("numWorkers").hasArg().desc("number of worker threads").get()) - .addOption(Option.builder("m").longOpt("maxExtractLength").hasArg().desc("maximum extract length").get()) - ; + .addOption(Option.builder("a").longOpt("extractsA").hasArg() + .desc("required: directory of 'A' extracts").get()) + .addOption(Option.builder("b").longOpt("extractsB").hasArg() + .desc("required: directory of 'B' extracts").get()) + .addOption(Option.builder("i").longOpt("inputDir").hasArg().desc( + "optional: directory for original binary input documents." 
+ + " If not specified, -extracts is crawled as is.") + .get()) + .addOption(Option.builder("d").longOpt("db").hasArg() + .desc("optional: db path").get()) + .addOption(Option.builder("c").longOpt("config").hasArg() + .desc("tika-eval json config file").get()) + .addOption(Option.builder("n").longOpt("numWorkers").hasArg() + .desc("number of worker threads").get()) + .addOption(Option.builder("m").longOpt("maxExtractLength").hasArg() + .desc("maximum extract length").get()); } public static void main(String[] args) throws Exception { DefaultParser defaultCLIParser = new DefaultParser(); CommandLine commandLine = defaultCLIParser.parse(OPTIONS, args); - EvalConfig evalConfig = commandLine.hasOption('c') ? EvalConfig.load(Paths.get(commandLine.getOptionValue('c'))) : new EvalConfig(); - Path extractsADir = commandLine.hasOption('a') ? Paths.get(commandLine.getOptionValue('a')) : Paths.get(USAGE_FAIL("Must specify extractsA dir: -a")); - Path extractsBDir = commandLine.hasOption('b') ? Paths.get(commandLine.getOptionValue('b')) : Paths.get(USAGE_FAIL("Must specify extractsB dir: -b")); - Path inputDir = commandLine.hasOption('i') ? Paths.get(commandLine.getOptionValue('i')) : extractsADir; - String dbPath = commandLine.hasOption('d') ? commandLine.getOptionValue('d') : USAGE_FAIL("Must specify the db name: -d"); + EvalConfig evalConfig = commandLine.hasOption('c') + ? EvalConfig.load(Paths.get(commandLine.getOptionValue('c'))) + : new EvalConfig(); + Path extractsADir = commandLine.hasOption('a') ? Paths.get(commandLine.getOptionValue('a')) + : Paths.get(USAGE_FAIL("Must specify extractsA dir: -a")); + Path extractsBDir = commandLine.hasOption('b') ? Paths.get(commandLine.getOptionValue('b')) + : Paths.get(USAGE_FAIL("Must specify extractsB dir: -b")); + Path inputDir = commandLine.hasOption('i') ? Paths.get(commandLine.getOptionValue('i')) + : extractsADir; + String dbPath = commandLine.hasOption('d') ? 
commandLine.getOptionValue('d') + : USAGE_FAIL("Must specify the db name: -d"); if (commandLine.hasOption('n')) { evalConfig.setNumWorkers(Integer.parseInt(commandLine.getOptionValue('n'))); @@ -107,15 +117,16 @@ private static String getJdbcConnectionString(String dbPath) { if (dbPath.startsWith("jdbc:")) { return dbPath; } - //default to h2 + // default to h2 Path p = Paths.get(dbPath); return "jdbc:h2:file:" + p.toAbsolutePath(); } - private static void execute(Path inputDir, Path extractsA, Path extractsB, String dbPath, EvalConfig evalConfig) throws SQLException, IOException { + private static void execute(Path inputDir, Path extractsA, Path extractsB, String dbPath, + EvalConfig evalConfig) throws SQLException, IOException { - //parameterize this? if necesssary + // parameterize this? if necesssary try { ProfilerBase.loadCommonTokens(null, null); } catch (IOException e) { @@ -133,30 +144,36 @@ private static void execute(Path inputDir, Path extractsA, Path extractsB, Strin AtomicBoolean crawlerActive = new AtomicBoolean(true); ArrayBlockingQueue queue = new ArrayBlockingQueue<>(1000); - CallablePipesIterator pipesIterator = new CallablePipesIterator(createIterator(inputDir), queue); + CallablePipesIterator pipesIterator = + new CallablePipesIterator(createIterator(inputDir), queue); - ExecutorService executorService = Executors.newFixedThreadPool(evalConfig.getNumWorkers() + 2); - ExecutorCompletionService executorCompletionService = new ExecutorCompletionService<>(executorService); + ExecutorService executorService = + Executors.newFixedThreadPool(evalConfig.getNumWorkers() + 2); + ExecutorCompletionService executorCompletionService = + new ExecutorCompletionService<>(executorService); - StatusReporter statusReporter = new StatusReporter(pipesIterator, processed, activeWorkers, crawlerActive); + StatusReporter statusReporter = + new StatusReporter(pipesIterator, processed, activeWorkers, crawlerActive); executorCompletionService.submit(statusReporter); 
executorCompletionService.submit(pipesIterator); for (int i = 0; i < evalConfig.getNumWorkers(); i++) { - ExtractReader extractReader = new ExtractReader(ExtractReader.ALTER_METADATA_LIST.AS_IS, evalConfig.getMinExtractLength(), evalConfig.getMaxExtractLength()); - ExtractComparer extractComparer = new ExtractComparer(inputDir, extractsA, extractsB, extractReader, - builder.getDBWriter(builder.getNonRefTableInfos(), jdbcUtil, mimeBuffer)); + ExtractReader extractReader = new ExtractReader(ExtractReader.ALTER_METADATA_LIST.AS_IS, + evalConfig.getMinExtractLength(), evalConfig.getMaxExtractLength()); + ExtractComparer extractComparer = new ExtractComparer(inputDir, extractsA, extractsB, + extractReader, builder.getDBWriter(builder.getNonRefTableInfos(), + jdbcUtil, mimeBuffer)); executorCompletionService.submit(new ComparerWorker(queue, extractComparer, processed)); } int finished = 0; try { while (finished < evalConfig.getNumWorkers() + 2) { - //blocking + // blocking Future future = executorCompletionService.take(); Long result = future.get(); if (result != null) { - //if the dir walker has finished + // if the dir walker has finished if (result == DIR_WALKER_COMPLETED_VALUE) { queue.put(PipesIterator.COMPLETED_SEMAPHORE); crawlerActive.set(false); @@ -184,20 +201,25 @@ private static PipesIterator createIterator(Path inputDir) { return fs; } - private static MimeBuffer initTables(JDBCUtil jdbcUtil, ExtractComparerBuilder builder, String connectionString, EvalConfig evalConfig) throws SQLException, IOException { + private static MimeBuffer initTables(JDBCUtil jdbcUtil, ExtractComparerBuilder builder, + String connectionString, EvalConfig evalConfig) + throws SQLException, IOException { - //step 1. create the tables - jdbcUtil.createTables(builder.getNonRefTableInfos(), JDBCUtil.CREATE_TABLE.THROW_EX_IF_EXISTS); + // step 1. 
create the tables + jdbcUtil.createTables(builder.getNonRefTableInfos(), + JDBCUtil.CREATE_TABLE.THROW_EX_IF_EXISTS); jdbcUtil.createTables(builder.getRefTableInfos(), JDBCUtil.CREATE_TABLE.THROW_EX_IF_EXISTS); - //step 2. create mime buffer - return new MimeBuffer(jdbcUtil.getConnection(), builder.getMimeTable(), MimeTypes.getDefaultMimeTypes()); + // step 2. create mime buffer + return new MimeBuffer(jdbcUtil.getConnection(), builder.getMimeTable(), + MimeTypes.getDefaultMimeTypes()); } private static void USAGE() throws IOException { HelpFormatter helpFormatter = HelpFormatter.builder().get(); - helpFormatter.printHelp("java -jar tika-eval-app-x.y.z.jar FileProfiler -e docs -d mydb [-i inputDir, -c config.json]", - "Tool: Profile", OPTIONS, null, true); + helpFormatter.printHelp( + "java -jar tika-eval-app-x.y.z.jar FileProfiler -e docs -d mydb [-i inputDir, -c config.json]", + "Tool: Profile", OPTIONS, null, true); } private static String USAGE_FAIL(String msg) throws IOException { @@ -211,7 +233,8 @@ private static class ComparerWorker implements Callable { private final ExtractComparer extractComparer; private final AtomicInteger processed; - ComparerWorker(ArrayBlockingQueue queue, ExtractComparer extractComparer, AtomicInteger processed) { + ComparerWorker(ArrayBlockingQueue queue, ExtractComparer extractComparer, + AtomicInteger processed) { this.queue = queue; this.extractComparer = extractComparer; this.processed = processed; @@ -228,7 +251,7 @@ public Long call() throws Exception { if (t == PipesIterator.COMPLETED_SEMAPHORE) { LOG.debug("worker hit semaphore and is stopping"); extractComparer.closeWriter(); - //hangs + // hangs queue.put(PipesIterator.COMPLETED_SEMAPHORE); return COMPARER_WORKER_COMPLETED_VALUE; } @@ -295,15 +318,15 @@ protected TableInfo getMimeTable() { return ProfilerBase.MIME_TABLE; } - public void populateRefTables(JDBCUtil dbUtil, MimeBuffer mimeBuffer) throws IOException, SQLException { + public void populateRefTables(JDBCUtil 
dbUtil, MimeBuffer mimeBuffer) + throws IOException, SQLException { boolean refTablesPopulated = true; try { Connection connection = dbUtil.getConnection(); for (TableInfo tableInfo : getRefTableInfos()) { int rows = 0; - try (ResultSet rs = connection - .createStatement() - .executeQuery("select * from " + tableInfo.getName())) { + try (ResultSet rs = connection.createStatement() + .executeQuery("select * from " + tableInfo.getName())) { while (rs.next()) { rows++; } @@ -315,7 +338,7 @@ public void populateRefTables(JDBCUtil dbUtil, MimeBuffer mimeBuffer) throws IOE } } catch (SQLException e) { - //swallow + // swallow } if (refTablesPopulated) { LOG.info("ref tables are already populated"); @@ -347,7 +370,8 @@ public void populateRefTables(JDBCUtil dbUtil, MimeBuffer mimeBuffer) throws IOE writer.close(); } - protected IDBWriter getDBWriter(List tableInfos, JDBCUtil dbUtil, MimeBuffer mimeBuffer) throws IOException, SQLException { + protected IDBWriter getDBWriter(List tableInfos, JDBCUtil dbUtil, + MimeBuffer mimeBuffer) throws IOException, SQLException { Connection conn = dbUtil.getConnection(); return new DBWriter(conn, tableInfos, dbUtil, mimeBuffer); } diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfileRunner.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfileRunner.java index b618bf0af2..6f3271f8ef 100644 --- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfileRunner.java +++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfileRunner.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.eval.app; @@ -43,9 +41,6 @@ import org.apache.commons.cli.Option; import org.apache.commons.cli.Options; import org.apache.commons.cli.help.HelpFormatter; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - import org.apache.tika.eval.app.db.Cols; import org.apache.tika.eval.app.db.JDBCUtil; import org.apache.tika.eval.app.db.MimeBuffer; @@ -59,6 +54,8 @@ import org.apache.tika.pipes.core.pipesiterator.CallablePipesIterator; import org.apache.tika.pipes.core.pipesiterator.PipesIterator; import org.apache.tika.pipes.pipesiterator.fs.FileSystemPipesIterator; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; public class ExtractProfileRunner { @@ -71,23 +68,34 @@ public class ExtractProfileRunner { static { OPTIONS = new Options() - .addOption(Option.builder("e").longOpt("extracts").hasArg().desc("required: directory of extracts").get()) - .addOption(Option.builder("i").longOpt("inputDir").hasArg().desc("optional: directory for original binary input documents." - + " If not specified, -extracts is crawled as is.").get()) - .addOption(Option.builder("d").longOpt("db").hasArg().desc("optional: db path").get()) - .addOption(Option.builder("c").longOpt("config").hasArg().desc("tika-eval json config file").get()) - .addOption(Option.builder("n").longOpt("numWorkers").hasArg().desc("number of worker threads").get()) - .addOption(Option.builder("m").longOpt("maxExtractLength").hasArg().desc("maximum extract length").get()) - ; + .addOption(Option.builder("e").longOpt("extracts").hasArg() + .desc("required: directory of extracts").get()) + .addOption(Option.builder("i").longOpt("inputDir").hasArg().desc( + "optional: directory for original binary input documents." 
+ + " If not specified, -extracts is crawled as is.") + .get()) + .addOption(Option.builder("d").longOpt("db").hasArg() + .desc("optional: db path").get()) + .addOption(Option.builder("c").longOpt("config").hasArg() + .desc("tika-eval json config file").get()) + .addOption(Option.builder("n").longOpt("numWorkers").hasArg() + .desc("number of worker threads").get()) + .addOption(Option.builder("m").longOpt("maxExtractLength").hasArg() + .desc("maximum extract length").get()); } public static void main(String[] args) throws Exception { DefaultParser defaultCLIParser = new DefaultParser(); CommandLine commandLine = defaultCLIParser.parse(OPTIONS, args); - EvalConfig evalConfig = commandLine.hasOption('c') ? EvalConfig.load(Paths.get(commandLine.getOptionValue('c'))) : new EvalConfig(); - Path extractsDir = commandLine.hasOption('e') ? Paths.get(commandLine.getOptionValue('e')) : Paths.get(USAGE_FAIL("Must specify extracts dir: -i")); - Path inputDir = commandLine.hasOption('i') ? Paths.get(commandLine.getOptionValue('i')) : extractsDir; - String dbPath = commandLine.hasOption('d') ? commandLine.getOptionValue('d') : USAGE_FAIL("Must specify the db name: -d"); + EvalConfig evalConfig = commandLine.hasOption('c') + ? EvalConfig.load(Paths.get(commandLine.getOptionValue('c'))) + : new EvalConfig(); + Path extractsDir = commandLine.hasOption('e') ? Paths.get(commandLine.getOptionValue('e')) + : Paths.get(USAGE_FAIL("Must specify extracts dir: -i")); + Path inputDir = commandLine.hasOption('i') ? Paths.get(commandLine.getOptionValue('i')) + : extractsDir; + String dbPath = commandLine.hasOption('d') ? 
commandLine.getOptionValue('d') + : USAGE_FAIL("Must specify the db name: -d"); String jdbcString = getJdbcConnectionString(dbPath); if (commandLine.hasOption('n')) { evalConfig.setNumWorkers(Integer.parseInt(commandLine.getOptionValue('n'))); @@ -103,15 +111,16 @@ private static String getJdbcConnectionString(String dbPath) { if (dbPath.startsWith("jdbc:")) { return dbPath; } - //default to h2 + // default to h2 Path p = Paths.get(dbPath); return "jdbc:h2:file:" + p.toAbsolutePath(); } - private static void execute(Path inputDir, Path extractsDir, String dbPath, EvalConfig evalConfig) throws SQLException, IOException { + private static void execute(Path inputDir, Path extractsDir, String dbPath, + EvalConfig evalConfig) throws SQLException, IOException { - //parameterize this? if necesssary + // parameterize this? if necesssary try { ProfilerBase.loadCommonTokens(null, null); } catch (IOException e) { @@ -129,28 +138,35 @@ private static void execute(Path inputDir, Path extractsDir, String dbPath, Eval ArrayBlockingQueue queue = new ArrayBlockingQueue<>(1000); - CallablePipesIterator pipesIterator = new CallablePipesIterator(createIterator(inputDir), queue); - ExecutorService executorService = Executors.newFixedThreadPool(evalConfig.getNumWorkers() + 2); - ExecutorCompletionService executorCompletionService = new ExecutorCompletionService<>(executorService); - - StatusReporter statusReporter = new StatusReporter(pipesIterator, processed, activeWorkers, crawlerActive); + CallablePipesIterator pipesIterator = + new CallablePipesIterator(createIterator(inputDir), queue); + ExecutorService executorService = + Executors.newFixedThreadPool(evalConfig.getNumWorkers() + 2); + ExecutorCompletionService executorCompletionService = + new ExecutorCompletionService<>(executorService); + + StatusReporter statusReporter = + new StatusReporter(pipesIterator, processed, activeWorkers, crawlerActive); executorCompletionService.submit(statusReporter); 
executorCompletionService.submit(pipesIterator); for (int i = 0; i < evalConfig.getNumWorkers(); i++) { - ExtractReader extractReader = new ExtractReader(ExtractReader.ALTER_METADATA_LIST.AS_IS, evalConfig.getMinExtractLength(), evalConfig.getMaxExtractLength()); - ExtractProfiler extractProfiler = new ExtractProfiler(inputDir, extractsDir, extractReader, builder.getDBWriter(builder.tableInfos, jdbcUtil, mimeBuffer)); + ExtractReader extractReader = new ExtractReader(ExtractReader.ALTER_METADATA_LIST.AS_IS, + evalConfig.getMinExtractLength(), evalConfig.getMaxExtractLength()); + ExtractProfiler extractProfiler = new ExtractProfiler(inputDir, extractsDir, + extractReader, + builder.getDBWriter(builder.tableInfos, jdbcUtil, mimeBuffer)); executorCompletionService.submit(new ProfileWorker(queue, extractProfiler, processed)); } int finished = 0; try { while (finished < evalConfig.getNumWorkers() + 2) { - //blocking + // blocking Future future = executorCompletionService.take(); Long result = future.get(); if (result != null) { - //if the dir walker has finished + // if the dir walker has finished if (result == DIR_WALKER_COMPLETED_VALUE) { queue.put(PipesIterator.COMPLETED_SEMAPHORE); crawlerActive.set(false); @@ -178,20 +194,25 @@ private static PipesIterator createIterator(Path inputDir) { return fs; } - private static MimeBuffer initTables(JDBCUtil jdbcUtil, ExtractProfilerBuilder builder, String connectionString, EvalConfig evalConfig) throws SQLException, IOException { + private static MimeBuffer initTables(JDBCUtil jdbcUtil, ExtractProfilerBuilder builder, + String connectionString, EvalConfig evalConfig) + throws SQLException, IOException { - //step 1. create the tables - jdbcUtil.createTables(builder.getNonRefTableInfos(), JDBCUtil.CREATE_TABLE.THROW_EX_IF_EXISTS); + // step 1. 
create the tables + jdbcUtil.createTables(builder.getNonRefTableInfos(), + JDBCUtil.CREATE_TABLE.THROW_EX_IF_EXISTS); jdbcUtil.createTables(builder.getRefTableInfos(), JDBCUtil.CREATE_TABLE.THROW_EX_IF_EXISTS); - //step 2. create mime buffer - return new MimeBuffer(jdbcUtil.getConnection(), builder.getMimeTable(), MimeTypes.getDefaultMimeTypes()); + // step 2. create mime buffer + return new MimeBuffer(jdbcUtil.getConnection(), builder.getMimeTable(), + MimeTypes.getDefaultMimeTypes()); } private static void USAGE() throws IOException { HelpFormatter helpFormatter = HelpFormatter.builder().get(); - helpFormatter.printHelp("java -jar tika-eval-app-x.y.z.jar FileProfiler -e docs -d mydb [-i inputDir, -c config.json]", - "Tool: Profile", OPTIONS, null, true); + helpFormatter.printHelp( + "java -jar tika-eval-app-x.y.z.jar FileProfiler -e docs -d mydb [-i inputDir, -c config.json]", + "Tool: Profile", OPTIONS, null, true); } private static String USAGE_FAIL(String msg) throws IOException { @@ -205,7 +226,8 @@ private static class ProfileWorker implements Callable { private final ExtractProfiler extractProfiler; private final AtomicInteger processed; - ProfileWorker(ArrayBlockingQueue queue, ExtractProfiler extractProfiler, AtomicInteger processed) { + ProfileWorker(ArrayBlockingQueue queue, ExtractProfiler extractProfiler, + AtomicInteger processed) { this.queue = queue; this.extractProfiler = extractProfiler; this.processed = processed; @@ -222,7 +244,7 @@ public Long call() throws Exception { if (t == PipesIterator.COMPLETED_SEMAPHORE) { LOG.debug("worker hit semaphore and is stopping"); extractProfiler.closeWriter(); - //hangs + // hangs queue.put(PipesIterator.COMPLETED_SEMAPHORE); return PROFILE_WORKER_COMPLETED_VALUE; } @@ -268,15 +290,15 @@ protected TableInfo getMimeTable() { return ProfilerBase.MIME_TABLE; } - public void populateRefTables(JDBCUtil dbUtil, MimeBuffer mimeBuffer) throws IOException, SQLException { + public void populateRefTables(JDBCUtil 
dbUtil, MimeBuffer mimeBuffer) + throws IOException, SQLException { boolean refTablesPopulated = true; try { Connection connection = dbUtil.getConnection(); for (TableInfo tableInfo : getRefTableInfos()) { int rows = 0; - try (ResultSet rs = connection - .createStatement() - .executeQuery("select * from " + tableInfo.getName())) { + try (ResultSet rs = connection.createStatement() + .executeQuery("select * from " + tableInfo.getName())) { while (rs.next()) { rows++; } @@ -288,7 +310,7 @@ public void populateRefTables(JDBCUtil dbUtil, MimeBuffer mimeBuffer) throws IOE } } catch (SQLException e) { - //swallow + // swallow } if (refTablesPopulated) { LOG.info("ref tables are already populated"); @@ -320,7 +342,8 @@ public void populateRefTables(JDBCUtil dbUtil, MimeBuffer mimeBuffer) throws IOE writer.close(); } - protected IDBWriter getDBWriter(List tableInfos, JDBCUtil dbUtil, MimeBuffer mimeBuffer) throws IOException, SQLException { + protected IDBWriter getDBWriter(List tableInfos, JDBCUtil dbUtil, + MimeBuffer mimeBuffer) throws IOException, SQLException { Connection conn = dbUtil.getConnection(); return new DBWriter(conn, tableInfos, dbUtil, mimeBuffer); } diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfiler.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfiler.java index 40c4b9cdc5..726545a5bb 100644 --- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfiler.java +++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfiler.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.eval.app; @@ -24,7 +22,6 @@ import java.util.Map; import org.apache.commons.cli.Options; - import org.apache.tika.eval.app.db.ColInfo; import org.apache.tika.eval.app.db.Cols; import org.apache.tika.eval.app.db.TableInfo; @@ -39,38 +36,80 @@ public class ExtractProfiler extends ProfilerBase { private final static String FIELD = "f"; - public static TableInfo EXTRACT_EXCEPTION_TABLE = - new TableInfo("extract_exceptions", new ColInfo(Cols.CONTAINER_ID, Types.INTEGER), new ColInfo(Cols.FILE_PATH, Types.VARCHAR, FILE_PATH_MAX_LEN), - new ColInfo(Cols.EXTRACT_EXCEPTION_ID, Types.INTEGER), new ColInfo(Cols.PARSE_ERROR_ID, Types.INTEGER)); - public static TableInfo EXCEPTION_TABLE = - new TableInfo("parse_exceptions", new ColInfo(Cols.ID, Types.INTEGER, "PRIMARY KEY"), new ColInfo(Cols.ORIG_STACK_TRACE, Types.VARCHAR, 8192), - new ColInfo(Cols.SORT_STACK_TRACE, Types.VARCHAR, 8192), new ColInfo(Cols.PARSE_EXCEPTION_ID, Types.INTEGER)); - public static TableInfo CONTAINER_TABLE = - new TableInfo("containers", new ColInfo(Cols.CONTAINER_ID, Types.INTEGER, "PRIMARY KEY"), new ColInfo(Cols.FILE_PATH, Types.VARCHAR, FILE_PATH_MAX_LEN), - new ColInfo(Cols.LENGTH, Types.BIGINT), new ColInfo(Cols.EXTRACT_FILE_LENGTH, Types.BIGINT)); - public static TableInfo PROFILE_TABLE = new TableInfo("profiles", new ColInfo(Cols.ID, Types.INTEGER, "PRIMARY KEY"), new ColInfo(Cols.CONTAINER_ID, Types.INTEGER), - new ColInfo(Cols.FILE_NAME, Types.VARCHAR, 256), new ColInfo(Cols.MD5, Types.CHAR, 32), new ColInfo(Cols.LENGTH, Types.BIGINT), - new ColInfo(Cols.IS_EMBEDDED, Types.BOOLEAN), new ColInfo(Cols.EMBEDDED_DEPTH, Types.INTEGER), new ColInfo(Cols.EMBEDDED_FILE_PATH, Types.VARCHAR, 1024), - new ColInfo(Cols.ATTACHMENT_TYPE, Types.VARCHAR, 32), new ColInfo(Cols.FILE_EXTENSION, Types.VARCHAR, 12), new ColInfo(Cols.MIME_ID, Types.INTEGER), - new ColInfo(Cols.ELAPSED_TIME_MILLIS, Types.INTEGER), new ColInfo(Cols.NUM_ATTACHMENTS, Types.INTEGER), new 
ColInfo(Cols.NUM_METADATA_VALUES, Types.INTEGER), - new ColInfo(Cols.NUM_PAGES, Types.INTEGER), new ColInfo(Cols.NUM_OCR_PAGES, Types.INTEGER), new ColInfo(Cols.HAS_CONTENT, Types.BOOLEAN)); - public static TableInfo EMBEDDED_FILE_PATH_TABLE = - new TableInfo("emb_file_names", new ColInfo(Cols.ID, Types.INTEGER, "PRIMARY KEY"), new ColInfo(Cols.EMBEDDED_FILE_PATH, Types.VARCHAR, 1024)); - public static TableInfo CONTENTS_TABLE = new TableInfo("contents", new ColInfo(Cols.ID, Types.INTEGER, "PRIMARY KEY"), new ColInfo(Cols.CONTENT_LENGTH, Types.INTEGER), - new ColInfo(Cols.NUM_UNIQUE_TOKENS, Types.INTEGER), new ColInfo(Cols.NUM_TOKENS, Types.INTEGER), new ColInfo(Cols.COMMON_TOKENS_LANG, Types.VARCHAR, 12), - new ColInfo(Cols.NUM_UNIQUE_COMMON_TOKENS, Types.INTEGER), new ColInfo(Cols.NUM_COMMON_TOKENS, Types.INTEGER), - new ColInfo(Cols.NUM_UNIQUE_ALPHABETIC_TOKENS, Types.INTEGER), new ColInfo(Cols.NUM_ALPHABETIC_TOKENS, Types.INTEGER), new ColInfo(Cols.OOV, Types.DOUBLE), - new ColInfo(Cols.TOP_N_TOKENS, Types.VARCHAR, 1024), new ColInfo(Cols.LANG_ID_1, Types.VARCHAR, 12), new ColInfo(Cols.LANG_ID_PROB_1, Types.FLOAT), - new ColInfo(Cols.LANG_ID_2, Types.VARCHAR, 12), new ColInfo(Cols.LANG_ID_PROB_2, Types.FLOAT), new ColInfo(Cols.UNICODE_CHAR_BLOCKS, Types.VARCHAR, 1024), - new ColInfo(Cols.TOKEN_ENTROPY_RATE, Types.FLOAT), new ColInfo(Cols.TOKEN_LENGTH_SUM, Types.INTEGER), new ColInfo(Cols.TOKEN_LENGTH_MEAN, Types.FLOAT), - new ColInfo(Cols.TOKEN_LENGTH_STD_DEV, Types.FLOAT), new ColInfo(Cols.CONTENT_TRUNCATED_AT_MAX_LEN, Types.BOOLEAN)); + public static TableInfo EXTRACT_EXCEPTION_TABLE = new TableInfo("extract_exceptions", + new ColInfo(Cols.CONTAINER_ID, Types.INTEGER), + new ColInfo(Cols.FILE_PATH, Types.VARCHAR, FILE_PATH_MAX_LEN), + new ColInfo(Cols.EXTRACT_EXCEPTION_ID, Types.INTEGER), + new ColInfo(Cols.PARSE_ERROR_ID, Types.INTEGER)); + public static TableInfo EXCEPTION_TABLE = new TableInfo("parse_exceptions", + new ColInfo(Cols.ID, Types.INTEGER, 
"PRIMARY KEY"), + new ColInfo(Cols.ORIG_STACK_TRACE, Types.VARCHAR, 8192), + new ColInfo(Cols.SORT_STACK_TRACE, Types.VARCHAR, 8192), + new ColInfo(Cols.PARSE_EXCEPTION_ID, Types.INTEGER)); + public static TableInfo CONTAINER_TABLE = new TableInfo("containers", + new ColInfo(Cols.CONTAINER_ID, Types.INTEGER, "PRIMARY KEY"), + new ColInfo(Cols.FILE_PATH, Types.VARCHAR, FILE_PATH_MAX_LEN), + new ColInfo(Cols.LENGTH, Types.BIGINT), + new ColInfo(Cols.EXTRACT_FILE_LENGTH, Types.BIGINT)); + public static TableInfo PROFILE_TABLE = new TableInfo("profiles", + new ColInfo(Cols.ID, Types.INTEGER, "PRIMARY KEY"), + new ColInfo(Cols.CONTAINER_ID, Types.INTEGER), + new ColInfo(Cols.FILE_NAME, Types.VARCHAR, 256), + new ColInfo(Cols.MD5, Types.CHAR, 32), new ColInfo(Cols.LENGTH, Types.BIGINT), + new ColInfo(Cols.IS_EMBEDDED, Types.BOOLEAN), + new ColInfo(Cols.EMBEDDED_DEPTH, Types.INTEGER), + new ColInfo(Cols.EMBEDDED_FILE_PATH, Types.VARCHAR, 1024), + new ColInfo(Cols.ATTACHMENT_TYPE, Types.VARCHAR, 32), + new ColInfo(Cols.FILE_EXTENSION, Types.VARCHAR, 12), + new ColInfo(Cols.MIME_ID, Types.INTEGER), + new ColInfo(Cols.ELAPSED_TIME_MILLIS, Types.INTEGER), + new ColInfo(Cols.NUM_ATTACHMENTS, Types.INTEGER), + new ColInfo(Cols.NUM_METADATA_VALUES, Types.INTEGER), + new ColInfo(Cols.NUM_PAGES, Types.INTEGER), + new ColInfo(Cols.NUM_OCR_PAGES, Types.INTEGER), + new ColInfo(Cols.HAS_CONTENT, Types.BOOLEAN)); + public static TableInfo EMBEDDED_FILE_PATH_TABLE = new TableInfo("emb_file_names", + new ColInfo(Cols.ID, Types.INTEGER, "PRIMARY KEY"), + new ColInfo(Cols.EMBEDDED_FILE_PATH, Types.VARCHAR, 1024)); + public static TableInfo CONTENTS_TABLE = + new TableInfo("contents", new ColInfo(Cols.ID, Types.INTEGER, "PRIMARY KEY"), + new ColInfo(Cols.CONTENT_LENGTH, Types.INTEGER), + new ColInfo(Cols.NUM_UNIQUE_TOKENS, Types.INTEGER), + new ColInfo(Cols.NUM_TOKENS, Types.INTEGER), + new ColInfo(Cols.COMMON_TOKENS_LANG, Types.VARCHAR, 12), + new ColInfo(Cols.NUM_UNIQUE_COMMON_TOKENS, 
Types.INTEGER), + new ColInfo(Cols.NUM_COMMON_TOKENS, Types.INTEGER), + new ColInfo(Cols.NUM_UNIQUE_ALPHABETIC_TOKENS, Types.INTEGER), + new ColInfo(Cols.NUM_ALPHABETIC_TOKENS, Types.INTEGER), + new ColInfo(Cols.OOV, Types.DOUBLE), + new ColInfo(Cols.TOP_N_TOKENS, Types.VARCHAR, 1024), + new ColInfo(Cols.LANG_ID_1, Types.VARCHAR, 12), + new ColInfo(Cols.LANG_ID_PROB_1, Types.FLOAT), + new ColInfo(Cols.LANG_ID_2, Types.VARCHAR, 12), + new ColInfo(Cols.LANG_ID_PROB_2, Types.FLOAT), + new ColInfo(Cols.UNICODE_CHAR_BLOCKS, Types.VARCHAR, 1024), + new ColInfo(Cols.TOKEN_ENTROPY_RATE, Types.FLOAT), + new ColInfo(Cols.TOKEN_LENGTH_SUM, Types.INTEGER), + new ColInfo(Cols.TOKEN_LENGTH_MEAN, Types.FLOAT), + new ColInfo(Cols.TOKEN_LENGTH_STD_DEV, Types.FLOAT), + new ColInfo(Cols.CONTENT_TRUNCATED_AT_MAX_LEN, Types.BOOLEAN)); public static TableInfo TAGS_TABLE = - new TableInfo("tags", new ColInfo(Cols.ID, Types.INTEGER, "PRIMARY KEY"), new ColInfo(Cols.TAGS_A, Types.INTEGER), new ColInfo(Cols.TAGS_B, Types.INTEGER), - new ColInfo(Cols.TAGS_DIV, Types.INTEGER), new ColInfo(Cols.TAGS_I, Types.INTEGER), new ColInfo(Cols.TAGS_IMG, Types.INTEGER), - new ColInfo(Cols.TAGS_LI, Types.INTEGER), new ColInfo(Cols.TAGS_OL, Types.INTEGER), new ColInfo(Cols.TAGS_P, Types.INTEGER), - new ColInfo(Cols.TAGS_TABLE, Types.INTEGER), new ColInfo(Cols.TAGS_TD, Types.INTEGER), new ColInfo(Cols.TAGS_TITLE, Types.INTEGER), - new ColInfo(Cols.TAGS_TR, Types.INTEGER), new ColInfo(Cols.TAGS_U, Types.INTEGER), new ColInfo(Cols.TAGS_UL, Types.INTEGER), - new ColInfo(Cols.TAGS_PARSE_EXCEPTION, Types.BOOLEAN)); + new TableInfo("tags", new ColInfo(Cols.ID, Types.INTEGER, "PRIMARY KEY"), + new ColInfo(Cols.TAGS_A, Types.INTEGER), + new ColInfo(Cols.TAGS_B, Types.INTEGER), + new ColInfo(Cols.TAGS_DIV, Types.INTEGER), + new ColInfo(Cols.TAGS_I, Types.INTEGER), + new ColInfo(Cols.TAGS_IMG, Types.INTEGER), + new ColInfo(Cols.TAGS_LI, Types.INTEGER), + new ColInfo(Cols.TAGS_OL, Types.INTEGER), + new 
ColInfo(Cols.TAGS_P, Types.INTEGER), + new ColInfo(Cols.TAGS_TABLE, Types.INTEGER), + new ColInfo(Cols.TAGS_TD, Types.INTEGER), + new ColInfo(Cols.TAGS_TITLE, Types.INTEGER), + new ColInfo(Cols.TAGS_TR, Types.INTEGER), + new ColInfo(Cols.TAGS_U, Types.INTEGER), + new ColInfo(Cols.TAGS_UL, Types.INTEGER), + new ColInfo(Cols.TAGS_PARSE_EXCEPTION, Types.BOOLEAN)); static Options OPTIONS; private final Path inputDir; @@ -91,7 +130,7 @@ public boolean processFileResource(FetchKey fetchKey) { EvalFilePaths fps = null; if (inputDir != null && inputDir.equals(extracts)) { - //crawling an extract dir + // crawling an extract dir fps = getPathsFromExtractCrawl(fetchKey, extracts); } else { fps = getPathsFromSrcCrawl(fetchKey, inputDir, extracts); @@ -110,14 +149,14 @@ public boolean processFileResource(FetchKey fetchKey) { Map contOutput = new HashMap<>(); long srcFileLen = getSourceFileLength(fps, metadataList); - contOutput.put(Cols.LENGTH, srcFileLen > NON_EXISTENT_FILE_LENGTH ? Long.toString(srcFileLen) : ""); + contOutput.put(Cols.LENGTH, + srcFileLen > NON_EXISTENT_FILE_LENGTH ? Long.toString(srcFileLen) : ""); contOutput.put(Cols.CONTAINER_ID, containerIdString); - contOutput.put(Cols.FILE_PATH, fps - .getRelativeSourceFilePath() - .toString()); + contOutput.put(Cols.FILE_PATH, fps.getRelativeSourceFilePath().toString()); if (fps.getExtractFileLength() > 0) { - contOutput.put(Cols.EXTRACT_FILE_LENGTH, (fps.getExtractFile() == null) ? "" : Long.toString(fps.getExtractFileLength())); + contOutput.put(Cols.EXTRACT_FILE_LENGTH, (fps.getExtractFile() == null) ? 
"" + : Long.toString(fps.getExtractFileLength())); } try { writer.writeRow(CONTAINER_TABLE, contOutput); @@ -128,9 +167,8 @@ public boolean processFileResource(FetchKey fetchKey) { if (extractExceptionType != null) { try { - writeExtractException(EXTRACT_EXCEPTION_TABLE, containerIdString, fps - .getRelativeSourceFilePath() - .toString(), extractExceptionType); + writeExtractException(EXTRACT_EXCEPTION_TABLE, containerIdString, + fps.getRelativeSourceFilePath().toString(), extractExceptionType); } catch (IOException e) { throw new RuntimeException(e); } @@ -141,10 +179,11 @@ public boolean processFileResource(FetchKey fetchKey) { int i = 0; for (Metadata m : metadataList) { ContentTags contentTags = getContent(fps, m); - //the first file should have the same id as the container id + // the first file should have the same id as the container id String fileId = (i == 0) ? containerIdString : Integer.toString(ID.incrementAndGet()); writeTagData(fileId, contentTags, TAGS_TABLE); - writeProfileData(fps, i, contentTags, m, fileId, containerIdString, numAttachments, PROFILE_TABLE); + writeProfileData(fps, i, contentTags, m, fileId, containerIdString, numAttachments, + PROFILE_TABLE); writeEmbeddedPathData(i, fileId, m, EMBEDDED_FILE_PATH_TABLE); writeExceptionData(fileId, m, EXCEPTION_TABLE); try { @@ -159,7 +198,8 @@ public boolean processFileResource(FetchKey fetchKey) { } - private void writeEmbeddedPathData(int i, String fileId, Metadata m, TableInfo embeddedFilePathTable) { + private void writeEmbeddedPathData(int i, String fileId, Metadata m, + TableInfo embeddedFilePathTable) { if (i == 0) { return; } diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ProfilerBase.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ProfilerBase.java index 18e30a5e44..ba5fe66be5 100644 --- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ProfilerBase.java +++ 
b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ProfilerBase.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.eval.app; @@ -37,10 +35,6 @@ import org.apache.commons.lang3.mutable.MutableInt; import org.apache.commons.lang3.tuple.Pair; import org.apache.commons.math3.stat.descriptive.SummaryStatistics; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.xml.sax.SAXException; - import org.apache.tika.eval.app.db.ColInfo; import org.apache.tika.eval.app.db.Cols; import org.apache.tika.eval.app.db.TableInfo; @@ -74,6 +68,9 @@ import org.apache.tika.pipes.core.fetcher.FetchKey; import org.apache.tika.sax.ToXMLContentHandler; import org.apache.tika.utils.StringUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.xml.sax.SAXException; public abstract class ProfilerBase { @@ -81,27 +78,36 @@ public abstract class ProfilerBase { public static final String FALSE = Boolean.toString(false); protected static final AtomicInteger ID = new AtomicInteger(); static final long NON_EXISTENT_FILE_LENGTH = -1l; - final static int FILE_PATH_MAX_LEN = 1024;//max len for varchar for file_path - //Container exception key from the 1.x branch - private static final Property CONTAINER_EXCEPTION_1X = Property.externalText("X-TIKA" + ":EXCEPTION:runtime"); + final static int FILE_PATH_MAX_LEN = 1024;// max len for varchar for file_path + // Container exception key from the 1.x branch + private static final Property CONTAINER_EXCEPTION_1X = + Property.externalText("X-TIKA" + ":EXCEPTION:runtime"); private static final Logger LOG = LoggerFactory.getLogger(ProfilerBase.class); private static final String[] EXTRACT_EXTENSIONS = {".json", ".txt", ""}; private static final String[] COMPRESSION_EXTENSIONS = {"", ".bz2", ".gzip", ".zip",}; private static final String ZERO = "0"; private static final String UNKNOWN_EXTENSION = "unk"; - //make this configurable + // make this configurable private static final String DIGEST_KEY = "X-TIKA:digest:MD5"; private static final Map UC_TAGS_OF_INTEREST = initTags(); - private final static Pattern 
ACCESS_PERMISSION_EXCEPTION = Pattern.compile("org\\.apache\\.tika\\.exception\\.AccessPermissionException"); - private final static Pattern ENCRYPTION_EXCEPTION = Pattern.compile("org\\.apache\\.tika.exception\\.EncryptedDocumentException"); - public static TableInfo REF_EXTRACT_EXCEPTION_TYPES = new TableInfo("ref_extract_exception_types", new ColInfo(Cols.EXTRACT_EXCEPTION_ID, Types.INTEGER), - new ColInfo(Cols.EXTRACT_EXCEPTION_DESCRIPTION, Types.VARCHAR, 128)); - public static TableInfo REF_PARSE_ERROR_TYPES = - new TableInfo("ref_parse_error_types", new ColInfo(Cols.PARSE_ERROR_ID, Types.INTEGER), new ColInfo(Cols.PARSE_ERROR_DESCRIPTION, Types.VARCHAR, 128)); - public static TableInfo REF_PARSE_EXCEPTION_TYPES = - new TableInfo("ref_parse_exception_types", new ColInfo(Cols.PARSE_EXCEPTION_ID, Types.INTEGER), new ColInfo(Cols.PARSE_EXCEPTION_DESCRIPTION, Types.VARCHAR, 128)); - public static TableInfo MIME_TABLE = new TableInfo("mimes", new ColInfo(Cols.MIME_ID, Types.INTEGER, "PRIMARY KEY"), new ColInfo(Cols.MIME_STRING, Types.VARCHAR, 256), - new ColInfo(Cols.FILE_EXTENSION, Types.VARCHAR, 12)); + private final static Pattern ACCESS_PERMISSION_EXCEPTION = + Pattern.compile("org\\.apache\\.tika\\.exception\\.AccessPermissionException"); + private final static Pattern ENCRYPTION_EXCEPTION = + Pattern.compile("org\\.apache\\.tika.exception\\.EncryptedDocumentException"); + public static TableInfo REF_EXTRACT_EXCEPTION_TYPES = new TableInfo( + "ref_extract_exception_types", + new ColInfo(Cols.EXTRACT_EXCEPTION_ID, Types.INTEGER), + new ColInfo(Cols.EXTRACT_EXCEPTION_DESCRIPTION, Types.VARCHAR, 128)); + public static TableInfo REF_PARSE_ERROR_TYPES = new TableInfo("ref_parse_error_types", + new ColInfo(Cols.PARSE_ERROR_ID, Types.INTEGER), + new ColInfo(Cols.PARSE_ERROR_DESCRIPTION, Types.VARCHAR, 128)); + public static TableInfo REF_PARSE_EXCEPTION_TYPES = new TableInfo("ref_parse_exception_types", + new ColInfo(Cols.PARSE_EXCEPTION_ID, Types.INTEGER), + new 
ColInfo(Cols.PARSE_EXCEPTION_DESCRIPTION, Types.VARCHAR, 128)); + public static TableInfo MIME_TABLE = + new TableInfo("mimes", new ColInfo(Cols.MIME_ID, Types.INTEGER, "PRIMARY KEY"), + new ColInfo(Cols.MIME_STRING, Types.VARCHAR, 256), + new ColInfo(Cols.FILE_EXTENSION, Types.VARCHAR, 12)); private static CommonTokenCountManager COMMON_TOKEN_COUNT_MANAGER; private static Pattern FILE_NAME_CLEANER = Pattern.compile("\\.(json|txt)(\\.(bz2|gz|zip))?$"); private static LanguageIDWrapper LANG_ID = new LanguageIDWrapper(); @@ -121,7 +127,7 @@ public ProfilerBase(IDBWriter writer) { } private static Map initTags() { - //simplify this mess + // simplify this mess Map tmp = new HashMap<>(); tmp.put("A", Cols.TAGS_A); tmp.put("B", Cols.TAGS_B); @@ -141,9 +147,9 @@ private static Map initTags() { } /** - * @param p path to the common_tokens directory. If this is null, try to load from classPath - * @param defaultLangCode this is the language code to use if a common_words list doesn't exist for the - * detected langauge; can be null + * @param p path to the common_tokens directory. If this is null, try to load from classPath + * @param defaultLangCode this is the language code to use if a common_words list doesn't exist + * for the detected langauge; can be null * @throws IOException */ public static void loadCommonTokens(Path p, String defaultLangCode) throws IOException { @@ -154,10 +160,10 @@ private static String getFileName(String path) { if (path == null) { return ""; } - //filenameUtils checks for a null byte in the path. - //it will throw an IllegalArgumentException if there is a null byte. - //given that we're recording names and not using them on a file path - //we should ignore this. + // filenameUtils checks for a null byte in the path. + // it will throw an IllegalArgumentException if there is a null byte. + // given that we're recording names and not using them on a file path + // we should ignore this. 
try { return FilenameUtils.getName(path); } catch (IllegalArgumentException e) { @@ -169,19 +175,21 @@ private static String getFileName(String path) { } catch (IllegalArgumentException e) { LOG.warn("Again: {} in {}", e.getMessage(), path); } - //give up + // give up return ""; } /** - * Get the content and record in the data {@link Cols#CONTENT_TRUNCATED_AT_MAX_LEN} whether the string was truncated + * Get the content and record in the data {@link Cols#CONTENT_TRUNCATED_AT_MAX_LEN} whether the + * string was truncated * * @param contentTags * @param maxLength * @param data * @return */ - protected static String truncateContent(ContentTags contentTags, int maxLength, Map data) { + protected static String truncateContent(ContentTags contentTags, int maxLength, + Map data) { data.put(Cols.CONTENT_TRUNCATED_AT_MAX_LEN, "FALSE"); if (contentTags == null) { return ""; @@ -211,16 +219,14 @@ static List countAttachments(List list) { if (list == null || list.size() == 0) { return ret; } - //container document attachment count = list.size()-1 + // container document attachment count = list.size()-1 ret.add(list.size() - 1); Map counts = new HashMap<>(); for (int i = 1; i < list.size(); i++) { - String path = list - .get(i) - .get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH); + String path = list.get(i).get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH); if (path == null) { - //shouldn't ever happen + // shouldn't ever happen continue; } String[] parts = path.split("/"); @@ -240,9 +246,7 @@ static List countAttachments(List list) { } for (int i = 1; i < list.size(); i++) { - Integer count = counts.get(list - .get(i) - .get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH)); + Integer count = counts.get(list.get(i).get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH)); if (count == null) { count = 0; } @@ -253,7 +257,8 @@ static List countAttachments(List list) { } - private static void join(String delimiter, StringBuilder sb, String[] parts, int start, int end) { + private static void 
join(String delimiter, StringBuilder sb, String[] parts, int start, + int end) { for (int i = start; i <= end; i++) { sb.append(delimiter); sb.append(parts[i]); @@ -267,44 +272,33 @@ private static ContentTags parseContentAndTags(EvalFilePaths evalFilePaths, Meta } String handlerClass = metadata.get(TikaCoreProperties.TIKA_CONTENT_HANDLER); - if (evalFilePaths - .getExtractFile() - .getFileName() - .toString() - .toLowerCase(Locale.ENGLISH) - .endsWith(".html")) { + if (evalFilePaths.getExtractFile().getFileName().toString().toLowerCase(Locale.ENGLISH) + .endsWith(".html")) { try { return ContentTagParser.parseHTML(s, UC_TAGS_OF_INTEREST.keySet()); } catch (IOException | SAXException e) { - LOG.warn("Problem parsing html in {}; backing off to treat string as text", evalFilePaths - .getExtractFile() - .toAbsolutePath() - .toString(), e); + LOG.warn("Problem parsing html in {}; backing off to treat string as text", + evalFilePaths.getExtractFile().toAbsolutePath().toString(), e); return new ContentTags(s, true); } - } else if (evalFilePaths - .getExtractFile() - .getFileName() - .toString() - .toLowerCase(Locale.ENGLISH) - .endsWith(".xhtml") || (handlerClass != null && handlerClass.equals(ToXMLContentHandler.class.getSimpleName()))) { + } else if (evalFilePaths.getExtractFile().getFileName().toString() + .toLowerCase(Locale.ENGLISH).endsWith(".xhtml") + || (handlerClass != null && handlerClass + .equals(ToXMLContentHandler.class.getSimpleName()))) { try { return ContentTagParser.parseXML(s, UC_TAGS_OF_INTEREST.keySet()); } catch (TikaException | IOException | SAXException e) { - LOG.warn("Problem parsing xhtml in {}; backing off to html parser", evalFilePaths - .getExtractFile() - .toAbsolutePath() - .toString(), e); + LOG.warn("Problem parsing xhtml in {}; backing off to html parser", + evalFilePaths.getExtractFile().toAbsolutePath().toString(), e); try { - ContentTags contentTags = ContentTagParser.parseHTML(s, UC_TAGS_OF_INTEREST.keySet()); + ContentTags 
contentTags = + ContentTagParser.parseHTML(s, UC_TAGS_OF_INTEREST.keySet()); contentTags.setParseException(true); return contentTags; } catch (IOException | SAXException e2) { - LOG.warn("Problem parsing html in {}; backing off to treat string as text", evalFilePaths - .getExtractFile() - .toAbsolutePath() - .toString(), e2); + LOG.warn("Problem parsing html in {}; backing off to treat string as text", + evalFilePaths.getExtractFile().toAbsolutePath().toString(), e2); } return new ContentTags(s, true); } @@ -312,7 +306,8 @@ private static ContentTags parseContentAndTags(EvalFilePaths evalFilePaths, Meta return new ContentTags(s); } - private CompositeTextStatsCalculator initAnalyzersAndTokenCounter(int maxTokens, LanguageIDWrapper langIder) { + private CompositeTextStatsCalculator initAnalyzersAndTokenCounter(int maxTokens, + LanguageIDWrapper langIder) { analyzerManager = AnalyzerManager.newInstance(maxTokens); List calculators = new ArrayList<>(); calculators.add(new CommonTokens(COMMON_TOKEN_COUNT_MANAGER)); @@ -323,7 +318,8 @@ private CompositeTextStatsCalculator initAnalyzersAndTokenCounter(int maxTokens, calculators.add(new ContentLengthCalculator()); calculators.add(new UnicodeBlockCounter(maxContentLengthForLangId)); - return new CompositeTextStatsCalculator(calculators, analyzerManager.getGeneralAnalyzer(), langIder); + return new CompositeTextStatsCalculator(calculators, analyzerManager.getGeneralAnalyzer(), + langIder); } /** @@ -355,7 +351,8 @@ public void setMaxTokens(int maxTokens) { initAnalyzersAndTokenCounter(maxTokens, new LanguageIDWrapper()); } - protected void writeExtractException(TableInfo extractExceptionTable, String containerId, String filePath, ExtractReaderException.TYPE type) throws IOException { + protected void writeExtractException(TableInfo extractExceptionTable, String containerId, + String filePath, ExtractReaderException.TYPE type) throws IOException { Map data = new HashMap<>(); data.put(Cols.CONTAINER_ID, containerId); 
data.put(Cols.FILE_PATH, filePath); @@ -364,8 +361,9 @@ protected void writeExtractException(TableInfo extractExceptionTable, String con } - protected void writeProfileData(EvalFilePaths fps, int i, ContentTags contentTags, Metadata m, String fileId, String containerId, List numAttachments, - TableInfo profileTable) { + protected void writeProfileData(EvalFilePaths fps, int i, ContentTags contentTags, Metadata m, + String fileId, String containerId, List numAttachments, + TableInfo profileTable) { Map data = new HashMap<>(); data.put(Cols.ID, fileId); @@ -387,13 +385,10 @@ protected void writeProfileData(EvalFilePaths fps, int i, ContentTags contentTag data.put(Cols.NUM_OCR_PAGES, Integer.toString(nOCRPages)); } - //if the outer wrapper document + // if the outer wrapper document if (i == 0) { data.put(Cols.IS_EMBEDDED, FALSE); - data.put(Cols.FILE_NAME, fps - .getRelativeSourceFilePath() - .getFileName() - .toString()); + data.put(Cols.FILE_NAME, fps.getRelativeSourceFilePath().getFileName().toString()); data.put(Cols.EMBEDDED_DEPTH, "0"); } else { data.put(Cols.IS_EMBEDDED, TRUE); @@ -440,9 +435,7 @@ protected void writeProfileData(EvalFilePaths fps, int i, ContentTags contentTag protected void writeExceptionData(String fileId, Metadata m, TableInfo exceptionTable) { Map data = new HashMap<>(); getExceptionStrings(m, data); - if (data - .keySet() - .size() > 0) { + if (data.keySet().size() > 0) { try { data.put(Cols.ID, fileId); writer.writeRow(exceptionTable, data); @@ -453,9 +446,9 @@ protected void writeExceptionData(String fileId, Metadata m, TableInfo exception } protected Map calcTextStats(ContentTags contentTags) { -/* if (contentTags == ContentTags.EMPTY_CONTENT_TAGS) { - return Collections.EMPTY_MAP; - }*/ + /* + * if (contentTags == ContentTags.EMPTY_CONTENT_TAGS) { return Collections.EMPTY_MAP; } + */ Map data = new HashMap<>(); String content = truncateContent(contentTags, maxContentLength, data); if (content == null || content.isBlank()) { @@ -465,15 
+458,15 @@ protected Map calcTextStats(ContentTags contentTags) { } /** - * Checks to see if metadata is null or content is empty (null or only whitespace). - * If any of these, then this does no processing, and the fileId is not - * entered into the content table. + * Checks to see if metadata is null or content is empty (null or only whitespace). If any of + * these, then this does no processing, and the fileId is not entered into the content table. * * @param fileId * @param textStats * @param contentsTable */ - protected void writeContentData(String fileId, Map textStats, TableInfo contentsTable) throws IOException { + protected void writeContentData(String fileId, Map textStats, + TableInfo contentsTable) throws IOException { Map data = new HashMap<>(); data.put(Cols.ID, fileId); if (textStats.containsKey(ContentLengthCalculator.class)) { @@ -489,11 +482,15 @@ protected void writeContentData(String fileId, Map textStats, Tab CommonTokenResult commonTokenResult = (CommonTokenResult) textStats.get(CommonTokens.class); if (commonTokenResult != null) { data.put(Cols.COMMON_TOKENS_LANG, commonTokenResult.getLangCode()); - data.put(Cols.NUM_UNIQUE_COMMON_TOKENS, Integer.toString(commonTokenResult.getUniqueCommonTokens())); + data.put(Cols.NUM_UNIQUE_COMMON_TOKENS, + Integer.toString(commonTokenResult.getUniqueCommonTokens())); data.put(Cols.NUM_COMMON_TOKENS, Integer.toString(commonTokenResult.getCommonTokens())); - data.put(Cols.NUM_UNIQUE_ALPHABETIC_TOKENS, Integer.toString(commonTokenResult.getUniqueAlphabeticTokens())); - data.put(Cols.NUM_ALPHABETIC_TOKENS, Integer.toString(commonTokenResult.getAlphabeticTokens())); - double oov = commonTokenResult.getAlphabeticTokens() > 0 ? 
commonTokenResult.getOOV() : -1.0; + data.put(Cols.NUM_UNIQUE_ALPHABETIC_TOKENS, + Integer.toString(commonTokenResult.getUniqueAlphabeticTokens())); + data.put(Cols.NUM_ALPHABETIC_TOKENS, + Integer.toString(commonTokenResult.getAlphabeticTokens())); + double oov = commonTokenResult.getAlphabeticTokens() > 0 ? commonTokenResult.getOOV() + : -1.0; data.put(Cols.OOV, Double.toString(oov)); } TokenCounts tokenCounts = (TokenCounts) textStats.get(BasicTokenCountStatsCalculator.class); @@ -503,7 +500,8 @@ protected void writeContentData(String fileId, Map textStats, Tab data.put(Cols.NUM_TOKENS, Integer.toString(tokenCounts.getTotalTokens())); } if (textStats.get(TokenEntropy.class) != null) { - data.put(Cols.TOKEN_ENTROPY_RATE, Double.toString((Double) textStats.get(TokenEntropy.class))); + data.put(Cols.TOKEN_ENTROPY_RATE, + Double.toString((Double) textStats.get(TokenEntropy.class))); } @@ -585,29 +583,31 @@ void getExceptionStrings(Metadata metadata, Map data) { } if (fullTrace != null) { - //check for "expected" exceptions...exceptions - //that can't be fixed. - //Do not store trace for "expected" exceptions + // check for "expected" exceptions...exceptions + // that can't be fixed. 
+ // Do not store trace for "expected" exceptions Matcher matcher = ACCESS_PERMISSION_EXCEPTION.matcher(fullTrace); if (matcher.find()) { - data.put(Cols.PARSE_EXCEPTION_ID, Integer.toString(EXCEPTION_TYPE.ACCESS_PERMISSION.ordinal())); + data.put(Cols.PARSE_EXCEPTION_ID, + Integer.toString(EXCEPTION_TYPE.ACCESS_PERMISSION.ordinal())); return; } matcher = ENCRYPTION_EXCEPTION.matcher(fullTrace); if (matcher.find()) { - data.put(Cols.PARSE_EXCEPTION_ID, Integer.toString(EXCEPTION_TYPE.ENCRYPTION.ordinal())); + data.put(Cols.PARSE_EXCEPTION_ID, + Integer.toString(EXCEPTION_TYPE.ENCRYPTION.ordinal())); return; } data.put(Cols.PARSE_EXCEPTION_ID, Integer.toString(EXCEPTION_TYPE.RUNTIME.ordinal())); data.put(Cols.ORIG_STACK_TRACE, fullTrace); - //TikaExceptions can have object ids, as in the "@2b1ea6ee" in: - //org.apache.tika.exception.TikaException: TIKA-198: Illegal - //IOException from org.apache.tika.parser.microsoft.OfficeParser@2b1ea6ee - //For reporting purposes, let's snip off the object id so that we can more - //easily count exceptions. + // TikaExceptions can have object ids, as in the "@2b1ea6ee" in: + // org.apache.tika.exception.TikaException: TIKA-198: Illegal + // IOException from org.apache.tika.parser.microsoft.OfficeParser@2b1ea6ee + // For reporting purposes, let's snip off the object id so that we can more + // easily count exceptions. 
String sortTrace = EvalExceptionUtils.normalize(fullTrace); data.put(Cols.SORT_STACK_TRACE, sortTrace); } @@ -615,52 +615,35 @@ void getExceptionStrings(Metadata metadata, Map data) { void unicodeBlocks(Map tokenStats, Map data) { - Map blocks = (Map) tokenStats.get(UnicodeBlockCounter.class); + Map blocks = + (Map) tokenStats.get(UnicodeBlockCounter.class); List> pairs = new ArrayList<>(); for (Map.Entry e : blocks.entrySet()) { - pairs.add(Pair.of(e.getKey(), e - .getValue() - .intValue())); + pairs.add(Pair.of(e.getKey(), e.getValue().intValue())); } - pairs.sort((o1, o2) -> o2 - .getValue() - .compareTo(o1.getValue())); + pairs.sort((o1, o2) -> o2.getValue().compareTo(o1.getValue())); StringBuilder sb = new StringBuilder(); for (int i = 0; i < 20 && i < pairs.size(); i++) { if (i > 0) { sb.append(" | "); } - sb - .append(pairs - .get(i) - .getKey()) - .append(": ") - .append(pairs - .get(i) - .getValue()); + sb.append(pairs.get(i).getKey()).append(": ").append(pairs.get(i).getValue()); } data.put(Cols.UNICODE_CHAR_BLOCKS, sb.toString()); } void langid(Map stats, Map data) { - List probabilities = (List) stats.get(LanguageIDWrapper.class); + List probabilities = + (List) stats.get(LanguageIDWrapper.class); if (probabilities.size() > 0) { - data.put(Cols.LANG_ID_1, probabilities - .get(0) - .getLanguage()); - data.put(Cols.LANG_ID_PROB_1, Double.toString(probabilities - .get(0) - .getRawScore())); + data.put(Cols.LANG_ID_1, probabilities.get(0).getLanguage()); + data.put(Cols.LANG_ID_PROB_1, Double.toString(probabilities.get(0).getRawScore())); } if (probabilities.size() > 1) { - data.put(Cols.LANG_ID_2, probabilities - .get(1) - .getLanguage()); - data.put(Cols.LANG_ID_PROB_2, Double.toString(probabilities - .get(1) - .getRawScore())); + data.put(Cols.LANG_ID_2, probabilities.get(1).getLanguage()); + data.put(Cols.LANG_ID_PROB_2, Double.toString(probabilities.get(1).getRawScore())); } } @@ -684,10 +667,7 @@ void writeTokenCounts(Map textStats, Map data) { if 
(i++ > 0) { sb.append(" | "); } - sb - .append(t.getToken()) - .append(": ") - .append(t.getValue()); + sb.append(t.getToken()).append(": ").append(t.getValue()); } data.put(Cols.TOP_N_TOKENS, sb.toString()); @@ -706,25 +686,25 @@ protected EvalFilePaths getPathsFromExtractCrawl(FetchKey fetchKey, Path extract String relExtractFilePath = fetchKey.getFetchKey(); Matcher m = FILE_NAME_CLEANER.matcher(relExtractFilePath); Path relativeSourceFilePath = Paths.get(m.replaceAll("")); - //just try slapping the relextractfilepath on the extractdir + // just try slapping the relextractfilepath on the extractdir Path extractFile = extracts.resolve(relExtractFilePath); if (!Files.isRegularFile(extractFile)) { - //if that doesn't work, try to find the right extract file. - //This is necessary if crawling extractsA and trying to find a file in - //extractsB that is not in the same format: json vs txt or compressed + // if that doesn't work, try to find the right extract file. + // This is necessary if crawling extractsA and trying to find a file in + // extractsB that is not in the same format: json vs txt or compressed extractFile = findFile(extracts, relativeSourceFilePath); } return new EvalFilePaths(relativeSourceFilePath, extractFile); } - //call this if the crawler is crawling through the src directory + // call this if the crawler is crawling through the src directory protected EvalFilePaths getPathsFromSrcCrawl(FetchKey fetchKey, Path srcDir, Path extracts) { Path relativeSourceFilePath = Paths.get(fetchKey.getFetchKey()); Path extractFile = findFile(extracts, relativeSourceFilePath); Path inputFile = srcDir.resolve(relativeSourceFilePath); long srcLen = -1l; - //try to get the length of the source file in case there was an error - //in both extracts + // try to get the length of the source file in case there was an error + // in both extracts try { srcLen = Files.size(inputFile); } catch (IOException e) { @@ -780,7 +760,7 @@ long getSourceFileLength(Metadata m) { try { 
return Long.parseLong(lenString); } catch (NumberFormatException e) { - //swallow + // swallow } return NON_EXISTENT_FILE_LENGTH; } @@ -790,7 +770,7 @@ protected long getFileLength(Path p) { try { return Files.size(p); } catch (IOException e) { - //swallow + // swallow } } return NON_EXISTENT_FILE_LENGTH; @@ -803,8 +783,7 @@ public enum EXCEPTION_TYPE { } /** - * If information was gathered from the log file about - * a parse error + * If information was gathered from the log file about a parse error */ public enum PARSE_ERROR_TYPE { OOM, TIMEOUT @@ -812,4 +791,3 @@ public enum PARSE_ERROR_TYPE { } - diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/StatusReporter.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/StatusReporter.java index 3f81f17758..01a449c66a 100644 --- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/StatusReporter.java +++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/StatusReporter.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.eval.app; @@ -22,11 +20,10 @@ import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicInteger; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - import org.apache.tika.pipes.core.pipesiterator.CallablePipesIterator; import org.apache.tika.utils.DurationFormatUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; public class StatusReporter implements Callable { @@ -40,7 +37,8 @@ public class StatusReporter implements Callable { private final NumberFormat numberFormat = NumberFormat.getNumberInstance(Locale.ROOT); - public StatusReporter(CallablePipesIterator pipesIterator, AtomicInteger filesProcessed, AtomicInteger activeWorkers, AtomicBoolean crawlerIsActive) { + public StatusReporter(CallablePipesIterator pipesIterator, AtomicInteger filesProcessed, + AtomicInteger activeWorkers, AtomicBoolean crawlerIsActive) { this.pipesIterator = pipesIterator; this.filesProcessed = filesProcessed; this.activeWorkers = activeWorkers; @@ -56,7 +54,7 @@ public Long call() throws Exception { Thread.sleep(1000); } catch (InterruptedException e) { LOGGER.info("Interrupted?"); - //expected + // expected return COMPLETED_VAL; 
} report(); @@ -74,8 +72,11 @@ private void report() { int avg = (elapsedSecs > 5 || cnt > 100) ? (int) ((double) cnt / elapsedSecs) : -1; String elapsedString = DurationFormatUtils.formatMillis(System.currentTimeMillis() - start); - String docsPerSec = avg > -1 ? String.format(Locale.ROOT, " (%s docs per sec)", numberFormat.format(avg)) : ""; - String msg = String.format(Locale.ROOT, "Processed %s documents in %s%s.", numberFormat.format(cnt), elapsedString, docsPerSec); + String docsPerSec = avg > -1 + ? String.format(Locale.ROOT, " (%s docs per sec)", numberFormat.format(avg)) + : ""; + String msg = String.format(Locale.ROOT, "Processed %s documents in %s%s.", + numberFormat.format(cnt), elapsedString, docsPerSec); LOGGER.info(msg); int stillAlive = activeWorkers.get(); @@ -95,7 +96,7 @@ private void report() { } LOGGER.info(msg); - if (! crawlerIsActive.get()) { + if (!crawlerIsActive.get()) { msg = "The directory crawler has completed its crawl.\n"; LOGGER.info(msg); } diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/TikaEvalCLI.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/TikaEvalCLI.java index ce32d78f4b..8603040c70 100644 --- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/TikaEvalCLI.java +++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/TikaEvalCLI.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. 
See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.eval.app; @@ -20,9 +18,8 @@ import java.util.ArrayList; import java.util.List; -import org.h2.tools.Console; - import org.apache.tika.eval.app.reports.ResultsReporter; +import org.h2.tools.Console; public class TikaEvalCLI { static final String[] tools = {"Profile", "Compare", "Report", "StartDB"}; @@ -31,9 +28,7 @@ private static String specifyTools() { StringBuilder sb = new StringBuilder(); sb.append("Must specify one of the following tools in the first parameter:\n"); for (String s : tools) { - sb - .append(s) - .append("\n"); + sb.append(s).append("\n"); } return sb.toString(); diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/AbstractDBBuffer.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/AbstractDBBuffer.java index 952339c4a0..cea7ee8e60 100644 --- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/AbstractDBBuffer.java +++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/AbstractDBBuffer.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.eval.app.db; @@ -68,8 +66,8 @@ public int getNumWrites() { return numWrites; } - //Odd to throw RuntimeException, I know. It should be - //catastrophic if this buffer can't write to the db. + // Odd to throw RuntimeException, I know. It should be + // catastrophic if this buffer can't write to the db. public abstract void write(int id, String value) throws RuntimeException; public abstract void close() throws SQLException; diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/ColInfo.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/ColInfo.java index a326a831c9..991293f0b9 100644 --- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/ColInfo.java +++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/ColInfo.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.eval.app.db; @@ -62,7 +60,7 @@ public String getConstraints() { } /** - * Gets the precision. This can be null! + * Gets the precision. This can be null! 
* * @return precision or null */ diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/Cols.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/Cols.java index 5ab3d86658..b865f82dc0 100644 --- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/Cols.java +++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/Cols.java @@ -1,22 +1,21 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. 
See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.eval.app.db; public enum Cols { + // @formatter:off //container table CONTAINER_ID, FILE_PATH, EXTRACT_FILE_LENGTH, @@ -57,6 +56,5 @@ public enum Cols { //structure tags TAGS_A, TAGS_B, TAGS_DIV, TAGS_I, TAGS_IMG, TAGS_LI, TAGS_P, TAGS_OL, TAGS_TABLE, TAGS_TD, TAGS_TITLE, TAGS_TR, TAGS_UL, TAGS_U, TAGS_PARSE_EXCEPTION, //if there was a SAX|IO|TikaException while parsing the html or xhtml - + // @formatter:on } - diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/DBBuffer.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/DBBuffer.java index 36f65a44d0..00bbf0ddf9 100644 --- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/DBBuffer.java +++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/DBBuffer.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.eval.app.db; @@ -25,8 +23,10 @@ public class DBBuffer extends AbstractDBBuffer { private final PreparedStatement st; - public DBBuffer(Connection connection, String tableName, String idColumnName, String valueColumnName) throws SQLException { - st = connection.prepareStatement("insert into " + tableName + "( " + idColumnName + ", " + valueColumnName + ") values (?,?);"); + public DBBuffer(Connection connection, String tableName, String idColumnName, + String valueColumnName) throws SQLException { + st = connection.prepareStatement("insert into " + tableName + "( " + idColumnName + ", " + + valueColumnName + ") values (?,?);"); } @Override diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/H2Util.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/H2Util.java index 2547d802d4..5ac503440e 100644 --- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/H2Util.java +++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/H2Util.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.eval.app.db; @@ -49,9 +47,7 @@ public static boolean databaseExists(Path db) { } private static String getConnectionString(Path db, boolean createDBIfItDoesntExist) { - String s = "jdbc:h2:" + FilenameUtils.separatorsToUnix(db - .toAbsolutePath() - .toString()); + String s = "jdbc:h2:" + FilenameUtils.separatorsToUnix(db.toAbsolutePath().toString()); if (!createDBIfItDoesntExist) { s += ";IFEXISTS=TRUE"; } diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/JDBCUtil.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/JDBCUtil.java index d776e1ef1c..62b4bdc38c 100644 --- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/JDBCUtil.java +++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/JDBCUtil.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.eval.app.db; @@ -51,30 +49,29 @@ public JDBCUtil(String connectionString, String driverClass) { this.driverClass = driverClass; if (driverClass == null || driverClass.isEmpty()) { if (System.getProperty("jdbc.drivers") != null) { - //user has specified it on the command line - //stop now + // user has specified it on the command line + // stop now } else { - //try to use the mappings in db.properties to determine the class + // try to use the mappings in db.properties to determine the class try (InputStream is = JDBCUtil.class.getResourceAsStream("/db.properties")) { Properties properties = new Properties(); properties.load(is); for (String k : properties.stringPropertyNames()) { - Matcher m = Pattern - .compile("(?i)jdbc:" + k) - .matcher(connectionString); + Matcher m = Pattern.compile("(?i)jdbc:" + k).matcher(connectionString); if (m.find()) { this.driverClass = properties.getProperty(k); } } } catch (IOException e) { - //swallow + // swallow } } } } - public static void batchInsert(PreparedStatement insertStatement, TableInfo table, Map data) throws SQLException { + public static void batchInsert(PreparedStatement insertStatement, TableInfo table, + Map data) throws SQLException { try { int i = 1; @@ -84,7 +81,8 @@ public static void batchInsert(PreparedStatement insertStatement, TableInfo tabl } for (Cols c : data.keySet()) { if (!table.containsColumn(c)) { - throw new IllegalArgumentException("Can't add data to " + c + " because it doesn't exist in the table: " + table.getName()); + throw new 
IllegalArgumentException("Can't add data to " + c + + " because it doesn't exist in the table: " + table.getName()); } } insertStatement.addBatch(); @@ -93,7 +91,8 @@ public static void batchInsert(PreparedStatement insertStatement, TableInfo tabl } } - public static void updateInsertStatement(int dbColOffset, PreparedStatement st, ColInfo colInfo, String value) throws SQLException { + public static void updateInsertStatement(int dbColOffset, PreparedStatement st, ColInfo colInfo, + String value) throws SQLException { if (value == null) { st.setNull(dbColOffset, colInfo.getType()); return; @@ -105,12 +104,12 @@ public static void updateInsertStatement(int dbColOffset, PreparedStatement st, value = value.substring(0, colInfo.getPrecision()); LOG.warn("truncated varchar value in {} : {}", colInfo.getName(), value); } - //postgres doesn't allow \0000 + // postgres doesn't allow \0000 value = value.replaceAll("\u0000", " "); st.setString(dbColOffset, value); break; case Types.CHAR: - //postgres doesn't allow \0000 + // postgres doesn't allow \0000 value = value.replaceAll("\u0000", " "); st.setString(dbColOffset, value); break; @@ -130,7 +129,8 @@ public static void updateInsertStatement(int dbColOffset, PreparedStatement st, st.setBoolean(dbColOffset, Boolean.parseBoolean(value)); break; default: - throw new UnsupportedOperationException("Don't yet support type: " + colInfo.getType()); + throw new UnsupportedOperationException( + "Don't yet support type: " + colInfo.getType()); } } catch (NumberFormatException e) { if (!"".equals(value)) { @@ -144,8 +144,7 @@ public static void updateInsertStatement(int dbColOffset, PreparedStatement st, } /** - * Override this any optimizations you want to do on the db - * before writing/reading. + * Override this any optimizations you want to do on the db before writing/reading. * * @return * @throws IOException @@ -170,7 +169,7 @@ public Connection getConnection() throws SQLException { } /** - * JDBC driver class. 
Override as necessary. + * JDBC driver class. Override as necessary. * * @return */ @@ -199,15 +198,14 @@ public Set getTables(Connection connection) throws SQLException { try (ResultSet rs = dbMeta.getTables(null, null, "%", null)) { while (rs.next()) { - tables.add(rs - .getString(3) - .toLowerCase(Locale.US)); + tables.add(rs.getString(3).toLowerCase(Locale.US)); } } return tables; } - public void createTables(List tableInfos, CREATE_TABLE createTable) throws SQLException, IOException { + public void createTables(List tableInfos, CREATE_TABLE createTable) + throws SQLException, IOException { Connection conn = getConnection(); for (TableInfo tableInfo : tableInfos) { @@ -246,12 +244,10 @@ String normalizeTableName(String tableName) { return tableName; } - //does not close the connection + // does not close the connection private void createTable(Connection conn, TableInfo tableInfo) throws SQLException { StringBuilder createSql = new StringBuilder(); - createSql - .append("CREATE TABLE ") - .append(tableInfo.getName()); + createSql.append("CREATE TABLE ").append(tableInfo.getName()); createSql.append("("); int last = 0; diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/MimeBuffer.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/MimeBuffer.java index e03a63a3ed..ddd9bc88be 100644 --- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/MimeBuffer.java +++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/MimeBuffer.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.eval.app.db; @@ -34,9 +32,11 @@ public class MimeBuffer extends AbstractDBBuffer { private final Connection connection; - public MimeBuffer(Connection connection, TableInfo mimeTable, MimeTypes mimeTypes) throws SQLException { - st = connection.prepareStatement( - "insert into " + mimeTable.getName() + "( " + Cols.MIME_ID.name() + ", " + Cols.MIME_STRING.name() + ", " + Cols.FILE_EXTENSION.name() + ") values (?,?,?)"); + public MimeBuffer(Connection connection, TableInfo mimeTable, MimeTypes mimeTypes) + throws SQLException { + st = connection.prepareStatement("insert into " + mimeTable.getName() + "( " + + Cols.MIME_ID.name() + ", " + Cols.MIME_STRING.name() + ", " + + Cols.FILE_EXTENSION.name() + ") values (?,?,?)"); this.mimeTypes = mimeTypes; this.connection = connection; } @@ -71,7 +71,7 @@ public void close() throws SQLException { } private static class MimeUtil { - //TODO: see if MimeType now works for these + // TODO: see if MimeType now works for these private static final String APPLICATION = "application"; private static final String TEXT = "text"; private static final String HTML = "html"; @@ -83,19 +83,19 @@ private static class MimeUtil { private static final String EMPTY_STRING = ""; /** - * Utility method to convert from a string value representing a content type - * (e.g. "application/pdf") into the most common extension for that file type - * (e.g. "pdf"). + * Utility method to convert from a string value representing a content type (e.g. + * "application/pdf") into the most common extension for that file type (e.g. "pdf"). *

- * This will has special handling for texty filetypes whose MimeTypes - * don't currently return anything for {@link MimeType#getExtension}; + * This will has special handling for texty filetypes whose MimeTypes don't currently return + * anything for {@link MimeType#getExtension}; * * @param contentType string representing a content type, for example: "application/pdf" * @param mimeTypes MimeRepository * @return extension or empty string * @throws MimeTypeException thrown if MimeTypes can't parse the contentType */ - public static String getExtension(String contentType, MimeTypes mimeTypes) throws MimeTypeException { + public static String getExtension(String contentType, MimeTypes mimeTypes) + throws MimeTypeException { MimeType mime = mimeTypes.forName(contentType); return getExtension(mime); } @@ -107,7 +107,7 @@ public static String getExtension(MimeType mime) { ext = ext.substring(1); } - //special handling for text/html/xml + // special handling for text/html/xml if (ext.isEmpty()) { ext = tryTextyTypes(mime.getType()); } diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/TableInfo.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/TableInfo.java index bb20b57c0a..3ba5fb3637 100644 --- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/TableInfo.java +++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/TableInfo.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. 
See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.eval.app.db; @@ -70,4 +68,3 @@ public boolean containsColumn(Cols cols) { return colNames.contains(cols); } } - diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/io/DBWriter.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/io/DBWriter.java index a44ddfd833..e807cdd921 100644 --- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/io/DBWriter.java +++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/io/DBWriter.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.eval.app.io; @@ -25,24 +23,22 @@ import java.util.Map; import java.util.concurrent.atomic.AtomicInteger; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - import org.apache.tika.eval.app.db.ColInfo; import org.apache.tika.eval.app.db.Cols; import org.apache.tika.eval.app.db.JDBCUtil; import org.apache.tika.eval.app.db.MimeBuffer; import org.apache.tika.eval.app.db.TableInfo; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** - * This is still in its early stages. The idea is to - * get something working with h2 and then add to that - * as necessary. + * This is still in its early stages. The idea is to get something working with h2 and then add to + * that as necessary. *

* Beware, this deletes the db file with each initialization. *

- * Each thread must construct its own DBWriter because each - * DBWriter creates its own PreparedStatements at initialization. + * Each thread must construct its own DBWriter because each DBWriter creates its own + * PreparedStatements at initialization. */ public class DBWriter implements IDBWriter { @@ -50,18 +46,19 @@ public class DBWriter implements IDBWriter { private static final AtomicInteger WRITER_ID = new AtomicInteger(); private final Long commitEveryXRows = 10000L; - //private final Long commitEveryXMS = 60000L; + // private final Long commitEveryXMS = 60000L; private final Connection conn; private final JDBCUtil dbUtil; private final MimeBuffer mimeBuffer; private final int myId = WRITER_ID.getAndIncrement(); - // + // private final Map inserts = new HashMap<>(); private final Map lastInsertMap = new HashMap<>(); - public DBWriter(Connection connection, List tableInfos, JDBCUtil dbUtil, MimeBuffer mimeBuffer) throws IOException, SQLException { + public DBWriter(Connection connection, List tableInfos, JDBCUtil dbUtil, + MimeBuffer mimeBuffer) throws IOException, SQLException { this.conn = connection; this.mimeBuffer = mimeBuffer; @@ -83,9 +80,7 @@ public int getMimeId(String mimeString) { private PreparedStatement createPreparedInsert(TableInfo tableInfo) throws SQLException { StringBuilder sb = new StringBuilder(); - sb - .append("INSERT INTO ") - .append(tableInfo.getName()); + sb.append("INSERT INTO ").append(tableInfo.getName()); sb.append("("); int i = 0; for (ColInfo c : tableInfo.getColInfos()) { @@ -115,16 +110,18 @@ public void writeRow(TableInfo table, Map data) throws IOException try { PreparedStatement p = inserts.get(table.getName()); if (p == null) { - throw new RuntimeException("Failed to create prepared statement for: " + table.getName()); + throw new RuntimeException( + "Failed to create prepared statement for: " + table.getName()); } dbUtil.batchInsert(p, table, data); LastInsert lastInsert = lastInsertMap.get(table.getName()); 
lastInsert.rowCount++; long elapsed = System.currentTimeMillis() - lastInsert.lastInsert; if ( - //elapsed > commitEveryXMS || - lastInsert.rowCount % commitEveryXRows == 0) { - LOG.info("writer ({}) on table ({}) is committing after {} rows and {} ms", myId, table.getName(), lastInsert.rowCount, elapsed); + // elapsed > commitEveryXMS || + lastInsert.rowCount % commitEveryXRows == 0) { + LOG.info("writer ({}) on table ({}) is committing after {} rows and {} ms", myId, + table.getName(), lastInsert.rowCount, elapsed); p.executeBatch(); conn.commit(); lastInsert.lastInsert = System.currentTimeMillis(); @@ -135,8 +132,8 @@ public void writeRow(TableInfo table, Map data) throws IOException } /** - * This closes the writer by executing batch and - * committing changes. This DOES NOT close the connection + * This closes the writer by executing batch and committing changes. This DOES NOT close the + * connection * * @throws IOException */ diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/io/ExtractReader.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/io/ExtractReader.java index 3bac1a8495..42156d88d3 100644 --- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/io/ExtractReader.java +++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/io/ExtractReader.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. 
See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.eval.app.io; @@ -34,9 +32,6 @@ import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream; import org.apache.commons.compress.compressors.z.ZCompressorInputStream; import org.apache.commons.io.IOUtils; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.mime.MediaType; @@ -44,6 +39,8 @@ import org.apache.tika.sax.ToTextContentHandler; import org.apache.tika.sax.ToXMLContentHandler; import org.apache.tika.serialization.JsonMetadataList; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; public class ExtractReader { @@ -65,12 +62,14 @@ public ExtractReader(ALTER_METADATA_LIST alterMetadataList) { this(alterMetadataList, IGNORE_LENGTH, IGNORE_LENGTH); } - public ExtractReader(ALTER_METADATA_LIST alterMetadataList, long minExtractLength, long maxExtractLength) { + public ExtractReader(ALTER_METADATA_LIST alterMetadataList, long minExtractLength, + long maxExtractLength) { this.alterMetadataList = alterMetadataList; this.minExtractLength = minExtractLength; this.maxExtractLength = maxExtractLength; if (maxExtractLength > IGNORE_LENGTH && minExtractLength >= maxExtractLength) { - throw new IllegalArgumentException("minExtractLength(" + minExtractLength + ") must be < maxExtractLength(" + maxExtractLength + ")"); + throw new IllegalArgumentException("minExtractLength(" + minExtractLength + + ") must be < maxExtractLength(" + maxExtractLength + ")"); } } @@ -79,9 +78,8 @@ protected static FileSuffixes parseSuffixes(String fName) { if (fName == null) { return fileSuffixes; } - Matcher m = Pattern - .compile("(?i)^(.*?)\\.(json|txt|x?html)(?:\\.(bz2|gz(?:ip)?|zip))?$") - .matcher(fName); + Matcher m = Pattern.compile("(?i)^(.*?)\\.(json|txt|x?html)(?:\\.(bz2|gz(?:ip)?|zip))?$") + .matcher(fName); if (m.find()) { fileSuffixes.originalFileName = m.group(1); fileSuffixes.setFormat(m.group(2)); @@ 
-97,11 +95,10 @@ public List loadExtract(Path extractFile) throws ExtractReaderExceptio throw new ExtractReaderException(ExtractReaderException.TYPE.NO_EXTRACT_FILE); } - FileSuffixes fileSuffixes = parseSuffixes(extractFile - .getFileName() - .toString()); + FileSuffixes fileSuffixes = parseSuffixes(extractFile.getFileName().toString()); if (fileSuffixes.format == null) { - throw new ExtractReaderException(ExtractReaderException.TYPE.INCORRECT_EXTRACT_FILE_SUFFIX); + throw new ExtractReaderException( + ExtractReaderException.TYPE.INCORRECT_EXTRACT_FILE_SUFFIX); } if (!Files.isRegularFile(extractFile)) { throw new ExtractReaderException(ExtractReaderException.TYPE.NO_EXTRACT_FILE); @@ -119,13 +116,13 @@ public List loadExtract(Path extractFile) throws ExtractReaderExceptio } if (minExtractLength > IGNORE_LENGTH && length < minExtractLength) { - LOG.info("minExtractLength {} > IGNORE_LENGTH {} and length {} < minExtractLength {} for file '{}'", - minExtractLength, IGNORE_LENGTH, length, minExtractLength, extractFile); + LOG.info("minExtractLength {} > IGNORE_LENGTH {} and length {} < minExtractLength {} for file '{}'", + minExtractLength, IGNORE_LENGTH, length, minExtractLength, extractFile); throw new ExtractReaderException(ExtractReaderException.TYPE.EXTRACT_FILE_TOO_SHORT); } if (maxExtractLength > IGNORE_LENGTH && length > maxExtractLength) { - LOG.info("maxExtractLength {} > IGNORE_LENGTH {} and length {} > maxExtractLength {} for file '{}'", - maxExtractLength, IGNORE_LENGTH, length, maxExtractLength, extractFile); + LOG.info("maxExtractLength {} > IGNORE_LENGTH {} and length {} > maxExtractLength {} for file '{}'", + maxExtractLength, IGNORE_LENGTH, length, maxExtractLength, extractFile); throw new ExtractReaderException(ExtractReaderException.TYPE.EXTRACT_FILE_TOO_LONG); } @@ -146,7 +143,8 @@ public List loadExtract(Path extractFile) throws ExtractReaderExceptio is = new ZCompressorInputStream(is); break; default: - LOG.warn("Can't yet process compression of 
type: {}", fileSuffixes.compression); + LOG.warn("Can't yet process compression of type: {}", + fileSuffixes.compression); return metadataList; } } @@ -158,11 +156,14 @@ public List loadExtract(Path extractFile) throws ExtractReaderExceptio try { if (fileSuffixes.format == FileSuffixes.FORMAT.JSON) { metadataList = JsonMetadataList.fromJson(reader); - if (alterMetadataList.equals(ALTER_METADATA_LIST.FIRST_ONLY) && metadataList.size() > 1) { + if (alterMetadataList.equals(ALTER_METADATA_LIST.FIRST_ONLY) + && metadataList.size() > 1) { while (metadataList.size() > 1) { metadataList.remove(metadataList.size() - 1); } - } else if (alterMetadataList.equals(ALTER_METADATA_LIST.AS_IS.CONCATENATE_CONTENT_INTO_FIRST) && metadataList.size() > 1) { + } else if (alterMetadataList + .equals(ALTER_METADATA_LIST.AS_IS.CONCATENATE_CONTENT_INTO_FIRST) + && metadataList.size() > 1) { StringBuilder sb = new StringBuilder(); Metadata containerMetadata = metadataList.get(0); for (Metadata m : metadataList) { @@ -189,19 +190,22 @@ public List loadExtract(Path extractFile) throws ExtractReaderExceptio return metadataList; } - private List generateListFromTextFile(Reader reader, FileSuffixes fileSuffixes) throws IOException { + private List generateListFromTextFile(Reader reader, FileSuffixes fileSuffixes) + throws IOException { List metadataList = new ArrayList<>(); String content = IOUtils.toString(reader); Metadata m = new Metadata(); m.set(TikaCoreProperties.TIKA_CONTENT, content); if (fileSuffixes.format == FileSuffixes.FORMAT.HTML) { - m.set(TikaCoreProperties.TIKA_CONTENT_HANDLER, ToXMLContentHandler.class.getSimpleName()); + m.set(TikaCoreProperties.TIKA_CONTENT_HANDLER, + ToXMLContentHandler.class.getSimpleName()); } else if (fileSuffixes.format == FileSuffixes.FORMAT.TXT) { - m.set(TikaCoreProperties.TIKA_CONTENT_HANDLER, ToTextContentHandler.class.getSimpleName()); + m.set(TikaCoreProperties.TIKA_CONTENT_HANDLER, + ToTextContentHandler.class.getSimpleName()); } - //Let's hope 
the file name has a suffix that can - //be used to determine the mime. Could be wrong or missing, - //but better than nothing. + // Let's hope the file name has a suffix that can + // be used to determine the mime. Could be wrong or missing, + // but better than nothing. m.set(TikaCoreProperties.RESOURCE_NAME_KEY, fileSuffixes.originalFileName); MediaType mimeType = mimeTypes.detect(null, m); @@ -214,8 +218,8 @@ private List generateListFromTextFile(Reader reader, FileSuffixes file } public enum ALTER_METADATA_LIST { - AS_IS, //leave the metadata list as is - FIRST_ONLY, //take only the metadata list for the "container" document + AS_IS, // leave the metadata list as is + FIRST_ONLY, // take only the metadata list for the "container" document CONCATENATE_CONTENT_INTO_FIRST // concatenate all of the content into the first } diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/io/ExtractReaderException.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/io/ExtractReaderException.java index 42db189f5a..dc7f8be0fa 100644 --- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/io/ExtractReaderException.java +++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/io/ExtractReaderException.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. 
The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.eval.app.io; @@ -40,9 +38,16 @@ public TYPE getType() { } public enum TYPE { - //what do you see when you look at the extract file - NO_EXTRACT_FILE, ZERO_BYTE_EXTRACT_FILE, IO_EXCEPTION, EXTRACT_PARSE_EXCEPTION, EXTRACT_FILE_TOO_SHORT, EXTRACT_FILE_TOO_LONG, - INCORRECT_EXTRACT_FILE_SUFFIX//extract file must have suffix of .json or .txt, + // what do you see when you look at the extract file + NO_EXTRACT_FILE, ZERO_BYTE_EXTRACT_FILE, IO_EXCEPTION, EXTRACT_PARSE_EXCEPTION, EXTRACT_FILE_TOO_SHORT, EXTRACT_FILE_TOO_LONG, INCORRECT_EXTRACT_FILE_SUFFIX// extract + // file + // must + // have + // suffix + // of + // .json + // or + // .txt, // optionally followed by gzip, zip or bz2 } diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/io/IDBWriter.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/io/IDBWriter.java index a6b8924d17..4552014f9f 100644 --- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/io/IDBWriter.java +++ 
b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/io/IDBWriter.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.eval.app.io; diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/reports/Report.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/reports/Report.java index 2e8c739a5f..44b2a10e9f 100644 --- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/reports/Report.java +++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/reports/Report.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. 
See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.eval.app.reports; @@ -46,7 +44,7 @@ public class Report { private static final Logger LOG = LoggerFactory.getLogger(Report.class); - final String NULL_VALUE = "";//TODO: make this configurable!!! + final String NULL_VALUE = "";// TODO: make this configurable!!! Map cellFormatters = new HashMap<>(); String sql; String reportFilename; @@ -101,7 +99,7 @@ private void dumpReportToWorkbook(Statement st, SXSSFWorkbook wb) throws SQLExce ResultSetMetaData meta = rs.getMetaData(); Row xssfRow = sheet.createRow(rowCount++); - //write headers and cache them to check against styles + // write headers and cache them to check against styles for (int i = 1; i <= meta.getColumnCount(); i++) { Cell cell = xssfRow.createCell(i - 1); cell.setCellValue(meta.getColumnLabel(i)); @@ -138,7 +136,7 @@ private void dumpReportToWorkbook(Statement st, SXSSFWorkbook wb) throws SQLExce Cell cell = sqlRow.createCell(0); cell.setCellStyle(sqlCellStyle); - cell.setCellValue(sql.trim());//.replaceAll("[\r\n]+", "\r\n")); + cell.setCellValue(sql.trim());// .replaceAll("[\r\n]+", "\r\n")); } private XSLXCellFormatter getDefaultFormatter(int columnType) { @@ -154,10 +152,11 @@ private XSLXCellFormatter getDefaultFormatter(int columnType) { } } - private void writeCell(ResultSetMetaData meta, int colIndex, ResultSet rs, Cell cell) throws SQLException { + private void writeCell(ResultSetMetaData meta, int colIndex, ResultSet rs, Cell cell) + throws SQLException { switch (meta.getColumnType(colIndex)) { - //fall through on numerics + // fall through on numerics case Types.BIGINT: case Types.SMALLINT: case Types.INTEGER: @@ -173,7 +172,7 @@ private void writeCell(ResultSetMetaData meta, int colIndex, ResultSet rs, Cell cell.setCellValue(dbl); } break; - //fall through strings + // fall through strings case Types.BOOLEAN: case Types.CHAR: case Types.VARCHAR: @@ -191,7 +190,8 @@ 
private void writeCell(ResultSetMetaData meta, int colIndex, ResultSet rs, Cell } else { cell.setCellValue(rs.getString(colIndex)); } - LOG.warn("Couldn't find type for: {}. Defaulting to String", meta.getColumnType(colIndex)); + LOG.warn("Couldn't find type for: {}. Defaulting to String", + meta.getColumnType(colIndex)); } } diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/reports/ResultsReporter.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/reports/ResultsReporter.java index bf83593023..c1e13d834c 100644 --- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/reports/ResultsReporter.java +++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/reports/ResultsReporter.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.eval.app.reports; @@ -32,6 +30,7 @@ import java.util.HashMap; import java.util.List; import java.util.Map; + import javax.xml.parsers.DocumentBuilder; import org.apache.commons.cli.CommandLine; @@ -40,6 +39,11 @@ import org.apache.commons.cli.ParseException; import org.apache.commons.cli.help.HelpFormatter; import org.apache.poi.common.usermodel.HyperlinkType; +import org.apache.tika.eval.app.ExtractComparer; +import org.apache.tika.eval.app.ExtractProfiler; +import org.apache.tika.eval.app.db.H2Util; +import org.apache.tika.eval.app.db.JDBCUtil; +import org.apache.tika.utils.XMLReaderUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.w3c.dom.Document; @@ -47,12 +51,6 @@ import org.w3c.dom.Node; import org.w3c.dom.NodeList; -import org.apache.tika.eval.app.ExtractComparer; -import org.apache.tika.eval.app.ExtractProfiler; -import org.apache.tika.eval.app.db.H2Util; -import org.apache.tika.eval.app.db.JDBCUtil; -import org.apache.tika.utils.XMLReaderUtils; - public class ResultsReporter { private static final Logger LOG = LoggerFactory.getLogger(ResultsReporter.class); @@ -60,15 +58,20 @@ public class ResultsReporter { static { OPTIONS = new Options(); - OPTIONS - .addOption("rd", "reportsDir", true, - "directory for the reports. " + "If not specified, will write to 'reports'" + "BEWARE: Will overwrite existing reports without warning!") - .addOption("rf", "reportsFile", true, - "xml specifying sql to call for the reports." 
+ "If not specified, will use default reports in resources/tika-eval-*-config.xml") - .addOption("db", true, "default database (in memory H2). Specify a file name for the H2 database.") - .addOption("jdbc", true, "EXPERT: full jdbc connection string. Specify this or use -db ") - .addOption("jdbcdriver", true, "EXPERT: specify the jdbc driver class if all else fails") - .addOption("tablePrefix", true, "EXPERT: if not using the default tables, specify your table name prefix"); + OPTIONS.addOption("rd", "reportsDir", true, "directory for the reports. " + + "If not specified, will write to 'reports'" + + "BEWARE: Will overwrite existing reports without warning!") + .addOption("rf", "reportsFile", true, + "xml specifying sql to call for the reports." + + "If not specified, will use default reports in resources/tika-eval-*-config.xml") + .addOption("db", true, + "default database (in memory H2). Specify a file name for the H2 database.") + .addOption("jdbc", true, + "EXPERT: full jdbc connection string. 
Specify this or use -db ") + .addOption("jdbcdriver", true, + "EXPERT: specify the jdbc driver class if all else fails") + .addOption("tablePrefix", true, + "EXPERT: if not using the default tables, specify your table name prefix"); } @@ -78,8 +81,11 @@ public class ResultsReporter { public static void USAGE() throws IOException { HelpFormatter helpFormatter = HelpFormatter.builder().get(); - helpFormatter.printHelp("java -jar tika-eval-x.y.jar Report -db mydb [-rd myreports] [-rf myreports.xml]", "Tool: Report", ResultsReporter.OPTIONS, - "Note: for h2 db, do not include the .mv.db at the end of the db name.", true); + helpFormatter.printHelp( + "java -jar tika-eval-x.y.jar Report -db mydb [-rd myreports] [-rf myreports.xml]", + "Tool: Report", ResultsReporter.OPTIONS, + "Note: for h2 db, do not include the .mv.db at the end of the db name.", + true); } public static ResultsReporter build(Path p) throws Exception { @@ -92,9 +98,7 @@ public static ResultsReporter build(Path p) throws Exception { doc = docBuilder.parse(is); } Node docElement = doc.getDocumentElement(); - assert (docElement - .getNodeName() - .equals("reports")); + assert (docElement.getNodeName().equals("reports")); NodeList children = docElement.getChildNodes(); for (int i = 0; i < children.getLength(); i++) { Node n = children.item(i); @@ -120,15 +124,9 @@ private static Report buildReport(Node n) { Report r = new Report(); NamedNodeMap attrs = n.getAttributes(); - r.includeSql = Boolean.parseBoolean(attrs - .getNamedItem("includeSql") - .getNodeValue()); - r.reportFilename = attrs - .getNamedItem("reportFilename") - .getNodeValue(); - r.reportName = attrs - .getNamedItem("reportName") - .getNodeValue(); + r.includeSql = Boolean.parseBoolean(attrs.getNamedItem("includeSql").getNodeValue()); + r.reportFilename = attrs.getNamedItem("reportFilename").getNodeValue(); + r.reportName = attrs.getNamedItem("reportName").getNodeValue(); for (int i = 0; i < children.getLength(); i++) { Node child = 
children.item(i); @@ -137,7 +135,8 @@ private static Report buildReport(Node n) { } if ("sql".equals(child.getNodeName())) { if (r.sql != null) { - throw new IllegalArgumentException("Can only have one sql statement per report"); + throw new IllegalArgumentException( + "Can only have one sql statement per report"); } r.sql = child.getTextContent(); } else if ("colformats".equals(child.getNodeName())) { @@ -158,17 +157,11 @@ private static Map getCellFormatters(Node n) { continue; } NamedNodeMap attrs = child.getAttributes(); - String columnName = attrs - .getNamedItem("name") - .getNodeValue(); + String columnName = attrs.getNamedItem("name").getNodeValue(); assert (!ret.containsKey(columnName)); - String type = attrs - .getNamedItem("type") - .getNodeValue(); + String type = attrs.getNamedItem("type").getNodeValue(); if ("numberFormatter".equals(type)) { - String format = attrs - .getNamedItem("format") - .getNodeValue(); + String format = attrs.getNamedItem("format").getNodeValue(); XSLXCellFormatter f = new XLSXNumFormatter(format); ret.put(columnName, f); } else if ("urlLink".equals(type)) { @@ -227,7 +220,8 @@ public static void main(String[] args) throws Exception { } Path db = Paths.get(dbString); if (!H2Util.databaseExists(db)) { - throw new RuntimeException("I'm sorry, but I couldn't find this h2 database: " + db); + throw new RuntimeException( + "I'm sorry, but I couldn't find this h2 database: " + db); } dbUtil = new H2Util(db); } else if (commandLine.hasOption("jdbc")) { @@ -237,7 +231,8 @@ public static void main(String[] args) throws Exception { } dbUtil = new JDBCUtil(commandLine.getOptionValue("jdbc"), driverClass); } else { - System.err.println("Must specify either -db for the default in-memory h2 database\n" + "or -jdbc for a full jdbc connection string"); + System.err.println("Must specify either -db for the default in-memory h2 database\n" + + "or -jdbc for a full jdbc connection string"); USAGE(); return; } @@ -273,14 +268,10 @@ private static 
Path getDefaultReportsConfig(Connection c) throws IOException, SQ try (ResultSet rs = md.getTables(null, null, "%", null)) { while (rs.next()) { String tName = rs.getString(3); - if (ExtractComparer.CONTENTS_TABLE_B - .getName() - .equalsIgnoreCase(tName)) { + if (ExtractComparer.CONTENTS_TABLE_B.getName().equalsIgnoreCase(tName)) { internalPath = "/comparison-reports.xml"; break; - } else if (ExtractProfiler.PROFILE_TABLE - .getName() - .equalsIgnoreCase(tName)) { + } else if (ExtractProfiler.PROFILE_TABLE.getName().equalsIgnoreCase(tName)) { internalPath = "/profile-reports.xml"; break; } @@ -288,10 +279,12 @@ private static Path getDefaultReportsConfig(Connection c) throws IOException, SQ } if (internalPath == null) { - throw new RuntimeException("Couldn't determine if this database was a 'profiler' or 'comparison' db"); + throw new RuntimeException( + "Couldn't determine if this database was a 'profiler' or 'comparison' db"); } Path tmp = Files.createTempFile("tmp-tika-reports", ".xml"); - Files.copy(ResultsReporter.class.getResourceAsStream(internalPath), tmp, StandardCopyOption.REPLACE_EXISTING); + Files.copy(ResultsReporter.class.getResourceAsStream(internalPath), tmp, + StandardCopyOption.REPLACE_EXISTING); return tmp; } diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/reports/XLSXHREFFormatter.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/reports/XLSXHREFFormatter.java index ab5dc886a9..7068815115 100644 --- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/reports/XLSXHREFFormatter.java +++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/reports/XLSXHREFFormatter.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.eval.app.reports; @@ -30,8 +28,8 @@ public class XLSXHREFFormatter implements XSLXCellFormatter { - //xlsx files can only have this many hyperlinks - //if they have more Excel can't read the file + // xlsx files can only have this many hyperlinks + // if they have more Excel can't read the file private static final int MAX_HYPERLINKS = 65000; @@ -59,24 +57,20 @@ public void reset(XSSFWorkbook workbook) { } @Override - public void applyStyleAndValue(int dbColNum, ResultSet resultSet, Cell cell) throws SQLException { + public void applyStyleAndValue(int dbColNum, ResultSet resultSet, Cell cell) + throws SQLException { if (links < MAX_HYPERLINKS) { - Hyperlink hyperlink = workbook - .getCreationHelper() - .createHyperlink(linkType); + Hyperlink hyperlink = workbook.getCreationHelper().createHyperlink(linkType); String path = resultSet.getString(dbColNum); String address = urlBase + path; hyperlink.setAddress(address); cell.setHyperlink(hyperlink); cell.setCellStyle(style); - String fName = Paths - .get(path) - .getFileName() - .toString(); + String fName = Paths.get(path).getFileName().toString(); cell.setCellValue(fName); links++; } else { - //silently stop adding hyperlinks + // silently stop adding hyperlinks } } } diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/reports/XLSXNumFormatter.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/reports/XLSXNumFormatter.java index eba1c84724..bab1c08129 100644 --- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/reports/XLSXNumFormatter.java +++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/reports/XLSXNumFormatter.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.eval.app.reports; @@ -37,14 +35,13 @@ class XLSXNumFormatter implements XSLXCellFormatter { @Override public void reset(XSSFWorkbook workbook) { style = workbook.createCellStyle(); - style.setDataFormat(workbook - .getCreationHelper() - .createDataFormat() - .getFormat(formatString)); + style.setDataFormat( + workbook.getCreationHelper().createDataFormat().getFormat(formatString)); } @Override - public void applyStyleAndValue(int dbColNum, ResultSet resultSet, Cell cell) throws SQLException { + public void applyStyleAndValue(int dbColNum, ResultSet resultSet, Cell cell) + throws SQLException { double d = resultSet.getDouble(dbColNum); if (resultSet.wasNull()) { diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/reports/XSLXCellFormatter.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/reports/XSLXCellFormatter.java index 2eb0071401..50d34ebff1 100644 --- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/reports/XSLXCellFormatter.java +++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/reports/XSLXCellFormatter.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.eval.app.reports; @@ -27,5 +25,6 @@ interface XSLXCellFormatter { public void reset(XSSFWorkbook workbook); - public void applyStyleAndValue(int dbColNum, ResultSet resultSet, Cell cell) throws SQLException; + public void applyStyleAndValue(int dbColNum, ResultSet resultSet, Cell cell) + throws SQLException; } diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/BatchTopCommonTokenCounter.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/BatchTopCommonTokenCounter.java index ea21112c42..5ac458fd9f 100644 --- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/BatchTopCommonTokenCounter.java +++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/BatchTopCommonTokenCounter.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.eval.app.tools; @@ -24,9 +22,9 @@ import org.apache.tika.utils.ProcessUtils; /** - * Utility class that runs TopCommonTokenCounter against a directory - * of table files (named {lang}_table.gz or leipzip-like afr_...-sentences.txt) - * and outputs common tokens files for each input table file in the output directory. + * Utility class that runs TopCommonTokenCounter against a directory of table files (named + * {lang}_table.gz or leipzip-like afr_...-sentences.txt) and outputs common tokens files for each + * input table file in the output directory. 
*/ public class BatchTopCommonTokenCounter { @@ -38,21 +36,12 @@ public static void main(String[] args) throws Exception { for (Map.Entry> e : langFiles.entrySet()) { - String[] cmd = new String[e - .getValue() - .size() + 1]; + String[] cmd = new String[e.getValue().size() + 1]; Path commonTokensFile = commonTokensDir.resolve(e.getKey()); - cmd[0] = ProcessUtils.escapeCommandLine(commonTokensFile - .toAbsolutePath() - .toString()); - for (int i = 0; i < e - .getValue() - .size(); i++) { - cmd[i + 1] = ProcessUtils.escapeCommandLine(e - .getValue() - .get(i) - .toAbsolutePath() - .toString()); + cmd[0] = ProcessUtils.escapeCommandLine(commonTokensFile.toAbsolutePath().toString()); + for (int i = 0; i < e.getValue().size(); i++) { + cmd[i + 1] = ProcessUtils + .escapeCommandLine(e.getValue().get(i).toAbsolutePath().toString()); } TopCommonTokenCounter.main(cmd); } diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/CommonTokenOverlapCounter.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/CommonTokenOverlapCounter.java index ed1b09e3e0..16b5c43f52 100644 --- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/CommonTokenOverlapCounter.java +++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/CommonTokenOverlapCounter.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. 
The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.eval.app.tools; @@ -37,9 +35,7 @@ public static void main(String[] args) throws Exception { private void execute(Path commonTokensDir) throws IOException { List langs = new ArrayList<>(); - for (File f : commonTokensDir - .toFile() - .listFiles()) { + for (File f : commonTokensDir.toFile().listFiles()) { langs.add(f.getName()); } CommonTokenCountManager mgr = new CommonTokenCountManager(commonTokensDir, ""); diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/LeipzigHelper.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/LeipzigHelper.java index df42d414bb..e5e34f125e 100644 --- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/LeipzigHelper.java +++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/LeipzigHelper.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.eval.app.tools; @@ -29,26 +27,17 @@ public class LeipzigHelper { static Map> getFiles(Path leipzigDir) throws IOException { - Matcher tableMatcher = Pattern - .compile("([a-z]+)_table(\\.txt)?(\\.gz)?$") - .matcher(""); - Matcher leipzigMatcher = Pattern - .compile("([a-z]{3,3}(-(simp|trad|rom|zaw))?)[-_].*$") - .matcher(""); + Matcher tableMatcher = Pattern.compile("([a-z]+)_table(\\.txt)?(\\.gz)?$").matcher(""); + Matcher leipzigMatcher = + Pattern.compile("([a-z]{3,3}(-(simp|trad|rom|zaw))?)[-_].*$").matcher(""); Map> m = new TreeMap<>(); - for (File f : leipzigDir - .toFile() - .listFiles()) { + for (File f : leipzigDir.toFile().listFiles()) { System.err.println(f); String lang = null; - if (tableMatcher - .reset(f.getName()) - .find()) { + if (tableMatcher.reset(f.getName()).find()) { lang = tableMatcher.group(1); - } else if (leipzigMatcher - .reset(f.getName()) - .find()) { + } else if (leipzigMatcher.reset(f.getName()).find()) { lang = leipzigMatcher.group(1); } if (lang == null) { diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/LeipzigSampler.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/LeipzigSampler.java index 30f44c7e37..da3bb52874 100644 --- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/LeipzigSampler.java +++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/LeipzigSampler.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.eval.app.tools; @@ -40,7 +38,8 @@ public static void main(String[] args) throws Exception { } } - private void execute(Path leipzigDir, int sentsPerLang, BufferedWriter writer) throws IOException { + private void execute(Path leipzigDir, int sentsPerLang, BufferedWriter writer) + throws IOException { Map> fileMap = LeipzigHelper.getFiles(leipzigDir); for (Map.Entry> e : fileMap.entrySet()) { List sentences = new ArrayList<>(); diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/SlowCompositeReaderWrapper.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/SlowCompositeReaderWrapper.java index 27393c5280..19cbb602b5 100644 --- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/SlowCompositeReaderWrapper.java +++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/SlowCompositeReaderWrapper.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.eval.app.tools; @@ -55,20 +53,16 @@ import org.apache.lucene.util.Version; /** - * COPIED VERBATIM FROM LUCENE - * This class forces a composite reader (eg a {@link - * MultiReader} or {@link DirectoryReader}) to emulate a - * {@link LeafReader}. This requires implementing the postings - * APIs on-the-fly, using the static methods in {@link - * MultiTerms}, {@link MultiDocValues}, by stepping through - * the sub-readers to merge fields/terms, appending docs, etc. + * COPIED VERBATIM FROM LUCENE This class forces a composite reader (eg a {@link MultiReader} + * or {@link DirectoryReader}) to emulate a {@link LeafReader}. This requires implementing the + * postings APIs on-the-fly, using the static methods in {@link MultiTerms}, {@link MultiDocValues}, + * by stepping through the sub-readers to merge fields/terms, appending docs, etc. * - *

NOTE: this class almost always results in a - * performance hit. If this is important to your use case, - * you'll get better performance by gathering the sub readers using - * {@link IndexReader#getContext()} to get the - * leaves and then operate per-LeafReader, - * instead of using this class. + *

+ * NOTE: this class almost always results in a performance hit. If this is important to your + * use case, you'll get better performance by gathering the sub readers using + * {@link IndexReader#getContext()} to get the leaves and then operate per-LeafReader, instead of + * using this class. */ public final class SlowCompositeReaderWrapper extends LeafReader { @@ -81,24 +75,19 @@ public final class SlowCompositeReaderWrapper extends LeafReader { private final CompositeReader in; private final LeafMetaData metaData; // Cached copy of FieldInfos to prevent it from being re-created on each - // getFieldInfos call. Most (if not all) other LeafReader implementations + // getFieldInfos call. Most (if not all) other LeafReader implementations // also have a cached FieldInfos instance so this is consistent. SOLR-12878 private final FieldInfos fieldInfos; SlowCompositeReaderWrapper(CompositeReader reader) throws IOException { in = reader; in.registerParentReader(this); - if (reader - .leaves() - .isEmpty()) { + if (reader.leaves().isEmpty()) { metaData = new LeafMetaData(Version.LATEST.major, Version.LATEST, null, false); } else { Version minVersion = Version.LATEST; for (LeafReaderContext leafReaderContext : reader.leaves()) { - Version leafVersion = leafReaderContext - .reader() - .getMetaData() - .getMinVersion(); + Version leafVersion = leafReaderContext.reader().getMetaData().getMinVersion(); if (leafVersion == null) { minVersion = null; break; @@ -106,20 +95,17 @@ public final class SlowCompositeReaderWrapper extends LeafReader { minVersion = leafVersion; } } - metaData = new LeafMetaData(reader - .leaves() - .get(0) - .reader() - .getMetaData() - .getCreatedVersionMajor(), minVersion, null, false); + metaData = new LeafMetaData( + reader.leaves().get(0).reader().getMetaData().getCreatedVersionMajor(), + minVersion, null, false); } fieldInfos = FieldInfos.getMergedFieldInfos(in); } /** - * This method is sugar for getting an {@link LeafReader} from - * an {@link 
IndexReader} of any kind. If the reader is already atomic, - * it is returned unchanged, otherwise wrapped by this class. + * This method is sugar for getting an {@link LeafReader} from an {@link IndexReader} of any + * kind. If the reader is already atomic, it is returned unchanged, otherwise wrapped by this + * class. */ public static LeafReader wrap(IndexReader reader) throws IOException { if (reader instanceof CompositeReader) { @@ -155,14 +141,13 @@ public Terms terms(String field) throws IOException { return cachedTerms.computeIfAbsent(field, f -> { try { return MultiTerms.getTerms(in, f); - } catch (IOException e) { // yuck! ...sigh... checked exceptions with built-in lambdas are a pain + } catch (IOException e) { // yuck! ...sigh... checked exceptions with built-in + // lambdas are a pain throw new RuntimeException("unwrapMe", e); } }); } catch (RuntimeException e) { - if (e - .getMessage() - .equals("unwrapMe") && e.getCause() instanceof IOException) { + if (e.getMessage().equals("unwrapMe") && e.getCause() instanceof IOException) { throw (IOException) e.getCause(); } throw e; @@ -206,20 +191,14 @@ public SortedDocValues getSortedDocValues(String field) throws IOException { return dv; } } - int size = in - .leaves() - .size(); + int size = in.leaves().size(); final SortedDocValues[] values = new SortedDocValues[size]; final int[] starts = new int[size + 1]; long totalCost = 0; for (int i = 0; i < size; i++) { - LeafReaderContext context = in - .leaves() - .get(i); + LeafReaderContext context = in.leaves().get(i); final LeafReader reader = context.reader(); - final FieldInfo fieldInfo = reader - .getFieldInfos() - .fieldInfo(field); + final FieldInfo fieldInfo = reader.getFieldInfos().fieldInfo(field); if (fieldInfo != null && fieldInfo.getDocValuesType() != DocValuesType.SORTED) { return null; } @@ -256,20 +235,14 @@ public SortedSetDocValues getSortedSetDocValues(String field) throws IOException } assert map != null; - int size = in - .leaves() - .size(); 
+ int size = in.leaves().size(); final SortedSetDocValues[] values = new SortedSetDocValues[size]; final int[] starts = new int[size + 1]; long cost = 0; for (int i = 0; i < size; i++) { - LeafReaderContext context = in - .leaves() - .get(i); + LeafReaderContext context = in.leaves().get(i); final LeafReader reader = context.reader(); - final FieldInfo fieldInfo = reader - .getFieldInfos() - .fieldInfo(field); + final FieldInfo fieldInfo = reader.getFieldInfos().fieldInfo(field); if (fieldInfo != null && fieldInfo.getDocValuesType() != DocValuesType.SORTED_SET) { return null; } @@ -293,32 +266,32 @@ public NumericDocValues getNormValues(String field) throws IOException { @Override public FloatVectorValues getFloatVectorValues(String s) throws IOException { - //TODO figure out how to implement this... if needed + // TODO figure out how to implement this... if needed return null; } @Override public ByteVectorValues getByteVectorValues(String s) throws IOException { - //TODO figure out how to implement this... if needed + // TODO figure out how to implement this... if needed return null; } @Override - public void searchNearestVectors(String string, float[] floats, KnnCollector kc, Bits bits) throws IOException { - //TODO figure out how to implement this... if needed + public void searchNearestVectors(String string, float[] floats, KnnCollector kc, Bits bits) + throws IOException { + // TODO figure out how to implement this... if needed } @Override - public void searchNearestVectors(String string, byte[] bytes, KnnCollector kc, Bits bits) throws IOException { - //TODO figure out how to implement this... if needed + public void searchNearestVectors(String string, byte[] bytes, KnnCollector kc, Bits bits) + throws IOException { + // TODO figure out how to implement this... 
if needed } @Override public Fields getTermVectors(int docID) throws IOException { ensureOpen(); - return in - .termVectors() - .get(docID); + return in.termVectors().get(docID); } @Override @@ -341,9 +314,7 @@ public int maxDoc() { @Override public void document(int docID, StoredFieldVisitor visitor) throws IOException { ensureOpen(); - in - .storedFields() - .document(docID, visitor); + in.storedFields().document(docID, visitor); } @Override @@ -360,7 +331,7 @@ public Bits getLiveDocs() { @Override public PointValues getPointValues(String field) { ensureOpen(); - return null; // because not supported. Throw UOE? + return null; // because not supported. Throw UOE? } @Override @@ -378,9 +349,7 @@ protected void doClose() throws IOException { public void checkIntegrity() throws IOException { ensureOpen(); for (LeafReaderContext ctx : in.leaves()) { - ctx - .reader() - .checkIntegrity(); + ctx.reader().checkIntegrity(); } } diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/TopCommonTokenCounter.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/TopCommonTokenCounter.java index 4cdcf4f198..6253ed1fd8 100644 --- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/TopCommonTokenCounter.java +++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/TopCommonTokenCounter.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. 
See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.eval.app.tools; @@ -49,40 +47,48 @@ import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.PriorityQueue; - import org.apache.tika.eval.core.tokens.AnalyzerManager; import org.apache.tika.eval.core.tokens.URLEmailNormalizingFilterFactory; import org.apache.tika.utils.ProcessUtils; /** - * Utility class that reads in a UTF-8 input file with one document per row - * and outputs the 20000 tokens with the highest document frequencies. + * Utility class that reads in a UTF-8 input file with one document per row and outputs the 20000 + * tokens with the highest document frequencies. *

- * The CommmonTokensAnalyzer intentionally drops tokens shorter than 4 characters, - * but includes bigrams for cjk. + * The CommmonTokensAnalyzer intentionally drops tokens shorter than 4 characters, but includes + * bigrams for cjk. *

- * It also has a include list for __email__ and __url__ and a skip list - * for common html markup terms. + * It also has a include list for __email__ and __url__ and a skip list for common html markup + * terms. */ public class TopCommonTokenCounter { private static final String FIELD = "f"; - //these should exist in every list - static Set INCLUDE_LIST = new HashSet<>(Arrays.asList(new String[]{URLEmailNormalizingFilterFactory.URL, URLEmailNormalizingFilterFactory.EMAIL})); - //words to ignore - //these are common 4 letter html markup words that we do - //not want to count in case of failed markup processing. - //see: https://issues.apache.org/jira/browse/TIKA-2267?focusedCommentId=15872055&page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel#comment-15872055 - static Set SKIP_LIST = new HashSet<>( - Arrays.asList("span", "table", "href", "head", "title", "body", "html", "tagname", "lang", "style", "script", "strong", "blockquote", "form", "iframe", "section", - "colspan", "rowspan")); + // these should exist in every list + static Set INCLUDE_LIST = new HashSet<>(Arrays.asList(new String[] { + URLEmailNormalizingFilterFactory.URL, URLEmailNormalizingFilterFactory.EMAIL})); + // words to ignore + // these are common 4 letter html markup words that we do + // not want to count in case of failed markup processing. + // see: + // https://issues.apache.org/jira/browse/TIKA-2267?focusedCommentId=15872055&page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel#comment-15872055 + static Set SKIP_LIST = new HashSet<>(Arrays.asList("span", "table", "href", "head", + "title", "body", "html", "tagname", "lang", "style", "script", "strong", + "blockquote", "form", "iframe", "section", "colspan", "rowspan")); private static String LICENSE = - "# Licensed to the Apache Software Foundation (ASF) under one or more\n" + "# contributor license agreements. 
See the NOTICE file distributed with\n" + - "# this work for additional information regarding copyright ownership.\n" + "# The ASF licenses this file to You under the Apache License, Version 2.0\n" + - "# (the \"License\"); you may not use this file except in compliance with\n" + "# the License. You may obtain a copy of the License at\n" + "#\n" + - "# http://www.apache.org/licenses/LICENSE-2.0\n" + "#\n" + "# Unless required by applicable law or agreed to in writing, software\n" + - "# distributed under the License is distributed on an \"AS IS\" BASIS,\n" + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n" + - "# See the License for the specific language governing permissions and\n" + "# limitations under the License.\n" + "#\n"; + "# Licensed to the Apache Software Foundation (ASF) under one or more\n" + + "# contributor license agreements. See the NOTICE file distributed with\n" + + "# this work for additional information regarding copyright ownership.\n" + + "# The ASF licenses this file to You under the Apache License, Version 2.0\n" + + "# (the \"License\"); you may not use this file except in compliance with\n" + + "# the License. You may obtain a copy of the License at\n" + + "#\n" + "# http://www.apache.org/licenses/LICENSE-2.0\n" + + "#\n" + + "# Unless required by applicable law or agreed to in writing, software\n" + + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n" + + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n" + + "# See the License for the specific language governing permissions and\n" + + "# limitations under the License.\n" + "#\n"; private static int TOP_N = 30000; private static int MIN_DOC_FREQ = 10; @@ -94,15 +100,16 @@ public static void main(String[] args) throws Exception { } TopCommonTokenCounter counter = new TopCommonTokenCounter(); if (Files.exists(commonTokensFile)) { - System.err.println(commonTokensFile - .getFileName() - .toString() + " exists. 
I'm skipping this."); + System.err.println(commonTokensFile.getFileName().toString() + + " exists. I'm skipping this."); return; } counter.execute(commonTokensFile, inputFiles); } - private static void writeTopN(Path path, long totalDocs, long sumDocFreqs, long sumTotalTermFreqs, long uniqueTerms, AbstractTokenTFDFPriorityQueue queue) throws IOException { + private static void writeTopN(Path path, long totalDocs, long sumDocFreqs, + long sumTotalTermFreqs, long uniqueTerms, AbstractTokenTFDFPriorityQueue queue) + throws IOException { if (Files.isRegularFile(path)) { System.err.println("File " + path.getFileName() + " already exists. Skipping."); return; @@ -116,7 +123,7 @@ private static void writeTopN(Path path, long totalDocs, long sumDocFreqs, long writer.write("#SUM_TERM_FREQS\t" + sumTotalTermFreqs + "\n"); writer.write("#UNIQUE_TERMS\t" + uniqueTerms + "\n"); writer.write("#TOKEN\tDOCFREQ\tTERMFREQ\n"); - //add these tokens no matter what + // add these tokens no matter what for (String t : INCLUDE_LIST) { writer.write(t); writer.newLine(); @@ -132,12 +139,8 @@ private static void writeTopN(Path path, long totalDocs, long sumDocFreqs, long private static String getRow(StringBuilder sb, TokenDFTF tp) { sb.setLength(0); sb.append(clean(tp.token)); - sb - .append("\t") - .append(tp.df); - sb - .append("\t") - .append(tp.tf); + sb.append("\t").append(tp.df); + sb.append("\t").append(tp.tf); return sb.toString(); } @@ -145,9 +148,7 @@ private static String clean(String s) { if (s == null) { return ""; } - return s - .replaceAll("\\s+", " ") - .trim(); + return s.replaceAll("\\s+", " ").trim(); } private void execute(Path commonTokensFile, List inputFiles) throws Exception { @@ -168,11 +169,9 @@ private void execute(Path commonTokensFile, List inputFiles) throws Except try (IndexWriter writer = new IndexWriter(directory, indexWriterConfig)) { List docs = new ArrayList<>(); for (Path inputFile : inputFiles) { - //total hack - boolean isLeipzig = inputFile - 
.getFileName() - .toString() - .contains("-sentences.txt"); + // total hack + boolean isLeipzig = + inputFile.getFileName().toString().contains("-sentences.txt"); int lines = 0; try (BufferedReader reader = getReader(inputFile)) { String line = reader.readLine(); @@ -194,7 +193,9 @@ private void execute(Path commonTokensFile, List inputFiles) throws Except } line = reader.readLine(); if (++lines % 100000 == 0) { - System.out.println("processed " + lines + " for " + inputFile.getFileName() + " :: " + commonTokensFile.toAbsolutePath()); + System.out.println("processed " + lines + " for " + + inputFile.getFileName() + " :: " + + commonTokensFile.toAbsolutePath()); } } } @@ -246,9 +247,7 @@ private void execute(Path commonTokensFile, List inputFiles) throws Except private BufferedReader getReader(Path inputFile) throws IOException { InputStream is = Files.newInputStream(inputFile); - if (inputFile - .toString() - .endsWith(".gz")) { + if (inputFile.toString().endsWith(".gz")) { is = new GzipCompressorInputStream(is); } return new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8)); @@ -262,7 +261,7 @@ private abstract static class AbstractTokenTFDFPriorityQueue extends PriorityQue public TokenDFTF[] getArray() { TokenDFTF[] topN = new TokenDFTF[size()]; - //now we reverse the queue + // now we reverse the queue TokenDFTF term = pop(); int i = topN.length - 1; while (term != null && i > -1) { @@ -350,7 +349,7 @@ protected boolean lessThan(TokenDFTF arg0, TokenDFTF arg1) { public TokenDFTF[] getArray() { TokenDFTF[] topN = new TokenDFTF[size()]; - //now we reverse the queue + // now we reverse the queue TokenDFTF term = pop(); int i = topN.length - 1; while (term != null && i > -1) { diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/TrainTestSplit.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/TrainTestSplit.java index eec35cb425..75f2a18d84 100644 --- 
a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/TrainTestSplit.java +++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/TrainTestSplit.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.eval.app.tools; @@ -54,9 +52,7 @@ public static void main(String[] args) throws Exception { private void execute(Path leipzigDir, Path outputDir) throws Exception { initOutDirs(outputDir); - for (File f : leipzigDir - .toFile() - .listFiles()) { + for (File f : leipzigDir.toFile().listFiles()) { if (f.isDirectory()) { continue; } @@ -65,7 +61,7 @@ private void execute(Path leipzigDir, Path outputDir) throws Exception { } private void initOutDirs(Path outputDir) throws Exception { - for (String which : new String[]{TRAINING, DEVTEST, TESTING}) { + for (String which : new String[] {TRAINING, DEVTEST, TESTING}) { Path target = outputDir.resolve(which); if (!Files.isDirectory(target)) { Files.createDirectories(target); @@ -82,17 +78,11 @@ private void processFile(File f, Path outputDir) throws Exception { while (line != null) { float r = random.nextFloat(); if (r <= trainingP) { - writers - .get(TRAINING) - .write(line + "\n"); + writers.get(TRAINING).write(line + "\n"); } else if (r < trainingP + devTestP) { - writers - .get(DEVTEST) - .write(line + "\n"); + writers.get(DEVTEST).write(line + "\n"); } else { - writers - .get(TESTING) - .write(line + "\n"); + writers.get(TESTING).write(line + "\n"); } line = reader.readLine(); } @@ -107,16 +97,15 @@ private void processFile(File f, Path outputDir) throws Exception { private Map getWriters(Path outputDir, File f) throws IOException { Map writers = new HashMap<>(); - for (String which : new String[]{TRAINING, DEVTEST, TESTING}) { + for (String which : new String[] {TRAINING, DEVTEST, TESTING}) { writers.put(which, getWriter(outputDir, which, f)); } return writers; } private BufferedWriter getWriter(Path outputDir, String which, File f) throws IOException { - OutputStream os = new GzipCompressorOutputStream(new BufferedOutputStream(Files.newOutputStream(outputDir - .resolve(which) - .resolve(f.getName() + ".gz")))); + OutputStream os = new GzipCompressorOutputStream(new BufferedOutputStream(Files 
+ .newOutputStream(outputDir.resolve(which).resolve(f.getName() + ".gz")))); return new BufferedWriter(new OutputStreamWriter(os, StandardCharsets.UTF_8)); } } diff --git a/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/AnalyzerManagerTest.java b/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/AnalyzerManagerTest.java index efa0e4610f..e9af30bbae 100644 --- a/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/AnalyzerManagerTest.java +++ b/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/AnalyzerManagerTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
+ * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.eval.app; @@ -28,10 +26,9 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; -import org.junit.jupiter.api.Test; - import org.apache.tika.eval.core.tokens.AlphaIdeographFilterFactory; import org.apache.tika.eval.core.tokens.AnalyzerManager; +import org.junit.jupiter.api.Test; public class AnalyzerManagerTest { @@ -66,7 +63,8 @@ public void testCommon() throws Exception { Set seen = new HashSet<>(); while (ts.incrementToken()) { String t = termAtt.toString(); - if (AlphaIdeographFilterFactory.isAlphabetic(t.toCharArray(), t.length()) && t.contains("5")) { + if (AlphaIdeographFilterFactory.isAlphabetic(t.toCharArray(), t.length()) + && t.contains("5")) { fail("Shouldn't have found a numeric"); } seen.add(termAtt.toString()); @@ -86,9 +84,7 @@ public void testTokenCountFilter() throws Exception { for (int i = 0; i < 1001000; i++) { sb.append("the "); } - TokenStream ts = analyzerManager - .getGeneralAnalyzer() - .tokenStream("f", sb.toString()); + TokenStream ts = analyzerManager.getGeneralAnalyzer().tokenStream("f", sb.toString()); ts.reset(); CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class); int tokens = 0; diff --git a/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/EvalConfigTest.java b/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/EvalConfigTest.java index 395c90fe6d..0d1dc4a860 100644 --- a/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/EvalConfigTest.java +++ b/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/EvalConfigTest.java 
@@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.eval.app; diff --git a/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/MockDBWriter.java b/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/MockDBWriter.java index da37ed4dac..387b7df986 100644 --- a/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/MockDBWriter.java +++ b/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/MockDBWriter.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. 
See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.eval.app; @@ -30,13 +28,12 @@ public class MockDBWriter implements IDBWriter { public Map mimes = new HashMap<>(); - //Map of tableName and tables - //each table consists of a list of rows. - //Each row consists of a map of columns/values + // Map of tableName and tables + // each table consists of a list of rows. + // Each row consists of a map of columns/values Map>> db = new HashMap<>(); - public MockDBWriter() throws Exception { - } + public MockDBWriter() throws Exception {} @Override public void writeRow(TableInfo tableInfo, Map row) throws IOException { @@ -50,7 +47,7 @@ public void writeRow(TableInfo tableInfo, Map row) throws IOExcept @Override public void close() throws IOException { - //no-op + // no-op } @Override diff --git a/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/ProfilerBatchTest.java b/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/ProfilerBatchTest.java index 922fcd3faa..bb1c944bf1 100644 --- a/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/ProfilerBatchTest.java +++ b/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/ProfilerBatchTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. 
The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.eval.app; @@ -32,16 +30,15 @@ import java.util.List; import org.apache.commons.io.FileUtils; +import org.apache.tika.eval.app.db.Cols; +import org.apache.tika.eval.app.db.H2Util; +import org.apache.tika.eval.app.db.TableInfo; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; -import org.apache.tika.eval.app.db.Cols; -import org.apache.tika.eval.app.db.H2Util; -import org.apache.tika.eval.app.db.TableInfo; - public class ProfilerBatchTest { private static Connection CONN; @@ -51,20 +48,16 @@ public class ProfilerBatchTest { @BeforeAll public static void setUp() throws Exception { DB_DIR = Files.createTempDirectory("profiler-test"); - Path extractsRoot = Paths.get(ProfilerBatchTest.class - .getResource("/test-dirs/extractsA") - .toURI()); + Path extractsRoot = Paths + .get(ProfilerBatchTest.class.getResource("/test-dirs/extractsA").toURI()); - Path inputRoot = 
Paths.get(ProfilerBatchTest.class - .getResource("/test-dirs/raw_input") - .toURI()); + Path inputRoot = Paths + .get(ProfilerBatchTest.class.getResource("/test-dirs/raw_input").toURI()); DB = DB_DIR.resolve("mydb"); - String[] args = new String[]{ - "-i", inputRoot.toAbsolutePath().toString(), - "-e", extractsRoot.toAbsolutePath().toString(), - "-d", "jdbc:h2:file:" + DB.toAbsolutePath().toString() - }; + String[] args = new String[] {"-i", inputRoot.toAbsolutePath().toString(), "-e", + extractsRoot.toAbsolutePath().toString(), "-d", + "jdbc:h2:file:" + DB.toAbsolutePath().toString()}; ExtractProfileRunner.main(args); } @@ -109,11 +102,10 @@ public void testSimpleDBWriteAndRead() throws Exception { } } /* - debugTable(ExtractProfiler.CONTAINER_TABLE); - debugTable(ExtractProfiler.PROFILE_TABLE); - debugTable(ExtractProfiler.CONTENTS_TABLE); - debugTable(ExtractProfiler.EXCEPTION_TABLE); - debugTable(ExtractProfiler.EXTRACT_EXCEPTION_TABLE);*/ + * debugTable(ExtractProfiler.CONTAINER_TABLE); debugTable(ExtractProfiler.PROFILE_TABLE); + * debugTable(ExtractProfiler.CONTENTS_TABLE); debugTable(ExtractProfiler.EXCEPTION_TABLE); + * debugTable(ExtractProfiler.EXTRACT_EXCEPTION_TABLE); + */ assertEquals(17, fNameList.size()); assertTrue(fNameList.contains("file1.pdf"), "file1.pdf"); assertTrue(fNameList.contains("file2_attachANotB.doc"), "file2_attachANotB.doc"); @@ -125,20 +117,25 @@ public void testSimpleDBWriteAndRead() throws Exception { @Test public void testExtractErrors() throws Exception { - String sql = - "select EXTRACT_EXCEPTION_ID from extract_exceptions e" + " join containers c on c.container_id = e.container_id " + " where c.file_path='file9_noextract.txt'"; + String sql = "select EXTRACT_EXCEPTION_ID from extract_exceptions e" + + " join containers c on c.container_id = e.container_id " + + " where c.file_path='file9_noextract.txt'"; - /*debugTable(ExtractProfiler.CONTAINER_TABLE); - debugTable(ExtractProfiler.PROFILE_TABLE); - 
debugTable(ExtractProfiler.CONTENTS_TABLE); - debugTable(ExtractProfiler.EXCEPTION_TABLE); - debugTable(ExtractProfiler.EXTRACT_EXCEPTION_TABLE);*/ + /* + * debugTable(ExtractProfiler.CONTAINER_TABLE); debugTable(ExtractProfiler.PROFILE_TABLE); + * debugTable(ExtractProfiler.CONTENTS_TABLE); debugTable(ExtractProfiler.EXCEPTION_TABLE); + * debugTable(ExtractProfiler.EXTRACT_EXCEPTION_TABLE); + */ assertEquals("0", getSingleResult(sql), "missing extract: file9_noextract.txt"); - sql = "select EXTRACT_EXCEPTION_ID from extract_exceptions e" + " join containers c on c.container_id = e.container_id " + " where c.file_path='file5_emptyA.pdf'"; + sql = "select EXTRACT_EXCEPTION_ID from extract_exceptions e" + + " join containers c on c.container_id = e.container_id " + + " where c.file_path='file5_emptyA.pdf'"; assertEquals("1", getSingleResult(sql), "empty extract: file5_emptyA.pdf"); - sql = "select EXTRACT_EXCEPTION_ID from extract_exceptions e" + " join containers c on c.container_id = e.container_id " + " where c.file_path='file7_badJson.pdf'"; + sql = "select EXTRACT_EXCEPTION_ID from extract_exceptions e" + + " join containers c on c.container_id = e.container_id " + + " where c.file_path='file7_badJson.pdf'"; assertEquals("2", getSingleResult(sql), "extract error:file7_badJson.pdf"); } @@ -155,9 +152,8 @@ private String getSingleResult(String sql) throws Exception { int hits = 0; String val = ""; while (rs.next()) { - assertEquals(1, rs - .getMetaData() - .getColumnCount(), "must have only one column in result"); + assertEquals(1, rs.getMetaData().getColumnCount(), + "must have only one column in result"); val = rs.getString(1); hits++; } @@ -165,7 +161,7 @@ private String getSingleResult(String sql) throws Exception { return val; } - //TODO: lots more testing! + // TODO: lots more testing! 
public void debugTable(TableInfo table) throws Exception { Statement st = null; @@ -173,17 +169,13 @@ public void debugTable(TableInfo table) throws Exception { String sql = "select * from " + table.getName(); st = CONN.createStatement(); ResultSet rs = st.executeQuery(sql); - int colCount = rs - .getMetaData() - .getColumnCount(); + int colCount = rs.getMetaData().getColumnCount(); System.out.println("TABLE: " + table.getName()); for (int i = 1; i <= colCount; i++) { if (i > 1) { System.out.print(" | "); } - System.out.print(rs - .getMetaData() - .getColumnName(i)); + System.out.print(rs.getMetaData().getColumnName(i)); } System.out.println(""); int rowCount = 0; diff --git a/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/SimpleComparerTest.java b/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/SimpleComparerTest.java index 4c409e84c6..d2249ae28c 100644 --- a/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/SimpleComparerTest.java +++ b/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/SimpleComparerTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.eval.app; @@ -30,11 +28,6 @@ import java.util.SortedSet; import java.util.TreeSet; -import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Disabled; -import org.junit.jupiter.api.Test; - import org.apache.tika.TikaTest; import org.apache.tika.eval.app.db.Cols; import org.apache.tika.eval.app.db.TableInfo; @@ -43,10 +36,14 @@ import org.apache.tika.eval.core.util.ContentTags; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Disabled; +import org.junit.jupiter.api.Test; -//These tests ensure that the comparer is extracting the right information -//into a Map. A full integration test -//should also ensure that the elements are properly being written to the db +// These tests ensure that the comparer is extracting the right information +// into a Map. 
A full integration test +// should also ensure that the elements are properly being written to the db public class SimpleComparerTest extends TikaTest { @@ -56,30 +53,33 @@ public class SimpleComparerTest extends TikaTest { @BeforeAll public static void staticSetUp() throws Exception { WRITER = new MockDBWriter(); - ProfilerBase.loadCommonTokens(Paths.get(SimpleComparerTest.class - .getResource("/common_tokens") - .toURI()), "en"); + ProfilerBase.loadCommonTokens( + Paths.get(SimpleComparerTest.class.getResource("/common_tokens").toURI()), + "en"); } @BeforeEach public void setUp() throws Exception { WRITER.clear(); comparer = new ExtractComparer(null, Paths.get("extractsA"), Paths.get("extractsB"), - new ExtractReader(ExtractReader.ALTER_METADATA_LIST.AS_IS, IGNORE_LENGTH, IGNORE_LENGTH), WRITER); + new ExtractReader(ExtractReader.ALTER_METADATA_LIST.AS_IS, IGNORE_LENGTH, + IGNORE_LENGTH), + WRITER); } @Test public void testBasic() throws Exception { - EvalFilePaths fpsA = new EvalFilePaths(Paths.get("file1.pdf.json"), getResourceAsFile("/test-dirs/extractsA/file1.pdf.json").toPath()); - EvalFilePaths fpsB = new EvalFilePaths(Paths.get("file1.pdf.json"), getResourceAsFile("/test-dirs/extractsB/file1.pdf.json").toPath()); + EvalFilePaths fpsA = new EvalFilePaths(Paths.get("file1.pdf.json"), + getResourceAsFile("/test-dirs/extractsA/file1.pdf.json").toPath()); + EvalFilePaths fpsB = new EvalFilePaths(Paths.get("file1.pdf.json"), + getResourceAsFile("/test-dirs/extractsB/file1.pdf.json").toPath()); comparer.compareFiles(fpsA, fpsB); List> tableInfos = WRITER.getTable(ExtractComparer.CONTENT_COMPARISONS); Map row = tableInfos.get(0); - assertTrue(row - .get(Cols.TOP_10_UNIQUE_TOKEN_DIFFS_A) - .startsWith("1,200: 1 | 120000: 1 | over: 1")); + assertTrue(row.get(Cols.TOP_10_UNIQUE_TOKEN_DIFFS_A) + .startsWith("1,200: 1 | 120000: 1 | over: 1")); tableInfos = WRITER.getTable(ExtractComparer.CONTENTS_TABLE_A); row = tableInfos.get(0); @@ -111,8 +111,10 @@ public void 
testBasic() throws Exception { @Test public void testBasicSpanish() throws Exception { - EvalFilePaths fpsA = new EvalFilePaths(Paths.get("file1.pdf.json"), getResourceAsFile("/test-dirs/extractsA/file12_es.txt.json").toPath()); - EvalFilePaths fpsB = new EvalFilePaths(Paths.get("file1.pdf.json"), getResourceAsFile("/test-dirs/extractsB/file12_es.txt.json").toPath()); + EvalFilePaths fpsA = new EvalFilePaths(Paths.get("file1.pdf.json"), + getResourceAsFile("/test-dirs/extractsA/file12_es.txt.json").toPath()); + EvalFilePaths fpsB = new EvalFilePaths(Paths.get("file1.pdf.json"), + getResourceAsFile("/test-dirs/extractsB/file12_es.txt.json").toPath()); comparer.compareFiles(fpsA, fpsB); @@ -131,12 +133,15 @@ public void testBasicSpanish() throws Exception { @Test public void testChinese() throws Exception { - //make sure that language id matches common words - //file names. The test file contains MT'd Simplified Chinese with - //known "common words" appended at end. + // make sure that language id matches common words + // file names. The test file contains MT'd Simplified Chinese with + // known "common words" appended at end. 
- EvalFilePaths fpsA = new EvalFilePaths(Paths.get("file13_attachANotB.doc.json"), getResourceAsFile("/test-dirs/extractsA/file13_attachANotB.doc.json").toPath()); - EvalFilePaths fpsB = new EvalFilePaths(Paths.get("non-existent.json"), Paths.get("/test-dirs/extractsB/non-existent.json")); + EvalFilePaths fpsA = new EvalFilePaths(Paths.get("file13_attachANotB.doc.json"), + getResourceAsFile("/test-dirs/extractsA/file13_attachANotB.doc.json") + .toPath()); + EvalFilePaths fpsB = new EvalFilePaths(Paths.get("non-existent.json"), + Paths.get("/test-dirs/extractsB/non-existent.json")); comparer.compareFiles(fpsA, fpsB); @@ -151,12 +156,15 @@ public void testChinese() throws Exception { @Test public void testEmpty() throws Exception { - EvalFilePaths fpsA = new EvalFilePaths(Paths.get("file1.pdf"), getResourceAsFile("/test-dirs/extractsA/file1.pdf.json").toPath()); - EvalFilePaths fpsB = new EvalFilePaths(Paths.get("file1.pdf"), getResourceAsFile("/test-dirs/extractsB/file4_emptyB.pdf.json").toPath()); + EvalFilePaths fpsA = new EvalFilePaths(Paths.get("file1.pdf"), + getResourceAsFile("/test-dirs/extractsA/file1.pdf.json").toPath()); + EvalFilePaths fpsB = new EvalFilePaths(Paths.get("file1.pdf"), + getResourceAsFile("/test-dirs/extractsB/file4_emptyB.pdf.json").toPath()); comparer.compareFiles(fpsA, fpsB); List> table = WRITER.getTable(ExtractComparer.EXTRACT_EXCEPTION_TABLE_B); Map row = table.get(0); - assertEquals(Integer.toString(ExtractReaderException.TYPE.ZERO_BYTE_EXTRACT_FILE.ordinal()), row.get(Cols.EXTRACT_EXCEPTION_ID)); + assertEquals(Integer.toString(ExtractReaderException.TYPE.ZERO_BYTE_EXTRACT_FILE.ordinal()), + row.get(Cols.EXTRACT_EXCEPTION_ID)); } @@ -172,12 +180,12 @@ public void testGetContent() throws Exception { assertEquals(4, content.length()); assertEquals("TRUE", data.get(Cols.CONTENT_TRUNCATED_AT_MAX_LEN)); - //test Metadata with no content + // test Metadata with no content content = 
ProfilerBase.truncateContent(ContentTags.EMPTY_CONTENT_TAGS, 10, data); assertEquals(0, content.length()); assertEquals("FALSE", data.get(Cols.CONTENT_TRUNCATED_AT_MAX_LEN)); - //test null Metadata + // test null Metadata content = ProfilerBase.truncateContent(null, 10, data); assertEquals(0, content.length()); assertEquals("FALSE", data.get(Cols.CONTENT_TRUNCATED_AT_MAX_LEN)); @@ -185,14 +193,18 @@ public void testGetContent() throws Exception { @Test public void testAccessException() throws Exception { - EvalFilePaths fpsA = new EvalFilePaths(Paths.get("file6_accessEx.pdf.json"), getResourceAsFile("/test-dirs/extractsA/file6_accessEx.pdf.json").toPath()); - EvalFilePaths fpsB = new EvalFilePaths(Paths.get("file6_accessEx.pdf.json"), getResourceAsFile("/test-dirs/extractsB/file6_accessEx.pdf.json").toPath()); + EvalFilePaths fpsA = new EvalFilePaths(Paths.get("file6_accessEx.pdf.json"), + getResourceAsFile("/test-dirs/extractsA/file6_accessEx.pdf.json").toPath()); + EvalFilePaths fpsB = new EvalFilePaths(Paths.get("file6_accessEx.pdf.json"), + getResourceAsFile("/test-dirs/extractsB/file6_accessEx.pdf.json").toPath()); comparer.compareFiles(fpsA, fpsB); - for (TableInfo t : new TableInfo[]{ExtractComparer.EXCEPTION_TABLE_A, ExtractComparer.EXCEPTION_TABLE_B}) { + for (TableInfo t : new TableInfo[] {ExtractComparer.EXCEPTION_TABLE_A, + ExtractComparer.EXCEPTION_TABLE_B}) { List> table = WRITER.getTable(t); Map rowA = table.get(0); - assertEquals(Integer.toString(ProfilerBase.EXCEPTION_TYPE.ACCESS_PERMISSION.ordinal()), rowA.get(Cols.PARSE_EXCEPTION_ID)); + assertEquals(Integer.toString(ProfilerBase.EXCEPTION_TYPE.ACCESS_PERMISSION.ordinal()), + rowA.get(Cols.PARSE_EXCEPTION_ID)); assertNull(rowA.get(Cols.ORIG_STACK_TRACE)); assertNull(rowA.get(Cols.SORT_STACK_TRACE)); } @@ -202,8 +214,9 @@ public void testAccessException() throws Exception { public void testAttachmentCounts() { List list = new ArrayList<>(); Metadata m0 = new Metadata(); - 
m0.set(TikaCoreProperties.EMBEDDED_RESOURCE_PATH, "dir1/dir2/file.zip");//bad data should be ignored - //in the first metadata object + m0.set(TikaCoreProperties.EMBEDDED_RESOURCE_PATH, "dir1/dir2/file.zip");// bad data should + // be ignored + // in the first metadata object list.add(m0); Metadata m1 = new Metadata(); m1.set(TikaCoreProperties.EMBEDDED_RESOURCE_PATH, "/f1.docx/f2.zip/text1.txt"); @@ -235,22 +248,26 @@ public void testAttachmentCounts() { @Test public void testDifferentlyOrderedAttachments() throws Exception { - EvalFilePaths fpsA = new EvalFilePaths(Paths.get("file14_diffAttachOrder.json"), getResourceAsFile("/test-dirs/extractsA/file14_diffAttachOrder.json").toPath()); - EvalFilePaths fpsB = new EvalFilePaths(Paths.get("file6_accessEx.pdf.json"), getResourceAsFile("/test-dirs/extractsB/file14_diffAttachOrder.json").toPath()); + EvalFilePaths fpsA = new EvalFilePaths(Paths.get("file14_diffAttachOrder.json"), + getResourceAsFile("/test-dirs/extractsA/file14_diffAttachOrder.json") + .toPath()); + EvalFilePaths fpsB = new EvalFilePaths(Paths.get("file6_accessEx.pdf.json"), + getResourceAsFile("/test-dirs/extractsB/file14_diffAttachOrder.json") + .toPath()); comparer.compareFiles(fpsA, fpsB); List> tableInfos = WRITER.getTable(ExtractComparer.CONTENT_COMPARISONS); assertEquals(3, tableInfos.size()); for (int i = 0; i < tableInfos.size(); i++) { - assertEquals("1.0", tableInfos - .get(i) - .get(Cols.OVERLAP), "problem with " + i); + assertEquals("1.0", tableInfos.get(i).get(Cols.OVERLAP), "problem with " + i); } } @Test public void testTags() throws Exception { - EvalFilePaths fpsA = new EvalFilePaths(Paths.get("file15_tags.json"), getResourceAsFile("/test-dirs/extractsA/file15_tags.json").toPath()); - EvalFilePaths fpsB = new EvalFilePaths(Paths.get("file15_tags.html"), getResourceAsFile("/test-dirs/extractsB/file15_tags.html").toPath()); + EvalFilePaths fpsA = new EvalFilePaths(Paths.get("file15_tags.json"), + 
getResourceAsFile("/test-dirs/extractsA/file15_tags.json").toPath()); + EvalFilePaths fpsB = new EvalFilePaths(Paths.get("file15_tags.html"), + getResourceAsFile("/test-dirs/extractsB/file15_tags.html").toPath()); comparer.compareFiles(fpsA, fpsB); List> tableInfosA = WRITER.getTable(ExtractComparer.TAGS_TABLE_A); assertEquals(1, tableInfosA.size()); @@ -268,8 +285,10 @@ public void testTags() throws Exception { @Test public void testBadTags() throws Exception { - EvalFilePaths fpsA = new EvalFilePaths(Paths.get("file16_badtags.json"), getResourceAsFile("/test-dirs/extractsA/file16_badTags.json").toPath()); - EvalFilePaths fpsB = new EvalFilePaths(Paths.get("file16_badtags.html"), getResourceAsFile("/test-dirs/extractsB/file16_badTags.html").toPath()); + EvalFilePaths fpsA = new EvalFilePaths(Paths.get("file16_badtags.json"), + getResourceAsFile("/test-dirs/extractsA/file16_badTags.json").toPath()); + EvalFilePaths fpsB = new EvalFilePaths(Paths.get("file16_badtags.html"), + getResourceAsFile("/test-dirs/extractsB/file16_badTags.html").toPath()); comparer.compareFiles(fpsA, fpsB); List> tableInfosA = WRITER.getTable(ExtractComparer.TAGS_TABLE_A); assertEquals(1, tableInfosA.size()); @@ -279,22 +298,25 @@ public void testBadTags() throws Exception { List> tableInfosB = WRITER.getTable(ExtractComparer.TAGS_TABLE_B); assertEquals(1, tableInfosB.size()); Map tableInfoB = tableInfosB.get(0); - //there actually is a tag problem, but jsoup fixes it. - //this confirms behavior. + // there actually is a tag problem, but jsoup fixes it. + // this confirms behavior. 
assertEquals("false", tableInfoB.get(Cols.TAGS_PARSE_EXCEPTION)); } @Test public void testTagsOutOfOrder() throws Exception { - EvalFilePaths fpsA = new EvalFilePaths(Paths.get("file17_tagsOutOfOrder.json"), getResourceAsFile("/test-dirs/extractsA/file17_tagsOutOfOrder.json").toPath()); - EvalFilePaths fpsB = new EvalFilePaths(Paths.get("file16_badTags.html"), getResourceAsFile("/test-dirs/extractsB/file16_badTags.html").toPath()); + EvalFilePaths fpsA = new EvalFilePaths(Paths.get("file17_tagsOutOfOrder.json"), + getResourceAsFile("/test-dirs/extractsA/file17_tagsOutOfOrder.json") + .toPath()); + EvalFilePaths fpsB = new EvalFilePaths(Paths.get("file16_badTags.html"), + getResourceAsFile("/test-dirs/extractsB/file16_badTags.html").toPath()); comparer.compareFiles(fpsA, fpsB); List> tableInfosA = WRITER.getTable(ExtractComparer.TAGS_TABLE_A); assertEquals(1, tableInfosA.size()); Map tableInfoA = tableInfosA.get(0); assertEquals("true", tableInfoA.get(Cols.TAGS_PARSE_EXCEPTION)); - //confirm that backoff to html parser worked + // confirm that backoff to html parser worked List> contentsA = WRITER.getTable(ExtractComparer.CONTENTS_TABLE_A); assertEquals(1, contentsA.size()); Map contentsARow1 = contentsA.get(0); @@ -310,13 +332,19 @@ public void testTagsOutOfOrder() throws Exception { public void testDebug() throws Exception { Path commonTokens = Paths.get(getResourceAsFile("/common_tokens_short.txt").toURI()); ProfilerBase.loadCommonTokens(commonTokens, "en"); - EvalFilePaths fpsA = new EvalFilePaths(Paths.get("file1.pdf.json"), getResourceAsFile("/test-dirs/extractsA/file1.pdf.json").toPath()); - EvalFilePaths fpsB = new EvalFilePaths(Paths.get("file1.pdf.json"), getResourceAsFile("/test-dirs/extractsB/file1.pdf.json").toPath()); + EvalFilePaths fpsA = new EvalFilePaths(Paths.get("file1.pdf.json"), + getResourceAsFile("/test-dirs/extractsA/file1.pdf.json").toPath()); + EvalFilePaths fpsB = new EvalFilePaths(Paths.get("file1.pdf.json"), + 
getResourceAsFile("/test-dirs/extractsB/file1.pdf.json").toPath()); comparer.compareFiles(fpsA, fpsB); - for (TableInfo t : new TableInfo[]{ExtractComparer.COMPARISON_CONTAINERS, ExtractComparer.EXTRACT_EXCEPTION_TABLE_A, ExtractComparer.EXTRACT_EXCEPTION_TABLE_B, - ExtractComparer.EXCEPTION_TABLE_A, ExtractComparer.EXCEPTION_TABLE_B, ExtractComparer.PROFILES_A, ExtractComparer.PROFILES_B, ExtractComparer.CONTENTS_TABLE_A, - ExtractComparer.CONTENTS_TABLE_B, ExtractComparer.CONTENT_COMPARISONS}) { - //debugPrintTable(t); + for (TableInfo t : new TableInfo[] {ExtractComparer.COMPARISON_CONTAINERS, + ExtractComparer.EXTRACT_EXCEPTION_TABLE_A, + ExtractComparer.EXTRACT_EXCEPTION_TABLE_B, + ExtractComparer.EXCEPTION_TABLE_A, ExtractComparer.EXCEPTION_TABLE_B, + ExtractComparer.PROFILES_A, ExtractComparer.PROFILES_B, + ExtractComparer.CONTENTS_TABLE_A, ExtractComparer.CONTENTS_TABLE_B, + ExtractComparer.CONTENT_COMPARISONS}) { + // debugPrintTable(t); } } @@ -394,9 +422,13 @@ public void oneOff() throws Exception { EvalFilePaths fpsA = new EvalFilePaths(Paths.get("file1.pdf.json"), p1); EvalFilePaths fpsB = new EvalFilePaths(Paths.get("file1.pdf.json"), p2); comparer.compareFiles(fpsA, fpsB); - for (TableInfo t : new TableInfo[]{ExtractComparer.COMPARISON_CONTAINERS, ExtractComparer.EXTRACT_EXCEPTION_TABLE_A, ExtractComparer.EXTRACT_EXCEPTION_TABLE_B, - ExtractComparer.EXCEPTION_TABLE_A, ExtractComparer.EXCEPTION_TABLE_B, ExtractComparer.PROFILES_A, ExtractComparer.PROFILES_B, ExtractComparer.CONTENTS_TABLE_A, - ExtractComparer.CONTENTS_TABLE_B, ExtractComparer.CONTENT_COMPARISONS}) { + for (TableInfo t : new TableInfo[] {ExtractComparer.COMPARISON_CONTAINERS, + ExtractComparer.EXTRACT_EXCEPTION_TABLE_A, + ExtractComparer.EXTRACT_EXCEPTION_TABLE_B, + ExtractComparer.EXCEPTION_TABLE_A, ExtractComparer.EXCEPTION_TABLE_B, + ExtractComparer.PROFILES_A, ExtractComparer.PROFILES_B, + ExtractComparer.CONTENTS_TABLE_A, ExtractComparer.CONTENTS_TABLE_B, + 
ExtractComparer.CONTENT_COMPARISONS}) { debugPrintTable(t); } } diff --git a/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/TikaEvalCLITest.java b/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/TikaEvalCLITest.java index 6bb22f9a6c..808b805ac9 100644 --- a/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/TikaEvalCLITest.java +++ b/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/TikaEvalCLITest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. 
See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.eval.app; @@ -32,16 +30,15 @@ import java.util.List; import java.util.Set; +import org.apache.tika.TikaTest; +import org.apache.tika.utils.ProcessUtils; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.io.TempDir; -import org.apache.tika.TikaTest; -import org.apache.tika.utils.ProcessUtils; - public class TikaEvalCLITest extends TikaTest { - //TODO: these barely reach the minimal acceptable stage for unit tests + // TODO: these barely reach the minimal acceptable stage for unit tests private final static String dbName = "testdb"; @@ -69,20 +66,15 @@ private static void compare() throws IOException { List args = new ArrayList<>(); args.add("Compare"); args.add("-a"); - args.add(ProcessUtils.escapeCommandLine(extractsDir - .resolve("extractsA") - .toAbsolutePath() - .toString())); + args.add(ProcessUtils.escapeCommandLine( + extractsDir.resolve("extractsA").toAbsolutePath().toString())); args.add("-b"); - args.add(ProcessUtils.escapeCommandLine(extractsDir - .resolve("extractsB") - .toAbsolutePath() - .toString())); + args.add(ProcessUtils.escapeCommandLine( + extractsDir.resolve("extractsB").toAbsolutePath().toString())); args.add("-d"); - args.add(ProcessUtils.escapeCommandLine(compareDBDir - .toAbsolutePath() - .toString() + "/" + dbName)); + args.add(ProcessUtils.escapeCommandLine( + compareDBDir.toAbsolutePath().toString() + "/" + dbName)); execute(args, 60000); @@ -92,15 +84,12 @@ private static void profile() throws IOException { List args = new ArrayList<>(); args.add("Profile"); args.add("-e"); - args.add(ProcessUtils.escapeCommandLine(extractsDir - .resolve("extractsA") - .toAbsolutePath() - .toString())); + args.add(ProcessUtils.escapeCommandLine( + extractsDir.resolve("extractsA").toAbsolutePath().toString())); args.add("-d"); - 
args.add(ProcessUtils.escapeCommandLine(profileDBDir - .toAbsolutePath() - .toString() + "/" + dbName)); + args.add(ProcessUtils.escapeCommandLine( + profileDBDir.toAbsolutePath().toString() + "/" + dbName)); execute(args, 60000); } @@ -108,13 +97,10 @@ private static void reportProfile() throws IOException { List args = new ArrayList<>(); args.add("Report"); args.add("-db"); - args.add(ProcessUtils.escapeCommandLine(profileDBDir - .toAbsolutePath() - .toString() + "/" + dbName)); + args.add(ProcessUtils.escapeCommandLine( + profileDBDir.toAbsolutePath().toString() + "/" + dbName)); args.add("-rd"); - args.add(ProcessUtils.escapeCommandLine(profileReportsDir - .toAbsolutePath() - .toString())); + args.add(ProcessUtils.escapeCommandLine(profileReportsDir.toAbsolutePath().toString())); execute(args, 60000); } @@ -122,13 +108,10 @@ private static void reportCompare() throws IOException { List args = new ArrayList<>(); args.add("Report"); args.add("-db"); - args.add(ProcessUtils.escapeCommandLine(compareDBDir - .toAbsolutePath() - .toString() + "/" + dbName)); + args.add(ProcessUtils.escapeCommandLine( + compareDBDir.toAbsolutePath().toString() + "/" + dbName)); args.add("-rd"); - args.add(ProcessUtils.escapeCommandLine(compareReportsDir - .toAbsolutePath() - .toString())); + args.add(ProcessUtils.escapeCommandLine(compareReportsDir.toAbsolutePath().toString())); execute(args, 60000); } @@ -153,22 +136,21 @@ private static void execute(List incomingArgs, long maxMillis) throws IO try { exitValue = process.exitValue(); } catch (IllegalThreadStateException e) { - //swallow + // swallow } elapsed = System.currentTimeMillis() - started; } if (exitValue == Integer.MIN_VALUE) { process.destroy(); - throw new RuntimeException("Process never exited within the allowed amount of time.\n" + "I needed to destroy it"); + throw new RuntimeException("Process never exited within the allowed amount of time.\n" + + "I needed to destroy it"); } } @Test public void testBasicCompare() 
throws Exception { Set fNames = new HashSet<>(); - for (File f : compareDBDir - .toFile() - .listFiles()) { + for (File f : compareDBDir.toFile().listFiles()) { fNames.add(f.getName()); } assertContains(dbName + ".mv.db", fNames); @@ -177,9 +159,7 @@ public void testBasicCompare() throws Exception { @Test public void testBasicProfile() throws Exception { Set fNames = new HashSet<>(); - for (File f : profileDBDir - .toFile() - .listFiles()) { + for (File f : profileDBDir.toFile().listFiles()) { fNames.add(f.getName()); } assertContains(dbName + ".mv.db", fNames); @@ -192,10 +172,7 @@ public void testProfileReports() throws Exception { int cnt = 0; for (Path report : v.getPaths()) { - if (report - .getFileName() - .toString() - .endsWith(".xlsx")) { + if (report.getFileName().toString().endsWith(".xlsx")) { cnt++; } } @@ -208,10 +185,7 @@ public void testComparisonReports() throws Exception { Files.walkFileTree(compareReportsDir, v); int cnt = 0; for (Path report : v.getPaths()) { - if (report - .getFileName() - .toString() - .endsWith(".xlsx")) { + if (report.getFileName().toString().endsWith(".xlsx")) { cnt++; } } @@ -227,24 +201,19 @@ public void testOneOff() throws Exception { List args = new ArrayList<>(); args.add("Compare"); args.add("-extractsA"); - args.add(ProcessUtils.escapeCommandLine(extractsDir - .resolve("extractsA") - .toAbsolutePath() - .toString())); + args.add(ProcessUtils.escapeCommandLine( + extractsDir.resolve("extractsA").toAbsolutePath().toString())); args.add("-extractsB"); - args.add(ProcessUtils.escapeCommandLine(extractsDir - .resolve("extractsB") - .toAbsolutePath() - .toString())); + args.add(ProcessUtils.escapeCommandLine( + extractsDir.resolve("extractsB").toAbsolutePath().toString())); args.add("-db"); - args.add(ProcessUtils.escapeCommandLine(compareDBDir - .toAbsolutePath() - .toString() + "/" + dbName)); + args.add(ProcessUtils.escapeCommandLine( + compareDBDir.toAbsolutePath().toString() + "/" + dbName)); execute(args, 60000); - 
// args.add("-drop"); -// args.add("-jdbc"); -// args.add("jdbc:postgresql:tika_eval?user=user&password=password"); + // args.add("-drop"); + // args.add("-jdbc"); + // args.add("jdbc:postgresql:tika_eval?user=user&password=password"); } @@ -252,7 +221,8 @@ private final static class CachingFileVisitor implements FileVisitor { Set paths = new HashSet<>(); @Override - public FileVisitResult preVisitDirectory(Path dir, BasicFileAttributes attrs) throws IOException { + public FileVisitResult preVisitDirectory(Path dir, BasicFileAttributes attrs) + throws IOException { return FileVisitResult.CONTINUE; } diff --git a/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/db/AbstractBufferTest.java b/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/db/AbstractBufferTest.java index 32c4e52405..98faaee10f 100644 --- a/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/db/AbstractBufferTest.java +++ b/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/db/AbstractBufferTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.eval.app.db; @@ -47,7 +45,8 @@ public class AbstractBufferTest { @Timeout(30000) public void runTest() throws InterruptedException, ExecutionException { List keys = new ArrayList<>(); - Collections.addAll(keys, new String[]{"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"}); + Collections.addAll(keys, + new String[] {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"}); int numGets = 100; int numTesters = 20; @@ -66,15 +65,8 @@ public void runTest() throws InterruptedException, ExecutionException { Future futureResult = completionService.poll(1, TimeUnit.SECONDS); if (futureResult != null) { results++; - assertEquals(keys.size(), futureResult - .get() - .getMap() - .keySet() - .size()); - for (Map.Entry e : futureResult - .get() - .getMap() - .entrySet()) { + assertEquals(keys.size(), futureResult.get().getMap().keySet().size()); + for (Map.Entry e : futureResult.get().getMap().entrySet()) { if (!combined.containsKey(e.getKey())) { combined.put(e.getKey(), e.getValue()); } else { @@ -121,9 +113,9 @@ public MyTestResult call() throws Exception { m.put(k, val); } - //now add the val for every key - //just in case the rand() process didn't hit - //all indices 
+ // now add the val for every key + // just in case the rand() process didn't hit + // all indices for (String k : keys) { Integer val = dbBuffer.getId(k); m.put(k, val); @@ -161,7 +153,7 @@ public void write(int id, String value) throws RuntimeException { @Override public void close() throws SQLException { - //no-op + // no-op } } } diff --git a/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/io/ExtractReaderTest.java b/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/io/ExtractReaderTest.java index 8c33d42ffc..ab83aa6b4d 100644 --- a/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/io/ExtractReaderTest.java +++ b/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/io/ExtractReaderTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.eval.app.io; @@ -23,12 +21,11 @@ import java.nio.file.Path; import java.util.List; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; - import org.apache.tika.TikaTest; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; public class ExtractReaderTest extends TikaTest { @@ -37,7 +34,8 @@ public class ExtractReaderTest extends TikaTest { @BeforeEach public void setUp() throws Exception { - testJsonFile = getResourceAsFile("/test-dirs/extractsA/file2_attachANotB.doc.json").toPath(); + testJsonFile = getResourceAsFile("/test-dirs/extractsA/file2_attachANotB.doc.json") + .toPath(); testTxtFile = getResourceAsFile("/test-dirs/extractsB/file13_attachANotB.doc.txt").toPath(); } @@ -48,44 +46,25 @@ public void testBasic() throws Exception { List metadataList = extractReader.loadExtract(testJsonFile); assertEquals(2, metadataList.size()); - assertEquals(1, metadataList - .get(0) - .getValues(TikaCoreProperties.TIKA_CONTENT).length); - assertEquals(1, metadataList - .get(1) - .getValues(TikaCoreProperties.TIKA_CONTENT).length); - assertContains("fox", metadataList - .get(0) - .get(TikaCoreProperties.TIKA_CONTENT)); - assertContains("attachment", metadataList - .get(1) - .get(TikaCoreProperties.TIKA_CONTENT)); + assertEquals(1, metadataList.get(0).getValues(TikaCoreProperties.TIKA_CONTENT).length); + assertEquals(1, 
metadataList.get(1).getValues(TikaCoreProperties.TIKA_CONTENT).length); + assertContains("fox", metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT)); + assertContains("attachment", metadataList.get(1).get(TikaCoreProperties.TIKA_CONTENT)); extractReader = new ExtractReader(ExtractReader.ALTER_METADATA_LIST.FIRST_ONLY); metadataList = extractReader.loadExtract(testJsonFile); assertEquals(1, metadataList.size()); - assertEquals(1, metadataList - .get(0) - .getValues(TikaCoreProperties.TIKA_CONTENT).length); - assertContains("fox", metadataList - .get(0) - .get(TikaCoreProperties.TIKA_CONTENT)); - assertNotContained("attachment", metadataList - .get(0) - .get(TikaCoreProperties.TIKA_CONTENT)); - - extractReader = new ExtractReader(ExtractReader.ALTER_METADATA_LIST.CONCATENATE_CONTENT_INTO_FIRST); + assertEquals(1, metadataList.get(0).getValues(TikaCoreProperties.TIKA_CONTENT).length); + assertContains("fox", metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT)); + assertNotContained("attachment", metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT)); + + extractReader = new ExtractReader( + ExtractReader.ALTER_METADATA_LIST.CONCATENATE_CONTENT_INTO_FIRST); metadataList = extractReader.loadExtract(testJsonFile); assertEquals(1, metadataList.size()); - assertEquals(1, metadataList - .get(0) - .getValues(TikaCoreProperties.TIKA_CONTENT).length); - assertContains("fox", metadataList - .get(0) - .get(TikaCoreProperties.TIKA_CONTENT)); - assertContains("attachment", metadataList - .get(0) - .get(TikaCoreProperties.TIKA_CONTENT)); + assertEquals(1, metadataList.get(0).getValues(TikaCoreProperties.TIKA_CONTENT).length); + assertContains("fox", metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT)); + assertContains("attachment", metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT)); } @Test @@ -95,9 +74,10 @@ public void testTextBasic() throws IOException { assertEquals(1, metadataList.size()); Metadata m = metadataList.get(0); assertEquals(1, 
m.getValues(TikaCoreProperties.TIKA_CONTENT).length); - assertContains("the quick brown fox fox fox jumped over the lazy lazy dog", m.get(TikaCoreProperties.TIKA_CONTENT)); + assertContains("the quick brown fox fox fox jumped over the lazy lazy dog", + m.get(TikaCoreProperties.TIKA_CONTENT)); - //test that the mime is inferred from the file extension + // test that the mime is inferred from the file extension assertEquals("application/msword", m.get(Metadata.CONTENT_TYPE)); } diff --git a/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/reports/ResultsReporterTest.java b/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/reports/ResultsReporterTest.java index a7f381fff8..9f19c17b36 100644 --- a/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/reports/ResultsReporterTest.java +++ b/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/reports/ResultsReporterTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.eval.app.reports; @@ -23,12 +21,11 @@ import java.sql.Connection; import java.sql.Statement; +import org.apache.tika.eval.app.db.H2Util; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; -import org.apache.tika.eval.app.db.H2Util; - public class ResultsReporterTest { private Path configFile; private Path tmpDir; @@ -37,10 +34,7 @@ public class ResultsReporterTest { @BeforeEach public void setUp() throws Exception { - configFile = Paths.get(this - .getClass() - .getResource("/reports.xml") - .toURI()); + configFile = Paths.get(this.getClass().getResource("/reports.xml").toURI()); tmpDir = Files.createTempDirectory("tika-eval-report-test-"); connection = new H2Util(tmpDir.resolve(dbName)).getConnection(); diff --git a/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/tools/TopCommonTokenCounterTest.java b/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/tools/TopCommonTokenCounterTest.java index e79fabc320..a9ec2ffbac 100644 --- a/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/tools/TopCommonTokenCounterTest.java +++ 
b/tika-eval/tika-eval-app/src/test/java/org/apache/tika/eval/app/tools/TopCommonTokenCounterTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.eval.app.tools; @@ -26,13 +24,12 @@ import java.util.List; import org.apache.commons.io.FileUtils; +import org.apache.tika.TikaTest; +import org.apache.tika.utils.ProcessUtils; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.io.TempDir; -import org.apache.tika.TikaTest; -import org.apache.tika.utils.ProcessUtils; - public class TopCommonTokenCounterTest extends TikaTest { private final static String INPUT_FILE = "lang_file.txt"; private final static String COMMON_TOKENS_FILE = "common_tokens"; @@ -42,11 +39,13 @@ public class TopCommonTokenCounterTest extends TikaTest { @BeforeAll public static void setUp() throws Exception { - String[] docs = - new String[]{"th quick brown fox", "jumped over th brown lazy", "brown lazy fox", "\u666e\u6797\u65af\u987f\u5927\u5b66", "\u666e\u6797\u65af\u987f\u5927\u5b66"}; + String[] docs = new String[] {"th quick brown fox", "jumped over th brown lazy", + "brown lazy fox", "\u666e\u6797\u65af\u987f\u5927\u5b66", + "\u666e\u6797\u65af\u987f\u5927\u5b66"}; - try (BufferedWriter writer = Files.newBufferedWriter(WORKING_DIR.resolve(INPUT_FILE), StandardCharsets.UTF_8)) { - //do this 10 times to bump the numbers above the TopCommonTokenCounter's MIN_DOC_FREQ + try (BufferedWriter writer = Files.newBufferedWriter(WORKING_DIR.resolve(INPUT_FILE), + StandardCharsets.UTF_8)) { + // do this 10 times to bump the numbers above the TopCommonTokenCounter's MIN_DOC_FREQ for (int i = 0; i < 10; i++) { for (String d : docs) { writer.write(d); @@ -55,21 +54,18 @@ public static void setUp() throws Exception { } writer.flush(); } - TopCommonTokenCounter.main(new String[]{ProcessUtils.escapeCommandLine(WORKING_DIR - .resolve(COMMON_TOKENS_FILE) - .toAbsolutePath() - .toString()), ProcessUtils.escapeCommandLine(WORKING_DIR - .resolve(INPUT_FILE) - .toAbsolutePath() - .toString())}); + TopCommonTokenCounter.main(new String[] { + 
ProcessUtils.escapeCommandLine(WORKING_DIR.resolve(COMMON_TOKENS_FILE) + .toAbsolutePath().toString()), + ProcessUtils.escapeCommandLine(WORKING_DIR.resolve(INPUT_FILE) + .toAbsolutePath().toString())}); } @Test public void testSimple() throws Exception { - List rows = FileUtils.readLines(WORKING_DIR - .resolve(COMMON_TOKENS_FILE) - .toFile(), StandardCharsets.UTF_8); + List rows = FileUtils.readLines(WORKING_DIR.resolve(COMMON_TOKENS_FILE).toFile(), + StandardCharsets.UTF_8); List tokens = new ArrayList<>(); for (String row : rows) { if (!row.startsWith("#")) { @@ -78,9 +74,9 @@ public void testSimple() throws Exception { } assertEquals("brown", tokens.get(2)); assertEquals("lazy", tokens.get(4)); - assertNotContained("th", tokens);//3 char word should be dropped - assertNotContained("\u987f\u5927\u5b66", tokens);//cjk trigram should not be included - assertNotContained("\u5b66", tokens);//cjk unigram should not be included - assertContains("\u5927\u5b66", tokens);//cjk bigrams only + assertNotContained("th", tokens);// 3 char word should be dropped + assertNotContained("\u987f\u5927\u5b66", tokens);// cjk trigram should not be included + assertNotContained("\u5b66", tokens);// cjk unigram should not be included + assertContains("\u5927\u5b66", tokens);// cjk bigrams only } } diff --git a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/langid/LanguageIDWrapper.java b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/langid/LanguageIDWrapper.java index 5f14de90c4..7e14e6536c 100644 --- a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/langid/LanguageIDWrapper.java +++ b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/langid/LanguageIDWrapper.java @@ -1,23 +1,20 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.eval.core.langid; import java.util.List; - import org.apache.tika.eval.core.textstats.StringStatsCalculator; import org.apache.tika.langdetect.opennlp.OpenNLPDetector; import org.apache.tika.language.detect.LanguageResult; @@ -26,8 +23,7 @@ public class LanguageIDWrapper implements StringStatsCalculator probabilities = - (List) results.get(LanguageIDWrapper.class); + (List) results.get(LanguageIDWrapper.class); if (probabilities.size() > 0) { metadata.set(LANGUAGE, probabilities.get(0).getLanguage()); metadata.set(LANGUAGE_CONFIDENCE, probabilities.get(0).getRawScore()); diff --git a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/BasicTokenCountStatsCalculator.java b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/BasicTokenCountStatsCalculator.java index 95a6fa99af..a9b76d6343 100644 --- a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/BasicTokenCountStatsCalculator.java +++ b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/BasicTokenCountStatsCalculator.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.eval.core.textstats; diff --git a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/BytesRefCalculator.java b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/BytesRefCalculator.java index aa2b206fe4..b11bdb1a16 100644 --- a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/BytesRefCalculator.java +++ b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/BytesRefCalculator.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. 
The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.eval.core.textstats; diff --git a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/CommonTokens.java b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/CommonTokens.java index dbdd4a67dd..e0e814277d 100644 --- a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/CommonTokens.java +++ b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/CommonTokens.java @@ -1,28 +1,24 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. 
See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.eval.core.textstats; import java.util.List; import java.util.Map; import java.util.Set; - import org.apache.commons.lang3.mutable.MutableInt; import org.apache.commons.lang3.tuple.Pair; - import org.apache.tika.eval.core.tokens.AlphaIdeographFilterFactory; import org.apache.tika.eval.core.tokens.CommonTokenCountManager; import org.apache.tika.eval.core.tokens.CommonTokenResult; @@ -45,7 +41,7 @@ public CommonTokens(CommonTokenCountManager mgr) { @Override public CommonTokenResult calculate(List languages, TokenCounts tokenCounts) { Pair pair = - commonTokenCountManager.getLangTokens(languages.get(0).getLanguage()); + commonTokenCountManager.getLangTokens(languages.get(0).getLanguage()); String actualLangCode = pair.getKey(); Set commonTokens = pair.getValue().getTokens(); int numUniqueCommonTokens = 0; @@ -66,6 +62,6 @@ public CommonTokenResult calculate(List languages, TokenCounts t } return new CommonTokenResult(actualLangCode, numUniqueCommonTokens, numCommonTokens, - numUniqueAlphabeticTokens, numAlphabeticTokens); + numUniqueAlphabeticTokens, numAlphabeticTokens); } } diff --git a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/CommonTokensBhattacharyya.java b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/CommonTokensBhattacharyya.java index f23c17252a..19c80225e9 100644 --- a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/CommonTokensBhattacharyya.java +++ b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/CommonTokensBhattacharyya.java @@ -1,28 +1,24 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.eval.core.textstats; import java.util.List; import java.util.Map; - import org.apache.commons.lang3.mutable.MutableInt; import org.apache.commons.lang3.tuple.Pair; import org.apache.commons.math3.util.FastMath; - import org.apache.tika.eval.core.tokens.CommonTokenCountManager; import org.apache.tika.eval.core.tokens.LangModel; import org.apache.tika.eval.core.tokens.TokenCounts; @@ -39,7 +35,7 @@ public CommonTokensBhattacharyya(CommonTokenCountManager mgr) { @Override public Double calculate(List languages, TokenCounts tokenCounts) { Pair pair = - commonTokenCountManager.getLangTokens(languages.get(0).getLanguage()); + commonTokenCountManager.getLangTokens(languages.get(0).getLanguage()); LangModel model = pair.getValue(); double sum = 0.0; if (tokenCounts.getTokens().entrySet().size() == 0) { @@ -47,7 +43,7 @@ public Double calculate(List languages, TokenCounts tokenCounts) } for (Map.Entry e : tokenCounts.getTokens().entrySet()) { double p = (double) e.getValue().intValue() / (double) tokenCounts.getTotalTokens(); - if (p == 0.0) { //shouldn't happen, but be defensive + if (p == 0.0) { // shouldn't happen, but be defensive continue; } double q = model.getProbability(e.getKey()); diff --git a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/CommonTokensCosine.java b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/CommonTokensCosine.java index d0b2752493..3ca21b5a89 100644 --- a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/CommonTokensCosine.java +++ b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/CommonTokensCosine.java @@ -1,29 +1,25 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.eval.core.textstats; import java.util.Collection; import java.util.List; import java.util.Map; - import org.apache.commons.lang3.mutable.MutableInt; import org.apache.commons.lang3.tuple.Pair; import org.apache.commons.math3.util.FastMath; - import org.apache.tika.eval.core.tokens.CommonTokenCountManager; import org.apache.tika.eval.core.tokens.LangModel; import org.apache.tika.eval.core.tokens.TokenCounts; @@ -40,7 +36,7 @@ public CommonTokensCosine(CommonTokenCountManager mgr) { @Override public Double calculate(List languages, TokenCounts tokenCounts) { Pair pair = - commonTokenCountManager.getLangTokens(languages.get(0).getLanguage()); + commonTokenCountManager.getLangTokens(languages.get(0).getLanguage()); LangModel model = pair.getValue(); double kl = 0.0; if (tokenCounts.getTokens().entrySet().size() == 0) { diff --git a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/CommonTokensHellinger.java b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/CommonTokensHellinger.java index cbbcacc46d..7e0e661ae6 100644 --- a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/CommonTokensHellinger.java +++ b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/CommonTokensHellinger.java @@ -1,28 +1,24 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. 
The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.eval.core.textstats; import java.util.List; import java.util.Map; - import org.apache.commons.lang3.mutable.MutableInt; import org.apache.commons.lang3.tuple.Pair; import org.apache.commons.math3.util.FastMath; - import org.apache.tika.eval.core.tokens.CommonTokenCountManager; import org.apache.tika.eval.core.tokens.LangModel; import org.apache.tika.eval.core.tokens.TokenCounts; @@ -39,7 +35,7 @@ public CommonTokensHellinger(CommonTokenCountManager mgr) { @Override public Double calculate(List languages, TokenCounts tokenCounts) { Pair pair = - commonTokenCountManager.getLangTokens(languages.get(0).getLanguage()); + commonTokenCountManager.getLangTokens(languages.get(0).getLanguage()); LangModel model = pair.getValue(); double sum = 0.0; if (tokenCounts.getTokens().entrySet().size() == 0) { @@ -47,7 +43,7 @@ public Double calculate(List languages, TokenCounts tokenCounts) } for (Map.Entry e : tokenCounts.getTokens().entrySet()) { double p = (double) e.getValue().intValue() / (double) 
tokenCounts.getTotalTokens(); - if (p == 0.0) { //shouldn't happen, but be defensive + if (p == 0.0) { // shouldn't happen, but be defensive continue; } double q = model.getProbability(e.getKey()); diff --git a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/CommonTokensKLDNormed.java b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/CommonTokensKLDNormed.java index ec0a99ba7b..ad0dd5094d 100644 --- a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/CommonTokensKLDNormed.java +++ b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/CommonTokensKLDNormed.java @@ -1,28 +1,24 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
+ * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.eval.core.textstats; import java.util.List; import java.util.Map; - import org.apache.commons.lang3.mutable.MutableInt; import org.apache.commons.lang3.tuple.Pair; import org.apache.commons.math3.util.FastMath; - import org.apache.tika.eval.core.tokens.CommonTokenCountManager; import org.apache.tika.eval.core.tokens.LangModel; import org.apache.tika.eval.core.tokens.TokenCounts; @@ -39,7 +35,7 @@ public CommonTokensKLDNormed(CommonTokenCountManager mgr) { @Override public Double calculate(List languages, TokenCounts tokenCounts) { Pair pair = - commonTokenCountManager.getLangTokens(languages.get(0).getLanguage()); + commonTokenCountManager.getLangTokens(languages.get(0).getLanguage()); LangModel model = pair.getValue(); double kl = 0.0; if (tokenCounts.getTokens().entrySet().size() == 0) { @@ -48,7 +44,7 @@ public Double calculate(List languages, TokenCounts tokenCounts) double worstCase = 0.0; for (Map.Entry e : tokenCounts.getTokens().entrySet()) { double p = (double) e.getValue().intValue() / (double) tokenCounts.getTotalTokens(); - if (p == 0.0) { //shouldn't happen, but be defensive + if (p == 0.0) { // shouldn't happen, but be defensive continue; } double q = model.getProbability(e.getKey()); diff --git a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/CommonTokensKLDivergence.java b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/CommonTokensKLDivergence.java index 59d31aeb99..22871f7150 100644 --- a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/CommonTokensKLDivergence.java +++ 
b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/CommonTokensKLDivergence.java @@ -1,28 +1,24 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.eval.core.textstats; import java.util.List; import java.util.Map; - import org.apache.commons.lang3.mutable.MutableInt; import org.apache.commons.lang3.tuple.Pair; import org.apache.commons.math3.util.FastMath; - import org.apache.tika.eval.core.tokens.CommonTokenCountManager; import org.apache.tika.eval.core.tokens.LangModel; import org.apache.tika.eval.core.tokens.TokenCounts; @@ -39,7 +35,7 @@ public CommonTokensKLDivergence(CommonTokenCountManager mgr) { @Override public Double calculate(List languages, TokenCounts tokenCounts) { Pair pair = - commonTokenCountManager.getLangTokens(languages.get(0).getLanguage()); + commonTokenCountManager.getLangTokens(languages.get(0).getLanguage()); LangModel model = pair.getValue(); double kl = 0.0; if (tokenCounts.getTokens().entrySet().size() == 0) { @@ -47,7 +43,7 @@ public Double calculate(List languages, TokenCounts tokenCounts) } for (Map.Entry e : tokenCounts.getTokens().entrySet()) { double p = (double) e.getValue().intValue() / (double) tokenCounts.getTotalTokens(); - if (p == 0.0) { //shouldn't happen, but be defensive + if (p == 0.0) { // shouldn't happen, but be defensive continue; } double q = model.getProbability(e.getKey()); diff --git a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/CompositeTextStatsCalculator.java b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/CompositeTextStatsCalculator.java index 626e27aaa5..281aefb3e9 100644 --- a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/CompositeTextStatsCalculator.java +++ b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/CompositeTextStatsCalculator.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.eval.core.textstats; @@ -21,13 +19,11 @@ import java.util.HashMap; import java.util.List; import java.util.Map; - import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; import org.apache.lucene.util.BytesRef; - import org.apache.tika.eval.core.langid.LanguageIDWrapper; import org.apache.tika.eval.core.tokens.AnalyzerManager; import org.apache.tika.eval.core.tokens.TokenCounts; @@ -38,22 +34,22 @@ public class CompositeTextStatsCalculator { private static final String FIELD = "f"; private static final int DEFAULT_MAX_TOKENS = 10_000_000; - private final byte[] whitespace = new byte[]{' '}; + private final byte[] whitespace = new byte[] {' '}; private final Analyzer analyzer; private final LanguageIDWrapper languageIDWrapper; private final List languageAwareTokenCountStats = - new ArrayList<>(); + new ArrayList<>(); private final List tokenCountStatCalculators = new ArrayList<>(); private final List stringStatCalculators = new ArrayList<>(); private final List bytesRefCalculators = new ArrayList<>(); public CompositeTextStatsCalculator(List calculators) { this(calculators, AnalyzerManager.newInstance(DEFAULT_MAX_TOKENS).getGeneralAnalyzer(), - new LanguageIDWrapper()); + new LanguageIDWrapper()); } public CompositeTextStatsCalculator(List calculators, Analyzer analyzer, - LanguageIDWrapper languageIDWrapper) { + LanguageIDWrapper languageIDWrapper) { this.analyzer = analyzer; this.languageIDWrapper = languageIDWrapper; for (TextStatsCalculator t : calculators) { @@ -62,22 +58,21 @@ public CompositeTextStatsCalculator(List calculators, Analy } else if (t instanceof LanguageAwareTokenCountStats) { languageAwareTokenCountStats.add((LanguageAwareTokenCountStats) t); if (languageIDWrapper == null) { - throw new IllegalArgumentException("Must specify a LanguageIdWrapper 
" + - "if you want to calculate languageAware stats: " + t.getClass()); + throw new IllegalArgumentException("Must specify a LanguageIdWrapper " + + "if you want to calculate languageAware stats: " + + t.getClass()); } } else if (t instanceof TokenCountStatsCalculator) { tokenCountStatCalculators.add((TokenCountStatsCalculator) t); if (analyzer == null) { - throw new IllegalArgumentException( - "Analyzer must not be null if you are using " + "a TokenCountStats: " + - t.getClass()); + throw new IllegalArgumentException("Analyzer must not be null if you are using " + + "a TokenCountStats: " + t.getClass()); } } else if (t instanceof BytesRefCalculator) { bytesRefCalculators.add((BytesRefCalculator) t); if (analyzer == null) { - throw new IllegalArgumentException( - "Analyzer must not be null if you are using " + - "a BytesRefCalculator: " + t.getClass()); + throw new IllegalArgumentException("Analyzer must not be null if you are using " + + "a BytesRefCalculator: " + t.getClass()); } } else { throw new IllegalArgumentException("I regret I don't yet handle: " + t.getClass()); @@ -92,8 +87,8 @@ public Map calculate(String txt) { } TokenCounts tokenCounts = null; - if (tokenCountStatCalculators.size() > 0 || languageAwareTokenCountStats.size() > 0 || - bytesRefCalculators.size() > 0) { + if (tokenCountStatCalculators.size() > 0 || languageAwareTokenCountStats.size() > 0 + || bytesRefCalculators.size() > 0) { try { tokenCounts = tokenize(txt, results); } catch (IOException e) { @@ -102,8 +97,9 @@ public Map calculate(String txt) { } if (languageAwareTokenCountStats.size() > 0) { - List langs = results.containsKey(LanguageIDWrapper.class) ? - (List) results.get(LanguageIDWrapper.class) : languageIDWrapper.calculate(txt); + List langs = results.containsKey(LanguageIDWrapper.class) + ? 
(List) results.get(LanguageIDWrapper.class) + : languageIDWrapper.calculate(txt); results.put(LanguageIDWrapper.class, langs); for (LanguageAwareTokenCountStats calc : languageAwareTokenCountStats) { results.put(calc.getClass(), calc.calculate(langs, tokenCounts)); diff --git a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/ContentLengthCalculator.java b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/ContentLengthCalculator.java index 1b7985775e..a3e490ab3b 100644 --- a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/ContentLengthCalculator.java +++ b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/ContentLengthCalculator.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.eval.core.textstats; diff --git a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/LanguageAwareTokenCountStats.java b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/LanguageAwareTokenCountStats.java index fdc02fce73..c90180dcd2 100644 --- a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/LanguageAwareTokenCountStats.java +++ b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/LanguageAwareTokenCountStats.java @@ -1,23 +1,20 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.eval.core.textstats; import java.util.List; - import org.apache.tika.eval.core.tokens.TokenCounts; import org.apache.tika.language.detect.LanguageResult; diff --git a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/StringStatsCalculator.java b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/StringStatsCalculator.java index 1efd309ab7..1a8f034b7b 100644 --- a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/StringStatsCalculator.java +++ b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/StringStatsCalculator.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. 
See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.eval.core.textstats; diff --git a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/TextProfileSignature.java b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/TextProfileSignature.java index 9f726c93a9..2b44916d58 100644 --- a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/TextProfileSignature.java +++ b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/TextProfileSignature.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.eval.core.textstats; @@ -20,18 +18,17 @@ import java.util.Comparator; import java.util.List; import java.util.Map; - import org.apache.commons.codec.binary.Base32; import org.apache.commons.codec.digest.DigestUtils; import org.apache.commons.lang3.mutable.MutableInt; - import org.apache.tika.eval.core.tokens.TokenCounts; /** * Copied nearly directly from Apache Nutch: * https://github.com/apache/nutch/blob/master/src/java/org/apache/nutch/crawl/TextProfileSignature.java *

- * See documentation: https://nutch.apache.org/apidocs/apidocs-2.0/org/apache/nutch/crawl/TextProfileSignature.html + * See documentation: + * https://nutch.apache.org/apidocs/apidocs-2.0/org/apache/nutch/crawl/TextProfileSignature.html *

* This returns the base32 encoded sha256 */ @@ -87,9 +84,8 @@ public String calculate(TokenCounts tokenCounts) { } /** - * Be careful -- for CJK languages, the default analyzer uses character - * bigrams. You will "ignore" all cjk language tokens if you set - * minTokenLength > 2! + * Be careful -- for CJK languages, the default analyzer uses character bigrams. You will + * "ignore" all cjk language tokens if you set minTokenLength > 2! * * @param minTokenLength -- include tokens of this length or greater. */ @@ -117,8 +113,7 @@ public String toString() { private class TokenComparator implements Comparator { /** - * Sort tokens first by decreasing frequency and second in lexicographic - * (Unicode) order + * Sort tokens first by decreasing frequency and second in lexicographic (Unicode) order */ public int compare(Token t1, Token t2) { int diffCnt = t2.cnt - t1.cnt; diff --git a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/TextSha256Signature.java b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/TextSha256Signature.java index baa93c7067..a96ca775f1 100644 --- a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/TextSha256Signature.java +++ b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/TextSha256Signature.java @@ -1,23 +1,20 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. 
See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.eval.core.textstats; import java.security.MessageDigest; - import org.apache.commons.codec.binary.Base32; import org.apache.commons.codec.digest.DigestUtils; diff --git a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/TextStatsCalculator.java b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/TextStatsCalculator.java index cdaed21873..986003202c 100644 --- a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/TextStatsCalculator.java +++ b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/TextStatsCalculator.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.eval.core.textstats; diff --git a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/TokenCountPriorityQueue.java b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/TokenCountPriorityQueue.java index 70345a72f5..c9faabb14b 100644 --- a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/TokenCountPriorityQueue.java +++ b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/TokenCountPriorityQueue.java @@ -1,24 +1,21 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
+ * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.eval.core.textstats; import org.apache.lucene.util.PriorityQueue; - import org.apache.tika.eval.core.tokens.TokenIntPair; public class TokenCountPriorityQueue extends PriorityQueue { @@ -39,7 +36,7 @@ protected boolean lessThan(TokenIntPair arg0, TokenIntPair arg1) { public TokenIntPair[] getArray() { TokenIntPair[] topN = new TokenIntPair[size()]; - //now we reverse the queue + // now we reverse the queue TokenIntPair term = pop(); int i = topN.length - 1; while (term != null && i > -1) { diff --git a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/TokenCountStatsCalculator.java b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/TokenCountStatsCalculator.java index c42ff35eb4..07d678194e 100644 --- a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/TokenCountStatsCalculator.java +++ b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/TokenCountStatsCalculator.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. 
The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.eval.core.textstats; diff --git a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/TokenEntropy.java b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/TokenEntropy.java index ebb2d0a5fd..492796d70b 100644 --- a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/TokenEntropy.java +++ b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/TokenEntropy.java @@ -1,24 +1,21 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. 
See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.eval.core.textstats; import org.apache.commons.lang3.mutable.MutableInt; import org.apache.commons.math3.util.FastMath; - import org.apache.tika.eval.core.tokens.TokenCounts; public class TokenEntropy implements TokenCountStatsCalculator { diff --git a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/TokenLengths.java b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/TokenLengths.java index c98eb06d74..70ede82399 100644 --- a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/TokenLengths.java +++ b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/TokenLengths.java @@ -1,26 +1,22 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.eval.core.textstats; import java.util.Map; - import org.apache.commons.lang3.mutable.MutableInt; import org.apache.commons.math3.stat.descriptive.SummaryStatistics; - import org.apache.tika.eval.core.tokens.TokenCounts; public class TokenLengths implements TokenCountStatsCalculator { diff --git a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/TopNTokens.java b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/TopNTokens.java index faf74dea2f..46c1059cf7 100644 --- a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/TopNTokens.java +++ b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/TopNTokens.java @@ -1,26 +1,22 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.eval.core.textstats; import java.util.Map; - import org.apache.commons.lang3.mutable.MutableInt; import org.apache.commons.math3.stat.descriptive.SummaryStatistics; - import org.apache.tika.eval.core.tokens.TokenCounts; import org.apache.tika.eval.core.tokens.TokenIntPair; diff --git a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/UnicodeBlockCounter.java b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/UnicodeBlockCounter.java index c02852e88e..6ff4ae44a6 100644 --- a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/UnicodeBlockCounter.java +++ b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/UnicodeBlockCounter.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. 
The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.eval.core.textstats; @@ -22,7 +20,6 @@ import java.util.Collections; import java.util.HashMap; import java.util.Map; - import org.apache.commons.lang3.mutable.MutableInt; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/AlphaIdeographFilterFactory.java b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/AlphaIdeographFilterFactory.java index cd5476ecb8..c9b39aac78 100644 --- a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/AlphaIdeographFilterFactory.java +++ b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/AlphaIdeographFilterFactory.java @@ -1,24 +1,21 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.eval.core.tokens; import java.io.IOException; import java.util.Map; - import org.apache.lucene.analysis.FilteringTokenFilter; import org.apache.lucene.analysis.TokenFilterFactory; import org.apache.lucene.analysis.TokenStream; @@ -27,7 +24,8 @@ import org.apache.lucene.analysis.tokenattributes.TypeAttribute; /** - * Factory for filter that only allows tokens with characters that "isAlphabetic" or "isIdeographic" through. + * Factory for filter that only allows tokens with characters that "isAlphabetic" or "isIdeographic" + * through. 
*/ public class AlphaIdeographFilterFactory extends TokenFilterFactory { @@ -83,8 +81,8 @@ public AlphaFilter(TokenStream in) { @Override protected boolean accept() throws IOException { String type = typeAtt.type(); - if (type == UAX29URLEmailTokenizer.TOKEN_TYPES[UAX29URLEmailTokenizer.EMOJI] || - type == UAX29URLEmailTokenizer.TOKEN_TYPES[UAX29URLEmailTokenizer.NUM]) { + if (type == UAX29URLEmailTokenizer.TOKEN_TYPES[UAX29URLEmailTokenizer.EMOJI] + || type == UAX29URLEmailTokenizer.TOKEN_TYPES[UAX29URLEmailTokenizer.NUM]) { return false; } return isAlphabetic(termAtt.buffer(), termAtt.length()); diff --git a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/AnalyzerDeserializer.java b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/AnalyzerDeserializer.java index e0a8487f64..fcbe4b12e2 100644 --- a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/AnalyzerDeserializer.java +++ b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/AnalyzerDeserializer.java @@ -1,30 +1,27 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.eval.core.tokens; +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; import java.io.IOException; import java.io.Reader; import java.util.Collections; import java.util.HashMap; import java.util.Map; - -import com.fasterxml.jackson.databind.JsonNode; -import com.fasterxml.jackson.databind.ObjectMapper; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.custom.CustomAnalyzer; import org.apache.lucene.util.ClasspathResourceLoader; @@ -41,13 +38,13 @@ class AnalyzerDeserializer { private static final String COMMENT = "_comment"; public static Map buildAnalyzers(Reader reader, int maxTokens) - throws IOException { + throws IOException { JsonNode root = new ObjectMapper().readTree(reader); Map analyzers = new HashMap<>(); if (!root.isObject() || root.get(ANALYZERS) == null) { throw new IllegalArgumentException( - "root object must be object with an 'analyzers' element"); + "root object must be object with an 'analyzers' element"); } for (Map.Entry e : root.get(ANALYZERS).properties()) { String analyzerName = e.getKey(); @@ -58,14 +55,14 @@ public static Map buildAnalyzers(Reader reader, int 
maxTokens) } public static Analyzer buildAnalyzer(String analyzerName, JsonNode node, int maxTokens) - throws IOException { + throws IOException { if (!node.isObject()) { throw new IllegalArgumentException( - "Expecting map of charfilter, tokenizer, tokenfilters"); + "Expecting map of charfilter, tokenizer, tokenfilters"); } - CustomAnalyzer.Builder builder = - CustomAnalyzer.builder(new ClasspathResourceLoader(AnalyzerDeserializer.class)); + CustomAnalyzer.Builder builder = CustomAnalyzer + .builder(new ClasspathResourceLoader(AnalyzerDeserializer.class)); for (Map.Entry e : node.properties()) { String k = e.getKey(); if (k.equals(CHAR_FILTERS)) { @@ -75,30 +72,31 @@ public static Analyzer buildAnalyzer(String analyzerName, JsonNode node, int max } else if (k.equals(TOKENIZER)) { buildTokenizerFactory(e.getValue(), analyzerName, builder); } else if (!k.equals(COMMENT)) { - throw new IllegalArgumentException( - "Should have one of three values here:" + CHAR_FILTERS + ", " + TOKENIZER + - ", " + TOKEN_FILTERS + ". I don't recognize: " + k); + throw new IllegalArgumentException("Should have one of three values here:" + + CHAR_FILTERS + ", " + TOKENIZER + ", " + TOKEN_FILTERS + + ". 
I don't recognize: " + k); } } return builder.build(); } private static void buildTokenizerFactory(JsonNode map, String analyzerName, - CustomAnalyzer.Builder builder) throws IOException { + CustomAnalyzer.Builder builder) throws IOException { if (!map.isObject()) { - throw new IllegalArgumentException("Expecting a map with \"factory\" string and " + - "\"params\" map in tokenizer factory;" + " not: " + map.toString() + " in " + - analyzerName); + throw new IllegalArgumentException("Expecting a map with \"factory\" string and " + + "\"params\" map in tokenizer factory;" + " not: " + map.toString() + + " in " + analyzerName); } JsonNode factoryEl = map.get(FACTORY); if (factoryEl == null || !factoryEl.isTextual()) { throw new IllegalArgumentException( - "Expecting value for factory in char filter factory builder in:" + - analyzerName); + "Expecting value for factory in char filter factory builder in:" + + analyzerName); } String factoryName = factoryEl.asText(); - factoryName = factoryName.startsWith("oala.") ? - factoryName.replaceFirst("oala.", "org.apache.lucene.analysis.") : factoryName; + factoryName = factoryName.startsWith("oala.") + ? 
factoryName.replaceFirst("oala.", "org.apache.lucene.analysis.") + : factoryName; JsonNode paramsEl = map.get(PARAMS); Map params = mapify(paramsEl); @@ -106,27 +104,27 @@ private static void buildTokenizerFactory(JsonNode map, String analyzerName, } private static void buildCharFilters(JsonNode el, String analyzerName, - CustomAnalyzer.Builder builder) throws IOException { + CustomAnalyzer.Builder builder) throws IOException { if (el == null || el.isNull()) { return; } if (!el.isArray()) { - throw new IllegalArgumentException( - "Expecting array for charfilters, but got:" + el.toString() + " for " + - analyzerName); + throw new IllegalArgumentException("Expecting array for charfilters, but got:" + + el.toString() + " for " + analyzerName); } for (JsonNode filterMap : el) { if (!filterMap.isObject()) { throw new IllegalArgumentException( - "Expecting a map with \"factory\" string and \"params\" map in char filter factory;" + - " not: " + filterMap.toString() + " in " + analyzerName); + "Expecting a map with \"factory\" string and \"params\" map in char filter factory;" + + " not: " + filterMap.toString() + " in " + + analyzerName); } JsonNode factoryEl = filterMap.get(FACTORY); if (factoryEl == null || !factoryEl.isTextual()) { throw new IllegalArgumentException( - "Expecting value for factory in char filter factory builder in:" + - analyzerName); + "Expecting value for factory in char filter factory builder in:" + + analyzerName); } String factoryName = factoryEl.asText(); factoryName = factoryName.replaceAll("oala.", "org.apache.lucene.analysis."); @@ -138,32 +136,32 @@ private static void buildCharFilters(JsonNode el, String analyzerName, } private static void buildTokenFilterFactories(JsonNode el, String analyzerName, int maxTokens, - CustomAnalyzer.Builder builder) - throws IOException { + CustomAnalyzer.Builder builder) throws IOException { if (el == null || el.isNull()) { return; } if (!el.isArray()) { - throw new IllegalArgumentException( - "Expecting array 
for tokenfilters, but got:" + el.toString() + " in " + - analyzerName); + throw new IllegalArgumentException("Expecting array for tokenfilters, but got:" + + el.toString() + " in " + analyzerName); } for (JsonNode filterMap : el) { if (!filterMap.isObject()) { throw new IllegalArgumentException( - "Expecting a map with \"factory\" string and \"params\" map in token filter factory;" + - " not: " + filterMap.toString() + " in " + analyzerName); + "Expecting a map with \"factory\" string and \"params\" map in token filter factory;" + + " not: " + filterMap.toString() + " in " + + analyzerName); } JsonNode factoryEl = filterMap.get(FACTORY); if (factoryEl == null || !factoryEl.isTextual()) { throw new IllegalArgumentException( - "Expecting value for factory in token filter factory builder in " + - analyzerName); + "Expecting value for factory in token filter factory builder in " + + analyzerName); } String factoryName = factoryEl.asText(); - factoryName = factoryName.startsWith("oala.") ? - factoryName.replaceFirst("oala.", "org.apache.lucene.analysis.") : factoryName; + factoryName = factoryName.startsWith("oala.") + ? 
factoryName.replaceFirst("oala.", "org.apache.lucene.analysis.") + : factoryName; JsonNode paramsEl = filterMap.get(PARAMS); Map params = mapify(paramsEl); builder.addTokenFilter(factoryName, params); @@ -188,7 +186,7 @@ private static Map mapify(JsonNode paramsEl) { JsonNode value = e.getValue(); if (value.isObject() || value.isArray() || value.isNull()) { throw new IllegalArgumentException( - "Expecting parameter to have primitive value: " + value.toString()); + "Expecting parameter to have primitive value: " + value.toString()); } String v = e.getValue().asText(); params.put(e.getKey(), v); diff --git a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/AnalyzerManager.java b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/AnalyzerManager.java index f760304fee..533f35848d 100644 --- a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/AnalyzerManager.java +++ b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/AnalyzerManager.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.eval.core.tokens; @@ -22,7 +20,6 @@ import java.io.Reader; import java.nio.charset.StandardCharsets; import java.util.Map; - import org.apache.lucene.analysis.Analyzer; public class AnalyzerManager { @@ -42,7 +39,7 @@ private AnalyzerManager(Analyzer generalAnalyzer, Analyzer commonTokensAnalyzer) public static AnalyzerManager newInstance(int maxTokens) { Map map; try (InputStream is = AnalyzerManager.class.getClassLoader() - .getResourceAsStream("lucene-analyzers.json")) { + .getResourceAsStream("lucene-analyzers.json")) { try (Reader reader = new InputStreamReader(is, StandardCharsets.UTF_8)) { map = AnalyzerDeserializer.buildAnalyzers(reader, maxTokens); } @@ -72,8 +69,8 @@ public Analyzer getGeneralAnalyzer() { } /** - * This analyzer should be used to generate common tokens lists from - * large corpora. It is not used by tika-eval in profiling or comparing. + * This analyzer should be used to generate common tokens lists from large corpora. It is not + * used by tika-eval in profiling or comparing. 
* * @return */ diff --git a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/CJKBigramAwareLengthFilterFactory.java b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/CJKBigramAwareLengthFilterFactory.java index 87d88343a2..139b89d570 100644 --- a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/CJKBigramAwareLengthFilterFactory.java +++ b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/CJKBigramAwareLengthFilterFactory.java @@ -1,24 +1,21 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
+ * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.eval.core.tokens; import java.io.IOException; import java.util.Map; - import org.apache.lucene.analysis.FilteringTokenFilter; import org.apache.lucene.analysis.TokenFilterFactory; import org.apache.lucene.analysis.TokenStream; @@ -27,9 +24,8 @@ import org.apache.lucene.analysis.tokenattributes.TypeAttribute; /** - * Creates a very narrowly focused TokenFilter that limits tokens based on length - * _unless_ they've been identified as <DOUBLE> or <SINGLE> - * by the CJKBigramFilter. + * Creates a very narrowly focused TokenFilter that limits tokens based on length _unless_ they've + * been identified as <DOUBLE> or <SINGLE> by the CJKBigramFilter. *

* This class is intended to be used when generating "common tokens" files. */ @@ -40,6 +36,7 @@ public class CJKBigramAwareLengthFilterFactory extends TokenFilterFactory { private final int min; private final int max; + public CJKBigramAwareLengthFilterFactory() { min = 3; max = 20; @@ -77,20 +74,12 @@ protected boolean accept() throws IOException { } /* - private static boolean isCJ(int codePoint) { - if ( - (codePoint >= 0x4E00 && codePoint <= 0x9FFF) || - ( codePoint >= 0x3400 && codePoint <= 0x4dbf) || - ( codePoint >= 0x20000 && codePoint <= 0x2a6df) || - ( codePoint >= 0x2A700 && codePoint <= 0x2b73f) || - ( codePoint >= 0x2B740 && codePoint <= 0x2B81F) || - ( codePoint >= 0x2B820 && codePoint <- 0x2CEAF) || - ( codePoint >= 0xF900 && codePoint <= 0xFAFF) || - ( codePoint >= 0x2F800 && codePoint <= 0x2Fa1F) - ) { - return true; - } - return false; - }*/ + * private static boolean isCJ(int codePoint) { if ( (codePoint >= 0x4E00 && codePoint <= + * 0x9FFF) || ( codePoint >= 0x3400 && codePoint <= 0x4dbf) || ( codePoint >= 0x20000 && + * codePoint <= 0x2a6df) || ( codePoint >= 0x2A700 && codePoint <= 0x2b73f) || ( codePoint >= + * 0x2B740 && codePoint <= 0x2B81F) || ( codePoint >= 0x2B820 && codePoint <- 0x2CEAF) || ( + * codePoint >= 0xF900 && codePoint <= 0xFAFF) || ( codePoint >= 0x2F800 && codePoint <= + * 0x2Fa1F) ) { return true; } return false; } + */ } diff --git a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/CommonTokenCountManager.java b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/CommonTokenCountManager.java index 696890d045..ec10c2da9a 100644 --- a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/CommonTokenCountManager.java +++ b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/CommonTokenCountManager.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.eval.core.tokens; @@ -32,7 +30,6 @@ import java.util.concurrent.ConcurrentHashMap; import java.util.regex.Matcher; import java.util.regex.Pattern; - import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.tuple.Pair; import org.slf4j.Logger; @@ -44,8 +41,8 @@ public class CommonTokenCountManager { private static final Charset COMMON_TOKENS_CHARSET = StandardCharsets.UTF_8; private static final String TERM_FREQS = "#SUM_TERM_FREQS\t"; private final Path commonTokensDir; - //if we have no model or if no langid is passed in - //make this configurable + // if we have no model or if no langid is passed in + // make this configurable private final String defaultLangCode; Map commonTokenMap = new ConcurrentHashMap<>(); Set alreadyTriedToLoad = new HashSet<>(); @@ -63,8 +60,8 @@ public CommonTokenCountManager(Path commonTokensDir, String defaultLangCode) { this.commonTokensDir = commonTokensDir; if (!"".equals(defaultLangCode)) { tryToLoad(defaultLangCode); - //if you couldn't load it, make sure to add an empty - //set to prevent npes later + // if you couldn't load it, make sure to add an empty + // set to prevent npes later LangModel langModel = commonTokenMap.get(defaultLangCode); if (langModel == null) { LOG.warn("No common tokens for default language: '" + defaultLangCode + "'"); @@ -78,7 +75,7 @@ public CommonTokenCountManager(Path commonTokensDir, String defaultLangCode) { public Set getTokens(String lang) { return Collections.unmodifiableSet( - new HashSet(commonTokenMap.get(getActualLangCode(lang)).getTokens())); + new HashSet(commonTokenMap.get(getActualLangCode(lang)).getTokens())); } public Set getLangs() { @@ -87,16 +84,15 @@ public Set getLangs() { /** * @param lang - * @return pair of actual language code used and a set of common - * tokens for that language + * @return pair of actual language code used and a set of common tokens for that language */ public Pair getLangTokens(String lang) { String actualLangCode = 
getActualLangCode(lang); return Pair.of(actualLangCode, commonTokenMap.get(actualLangCode)); } - //return langcode for lang that you are actually using - //lazily load the appropriate model + // return langcode for lang that you are actually using + // lazily load the appropriate model private String getActualLangCode(String langCode) { if (langCode == null || "".equals(langCode)) { return defaultLangCode; @@ -121,8 +117,8 @@ private synchronized void tryToLoad(String langCode) { if (alreadyTriedToLoad.contains(langCode)) { return; } - //check once more now that we're in a - //synchronized block + // check once more now that we're in a + // synchronized block if (commonTokenMap.get(langCode) != null) { return; } @@ -141,17 +137,17 @@ private synchronized void tryToLoad(String langCode) { if (is == null) { - String path = (p == null) ? "resource on class path: /common_tokens/" + langCode : - p.toAbsolutePath().toString(); - LOG.warn("Couldn't find common tokens file for: '" + langCode + "' tried here: " + - path); + String path = (p == null) ? "resource on class path: /common_tokens/" + langCode + : p.toAbsolutePath().toString(); + LOG.warn("Couldn't find common tokens file for: '" + langCode + "' tried here: " + + path); alreadyTriedToLoad.add(langCode); return; } LangModel model = null; - try (BufferedReader reader = new BufferedReader( - new InputStreamReader(is, COMMON_TOKENS_CHARSET))) { + try (BufferedReader reader = + new BufferedReader(new InputStreamReader(is, COMMON_TOKENS_CHARSET))) { alreadyTriedToLoad.add(langCode); String line = reader.readLine(); while (line != null) { @@ -166,18 +162,18 @@ private synchronized void tryToLoad(String langCode) { line = reader.readLine(); continue; } - //allow language models with, e.g. tab-delimited counts after the term + // allow language models with, e.g. 
tab-delimited counts after the term String[] cols = line.split("\t"); String t = cols[0].trim(); if (t.length() > 0 && cols.length > 2) { if (model == null) { throw new IllegalArgumentException( - "Common tokens file must have included comment line " + - " with " + TERM_FREQS); + "Common tokens file must have included comment line " + + " with " + TERM_FREQS); } - //document frequency + // document frequency String df = cols[1]; - //token frequency + // token frequency long tf = Long.parseLong(cols[2]); model.add(t, tf); } diff --git a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/CommonTokenResult.java b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/CommonTokenResult.java index 2fb26ab067..b603c966dd 100644 --- a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/CommonTokenResult.java +++ b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/CommonTokenResult.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.eval.core.tokens; @@ -20,13 +18,13 @@ public class CommonTokenResult { private final String langCode; - private final int uniqueCommonTokens;//types + private final int uniqueCommonTokens;// types private final int commonTokens; private final int uniqueAlphabeticTokens; private final int alphabeticTokens; public CommonTokenResult(String langCode, int uniqueCommonTokens, int commonTokens, - int uniqueAlphabeticTokens, int alphabeticTokens) { + int uniqueAlphabeticTokens, int alphabeticTokens) { this.langCode = langCode; this.uniqueCommonTokens = uniqueCommonTokens; this.commonTokens = commonTokens; @@ -63,8 +61,8 @@ public int getUniqueAlphabeticTokens() { } /** - * @return number of tokens that had at least one alphabetic/ideographic character - * whether or not a common token + * @return number of tokens that had at least one alphabetic/ideographic character whether or + * not a common token */ public int getAlphabeticTokens() { return alphabeticTokens; diff --git a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/ContrastStatistics.java b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/ContrastStatistics.java index 
3331ea8296..40b242cdf8 100644 --- a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/ContrastStatistics.java +++ b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/ContrastStatistics.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.eval.core.tokens; diff --git a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/LangModel.java b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/LangModel.java index 2c7f889b5b..f9b9a34e1d 100644 --- a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/LangModel.java +++ b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/LangModel.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. 
See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.eval.core.tokens; diff --git a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/TokenContraster.java b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/TokenContraster.java index b61862511e..761aa7fc64 100644 --- a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/TokenContraster.java +++ b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/TokenContraster.java @@ -1,24 +1,21 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
+ * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.eval.core.tokens; import java.util.Map; - import org.apache.commons.lang3.mutable.MutableInt; import org.apache.lucene.util.PriorityQueue; @@ -49,7 +46,7 @@ public class TokenContraster { public ContrastStatistics calculateContrastStatistics(TokenCounts tokensA, - TokenCounts tokensB) { + TokenCounts tokensB) { reset(); this.tokensA = tokensA; this.tokensB = tokensB; @@ -118,8 +115,8 @@ private void finishComputing() { long sumUniqTokens = tokensA.getTotalUniqueTokens() + tokensB.getTotalUniqueTokens(); diceCoefficient = (double) diceCoefficientNum / (double) sumUniqTokens; - overlap = - (float) overlapNum / (double) (tokensA.getTotalTokens() + tokensB.getTotalTokens()); + overlap = (float) overlapNum + / (double) (tokensA.getTotalTokens() + tokensB.getTotalTokens()); } @@ -155,7 +152,7 @@ protected boolean lessThan(TokenCountDiff arg0, TokenCountDiff arg1) { public TokenIntPair[] getArray() { TokenIntPair[] topN = new TokenIntPair[size()]; - //now we reverse the queue + // now we reverse the queue TokenCountDiff token = pop(); int i = topN.length - 1; while (token != null && i > -1) { diff --git a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/TokenCountPriorityQueue.java b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/TokenCountPriorityQueue.java index d2b0c90f25..1aa6907e14 100644 --- a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/TokenCountPriorityQueue.java +++ b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/TokenCountPriorityQueue.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under 
one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.eval.core.tokens; @@ -37,7 +35,7 @@ protected boolean lessThan(TokenIntPair arg0, TokenIntPair arg1) { public TokenIntPair[] getArray() { TokenIntPair[] topN = new TokenIntPair[size()]; - //now we reverse the queue + // now we reverse the queue TokenIntPair term = pop(); int i = topN.length - 1; while (term != null && i > -1) { diff --git a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/TokenCounter.java b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/TokenCounter.java index bd9cf9c3a1..41f44644bb 100644 --- a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/TokenCounter.java +++ b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/TokenCounter.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.eval.core.tokens; @@ -20,14 +18,12 @@ import java.util.Collections; import java.util.HashMap; import java.util.Map; - import org.apache.commons.lang3.mutable.MutableInt; import org.apache.commons.math3.stat.descriptive.SummaryStatistics; import org.apache.commons.math3.util.FastMath; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; - import org.apache.tika.eval.core.textstats.CompositeTextStatsCalculator; import org.apache.tika.eval.core.textstats.TokenCountPriorityQueue; import org.apache.tika.eval.core.textstats.TokenEntropy; @@ -35,19 +31,17 @@ import org.apache.tika.eval.core.textstats.TopNTokens; /** - * @deprecated use {@link CompositeTextStatsCalculator} - * with {@link TokenEntropy}, - * {@link TokenLengths} - * and {@link TopNTokens}. + * @deprecated use {@link CompositeTextStatsCalculator} with {@link TokenEntropy}, + * {@link TokenLengths} and {@link TopNTokens}. 
*/ @Deprecated public class TokenCounter { private final TokenStatistics NULL_TOKEN_STAT = - new TokenStatistics(0, 0, new TokenIntPair[0], 0.0d, new SummaryStatistics()); + new TokenStatistics(0, 0, new TokenIntPair[0], 0.0d, new SummaryStatistics()); private final Analyzer generalAnalyzer; - Map> map = new HashMap<>(); //Map> + Map> map = new HashMap<>(); // Map> Map tokenStatistics = new HashMap<>(); private int topN = 10; @@ -87,7 +81,7 @@ private void _add(String field, Analyzer analyzer, String content) throws IOExce double base = 2.0; org.apache.tika.eval.core.textstats.TokenCountPriorityQueue queue = - new TokenCountPriorityQueue(topN); + new TokenCountPriorityQueue(topN); SummaryStatistics summaryStatistics = new SummaryStatistics(); for (Map.Entry e : tokenMap.entrySet()) { @@ -109,15 +103,13 @@ private void _add(String field, Analyzer analyzer, String content) throws IOExce ent = (-1.0d / (double) totalTokens) * ent; } -/* Collections.sort(allTokens); - List topNList = new ArrayList<>(topN); - for (int i = 0; i < topN && i < allTokens.size(); i++) { - topNList.add(allTokens.get(i)); - }*/ + /* + * Collections.sort(allTokens); List topNList = new ArrayList<>(topN); for + * (int i = 0; i < topN && i < allTokens.size(); i++) { topNList.add(allTokens.get(i)); } + */ - tokenStatistics.put(field, - new TokenStatistics(totalUniqueTokens, totalTokens, queue.getArray(), ent, - summaryStatistics)); + tokenStatistics.put(field, new TokenStatistics(totalUniqueTokens, totalTokens, + queue.getArray(), ent, summaryStatistics)); } diff --git a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/TokenCounts.java b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/TokenCounts.java index 8b420696c0..dc8e86f7ec 100644 --- a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/TokenCounts.java +++ b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/TokenCounts.java @@ -1,24 +1,21 @@ /* - * 
Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.eval.core.tokens; import java.util.HashMap; import java.util.Map; - import org.apache.commons.lang3.mutable.MutableInt; public class TokenCounts { diff --git a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/TokenIntPair.java b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/TokenIntPair.java index 046f201173..a88760341e 100644 --- a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/TokenIntPair.java +++ b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/TokenIntPair.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
+ * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.eval.core.tokens; diff --git a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/TokenStatistics.java b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/TokenStatistics.java index 2fa37eda21..2b178c933a 100644 --- a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/TokenStatistics.java +++ b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/TokenStatistics.java @@ -1,23 +1,20 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.eval.core.tokens; import java.util.Arrays; - import org.apache.commons.math3.stat.descriptive.SummaryStatistics; @@ -30,7 +27,7 @@ public class TokenStatistics { private final SummaryStatistics summaryStatistics; public TokenStatistics(int totalUniqueTokens, int totalTokens, TokenIntPair[] topN, - double entropy, SummaryStatistics summaryStatistics) { + double entropy, SummaryStatistics summaryStatistics) { this.totalUniqueTokens = totalUniqueTokens; this.totalTokens = totalTokens; this.topN = topN; @@ -69,9 +66,9 @@ public SummaryStatistics getSummaryStatistics() { @Override public String toString() { - return "TokenStatistics{" + "totalTokens=" + totalTokens + ", totalUniqueTokens=" + - totalUniqueTokens + ", topN=" + Arrays.toString(topN) + ", entropy=" + entropy + - ", summaryStatistics=" + summaryStatistics + '}'; + return "TokenStatistics{" + "totalTokens=" + totalTokens + ", totalUniqueTokens=" + + totalUniqueTokens + ", topN=" + Arrays.toString(topN) + ", entropy=" + + entropy + ", summaryStatistics=" + summaryStatistics + '}'; } @Override @@ -105,11 +102,11 @@ public boolean equals(Object o) { return false; } - //if both have n==0, don't bother with the stats + // if both have n==0, don't bother with the stats if (summaryStatistics.getN() == 0L) { return true; } - //TODO: consider adding others... + // TODO: consider adding others... 
if (!doubleEquals(summaryStatistics.getGeometricMean(), thatS.getGeometricMean())) { return false; } diff --git a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/URLEmailNormalizingFilterFactory.java b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/URLEmailNormalizingFilterFactory.java index 4ab2672d14..170e6e57a3 100644 --- a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/URLEmailNormalizingFilterFactory.java +++ b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/URLEmailNormalizingFilterFactory.java @@ -1,24 +1,21 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
+ * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.eval.core.tokens; import java.io.IOException; import java.util.Map; - import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenFilterFactory; import org.apache.lucene.analysis.TokenStream; @@ -27,11 +24,10 @@ import org.apache.lucene.analysis.tokenattributes.TypeAttribute; /** - * Factory for filter that normalizes urls and emails to __url__ and __email__ - * respectively. WARNING:This will not work correctly unless the - * {@link UAX29URLEmailTokenizer} is used! This must be run _before_ the - * {@link AlphaIdeographFilterFactory}, or else the emails/urls will already - * be removed! + * Factory for filter that normalizes urls and emails to __url__ and __email__ respectively. + * WARNING:This will not work correctly unless the {@link UAX29URLEmailTokenizer} is used! + * This must be run _before_ the {@link AlphaIdeographFilterFactory}, or else the emails/urls will + * already be removed! 
*/ public class URLEmailNormalizingFilterFactory extends TokenFilterFactory { @@ -45,6 +41,7 @@ public class URLEmailNormalizingFilterFactory extends TokenFilterFactory { public URLEmailNormalizingFilterFactory() { super(); } + public URLEmailNormalizingFilterFactory(Map args) { super(args); } @@ -73,12 +70,12 @@ public boolean incrementToken() throws IOException { if (!input.incrementToken()) { return false; } - //== is actually substantially faster than .equals(String) + // == is actually substantially faster than .equals(String) if (typeAtt.type() == UAX29URLEmailTokenizer.TOKEN_TYPES[UAX29URLEmailTokenizer.URL]) { termAtt.copyBuffer(URL_CHARS, 0, URL_CHARS.length); termAtt.setLength(URL_CHARS.length); - } else if (typeAtt.type() == - UAX29URLEmailTokenizer.TOKEN_TYPES[UAX29URLEmailTokenizer.EMAIL]) { + } else if (typeAtt + .type() == UAX29URLEmailTokenizer.TOKEN_TYPES[UAX29URLEmailTokenizer.EMAIL]) { termAtt.copyBuffer(EMAIL_CHARS, 0, EMAIL_CHARS.length); termAtt.setLength(EMAIL_CHARS.length); } diff --git a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/util/ContentTagParser.java b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/util/ContentTagParser.java index c3eecc252c..dc10bc65af 100644 --- a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/util/ContentTagParser.java +++ b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/util/ContentTagParser.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. 
See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.eval.core.util; @@ -25,7 +23,10 @@ import java.util.Map; import java.util.Set; import javax.xml.XMLConstants; - +import org.apache.tika.exception.TikaException; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.ToTextContentHandler; +import org.apache.tika.utils.XMLReaderUtils; import org.jsoup.Jsoup; import org.jsoup.nodes.Attribute; import org.jsoup.nodes.DataNode; @@ -39,30 +40,25 @@ import org.xml.sax.SAXException; import org.xml.sax.helpers.AttributesImpl; -import org.apache.tika.exception.TikaException; -import org.apache.tika.parser.ParseContext; -import org.apache.tika.sax.ToTextContentHandler; -import org.apache.tika.utils.XMLReaderUtils; - public class ContentTagParser { private static final ParseContext EMPTY_PARSE_CONTEXT = new ParseContext(); public static ContentTags parseXML(String html, Set uppercaseTagsOfInterest) - throws TikaException, IOException, SAXException { + throws TikaException, IOException, SAXException { Map tags = new HashMap<>(); XHTMLContentTagHandler xhtmlContentTagHandler = - new XHTMLContentTagHandler(uppercaseTagsOfInterest, tags); - XMLReaderUtils.parseSAX(new StringReader(html), - xhtmlContentTagHandler, EMPTY_PARSE_CONTEXT); + new XHTMLContentTagHandler(uppercaseTagsOfInterest, tags); + XMLReaderUtils.parseSAX(new StringReader(html), xhtmlContentTagHandler, + EMPTY_PARSE_CONTEXT); return new ContentTags(xhtmlContentTagHandler.toString(), tags); } public static ContentTags parseHTML(String html, Set uppercaseTagsOfInterest) - throws SAXException, IOException { + throws SAXException, IOException { Map tags = new HashMap<>(); XHTMLContentTagHandler xhtmlContentTagHandler = - new XHTMLContentTagHandler(uppercaseTagsOfInterest, tags); + new XHTMLContentTagHandler(uppercaseTagsOfInterest, tags); Document document = Jsoup.parse(html); NodeTraversor.filter(new TikaNodeFilter(xhtmlContentTagHandler), document); @@ -79,7 +75,7 @@ private TikaNodeFilter(ContentHandler handler) { @Override 
public NodeFilter.FilterResult head(Node node, int i) { - //skip document fragment + // skip document fragment if ("html".equals(node.nodeName())) { ignore = false; } @@ -100,8 +96,8 @@ public NodeFilter.FilterResult head(Node node, int i) { } return NodeFilter.FilterResult.CONTINUE; } else if (node instanceof DataNode) { - //maybe handle script data directly here instead of - //passing it through to the HTMLHandler? + // maybe handle script data directly here instead of + // passing it through to the HTMLHandler? String txt = ((DataNode) node).getWholeData(); if (txt != null) { char[] chars = txt.toCharArray(); @@ -116,12 +112,11 @@ public NodeFilter.FilterResult head(Node node, int i) { return NodeFilter.FilterResult.CONTINUE; } AttributesImpl attributes = new AttributesImpl(); - Iterator jsoupAttrs = node - .attributes() - .iterator(); + Iterator jsoupAttrs = node.attributes().iterator(); while (jsoupAttrs.hasNext()) { Attribute jsoupAttr = jsoupAttrs.next(); - attributes.addAttribute("", jsoupAttr.getKey(), jsoupAttr.getKey(), "", jsoupAttr.getValue()); + attributes.addAttribute("", jsoupAttr.getKey(), jsoupAttr.getKey(), "", + jsoupAttr.getValue()); } try { handler.startElement("", node.nodeName(), node.nodeName(), attributes); @@ -165,22 +160,22 @@ SAXException getWrapped() { } private static class XHTMLContentTagHandler extends ToTextContentHandler { - //Used to have a stack to make sure that starting/ending tags were matched - //However, this was a non-starter because tag soup fixes non-matching tags for html - //and the straight SAXParser throws an exception for mismatched tags in xml + // Used to have a stack to make sure that starting/ending tags were matched + // However, this was a non-starter because tag soup fixes non-matching tags for html + // and the straight SAXParser throws an exception for mismatched tags in xml private final Map tags; private final Set uppercaseTagsOfInterest; public XHTMLContentTagHandler(Set uppercaseTagsOfInterest, - Map 
tags) { + Map tags) { this.uppercaseTagsOfInterest = uppercaseTagsOfInterest; this.tags = tags; } @Override public void startElement(String uri, String localName, String qName, Attributes atts) - throws SAXException { + throws SAXException { super.startElement(uri, localName, qName, atts); String uc = (qName == null) ? "" : qName.toUpperCase(Locale.ENGLISH); if (uppercaseTagsOfInterest.contains(uc)) { diff --git a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/util/ContentTags.java b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/util/ContentTags.java index 6c34c2a390..fb5ea5095c 100644 --- a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/util/ContentTags.java +++ b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/util/ContentTags.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.eval.core.util; diff --git a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/util/EvalExceptionUtils.java b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/util/EvalExceptionUtils.java index 3b94547184..70286324bd 100644 --- a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/util/EvalExceptionUtils.java +++ b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/util/EvalExceptionUtils.java @@ -1,34 +1,30 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.eval.core.util; import java.util.regex.Matcher; import java.util.regex.Pattern; - import org.apache.commons.lang3.StringUtils; - import org.apache.tika.utils.ExceptionUtils; public class EvalExceptionUtils { - //these remove runtime info from the stacktraces so - //that actual causes can be counted. + // these remove runtime info from the stacktraces so + // that actual causes can be counted. private final static Pattern CAUSED_BY_SNIPPER = - Pattern.compile("(Caused by: [^:]+):[^\\r\\n]+"); + Pattern.compile("(Caused by: [^:]+):[^\\r\\n]+"); public static String normalize(String stacktrace) { if (StringUtils.isBlank(stacktrace)) { diff --git a/tika-eval/tika-eval-core/src/test/java/org/apache/tika/eval/core/langid/LangIdTest.java b/tika-eval/tika-eval-core/src/test/java/org/apache/tika/eval/core/langid/LangIdTest.java index d2b84ea379..862536a94c 100644 --- a/tika-eval/tika-eval-core/src/test/java/org/apache/tika/eval/core/langid/LangIdTest.java +++ b/tika-eval/tika-eval-core/src/test/java/org/apache/tika/eval/core/langid/LangIdTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.eval.core.langid; @@ -26,21 +24,19 @@ import java.util.HashSet; import java.util.Locale; import java.util.Set; - +import org.apache.tika.eval.core.tokens.CommonTokenCountManager; import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; -import org.apache.tika.eval.core.tokens.CommonTokenCountManager; - public class LangIdTest { @Test - @Disabled("make sure to run this when updating common tokens or the language model" + - "but there's no reason to test this for every build.") + @Disabled("make sure to run this when updating common tokens or the language model" + + "but there's no reason to test this for every build.") public void testCommonTokensCoverage() throws Exception { - //make sure that there is a common tokens file for every - //language + // make sure that there is a common tokens file for every + // language LanguageIDWrapper wrapper = new LanguageIDWrapper(); CommonTokenCountManager commonTokens = new CommonTokenCountManager(null, "eng"); @@ -50,9 +46,9 @@ public void testCommonTokensCoverage() throws Exception { Set tokens = commonTokens.getTokens(lang); if (tokens.size() == 0) { System.out.printf(Locale.US, "missing common tokens for: %s%n", lang); - } else if (tokens.size() < 250) { //ssw has 255 + } else if (tokens.size() < 250) { // ssw has 255 fail(String.format(Locale.US, "common tokens too small (%s) for: %s", tokens.size(), - lang)); + lang)); } } diff --git a/tika-eval/tika-eval-core/src/test/java/org/apache/tika/eval/core/metadata/TikaEvalMetadataFilterTest.java b/tika-eval/tika-eval-core/src/test/java/org/apache/tika/eval/core/metadata/TikaEvalMetadataFilterTest.java index f1fd21c21e..0908cb57d0 100644 --- a/tika-eval/tika-eval-core/src/test/java/org/apache/tika/eval/core/metadata/TikaEvalMetadataFilterTest.java +++ b/tika-eval/tika-eval-core/src/test/java/org/apache/tika/eval/core/metadata/TikaEvalMetadataFilterTest.java @@ -1,37 +1,34 @@ /* - * Licensed to the Apache Software Foundation (ASF) 
under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.eval.core.metadata; import static org.junit.jupiter.api.Assertions.assertEquals; -import org.junit.jupiter.api.Test; - import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.metadata.filter.DefaultMetadataFilter; import org.apache.tika.metadata.filter.MetadataFilter; +import org.junit.jupiter.api.Test; public class TikaEvalMetadataFilterTest { @Test public void testBasic() throws Exception { - for (MetadataFilter filter : new MetadataFilter[]{new TikaEvalMetadataFilter(), - //make sure that the TikaEvalMetadataFilter is loaded automatically - new DefaultMetadataFilter()}) { + for (MetadataFilter filter : new MetadataFilter[] {new TikaEvalMetadataFilter(), + // make sure that the TikaEvalMetadataFilter is loaded automatically + new DefaultMetadataFilter()}) { Metadata metadata = new Metadata(); String content = "the quick brown fox, Zothro 1234 1235, jumped over the lazy dog"; metadata.set(TikaCoreProperties.TIKA_CONTENT, content); @@ -46,13 +43,13 @@ public void testBasic() throws Exception { assertEquals(0.0999, - Double.parseDouble(metadata.get(TikaEvalMetadataFilter.OUT_OF_VOCABULARY)), - 0.1); + Double.parseDouble( + metadata.get(TikaEvalMetadataFilter.OUT_OF_VOCABULARY)), + 0.1); assertEquals("eng", metadata.get(TikaEvalMetadataFilter.LANGUAGE)); - assertEquals(0.0196, - Double.parseDouble(metadata.get(TikaEvalMetadataFilter.LANGUAGE_CONFIDENCE)), - 0.1); + assertEquals(0.0196, Double.parseDouble( + metadata.get(TikaEvalMetadataFilter.LANGUAGE_CONFIDENCE)), 0.1); } } } diff --git a/tika-eval/tika-eval-core/src/test/java/org/apache/tika/eval/core/textstats/TextStatsTest.java b/tika-eval/tika-eval-core/src/test/java/org/apache/tika/eval/core/textstats/TextStatsTest.java index 2ca6138dd9..938ce4f7ec 100644 --- a/tika-eval/tika-eval-core/src/test/java/org/apache/tika/eval/core/textstats/TextStatsTest.java +++ 
b/tika-eval/tika-eval-core/src/test/java/org/apache/tika/eval/core/textstats/TextStatsTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.eval.core.textstats; @@ -22,21 +20,18 @@ import java.util.ArrayList; import java.util.List; import java.util.Map; - import org.apache.commons.codec.binary.Base32; import org.apache.commons.codec.digest.DigestUtils; -import org.junit.jupiter.api.Test; - import org.apache.tika.eval.core.langid.LanguageIDWrapper; import org.apache.tika.eval.core.tokens.CommonTokenResult; import org.apache.tika.language.detect.LanguageResult; +import org.junit.jupiter.api.Test; public class TextStatsTest { @Test public void testBasic() throws Exception { - String txt = - "The quick brown fox &&^&%@! ; ; ; ;;; ;;; 8675309 jumped over tHe lazy wombat"; + String txt = "The quick brown fox &&^&%@! ; ; ; ;;; ;;; 8675309 jumped over tHe lazy wombat"; String txtCleaned = "the quick brown fox 8675309 jumped over the lazy wombat"; List calcs = new ArrayList<>(); calcs.add(new TextProfileSignature()); @@ -63,17 +58,17 @@ public void testBasic() throws Exception { assertEquals(3.12, (double) stats.get(TokenEntropy.class), 0.01); List probabilities = - (List) stats.get(LanguageIDWrapper.class); + (List) stats.get(LanguageIDWrapper.class); assertEquals("eng", probabilities.get(0).getLanguage()); assertEquals(0.02, probabilities.get(1).getRawScore(), 0.01); String textProfileSignature = (String) stats.get(TextProfileSignature.class); assertEquals("XF3W27O7IWOJVVNQ4HLKYYPCPPX3L2M72YSEMZ3WADL4VTXVITIA====", - textProfileSignature); + textProfileSignature); - assertEquals(new Base32() - .encodeAsString(DigestUtils.sha256(txtCleaned.getBytes(StandardCharsets.UTF_8))), - stats.get(TextSha256Signature.class)); + assertEquals(new Base32().encodeAsString( + DigestUtils.sha256(txtCleaned.getBytes(StandardCharsets.UTF_8))), + stats.get(TextSha256Signature.class)); } @Test @@ -87,17 +82,17 @@ public void testCJK() throws Exception { Map stats = calc.calculate(txt); List probabilities = - (List) stats.get(LanguageIDWrapper.class); + (List) stats.get(LanguageIDWrapper.class); 
assertEquals("cmn", probabilities.get(0).getLanguage()); assertEquals(0.009, probabilities.get(1).getRawScore(), 0.01); String textProfileSignature = (String) stats.get(TextProfileSignature.class); assertEquals("XKXLY6FNIGK2KGEF6HOSKSVGYDLLOFIAGO73RLMJ22PZVXBTXFFA====", - textProfileSignature); + textProfileSignature); - //now test that if a user accidentally sets mintoken length > 2 - //the output will the be same as empty text + // now test that if a user accidentally sets mintoken length > 2 + // the output will the be same as empty text calcs.clear(); calcs.add(new TextProfileSignature()); calc = new CompositeTextStatsCalculator(calcs); diff --git a/tika-eval/tika-eval-core/src/test/java/org/apache/tika/eval/core/tokens/LuceneTokenCounter.java b/tika-eval/tika-eval-core/src/test/java/org/apache/tika/eval/core/tokens/LuceneTokenCounter.java index 00487f844d..b5f05766b8 100644 --- a/tika-eval/tika-eval-core/src/test/java/org/apache/tika/eval/core/tokens/LuceneTokenCounter.java +++ b/tika-eval/tika-eval-core/src/test/java/org/apache/tika/eval/core/tokens/LuceneTokenCounter.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.eval.core.tokens; @@ -20,7 +18,6 @@ import java.io.IOException; import java.util.HashMap; import java.util.Map; - import org.apache.commons.math3.stat.descriptive.SummaryStatistics; import org.apache.commons.math3.util.FastMath; import org.apache.lucene.analysis.Analyzer; @@ -32,8 +29,7 @@ import org.apache.lucene.util.BytesRef; /** - * Experimental class uses Lucene's MemoryIndex to effectively build the - * token info. + * Experimental class uses Lucene's MemoryIndex to effectively build the token info. 
*/ public class LuceneTokenCounter { private static final String ALPHA_IDEOGRAPH_SUFFIX = "_a"; @@ -53,10 +49,10 @@ public LuceneTokenCounter(Analyzer generalAnalyzer) throws IOException { public void add(String field, String content) throws IOException { memoryIndex.addField(field, content, generalAnalyzer); - //memoryIndex.addField(field+ALPHA_IDEOGRAPH_SUFFIX, - // content, alphaIdeographAnalyzer); + // memoryIndex.addField(field+ALPHA_IDEOGRAPH_SUFFIX, + // content, alphaIdeographAnalyzer); count(field); - //count(field+ALPHA_IDEOGRAPH_SUFFIX); + // count(field+ALPHA_IDEOGRAPH_SUFFIX); } @@ -75,10 +71,9 @@ void count(String field) throws IOException { Terms terms = leafReader.terms(field); if (terms == null) { - //if there were no terms - fieldStats.put(field, - new TokenStatistics(uniqueTokenCount, tokenCountInt, new TokenIntPair[0], ent, - summStats)); + // if there were no terms + fieldStats.put(field, new TokenStatistics(uniqueTokenCount, tokenCountInt, + new TokenIntPair[0], ent, summStats)); return; } @@ -93,8 +88,8 @@ void count(String field) throws IOException { throw new IllegalArgumentException("Sorry can't handle longs yet"); } int tf = (int) termFreq; - //TODO: figure out how to avoid Stringifying this - //to get codepoint count + // TODO: figure out how to avoid Stringifying this + // to get codepoint count String t = bytesRef.utf8ToString(); int len = t.codePointCount(0, t.length()); for (int i = 0; i < tf; i++) { @@ -114,9 +109,8 @@ void count(String field) throws IOException { ent = (-1.0d / (double) tokenCountInt) * ent; } - fieldStats.put(field, - new TokenStatistics(uniqueTokenCount, tokenCountInt, queue.getArray(), ent, - summStats)); + fieldStats.put(field, new TokenStatistics(uniqueTokenCount, tokenCountInt, queue.getArray(), + ent, summStats)); } public void setTopN(int topN) { @@ -136,52 +130,32 @@ public void clear() { memoryIndex.reset(); fieldStats.clear(); } -/* - public ContrastStatistics contrast(String fieldA, String fieldB) 
throws IOException { - long diceDenom = getUniqueTokenCount(fieldA) + - getUniqueTokenCount(fieldB); - - long diceNum = 0; - long overlapNum = 0; - - Terms termsA = getTerms(fieldA); - Terms termsB = getTerms(fieldB); - - TermsEnum termsEnumA = termsA.iterator(); - TermsEnum termsEnumB = termsB.iterator(); - - BytesRef bytesRefA = termsEnumA.next(); - BytesRef bytesRefB = termsEnumB.next(); - - while (bytesRefA != null) { - int compare = bytesRefA.compareTo(bytesRefB); - while (compare > 0) { - if (bytesRefB == null) { - break; - } - //handle term in B, but not A - - compare = bytesRefA.compareTo(bytesRefB); - bytesRefB = termsEnumB.next(); - } - if (compare == 0) { - diceNum += 2; - overlapNum += 2 * Math.min(termsEnumA.totalTermFreq(), termsEnumB.totalTermFreq()); - } - - bytesRefA = termsEnumA.next(); - } - - - for (PairCount p : tokens.values()) { - if (p.a > 0 && p.b > 0) { - diceNum += 2; - overlapNum += 2 * Math.min(p.a, p.b); - } - } - - float dice = (float) diceNum / (float) diceDenom; - float overlap = (float) overlapNum / (float) (theseTokens.getTokenCount() + thoseTokens.getTokenCount()); - } -*/ + /* + * public ContrastStatistics contrast(String fieldA, String fieldB) throws IOException { long + * diceDenom = getUniqueTokenCount(fieldA) + getUniqueTokenCount(fieldB); + * + * long diceNum = 0; long overlapNum = 0; + * + * Terms termsA = getTerms(fieldA); Terms termsB = getTerms(fieldB); + * + * TermsEnum termsEnumA = termsA.iterator(); TermsEnum termsEnumB = termsB.iterator(); + * + * BytesRef bytesRefA = termsEnumA.next(); BytesRef bytesRefB = termsEnumB.next(); + * + * while (bytesRefA != null) { int compare = bytesRefA.compareTo(bytesRefB); while (compare > 0) + * { if (bytesRefB == null) { break; } //handle term in B, but not A + * + * compare = bytesRefA.compareTo(bytesRefB); bytesRefB = termsEnumB.next(); } if (compare == 0) + * { diceNum += 2; overlapNum += 2 * Math.min(termsEnumA.totalTermFreq(), + * termsEnumB.totalTermFreq()); } + * + * 
bytesRefA = termsEnumA.next(); } + * + * + * for (PairCount p : tokens.values()) { if (p.a > 0 && p.b > 0) { diceNum += 2; overlapNum += 2 + * * Math.min(p.a, p.b); } } + * + * float dice = (float) diceNum / (float) diceDenom; float overlap = (float) overlapNum / + * (float) (theseTokens.getTokenCount() + thoseTokens.getTokenCount()); } + */ } diff --git a/tika-eval/tika-eval-core/src/test/java/org/apache/tika/eval/core/tokens/TokenCounterTest.java b/tika-eval/tika-eval-core/src/test/java/org/apache/tika/eval/core/tokens/TokenCounterTest.java index dc687a2b25..01fea1326b 100644 --- a/tika-eval/tika-eval-core/src/test/java/org/apache/tika/eval/core/tokens/TokenCounterTest.java +++ b/tika-eval/tika-eval-core/src/test/java/org/apache/tika/eval/core/tokens/TokenCounterTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.eval.core.tokens; @@ -24,7 +22,6 @@ import java.util.HashMap; import java.util.Map; import java.util.Random; - import org.apache.commons.lang3.mutable.MutableInt; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; @@ -46,13 +43,12 @@ public static void setUp() throws IOException { @Test public void testBasic() throws Exception { - String s = - " bde cde def abc efg f f f f ghijklmnop a a a a a a a a a a a a a a a a a b b b b b b b b b b b b b"; + String s = " bde cde def abc efg f f f f ghijklmnop a a a a a a a a a a a a a a a a a b b b b b b b b b b b b b"; TokenCounter counter = new TokenCounter(analyzerManager.getGeneralAnalyzer()); counter.add(FIELD, s); TokenStatistics simpleTokenStatistics = counter.getTokenStatistics(FIELD); LuceneTokenCounter tokenCounter = - new LuceneTokenCounter(analyzerManager.getGeneralAnalyzer()); + new LuceneTokenCounter(analyzerManager.getGeneralAnalyzer()); tokenCounter.add(FIELD, s); assertEquals(simpleTokenStatistics, tokenCounter.getTokenStatistics(FIELD)); } @@ -73,7 +69,7 @@ public void testRandom() throws Exception { start = System.currentTimeMillis(); LuceneTokenCounter tokenCounter = - new LuceneTokenCounter(analyzerManager.getGeneralAnalyzer()); + new LuceneTokenCounter(analyzerManager.getGeneralAnalyzer()); tokenCounter.add(FIELD, s); lucene += System.currentTimeMillis() - start; @@ -84,8 +80,7 @@ public void testRandom() throws Exception { @Test public void testCommonTokens() throws Exception { TokenCounter tokenCounter = new 
TokenCounter(analyzerManager.getCommonTokensAnalyzer()); - String s = - "the http://www.cnn.com and blahdeblah@apache.org are in valuable www.sites.org 普林斯顿大学"; + String s = "the http://www.cnn.com and blahdeblah@apache.org are in valuable www.sites.org 普林斯顿大学"; tokenCounter.add(FIELD, s); Map tokens = tokenCounter.getTokens(FIELD); assertEquals(new MutableInt(2), tokens.get("___url___")); diff --git a/tika-eval/tika-eval-core/src/test/java/org/apache/tika/eval/core/util/LanguageIdTest.java b/tika-eval/tika-eval-core/src/test/java/org/apache/tika/eval/core/util/LanguageIdTest.java index 8ae73d1dee..6021673a98 100644 --- a/tika-eval/tika-eval-core/src/test/java/org/apache/tika/eval/core/util/LanguageIdTest.java +++ b/tika-eval/tika-eval-core/src/test/java/org/apache/tika/eval/core/util/LanguageIdTest.java @@ -1,36 +1,32 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.eval.core.util; import static org.junit.jupiter.api.Assertions.assertEquals; import java.util.List; - -import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.Timeout; - import org.apache.tika.eval.core.langid.LanguageIDWrapper; import org.apache.tika.language.detect.LanguageResult; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.Timeout; public class LanguageIdTest { @Test @Timeout(10000) public void testDefenseAgainstBadRegexInOpenNLP() throws Exception { - //TIKA-2777 + // TIKA-2777 StringBuilder sb = new StringBuilder(); for (int i = 0; i < 50000; i++) { sb.append("a"); diff --git a/tika-eval/tika-eval-core/src/test/java/org/apache/tika/eval/core/util/MimeUtilTest.java b/tika-eval/tika-eval-core/src/test/java/org/apache/tika/eval/core/util/MimeUtilTest.java index 63793ba06c..7ac577cb02 100644 --- a/tika-eval/tika-eval-core/src/test/java/org/apache/tika/eval/core/util/MimeUtilTest.java +++ b/tika-eval/tika-eval-core/src/test/java/org/apache/tika/eval/core/util/MimeUtilTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.eval.core.util; @@ -21,13 +19,12 @@ import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertTrue; -import org.junit.jupiter.api.Disabled; -import org.junit.jupiter.api.Test; - import org.apache.tika.config.TikaConfig; import org.apache.tika.mime.MimeType; import org.apache.tika.mime.MimeTypeException; import org.apache.tika.mime.MimeTypes; +import org.junit.jupiter.api.Disabled; +import org.junit.jupiter.api.Test; @Disabled("Fix mimetype.getExtension to work with these and then we can get rid of MimeUtil") public class MimeUtilTest { @@ -59,7 +56,7 @@ private void assertResult(String contentType, String expected) throws MimeTypeEx MimeTypes r = tikaConfig.getMimeRepository(); MimeType mt = r.forName(contentType); -// String ext = MimeUtil.getExtension(contentType, config); + // String ext = MimeUtil.getExtension(contentType, config); assertEquals(expected, mt.getExtension()); } } diff --git a/tika-example/src/main/java/org/apache/tika/example/AdvancedTypeDetector.java b/tika-example/src/main/java/org/apache/tika/example/AdvancedTypeDetector.java index d78199fa61..6b7a472112 100755 --- a/tika-example/src/main/java/org/apache/tika/example/AdvancedTypeDetector.java +++ b/tika-example/src/main/java/org/apache/tika/example/AdvancedTypeDetector.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. 
See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.example; diff --git a/tika-example/src/main/java/org/apache/tika/example/ContentHandlerExample.java b/tika-example/src/main/java/org/apache/tika/example/ContentHandlerExample.java index af95e5e958..6be552eb71 100644 --- a/tika-example/src/main/java/org/apache/tika/example/ContentHandlerExample.java +++ b/tika-example/src/main/java/org/apache/tika/example/ContentHandlerExample.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.example; @@ -22,9 +20,6 @@ import java.util.ArrayList; import java.util.List; -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.AutoDetectParser; @@ -35,17 +30,18 @@ import org.apache.tika.sax.xpath.Matcher; import org.apache.tika.sax.xpath.MatchingContentHandler; import org.apache.tika.sax.xpath.XPathParser; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; /** - * Examples of using different Content Handlers to - * get different parts of the file's contents + * Examples of using different Content Handlers to get different parts of the file's contents */ public class ContentHandlerExample { protected final int MAXIMUM_TEXT_CHUNK_SIZE = 40; /** - * Example of extracting the plain text of the contents. - * Will return only the "body" part of the document + * Example of extracting the plain text of the contents. 
Will return only the "body" part of the + * document */ public String parseToPlainText() throws IOException, SAXException, TikaException { BodyContentHandler handler = new BodyContentHandler(); @@ -73,8 +69,7 @@ public String parseToHTML() throws IOException, SAXException, TikaException { } /** - * Example of extracting just the body as HTML, without the - * head part, as a string + * Example of extracting just the body as HTML, without the head part, as a string */ public String parseBodyToHTML() throws IOException, SAXException, TikaException { ContentHandler handler = new BodyContentHandler(new ToXMLContentHandler()); @@ -88,14 +83,16 @@ public String parseBodyToHTML() throws IOException, SAXException, TikaException } /** - * Example of extracting just one part of the document's body, - * as HTML as a string, excluding the rest + * Example of extracting just one part of the document's body, as HTML as a string, excluding + * the rest */ public String parseOnePartToHTML() throws IOException, SAXException, TikaException { // Only get things under html -> body -> div (class=header) XPathParser xhtmlParser = new XPathParser("xhtml", XHTMLContentHandler.XHTML); - Matcher divContentMatcher = xhtmlParser.parse("/xhtml:html/xhtml:body/xhtml:div/descendant::node()"); - ContentHandler handler = new MatchingContentHandler(new ToXMLContentHandler(), divContentMatcher); + Matcher divContentMatcher = + xhtmlParser.parse("/xhtml:html/xhtml:body/xhtml:div/descendant::node()"); + ContentHandler handler = + new MatchingContentHandler(new ToXMLContentHandler(), divContentMatcher); AutoDetectParser parser = new AutoDetectParser(); Metadata metadata = new Metadata(); @@ -106,8 +103,8 @@ public String parseOnePartToHTML() throws IOException, SAXException, TikaExcepti } /** - * Example of extracting the plain text in chunks, with each chunk - * of no more than a certain maximum size + * Example of extracting the plain text in chunks, with each chunk of no more than a certain + * 
maximum size */ public List parseToPlainTextChunks() throws IOException, SAXException, TikaException { final List chunks = new ArrayList<>(); diff --git a/tika-example/src/main/java/org/apache/tika/example/CustomMimeInfo.java b/tika-example/src/main/java/org/apache/tika/example/CustomMimeInfo.java index c286353ecb..8e6e534874 100755 --- a/tika-example/src/main/java/org/apache/tika/example/CustomMimeInfo.java +++ b/tika-example/src/main/java/org/apache/tika/example/CustomMimeInfo.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. 
See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.example; @@ -35,7 +33,8 @@ public static String customMimeInfo() throws Exception { public static String customCompositeDetector() throws Exception { String path = "file:///path/to/prescription-type.xml"; MimeTypes typeDatabase = MimeTypesFactory.create(new URL(path)); - Tika tika = new Tika(new CompositeDetector(typeDatabase, new EncryptedPrescriptionDetector())); + Tika tika = new Tika( + new CompositeDetector(typeDatabase, new EncryptedPrescriptionDetector())); return tika.detect("/path/to/tmp/prescription.xpd"); } diff --git a/tika-example/src/main/java/org/apache/tika/example/DescribeMetadata.java b/tika-example/src/main/java/org/apache/tika/example/DescribeMetadata.java index 50f7840b24..1830ef8046 100755 --- a/tika-example/src/main/java/org/apache/tika/example/DescribeMetadata.java +++ b/tika-example/src/main/java/org/apache/tika/example/DescribeMetadata.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.example; @@ -24,6 +22,6 @@ */ public class DescribeMetadata { public static void main(String[] args) throws Exception { - TikaCLI.main(new String[]{"--list-met-models"}); + TikaCLI.main(new String[] {"--list-met-models"}); } } diff --git a/tika-example/src/main/java/org/apache/tika/example/DirListParser.java b/tika-example/src/main/java/org/apache/tika/example/DirListParser.java index 2d92763ea0..c6fee202e4 100755 --- a/tika-example/src/main/java/org/apache/tika/example/DirListParser.java +++ b/tika-example/src/main/java/org/apache/tika/example/DirListParser.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. 
See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.example; @@ -27,9 +25,6 @@ import java.util.Set; import org.apache.commons.io.FileUtils; -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.exception.TikaException; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; @@ -37,16 +32,19 @@ import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.sax.BodyContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; /** - * Parses the output of /bin/ls and counts the number of files and the number of - * executables using Tika. + * Parses the output of /bin/ls and counts the number of files and the number of executables using + * Tika. 
*/ public class DirListParser implements Parser { private static final long serialVersionUID = 2717930544410610735L; - private static Set SUPPORTED_TYPES = new HashSet<>(Collections.singletonList(MediaType.TEXT_PLAIN)); + private static Set SUPPORTED_TYPES = + new HashSet<>(Collections.singletonList(MediaType.TEXT_PLAIN)); public static void main(String[] args) throws IOException, SAXException, TikaException { DirListParser parser = new DirListParser(); @@ -60,8 +58,7 @@ public static void main(String[] args) throws IOException, SAXException, TikaExc /* * (non-Javadoc) * - * @see org.apache.tika.parser.Parser#getSupportedTypes( - * org.apache.tika.parser.ParseContext) + * @see org.apache.tika.parser.Parser#getSupportedTypes( org.apache.tika.parser.ParseContext) */ public Set getSupportedTypes(ParseContext context) { return SUPPORTED_TYPES; @@ -70,25 +67,24 @@ public Set getSupportedTypes(ParseContext context) { /* * (non-Javadoc) * - * @see org.apache.tika.parser.Parser#parse(java.io.InputStream, - * org.xml.sax.ContentHandler, org.apache.tika.metadata.Metadata) + * @see org.apache.tika.parser.Parser#parse(java.io.InputStream, org.xml.sax.ContentHandler, + * org.apache.tika.metadata.Metadata) */ - public void parse(InputStream is, ContentHandler handler, Metadata metadata) throws IOException, SAXException, TikaException { + public void parse(InputStream is, ContentHandler handler, Metadata metadata) + throws IOException, SAXException, TikaException { this.parse(is, handler, metadata, new ParseContext()); } /* * (non-Javadoc) * - * @see org.apache.tika.parser.Parser#parse(java.io.InputStream, - * org.xml.sax.ContentHandler, org.apache.tika.metadata.Metadata, - * org.apache.tika.parser.ParseContext) + * @see org.apache.tika.parser.Parser#parse(java.io.InputStream, org.xml.sax.ContentHandler, + * org.apache.tika.metadata.Metadata, org.apache.tika.parser.ParseContext) */ - public void parse(InputStream is, ContentHandler handler, Metadata metadata, ParseContext 
context) throws IOException, SAXException, TikaException { + public void parse(InputStream is, ContentHandler handler, Metadata metadata, + ParseContext context) throws IOException, SAXException, TikaException { - List lines = FileUtils.readLines(TikaInputStream - .get(is) - .getFile(), UTF_8); + List lines = FileUtils.readLines(TikaInputStream.get(is).getFile(), UTF_8); for (String line : lines) { String[] fileToks = line.split("\\s+"); if (fileToks.length < 8) { @@ -111,12 +107,14 @@ public void parse(InputStream is, ContentHandler handler, Metadata metadata, Par fileName.append(" "); } fileName.deleteCharAt(fileName.length() - 1); - this.addMetadata(metadata, filePermissions, numHardLinks, fileOwner, fileOwnerGroup, fileSize, lastModDate.toString(), fileName.toString()); + this.addMetadata(metadata, filePermissions, numHardLinks, fileOwner, fileOwnerGroup, + fileSize, lastModDate.toString(), fileName.toString()); } } - private void addMetadata(Metadata metadata, String filePerms, String numHardLinks, String fileOwner, String fileOwnerGroup, String fileSize, String lastModDate, - String fileName) { + private void addMetadata(Metadata metadata, String filePerms, String numHardLinks, + String fileOwner, String fileOwnerGroup, String fileSize, String lastModDate, + String fileName) { metadata.add("FilePermissions", filePerms); metadata.add("NumHardLinks", numHardLinks); metadata.add("FileOwner", fileOwner); diff --git a/tika-example/src/main/java/org/apache/tika/example/DisplayMetInstance.java b/tika-example/src/main/java/org/apache/tika/example/DisplayMetInstance.java index ecb4072a72..47bacd3cb1 100755 --- a/tika-example/src/main/java/org/apache/tika/example/DisplayMetInstance.java +++ b/tika-example/src/main/java/org/apache/tika/example/DisplayMetInstance.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.example; @@ -20,13 +18,12 @@ import java.io.IOException; import java.net.URL; -import org.xml.sax.SAXException; - import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.pdf.PDFParser; import org.apache.tika.sax.BodyContentHandler; +import org.xml.sax.SAXException; /** * Grabs a PDF file from a URL and prints its {@link Metadata} diff --git a/tika-example/src/main/java/org/apache/tika/example/DumpTikaConfigExample.java b/tika-example/src/main/java/org/apache/tika/example/DumpTikaConfigExample.java index f960a09789..037bc8dcca 100644 --- a/tika-example/src/main/java/org/apache/tika/example/DumpTikaConfigExample.java +++ b/tika-example/src/main/java/org/apache/tika/example/DumpTikaConfigExample.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.example; @@ -30,14 +28,13 @@ /** - * This class shows how to dump a TikaConfig object to a configuration file. - * This allows users to easily dump the default TikaConfig as a base from which - * to start if they want to modify the default configuration file. + * This class shows how to dump a TikaConfig object to a configuration file. This allows users to + * easily dump the default TikaConfig as a base from which to start if they want to modify the + * default configuration file. *

* For those who want to modify the mimes file, take a look at - * tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml - * for inspiration. Consider adding org/apache/tika/mime/custom-mimetypes.xml - * for your custom mime types. + * tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml for inspiration. Consider + * adding org/apache/tika/mime/custom-mimetypes.xml for your custom mime types. */ public class DumpTikaConfigExample { @@ -60,7 +57,8 @@ public static void main(String[] args) throws Exception { mode = TikaConfigSerializer.Mode.STATIC; } else { System.out.println("Use:"); - System.out.println(" DumpTikaConfig [--dump-minimal] [--dump-current] [--dump-static] [filename] [encoding]"); + System.out.println( + " DumpTikaConfig [--dump-minimal] [--dump-current] [--dump-static] [filename] [encoding]"); System.out.println(""); System.out.println("--dump-minimal Produce the minimal config file"); System.out.println("--dump-current The current (with defaults) config file"); diff --git a/tika-example/src/main/java/org/apache/tika/example/EncryptedPrescriptionDetector.java b/tika-example/src/main/java/org/apache/tika/example/EncryptedPrescriptionDetector.java index ed3cfc5088..9846c8c38e 100755 --- a/tika-example/src/main/java/org/apache/tika/example/EncryptedPrescriptionDetector.java +++ b/tika-example/src/main/java/org/apache/tika/example/EncryptedPrescriptionDetector.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. 
See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.example; @@ -21,6 +19,7 @@ import java.io.InputStream; import java.security.GeneralSecurityException; import java.security.Key; + import javax.crypto.Cipher; import javax.crypto.CipherInputStream; import javax.xml.namespace.QName; @@ -45,7 +44,8 @@ public MediaType detect(InputStream stream, Metadata metadata) throws IOExceptio InputStream decrypted = new CipherInputStream(lookahead, cipher); QName name = new XmlRootExtractor().extractRootElement(decrypted); - if (name != null && "http://example.com/xpd".equals(name.getNamespaceURI()) && "prescription".equals(name.getLocalPart())) { + if (name != null && "http://example.com/xpd".equals(name.getNamespaceURI()) + && "prescription".equals(name.getLocalPart())) { type = MediaType.application("x-prescription"); } } catch (GeneralSecurityException e) { diff --git a/tika-example/src/main/java/org/apache/tika/example/EncryptedPrescriptionParser.java b/tika-example/src/main/java/org/apache/tika/example/EncryptedPrescriptionParser.java index 28ef460ac5..f7c63d57e0 100755 --- a/tika-example/src/main/java/org/apache/tika/example/EncryptedPrescriptionParser.java +++ b/tika-example/src/main/java/org/apache/tika/example/EncryptedPrescriptionParser.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. 
The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.example; @@ -23,22 +21,23 @@ import java.security.Key; import java.util.Collections; import java.util.Set; + import javax.crypto.Cipher; import javax.crypto.CipherInputStream; -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; public class EncryptedPrescriptionParser implements Parser { private static final long serialVersionUID = -7816987249611278541L; - public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { + public void parse(InputStream stream, ContentHandler handler, Metadata metadata, + ParseContext context) throws IOException, SAXException, TikaException { try { Key key = Pharmacy.getKey(); Cipher cipher = 
Cipher.getInstance("RSA"); diff --git a/tika-example/src/main/java/org/apache/tika/example/ExtractEmbeddedFiles.java b/tika-example/src/main/java/org/apache/tika/example/ExtractEmbeddedFiles.java index 68d136202c..5f5cd74d0a 100644 --- a/tika-example/src/main/java/org/apache/tika/example/ExtractEmbeddedFiles.java +++ b/tika-example/src/main/java/org/apache/tika/example/ExtractEmbeddedFiles.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. 
See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.example; @@ -24,9 +22,6 @@ import java.util.UUID; import org.apache.commons.io.FilenameUtils; -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.config.TikaConfig; import org.apache.tika.detect.Detector; import org.apache.tika.exception.TikaException; @@ -41,13 +36,16 @@ import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.sax.BodyContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; public class ExtractEmbeddedFiles { private Parser parser = new AutoDetectParser(); private Detector detector = ((AutoDetectParser) parser).getDetector(); private TikaConfig config = TikaConfig.getDefaultConfig(); - public void extract(InputStream is, Path outputDir) throws SAXException, TikaException, IOException { + public void extract(InputStream is, Path outputDir) + throws SAXException, TikaException, IOException { Metadata m = new Metadata(); ParseContext c = new ParseContext(); ContentHandler h = new BodyContentHandler(-1); @@ -74,17 +72,18 @@ public boolean shouldParseEmbedded(Metadata metadata) { } @Override - public void parseEmbedded(TikaInputStream stream, ContentHandler handler, Metadata metadata, boolean outputHtml) throws SAXException, IOException { + public void parseEmbedded(TikaInputStream stream, ContentHandler handler, Metadata metadata, + boolean outputHtml) throws SAXException, IOException { - //try to get the name of the embedded file from the metadata + // try to get the name of the embedded file from the metadata String name = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY); if (name == null) { name = "file_" + fileCount++; } else { - //make sure to select only the file name (not any directory paths - //that might be included in the name) and make sure - //to normalize the name + // make sure to select 
only the file name (not any directory paths + // that might be included in the name) and make sure + // to normalize the name name = name.replaceAll("\u0000", " "); int prefix = FilenameUtils.getPrefixLength(name); if (prefix > -1) { @@ -93,15 +92,13 @@ public void parseEmbedded(TikaInputStream stream, ContentHandler handler, Metada name = FilenameUtils.normalize(FilenameUtils.getName(name)); } - //now try to figure out the right extension for the embedded file + // now try to figure out the right extension for the embedded file MediaType contentType = detector.detect(stream, metadata); if (name.indexOf('.') == -1 && contentType != null) { try { - name += config - .getMimeRepository() - .forName(contentType.toString()) - .getExtension(); + name += config.getMimeRepository().forName(contentType.toString()) + .getExtension(); } catch (MimeTypeException e) { e.printStackTrace(); } @@ -109,9 +106,7 @@ public void parseEmbedded(TikaInputStream stream, ContentHandler handler, Metada Path outputFile = outputDir.resolve(name); if (Files.exists(outputFile)) { - outputFile = outputDir.resolve(UUID - .randomUUID() - .toString() + "-" + name); + outputFile = outputDir.resolve(UUID.randomUUID().toString() + "-" + name); } Files.createDirectories(outputFile.getParent()); Files.copy(stream, outputFile); diff --git a/tika-example/src/main/java/org/apache/tika/example/GrabPhoneNumbersExample.java b/tika-example/src/main/java/org/apache/tika/example/GrabPhoneNumbersExample.java index b389282980..a9cad93f91 100644 --- a/tika-example/src/main/java/org/apache/tika/example/GrabPhoneNumbersExample.java +++ b/tika-example/src/main/java/org/apache/tika/example/GrabPhoneNumbersExample.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.example; @@ -37,14 +35,12 @@ import org.apache.tika.sax.PhoneExtractingContentHandler; /** - * Class to demonstrate how to use the {@link org.apache.tika.sax.PhoneExtractingContentHandler} - * to get a list of all of the phone numbers from every file in a directory. + * Class to demonstrate how to use the {@link org.apache.tika.sax.PhoneExtractingContentHandler} to + * get a list of all of the phone numbers from every file in a directory. *

- * You can run this main method by running - * + * You can run this main method by running * mvn exec:java -Dexec.mainClass="org.apache.tika.example.GrabPhoneNumbersExample" -Dexec.args="/path/to/directory" - * - * from the tika-example directory. + * from the tika-example directory. */ public class GrabPhoneNumbersExample { private static HashSet phoneNumbers = new HashSet<>(); @@ -66,7 +62,8 @@ public static void processFolder(Path folder) { try { Files.walkFileTree(folder, new SimpleFileVisitor() { @Override - public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException { + public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) + throws IOException { try { process(file); successfulFiles++; @@ -78,7 +75,8 @@ public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IO } @Override - public FileVisitResult visitFileFailed(Path file, IOException exc) throws IOException { + public FileVisitResult visitFileFailed(Path file, IOException exc) + throws IOException { failedFiles++; return FileVisitResult.CONTINUE; } @@ -91,9 +89,11 @@ public FileVisitResult visitFileFailed(Path file, IOException exc) throws IOExce public static void process(Path path) throws Exception { Parser parser = new AutoDetectParser(); Metadata metadata = new Metadata(); - // The PhoneExtractingContentHandler will examine any characters for phone numbers before passing them + // The PhoneExtractingContentHandler will examine any characters for phone numbers before + // passing them // to the underlying Handler. 
- PhoneExtractingContentHandler handler = new PhoneExtractingContentHandler(new BodyContentHandler(), metadata); + PhoneExtractingContentHandler handler = + new PhoneExtractingContentHandler(new BodyContentHandler(), metadata); try (InputStream stream = new BufferedInputStream(Files.newInputStream(path))) { parser.parse(stream, handler, metadata, new ParseContext()); } diff --git a/tika-example/src/main/java/org/apache/tika/example/ImportContextImpl.java b/tika-example/src/main/java/org/apache/tika/example/ImportContextImpl.java index 97daa93a26..d9689ce5e1 100755 --- a/tika-example/src/main/java/org/apache/tika/example/ImportContextImpl.java +++ b/tika-example/src/main/java/org/apache/tika/example/ImportContextImpl.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.example; @@ -23,6 +21,7 @@ import java.io.IOException; import java.io.InputStream; import java.util.Date; + import javax.jcr.Item; import org.apache.jackrabbit.server.io.DefaultIOListener; @@ -30,13 +29,12 @@ import org.apache.jackrabbit.server.io.IOUtil; import org.apache.jackrabbit.server.io.ImportContext; import org.apache.jackrabbit.webdav.io.InputContext; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - import org.apache.tika.detect.Detector; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.mime.MediaType; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * ImportContextImpl... @@ -53,22 +51,22 @@ public class ImportContextImpl implements ImportContext { private boolean completed; /** - * Creates a new item import context. The specified InputStream is written - * to a temporary file in order to avoid problems with multiple IOHandlers - * that try to run the import but fail. The temporary file is deleted as - * soon as this context is informed that the import has been completed and - * it will not be used any more. + * Creates a new item import context. The specified InputStream is written to a temporary file + * in order to avoid problems with multiple IOHandlers that try to run the import but fail. The + * temporary file is deleted as soon as this context is informed that the import has been + * completed and it will not be used any more. 
* * @param importRoot * @param systemId - * @param ctx input context, or null - * @param stream document input stream, or null + * @param ctx input context, or null + * @param stream document input stream, or null * @param ioListener - * @param detector content type detector + * @param detector content type detector * @throws IOException * @see ImportContext#informCompleted(boolean) */ - public ImportContextImpl(Item importRoot, String systemId, InputContext ctx, InputStream stream, IOListener ioListener, Detector detector) throws IOException { + public ImportContextImpl(Item importRoot, String systemId, InputContext ctx, InputStream stream, + IOListener ioListener, Detector detector) throws IOException { this.importRoot = importRoot; this.systemId = systemId; this.inputCtx = ctx; @@ -111,9 +109,8 @@ public boolean hasStream() { } /** - * Returns a new InputStream to the temporary file created - * during instanciation or null, if this context does not - * provide a stream. + * Returns a new InputStream to the temporary file created during instanciation or + * null, if this context does not provide a stream. 
* * @see ImportContext#getInputStream() * @see #hasStream() @@ -164,7 +161,8 @@ public long getContentLength() { length = inputFile.length(); } if (length < 0) { - LOG.debug("Unable to determine content length -> default value = {}", IOUtil.UNDEFINED_LENGTH); + LOG.debug("Unable to determine content length -> default value = {}", + IOUtil.UNDEFINED_LENGTH); } return length; } diff --git a/tika-example/src/main/java/org/apache/tika/example/InterruptableParsingExample.java b/tika-example/src/main/java/org/apache/tika/example/InterruptableParsingExample.java index f8e4bdbe8a..d0516d74f7 100644 --- a/tika-example/src/main/java/org/apache/tika/example/InterruptableParsingExample.java +++ b/tika-example/src/main/java/org/apache/tika/example/InterruptableParsingExample.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.example; @@ -24,21 +22,19 @@ import java.nio.file.Path; import java.util.Locale; -import org.xml.sax.SAXException; -import org.xml.sax.helpers.DefaultHandler; - import org.apache.tika.Tika; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; /** - * This example demonstrates how to interrupt document parsing if - * some condition is met. + * This example demonstrates how to interrupt document parsing if some condition is met. *

- * {@link InterruptableParsingExample.InterruptingContentHandler} throws special exception as soon as - * find {@code query} string in parsed file. + * {@link InterruptableParsingExample.InterruptingContentHandler} throws special exception as soon + * as find {@code query} string in parsed file. *

* See also http://stackoverflow.com/questions/31939851 */ @@ -53,9 +49,7 @@ public boolean findInFile(String query, Path path) { context.set(Parser.class, tika.getParser()); try (InputStream is = new BufferedInputStream(Files.newInputStream(path))) { - tika - .getParser() - .parse(is, handler, metadata, context); + tika.getParser().parse(is, handler, metadata, context); } catch (QueryMatchedException e) { return true; } catch (SAXException | TikaException | IOException e) { @@ -85,9 +79,7 @@ static class InterruptingContentHandler extends DefaultHandler { public void characters(char[] ch, int start, int length) throws SAXException { sb.append(new String(ch, start, length).toLowerCase(Locale.getDefault())); - if (sb - .toString() - .contains(query)) { + if (sb.toString().contains(query)) { throw new QueryMatchedException(); } diff --git a/tika-example/src/main/java/org/apache/tika/example/Language.java b/tika-example/src/main/java/org/apache/tika/example/Language.java index c1ac26bd0d..6a45bad50e 100755 --- a/tika-example/src/main/java/org/apache/tika/example/Language.java +++ b/tika-example/src/main/java/org/apache/tika/example/Language.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.example; @@ -31,7 +29,8 @@ public class Language { public static void languageDetection() throws IOException { LanguageDetector detector = new OptimaizeLangDetector().loadModels(); - LanguageResult result = detector.detect("Alla människor är födda fria och lika i värde och rättigheter."); + LanguageResult result = detector + .detect("Alla människor är födda fria och lika i värde och rättigheter."); System.out.println(result.getLanguage()); } diff --git a/tika-example/src/main/java/org/apache/tika/example/LanguageDetectingParser.java b/tika-example/src/main/java/org/apache/tika/example/LanguageDetectingParser.java index 7f43f134cf..fa39709176 100755 --- a/tika-example/src/main/java/org/apache/tika/example/LanguageDetectingParser.java +++ b/tika-example/src/main/java/org/apache/tika/example/LanguageDetectingParser.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.example; @@ -20,9 +18,6 @@ import java.io.IOException; import java.io.InputStream; -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.exception.TikaException; import org.apache.tika.language.detect.LanguageHandler; import org.apache.tika.language.detect.LanguageResult; @@ -31,11 +26,14 @@ import org.apache.tika.parser.DelegatingParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.sax.TeeContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; public class LanguageDetectingParser extends DelegatingParser { private static final long serialVersionUID = 4291320409396502774L; - public void parse(InputStream stream, ContentHandler handler, final Metadata metadata, ParseContext context) throws SAXException, IOException, TikaException { + public void parse(InputStream stream, ContentHandler handler, final Metadata metadata, + ParseContext context) throws SAXException, IOException, TikaException { LanguageHandler langHandler = new LanguageHandler(); ContentHandler tee = new TeeContentHandler(handler, langHandler); diff --git a/tika-example/src/main/java/org/apache/tika/example/LanguageDetectorExample.java b/tika-example/src/main/java/org/apache/tika/example/LanguageDetectorExample.java index e077c16f15..03b6158f1d 100644 --- a/tika-example/src/main/java/org/apache/tika/example/LanguageDetectorExample.java +++ b/tika-example/src/main/java/org/apache/tika/example/LanguageDetectorExample.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.example; diff --git a/tika-example/src/main/java/org/apache/tika/example/LuceneIndexer.java b/tika-example/src/main/java/org/apache/tika/example/LuceneIndexer.java index 6d887386f5..5da84d6be7 100755 --- a/tika-example/src/main/java/org/apache/tika/example/LuceneIndexer.java +++ b/tika-example/src/main/java/org/apache/tika/example/LuceneIndexer.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.example; @@ -23,7 +21,6 @@ import org.apache.lucene.document.Field.Store; import org.apache.lucene.document.TextField; import org.apache.lucene.index.IndexWriter; - import org.apache.tika.Tika; public class LuceneIndexer { diff --git a/tika-example/src/main/java/org/apache/tika/example/LuceneIndexerExtended.java b/tika-example/src/main/java/org/apache/tika/example/LuceneIndexerExtended.java index 6acdbd3b5f..1da3ece5b2 100755 --- a/tika-example/src/main/java/org/apache/tika/example/LuceneIndexerExtended.java +++ b/tika-example/src/main/java/org/apache/tika/example/LuceneIndexerExtended.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.example; @@ -28,7 +26,6 @@ import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.store.FSDirectory; - import org.apache.tika.Tika; @SuppressWarnings("deprecation") @@ -44,7 +41,8 @@ public LuceneIndexerExtended(IndexWriter writer, Tika tika) { public static void main(String[] args) throws Exception { IndexWriterConfig indexWriterConfig = new IndexWriterConfig(new StandardAnalyzer()); - try (IndexWriter writer = new IndexWriter(FSDirectory.open(Paths.get(args[0])), indexWriterConfig)) { + try (IndexWriter writer = + new IndexWriter(FSDirectory.open(Paths.get(args[0])), indexWriterConfig)) { LuceneIndexer indexer = new LuceneIndexer(new Tika(), writer); for (int i = 1; i < args.length; i++) { indexer.indexDocument(new File(args[i])); diff --git a/tika-example/src/main/java/org/apache/tika/example/MediaTypeExample.java b/tika-example/src/main/java/org/apache/tika/example/MediaTypeExample.java index 91823e51e1..c5d0304f1e 100755 --- a/tika-example/src/main/java/org/apache/tika/example/MediaTypeExample.java +++ b/tika-example/src/main/java/org/apache/tika/example/MediaTypeExample.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. 
The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.example; diff --git a/tika-example/src/main/java/org/apache/tika/example/MetadataAwareLuceneIndexer.java b/tika-example/src/main/java/org/apache/tika/example/MetadataAwareLuceneIndexer.java index c474da542b..b4a7773273 100755 --- a/tika-example/src/main/java/org/apache/tika/example/MetadataAwareLuceneIndexer.java +++ b/tika-example/src/main/java/org/apache/tika/example/MetadataAwareLuceneIndexer.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. 
See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.example; @@ -26,7 +24,6 @@ import org.apache.lucene.document.Field.Store; import org.apache.lucene.document.TextField; import org.apache.lucene.index.IndexWriter; - import org.apache.tika.Tika; import org.apache.tika.metadata.DublinCore; import org.apache.tika.metadata.Metadata; @@ -68,14 +65,12 @@ public void indexWithDublinCore(File file) throws Exception { met.add(TikaCoreProperties.CREATOR, "Tika in Action"); met.set(TikaCoreProperties.CREATED, new Date()); met.set(TikaCoreProperties.FORMAT, tika.detect(file)); - met.set(DublinCore.SOURCE, file - .toURI() - .toURL() - .toString()); + met.set(DublinCore.SOURCE, file.toURI().toURL().toString()); met.add(TikaCoreProperties.SUBJECT, "File"); met.add(TikaCoreProperties.SUBJECT, "Indexing"); met.add(TikaCoreProperties.SUBJECT, "Metadata"); - met.set(Property.externalClosedChoise(TikaCoreProperties.RIGHTS.getName(), "public", "private"), "public"); + met.set(Property.externalClosedChoise(TikaCoreProperties.RIGHTS.getName(), "public", + "private"), "public"); try (InputStream is = new FileInputStream(file)) { tika.parse(is, met); Document document = new Document(); diff --git a/tika-example/src/main/java/org/apache/tika/example/MyFirstTika.java b/tika-example/src/main/java/org/apache/tika/example/MyFirstTika.java index 54efda3507..46a78aeb55 100755 --- a/tika-example/src/main/java/org/apache/tika/example/MyFirstTika.java +++ b/tika-example/src/main/java/org/apache/tika/example/MyFirstTika.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.example; @@ -24,8 +22,6 @@ import java.nio.file.Paths; import org.apache.commons.io.FileUtils; -import org.xml.sax.ContentHandler; - import org.apache.tika.config.TikaConfig; import org.apache.tika.detect.Detector; import org.apache.tika.io.TikaInputStream; @@ -40,11 +36,12 @@ import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.sax.BodyContentHandler; +import org.xml.sax.ContentHandler; /** - * Demonstrates how to call the different components within Tika: its - * {@link Detector} framework (aka MIME identification and repository), its - * {@link Parser} interface, its {@link org.apache.tika.language.LanguageIdentifier} and other goodies. 
+ * Demonstrates how to call the different components within Tika: its {@link Detector} framework + * (aka MIME identification and repository), its {@link Parser} interface, its + * {@link org.apache.tika.language.LanguageIdentifier} and other goodies. *

* It also shows the "easy way" via {@link AutoDetectParser} */ @@ -70,7 +67,8 @@ public static void main(String[] args) throws Exception { System.out.println(text); } - public static String parseUsingAutoDetect(String filename, TikaConfig tikaConfig, Metadata metadata) throws Exception { + public static String parseUsingAutoDetect(String filename, TikaConfig tikaConfig, + Metadata metadata) throws Exception { System.out.println("Handling using AutoDetectParser: [" + filename + "]"); AutoDetectParser parser = new AutoDetectParser(tikaConfig); @@ -80,23 +78,28 @@ public static String parseUsingAutoDetect(String filename, TikaConfig tikaConfig return handler.toString(); } - public static String parseUsingComponents(String filename, TikaConfig tikaConfig, Metadata metadata) throws Exception { + public static String parseUsingComponents(String filename, TikaConfig tikaConfig, + Metadata metadata) throws Exception { MimeTypes mimeRegistry = tikaConfig.getMimeRepository(); System.out.println("Examining: [" + filename + "]"); metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, filename); - System.out.println("The MIME type (based on filename) is: [" + mimeRegistry.detect(null, metadata) + "]"); + System.out.println("The MIME type (based on filename) is: [" + + mimeRegistry.detect(null, metadata) + "]"); InputStream stream = TikaInputStream.get(Paths.get(filename)); - System.out.println("The MIME type (based on MAGIC) is: [" + mimeRegistry.detect(stream, metadata) + "]"); + System.out.println("The MIME type (based on MAGIC) is: [" + + mimeRegistry.detect(stream, metadata) + "]"); stream = TikaInputStream.get(Paths.get(filename)); Detector detector = tikaConfig.getDetector(); - System.out.println("The MIME type (based on the Detector interface) is: [" + detector.detect(stream, metadata) + "]"); + System.out.println("The MIME type (based on the Detector interface) is: [" + + detector.detect(stream, metadata) + "]"); LanguageDetector langDetector = new 
OptimaizeLangDetector().loadModels(); - LanguageResult lang = langDetector.detect(FileUtils.readFileToString(new File(filename), UTF_8)); + LanguageResult lang = + langDetector.detect(FileUtils.readFileToString(new File(filename), UTF_8)); System.out.println("The language of this content is: [" + lang.getLanguage() + "]"); diff --git a/tika-example/src/main/java/org/apache/tika/example/ParsingExample.java b/tika-example/src/main/java/org/apache/tika/example/ParsingExample.java index 065f89908a..bab9fb4e2b 100644 --- a/tika-example/src/main/java/org/apache/tika/example/ParsingExample.java +++ b/tika-example/src/main/java/org/apache/tika/example/ParsingExample.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
+ * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.example; @@ -26,8 +24,6 @@ import java.util.ArrayList; import java.util.List; -import org.xml.sax.SAXException; - import org.apache.tika.Tika; import org.apache.tika.exception.TikaException; import org.apache.tika.io.TikaInputStream; @@ -43,15 +39,16 @@ import org.apache.tika.sax.ContentHandlerFactory; import org.apache.tika.sax.RecursiveParserWrapperHandler; import org.apache.tika.serialization.JsonMetadataList; +import org.xml.sax.SAXException; public class ParsingExample { /** - * Example of how to use Tika's parseToString method to parse the content of a file, - * and return any text found. + * Example of how to use Tika's parseToString method to parse the content of a file, and return + * any text found. *

- * Note: Tika.parseToString() will extract content from the outer container - * document and any embedded/attached documents. + * Note: Tika.parseToString() will extract content from the outer container document and any + * embedded/attached documents. * * @return The content of a file. */ @@ -63,26 +60,24 @@ public String parseToStringExample() throws IOException, SAXException, TikaExcep } /** - * Example of how to use Tika to parse a file when you do not know its file type - * ahead of time. + * Example of how to use Tika to parse a file when you do not know its file type ahead of time. *

- * AutoDetectParser attempts to discover the file's type automatically, then call - * the exact Parser built for that file type. + * AutoDetectParser attempts to discover the file's type automatically, then call the exact + * Parser built for that file type. *

- * The stream to be parsed by the Parser. In this case, we get a file from the - * resources folder of this project. + * The stream to be parsed by the Parser. In this case, we get a file from the resources folder + * of this project. *

- * Handlers are used to get the exact information you want out of the host of - * information gathered by Parsers. The body content handler, intuitively, extracts - * everything that would go between HTML body tags. + * Handlers are used to get the exact information you want out of the host of information + * gathered by Parsers. The body content handler, intuitively, extracts everything that would go + * between HTML body tags. *

- * The Metadata object will be filled by the Parser with Metadata discovered about - * the file being parsed. + * The Metadata object will be filled by the Parser with Metadata discovered about the file + * being parsed. *

- * Note: This example will extract content from the outer document and all - * embedded documents. However, if you choose to use a {@link ParseContext}, - * make sure to set a {@link Parser} or else embedded content will not be - * parsed. + * Note: This example will extract content from the outer document and all embedded documents. + * However, if you choose to use a {@link ParseContext}, make sure to set a {@link Parser} or + * else embedded content will not be parsed. * * @return The content of a file. */ @@ -97,9 +92,8 @@ public String parseExample() throws IOException, SAXException, TikaException { } /** - * If you don't want content from embedded documents, send in - * a {@link org.apache.tika.parser.ParseContext} that does contains a - * {@link EmptyParser}. + * If you don't want content from embedded documents, send in a + * {@link org.apache.tika.parser.ParseContext} that does contains a {@link EmptyParser}. * * @return The content of a file. */ @@ -109,7 +103,8 @@ public String parseNoEmbeddedExample() throws IOException, SAXException, TikaExc Metadata metadata = new Metadata(); ParseContext parseContext = new ParseContext(); parseContext.set(Parser.class, new EmptyParser()); - try (InputStream stream = ParsingExample.class.getResourceAsStream("test_recursive_embedded.docx")) { + try (InputStream stream = + ParsingExample.class.getResourceAsStream("test_recursive_embedded.docx")) { parser.parse(stream, handler, metadata, parseContext); return handler.toString(); } @@ -117,8 +112,8 @@ public String parseNoEmbeddedExample() throws IOException, SAXException, TikaExc /** - * This example shows how to extract content from the outer document and all - * embedded documents. The key is to specify a {@link Parser} in the {@link ParseContext}. + * This example shows how to extract content from the outer document and all embedded documents. + * The key is to specify a {@link Parser} in the {@link ParseContext}. 
* * @return content, including from embedded documents * @throws IOException @@ -131,44 +126,45 @@ public String parseEmbeddedExample() throws IOException, SAXException, TikaExcep Metadata metadata = new Metadata(); ParseContext context = new ParseContext(); context.set(Parser.class, parser); - try (InputStream stream = ParsingExample.class.getResourceAsStream("test_recursive_embedded.docx")) { + try (InputStream stream = + ParsingExample.class.getResourceAsStream("test_recursive_embedded.docx")) { parser.parse(stream, handler, metadata, context); return handler.toString(); } } /** - * For documents that may contain embedded documents, it might be helpful - * to create list of metadata objects, one for the container document and - * one for each embedded document. This allows easy access to both the - * extracted content and the metadata of each embedded document. - * Note that many document formats can contain embedded documents, - * including traditional container formats -- zip, tar and others -- but also - * common office document formats including: MSWord, MSExcel, - * MSPowerPoint, RTF, PDF, MSG and several others. + * For documents that may contain embedded documents, it might be helpful to create list of + * metadata objects, one for the container document and one for each embedded document. This + * allows easy access to both the extracted content and the metadata of each embedded document. + * Note that many document formats can contain embedded documents, including traditional + * container formats -- zip, tar and others -- but also common office document formats + * including: MSWord, MSExcel, MSPowerPoint, RTF, PDF, MSG and several others. *

- * The "content" format is determined by the ContentHandlerFactory, and - * the content is stored in {@link org.apache.tika.parser.RecursiveParserWrapper#TIKA_CONTENT} + * The "content" format is determined by the ContentHandlerFactory, and the content is stored in + * {@link org.apache.tika.parser.RecursiveParserWrapper#TIKA_CONTENT} *

- * The drawback to the RecursiveParserWrapper is that it caches metadata and contents - * in memory. This should not be used on files whose contents are too big to be handled - * in memory. + * The drawback to the RecursiveParserWrapper is that it caches metadata and contents in memory. + * This should not be used on files whose contents are too big to be handled in memory. * * @return a list of metadata object, one each for the container file and each embedded file * @throws IOException * @throws SAXException * @throws TikaException */ - public List recursiveParserWrapperExample() throws IOException, SAXException, TikaException { + public List recursiveParserWrapperExample() + throws IOException, SAXException, TikaException { Parser p = new AutoDetectParser(); - ContentHandlerFactory factory = new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.HTML, -1); + ContentHandlerFactory factory = new BasicContentHandlerFactory( + BasicContentHandlerFactory.HANDLER_TYPE.HTML, -1); RecursiveParserWrapper wrapper = new RecursiveParserWrapper(p); Metadata metadata = new Metadata(); metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, "test_recursive_embedded.docx"); ParseContext context = new ParseContext(); RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(factory, -1); - try (InputStream stream = ParsingExample.class.getResourceAsStream("test_recursive_embedded.docx")) { + try (InputStream stream = + ParsingExample.class.getResourceAsStream("test_recursive_embedded.docx")) { wrapper.parse(stream, handler, metadata, context); } @@ -176,21 +172,20 @@ public List recursiveParserWrapperExample() throws IOException, SAXExc } /** - * We include a simple JSON serializer for a list of metadata with - * {@link JsonMetadataList}. - * That class also includes a deserializer to convert from JSON - * back to a List. + * We include a simple JSON serializer for a list of metadata with {@link JsonMetadataList}. 
+ * That class also includes a deserializer to convert from JSON back to a List. *

- * This functionality is also available in tika-app's GUI, and - * with the -J option on tika-app's commandline. For tika-server - * users, there is the "rmeta" service that will return this format. + * This functionality is also available in tika-app's GUI, and with the -J option on tika-app's + * commandline. For tika-server users, there is the "rmeta" service that will return this + * format. * * @return a JSON representation of a list of Metadata objects * @throws IOException * @throws SAXException * @throws TikaException */ - public String serializedRecursiveParserWrapperExample() throws IOException, SAXException, TikaException { + public String serializedRecursiveParserWrapperExample() + throws IOException, SAXException, TikaException { List metadataList = recursiveParserWrapperExample(); StringWriter writer = new StringWriter(); JsonMetadataList.toJson(metadataList, writer); @@ -205,10 +200,12 @@ public String serializedRecursiveParserWrapperExample() throws IOException, SAXE * @throws SAXException * @throws TikaException */ - public List extractEmbeddedDocumentsExample(Path outputPath) throws IOException, SAXException, TikaException { + public List extractEmbeddedDocumentsExample(Path outputPath) + throws IOException, SAXException, TikaException { ExtractEmbeddedFiles ex = new ExtractEmbeddedFiles(); List ret = new ArrayList<>(); - try (TikaInputStream stream = TikaInputStream.get(ParsingExample.class.getResourceAsStream("test_recursive_embedded.docx"))) { + try (TikaInputStream stream = TikaInputStream.get( + ParsingExample.class.getResourceAsStream("test_recursive_embedded.docx"))) { ex.extract(stream, outputPath); try (DirectoryStream dirStream = Files.newDirectoryStream(outputPath)) { for (Path entry : dirStream) { diff --git a/tika-example/src/main/java/org/apache/tika/example/Pharmacy.java b/tika-example/src/main/java/org/apache/tika/example/Pharmacy.java index e993599620..f81846e261 100755 --- 
a/tika-example/src/main/java/org/apache/tika/example/Pharmacy.java +++ b/tika-example/src/main/java/org/apache/tika/example/Pharmacy.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.example; diff --git a/tika-example/src/main/java/org/apache/tika/example/PickBestTextEncodingParser.java b/tika-example/src/main/java/org/apache/tika/example/PickBestTextEncodingParser.java index b5ddffdb3f..75582876ff 100644 --- a/tika-example/src/main/java/org/apache/tika/example/PickBestTextEncodingParser.java +++ b/tika-example/src/main/java/org/apache/tika/example/PickBestTextEncodingParser.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. 
See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.example; @@ -25,9 +23,6 @@ import java.util.List; import java.util.Map; -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.detect.EncodingDetector; import org.apache.tika.detect.NonDetectingEncodingDetector; import org.apache.tika.exception.TikaException; @@ -39,17 +34,17 @@ import org.apache.tika.parser.txt.TXTParser; import org.apache.tika.sax.BodyContentHandler; import org.apache.tika.sax.ContentHandlerFactory; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; /** - * Inspired by TIKA-1443 and https://wiki.apache.org/tika/CompositeParserDiscussion - * this tries several different text encodings, then does the real - * text parsing based on which is "best". + * Inspired by TIKA-1443 and https://wiki.apache.org/tika/CompositeParserDiscussion this tries + * several different text encodings, then does the real text parsing based on which is "best". *

* The logic for "best" needs a lot of work! *

- * This is not recommended for actual production use... It is mostly to - * prove that the {@link AbstractMultipleParser} environment is - * sufficient to support this use-case + * This is not recommended for actual production use... It is mostly to prove that the + * {@link AbstractMultipleParser} environment is sufficient to support this use-case *

* TODO Implement proper "Junk" detection * @@ -86,15 +81,14 @@ protected void parserPrepare(Parser parser, Metadata metadata, ParseContext cont super.parserPrepare(parser, metadata, context); // Specify which charset to try - String charset = context - .get(CharsetTester.class) - .getNextCharset(); + String charset = context.get(CharsetTester.class).getNextCharset(); Charset charsetCS = Charset.forName(charset); context.set(EncodingDetector.class, new NonDetectingEncodingDetector(charsetCS)); } @Override - protected boolean parserCompleted(Parser parser, Metadata metadata, ContentHandler handler, ParseContext context, Exception exception) { + protected boolean parserCompleted(Parser parser, Metadata metadata, ContentHandler handler, + ParseContext context, Exception exception) { // Get the current charset CharsetTester charsetTester = context.get(CharsetTester.class); String charset = charsetTester.getCurrentCharset(); @@ -112,7 +106,8 @@ protected boolean parserCompleted(Parser parser, Metadata metadata, ContentHandl String text = charsetTester.charsetText.get(pcharset); int cEnglish = 0; for (char c : text.toCharArray()) { - if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9')) { + if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') + || (c >= '0' && c <= '9')) { cEnglish++; } } @@ -130,9 +125,10 @@ protected boolean parserCompleted(Parser parser, Metadata metadata, ContentHandl } @Override - public void parse(InputStream stream, ContentHandler handler, Metadata originalMetadata, ParseContext context) throws IOException, SAXException, TikaException { + public void parse(InputStream stream, ContentHandler handler, Metadata originalMetadata, + ParseContext context) throws IOException, SAXException, TikaException { // Use a BodyContentHandler for each of the charset test, - // then their real ContentHandler for the last one + // then their real ContentHandler for the last one CharsetContentHandlerFactory handlerFactory = new 
CharsetContentHandlerFactory(); handlerFactory.handler = handler; @@ -144,9 +140,10 @@ public void parse(InputStream stream, ContentHandler handler, Metadata originalM } @Override - public void parse(InputStream stream, ContentHandlerFactory handlers, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { + public void parse(InputStream stream, ContentHandlerFactory handlers, Metadata metadata, + ParseContext context) throws IOException, SAXException, TikaException { // We only work with one ContentHandler as far as the user is - // concerned, any others are purely internal! + // concerned, any others are purely internal! parse(stream, handlers.getNewContentHandler(), metadata, context); } diff --git a/tika-example/src/main/java/org/apache/tika/example/PrescriptionParser.java b/tika-example/src/main/java/org/apache/tika/example/PrescriptionParser.java index ab492aa96b..f5199650a2 100755 --- a/tika-example/src/main/java/org/apache/tika/example/PrescriptionParser.java +++ b/tika-example/src/main/java/org/apache/tika/example/PrescriptionParser.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.example; @@ -20,26 +18,28 @@ import java.util.Collections; import java.util.Set; -import org.xml.sax.ContentHandler; - import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.xml.ElementMetadataHandler; import org.apache.tika.parser.xml.XMLParser; import org.apache.tika.sax.TeeContentHandler; +import org.xml.sax.ContentHandler; public class PrescriptionParser extends XMLParser { private static final long serialVersionUID = 7690682277511967388L; @Override - protected ContentHandler getContentHandler(ContentHandler handler, Metadata metadata, ParseContext context) { + protected ContentHandler getContentHandler(ContentHandler handler, Metadata metadata, + ParseContext context) { String xpd = "http://example.com/2011/xpd"; ContentHandler doctor = new ElementMetadataHandler(xpd, "doctor", metadata, "xpd:doctor"); - ContentHandler patient = new ElementMetadataHandler(xpd, "patient", metadata, "xpd:patient"); + ContentHandler patient = + new ElementMetadataHandler(xpd, "patient", metadata, "xpd:patient"); - return new TeeContentHandler(super.getContentHandler(handler, 
metadata, context), doctor, patient); + return new TeeContentHandler(super.getContentHandler(handler, metadata, context), doctor, + patient); } @Override diff --git a/tika-example/src/main/java/org/apache/tika/example/RecentFiles.java b/tika-example/src/main/java/org/apache/tika/example/RecentFiles.java index 6778157bc0..da646fa99c 100755 --- a/tika-example/src/main/java/org/apache/tika/example/RecentFiles.java +++ b/tika-example/src/main/java/org/apache/tika/example/RecentFiles.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. 
See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.example; @@ -36,20 +34,19 @@ import org.apache.lucene.search.TopScoreDocCollector; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.BytesRef; - import org.apache.tika.metadata.DublinCore; import org.apache.tika.metadata.TikaCoreProperties; /** - * Builds on top of the LuceneIndexer and the Metadata discussions in Chapter 6 - * to output an RSS (or RDF) feed of files crawled by the LuceneIndexer within - * the last N minutes. + * Builds on top of the LuceneIndexer and the Metadata discussions in Chapter 6 to output an RSS (or + * RDF) feed of files crawled by the LuceneIndexer within the last N minutes. */ @SuppressWarnings("deprecation") public class RecentFiles { private IndexReader reader; - private SimpleDateFormat rssDateFormat = new SimpleDateFormat("E, dd MMM yyyy HH:mm:ss z", Locale.getDefault()); + private SimpleDateFormat rssDateFormat = + new SimpleDateFormat("E, dd MMM yyyy HH:mm:ss z", Locale.getDefault()); public String generateRSS(Path indexFile) throws CorruptIndexException, IOException { StringBuilder output = new StringBuilder(); @@ -58,12 +55,14 @@ public String generateRSS(Path indexFile) throws CorruptIndexException, IOExcept try { reader = DirectoryReader.open(FSDirectory.open(indexFile)); searcher = new IndexSearcher(reader); - GregorianCalendar gc = new java.util.GregorianCalendar(TimeZone.getDefault(), Locale.getDefault()); + GregorianCalendar gc = new java.util.GregorianCalendar(TimeZone.getDefault(), + Locale.getDefault()); gc.setTime(new Date()); String nowDateTime = ISO8601.format(gc); gc.add(java.util.GregorianCalendar.MINUTE, -5); String fiveMinsAgo = ISO8601.format(gc); - TermRangeQuery query = new TermRangeQuery(TikaCoreProperties.CREATED.getName(), new BytesRef(fiveMinsAgo), new BytesRef(nowDateTime), true, true); + TermRangeQuery query = new 
TermRangeQuery(TikaCoreProperties.CREATED.getName(), + new BytesRef(fiveMinsAgo), new BytesRef(nowDateTime), true, true); TopScoreDocCollector collector = TopScoreDocCollector.create(20, 10000); searcher.search(query, collector); ScoreDoc[] hits = collector.topDocs().scoreDocs; @@ -92,8 +91,12 @@ public String getRSSItem(Document doc) { for (String topic : doc.getValues(TikaCoreProperties.SUBJECT.getName())) { output.append(emitTag("category", topic, null, null)); } - output.append(emitTag("pubDate", rssDateFormat.format(ISO8601.parse(doc.get(TikaCoreProperties.CREATED.getName()))), null, null)); - output.append(emitTag("description", doc.get(TikaCoreProperties.TITLE.getName()), null, null)); + output.append(emitTag("pubDate", + rssDateFormat.format(ISO8601 + .parse(doc.get(TikaCoreProperties.CREATED.getName()))), + null, null)); + output.append(emitTag("description", doc.get(TikaCoreProperties.TITLE.getName()), null, + null)); output.append(""); return output.toString(); } @@ -104,7 +107,8 @@ public String getRSSHeaders() { output.append(""); output.append(" "); output.append(" Tika in Action: Recent Files Feed."); - output.append(" Chapter 6 Examples demonstrating " + "use of Tika Metadata for RSS."); + output.append(" Chapter 6 Examples demonstrating " + + "use of Tika Metadata for RSS."); output.append(" tikainaction.rss"); output.append(" "); output.append(rssDateFormat.format(new Date())); @@ -118,7 +122,8 @@ public String getRSSFooters() { return " "; } - private String emitTag(String tagName, String value, String attributeName, String attributeValue) { + private String emitTag(String tagName, String value, String attributeName, + String attributeValue) { StringBuilder output = new StringBuilder(); output.append("<"); output.append(tagName); diff --git a/tika-example/src/main/java/org/apache/tika/example/RollbackSoftware.java b/tika-example/src/main/java/org/apache/tika/example/RollbackSoftware.java index b25871641a..bab9e3e813 100755 --- 
a/tika-example/src/main/java/org/apache/tika/example/RollbackSoftware.java +++ b/tika-example/src/main/java/org/apache/tika/example/RollbackSoftware.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.example; @@ -29,9 +27,6 @@ import java.util.Set; import org.apache.commons.io.IOUtils; -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; @@ -40,6 +35,8 @@ import org.apache.tika.sax.Link; import org.apache.tika.sax.LinkContentHandler; import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; /** * Demonstrates Tika and its ability to sense symlinks. @@ -61,9 +58,7 @@ public void rollback(File deployArea) throws IOException, SAXException, TikaExce } links.sort(Comparator.comparing(Link::getText)); - this.updateVersion(links - .get(links.size() - 2) - .getText()); + this.updateVersion(links.get(links.size() - 2).getText()); } private void updateVersion(String version) { @@ -71,9 +66,7 @@ private void updateVersion(String version) { } private boolean isSymlink(File f) throws IOException { - return !f - .getAbsolutePath() - .equals(f.getCanonicalPath()); + return !f.getAbsolutePath().equals(f.getCanonicalPath()); } class DeploymentAreaParser implements Parser { @@ -86,32 +79,33 @@ class DeploymentAreaParser implements Parser { * org.apache.tika.parser.ParseContext) */ public Set getSupportedTypes(ParseContext context) { - return Collections.unmodifiableSet(new HashSet<>(Collections.singletonList(MediaType.TEXT_PLAIN))); + return Collections.unmodifiableSet( + new HashSet<>(Collections.singletonList(MediaType.TEXT_PLAIN))); } /* * (non-Javadoc) * - * @see org.apache.tika.parser.Parser#parse(java.io.InputStream, - * org.xml.sax.ContentHandler, org.apache.tika.metadata.Metadata) + * @see org.apache.tika.parser.Parser#parse(java.io.InputStream, org.xml.sax.ContentHandler, + * org.apache.tika.metadata.Metadata) */ - public void parse(InputStream is, ContentHandler handler, Metadata metadata) throws IOException, SAXException, TikaException { 
+ public void parse(InputStream is, ContentHandler handler, Metadata metadata) + throws IOException, SAXException, TikaException { parse(is, handler, metadata, new ParseContext()); } /* * (non-Javadoc) * - * @see org.apache.tika.parser.Parser#parse(java.io.InputStream, - * org.xml.sax.ContentHandler, org.apache.tika.metadata.Metadata, - * org.apache.tika.parser.ParseContext) + * @see org.apache.tika.parser.Parser#parse(java.io.InputStream, org.xml.sax.ContentHandler, + * org.apache.tika.metadata.Metadata, org.apache.tika.parser.ParseContext) */ - public void parse(InputStream is, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { + public void parse(InputStream is, ContentHandler handler, Metadata metadata, + ParseContext context) throws IOException, SAXException, TikaException { File deployArea = new File(IOUtils.toString(is, UTF_8)); - File[] versions = deployArea.listFiles(pathname -> !pathname - .getName() - .startsWith("current")); + File[] versions = deployArea + .listFiles(pathname -> !pathname.getName().startsWith("current")); XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); xhtml.startDocument(); @@ -119,10 +113,7 @@ public void parse(InputStream is, ContentHandler handler, Metadata metadata, Par if (isSymlink(v)) { continue; } - xhtml.startElement("a", "href", v - .toURI() - .toURL() - .toExternalForm()); + xhtml.startElement("a", "href", v.toURI().toURL().toExternalForm()); xhtml.characters(v.getName()); xhtml.endElement("a"); } diff --git a/tika-example/src/main/java/org/apache/tika/example/SimpleTextExtractor.java b/tika-example/src/main/java/org/apache/tika/example/SimpleTextExtractor.java index 0035d02532..f094f228c1 100755 --- a/tika-example/src/main/java/org/apache/tika/example/SimpleTextExtractor.java +++ b/tika-example/src/main/java/org/apache/tika/example/SimpleTextExtractor.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under 
one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.example; diff --git a/tika-example/src/main/java/org/apache/tika/example/SimpleTypeDetector.java b/tika-example/src/main/java/org/apache/tika/example/SimpleTypeDetector.java index 2b242cc13a..8bdb784b34 100755 --- a/tika-example/src/main/java/org/apache/tika/example/SimpleTypeDetector.java +++ b/tika-example/src/main/java/org/apache/tika/example/SimpleTypeDetector.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. 
See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.example; diff --git a/tika-example/src/main/java/org/apache/tika/example/SpringExample.java b/tika-example/src/main/java/org/apache/tika/example/SpringExample.java index 9d85486f7c..9cd1244ae0 100755 --- a/tika-example/src/main/java/org/apache/tika/example/SpringExample.java +++ b/tika-example/src/main/java/org/apache/tika/example/SpringExample.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. 
See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.example; @@ -22,19 +20,20 @@ import java.io.ByteArrayInputStream; import java.io.OutputStreamWriter; -import org.springframework.context.ApplicationContext; -import org.springframework.context.support.ClassPathXmlApplicationContext; - import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.sax.WriteOutContentHandler; +import org.springframework.context.ApplicationContext; +import org.springframework.context.support.ClassPathXmlApplicationContext; public class SpringExample { public static void main(String[] args) throws Exception { - ApplicationContext context = new ClassPathXmlApplicationContext(new String[]{"org/apache/tika/example/spring.xml"}); + ApplicationContext context = new ClassPathXmlApplicationContext( + new String[] {"org/apache/tika/example/spring.xml"}); Parser parser = context.getBean("tika", Parser.class); - parser.parse(new ByteArrayInputStream("Hello, World!".getBytes(UTF_8)), new WriteOutContentHandler(new OutputStreamWriter(System.out, UTF_8)), new Metadata(), - new ParseContext()); + parser.parse(new ByteArrayInputStream("Hello, World!".getBytes(UTF_8)), + new WriteOutContentHandler(new OutputStreamWriter(System.out, UTF_8)), + new Metadata(), new ParseContext()); } } diff --git a/tika-example/src/main/java/org/apache/tika/example/StandardsExtractionExample.java b/tika-example/src/main/java/org/apache/tika/example/StandardsExtractionExample.java index 69aea730e1..e6eda70e18 100644 --- a/tika-example/src/main/java/org/apache/tika/example/StandardsExtractionExample.java +++ b/tika-example/src/main/java/org/apache/tika/example/StandardsExtractionExample.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.example; @@ -37,15 +35,13 @@ import org.apache.tika.sax.StandardsExtractingContentHandler; /** - * Class to demonstrate how to use the {@link StandardsExtractingContentHandler} - * to get a list of the standard references from every file in a directory. + * Class to demonstrate how to use the {@link StandardsExtractingContentHandler} to get a list of + * the standard references from every file in a directory. * *

- * You can run this main method by running - * + * You can run this main method by running * mvn exec:java -Dexec.mainClass="org.apache.tika.example.StandardsExtractionExample" -Dexec.args="/path/to/input" - * - * from the tika-example directory. + * from the tika-example directory. *

*/ public class StandardsExtractionExample { @@ -55,7 +51,8 @@ public class StandardsExtractionExample { public static void main(String[] args) { if (args.length < 1) { - System.err.println("Usage: " + StandardsExtractionExample.class.getName() + " /path/to/input"); + System.err.println("Usage: " + StandardsExtractionExample.class.getName() + + " /path/to/input"); System.exit(1); } String pathname = args[0]; @@ -71,7 +68,8 @@ public static void processFolder(Path folder) { try { Files.walkFileTree(folder, new SimpleFileVisitor() { @Override - public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException { + public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) + throws IOException { try { process(file); successfulFiles++; @@ -83,7 +81,8 @@ public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IO } @Override - public FileVisitResult visitFileFailed(Path file, IOException exc) throws IOException { + public FileVisitResult visitFileFailed(Path file, IOException exc) + throws IOException { failedFiles++; return FileVisitResult.CONTINUE; } @@ -99,12 +98,14 @@ public static void process(Path path) throws Exception { // The StandardsExtractingContentHandler will examine any characters for // standard references before passing them // to the underlying Handler. 
- StandardsExtractingContentHandler handler = new StandardsExtractingContentHandler(new BodyContentHandler(-1), metadata); + StandardsExtractingContentHandler handler = + new StandardsExtractingContentHandler(new BodyContentHandler(-1), metadata); handler.setThreshold(0.75); try (InputStream stream = new BufferedInputStream(Files.newInputStream(path))) { parser.parse(stream, handler, metadata, new ParseContext()); } - String[] references = metadata.getValues(StandardsExtractingContentHandler.STANDARD_REFERENCES); + String[] references = + metadata.getValues(StandardsExtractingContentHandler.STANDARD_REFERENCES); Collections.addAll(standardReferences, references); } } diff --git a/tika-example/src/main/java/org/apache/tika/example/TIAParsingExample.java b/tika-example/src/main/java/org/apache/tika/example/TIAParsingExample.java index 44b514c657..bed9a5a948 100755 --- a/tika-example/src/main/java/org/apache/tika/example/TIAParsingExample.java +++ b/tika-example/src/main/java/org/apache/tika/example/TIAParsingExample.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.example; @@ -34,10 +32,6 @@ import java.util.Map; import java.util.zip.GZIPInputStream; -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; -import org.xml.sax.helpers.DefaultHandler; - import org.apache.tika.Tika; import org.apache.tika.exception.TikaException; import org.apache.tika.io.TikaInputStream; @@ -56,6 +50,9 @@ import org.apache.tika.sax.BodyContentHandler; import org.apache.tika.sax.LinkContentHandler; import org.apache.tika.sax.TeeContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; public class TIAParsingExample { public static String parseToStringExample() throws Exception { @@ -156,7 +153,8 @@ public static void testTeeContentHandler(String filename) throws Exception { Parser parser = new AutoDetectParser(); LinkContentHandler linkCollector = new LinkContentHandler(); try (Writer writer = Files.newBufferedWriter(Paths.get(filename), StandardCharsets.UTF_8)) { - ContentHandler handler = new TeeContentHandler(new BodyContentHandler(writer), linkCollector); + ContentHandler handler = + new TeeContentHandler(new BodyContentHandler(writer), linkCollector); 
parser.parse(stream, handler, metadata, context); } } @@ -191,7 +189,8 @@ public static void testCompositeDocument() throws Exception { private static final long serialVersionUID = 4424210691523343833L; @Override - public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { + public void parse(InputStream stream, ContentHandler handler, Metadata metadata, + ParseContext context) throws IOException, SAXException, TikaException { // custom processing of the component document } }); diff --git a/tika-example/src/main/java/org/apache/tika/example/TextStatsFromTikaEval.java b/tika-example/src/main/java/org/apache/tika/example/TextStatsFromTikaEval.java index e2a9fbb839..4368d298fb 100644 --- a/tika-example/src/main/java/org/apache/tika/example/TextStatsFromTikaEval.java +++ b/tika-example/src/main/java/org/apache/tika/example/TextStatsFromTikaEval.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.example; @@ -27,16 +25,14 @@ /** - * These examples create a new {@link CompositeTextStatsCalculator} - * for each call. This is extremely inefficient because the lang id - * model has to be loaded and the common words for each call. + * These examples create a new {@link CompositeTextStatsCalculator} for each call. This is extremely + * inefficient because the lang id model has to be loaded and the common words for each call. */ public class TextStatsFromTikaEval { /** - * Use the default language id models and the default common tokens - * lists in tika-eval to calculate the out-of-vocabulary percentage - * for a given string. + * Use the default language id models and the default common tokens lists in tika-eval to + * calculate the out-of-vocabulary percentage for a given string. 
* * @param txt * @return @@ -48,11 +44,11 @@ public double getOOV(String txt) { Map results = calc.calculate(txt); /* - Note that the OOV requires language id, so you can also - retrieve the detected languages with this: - - List detectedLanguages = (List) results.get(LanguageIDWrapper.class); - + * Note that the OOV requires language id, so you can also retrieve the detected languages + * with this: + * + * List detectedLanguages = (List) results.get(LanguageIDWrapper.class); + * */ CommonTokenResult result = (CommonTokenResult) results.get(CommonTokens.class); diff --git a/tika-example/src/main/java/org/apache/tika/example/TranscribeTranslateExample.java b/tika-example/src/main/java/org/apache/tika/example/TranscribeTranslateExample.java index a21e5bd392..b29137ef42 100644 --- a/tika-example/src/main/java/org/apache/tika/example/TranscribeTranslateExample.java +++ b/tika-example/src/main/java/org/apache/tika/example/TranscribeTranslateExample.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.example; @@ -26,25 +24,19 @@ import org.apache.tika.language.translate.impl.GoogleTranslator; /** - * This example demonstrates primitive logic for - * chaining Tika API calls. In this case translation - * could be considered as a downstream process to - * transcription. - * We simply pass the output of - * a call to {@link Tika#parseToString(Path)} - * into {@link Translator#translate(String, String)}. - * The {@link GoogleTranslator} is configured with a target - * language of "en-US". + * This example demonstrates primitive logic for chaining Tika API calls. In this case translation + * could be considered as a downstream process to transcription. We simply pass the output of a call + * to {@link Tika#parseToString(Path)} into {@link Translator#translate(String, String)}. The + * {@link GoogleTranslator} is configured with a target language of "en-US". * * @author lewismc */ public class TranscribeTranslateExample { /** - * Use {@link GoogleTranslator} to execute translation on - * input data. This implementation needs configured as explained in the Javadoc. - * In this implementation, Google will try to guess the input language. 
The target - * language is "en-US". + * Use {@link GoogleTranslator} to execute translation on input data. This implementation needs + * configured as explained in the Javadoc. In this implementation, Google will try to guess the + * input language. The target language is "en-US". * * @param text input text to translate. * @return translated text String. @@ -64,8 +56,7 @@ public static String googleTranslateToEnglish(String text) { /** * Use {@link org.apache.tika.parser.transcribe.aws.AmazonTranscribe} to execute transcription - * on input data. - * This implementation needs to be configured as explained in the Javadoc. + * on input data. This implementation needs to be configured as explained in the Javadoc. * * @param file the name of the file (which needs to be on the Java Classpath) to transcribe. * @return transcribed text. @@ -81,13 +72,14 @@ public static String amazonTranscribe(Path tikaConfig, Path file) throws Excepti * transcription then translation on the given resource, or *
  • transcribe ${tika-config.xml} ${file}; which executes only translation
  • * - * @param args either of the commands described above and the input file - * (which needs to be on the Java Classpath). - *

    - *

    - *

    - * ${tika-config.xml} must include credentials for aws and a temporary storage bucket: - *

    +     * @param args either of the commands described above and the input file (which needs to be on
    +     *        the Java Classpath).
    +     *        

    + *

    + *

    + * ${tika-config.xml} must include credentials for aws and a temporary storage bucket: + * + *

          *             {@code
          *              
          *               
    @@ -108,8 +100,10 @@ public static void main(String[] args) throws Exception {
             String text = null;
             if (args.length > 1) {
                 if ("transcribe-translate".equals(args[1])) {
    -                text = googleTranslateToEnglish(amazonTranscribe(Paths.get(args[0]), Paths.get(args[1])));
    -                System.out.print("Transcription and translation successful!\nEXTRACTED TEXT: " + text);
    +                text = googleTranslateToEnglish(
    +                                amazonTranscribe(Paths.get(args[0]), Paths.get(args[1])));
    +                System.out.print("Transcription and translation successful!\nEXTRACTED TEXT: "
    +                                + text);
                 } else if ("transcribe".equals(args[1])) {
                     text = amazonTranscribe(Paths.get(args[0]), Paths.get(args[1]));
                     System.out.print("Transcription successful!\nEXTRACTED TEXT: " + text);
    diff --git a/tika-example/src/main/java/org/apache/tika/example/TranslatorExample.java b/tika-example/src/main/java/org/apache/tika/example/TranslatorExample.java
    index 4b0a24cd02..b0ca15fae9 100644
    --- a/tika-example/src/main/java/org/apache/tika/example/TranslatorExample.java
    +++ b/tika-example/src/main/java/org/apache/tika/example/TranslatorExample.java
    @@ -1,18 +1,16 @@
     /*
    - * Licensed to the Apache Software Foundation (ASF) under one or more
    - * contributor license agreements.  See the NOTICE file distributed with
    - * this work for additional information regarding copyright ownership.
    - * The ASF licenses this file to You under the Apache License, Version 2.0
    - * (the "License"); you may not use this file except in compliance with
    - * the License.  You may obtain a copy of the License at
    + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license
    + * agreements. See the NOTICE file distributed with this work for additional information regarding
    + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the
    + * "License"); you may not use this file except in compliance with the License. You may obtain a
    + * copy of the License at
      *
    - *     http://www.apache.org/licenses/LICENSE-2.0
    + * http://www.apache.org/licenses/LICENSE-2.0
      *
    - * Unless required by applicable law or agreed to in writing, software
    - * distributed under the License is distributed on an "AS IS" BASIS,
    - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    - * See the License for the specific language governing permissions and
    - * limitations under the License.
    + * Unless required by applicable law or agreed to in writing, software distributed under the License
    + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
    + * or implied. See the License for the specific language governing permissions and limitations under
    + * the License.
      */
     
     package org.apache.tika.example;
    diff --git a/tika-example/src/main/java/org/apache/tika/example/TrecDocumentGenerator.java b/tika-example/src/main/java/org/apache/tika/example/TrecDocumentGenerator.java
    index ef2a4c5088..351525fb24 100755
    --- a/tika-example/src/main/java/org/apache/tika/example/TrecDocumentGenerator.java
    +++ b/tika-example/src/main/java/org/apache/tika/example/TrecDocumentGenerator.java
    @@ -1,18 +1,16 @@
     /*
    - * Licensed to the Apache Software Foundation (ASF) under one or more
    - * contributor license agreements.  See the NOTICE file distributed with
    - * this work for additional information regarding copyright ownership.
    - * The ASF licenses this file to You under the Apache License, Version 2.0
    - * (the "License"); you may not use this file except in compliance with
    - * the License.  You may obtain a copy of the License at
    + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license
    + * agreements. See the NOTICE file distributed with this work for additional information regarding
    + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the
    + * "License"); you may not use this file except in compliance with the License. You may obtain a
    + * copy of the License at
      *
    - *     http://www.apache.org/licenses/LICENSE-2.0
    + * http://www.apache.org/licenses/LICENSE-2.0
      *
    - * Unless required by applicable law or agreed to in writing, software
    - * distributed under the License is distributed on an "AS IS" BASIS,
    - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    - * See the License for the specific language governing permissions and
    - * limitations under the License.
    + * Unless required by applicable law or agreed to in writing, software distributed under the License
    + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
    + * or implied. See the License for the specific language governing permissions and limitations under
    + * the License.
      */
     
     package org.apache.tika.example;
    @@ -29,17 +27,18 @@
     import org.apache.tika.metadata.TikaCoreProperties;
     
     /**
    - * Generates document summaries for corpus analysis in the Open Relevance
    - * project.
    + * Generates document summaries for corpus analysis in the Open Relevance project.
      */
     @SuppressWarnings("deprecation")
     public class TrecDocumentGenerator {
    -    public TrecDocument summarize(File file) throws FileNotFoundException, IOException, TikaException {
    +    public TrecDocument summarize(File file)
    +                    throws FileNotFoundException, IOException, TikaException {
             Tika tika = new Tika();
             Metadata met = new Metadata();
     
             String contents = tika.parseToString(new FileInputStream(file), met);
    -        return new TrecDocument(met.get(TikaCoreProperties.RESOURCE_NAME_KEY), contents, met.getDate(TikaCoreProperties.CREATED));
    +        return new TrecDocument(met.get(TikaCoreProperties.RESOURCE_NAME_KEY), contents,
    +                        met.getDate(TikaCoreProperties.CREATED));
     
         }
     
    @@ -58,8 +57,7 @@ public TrecDocument(CharSequence docname, CharSequence body, Date date) {
                 this.date = date;
             }
     
    -        public TrecDocument() {
    -        }
    +        public TrecDocument() {}
     
             /**
              * @return the docname
    diff --git a/tika-example/src/main/java/org/apache/tika/example/ZipListFiles.java b/tika-example/src/main/java/org/apache/tika/example/ZipListFiles.java
    index 460fef0075..57b2a040c5 100755
    --- a/tika-example/src/main/java/org/apache/tika/example/ZipListFiles.java
    +++ b/tika-example/src/main/java/org/apache/tika/example/ZipListFiles.java
    @@ -1,18 +1,16 @@
     /*
    - * Licensed to the Apache Software Foundation (ASF) under one or more
    - * contributor license agreements.  See the NOTICE file distributed with
    - * this work for additional information regarding copyright ownership.
    - * The ASF licenses this file to You under the Apache License, Version 2.0
    - * (the "License"); you may not use this file except in compliance with
    - * the License.  You may obtain a copy of the License at
    + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license
    + * agreements. See the NOTICE file distributed with this work for additional information regarding
    + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the
    + * "License"); you may not use this file except in compliance with the License. You may obtain a
    + * copy of the License at
      *
    - *     http://www.apache.org/licenses/LICENSE-2.0
    + * http://www.apache.org/licenses/LICENSE-2.0
      *
    - * Unless required by applicable law or agreed to in writing, software
    - * distributed under the License is distributed on an "AS IS" BASIS,
    - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    - * See the License for the specific language governing permissions and
    - * limitations under the License.
    + * Unless required by applicable law or agreed to in writing, software distributed under the License
    + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
    + * or implied. See the License for the specific language governing permissions and limitations under
    + * the License.
      */
     
     package org.apache.tika.example;
    @@ -23,8 +21,7 @@
     import java.util.zip.ZipFile;
     
     /**
    - * Example code listing from Chapter 1. Lists a zip file's entries using JDK's
    - * standard APIs.
    + * Example code listing from Chapter 1. Lists a zip file's entries using JDK's standard APIs.
      */
     public class ZipListFiles {
         public static void main(String[] args) throws Exception {
    diff --git a/tika-example/src/test/java/org/apache/tika/example/AdvancedTypeDetectorTest.java b/tika-example/src/test/java/org/apache/tika/example/AdvancedTypeDetectorTest.java
    index 4f28f90d39..df6574a1f8 100755
    --- a/tika-example/src/test/java/org/apache/tika/example/AdvancedTypeDetectorTest.java
    +++ b/tika-example/src/test/java/org/apache/tika/example/AdvancedTypeDetectorTest.java
    @@ -1,18 +1,16 @@
     /*
    - * Licensed to the Apache Software Foundation (ASF) under one or more
    - * contributor license agreements.  See the NOTICE file distributed with
    - * this work for additional information regarding copyright ownership.
    - * The ASF licenses this file to You under the Apache License, Version 2.0
    - * (the "License"); you may not use this file except in compliance with
    - * the License.  You may obtain a copy of the License at
    + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license
    + * agreements. See the NOTICE file distributed with this work for additional information regarding
    + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the
    + * "License"); you may not use this file except in compliance with the License. You may obtain a
    + * copy of the License at
      *
    - *     http://www.apache.org/licenses/LICENSE-2.0
    + * http://www.apache.org/licenses/LICENSE-2.0
      *
    - * Unless required by applicable law or agreed to in writing, software
    - * distributed under the License is distributed on an "AS IS" BASIS,
    - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    - * See the License for the specific language governing permissions and
    - * limitations under the License.
    + * Unless required by applicable law or agreed to in writing, software distributed under the License
    + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
    + * or implied. See the License for the specific language governing permissions and limitations under
    + * the License.
      */
     
     package org.apache.tika.example;
    diff --git a/tika-example/src/test/java/org/apache/tika/example/ContentHandlerExampleTest.java b/tika-example/src/test/java/org/apache/tika/example/ContentHandlerExampleTest.java
    index 2bf625c9fd..c9ca4d3729 100644
    --- a/tika-example/src/test/java/org/apache/tika/example/ContentHandlerExampleTest.java
    +++ b/tika-example/src/test/java/org/apache/tika/example/ContentHandlerExampleTest.java
    @@ -1,18 +1,16 @@
     /*
    - * Licensed to the Apache Software Foundation (ASF) under one or more
    - * contributor license agreements.  See the NOTICE file distributed with
    - * this work for additional information regarding copyright ownership.
    - * The ASF licenses this file to You under the Apache License, Version 2.0
    - * (the "License"); you may not use this file except in compliance with
    - * the License.  You may obtain a copy of the License at
    + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license
    + * agreements. See the NOTICE file distributed with this work for additional information regarding
    + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the
    + * "License"); you may not use this file except in compliance with the License. You may obtain a
    + * copy of the License at
      *
    - *     http://www.apache.org/licenses/LICENSE-2.0
    + * http://www.apache.org/licenses/LICENSE-2.0
      *
    - * Unless required by applicable law or agreed to in writing, software
    - * distributed under the License is distributed on an "AS IS" BASIS,
    - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    - * See the License for the specific language governing permissions and
    - * limitations under the License.
    + * Unless required by applicable law or agreed to in writing, software distributed under the License
    + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
    + * or implied. See the License for the specific language governing permissions and limitations under
    + * the License.
      */
     
     package org.apache.tika.example;
    @@ -25,12 +23,11 @@
     import java.io.IOException;
     import java.util.List;
     
    +import org.apache.tika.exception.TikaException;
     import org.junit.jupiter.api.BeforeEach;
     import org.junit.jupiter.api.Test;
     import org.xml.sax.SAXException;
     
    -import org.apache.tika.exception.TikaException;
    -
     public class ContentHandlerExampleTest {
         ContentHandlerExample example;
     
    @@ -41,17 +38,13 @@ public void setUp() {
     
         @Test
         public void testParseToPlainText() throws IOException, SAXException, TikaException {
    -        String result = example
    -                .parseToPlainText()
    -                .trim();
    +        String result = example.parseToPlainText().trim();
             assertEquals("test", result, "Expected 'test', but got '" + result + "'");
         }
     
         @Test
         public void testParseToHTML() throws IOException, SAXException, TikaException {
    -        String result = example
    -                .parseToHTML()
    -                .trim();
    +        String result = example.parseToHTML().trim();
     
             assertContains("", result);
    @@ -63,9 +56,7 @@ public void testParseToHTML() throws IOException, SAXException, TikaException {
     
         @Test
         public void testParseBodyToHTML() throws IOException, SAXException, TikaException {
    -        String result = example
    -                .parseBodyToHTML()
    -                .trim();
    +        String result = example.parseBodyToHTML().trim();
     
             assertNotContained("", result);
    @@ -77,9 +68,7 @@ public void testParseBodyToHTML() throws IOException, SAXException, TikaExceptio
     
         @Test
         public void testParseOnePartToHTML() throws IOException, SAXException, TikaException {
    -        String result = example
    -                .parseOnePartToHTML()
    -                .trim();
    +        String result = example.parseOnePartToHTML().trim();
     
             assertNotContained("", result);
    diff --git a/tika-example/src/test/java/org/apache/tika/example/DumpTikaConfigExampleTest.java b/tika-example/src/test/java/org/apache/tika/example/DumpTikaConfigExampleTest.java
    index 8e41ae49f9..b4f513111d 100644
    --- a/tika-example/src/test/java/org/apache/tika/example/DumpTikaConfigExampleTest.java
    +++ b/tika-example/src/test/java/org/apache/tika/example/DumpTikaConfigExampleTest.java
    @@ -1,18 +1,16 @@
     /*
    - * Licensed to the Apache Software Foundation (ASF) under one or more
    - * contributor license agreements.  See the NOTICE file distributed with
    - * this work for additional information regarding copyright ownership.
    - * The ASF licenses this file to You under the Apache License, Version 2.0
    - * (the "License"); you may not use this file except in compliance with
    - * the License.  You may obtain a copy of the License at
    + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license
    + * agreements. See the NOTICE file distributed with this work for additional information regarding
    + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the
    + * "License"); you may not use this file except in compliance with the License. You may obtain a
    + * copy of the License at
      *
    - *     http://www.apache.org/licenses/LICENSE-2.0
    + * http://www.apache.org/licenses/LICENSE-2.0
      *
    - * Unless required by applicable law or agreed to in writing, software
    - * distributed under the License is distributed on an "AS IS" BASIS,
    - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    - * See the License for the specific language governing permissions and
    - * limitations under the License.
    + * Unless required by applicable law or agreed to in writing, software distributed under the License
    + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
    + * or implied. See the License for the specific language governing permissions and limitations under
    + * the License.
      */
     package org.apache.tika.example;
     
    @@ -29,16 +27,15 @@
     import java.nio.charset.Charset;
     import java.nio.file.Files;
     
    -import org.junit.jupiter.api.AfterEach;
    -import org.junit.jupiter.api.BeforeEach;
    -import org.junit.jupiter.api.Test;
    -
     import org.apache.tika.config.TikaConfig;
     import org.apache.tika.config.TikaConfigSerializer;
     import org.apache.tika.detect.CompositeDetector;
     import org.apache.tika.parser.AutoDetectParser;
     import org.apache.tika.parser.CompositeParser;
     import org.apache.tika.parser.Parser;
    +import org.junit.jupiter.api.AfterEach;
    +import org.junit.jupiter.api.BeforeEach;
    +import org.junit.jupiter.api.Test;
     
     public class DumpTikaConfigExampleTest {
         private File configFile;
    @@ -46,9 +43,7 @@ public class DumpTikaConfigExampleTest {
         @BeforeEach
         public void setUp() {
             try {
    -            configFile = Files
    -                    .createTempFile("tmp", ".xml")
    -                    .toFile();
    +            configFile = Files.createTempFile("tmp", ".xml").toFile();
             } catch (IOException e) {
                 throw new RuntimeException("Failed to create tmp file");
             }
    @@ -67,32 +62,26 @@ public void tearDown() {
         @Test
         public void testDump() throws Exception {
             DumpTikaConfigExample ex = new DumpTikaConfigExample();
    -        for (Charset charset : new Charset[]{UTF_8, UTF_16LE}) {
    +        for (Charset charset : new Charset[] {UTF_8, UTF_16LE}) {
                 for (TikaConfigSerializer.Mode mode : TikaConfigSerializer.Mode.values()) {
                     Writer writer = new OutputStreamWriter(new FileOutputStream(configFile), charset);
    -                TikaConfigSerializer.serialize(TikaConfig.getDefaultConfig(), mode, writer, charset);
    +                TikaConfigSerializer.serialize(TikaConfig.getDefaultConfig(), mode, writer,
    +                                charset);
                     writer.flush();
                     writer.close();
     
                     TikaConfig c = new TikaConfig(configFile);
    -                assertTrue(c.getParser() instanceof CompositeParser, c
    -                        .getParser()
    -                        .toString());
    -                assertTrue(c.getDetector() instanceof CompositeDetector, c
    -                        .getDetector()
    -                        .toString());
    +                assertTrue(c.getParser() instanceof CompositeParser, c.getParser().toString());
    +                assertTrue(c.getDetector() instanceof CompositeDetector,
    +                                c.getDetector().toString());
     
                     CompositeParser p = (CompositeParser) c.getParser();
    -                assertTrue(p
    -                        .getParsers()
    -                        .size() > 130, "enough parsers?");
    +                assertTrue(p.getParsers().size() > 130, "enough parsers?");
     
                     CompositeDetector d = (CompositeDetector) c.getDetector();
    -                assertTrue(d
    -                        .getDetectors()
    -                        .size() > 3, "enough detectors?");
    +                assertTrue(d.getDetectors().size() > 3, "enough detectors?");
     
    -                //just try to load it into autodetect to make sure no errors are thrown
    +                // just try to load it into autodetect to make sure no errors are thrown
                     Parser auto = new AutoDetectParser(c);
                     assertNotNull(auto);
                 }
    diff --git a/tika-example/src/test/java/org/apache/tika/example/ExtractEmbeddedFilesTest.java b/tika-example/src/test/java/org/apache/tika/example/ExtractEmbeddedFilesTest.java
    index b4aec14843..4872ffc29e 100644
    --- a/tika-example/src/test/java/org/apache/tika/example/ExtractEmbeddedFilesTest.java
    +++ b/tika-example/src/test/java/org/apache/tika/example/ExtractEmbeddedFilesTest.java
    @@ -1,18 +1,16 @@
     /*
    - * Licensed to the Apache Software Foundation (ASF) under one or more
    - * contributor license agreements.  See the NOTICE file distributed with
    - * this work for additional information regarding copyright ownership.
    - * The ASF licenses this file to You under the Apache License, Version 2.0
    - * (the "License"); you may not use this file except in compliance with
    - * the License.  You may obtain a copy of the License at
    + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license
    + * agreements. See the NOTICE file distributed with this work for additional information regarding
    + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the
    + * "License"); you may not use this file except in compliance with the License. You may obtain a
    + * copy of the License at
      *
    - *     http://www.apache.org/licenses/LICENSE-2.0
    + * http://www.apache.org/licenses/LICENSE-2.0
      *
    - * Unless required by applicable law or agreed to in writing, software
    - * distributed under the License is distributed on an "AS IS" BASIS,
    - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    - * See the License for the specific language governing permissions and
    - * limitations under the License.
    + * Unless required by applicable law or agreed to in writing, software distributed under the License
    + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
    + * or implied. See the License for the specific language governing permissions and limitations under
    + * the License.
      */
     
     package org.apache.tika.example;
    @@ -42,7 +40,7 @@ public void setUp() throws IOException {
     
         @AfterEach
         public void tearDown() throws IOException {
    -        //this does not act recursively, this only assumes single level directory
    +        // this does not act recursively, this only assumes single level directory
             try (DirectoryStream dirStream = Files.newDirectoryStream(outputPath)) {
                 for (Path entry : dirStream) {
                     Files.delete(entry);
    @@ -55,7 +53,7 @@ public void tearDown() throws IOException {
         @Test
         public void testExtractEmbeddedFiles() throws Exception {
             List extracted = parsingExample.extractEmbeddedDocumentsExample(outputPath);
    -        //this number should be bigger!!!
    +        // this number should be bigger!!!
             assertEquals(2, extracted.size());
         }
     
    diff --git a/tika-example/src/test/java/org/apache/tika/example/LanguageDetectorExampleTest.java b/tika-example/src/test/java/org/apache/tika/example/LanguageDetectorExampleTest.java
    index af06551998..d08f4987e9 100644
    --- a/tika-example/src/test/java/org/apache/tika/example/LanguageDetectorExampleTest.java
    +++ b/tika-example/src/test/java/org/apache/tika/example/LanguageDetectorExampleTest.java
    @@ -1,18 +1,16 @@
     /*
    - * Licensed to the Apache Software Foundation (ASF) under one or more
    - * contributor license agreements.  See the NOTICE file distributed with
    - * this work for additional information regarding copyright ownership.
    - * The ASF licenses this file to You under the Apache License, Version 2.0
    - * (the "License"); you may not use this file except in compliance with
    - * the License.  You may obtain a copy of the License at
    + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license
    + * agreements. See the NOTICE file distributed with this work for additional information regarding
    + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the
    + * "License"); you may not use this file except in compliance with the License. You may obtain a
    + * copy of the License at
      *
    - *     http://www.apache.org/licenses/LICENSE-2.0
    + * http://www.apache.org/licenses/LICENSE-2.0
      *
    - * Unless required by applicable law or agreed to in writing, software
    - * distributed under the License is distributed on an "AS IS" BASIS,
    - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    - * See the License for the specific language governing permissions and
    - * limitations under the License.
    + * Unless required by applicable law or agreed to in writing, software distributed under the License
    + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
    + * or implied. See the License for the specific language governing permissions and limitations under
    + * the License.
      */
     
     package org.apache.tika.example;
    diff --git a/tika-example/src/test/java/org/apache/tika/example/SimpleTextExtractorTest.java b/tika-example/src/test/java/org/apache/tika/example/SimpleTextExtractorTest.java
    index 84be548d4e..824c4581ae 100755
    --- a/tika-example/src/test/java/org/apache/tika/example/SimpleTextExtractorTest.java
    +++ b/tika-example/src/test/java/org/apache/tika/example/SimpleTextExtractorTest.java
    @@ -1,18 +1,16 @@
     /*
    - * Licensed to the Apache Software Foundation (ASF) under one or more
    - * contributor license agreements.  See the NOTICE file distributed with
    - * this work for additional information regarding copyright ownership.
    - * The ASF licenses this file to You under the Apache License, Version 2.0
    - * (the "License"); you may not use this file except in compliance with
    - * the License.  You may obtain a copy of the License at
    + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license
    + * agreements. See the NOTICE file distributed with this work for additional information regarding
    + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the
    + * "License"); you may not use this file except in compliance with the License. You may obtain a
    + * copy of the License at
      *
    - *     http://www.apache.org/licenses/LICENSE-2.0
    + * http://www.apache.org/licenses/LICENSE-2.0
      *
    - * Unless required by applicable law or agreed to in writing, software
    - * distributed under the License is distributed on an "AS IS" BASIS,
    - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    - * See the License for the specific language governing permissions and
    - * limitations under the License.
    + * Unless required by applicable law or agreed to in writing, software distributed under the License
    + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
    + * or implied. See the License for the specific language governing permissions and limitations under
    + * the License.
      */
     
     package org.apache.tika.example;
    @@ -24,19 +22,19 @@
     import java.io.PrintStream;
     
     import org.apache.commons.io.FileUtils;
    +import org.apache.tika.TikaTest;
     import org.junit.jupiter.api.Test;
     import org.junit.jupiter.api.parallel.Isolated;
     
    -import org.apache.tika.TikaTest;
    -
     // because System is caught
     // https://junit.org/junit5/docs/snapshot/user-guide/#writing-tests-parallel-execution-synchronization
     @Isolated
     public class SimpleTextExtractorTest extends TikaTest {
         @Test
         public void testSimpleTextExtractor() throws Exception {
    -        String message =
    -                "This is Tika - Hello, World! This is simple UTF-8 text" + " content written in English to test autodetection of" + " the character encoding of the input stream.";
    +        String message = "This is Tika - Hello, World! This is simple UTF-8 text"
    +                        + " content written in English to test autodetection of"
    +                        + " the character encoding of the input stream.";
             ByteArrayOutputStream buffer = new ByteArrayOutputStream();
     
             PrintStream out = System.out;
    @@ -44,13 +42,11 @@ public void testSimpleTextExtractor() throws Exception {
     
             File file = new File("target", "test.txt");
             FileUtils.writeStringToFile(file, message, UTF_8);
    -        SimpleTextExtractor.main(new String[]{file.getPath()});
    +        SimpleTextExtractor.main(new String[] {file.getPath()});
             file.delete();
     
             System.setOut(out);
     
    -        assertContains(message, buffer
    -                .toString(UTF_8.name())
    -                .trim());
    +        assertContains(message, buffer.toString(UTF_8.name()).trim());
         }
     }
    diff --git a/tika-example/src/test/java/org/apache/tika/example/SimpleTypeDetectorTest.java b/tika-example/src/test/java/org/apache/tika/example/SimpleTypeDetectorTest.java
    index b7d7c90669..87a988dab1 100755
    --- a/tika-example/src/test/java/org/apache/tika/example/SimpleTypeDetectorTest.java
    +++ b/tika-example/src/test/java/org/apache/tika/example/SimpleTypeDetectorTest.java
    @@ -1,18 +1,16 @@
     /*
    - * Licensed to the Apache Software Foundation (ASF) under one or more
    - * contributor license agreements.  See the NOTICE file distributed with
    - * this work for additional information regarding copyright ownership.
    - * The ASF licenses this file to You under the Apache License, Version 2.0
    - * (the "License"); you may not use this file except in compliance with
    - * the License.  You may obtain a copy of the License at
    + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license
    + * agreements. See the NOTICE file distributed with this work for additional information regarding
    + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the
    + * "License"); you may not use this file except in compliance with the License. You may obtain a
    + * copy of the License at
      *
    - *     http://www.apache.org/licenses/LICENSE-2.0
    + * http://www.apache.org/licenses/LICENSE-2.0
      *
    - * Unless required by applicable law or agreed to in writing, software
    - * distributed under the License is distributed on an "AS IS" BASIS,
    - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    - * See the License for the specific language governing permissions and
    - * limitations under the License.
    + * Unless required by applicable law or agreed to in writing, software distributed under the License
    + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
    + * or implied. See the License for the specific language governing permissions and limitations under
    + * the License.
      */
     
     package org.apache.tika.example;
    @@ -22,9 +20,8 @@
     import java.io.ByteArrayOutputStream;
     import java.io.PrintStream;
     
    -import org.junit.jupiter.api.Test;
    -
     import org.apache.tika.TikaTest;
    +import org.junit.jupiter.api.Test;
     
     @SuppressWarnings("deprecation")
     public class SimpleTypeDetectorTest extends TikaTest {
    @@ -36,13 +33,11 @@ public void testSimpleTypeDetector() throws Exception {
             PrintStream out = System.out;
             System.setOut(new PrintStream(buffer, true, UTF_8.name()));
     
    -        SimpleTypeDetector.main(new String[]{"pom.xml"});
    +        SimpleTypeDetector.main(new String[] {"pom.xml"});
     
             System.setOut(out);
     
    -        assertContains("pom.xml: application/xml", buffer
    -                .toString(UTF_8.name())
    -                .trim());
    +        assertContains("pom.xml: application/xml", buffer.toString(UTF_8.name()).trim());
         }
     
     }
    diff --git a/tika-example/src/test/java/org/apache/tika/example/TestParsingExample.java b/tika-example/src/test/java/org/apache/tika/example/TestParsingExample.java
    index 20feb8b370..06ddd64fbc 100644
    --- a/tika-example/src/test/java/org/apache/tika/example/TestParsingExample.java
    +++ b/tika-example/src/test/java/org/apache/tika/example/TestParsingExample.java
    @@ -1,18 +1,16 @@
     /*
    - * Licensed to the Apache Software Foundation (ASF) under one or more
    - * contributor license agreements.  See the NOTICE file distributed with
    - * this work for additional information regarding copyright ownership.
    - * The ASF licenses this file to You under the Apache License, Version 2.0
    - * (the "License"); you may not use this file except in compliance with
    - * the License.  You may obtain a copy of the License at
    + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license
    + * agreements. See the NOTICE file distributed with this work for additional information regarding
    + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the
    + * "License"); you may not use this file except in compliance with the License. You may obtain a
    + * copy of the License at
      *
    - *     http://www.apache.org/licenses/LICENSE-2.0
    + * http://www.apache.org/licenses/LICENSE-2.0
      *
    - * Unless required by applicable law or agreed to in writing, software
    - * distributed under the License is distributed on an "AS IS" BASIS,
    - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    - * See the License for the specific language governing permissions and
    - * limitations under the License.
    + * Unless required by applicable law or agreed to in writing, software distributed under the License
    + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
    + * or implied. See the License for the specific language governing permissions and limitations under
    + * the License.
      */
     
     package org.apache.tika.example;
    @@ -24,14 +22,13 @@
     import java.io.StringReader;
     import java.util.List;
     
    -import org.junit.jupiter.api.BeforeEach;
    -import org.junit.jupiter.api.Test;
    -import org.xml.sax.SAXException;
    -
     import org.apache.tika.TikaTest;
     import org.apache.tika.exception.TikaException;
     import org.apache.tika.metadata.Metadata;
     import org.apache.tika.serialization.JsonMetadataList;
    +import org.junit.jupiter.api.BeforeEach;
    +import org.junit.jupiter.api.Test;
    +import org.xml.sax.SAXException;
     
     public class TestParsingExample extends TikaTest {
         ParsingExample parsingExample;
    @@ -43,17 +40,13 @@ public void setUp() {
     
         @Test
         public void testParseToStringExample() throws IOException, SAXException, TikaException {
    -        String result = parsingExample
    -                .parseToStringExample()
    -                .trim();
    +        String result = parsingExample.parseToStringExample().trim();
             assertEquals("test", result, "enough detectors?");
         }
     
         @Test
         public void testParseExample() throws IOException, SAXException, TikaException {
    -        String result = parsingExample
    -                .parseExample()
    -                .trim();
    +        String result = parsingExample.parseExample().trim();
             assertEquals("test", result, "Expected 'test', but got '" + result + "'");
         }
     
    @@ -76,21 +69,25 @@ public void testRecursiveParseExample() throws IOException, SAXException, TikaEx
         }
     
         @Test
    -    public void testRecursiveParserWrapperExample() throws IOException, SAXException, TikaException {
    +    public void testRecursiveParserWrapperExample()
    +                    throws IOException, SAXException, TikaException {
             List metadataList = parsingExample.recursiveParserWrapperExample();
    -        assertEquals(12, metadataList.size(), "Number of embedded documents + 1 for the container document");
    +        assertEquals(12, metadataList.size(),
    +                        "Number of embedded documents + 1 for the container document");
             Metadata m = metadataList.get(6);
    -        //this is the location the embed3.txt text file within the outer .docx
    -        assertEquals("/embed1.zip/embed2.zip/embed3.zip/embed3.txt", m.get("X-TIKA:embedded_resource_path"));
    -        //it contains some html encoded content
    +        // this is the location of the embed3.txt text file within the outer .docx
    +        assertEquals("/embed1.zip/embed2.zip/embed3.zip/embed3.txt",
    +                        m.get("X-TIKA:embedded_resource_path"));
    +        // it contains some html encoded content
             assertContains("When in the Course", m.get("X-TIKA:content"));
         }
     
         @Test
    -    public void testSerializedRecursiveParserWrapperExample() throws IOException, SAXException, TikaException {
    +    public void testSerializedRecursiveParserWrapperExample()
    +                    throws IOException, SAXException, TikaException {
             String json = parsingExample.serializedRecursiveParserWrapperExample();
             assertTrue(json.contains("When in the Course"));
    -        //now try deserializing the JSON
    +        // now try deserializing the JSON
             List metadataList = JsonMetadataList.fromJson(new StringReader(json));
             assertEquals(12, metadataList.size());
         }
    diff --git a/tika-example/src/test/java/org/apache/tika/example/TextStatsFromTikaEvalTest.java b/tika-example/src/test/java/org/apache/tika/example/TextStatsFromTikaEvalTest.java
    index 30f52ce5e0..0bcb9f4cd8 100644
    --- a/tika-example/src/test/java/org/apache/tika/example/TextStatsFromTikaEvalTest.java
    +++ b/tika-example/src/test/java/org/apache/tika/example/TextStatsFromTikaEvalTest.java
    @@ -1,18 +1,16 @@
     /*
    - * Licensed to the Apache Software Foundation (ASF) under one or more
    - * contributor license agreements.  See the NOTICE file distributed with
    - * this work for additional information regarding copyright ownership.
    - * The ASF licenses this file to You under the Apache License, Version 2.0
    - * (the "License"); you may not use this file except in compliance with
    - * the License.  You may obtain a copy of the License at
    + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license
    + * agreements. See the NOTICE file distributed with this work for additional information regarding
    + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the
    + * "License"); you may not use this file except in compliance with the License. You may obtain a
    + * copy of the License at
      *
    - *     http://www.apache.org/licenses/LICENSE-2.0
    + * http://www.apache.org/licenses/LICENSE-2.0
      *
    - * Unless required by applicable law or agreed to in writing, software
    - * distributed under the License is distributed on an "AS IS" BASIS,
    - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    - * See the License for the specific language governing permissions and
    - * limitations under the License.
    + * Unless required by applicable law or agreed to in writing, software distributed under the License
    + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
    + * or implied. See the License for the specific language governing permissions and limitations under
    + * the License.
      */
     package org.apache.tika.example;
     
    diff --git a/tika-example/src/test/java/org/apache/tika/example/TranslatorExampleTest.java b/tika-example/src/test/java/org/apache/tika/example/TranslatorExampleTest.java
    index 4ca556c320..e3b6c3cacb 100644
    --- a/tika-example/src/test/java/org/apache/tika/example/TranslatorExampleTest.java
    +++ b/tika-example/src/test/java/org/apache/tika/example/TranslatorExampleTest.java
    @@ -1,18 +1,16 @@
     /*
    - * Licensed to the Apache Software Foundation (ASF) under one or more
    - * contributor license agreements.  See the NOTICE file distributed with
    - * this work for additional information regarding copyright ownership.
    - * The ASF licenses this file to You under the Apache License, Version 2.0
    - * (the "License"); you may not use this file except in compliance with
    - * the License.  You may obtain a copy of the License at
    + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license
    + * agreements. See the NOTICE file distributed with this work for additional information regarding
    + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the
    + * "License"); you may not use this file except in compliance with the License. You may obtain a
    + * copy of the License at
      *
    - *     http://www.apache.org/licenses/LICENSE-2.0
    + * http://www.apache.org/licenses/LICENSE-2.0
      *
    - * Unless required by applicable law or agreed to in writing, software
    - * distributed under the License is distributed on an "AS IS" BASIS,
    - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    - * See the License for the specific language governing permissions and
    - * limitations under the License.
    + * Unless required by applicable law or agreed to in writing, software distributed under the License
    + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
    + * or implied. See the License for the specific language governing permissions and limitations under
    + * the License.
      */
     
     package org.apache.tika.example;
    diff --git a/tika-grpc/src/main/java/org/apache/tika/pipes/grpc/ExpiringFetcherStore.java b/tika-grpc/src/main/java/org/apache/tika/pipes/grpc/ExpiringFetcherStore.java
    index 3dc10f09c6..110efac847 100644
    --- a/tika-grpc/src/main/java/org/apache/tika/pipes/grpc/ExpiringFetcherStore.java
    +++ b/tika-grpc/src/main/java/org/apache/tika/pipes/grpc/ExpiringFetcherStore.java
    @@ -1,18 +1,16 @@
     /*
    - * Licensed to the Apache Software Foundation (ASF) under one or more
    - * contributor license agreements.  See the NOTICE file distributed with
    - * this work for additional information regarding copyright ownership.
    - * The ASF licenses this file to You under the Apache License, Version 2.0
    - * (the "License"); you may not use this file except in compliance with
    - * the License.  You may obtain a copy of the License at
    + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license
    + * agreements. See the NOTICE file distributed with this work for additional information regarding
    + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the
    + * "License"); you may not use this file except in compliance with the License. You may obtain a
    + * copy of the License at
      *
    - *     http://www.apache.org/licenses/LICENSE-2.0
    + * http://www.apache.org/licenses/LICENSE-2.0
      *
    - * Unless required by applicable law or agreed to in writing, software
    - * distributed under the License is distributed on an "AS IS" BASIS,
    - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    - * See the License for the specific language governing permissions and
    - * limitations under the License.
    + * Unless required by applicable law or agreed to in writing, software distributed under the License
    + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
    + * or implied. See the License for the specific language governing permissions and limitations under
    + * the License.
      */
     package org.apache.tika.pipes.grpc;
     
    @@ -25,21 +23,23 @@
     import java.util.concurrent.Executors;
     import java.util.concurrent.ScheduledExecutorService;
     import java.util.concurrent.TimeUnit;
    -
    -import org.slf4j.Logger;
    -import org.slf4j.LoggerFactory;
    -
     import org.apache.tika.pipes.core.fetcher.AbstractFetcher;
     import org.apache.tika.pipes.core.fetcher.config.AbstractConfig;
    +import org.slf4j.Logger;
    +import org.slf4j.LoggerFactory;
     
     public class ExpiringFetcherStore implements AutoCloseable {
         private static final Logger LOG = LoggerFactory.getLogger(ExpiringFetcherStore.class);
         public static final long EXPIRE_JOB_INITIAL_DELAY = 1L;
    -    private final Map fetchers = Collections.synchronizedMap(new HashMap<>());
    -    private final Map fetcherConfigs = Collections.synchronizedMap(new HashMap<>());
    -    private final Map fetcherLastAccessed = Collections.synchronizedMap(new HashMap<>());
    +    private final Map fetchers =
    +                    Collections.synchronizedMap(new HashMap<>());
    +    private final Map fetcherConfigs =
    +                    Collections.synchronizedMap(new HashMap<>());
    +    private final Map fetcherLastAccessed =
    +                    Collections.synchronizedMap(new HashMap<>());
     
    -    private final ScheduledExecutorService executorService = Executors.newSingleThreadScheduledExecutor();
    +    private final ScheduledExecutorService executorService =
    +                    Executors.newSingleThreadScheduledExecutor();
     
         public ExpiringFetcherStore(int expireAfterSeconds, int checkForExpiredFetchersDelaySeconds) {
             executorService.scheduleAtFixedRate(() -> {
    @@ -47,14 +47,13 @@ public ExpiringFetcherStore(int expireAfterSeconds, int checkForExpiredFetchersD
                 for (String fetcherName : fetchers.keySet()) {
                     Instant lastAccessed = fetcherLastAccessed.get(fetcherName);
                     if (lastAccessed == null) {
    -                    LOG.error("Detected a fetcher with no last access time. FetcherName={}", fetcherName);
    +                    LOG.error("Detected a fetcher with no last access time. FetcherName={}",
    +                                    fetcherName);
                         expired.add(fetcherName);
    -                } else if (Instant
    -                        .now()
    -                        .isAfter(lastAccessed.plusSeconds(expireAfterSeconds))) {
    -                    LOG.info("Detected stale fetcher {} hasn't been accessed in {} seconds. " + "Deleting.", fetcherName, Instant
    -                            .now()
    -                            .getEpochSecond() - lastAccessed.getEpochSecond());
    +                } else if (Instant.now().isAfter(lastAccessed.plusSeconds(expireAfterSeconds))) {
    +                    LOG.info("Detected stale fetcher {} hasn't been accessed in {} seconds. "
    +                                    + "Deleting.", fetcherName,
    +                                    Instant.now().getEpochSecond() - lastAccessed.getEpochSecond());
                         expired.add(fetcherName);
                     }
                 }
    @@ -80,15 +79,16 @@ public Map getFetcherConfigs() {
         }
     
         /**
    -     * This method will get the fetcher, but will also log the access the fetcher as having
    -     * been accessed. This prevents the scheduled job from removing the stale fetcher.
    +     * This method gets the fetcher and also records it as having been accessed. This
    +     * prevents the scheduled job from removing the stale fetcher.
          */
         public  T getFetcherAndLogAccess(String fetcherName) {
             fetcherLastAccessed.put(fetcherName, Instant.now());
             return (T) fetchers.get(fetcherName);
         }
     
    -    public  void createFetcher(T fetcher, C config) {
    +    public  void createFetcher(T fetcher,
    +                    C config) {
             fetchers.put(fetcher.getName(), fetcher);
             fetcherConfigs.put(fetcher.getName(), config);
             getFetcherAndLogAccess(fetcher.getName());
    diff --git a/tika-grpc/src/main/java/org/apache/tika/pipes/grpc/TikaGrpcServer.java b/tika-grpc/src/main/java/org/apache/tika/pipes/grpc/TikaGrpcServer.java
    index 70e8bcb917..2b687b2408 100644
    --- a/tika-grpc/src/main/java/org/apache/tika/pipes/grpc/TikaGrpcServer.java
    +++ b/tika-grpc/src/main/java/org/apache/tika/pipes/grpc/TikaGrpcServer.java
    @@ -1,29 +1,21 @@
     /*
    - * Licensed to the Apache Software Foundation (ASF) under one or more
    - * contributor license agreements.  See the NOTICE file distributed with
    - * this work for additional information regarding copyright ownership.
    - * The ASF licenses this file to You under the Apache License, Version 2.0
    - * (the "License"); you may not use this file except in compliance with
    - * the License.  You may obtain a copy of the License at
    + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license
    + * agreements. See the NOTICE file distributed with this work for additional information regarding
    + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the
    + * "License"); you may not use this file except in compliance with the License. You may obtain a
    + * copy of the License at
      *
    - *     http://www.apache.org/licenses/LICENSE-2.0
    + * http://www.apache.org/licenses/LICENSE-2.0
      *
    - * Unless required by applicable law or agreed to in writing, software
    - * distributed under the License is distributed on an "AS IS" BASIS,
    - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    - * See the License for the specific language governing permissions and
    - * limitations under the License.
    + * Unless required by applicable law or agreed to in writing, software distributed under the License
    + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
    + * or implied. See the License for the specific language governing permissions and limitations under
    + * the License.
      */
     package org.apache.tika.pipes.grpc;
     
     import static io.grpc.health.v1.HealthCheckResponse.ServingStatus;
     
    -import java.io.File;
    -import java.io.FileWriter;
    -import java.nio.charset.StandardCharsets;
    -import java.nio.file.Files;
    -import java.util.concurrent.TimeUnit;
    -
     import com.beust.jcommander.JCommander;
     import com.beust.jcommander.Parameter;
     import io.grpc.Grpc;
    @@ -33,11 +25,15 @@
     import io.grpc.TlsServerCredentials;
     import io.grpc.protobuf.services.HealthStatusManager;
     import io.grpc.protobuf.services.ProtoReflectionServiceV1;
    -import org.slf4j.Logger;
    -import org.slf4j.LoggerFactory;
    -
    +import java.io.File;
    +import java.io.FileWriter;
    +import java.nio.charset.StandardCharsets;
    +import java.nio.file.Files;
    +import java.util.concurrent.TimeUnit;
     import org.apache.tika.config.TikaConfig;
     import org.apache.tika.config.TikaConfigSerializer;
    +import org.slf4j.Logger;
    +import org.slf4j.LoggerFactory;
     
     /**
      * Server that manages startup/shutdown of the GRPC Tika server.
    @@ -52,19 +48,23 @@ public class TikaGrpcServer {
         @Parameter(names = {"-c", "--config"}, description = "The grpc server port", help = true)
         private File tikaConfigXml;
     
    -    @Parameter(names = {"-s", "--secure"}, description = "Enable credentials required to access this grpc server")
    +    @Parameter(names = {"-s", "--secure"},
    +                    description = "Enable credentials required to access this grpc server")
         private boolean secure;
     
    -    @Parameter(names = {"--cert-chain"}, description = "Certificate chain file. Example: server1.pem See: https://github.com/grpc/grpc-java/tree/b3ffb5078df361d7460786e134db7b5c00939246/examples/example-tls")
    +    @Parameter(names = {"--cert-chain"},
    +                    description = "Certificate chain file. Example: server1.pem See: https://github.com/grpc/grpc-java/tree/b3ffb5078df361d7460786e134db7b5c00939246/examples/example-tls")
         private File certChain;
     
    -    @Parameter(names = {"--private-key"}, description = "Private key store. Example: server1.key See: https://github.com/grpc/grpc-java/tree/b3ffb5078df361d7460786e134db7b5c00939246/examples/example-tls")
    +    @Parameter(names = {"--private-key"},
    +                    description = "Private key store. Example: server1.key See: https://github.com/grpc/grpc-java/tree/b3ffb5078df361d7460786e134db7b5c00939246/examples/example-tls")
         private File privateKey;
     
         @Parameter(names = {"--private-key-password"}, description = "Private key password, if needed")
         private String privateKeyPassword;
     
    -    @Parameter(names = {"--trust-cert-collection"}, description = "The trust certificate collection (root certs). Example: ca.pem See: https://github.com/grpc/grpc-java/tree/b3ffb5078df361d7460786e134db7b5c00939246/examples/example-tls")
    +    @Parameter(names = {"--trust-cert-collection"},
    +                    description = "The trust certificate collection (root certs). Example: ca.pem See: https://github.com/grpc/grpc-java/tree/b3ffb5078df361d7460786e134db7b5c00939246/examples/example-tls")
         private File trustCertCollection;
     
         @Parameter(names = {"--client-auth-required"}, description = "Is Mutual TLS required?")
    @@ -93,39 +93,33 @@ public void start() throws Exception {
                 // Create a default tika config
                 tikaConfigXml = Files.createTempFile("tika-config", ".xml").toFile();
                 try (FileWriter fw = new FileWriter(tikaConfigXml, StandardCharsets.UTF_8)) {
    -                TikaConfigSerializer.serialize(new TikaConfig(), TikaConfigSerializer.Mode.STATIC_FULL, fw, StandardCharsets.UTF_8);
    +                TikaConfigSerializer.serialize(new TikaConfig(),
    +                                TikaConfigSerializer.Mode.STATIC_FULL, fw, StandardCharsets.UTF_8);
                 }
             }
             File tikaConfigFile = new File(tikaConfigXml.getAbsolutePath());
             healthStatusManager.setStatus(TikaGrpcServer.class.getSimpleName(), ServingStatus.SERVING);
    -        server = Grpc
    -                .newServerBuilderForPort(port, creds)
    -                .addService(new TikaGrpcServerImpl(tikaConfigFile.getAbsolutePath()))
    -                .addService(healthStatusManager.getHealthService())
    -                .addService(ProtoReflectionServiceV1.newInstance())
    -                .build()
    -                .start();
    +        server = Grpc.newServerBuilderForPort(port, creds)
    +                        .addService(new TikaGrpcServerImpl(tikaConfigFile.getAbsolutePath()))
    +                        .addService(healthStatusManager.getHealthService())
    +                        .addService(ProtoReflectionServiceV1.newInstance()).build().start();
             LOGGER.info("Server started, listening on " + port);
    -        Runtime
    -                .getRuntime()
    -                .addShutdownHook(new Thread(() -> {
    -                    // Use stderr here since the logger may have been reset by its JVM shutdown hook.
    -                    System.err.println("*** shutting down gRPC server since JVM is shutting down");
    -                    healthStatusManager.clearStatus(TikaGrpcServer.class.getSimpleName());
    -                    try {
    -                        TikaGrpcServer.this.stop();
    -                    } catch (InterruptedException e) {
    -                        e.printStackTrace(System.err);
    -                    }
    -                    System.err.println("*** server shut down");
    -                }));
    +        Runtime.getRuntime().addShutdownHook(new Thread(() -> {
    +            // Use stderr here since the logger may have been reset by its JVM shutdown hook.
    +            System.err.println("*** shutting down gRPC server since JVM is shutting down");
    +            healthStatusManager.clearStatus(TikaGrpcServer.class.getSimpleName());
    +            try {
    +                TikaGrpcServer.this.stop();
    +            } catch (InterruptedException e) {
    +                e.printStackTrace(System.err);
    +            }
    +            System.err.println("*** server shut down");
    +        }));
         }
     
         public void stop() throws InterruptedException {
             if (server != null) {
    -            server
    -                    .shutdown()
    -                    .awaitTermination(30, TimeUnit.SECONDS);
    +            server.shutdown().awaitTermination(30, TimeUnit.SECONDS);
             }
         }
     
    @@ -143,10 +137,7 @@ public void blockUntilShutdown() throws InterruptedException {
          */
         public static void main(String[] args) throws Exception {
             TikaGrpcServer server = new TikaGrpcServer();
    -        JCommander commander = JCommander
    -                .newBuilder()
    -                .addObject(server)
    -                .build();
    +        JCommander commander = JCommander.newBuilder().addObject(server).build();
     
             commander.parse(args);
     
    diff --git a/tika-grpc/src/main/java/org/apache/tika/pipes/grpc/TikaGrpcServerImpl.java b/tika-grpc/src/main/java/org/apache/tika/pipes/grpc/TikaGrpcServerImpl.java
    index cbae8dba71..4628a38367 100644
    --- a/tika-grpc/src/main/java/org/apache/tika/pipes/grpc/TikaGrpcServerImpl.java
    +++ b/tika-grpc/src/main/java/org/apache/tika/pipes/grpc/TikaGrpcServerImpl.java
    @@ -1,21 +1,28 @@
     /*
    - * Licensed to the Apache Software Foundation (ASF) under one or more
    - * contributor license agreements.  See the NOTICE file distributed with
    - * this work for additional information regarding copyright ownership.
    - * The ASF licenses this file to You under the Apache License, Version 2.0
    - * (the "License"); you may not use this file except in compliance with
    - * the License.  You may obtain a copy of the License at
    + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license
    + * agreements. See the NOTICE file distributed with this work for additional information regarding
    + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the
    + * "License"); you may not use this file except in compliance with the License. You may obtain a
    + * copy of the License at
      *
    - *     http://www.apache.org/licenses/LICENSE-2.0
    + * http://www.apache.org/licenses/LICENSE-2.0
      *
    - * Unless required by applicable law or agreed to in writing, software
    - * distributed under the License is distributed on an "AS IS" BASIS,
    - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    - * See the License for the specific language governing permissions and
    - * limitations under the License.
    + * Unless required by applicable law or agreed to in writing, software distributed under the License
    + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
    + * or implied. See the License for the specific language governing permissions and limitations under
    + * the License.
      */
     package org.apache.tika.pipes.grpc;
     
    +import com.fasterxml.jackson.annotation.JsonInclude;
    +import com.fasterxml.jackson.core.JsonProcessingException;
    +import com.fasterxml.jackson.core.type.TypeReference;
    +import com.fasterxml.jackson.databind.ObjectMapper;
    +import com.fasterxml.jackson.module.jsonSchema.JsonSchema;
    +import com.fasterxml.jackson.module.jsonSchema.JsonSchemaGenerator;
    +import com.google.rpc.Status;
    +import io.grpc.protobuf.StatusProto;
    +import io.grpc.stub.StreamObserver;
     import java.io.File;
     import java.io.FileWriter;
     import java.io.IOException;
    @@ -34,24 +41,8 @@
     import javax.xml.transform.TransformerFactory;
     import javax.xml.transform.dom.DOMSource;
     import javax.xml.transform.stream.StreamResult;
    -
    -import com.fasterxml.jackson.annotation.JsonInclude;
    -import com.fasterxml.jackson.core.JsonProcessingException;
    -import com.fasterxml.jackson.core.type.TypeReference;
    -import com.fasterxml.jackson.databind.ObjectMapper;
    -import com.fasterxml.jackson.module.jsonSchema.JsonSchema;
    -import com.fasterxml.jackson.module.jsonSchema.JsonSchemaGenerator;
    -import com.google.rpc.Status;
    -import io.grpc.protobuf.StatusProto;
    -import io.grpc.stub.StreamObserver;
     import org.apache.commons.io.FileUtils;
     import org.apache.commons.lang3.StringUtils;
    -import org.slf4j.Logger;
    -import org.slf4j.LoggerFactory;
    -import org.w3c.dom.Document;
    -import org.w3c.dom.Element;
    -import org.xml.sax.SAXException;
    -
     import org.apache.tika.DeleteFetcherReply;
     import org.apache.tika.DeleteFetcherRequest;
     import org.apache.tika.FetchAndParseReply;
    @@ -81,15 +72,21 @@
     import org.apache.tika.pipes.core.fetcher.config.AbstractConfig;
     import org.apache.tika.pipes.core.fetcher.config.FetcherConfigContainer;
     import org.apache.tika.utils.XMLReaderUtils;
    +import org.slf4j.Logger;
    +import org.slf4j.LoggerFactory;
    +import org.w3c.dom.Document;
    +import org.w3c.dom.Element;
    +import org.xml.sax.SAXException;
     
     class TikaGrpcServerImpl extends TikaGrpc.TikaImplBase {
         private static final Logger LOG = LoggerFactory.getLogger(TikaGrpcServerImpl.class);
         private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
         static {
    -        //TODO with Jackson 3.0 we'll have to use MapperBuilder
    +        // TODO with Jackson 3.0 we'll have to use MapperBuilder
             OBJECT_MAPPER.setDefaultPropertyInclusion(JsonInclude.Include.NON_NULL);
         }
    -    public static final JsonSchemaGenerator JSON_SCHEMA_GENERATOR = new JsonSchemaGenerator(OBJECT_MAPPER);
    +    public static final JsonSchemaGenerator JSON_SCHEMA_GENERATOR =
    +                    new JsonSchemaGenerator(OBJECT_MAPPER);
     
         /**
          * FetcherID is key, The pair is the Fetcher object and the Metadata
    @@ -100,16 +97,19 @@ class TikaGrpcServerImpl extends TikaGrpc.TikaImplBase {
     
         String tikaConfigPath;
     
    -    TikaGrpcServerImpl(String tikaConfigPath)
    -            throws TikaConfigException, IOException, ParserConfigurationException,
    -            TransformerException, SAXException {
    +    TikaGrpcServerImpl(String tikaConfigPath) throws TikaConfigException, IOException,
    +                    ParserConfigurationException, TransformerException, SAXException {
             File tikaConfigFile = new File(tikaConfigPath);
             if (!tikaConfigFile.canWrite()) {
    -            File tmpTikaConfigFile = Files.createTempFile("configCopy", tikaConfigFile.getName()).toFile();
    +            File tmpTikaConfigFile =
    +                            Files.createTempFile("configCopy", tikaConfigFile.getName()).toFile();
                 tmpTikaConfigFile.deleteOnExit();
    -            LOG.info("Tika config file {} is read-only. Making a temporary copy to {}", tikaConfigFile, tmpTikaConfigFile);
    -            String tikaConfigFileContents = FileUtils.readFileToString(tikaConfigFile, StandardCharsets.UTF_8);
    -            FileUtils.writeStringToFile(tmpTikaConfigFile, tikaConfigFileContents, StandardCharsets.UTF_8);
    +            LOG.info("Tika config file {} is read-only. Making a temporary copy to {}",
    +                            tikaConfigFile, tmpTikaConfigFile);
    +            String tikaConfigFileContents =
    +                            FileUtils.readFileToString(tikaConfigFile, StandardCharsets.UTF_8);
    +            FileUtils.writeStringToFile(tmpTikaConfigFile, tikaConfigFileContents,
    +                            StandardCharsets.UTF_8);
                 tikaConfigFile = tmpTikaConfigFile;
                 tikaConfigPath = tikaConfigFile.getAbsolutePath();
             }
    @@ -117,7 +117,7 @@ class TikaGrpcServerImpl extends TikaGrpc.TikaImplBase {
             pipesClient = new PipesClient(pipesConfig);
     
             expiringFetcherStore = new ExpiringFetcherStore(pipesConfig.getStaleFetcherTimeoutSeconds(),
    -                pipesConfig.getStaleFetcherDelaySeconds());
    +                        pipesConfig.getStaleFetcherDelaySeconds());
             this.tikaConfigPath = tikaConfigPath;
             try {
                 updateTikaConfig();
    @@ -126,9 +126,10 @@ class TikaGrpcServerImpl extends TikaGrpc.TikaImplBase {
             }
         }
     
    -    private void updateTikaConfig() throws ParserConfigurationException, IOException, SAXException, TransformerException, TikaException {
    -        Document tikaConfigDoc =
    -                DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(tikaConfigPath);
    +    private void updateTikaConfig() throws ParserConfigurationException, IOException, SAXException,
    +                    TransformerException, TikaException {
    +        Document tikaConfigDoc = DocumentBuilderFactory.newInstance().newDocumentBuilder()
    +                        .parse(tikaConfigPath);
     
             Element fetchersElement = (Element) tikaConfigDoc.getElementsByTagName("fetchers").item(0);
             if (fetchersElement == null) {
    @@ -141,9 +142,8 @@ private void updateTikaConfig() throws ParserConfigurationException, IOException
             for (var fetcherEntry : expiringFetcherStore.getFetchers().entrySet()) {
                 AbstractFetcher fetcherObject = fetcherEntry.getValue();
                  Map<String, Object> fetcherConfigParams = OBJECT_MAPPER.convertValue(
    -                    expiringFetcherStore.getFetcherConfigs().get(fetcherEntry.getKey()),
    -                    new TypeReference<>() {
    -                    });
    +                            expiringFetcherStore.getFetcherConfigs().get(fetcherEntry.getKey()),
    +                            new TypeReference<>() {});
                 Element fetcher = tikaConfigDoc.createElement("fetcher");
                 fetcher.setAttribute("class", fetcherEntry.getValue().getClass().getName());
                 Element fetcherName = tikaConfigDoc.createElement("name");
    @@ -162,13 +162,14 @@ private void updateTikaConfig() throws ParserConfigurationException, IOException
         }
     
          private void populateFetcherConfigs(Map<String, Object> fetcherConfigParams,
    -                                        Document tikaConfigDoc, Element fetcher) {
    +                    Document tikaConfigDoc, Element fetcher) {
             for (var configParam : fetcherConfigParams.entrySet()) {
                 Element configElm = tikaConfigDoc.createElement(configParam.getKey());
                 fetcher.appendChild(configElm);
                 if (configParam.getValue() instanceof List) {
                     List configParamVal = (List) configParam.getValue();
    -                String singularName = configParam.getKey().substring(0, configParam.getKey().length() - 1);
    +                String singularName = configParam.getKey().substring(0,
    +                                configParam.getKey().length() - 1);
                     for (Object configParamObj : configParamVal) {
                         Element childElement = tikaConfigDoc.createElement(singularName);
                         childElement.setTextContent(Objects.toString(configParamObj));
    @@ -182,13 +183,13 @@ private void populateFetcherConfigs(Map fetcherConfigParams,
     
         @Override
         public void fetchAndParseServerSideStreaming(FetchAndParseRequest request,
     -                                                 StreamObserver<FetchAndParseReply> responseObserver) {
     +                    StreamObserver<FetchAndParseReply> responseObserver) {
             fetchAndParseImpl(request, responseObserver);
         }
     
         @Override
          public StreamObserver<FetchAndParseRequest> fetchAndParseBiDirectionalStreaming(
     -            StreamObserver<FetchAndParseReply> responseObserver) {
     +                    StreamObserver<FetchAndParseReply> responseObserver) {
             return new StreamObserver<>() {
                 @Override
                 public void onNext(FetchAndParseRequest fetchAndParseRequest) {
    @@ -209,19 +210,19 @@ public void onCompleted() {
     
         @Override
         public void fetchAndParse(FetchAndParseRequest request,
     -                              StreamObserver<FetchAndParseReply> responseObserver) {
     +                    StreamObserver<FetchAndParseReply> responseObserver) {
             fetchAndParseImpl(request, responseObserver);
             responseObserver.onCompleted();
         }
     
     
         private void fetchAndParseImpl(FetchAndParseRequest request,
     -                                   StreamObserver<FetchAndParseReply> responseObserver) {
     +                    StreamObserver<FetchAndParseReply> responseObserver) {
             AbstractFetcher fetcher =
    -                expiringFetcherStore.getFetcherAndLogAccess(request.getFetcherId());
    +                        expiringFetcherStore.getFetcherAndLogAccess(request.getFetcherId());
             if (fetcher == null) {
                 throw new RuntimeException(
    -                    "Could not find fetcher with name " + request.getFetcherId());
    +                            "Could not find fetcher with name " + request.getFetcherId());
             }
             Metadata tikaMetadata = new Metadata();
             try {
    @@ -229,24 +230,25 @@ private void fetchAndParseImpl(FetchAndParseRequest request,
                 String additionalFetchConfigJson = request.getAdditionalFetchConfigJson();
                 if (StringUtils.isNotBlank(additionalFetchConfigJson)) {
                     // The fetch and parse has the option to specify additional configuration
    -                AbstractConfig abstractConfig = expiringFetcherStore
    -                        .getFetcherConfigs()
    -                        .get(fetcher.getName());
    -                parseContext.set(FetcherConfigContainer.class, new FetcherConfigContainer()
    -                        .setConfigClassName(abstractConfig
    -                                .getClass().getName())
    -                        .setJson(additionalFetchConfigJson));
    +                AbstractConfig abstractConfig =
    +                                expiringFetcherStore.getFetcherConfigs().get(fetcher.getName());
    +                parseContext.set(FetcherConfigContainer.class,
    +                                new FetcherConfigContainer()
    +                                                .setConfigClassName(
    +                                                                abstractConfig.getClass().getName())
    +                                                .setJson(additionalFetchConfigJson));
                 }
                 PipesResult pipesResult = pipesClient.process(new FetchEmitTuple(request.getFetchKey(),
    -                    new FetchKey(fetcher.getName(), request.getFetchKey()), new EmitKey(), tikaMetadata, parseContext, FetchEmitTuple.ON_PARSE_EXCEPTION.SKIP));
    +                            new FetchKey(fetcher.getName(), request.getFetchKey()), new EmitKey(),
    +                            tikaMetadata, parseContext, FetchEmitTuple.ON_PARSE_EXCEPTION.SKIP));
                 FetchAndParseReply.Builder fetchReplyBuilder =
    -                    FetchAndParseReply.newBuilder()
    -                                      .setFetchKey(request.getFetchKey())
    -                            .setStatus(pipesResult.getStatus().name());
    +                            FetchAndParseReply.newBuilder().setFetchKey(request.getFetchKey())
    +                                            .setStatus(pipesResult.getStatus().name());
                 if (pipesResult.getStatus().equals(PipesResult.STATUS.FETCH_EXCEPTION)) {
                     fetchReplyBuilder.setErrorMessage(pipesResult.getMessage());
                 }
    -            if (pipesResult.getEmitData() != null && pipesResult.getEmitData().getMetadataList() != null) {
    +            if (pipesResult.getEmitData() != null
    +                            && pipesResult.getEmitData().getMetadataList() != null) {
                     for (Metadata metadata : pipesResult.getEmitData().getMetadataList()) {
                         for (String name : metadata.names()) {
                             String value = metadata.get(name);
    @@ -267,13 +269,15 @@ private void fetchAndParseImpl(FetchAndParseRequest request,
         @SuppressWarnings("raw")
         @Override
         public void saveFetcher(SaveFetcherRequest request,
     -                            StreamObserver<SaveFetcherReply> responseObserver) {
     +                    StreamObserver<SaveFetcherReply> responseObserver) {
             SaveFetcherReply reply =
    -                SaveFetcherReply.newBuilder().setFetcherId(request.getFetcherId()).build();
    +                        SaveFetcherReply.newBuilder().setFetcherId(request.getFetcherId()).build();
             try {
     -            Map<String, Object> fetcherConfigMap = OBJECT_MAPPER.readValue(request.getFetcherConfigJson(), new TypeReference<>() {});
     +            Map<String, Object> fetcherConfigMap = OBJECT_MAPPER
     +                            .readValue(request.getFetcherConfigJson(), new TypeReference<>() {});
                 Map tikaParamsMap = createTikaParamMap(fetcherConfigMap);
    -            saveFetcher(request.getFetcherId(), request.getFetcherClass(), fetcherConfigMap, tikaParamsMap);
    +            saveFetcher(request.getFetcherId(), request.getFetcherClass(), fetcherConfigMap,
    +                            tikaParamsMap);
                 updateTikaConfig();
             } catch (Exception e) {
                 throw new RuntimeException(e);
    @@ -282,21 +286,21 @@ public void saveFetcher(SaveFetcherRequest request,
             responseObserver.onCompleted();
         }
     
    -    private void saveFetcher(String name, String fetcherClassName, Map paramsMap, Map tikaParamsMap) {
    +    private void saveFetcher(String name, String fetcherClassName, Map paramsMap,
    +                    Map tikaParamsMap) {
             try {
                 if (paramsMap == null) {
                     paramsMap = new LinkedHashMap<>();
                 }
                  Class<? extends AbstractFetcher> fetcherClass =
     -                    (Class<? extends AbstractFetcher>) Class.forName(fetcherClassName);
     -            String configClassName =
     -                    fetcherClass.getPackageName() + ".config." + fetcherClass.getSimpleName() +
     -                            "Config";
     +                            (Class<? extends AbstractFetcher>) Class.forName(fetcherClassName);
     +            String configClassName = fetcherClass.getPackageName() + ".config."
     +                            + fetcherClass.getSimpleName() + "Config";
                  Class<? extends AbstractConfig> configClass =
     -                    (Class<? extends AbstractConfig>) Class.forName(configClassName);
     +                            (Class<? extends AbstractConfig>) Class.forName(configClassName);
                 AbstractConfig configObject = OBJECT_MAPPER.convertValue(paramsMap, configClass);
    -            AbstractFetcher abstractFetcher =
    -                    fetcherClass.getDeclaredConstructor(configClass).newInstance(configObject);
    +            AbstractFetcher abstractFetcher = fetcherClass.getDeclaredConstructor(configClass)
    +                            .newInstance(configObject);
                 abstractFetcher.setName(name);
                 if (Initializable.class.isAssignableFrom(fetcherClass)) {
                     Initializable initializable = (Initializable) abstractFetcher;
    @@ -308,8 +312,9 @@ private void saveFetcher(String name, String fetcherClassName, Map createTikaParamMap(Map fetcher
         }
     
         static Status notFoundStatus(String fetcherId) {
    -        return Status.newBuilder()
    -                .setCode(io.grpc.Status.Code.NOT_FOUND.value())
    -                .setMessage("Could not find fetcher with id:" + fetcherId)
    -                .build();
    +        return Status.newBuilder().setCode(io.grpc.Status.Code.NOT_FOUND.value())
    +                        .setMessage("Could not find fetcher with id:" + fetcherId).build();
         }
     
         @Override
         public void getFetcher(GetFetcherRequest request,
     -                           StreamObserver<GetFetcherReply> responseObserver) {
     +                    StreamObserver<GetFetcherReply> responseObserver) {
             GetFetcherReply.Builder getFetcherReply = GetFetcherReply.newBuilder();
             AbstractConfig abstractConfig =
    -                expiringFetcherStore.getFetcherConfigs().get(request.getFetcherId());
    -        AbstractFetcher abstractFetcher = expiringFetcherStore.getFetchers().get(request.getFetcherId());
    +                        expiringFetcherStore.getFetcherConfigs().get(request.getFetcherId());
    +        AbstractFetcher abstractFetcher =
    +                        expiringFetcherStore.getFetchers().get(request.getFetcherId());
             if (abstractFetcher == null || abstractConfig == null) {
    -            responseObserver.onError(StatusProto.toStatusException(notFoundStatus(request.getFetcherId())));
    +            responseObserver.onError(
    +                            StatusProto.toStatusException(notFoundStatus(request.getFetcherId())));
                 return;
             }
             getFetcherReply.setFetcherId(request.getFetcherId());
             getFetcherReply.setFetcherClass(abstractFetcher.getClass().getName());
     -        Map<String, Object> paramMap = OBJECT_MAPPER.convertValue(abstractConfig, new TypeReference<>() {});
     -        paramMap.forEach(
     -                (k, v) -> getFetcherReply.putParams(Objects.toString(k), Objects.toString(v)));
     +        Map<String, Object> paramMap =
     +                        OBJECT_MAPPER.convertValue(abstractConfig, new TypeReference<>() {});
     +        paramMap.forEach((k, v) -> getFetcherReply.putParams(Objects.toString(k),
     +                        Objects.toString(v)));
             responseObserver.onNext(getFetcherReply.build());
             responseObserver.onCompleted();
         }
     
         @Override
         public void listFetchers(ListFetchersRequest request,
     -                             StreamObserver<ListFetchersReply> responseObserver) {
     +                    StreamObserver<ListFetchersReply> responseObserver) {
             ListFetchersReply.Builder listFetchersReplyBuilder = ListFetchersReply.newBuilder();
     -        for (Map.Entry<String, AbstractConfig> fetcherConfig : expiringFetcherStore.getFetcherConfigs()
     -                .entrySet()) {
     +        for (Map.Entry<String, AbstractConfig> fetcherConfig : expiringFetcherStore
     +                        .getFetcherConfigs().entrySet()) {
                 GetFetcherReply.Builder replyBuilder = saveFetcherReply(fetcherConfig);
                 listFetchersReplyBuilder.addGetFetcherReplies(replyBuilder.build());
             }
    @@ -365,32 +371,31 @@ public void listFetchers(ListFetchersRequest request,
         }
     
         private GetFetcherReply.Builder saveFetcherReply(
     -            Map.Entry<String, AbstractConfig> fetcherConfig) {
     +                    Map.Entry<String, AbstractConfig> fetcherConfig) {
             AbstractFetcher abstractFetcher =
    -                expiringFetcherStore.getFetchers().get(fetcherConfig.getKey());
    +                        expiringFetcherStore.getFetchers().get(fetcherConfig.getKey());
             AbstractConfig abstractConfig =
    -                expiringFetcherStore.getFetcherConfigs().get(fetcherConfig.getKey());
    -        GetFetcherReply.Builder replyBuilder =
    -                GetFetcherReply.newBuilder().setFetcherClass(abstractFetcher.getClass().getName())
    +                        expiringFetcherStore.getFetcherConfigs().get(fetcherConfig.getKey());
    +        GetFetcherReply.Builder replyBuilder = GetFetcherReply.newBuilder()
    +                        .setFetcherClass(abstractFetcher.getClass().getName())
                             .setFetcherId(abstractFetcher.getName());
             loadParamsIntoReply(abstractConfig, replyBuilder);
             return replyBuilder;
         }
     
         private static void loadParamsIntoReply(AbstractConfig abstractConfig,
    -                                            GetFetcherReply.Builder replyBuilder) {
    +                    GetFetcherReply.Builder replyBuilder) {
              Map<String, Object> paramMap =
     -                OBJECT_MAPPER.convertValue(abstractConfig, new TypeReference<>() {
     -                });
     +                        OBJECT_MAPPER.convertValue(abstractConfig, new TypeReference<>() {});
             if (paramMap != null) {
    -            paramMap.forEach(
    -                    (k, v) -> replyBuilder.putParams(Objects.toString(k), Objects.toString(v)));
    +            paramMap.forEach((k, v) -> replyBuilder.putParams(Objects.toString(k),
    +                            Objects.toString(v)));
             }
         }
     
         @Override
         public void deleteFetcher(DeleteFetcherRequest request,
     -                              StreamObserver<DeleteFetcherReply> responseObserver) {
     +                    StreamObserver<DeleteFetcherReply> responseObserver) {
             boolean successfulDelete = deleteFetcher(request.getFetcherId());
             if (successfulDelete) {
                 try {
    @@ -399,18 +404,24 @@ public void deleteFetcher(DeleteFetcherRequest request,
                     throw new RuntimeException(e);
                 }
             }
    -        responseObserver.onNext(DeleteFetcherReply.newBuilder().setSuccess(successfulDelete).build());
    +        responseObserver.onNext(
    +                        DeleteFetcherReply.newBuilder().setSuccess(successfulDelete).build());
             responseObserver.onCompleted();
         }
     
         @Override
     -    public void getFetcherConfigJsonSchema(GetFetcherConfigJsonSchemaRequest request, StreamObserver<GetFetcherConfigJsonSchemaReply> responseObserver) {
    -        GetFetcherConfigJsonSchemaReply.Builder builder = GetFetcherConfigJsonSchemaReply.newBuilder();
    +    public void getFetcherConfigJsonSchema(GetFetcherConfigJsonSchemaRequest request,
     +                    StreamObserver<GetFetcherConfigJsonSchemaReply> responseObserver) {
    +        GetFetcherConfigJsonSchemaReply.Builder builder =
    +                        GetFetcherConfigJsonSchemaReply.newBuilder();
             try {
    -            JsonSchema jsonSchema = JSON_SCHEMA_GENERATOR.generateSchema(Class.forName(request.getFetcherClass()));
    -            builder.setFetcherConfigJsonSchema(OBJECT_MAPPER.writerWithDefaultPrettyPrinter().writeValueAsString(jsonSchema));
    +            JsonSchema jsonSchema = JSON_SCHEMA_GENERATOR
    +                            .generateSchema(Class.forName(request.getFetcherClass()));
    +            builder.setFetcherConfigJsonSchema(OBJECT_MAPPER.writerWithDefaultPrettyPrinter()
    +                            .writeValueAsString(jsonSchema));
             } catch (ClassNotFoundException | JsonProcessingException e) {
    -            throw new RuntimeException("Could not create json schema for " + request.getFetcherClass(), e);
    +            throw new RuntimeException(
    +                            "Could not create json schema for " + request.getFetcherClass(), e);
             }
             responseObserver.onNext(builder.build());
             responseObserver.onCompleted();
    diff --git a/tika-grpc/src/test/java/org/apache/tika/pipes/grpc/ExpiringFetcherStoreTest.java b/tika-grpc/src/test/java/org/apache/tika/pipes/grpc/ExpiringFetcherStoreTest.java
    index 91a8b8a19a..f39af906bc 100644
    --- a/tika-grpc/src/test/java/org/apache/tika/pipes/grpc/ExpiringFetcherStoreTest.java
    +++ b/tika-grpc/src/test/java/org/apache/tika/pipes/grpc/ExpiringFetcherStoreTest.java
    @@ -1,18 +1,16 @@
     /*
    - * Licensed to the Apache Software Foundation (ASF) under one or more
    - * contributor license agreements.  See the NOTICE file distributed with
    - * this work for additional information regarding copyright ownership.
    - * The ASF licenses this file to You under the Apache License, Version 2.0
    - * (the "License"); you may not use this file except in compliance with
    - * the License.  You may obtain a copy of the License at
    + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license
    + * agreements. See the NOTICE file distributed with this work for additional information regarding
    + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the
    + * "License"); you may not use this file except in compliance with the License. You may obtain a
    + * copy of the License at
      *
    - *     http://www.apache.org/licenses/LICENSE-2.0
    + * http://www.apache.org/licenses/LICENSE-2.0
      *
    - * Unless required by applicable law or agreed to in writing, software
    - * distributed under the License is distributed on an "AS IS" BASIS,
    - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    - * See the License for the specific language governing permissions and
    - * limitations under the License.
    + * Unless required by applicable law or agreed to in writing, software distributed under the License
    + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
    + * or implied. See the License for the specific language governing permissions and limitations under
    + * the License.
      */
     package org.apache.tika.pipes.grpc;
     
    @@ -20,15 +18,13 @@
     
     import java.io.InputStream;
     import java.time.Duration;
    -
    -import org.awaitility.Awaitility;
    -import org.junit.jupiter.api.Assertions;
    -import org.junit.jupiter.api.Test;
    -
     import org.apache.tika.metadata.Metadata;
     import org.apache.tika.parser.ParseContext;
     import org.apache.tika.pipes.core.fetcher.AbstractFetcher;
     import org.apache.tika.pipes.core.fetcher.config.AbstractConfig;
    +import org.awaitility.Awaitility;
    +import org.junit.jupiter.api.Assertions;
    +import org.junit.jupiter.api.Test;
     
     class ExpiringFetcherStoreTest {
     
    @@ -37,29 +33,21 @@ void createFetcher() {
             try (ExpiringFetcherStore expiringFetcherStore = new ExpiringFetcherStore(1, 5)) {
                 AbstractFetcher fetcher = new AbstractFetcher() {
                     @Override
    -                public InputStream fetch(String fetchKey, Metadata metadata, ParseContext parseContext) {
    +                public InputStream fetch(String fetchKey, Metadata metadata,
    +                                ParseContext parseContext) {
                         return null;
                     }
                 };
                 fetcher.setName("nick");
    -            AbstractConfig config = new AbstractConfig() {
    -            };
    +            AbstractConfig config = new AbstractConfig() {};
                 expiringFetcherStore.createFetcher(fetcher, config);
     
    -            Assertions.assertNotNull(expiringFetcherStore
    -                    .getFetchers()
    -                    .get(fetcher.getName()));
    +            Assertions.assertNotNull(expiringFetcherStore.getFetchers().get(fetcher.getName()));
     
    -            Awaitility
    -                    .await()
    -                    .atMost(Duration.ofSeconds(60))
    -                    .until(() -> expiringFetcherStore
    -                            .getFetchers()
    -                            .get(fetcher.getName()) == null);
    +            Awaitility.await().atMost(Duration.ofSeconds(60)).until(() -> expiringFetcherStore
    +                            .getFetchers().get(fetcher.getName()) == null);
     
    -            assertNull(expiringFetcherStore
    -                    .getFetcherConfigs()
    -                    .get(fetcher.getName()));
    +            assertNull(expiringFetcherStore.getFetcherConfigs().get(fetcher.getName()));
             }
         }
     }
    diff --git a/tika-grpc/src/test/java/org/apache/tika/pipes/grpc/PipesBiDirectionalStreamingIntegrationTest.java b/tika-grpc/src/test/java/org/apache/tika/pipes/grpc/PipesBiDirectionalStreamingIntegrationTest.java
    index c540e61001..155c867454 100644
    --- a/tika-grpc/src/test/java/org/apache/tika/pipes/grpc/PipesBiDirectionalStreamingIntegrationTest.java
    +++ b/tika-grpc/src/test/java/org/apache/tika/pipes/grpc/PipesBiDirectionalStreamingIntegrationTest.java
    @@ -1,21 +1,26 @@
     /*
    - * Licensed to the Apache Software Foundation (ASF) under one or more
    - * contributor license agreements.  See the NOTICE file distributed with
    - * this work for additional information regarding copyright ownership.
    - * The ASF licenses this file to You under the Apache License, Version 2.0
    - * (the "License"); you may not use this file except in compliance with
    - * the License.  You may obtain a copy of the License at
    + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license
    + * agreements. See the NOTICE file distributed with this work for additional information regarding
    + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the
    + * "License"); you may not use this file except in compliance with the License. You may obtain a
    + * copy of the License at
      *
    - *     http://www.apache.org/licenses/LICENSE-2.0
    + * http://www.apache.org/licenses/LICENSE-2.0
      *
    - * Unless required by applicable law or agreed to in writing, software
    - * distributed under the License is distributed on an "AS IS" BASIS,
    - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    - * See the License for the specific language governing permissions and
    - * limitations under the License.
    + * Unless required by applicable law or agreed to in writing, software distributed under the License
    + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
    + * or implied. See the License for the specific language governing permissions and limitations under
    + * the License.
      */
     package org.apache.tika.pipes.grpc;
     
    +import com.fasterxml.jackson.databind.ObjectMapper;
    +import com.google.common.collect.ImmutableMap;
    +import io.grpc.Grpc;
    +import io.grpc.ManagedChannel;
    +import io.grpc.TlsChannelCredentials;
    +import io.grpc.netty.shaded.io.netty.handler.ssl.util.InsecureTrustManagerFactory;
    +import io.grpc.stub.StreamObserver;
     import java.io.File;
     import java.io.IOException;
     import java.net.InetAddress;
    @@ -29,15 +34,13 @@
     import java.util.Map;
     import java.util.UUID;
     import java.util.concurrent.atomic.AtomicInteger;
    -
    -import com.fasterxml.jackson.databind.ObjectMapper;
    -import com.google.common.collect.ImmutableMap;
    -import io.grpc.Grpc;
    -import io.grpc.ManagedChannel;
    -import io.grpc.TlsChannelCredentials;
    -import io.grpc.netty.shaded.io.netty.handler.ssl.util.InsecureTrustManagerFactory;
    -import io.grpc.stub.StreamObserver;
     import org.apache.commons.io.FileUtils;
    +import org.apache.tika.FetchAndParseReply;
    +import org.apache.tika.FetchAndParseRequest;
    +import org.apache.tika.SaveFetcherReply;
    +import org.apache.tika.SaveFetcherRequest;
    +import org.apache.tika.TikaGrpc;
    +import org.apache.tika.pipes.fetcher.http.HttpFetcher;
     import org.awaitility.Awaitility;
     import org.eclipse.jetty.server.Server;
     import org.eclipse.jetty.server.handler.ResourceHandler;
    @@ -50,25 +53,17 @@
     import org.slf4j.Logger;
     import org.slf4j.LoggerFactory;
     
    -import org.apache.tika.FetchAndParseReply;
    -import org.apache.tika.FetchAndParseRequest;
    -import org.apache.tika.SaveFetcherReply;
    -import org.apache.tika.SaveFetcherRequest;
    -import org.apache.tika.TikaGrpc;
    -import org.apache.tika.pipes.fetcher.http.HttpFetcher;
    -
     /**
    - * This test will start an HTTP server using jetty.
    - * Then it will start Tika Pipes Grpc service.
    - * Then it will, using a bidirectional stream of data, send urls to the
    - * HTTP fetcher whilst simultaneously receiving parsed output as they parse.
    + * This test will start an HTTP server using jetty. Then it will start Tika Pipes Grpc service. Then
    + * it will, using a bidirectional stream of data, send urls to the HTTP fetcher whilst
    + * simultaneously receiving parsed output as they parse.
      */
     class PipesBiDirectionalStreamingIntegrationTest {
    -    static final Logger LOGGER = LoggerFactory.getLogger(PipesBiDirectionalStreamingIntegrationTest.class);
    +    static final Logger LOGGER =
    +                    LoggerFactory.getLogger(PipesBiDirectionalStreamingIntegrationTest.class);
         private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
    -    static File tikaConfigXmlTemplate = Paths
    -            .get("src", "test", "resources", "tika-pipes-test-config.xml")
    -            .toFile();
    +    static File tikaConfigXmlTemplate =
    +                    Paths.get("src", "test", "resources", "tika-pipes-test-config.xml").toFile();
         static File tikaConfigXml = new File("target", "tika-config-" + UUID.randomUUID() + ".xml");
         static TikaGrpcServer grpcServer;
         static int grpcPort;
    @@ -95,14 +90,14 @@ static void setUpHttpServer() throws Exception {
             ResourceHandler resourceHandler = new ResourceHandler();
             resourceHandler.setDirAllowed(true);
             // TODO when using jetty 12:
    -        // resourceHandler.setBaseResourceAsString("src/test/resources/test-files")        
    -        resourceHandler.setBaseResource(new PathResource(Paths.get("src", "test", "resources", "test-files")));
    +        // resourceHandler.setBaseResourceAsString("src/test/resources/test-files")
    +        resourceHandler.setBaseResource(
    +                        new PathResource(Paths.get("src", "test", "resources", "test-files")));
             httpServer.setHandler(resourceHandler);
             httpServer.start();
     
    -        httpServerUrl = "http://" + InetAddress
    -                .getByName("localhost")
    -                .getHostAddress() + ":" + httpServerPort;
    +        httpServerUrl = "http://" + InetAddress.getByName("localhost").getHostAddress() + ":"
    +                        + httpServerPort;
         }
     
         @BeforeAll
    @@ -114,25 +109,26 @@ static void setUpGrpcServer() throws Exception {
             grpcServer.setTikaConfigXml(tikaConfigXml);
             grpcServer.setPort(grpcPort);
             grpcServer.setSecure(true);
    -        grpcServer.setCertChain(Paths.get("src", "test", "resources", "certs", "server1.pem").toFile());
    -        grpcServer.setPrivateKey(Paths.get("src", "test", "resources", "certs", "server1.key").toFile());
    -        grpcServer.setTrustCertCollection(Paths.get("src", "test", "resources", "certs", "ca.pem").toFile());
    +        grpcServer.setCertChain(
    +                        Paths.get("src", "test", "resources", "certs", "server1.pem").toFile());
    +        grpcServer.setPrivateKey(
    +                        Paths.get("src", "test", "resources", "certs", "server1.key").toFile());
    +        grpcServer.setTrustCertCollection(
    +                        Paths.get("src", "test", "resources", "certs", "ca.pem").toFile());
             grpcServer.setClientAuthRequired(true);
             grpcServer.start();
     
    -        String target = InetAddress
    -                .getByName("localhost")
    -                .getHostAddress() + ":" + grpcPort;
    +        String target = InetAddress.getByName("localhost").getHostAddress() + ":" + grpcPort;
     
             TlsChannelCredentials.Builder channelCredBuilder = TlsChannelCredentials.newBuilder();
    -        File clientCertChain = Paths.get("src", "test", "resources", "certs", "client.pem").toFile();
    -        File clientPrivateKey = Paths.get("src", "test", "resources", "certs", "client.key").toFile();
    +        File clientCertChain =
    +                        Paths.get("src", "test", "resources", "certs", "client.pem").toFile();
    +        File clientPrivateKey =
    +                        Paths.get("src", "test", "resources", "certs", "client.key").toFile();
             channelCredBuilder.keyManager(clientCertChain, clientPrivateKey);
             channelCredBuilder.trustManager(InsecureTrustManagerFactory.INSTANCE.getTrustManagers());
     
    -        ManagedChannel channel = Grpc
    -                .newChannelBuilder(target, channelCredBuilder.build())
    -                .build();
    +        ManagedChannel channel = Grpc.newChannelBuilder(target, channelCredBuilder.build()).build();
     
             tikaBlockingStub = TikaGrpc.newBlockingStub(channel);
             tikaStub = TikaGrpc.newStub(channel);
    @@ -159,21 +155,15 @@ static void cleanConfig() throws Exception {
     
         @BeforeEach
         void createHttpFetcher() throws Exception {
    -        SaveFetcherRequest saveFetcherRequest = SaveFetcherRequest
    -                .newBuilder()
    -                .setFetcherId(httpFetcherId)
    -                .setFetcherClass(HttpFetcher.class.getName())
    -                .setFetcherConfigJson(OBJECT_MAPPER.writeValueAsString(ImmutableMap
    -                        .builder()
    -                        .put("requestTimeout", 30_000)
    -                        .put("socketTimeout", 30_000)
    -                        .put("connectTimeout", 20_000)
    -                        .put("maxConnectionsPerRoute", 200)
    -                        .put("maxRedirects", 0)
    -                        .put("maxSpoolSize", -1)
    -                        .put("overallTimeout", 50_000)
    -                        .build()))
    -                .build();
    +        SaveFetcherRequest saveFetcherRequest = SaveFetcherRequest.newBuilder()
    +                        .setFetcherId(httpFetcherId).setFetcherClass(HttpFetcher.class.getName())
    +                        .setFetcherConfigJson(OBJECT_MAPPER.writeValueAsString(ImmutableMap
    +                                        .builder().put("requestTimeout", 30_000)
    +                                        .put("socketTimeout", 30_000).put("connectTimeout", 20_000)
    +                                        .put("maxConnectionsPerRoute", 200).put("maxRedirects", 0)
    +                                        .put("maxSpoolSize", -1).put("overallTimeout", 50_000)
    +                                        .build()))
    +                        .build();
             SaveFetcherReply saveFetcherReply = tikaBlockingStub.saveFetcher(saveFetcherRequest);
             Assertions.assertEquals(saveFetcherReply.getFetcherId(), httpFetcherId);
         }
    @@ -200,17 +190,16 @@ public void onCompleted() {
                     LOGGER.info("Completed fetching.");
                 }
             };
    -        StreamObserver request = tikaStub.fetchAndParseBiDirectionalStreaming(responseObserver);
    +        StreamObserver request =
    +                        tikaStub.fetchAndParseBiDirectionalStreaming(responseObserver);
             for (String file : files) {
    -            request.onNext(FetchAndParseRequest
    -                    .newBuilder()
    -                    .setFetcherId(httpFetcherId)
    -                    .setFetchKey(httpServerUrl + "/" + file)
    -                    .build());
    +            request.onNext(FetchAndParseRequest.newBuilder().setFetcherId(httpFetcherId)
    +                            .setFetchKey(httpServerUrl + "/" + file).build());
             }
             request.onCompleted();
     
    -        Awaitility.await().atMost(Duration.ofSeconds(600)).until(() -> result.size() == files.size());
    +        Awaitility.await().atMost(Duration.ofSeconds(600))
    +                        .until(() -> result.size() == files.size());
     
             Assertions.assertEquals(files.size(), numParsed.get());
         }
    diff --git a/tika-grpc/src/test/java/org/apache/tika/pipes/grpc/TikaGrpcServerTest.java b/tika-grpc/src/test/java/org/apache/tika/pipes/grpc/TikaGrpcServerTest.java
    index 78c5b10ea8..22316fbcbb 100644
    --- a/tika-grpc/src/test/java/org/apache/tika/pipes/grpc/TikaGrpcServerTest.java
    +++ b/tika-grpc/src/test/java/org/apache/tika/pipes/grpc/TikaGrpcServerTest.java
    @@ -1,18 +1,16 @@
     /*
    - * Licensed to the Apache Software Foundation (ASF) under one or more
    - * contributor license agreements.  See the NOTICE file distributed with
    - * this work for additional information regarding copyright ownership.
    - * The ASF licenses this file to You under the Apache License, Version 2.0
    - * (the "License"); you may not use this file except in compliance with
    - * the License.  You may obtain a copy of the License at
    + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license
    + * agreements. See the NOTICE file distributed with this work for additional information regarding
    + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the
    + * "License"); you may not use this file except in compliance with the License. You may obtain a
    + * copy of the License at
      *
    - *     http://www.apache.org/licenses/LICENSE-2.0
    + * http://www.apache.org/licenses/LICENSE-2.0
      *
    - * Unless required by applicable law or agreed to in writing, software
    - * distributed under the License is distributed on an "AS IS" BASIS,
    - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    - * See the License for the specific language governing permissions and
    - * limitations under the License.
    + * Unless required by applicable law or agreed to in writing, software distributed under the License
    + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
    + * or implied. See the License for the specific language governing permissions and limitations under
    + * the License.
      */
     package org.apache.tika.pipes.grpc;
     
    @@ -21,6 +19,17 @@
     import static org.junit.jupiter.api.Assertions.assertTrue;
     import static org.junit.jupiter.api.Assertions.fail;
     
    +import com.asarkar.grpc.test.GrpcCleanupExtension;
    +import com.asarkar.grpc.test.Resources;
    +import com.fasterxml.jackson.databind.ObjectMapper;
    +import com.google.common.collect.ImmutableMap;
    +import io.grpc.ManagedChannel;
    +import io.grpc.Server;
    +import io.grpc.Status;
    +import io.grpc.StatusRuntimeException;
    +import io.grpc.inprocess.InProcessChannelBuilder;
    +import io.grpc.inprocess.InProcessServerBuilder;
    +import io.grpc.stub.StreamObserver;
     import java.io.File;
     import java.nio.charset.StandardCharsets;
     import java.nio.file.Paths;
    @@ -34,28 +43,7 @@
     import java.util.Locale;
     import java.util.UUID;
     import java.util.concurrent.atomic.AtomicBoolean;
    -
    -import com.asarkar.grpc.test.GrpcCleanupExtension;
    -import com.asarkar.grpc.test.Resources;
    -import com.fasterxml.jackson.databind.ObjectMapper;
    -import com.google.common.collect.ImmutableMap;
    -import io.grpc.ManagedChannel;
    -import io.grpc.Server;
    -import io.grpc.Status;
    -import io.grpc.StatusRuntimeException;
    -import io.grpc.inprocess.InProcessChannelBuilder;
    -import io.grpc.inprocess.InProcessServerBuilder;
    -import io.grpc.stub.StreamObserver;
     import org.apache.commons.io.FileUtils;
    -import org.jetbrains.annotations.NotNull;
    -import org.junit.jupiter.api.AfterAll;
    -import org.junit.jupiter.api.Assertions;
    -import org.junit.jupiter.api.BeforeAll;
    -import org.junit.jupiter.api.Test;
    -import org.junit.jupiter.api.extension.ExtendWith;
    -import org.slf4j.Logger;
    -import org.slf4j.LoggerFactory;
    -
     import org.apache.tika.DeleteFetcherReply;
     import org.apache.tika.DeleteFetcherRequest;
     import org.apache.tika.FetchAndParseReply;
    @@ -67,15 +55,22 @@
     import org.apache.tika.TikaGrpc;
     import org.apache.tika.pipes.core.PipesResult;
     import org.apache.tika.pipes.fetcher.fs.FileSystemFetcher;
    +import org.jetbrains.annotations.NotNull;
    +import org.junit.jupiter.api.AfterAll;
    +import org.junit.jupiter.api.Assertions;
    +import org.junit.jupiter.api.BeforeAll;
    +import org.junit.jupiter.api.Test;
    +import org.junit.jupiter.api.extension.ExtendWith;
    +import org.slf4j.Logger;
    +import org.slf4j.LoggerFactory;
     
     @ExtendWith(GrpcCleanupExtension.class)
     public class TikaGrpcServerTest {
         private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
         private static final Logger LOG = LoggerFactory.getLogger(TikaGrpcServerTest.class);
         public static final int NUM_TEST_DOCS = 2;
    -    static File tikaConfigXmlTemplate = Paths
    -            .get("src", "test", "resources", "tika-pipes-test-config.xml")
    -            .toFile();
    +    static File tikaConfigXmlTemplate =
    +                    Paths.get("src", "test", "resources", "tika-pipes-test-config.xml").toFile();
         static File tikaConfigXml = new File("target", "tika-config-" + UUID.randomUUID() + ".xml");
     
     
    @@ -97,18 +92,13 @@ public void testFetcherCrud(Resources resources) throws Exception {
             Assertions.assertTrue(tikaConfigXml.setWritable(false));
             String serverName = InProcessServerBuilder.generateName();
     
    -        Server server = InProcessServerBuilder
    -                .forName(serverName)
    -                .directExecutor()
    -                .addService(new TikaGrpcServerImpl(tikaConfigXml.getAbsolutePath()))
    -                .build()
    -                .start();
    +        Server server = InProcessServerBuilder.forName(serverName).directExecutor()
    +                        .addService(new TikaGrpcServerImpl(tikaConfigXml.getAbsolutePath())).build()
    +                        .start();
             resources.register(server, Duration.ofSeconds(10));
     
    -        ManagedChannel channel = InProcessChannelBuilder
    -                .forName(serverName)
    -                .directExecutor()
    -                .build();
    +        ManagedChannel channel =
    +                        InProcessChannelBuilder.forName(serverName).directExecutor().build();
             resources.register(channel, Duration.ofSeconds(10));
             TikaGrpc.TikaBlockingStub blockingStub = TikaGrpc.newBlockingStub(channel);
     
    @@ -116,48 +106,36 @@ public void testFetcherCrud(Resources resources) throws Exception {
             // create fetchers
             for (int i = 0; i < NUM_FETCHERS_TO_CREATE; ++i) {
                 String fetcherId = createFetcherId(i);
    -            SaveFetcherReply reply = blockingStub.saveFetcher(SaveFetcherRequest
    -                    .newBuilder()
    -                    .setFetcherId(fetcherId)
    -                    .setFetcherClass(FileSystemFetcher.class.getName())
    -                    .setFetcherConfigJson(OBJECT_MAPPER.writeValueAsString(ImmutableMap
    -                            .builder()
    -                            .put("basePath", targetFolder)
    -                            .put("extractFileSystemMetadata", true)
    -                            .build()))
    -                    .build());
    +            SaveFetcherReply reply = blockingStub.saveFetcher(SaveFetcherRequest.newBuilder()
    +                            .setFetcherId(fetcherId)
    +                            .setFetcherClass(FileSystemFetcher.class.getName())
    +                            .setFetcherConfigJson(OBJECT_MAPPER.writeValueAsString(ImmutableMap
    +                                            .builder().put("basePath", targetFolder)
    +                                            .put("extractFileSystemMetadata", true).build()))
    +                            .build());
                 assertEquals(fetcherId, reply.getFetcherId());
             }
             // update fetchers
             for (int i = 0; i < NUM_FETCHERS_TO_CREATE; ++i) {
                 String fetcherId = createFetcherId(i);
    -            SaveFetcherReply reply = blockingStub.saveFetcher(SaveFetcherRequest
    -                    .newBuilder()
    -                    .setFetcherId(fetcherId)
    -                    .setFetcherClass(FileSystemFetcher.class.getName())
    -                    .setFetcherConfigJson(OBJECT_MAPPER.writeValueAsString(ImmutableMap
    -                            .builder()
    -                            .put("basePath", targetFolder)
    -                            .put("extractFileSystemMetadata", false)
    -                            .build()))
    -                    .build());
    +            SaveFetcherReply reply = blockingStub.saveFetcher(SaveFetcherRequest.newBuilder()
    +                            .setFetcherId(fetcherId)
    +                            .setFetcherClass(FileSystemFetcher.class.getName())
    +                            .setFetcherConfigJson(OBJECT_MAPPER.writeValueAsString(ImmutableMap
    +                                            .builder().put("basePath", targetFolder)
    +                                            .put("extractFileSystemMetadata", false).build()))
    +                            .build());
                 assertEquals(fetcherId, reply.getFetcherId());
    -            GetFetcherReply getFetcherReply = blockingStub.getFetcher(GetFetcherRequest
    -                    .newBuilder()
    -                    .setFetcherId(fetcherId)
    -                    .build());
    -            assertEquals("false", getFetcherReply
    -                    .getParamsMap()
    -                    .get("extractFileSystemMetadata"));
    +            GetFetcherReply getFetcherReply = blockingStub.getFetcher(
    +                            GetFetcherRequest.newBuilder().setFetcherId(fetcherId).build());
    +            assertEquals("false", getFetcherReply.getParamsMap().get("extractFileSystemMetadata"));
             }
     
             // get fetchers
             for (int i = 0; i < NUM_FETCHERS_TO_CREATE; ++i) {
                 String fetcherId = createFetcherId(i);
    -            GetFetcherReply getFetcherReply = blockingStub.getFetcher(GetFetcherRequest
    -                    .newBuilder()
    -                    .setFetcherId(fetcherId)
    -                    .build());
    +            GetFetcherReply getFetcherReply = blockingStub.getFetcher(
    +                            GetFetcherRequest.newBuilder().setFetcherId(fetcherId).build());
                 assertEquals(fetcherId, getFetcherReply.getFetcherId());
                 assertEquals(FileSystemFetcher.class.getName(), getFetcherReply.getFetcherClass());
             }
    @@ -165,26 +143,20 @@ public void testFetcherCrud(Resources resources) throws Exception {
             // delete fetchers
             for (int i = 0; i < NUM_FETCHERS_TO_CREATE; ++i) {
                 String fetcherId = createFetcherId(i);
    -            DeleteFetcherReply deleteFetcherReply = blockingStub.deleteFetcher(DeleteFetcherRequest
    -                    .newBuilder()
    -                    .setFetcherId(fetcherId)
    -                    .build());
    +            DeleteFetcherReply deleteFetcherReply = blockingStub.deleteFetcher(
    +                            DeleteFetcherRequest.newBuilder().setFetcherId(fetcherId).build());
                 Assertions.assertTrue(deleteFetcherReply.getSuccess());
    -            StatusRuntimeException statusRuntimeException = Assertions.assertThrows(StatusRuntimeException.class, () -> blockingStub.getFetcher(GetFetcherRequest
    -                    .newBuilder()
    -                    .setFetcherId(fetcherId)
    -                    .build()));
    -            Assertions.assertEquals(Status.NOT_FOUND
    -                    .getCode()
    -                    .value(), statusRuntimeException
    -                    .getStatus()
    -                    .getCode()
    -                    .value());
    +            StatusRuntimeException statusRuntimeException =
    +                            Assertions.assertThrows(StatusRuntimeException.class,
    +                                            () -> blockingStub.getFetcher(GetFetcherRequest
    +                                                            .newBuilder().setFetcherId(fetcherId)
    +                                                            .build()));
    +            Assertions.assertEquals(Status.NOT_FOUND.getCode().value(),
    +                            statusRuntimeException.getStatus().getCode().value());
             }
         }
     
    -    @NotNull
    -    private static String createFetcherId(int i) {
    +    @NotNull private static String createFetcherId(int i) {
             return "nick" + i + ":is:cool:super/" + FileSystemFetcher.class;
         }
     
    @@ -192,34 +164,25 @@ private static String createFetcherId(int i) {
         public void testBiStream(Resources resources) throws Exception {
             String serverName = InProcessServerBuilder.generateName();
     
    -        Server server = InProcessServerBuilder
    -                .forName(serverName)
    -                .directExecutor()
    -                .addService(new TikaGrpcServerImpl(tikaConfigXml.getAbsolutePath()))
    -                .build()
    -                .start();
    +        Server server = InProcessServerBuilder.forName(serverName).directExecutor()
    +                        .addService(new TikaGrpcServerImpl(tikaConfigXml.getAbsolutePath())).build()
    +                        .start();
             resources.register(server, Duration.ofSeconds(10));
     
    -        ManagedChannel channel = InProcessChannelBuilder
    -                .forName(serverName)
    -                .directExecutor()
    -                .build();
    +        ManagedChannel channel =
    +                        InProcessChannelBuilder.forName(serverName).directExecutor().build();
             resources.register(channel, Duration.ofSeconds(10));
             TikaGrpc.TikaBlockingStub blockingStub = TikaGrpc.newBlockingStub(channel);
             TikaGrpc.TikaStub tikaStub = TikaGrpc.newStub(channel);
     
             String fetcherId = createFetcherId(1);
             String targetFolder = new File("target").getAbsolutePath();
    -        SaveFetcherReply reply = blockingStub.saveFetcher(SaveFetcherRequest
    -                .newBuilder()
    -                .setFetcherId(fetcherId)
    -                .setFetcherClass(FileSystemFetcher.class.getName())
    -                .setFetcherConfigJson(OBJECT_MAPPER.writeValueAsString(ImmutableMap
    -                        .builder()
    -                        .put("basePath", targetFolder)
    -                        .put("extractFileSystemMetadata", true)
    -                        .build()))
    -                .build());
    +        SaveFetcherReply reply = blockingStub.saveFetcher(SaveFetcherRequest.newBuilder()
    +                        .setFetcherId(fetcherId).setFetcherClass(FileSystemFetcher.class.getName())
    +                        .setFetcherConfigJson(OBJECT_MAPPER.writeValueAsString(ImmutableMap
    +                                        .builder().put("basePath", targetFolder)
    +                                        .put("extractFileSystemMetadata", true).build()))
    +                        .build());
     
             assertEquals(fetcherId, reply.getFetcherId());
     
    @@ -230,8 +193,10 @@ public void testBiStream(Resources resources) throws Exception {
             StreamObserver replyStreamObserver = new StreamObserver<>() {
                 @Override
                 public void onNext(FetchAndParseReply fetchAndParseReply) {
    -                LOG.debug("Fetched {} with metadata {}", fetchAndParseReply.getFetchKey(), fetchAndParseReply.getFieldsMap());
    -                if (PipesResult.STATUS.FETCH_EXCEPTION.name().equals(fetchAndParseReply.getStatus())) {
    +                LOG.debug("Fetched {} with metadata {}", fetchAndParseReply.getFetchKey(),
    +                                fetchAndParseReply.getFieldsMap());
    +                if (PipesResult.STATUS.FETCH_EXCEPTION.name()
    +                                .equals(fetchAndParseReply.getStatus())) {
                         errors.add(fetchAndParseReply);
                     } else {
                         successes.add(fetchAndParseReply);
    @@ -250,32 +215,30 @@ public void onCompleted() {
                 }
             };
     
    -        StreamObserver requestStreamObserver = tikaStub.fetchAndParseBiDirectionalStreaming(replyStreamObserver);
    +        StreamObserver requestStreamObserver =
    +                        tikaStub.fetchAndParseBiDirectionalStreaming(replyStreamObserver);
     
    -        File testDocumentFolder = new File("target/" + DateTimeFormatter
    -                .ofPattern("yyyy_MM_dd_HH_mm_ssSSS", Locale.getDefault())
    -                .format(LocalDateTime.now(ZoneId.systemDefault())) + "-" + UUID.randomUUID());
    +        File testDocumentFolder = new File("target/"
    +                        + DateTimeFormatter.ofPattern("yyyy_MM_dd_HH_mm_ssSSS", Locale.getDefault())
    +                                        .format(LocalDateTime.now(ZoneId.systemDefault()))
    +                        + "-" + UUID.randomUUID());
             assertTrue(testDocumentFolder.mkdir());
             try {
                 for (int i = 0; i < NUM_TEST_DOCS; ++i) {
                     File testFile = new File(testDocumentFolder, "test-" + i + ".html");
    -                FileUtils.writeStringToFile(testFile, "test " + i + "", StandardCharsets.UTF_8);
    +                FileUtils.writeStringToFile(testFile, "test " + i + "",
    +                                StandardCharsets.UTF_8);
                 }
                 File[] testDocuments = testDocumentFolder.listFiles();
                 assertNotNull(testDocuments);
                 for (File testDocument : testDocuments) {
    -                requestStreamObserver.onNext(FetchAndParseRequest
    -                        .newBuilder()
    -                        .setFetcherId(fetcherId)
    -                        .setFetchKey(testDocument.getAbsolutePath())
    -                        .build());
    +                requestStreamObserver.onNext(FetchAndParseRequest.newBuilder()
    +                                .setFetcherId(fetcherId).setFetchKey(testDocument.getAbsolutePath())
    +                                .build());
                 }
                 // Now test error condition
    -            requestStreamObserver.onNext(FetchAndParseRequest
    -                    .newBuilder()
    -                    .setFetcherId(fetcherId)
    -                    .setFetchKey("does not exist")
    -                    .build());
    +            requestStreamObserver.onNext(FetchAndParseRequest.newBuilder().setFetcherId(fetcherId)
    +                            .setFetchKey("does not exist").build());
                 requestStreamObserver.onCompleted();
                 assertEquals(NUM_TEST_DOCS, successes.size());
                 assertEquals(1, errors.size());
    diff --git a/tika-handlers/tika-handler-boilerpipe/src/main/java/org/apache/tika/sax/boilerpipe/BoilerpipeContentHandler.java b/tika-handlers/tika-handler-boilerpipe/src/main/java/org/apache/tika/sax/boilerpipe/BoilerpipeContentHandler.java
    index 7558afdd10..53b2436537 100644
    --- a/tika-handlers/tika-handler-boilerpipe/src/main/java/org/apache/tika/sax/boilerpipe/BoilerpipeContentHandler.java
    +++ b/tika-handlers/tika-handler-boilerpipe/src/main/java/org/apache/tika/sax/boilerpipe/BoilerpipeContentHandler.java
    @@ -1,29 +1,19 @@
     /*
    - * Licensed to the Apache Software Foundation (ASF) under one or more
    - * contributor license agreements.  See the NOTICE file distributed with
    - * this work for additional information regarding copyright ownership.
    - * The ASF licenses this file to You under the Apache License, Version 2.0
    - * (the "License"); you may not use this file except in compliance with
    - * the License.  You may obtain a copy of the License at
    + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license
    + * agreements. See the NOTICE file distributed with this work for additional information regarding
    + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the
    + * "License"); you may not use this file except in compliance with the License. You may obtain a
    + * copy of the License at
      *
    - *     http://www.apache.org/licenses/LICENSE-2.0
    + * http://www.apache.org/licenses/LICENSE-2.0
      *
    - * Unless required by applicable law or agreed to in writing, software
    - * distributed under the License is distributed on an "AS IS" BASIS,
    - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    - * See the License for the specific language governing permissions and
    - * limitations under the License.
    + * Unless required by applicable law or agreed to in writing, software distributed under the License
    + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
    + * or implied. See the License for the specific language governing permissions and limitations under
    + * the License.
      */
     package org.apache.tika.sax.boilerpipe;
     
    -import java.io.Writer;
    -import java.util.ArrayList;
    -import java.util.BitSet;
    -import java.util.HashSet;
    -import java.util.List;
    -import java.util.Locale;
    -import java.util.Set;
    -
     import de.l3s.boilerpipe.BoilerpipeExtractor;
     import de.l3s.boilerpipe.BoilerpipeProcessingException;
     import de.l3s.boilerpipe.document.TextBlock;
    @@ -31,28 +21,33 @@
     import de.l3s.boilerpipe.extractors.ArticleExtractor;
     import de.l3s.boilerpipe.extractors.DefaultExtractor;
     import de.l3s.boilerpipe.sax.BoilerpipeHTMLContentHandler;
    +import java.io.Writer;
    +import java.util.ArrayList;
    +import java.util.BitSet;
    +import java.util.HashSet;
    +import java.util.List;
    +import java.util.Locale;
    +import java.util.Set;
    +import org.apache.tika.metadata.Metadata;
    +import org.apache.tika.sax.WriteOutContentHandler;
    +import org.apache.tika.sax.XHTMLContentHandler;
     import org.xml.sax.Attributes;
     import org.xml.sax.ContentHandler;
     import org.xml.sax.SAXException;
     import org.xml.sax.helpers.AttributesImpl;
     
    -import org.apache.tika.metadata.Metadata;
    -import org.apache.tika.sax.WriteOutContentHandler;
    -import org.apache.tika.sax.XHTMLContentHandler;
    -
     /**
    - * Uses the boilerpipe
    - * library to automatically extract the main content from a web page.
    + * Uses the boilerpipe library to automatically
    + * extract the main content from a web page.
      * 

    * Use this as a {@link ContentHandler} object passed to - * {@link HtmlParser#parse(java.io.InputStream, ContentHandler, Metadata, - * org.apache.tika.parser.ParseContext)} + * {@link HtmlParser#parse(java.io.InputStream, ContentHandler, Metadata, org.apache.tika.parser.ParseContext)} */ public class BoilerpipeContentHandler extends BoilerpipeHTMLContentHandler { /** * The newline character that gets inserted after block elements. */ - private static final char[] NL = new char[]{'\n'}; + private static final char[] NL = new char[] {'\n'}; private static Set ALLOWABLE_CHARS; static { @@ -72,8 +67,8 @@ public class BoilerpipeContentHandler extends BoilerpipeHTMLContentHandler { private TextDocument td; /** - * Creates a new boilerpipe-based content extractor, using the - * {@link DefaultExtractor} extraction rules and "delegate" as the content handler. + * Creates a new boilerpipe-based content extractor, using the {@link DefaultExtractor} + * extraction rules and "delegate" as the content handler. * * @param delegate The {@link ContentHandler} object */ @@ -82,8 +77,7 @@ public BoilerpipeContentHandler(ContentHandler delegate) { } /** - * Creates a content handler that writes XHTML body character events to - * the given writer. + * Creates a content handler that writes XHTML body character events to the given writer. * * @param writer writer */ @@ -92,11 +86,10 @@ public BoilerpipeContentHandler(Writer writer) { } /** - * Creates a new boilerpipe-based content extractor, using the given - * extraction rules. The extracted main content will be passed to the - * content handler. + * Creates a new boilerpipe-based content extractor, using the given extraction rules. The + * extracted main content will be passed to the content handler. * - * @param delegate The {@link ContentHandler} object + * @param delegate The {@link ContentHandler} object * @param extractor Extraction rules to use, e.g. 
{@link ArticleExtractor} */ public BoilerpipeContentHandler(ContentHandler delegate, BoilerpipeExtractor extractor) { @@ -145,7 +138,7 @@ public void startPrefixMapping(String prefix, String uri) throws SAXException { @Override public void startElement(String uri, String localName, String qName, Attributes atts) - throws SAXException { + throws SAXException { super.startElement(uri, localName, qName, atts); if (inHeader) { @@ -231,7 +224,7 @@ public void endDocument() throws SAXException { switch (element.getElementType()) { case START: delegate.startElement(element.getUri(), element.getLocalName(), - element.getQName(), element.getAttrs()); + element.getQName(), element.getAttrs()); // Fall through case CONTINUE: @@ -245,14 +238,14 @@ public void endDocument() throws SAXException { // https://issues.apache.org/jira/projects/TIKA/issues/TIKA-2683 // Allow exempted characters to be written - if (isValidCharacterRun || - (chars.length == 1 && ALLOWABLE_CHARS.contains(chars[0]))) { + if (isValidCharacterRun || (chars.length == 1 + && ALLOWABLE_CHARS.contains(chars[0]))) { delegate.characters(chars, 0, chars.length); } // https://issues.apache.org/jira/browse/TIKA-961 - if (isValidCharacterRun && i == element.getCharacters().size() - 1 && - !Character.isWhitespace(chars[chars.length - 1])) { + if (isValidCharacterRun && i == element.getCharacters().size() - 1 + && !Character.isWhitespace(chars[chars.length - 1])) { // Only add whitespace for certain elements if (XHTMLContentHandler.ENDLINE.contains(element.getLocalName())) { delegate.ignorableWhitespace(NL, 0, NL.length); @@ -263,12 +256,12 @@ public void endDocument() throws SAXException { case END: delegate.endElement(element.getUri(), element.getLocalName(), - element.getQName()); + element.getQName()); break; default: throw new RuntimeException( - "Unhandled element type: " + element.getElementType()); + "Unhandled element type: " + element.getElementType()); } @@ -316,7 +309,7 @@ public RecordedElement() { } 
protected RecordedElement(String uri, String localName, String qName, Attributes attrs, - RecordedElement.ElementType elementType) { + RecordedElement.ElementType elementType) { this.uri = uri; this.localName = localName; this.qName = qName; diff --git a/tika-integration-tests/tika-pipes-kafka-integration-tests/src/test/java/org/apache/tika/pipes/kafka/tests/TikaPipesKafkaTest.java b/tika-integration-tests/tika-pipes-kafka-integration-tests/src/test/java/org/apache/tika/pipes/kafka/tests/TikaPipesKafkaTest.java index 50282909a0..c192491c80 100644 --- a/tika-integration-tests/tika-pipes-kafka-integration-tests/src/test/java/org/apache/tika/pipes/kafka/tests/TikaPipesKafkaTest.java +++ b/tika-integration-tests/tika-pipes-kafka-integration-tests/src/test/java/org/apache/tika/pipes/kafka/tests/TikaPipesKafkaTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.pipes.kafka.tests; @@ -20,6 +18,9 @@ import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assumptions.assumeTrue; +import com.fasterxml.jackson.core.type.TypeReference; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.common.base.Stopwatch; import java.io.File; import java.io.InputStream; import java.nio.charset.StandardCharsets; @@ -34,10 +35,6 @@ import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.TimeUnit; - -import com.fasterxml.jackson.core.type.TypeReference; -import com.fasterxml.jackson.databind.ObjectMapper; -import com.google.common.base.Stopwatch; import org.apache.commons.io.FileUtils; import org.apache.commons.io.FilenameUtils; import org.apache.commons.io.IOUtils; @@ -50,6 +47,9 @@ import org.apache.kafka.clients.producer.ProducerRecord; import org.apache.kafka.common.serialization.StringDeserializer; import org.apache.kafka.common.serialization.StringSerializer; +import org.apache.tika.cli.TikaCLI; +import org.apache.tika.pipes.core.HandlerConfig; +import org.apache.tika.utils.SystemUtils; import 
org.jetbrains.annotations.NotNull; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeAll; @@ -61,28 +61,25 @@ import org.testcontainers.kafka.ConfluentKafkaContainer; import org.testcontainers.utility.DockerImageName; -import org.apache.tika.cli.TikaCLI; -import org.apache.tika.pipes.core.HandlerConfig; -import org.apache.tika.utils.SystemUtils; - /** * Test will emit some documents into a Kafka "pipe_iterator_topic", then kafka pipe iterator will - * poll those documents and send them to tika pipes. Tika pipes will then use a file fetcher to fetch/parse, then - * the kafka emitter will send the now-parsed output to the "emitter_topic". - * Will then wait for the messages to come from the emitter and assert they are correct. + * poll those documents and send them to tika pipes. Tika pipes will then use a file fetcher to + * fetch/parse, then the kafka emitter will send the now-parsed output to the "emitter_topic". Will + * then wait for the messages to come from the emitter and assert they are correct. */ @Testcontainers(disabledWithoutDocker = true) public class TikaPipesKafkaTest { @BeforeAll public static void setUp() { assumeTrue(!SystemUtils.IS_OS_MAC_OSX && !SystemUtils.OS_VERSION.equals("12.6.1"), - "This stopped working on macos x ... TIKA-3932"); + "This stopped working on macos x ... TIKA-3932"); } + public static final String PIPE_ITERATOR_TOPIC = "pipe_iterator_topic"; public static final String EMITTER_TOPIC = "emitter_topic"; /** - * Wait up to this many minutes before you give up waiting for the emitted documents to poll from the - * emitter_topic and fail the test. + * Wait up to this many minutes before you give up waiting for the emitted documents to poll + * from the emitter_topic and fail the test. 
*/ public static final int WAIT_FOR_EMITTED_DOCS_TIMEOUT_MINUTES = 2; private static final Logger LOG = LoggerFactory.getLogger(TikaPipesKafkaTest.class); @@ -94,7 +91,8 @@ public static void setUp() { private final Set waitingFor = new HashSet<>(); // https://java.testcontainers.org/modules/kafka/#using-orgtestcontainerskafkaconfluentkafkacontainer - ConfluentKafkaContainer kafka = new ConfluentKafkaContainer(DockerImageName.parse("confluentinc/cp-kafka:7.4.0")); + ConfluentKafkaContainer kafka = new ConfluentKafkaContainer( + DockerImageName.parse("confluentinc/cp-kafka:7.4.0")); private void createTestFiles() throws Exception { if (testFileFolder.mkdirs()) { @@ -103,7 +101,7 @@ private void createTestFiles() throws Exception { for (int i = 0; i < numDocs; ++i) { String nextFileName = "test-" + i + ".html"; FileUtils.writeStringToFile(new File(testFileFolder, nextFileName), - "body-" + i + "", StandardCharsets.UTF_8); + "body-" + i + "", StandardCharsets.UTF_8); waitingFor.add(nextFileName); } } @@ -124,7 +122,7 @@ public void testKafkaPipeIteratorAndEmitter() throws Exception { File tikaConfigFile = new File("target", "ta.xml"); File log4jPropFile = new File("target", "tmp-log4j2.xml"); try (InputStream is = this.getClass() - .getResourceAsStream("/pipes-fork-server-custom-log4j2.xml")) { + .getResourceAsStream("/pipes-fork-server-custom-log4j2.xml")) { assert is != null; FileUtils.copyInputStreamToFile(is, log4jPropFile); } @@ -138,17 +136,17 @@ public void testKafkaPipeIteratorAndEmitter() throws Exception { consumerProps.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, kafka.getBootstrapServers()); consumerProps.put("group.id", UUID.randomUUID().toString()); consumerProps.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, - StringDeserializer.class.getName()); + StringDeserializer.class.getName()); consumerProps.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, - StringDeserializer.class.getName()); + StringDeserializer.class.getName()); 
consumerProps.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest"); Properties producerProps = new Properties(); producerProps.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, kafka.getBootstrapServers()); producerProps.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, - StringSerializer.class.getName()); + StringSerializer.class.getName()); producerProps.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, - StringSerializer.class.getName()); + StringSerializer.class.getName()); KafkaConsumer consumer = new KafkaConsumer<>(consumerProps); LOG.info("Listening to EMITTER_TOPIC={}", EMITTER_TOPIC); @@ -165,8 +163,8 @@ public void testKafkaPipeIteratorAndEmitter() throws Exception { meta.put("path", nextFile.getAbsolutePath()); meta.put("totalSpace", nextFile.getTotalSpace()); try { - producer.send( - new ProducerRecord<>(PIPE_ITERATOR_TOPIC, nextFile.getAbsolutePath(), + producer.send(new ProducerRecord<>(PIPE_ITERATOR_TOPIC, + nextFile.getAbsolutePath(), objectMapper.writeValueAsString(meta))).get(); LOG.info("Sent fetch request : {}", nextFile.getAbsolutePath()); ++numSent; @@ -179,32 +177,30 @@ public void testKafkaPipeIteratorAndEmitter() throws Exception { es.execute(() -> { try { - String tikaConfigXml = - createTikaConfigXml(tikaConfigFile, log4jPropFile, tikaConfigTemplateXml); + String tikaConfigXml = createTikaConfigXml(tikaConfigFile, log4jPropFile, + tikaConfigTemplateXml); FileUtils.writeStringToFile(tikaConfigFile, tikaConfigXml, StandardCharsets.UTF_8); - TikaCLI.main(new String[]{"-a", "-c", tikaConfigFile.getAbsolutePath()}); + TikaCLI.main(new String[] {"-a", "-c", tikaConfigFile.getAbsolutePath()}); } catch (Exception e) { throw new RuntimeException(e); } }); - LOG.info( - "Tika pipes have been started. See if we can pull the response messages from the EMITTER_TOPIC={}", - EMITTER_TOPIC); + LOG.info("Tika pipes have been started. 
See if we can pull the response messages from the EMITTER_TOPIC={}", + EMITTER_TOPIC); Stopwatch stopwatch = Stopwatch.createStarted(); while (!waitingFor.isEmpty()) { assertFalse(stopwatch.elapsed(TimeUnit.MINUTES) > WAIT_FOR_EMITTED_DOCS_TIMEOUT_MINUTES, - "Timed out after " + WAIT_FOR_EMITTED_DOCS_TIMEOUT_MINUTES + - " minutes waiting for the emitted docs"); + "Timed out after " + WAIT_FOR_EMITTED_DOCS_TIMEOUT_MINUTES + + " minutes waiting for the emitted docs"); try { ConsumerRecords records = consumer.poll(Duration.ofSeconds(1)); for (ConsumerRecord record : records) { String val = record.value(); - Map valMap = - objectMapper.readValue(val, new TypeReference>() { - }); + Map valMap = objectMapper.readValue(val, + new TypeReference>() {}); waitingFor.remove(FilenameUtils.getName(record.key())); assertNotNull(valMap.get("content_s")); assertNotNull(valMap.get("mime_s")); @@ -219,15 +215,14 @@ public void testKafkaPipeIteratorAndEmitter() throws Exception { LOG.info("Done"); } - @NotNull - private String createTikaConfigXml(File tikaConfigFile, File log4jPropFile, - String tikaConfigTemplateXml) { + @NotNull private String createTikaConfigXml(File tikaConfigFile, File log4jPropFile, + String tikaConfigTemplateXml) { return tikaConfigTemplateXml.replace("{TIKA_CONFIG}", tikaConfigFile.getAbsolutePath()) - .replace("{LOG4J_PROPERTIES_FILE}", log4jPropFile.getAbsolutePath()) - .replace("{PATH_TO_DOCS}", testFileFolder.getAbsolutePath()) - .replace("{PARSE_MODE}", HandlerConfig.PARSE_MODE.RMETA.name()) - .replace("{PIPE_ITERATOR_TOPIC}", PIPE_ITERATOR_TOPIC) - .replace("{EMITTER_TOPIC}", EMITTER_TOPIC) - .replace("{BOOTSTRAP_SERVERS}", kafka.getBootstrapServers()); + .replace("{LOG4J_PROPERTIES_FILE}", log4jPropFile.getAbsolutePath()) + .replace("{PATH_TO_DOCS}", testFileFolder.getAbsolutePath()) + .replace("{PARSE_MODE}", HandlerConfig.PARSE_MODE.RMETA.name()) + .replace("{PIPE_ITERATOR_TOPIC}", PIPE_ITERATOR_TOPIC) + .replace("{EMITTER_TOPIC}", EMITTER_TOPIC) + 
.replace("{BOOTSTRAP_SERVERS}", kafka.getBootstrapServers()); } } diff --git a/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/java/org/apache/tika/pipes/opensearch/tests/OpenSearchTest.java b/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/java/org/apache/tika/pipes/opensearch/tests/OpenSearchTest.java index 9923a320a3..e894713eff 100644 --- a/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/java/org/apache/tika/pipes/opensearch/tests/OpenSearchTest.java +++ b/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/java/org/apache/tika/pipes/opensearch/tests/OpenSearchTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
+ * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.pipes.opensearch.tests; @@ -20,6 +18,7 @@ import static org.junit.jupiter.api.Assertions.assertNull; import static org.junit.jupiter.api.Assertions.assertTrue; +import com.fasterxml.jackson.databind.JsonNode; import java.io.File; import java.io.IOException; import java.io.InputStream; @@ -32,19 +31,7 @@ import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; - -import com.fasterxml.jackson.databind.JsonNode; import org.apache.commons.io.IOUtils; -import org.jetbrains.annotations.NotNull; -import org.junit.jupiter.api.AfterAll; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.io.TempDir; -import org.opensearch.testcontainers.OpensearchContainer; -import org.testcontainers.junit.jupiter.Testcontainers; -import org.testcontainers.utility.DockerImageName; - import org.apache.tika.cli.TikaCLI; import org.apache.tika.client.HttpClientFactory; import org.apache.tika.exception.TikaConfigException; @@ -55,10 +42,20 @@ import org.apache.tika.pipes.core.emitter.EmitterManager; import org.apache.tika.pipes.emitter.opensearch.JsonResponse; import org.apache.tika.pipes.emitter.opensearch.OpenSearchEmitter; +import org.jetbrains.annotations.NotNull; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; +import org.opensearch.testcontainers.OpensearchContainer; +import org.testcontainers.junit.jupiter.Testcontainers; +import org.testcontainers.utility.DockerImageName; 
@Testcontainers(disabledWithoutDocker = true) public class OpenSearchTest { - private static DockerImageName OPENSEARCH_IMAGE = DockerImageName.parse("opensearchproject/opensearch:2.19.3"); + private static DockerImageName OPENSEARCH_IMAGE = + DockerImageName.parse("opensearchproject/opensearch:2.19.3"); private static OpensearchContainer CONTAINER; protected static final String TEST_INDEX = "tika-pipes-index"; @@ -87,7 +84,8 @@ public void clearIndex() throws TikaConfigException, IOException { } @Test - public void testBasicFSToOpenSearch(@TempDir Path pipesDirectory, @TempDir Path testDocDirectory) throws Exception { + public void testBasicFSToOpenSearch(@TempDir Path pipesDirectory, + @TempDir Path testDocDirectory) throws Exception { OpensearchTestClient client = getNewClient(); int numHtmlDocs = 42; @@ -97,31 +95,32 @@ public void testBasicFSToOpenSearch(@TempDir Path pipesDirectory, @TempDir Path sendMappings(client, endpoint, TEST_INDEX, "opensearch-mappings.json"); runPipes(client, OpenSearchEmitter.AttachmentStrategy.SEPARATE_DOCUMENTS, - OpenSearchEmitter.UpdateStrategy.UPSERT, HandlerConfig.PARSE_MODE.CONCATENATE, endpoint, - pipesDirectory, testDocDirectory); + OpenSearchEmitter.UpdateStrategy.UPSERT, + HandlerConfig.PARSE_MODE.CONCATENATE, endpoint, pipesDirectory, + testDocDirectory); - String query = "{ \"track_total_hits\": true, \"query\": { \"match\": { \"content\": { " + - "\"query\": \"happiness\" } } } }"; + String query = "{ \"track_total_hits\": true, \"query\": { \"match\": { \"content\": { " + + "\"query\": \"happiness\" } } } }"; JsonResponse results = client.postJson(endpoint + "/_search", query); assertEquals(200, results.getStatus()); assertEquals(numHtmlDocs + 1, - results.getJson().get("hits").get("total").get("value").asInt()); + results.getJson().get("hits").get("total").get("value").asInt()); - //now try match all - query = "{ \"track_total_hits\": true, \"query\": { \"match_all\": {} }, " + - "\"from\": 0, \"size\": 1000 }"; + // 
now try match all + query = "{ \"track_total_hits\": true, \"query\": { \"match_all\": {} }, " + + "\"from\": 0, \"size\": 1000 }"; results = client.postJson(endpoint + "/_search", query); assertEquals(200, results.getStatus()); assertEquals(numHtmlDocs + numTestDocs, - results.getJson().get("hits").get("total").get("value").asInt()); + results.getJson().get("hits").get("total").get("value").asInt()); - //now test that the reporter worked + // now test that the reporter worked Map statusCounts = new HashMap<>(); for (JsonNode n : results.getJson().get("hits").get("hits")) { String status = n.get("_source").get("my_test_parse_status").asText(); - //this will throw an NPE if the field isn't there - //in short, this guarantees that the value is there + // this will throw an NPE if the field isn't there + // in short, this guarantees that the value is there long parseTimeMs = n.get("_source").get("my_test_parse_time_ms").asLong(); Integer cnt = statusCounts.get(status); if (cnt == null) { @@ -132,9 +131,9 @@ public void testBasicFSToOpenSearch(@TempDir Path pipesDirectory, @TempDir Path statusCounts.put(status, cnt); } assertEquals(numHtmlDocs, (int) statusCounts.get("PARSE_SUCCESS")); - //the npe is caught and counted as a "parse success with exception" + // the npe is caught and counted as a "parse success with exception" assertEquals(1, (int) statusCounts.get("PARSE_SUCCESS_WITH_EXCEPTION")); - //the embedded docx is emitted directly + // the embedded docx is emitted directly assertEquals(1, (int) statusCounts.get("EMIT_SUCCESS")); assertEquals(2, (int) statusCounts.get("OOM")); @@ -142,7 +141,8 @@ public void testBasicFSToOpenSearch(@TempDir Path pipesDirectory, @TempDir Path @Test - public void testParentChildFSToOpenSearch(@TempDir Path pipesDirectory, @TempDir Path testDocDirectory) throws Exception { + public void testParentChildFSToOpenSearch(@TempDir Path pipesDirectory, + @TempDir Path testDocDirectory) throws Exception { int numHtmlDocs = 42; 
OpensearchTestClient client = getNewClient(); @@ -151,65 +151,62 @@ public void testParentChildFSToOpenSearch(@TempDir Path pipesDirectory, @TempDir sendMappings(client, endpoint, TEST_INDEX, "opensearch-parent-child-mappings.json"); runPipes(client, OpenSearchEmitter.AttachmentStrategy.PARENT_CHILD, - OpenSearchEmitter.UpdateStrategy.OVERWRITE, - HandlerConfig.PARSE_MODE.RMETA, endpoint, pipesDirectory, testDocDirectory); + OpenSearchEmitter.UpdateStrategy.OVERWRITE, HandlerConfig.PARSE_MODE.RMETA, + endpoint, pipesDirectory, testDocDirectory); - String query = "{ \"track_total_hits\": true, \"query\": { \"match\": { \"content\": { " + - "\"query\": \"happiness\" } } } }"; + String query = "{ \"track_total_hits\": true, \"query\": { \"match\": { \"content\": { " + + "\"query\": \"happiness\" } } } }"; JsonResponse results = client.postJson(endpoint + "/_search", query); assertEquals(200, results.getStatus()); - assertEquals(numHtmlDocs + 1, results.getJson().get("hits").get("total").get("value").asInt()); + assertEquals(numHtmlDocs + 1, + results.getJson().get("hits").get("total").get("value").asInt()); - //now try match all + // now try match all query = "{ " + - //"\"from\":0, \"size\":1000," + - "\"track_total_hits\": true, \"query\": { " + - "\"match_all\": {} } }"; + // "\"from\":0, \"size\":1000," + + "\"track_total_hits\": true, \"query\": { " + "\"match_all\": {} } }"; results = client.postJson(endpoint + "/_search", query); assertEquals(200, results.getStatus()); assertEquals(numHtmlDocs + 3 + 12, // 3 mock files and... 
- // the .docx file has 11 embedded files, plus itself - results.getJson().get("hits").get("total").get("value").asInt()); + // the .docx file has 11 embedded files, plus itself + results.getJson().get("hits").get("total").get("value").asInt()); - //now check out one of the embedded files - query = "{ \"track_total_hits\": true, \"query\": { \"query_string\": { " + - "\"default_field\": \"content\", " + - "\"query\": \"embed4 zip\" , \"minimum_should_match\":2 } } } "; + // now check out one of the embedded files + query = "{ \"track_total_hits\": true, \"query\": { \"query_string\": { " + + "\"default_field\": \"content\", " + + "\"query\": \"embed4 zip\" , \"minimum_should_match\":2 } } } "; results = client.postJson(endpoint + "/_search", query); assertEquals(200, results.getStatus()); - assertEquals(1, - results.getJson().get("hits").get("total").get("value").asInt()); + assertEquals(1, results.getJson().get("hits").get("total").get("value").asInt()); JsonNode source = results.getJson().get("hits").get("hits").get(0).get("_source"); Matcher m = Pattern - .compile("\\Atest_recursive_embedded" + - ".docx-[0-9a-f]{8}-[0-9a-f]{4}-" + - "[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\\Z").matcher( - results.getJson().get("hits").get("hits").get(0).get("_id").asText() - ); + .compile("\\Atest_recursive_embedded" + ".docx-[0-9a-f]{8}-[0-9a-f]{4}-" + + "[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\\Z") + .matcher(results.getJson().get("hits").get("hits").get(0).get("_id") + .asText()); assertTrue(m.find(), "test_recursive_embedded.docx_$guid"); assertEquals("test_recursive_embedded.docx", - results.getJson().get("hits").get("hits").get(0).get("_routing").asText()); + results.getJson().get("hits").get("hits").get(0).get("_routing").asText()); assertEquals("test_recursive_embedded.docx", - source.get("relation_type").get("parent").asText()); - assertEquals("embedded", - source.get("relation_type").get("name").asText()); + source.get("relation_type").get("parent").asText()); + 
assertEquals("embedded", source.get("relation_type").get("name").asText()); assertEquals("application/zip", source.get("mime").asText()); - //now make sure all the children are returned by a parent search - query = "{ \"track_total_hits\": true, \"query\": { \"parent_id\": { " + - "\"type\": \"embedded\", " + - "\"id\": \"test_recursive_embedded.docx\" } } } "; + // now make sure all the children are returned by a parent search + query = "{ \"track_total_hits\": true, \"query\": { \"parent_id\": { " + + "\"type\": \"embedded\", " + + "\"id\": \"test_recursive_embedded.docx\" } } } "; results = client.postJson(endpoint + "/_search", query); - assertEquals(11, - results.getJson().get("hits").get("total").get("value").asInt()); + assertEquals(11, results.getJson().get("hits").get("total").get("value").asInt()); } @Test - public void testSeparateDocsFSToOpenSearch(@TempDir Path pipesDirectory, @TempDir Path testDocDirectory) throws Exception { + public void testSeparateDocsFSToOpenSearch(@TempDir Path pipesDirectory, + @TempDir Path testDocDirectory) throws Exception { OpensearchTestClient client = getNewClient(); int numHtmlDocs = 42; @@ -218,138 +215,134 @@ public void testSeparateDocsFSToOpenSearch(@TempDir Path pipesDirectory, @TempDi sendMappings(client, endpoint, TEST_INDEX, "opensearch-mappings.json"); runPipes(client, OpenSearchEmitter.AttachmentStrategy.SEPARATE_DOCUMENTS, - OpenSearchEmitter.UpdateStrategy.OVERWRITE, - HandlerConfig.PARSE_MODE.RMETA, endpoint, - pipesDirectory, testDocDirectory); + OpenSearchEmitter.UpdateStrategy.OVERWRITE, HandlerConfig.PARSE_MODE.RMETA, + endpoint, pipesDirectory, testDocDirectory); - String query = "{ \"track_total_hits\": true, \"query\": { \"match\": { \"content\": { " + - "\"query\": \"happiness\" } } } }"; + String query = "{ \"track_total_hits\": true, \"query\": { \"match\": { \"content\": { " + + "\"query\": \"happiness\" } } } }"; JsonResponse results = client.postJson(endpoint + "/_search", query); 
assertEquals(200, results.getStatus()); assertEquals(numHtmlDocs + 1, - results.getJson().get("hits").get("total").get("value").asInt()); + results.getJson().get("hits").get("total").get("value").asInt()); - //now try match all + // now try match all query = "{ " + - //"\"from\":0, \"size\":1000," + - "\"track_total_hits\": true, \"query\": { " + - "\"match_all\": {} } }"; + // "\"from\":0, \"size\":1000," + + "\"track_total_hits\": true, \"query\": { " + "\"match_all\": {} } }"; results = client.postJson(endpoint + "/_search", query); assertEquals(200, results.getStatus()); - assertEquals(numHtmlDocs + 3 + 12, //3 for the mock docs, - // and the .docx file has 11 embedded files, plus itself - results.getJson().get("hits").get("total").get("value").asInt()); - - //now check out one of the embedded files - query = "{ \"track_total_hits\": true, \"query\": { \"query_string\": { " + - "\"default_field\": \"content\", " + - "\"query\": \"embed4 zip\" , \"minimum_should_match\":2 } } } "; + assertEquals(numHtmlDocs + 3 + 12, // 3 for the mock docs, + // and the .docx file has 11 embedded files, plus itself + results.getJson().get("hits").get("total").get("value").asInt()); + + // now check out one of the embedded files + query = "{ \"track_total_hits\": true, \"query\": { \"query_string\": { " + + "\"default_field\": \"content\", " + + "\"query\": \"embed4 zip\" , \"minimum_should_match\":2 } } } "; results = client.postJson(endpoint + "/_search", query); assertEquals(200, results.getStatus()); - assertEquals(1, - results.getJson().get("hits").get("total").get("value").asInt()); + assertEquals(1, results.getJson().get("hits").get("total").get("value").asInt()); JsonNode source = results.getJson().get("hits").get("hits").get(0).get("_source"); - Matcher m = Pattern.compile("\\Atest_recursive_embedded" + - ".docx-[0-9a-f]{8}-[0-9a-f]{4}-" + - "[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\\Z").matcher( - results.getJson().get("hits").get("hits").get(0).get("_id").asText() - ); + 
Matcher m = Pattern + .compile("\\Atest_recursive_embedded" + ".docx-[0-9a-f]{8}-[0-9a-f]{4}-" + + "[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\\Z") + .matcher(results.getJson().get("hits").get("hits").get(0).get("_id") + .asText()); assertTrue(m.find(), "test_recursive_embedded.docx-$guid"); assertNull(results.getJson().get("hits").get("hits").get(0).get("_routing"), - "test_recursive_embedded.docx"); + "test_recursive_embedded.docx"); assertNull(source.get("relation_type"), "test_recursive_embedded.docx"); assertEquals("application/zip", source.get("mime").asText()); - //now make sure there are no children; this query should - //cause an exception because there are no relationships in the schema - query = "{ \"track_total_hits\": true, \"query\": { \"parent_id\": { " + - "\"type\": \"embedded\", " + - "\"id\": \"test_recursive_embedded.docx\" } } } "; + // now make sure there are no children; this query should + // cause an exception because there are no relationships in the schema + query = "{ \"track_total_hits\": true, \"query\": { \"parent_id\": { " + + "\"type\": \"embedded\", " + + "\"id\": \"test_recursive_embedded.docx\" } } } "; results = client.postJson(endpoint + "/_search", query); assertEquals(400, results.getStatus()); } @Test - public void testUpsertSeparateDocsFSToOpenSearch(@TempDir Path pipesDirectory, @TempDir Path testDocDirectory) throws Exception { + public void testUpsertSeparateDocsFSToOpenSearch(@TempDir Path pipesDirectory, + @TempDir Path testDocDirectory) throws Exception { OpensearchTestClient client = getNewClient(); - //now test that this works with upsert + // now test that this works with upsert int numHtmlDocs = 42; createTestHtmlFiles("Happiness", numHtmlDocs, testDocDirectory); String endpoint = CONTAINER.getHttpHostAddress() + "/" + TEST_INDEX; sendMappings(client, endpoint, TEST_INDEX, "opensearch-mappings.json"); runPipes(client, OpenSearchEmitter.AttachmentStrategy.SEPARATE_DOCUMENTS, - OpenSearchEmitter.UpdateStrategy.UPSERT, - 
HandlerConfig.PARSE_MODE.RMETA, endpoint, pipesDirectory, testDocDirectory); + OpenSearchEmitter.UpdateStrategy.UPSERT, HandlerConfig.PARSE_MODE.RMETA, + endpoint, pipesDirectory, testDocDirectory); - String query = "{ \"track_total_hits\": true, \"query\": { \"match\": { \"content\": { " + - "\"query\": \"happiness\" } } } }"; + String query = "{ \"track_total_hits\": true, \"query\": { \"match\": { \"content\": { " + + "\"query\": \"happiness\" } } } }"; JsonResponse results = client.postJson(endpoint + "/_search", query); assertEquals(200, results.getStatus()); assertEquals(numHtmlDocs + 1, - results.getJson().get("hits").get("total").get("value").asInt()); + results.getJson().get("hits").get("total").get("value").asInt()); - //now try match all + // now try match all query = "{ " + - //"\"from\":0, \"size\":1000," + - "\"track_total_hits\": true, \"query\": { " + - "\"match_all\": {} } }"; + // "\"from\":0, \"size\":1000," + + "\"track_total_hits\": true, \"query\": { " + "\"match_all\": {} } }"; results = client.postJson(endpoint + "/_search", query); assertEquals(200, results.getStatus()); - assertEquals(numHtmlDocs + 3 + 12, //3 for the mock docs, - // and the .docx file has 11 embedded files, plus itself - results.getJson().get("hits").get("total").get("value").asInt()); - - //now check out one of the embedded files - query = "{ \"track_total_hits\": true, \"query\": { \"query_string\": { " + - "\"default_field\": \"content\", " + - "\"query\": \"embed4 zip\" , \"minimum_should_match\":2 } } } "; + assertEquals(numHtmlDocs + 3 + 12, // 3 for the mock docs, + // and the .docx file has 11 embedded files, plus itself + results.getJson().get("hits").get("total").get("value").asInt()); + + // now check out one of the embedded files + query = "{ \"track_total_hits\": true, \"query\": { \"query_string\": { " + + "\"default_field\": \"content\", " + + "\"query\": \"embed4 zip\" , \"minimum_should_match\":2 } } } "; results = client.postJson(endpoint + "/_search", 
query); assertEquals(200, results.getStatus()); - assertEquals(1, - results.getJson().get("hits").get("total").get("value").asInt()); + assertEquals(1, results.getJson().get("hits").get("total").get("value").asInt()); JsonNode source = results.getJson().get("hits").get("hits").get(0).get("_source"); - Matcher m = Pattern.compile("\\Atest_recursive_embedded" + - ".docx-[0-9a-f]{8}-[0-9a-f]{4}-" + - "[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\\Z").matcher( - results.getJson().get("hits").get("hits").get(0).get("_id").asText() - ); + Matcher m = Pattern + .compile("\\Atest_recursive_embedded" + ".docx-[0-9a-f]{8}-[0-9a-f]{4}-" + + "[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\\Z") + .matcher(results.getJson().get("hits").get("hits").get(0).get("_id") + .asText()); assertTrue(m.find(), "test_recursive_embedded.docx-$guid"); assertNull(results.getJson().get("hits").get("hits").get(0).get("_routing"), - "test_recursive_embedded.docx"); + "test_recursive_embedded.docx"); assertNull(source.get("relation_type"), "test_recursive_embedded.docx"); assertEquals("application/zip", source.get("mime").asText()); - //now make sure there are no children; this query should - //cause an exception because there are no relationships in the schema - query = "{ \"track_total_hits\": true, \"query\": { \"parent_id\": { " + - "\"type\": \"embedded\", " + - "\"id\": \"test_recursive_embedded.docx\" } } } "; + // now make sure there are no children; this query should + // cause an exception because there are no relationships in the schema + query = "{ \"track_total_hits\": true, \"query\": { \"parent_id\": { " + + "\"type\": \"embedded\", " + + "\"id\": \"test_recursive_embedded.docx\" } } } "; results = client.postJson(endpoint + "/_search", query); assertEquals(400, results.getStatus()); } @Test - public void testUpsert(@TempDir Path pipesDirectory, @TempDir Path testDocDirectory) throws Exception { + public void testUpsert(@TempDir Path pipesDirectory, @TempDir Path testDocDirectory) + throws Exception 
{ OpensearchTestClient client = getNewClient(); String endpoint = CONTAINER.getHttpHostAddress() + "/" + TEST_INDEX; sendMappings(client, endpoint, TEST_INDEX, "opensearch-mappings.json"); - Path tikaConfigFile = - getTikaConfigFile(OpenSearchEmitter.AttachmentStrategy.SEPARATE_DOCUMENTS, + Path tikaConfigFile = getTikaConfigFile( + OpenSearchEmitter.AttachmentStrategy.SEPARATE_DOCUMENTS, OpenSearchEmitter.UpdateStrategy.UPSERT, HandlerConfig.PARSE_MODE.RMETA, endpoint, pipesDirectory, testDocDirectory); - Emitter emitter = EmitterManager - .load(tikaConfigFile).getEmitter(); + Emitter emitter = EmitterManager.load(tikaConfigFile).getEmitter(); Metadata metadata = new Metadata(); metadata.set("mime", "mimeA"); metadata.set("title", "titleA"); @@ -365,12 +358,10 @@ public void testUpsert(@TempDir Path pipesDirectory, @TempDir Path testDocDirect refresh = client.getJson(endpoint + "/_refresh"); String query = "{ " + - //"\"from\":0, \"size\":1000," + - "\"track_total_hits\": true, \"query\": { " + - "\"match_all\": {} } }"; + // "\"from\":0, \"size\":1000," + + "\"track_total_hits\": true, \"query\": { " + "\"match_all\": {} } }"; JsonResponse response = client.postJson(endpoint + "/_search", query); - JsonNode doc1 = response.getJson().get("hits").get("hits").get(0).get( - "_source"); + JsonNode doc1 = response.getJson().get("hits").get("hits").get(0).get("_source"); assertEquals("mimeA", doc1.get("mime").asText()); assertEquals("titleB", doc1.get("title").asText()); assertEquals("the quick brown fox", doc1.get("content").asText()); @@ -383,19 +374,23 @@ private OpensearchTestClient getNewClient() throws TikaConfigException { httpClientFactory.setUserName(CONTAINER.getUsername()); httpClientFactory.setPassword(CONTAINER.getPassword()); - return new OpensearchTestClient(CONTAINER.getHttpHostAddress(), httpClientFactory.build(), OpenSearchEmitter.AttachmentStrategy.SEPARATE_DOCUMENTS, - OpenSearchEmitter.UpdateStrategy.OVERWRITE, 
OpenSearchEmitter.DEFAULT_EMBEDDED_FILE_FIELD_NAME); + return new OpensearchTestClient(CONTAINER.getHttpHostAddress(), httpClientFactory.build(), + OpenSearchEmitter.AttachmentStrategy.SEPARATE_DOCUMENTS, + OpenSearchEmitter.UpdateStrategy.OVERWRITE, + OpenSearchEmitter.DEFAULT_EMBEDDED_FILE_FIELD_NAME); } - protected void sendMappings(OpensearchTestClient client, String endpoint, String index, String mappingsFile) throws Exception { - //create the collection with mappings - String mappings = IOUtils.toString(OpenSearchTest.class.getResourceAsStream( - "/opensearch/" + mappingsFile), StandardCharsets.UTF_8); + protected void sendMappings(OpensearchTestClient client, String endpoint, String index, + String mappingsFile) throws Exception { + // create the collection with mappings + String mappings = IOUtils.toString( + OpenSearchTest.class.getResourceAsStream("/opensearch/" + mappingsFile), + StandardCharsets.UTF_8); int status = -1; int tries = 0; JsonResponse response = null; - //need to wait a bit sometimes before OpenSearch is up + // need to wait a bit sometimes before OpenSearch is up while (status != 200 && tries++ < 20) { response = client.putJson(endpoint, mappings); if (status != 200) { @@ -404,8 +399,7 @@ protected void sendMappings(OpensearchTestClient client, String endpoint, String status = response.getStatus(); } if (status != 200) { - throw new IllegalArgumentException("couldn't create index/add mappings: " + - response); + throw new IllegalArgumentException("couldn't create index/add mappings: " + response); } assertTrue(response.getJson().get("acknowledged").asBoolean()); assertEquals(index, response.getJson().get("index").asText()); @@ -413,61 +407,64 @@ protected void sendMappings(OpensearchTestClient client, String endpoint, String } - private void runPipes(OpensearchTestClient client, OpenSearchEmitter.AttachmentStrategy attachmentStrategy, - OpenSearchEmitter.UpdateStrategy updateStrategy, - HandlerConfig.PARSE_MODE parseMode, String endpoint, 
Path pipesDirectory, Path testDocDirectory) throws Exception { + private void runPipes(OpensearchTestClient client, + OpenSearchEmitter.AttachmentStrategy attachmentStrategy, + OpenSearchEmitter.UpdateStrategy updateStrategy, + HandlerConfig.PARSE_MODE parseMode, String endpoint, Path pipesDirectory, + Path testDocDirectory) throws Exception { Path tikaConfigFile = getTikaConfigFile(attachmentStrategy, updateStrategy, parseMode, - endpoint, pipesDirectory, testDocDirectory); + endpoint, pipesDirectory, testDocDirectory); - TikaCLI.main(new String[]{"-a", "-c", tikaConfigFile.toAbsolutePath().toString()}); + TikaCLI.main(new String[] {"-a", "-c", tikaConfigFile.toAbsolutePath().toString()}); - //refresh to make sure the content is searchable + // refresh to make sure the content is searchable JsonResponse refresh = client.getJson(endpoint + "/_refresh"); } private Path getTikaConfigFile(OpenSearchEmitter.AttachmentStrategy attachmentStrategy, - OpenSearchEmitter.UpdateStrategy updateStrategy, - HandlerConfig.PARSE_MODE parseMode, String endpoint, - Path pipesDirectory, Path testDocDirectory) throws IOException { + OpenSearchEmitter.UpdateStrategy updateStrategy, + HandlerConfig.PARSE_MODE parseMode, String endpoint, Path pipesDirectory, + Path testDocDirectory) throws IOException { Path tikaConfigFile = pipesDirectory.resolve("ta-opensearch.xml"); Path log4jPropFile = pipesDirectory.resolve("tmp-log4j2.xml"); try (InputStream is = OpenSearchTest.class - .getResourceAsStream("/pipes-fork-server-custom-log4j2.xml")) { + .getResourceAsStream("/pipes-fork-server-custom-log4j2.xml")) { Files.copy(is, log4jPropFile); } String tikaConfigTemplateXml; try (InputStream is = OpenSearchTest.class - .getResourceAsStream("/opensearch/tika-config-opensearch.xml")) { + .getResourceAsStream("/opensearch/tika-config-opensearch.xml")) { tikaConfigTemplateXml = IOUtils.toString(is, StandardCharsets.UTF_8); } - String tikaConfigXml = - createTikaConfigXml(tikaConfigFile, log4jPropFile, 
tikaConfigTemplateXml, - attachmentStrategy, updateStrategy, parseMode, endpoint, testDocDirectory); + String tikaConfigXml = createTikaConfigXml(tikaConfigFile, log4jPropFile, + tikaConfigTemplateXml, attachmentStrategy, updateStrategy, parseMode, + endpoint, testDocDirectory); writeStringToPath(tikaConfigFile, tikaConfigXml); return tikaConfigFile; } - @NotNull - private String createTikaConfigXml(Path tikaConfigFile, Path log4jPropFile, - String tikaConfigTemplateXml, - OpenSearchEmitter.AttachmentStrategy attachmentStrategy, - OpenSearchEmitter.UpdateStrategy updateStrategy, - HandlerConfig.PARSE_MODE parseMode, String endpoint, Path testDocDirectory) { - String res = - tikaConfigTemplateXml.replace("{TIKA_CONFIG}", tikaConfigFile.toAbsolutePath().toString()) - .replace("{ATTACHMENT_STRATEGY}", attachmentStrategy.toString()) - .replace("{LOG4J_PROPERTIES_FILE}", log4jPropFile.toAbsolutePath().toString()) - .replace("{UPDATE_STRATEGY}", updateStrategy.toString()) + @NotNull private String createTikaConfigXml(Path tikaConfigFile, Path log4jPropFile, + String tikaConfigTemplateXml, + OpenSearchEmitter.AttachmentStrategy attachmentStrategy, + OpenSearchEmitter.UpdateStrategy updateStrategy, + HandlerConfig.PARSE_MODE parseMode, String endpoint, Path testDocDirectory) { + String res = tikaConfigTemplateXml + .replace("{TIKA_CONFIG}", tikaConfigFile.toAbsolutePath().toString()) + .replace("{ATTACHMENT_STRATEGY}", attachmentStrategy.toString()) + .replace("{LOG4J_PROPERTIES_FILE}", + log4jPropFile.toAbsolutePath().toString()) + .replace("{UPDATE_STRATEGY}", updateStrategy.toString()) .replaceAll("\\{OPENSEARCH_USERNAME\\}", CONTAINER.getUsername()) - .replaceAll("\\{OPENSEARCH_PASSWORD\\}", CONTAINER.getPassword()) - .replaceAll("\\{PATH_TO_DOCS\\}", - Matcher.quoteReplacement(testDocDirectory.toAbsolutePath().toString())) - .replace("{PARSE_MODE}", parseMode.name()); + .replaceAll("\\{OPENSEARCH_PASSWORD\\}", CONTAINER.getPassword()) + 
.replaceAll("\\{PATH_TO_DOCS\\}", + Matcher.quoteReplacement(testDocDirectory.toAbsolutePath() + .toString())) + .replace("{PARSE_MODE}", parseMode.name()); if (attachmentStrategy == OpenSearchEmitter.AttachmentStrategy.PARENT_CHILD) { res = res.replace("{INCLUDE_ROUTING}", "true"); @@ -481,16 +478,16 @@ private String createTikaConfigXml(Path tikaConfigFile, Path log4jPropFile, } - private void createTestHtmlFiles(String bodyContent, int numHtmlDocs, Path testDocDirectory) throws Exception { + private void createTestHtmlFiles(String bodyContent, int numHtmlDocs, Path testDocDirectory) + throws Exception { Files.createDirectories(testDocDirectory); for (int i = 0; i < numHtmlDocs; ++i) { - String html = "" + bodyContent + ""; - Path p = testDocDirectory.resolve( "test-" + i + ".html"); + String html = "" + bodyContent + ""; + Path p = testDocDirectory.resolve("test-" + i + ".html"); writeStringToPath(p, html); } - File testDocuments = - Paths - .get(OpenSearchTest.class.getResource("/test-documents").toURI()).toFile(); + File testDocuments = Paths.get(OpenSearchTest.class.getResource("/test-documents").toURI()) + .toFile(); for (File f : testDocuments.listFiles()) { Path targ = testDocDirectory.resolve(f.getName()); Files.copy(f.toPath(), targ); diff --git a/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/java/org/apache/tika/pipes/opensearch/tests/OpensearchTestClient.java b/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/java/org/apache/tika/pipes/opensearch/tests/OpensearchTestClient.java index fb65b8328a..f34fb3f7d0 100644 --- a/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/java/org/apache/tika/pipes/opensearch/tests/OpensearchTestClient.java +++ b/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/java/org/apache/tika/pipes/opensearch/tests/OpensearchTestClient.java @@ -1,29 +1,26 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * 
contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.pipes.opensearch.tests; +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStreamReader; import java.io.Reader; import java.nio.charset.StandardCharsets; - -import com.fasterxml.jackson.databind.JsonNode; -import com.fasterxml.jackson.databind.ObjectMapper; import org.apache.http.HttpResponse; import org.apache.http.client.HttpClient; import org.apache.http.client.methods.CloseableHttpResponse; @@ -32,21 +29,19 @@ import org.apache.http.client.methods.HttpPut; import org.apache.http.entity.ByteArrayEntity; import org.apache.http.util.EntityUtils; - import org.apache.tika.pipes.emitter.opensearch.JsonResponse; import org.apache.tika.pipes.emitter.opensearch.OpenSearchClient; import org.apache.tika.pipes.emitter.opensearch.OpenSearchEmitter; /** - * This expands on the OpenSearchClient for testing purposes. - * This has more functionality than is needed for sending docs to OpenSearch + * This expands on the OpenSearchClient for testing purposes. 
This has more functionality than is + * needed for sending docs to OpenSearch */ public class OpensearchTestClient extends OpenSearchClient { public OpensearchTestClient(String openSearchUrl, HttpClient httpClient, - OpenSearchEmitter.AttachmentStrategy attachmentStrategy, - OpenSearchEmitter.UpdateStrategy updateStrategy, - String embeddedFileFieldName) { + OpenSearchEmitter.AttachmentStrategy attachmentStrategy, + OpenSearchEmitter.UpdateStrategy updateStrategy, String embeddedFileFieldName) { super(openSearchUrl, httpClient, attachmentStrategy, updateStrategy, embeddedFileFieldName); } @@ -56,32 +51,31 @@ public JsonResponse putJson(String url, String json) throws IOException { httpRequest.setEntity(entity); httpRequest.setHeader("Accept", "application/json"); httpRequest.setHeader("Content-type", "application/json; charset=utf-8"); - //At one point, this was required because of connection already + // At one point, this was required because of connection already // bound exceptions on windows :( - //httpPost.setHeader("Connection", "close"); + // httpPost.setHeader("Connection", "close"); - //try (CloseableHttpClient httpClient = HttpClients.createDefault()) { + // try (CloseableHttpClient httpClient = HttpClients.createDefault()) { HttpResponse response = null; try { response = httpClient.execute(httpRequest); int status = response.getStatusLine().getStatusCode(); if (status == 200) { - try (Reader reader = new BufferedReader( - new InputStreamReader(response.getEntity().getContent(), - StandardCharsets.UTF_8))) { + try (Reader reader = new BufferedReader(new InputStreamReader( + response.getEntity().getContent(), StandardCharsets.UTF_8))) { ObjectMapper mapper = new ObjectMapper(); JsonNode node = mapper.readTree(reader); return new JsonResponse(200, node); } } else { return new JsonResponse(status, - new String(EntityUtils.toByteArray(response.getEntity()), - StandardCharsets.UTF_8)); + new String(EntityUtils.toByteArray(response.getEntity()), + 
StandardCharsets.UTF_8)); } } finally { if (response != null && response instanceof CloseableHttpResponse) { - ((CloseableHttpResponse)response).close(); + ((CloseableHttpResponse) response).close(); } httpRequest.releaseConnection(); } @@ -91,32 +85,31 @@ public JsonResponse getJson(String url) throws IOException { HttpGet httpRequest = new HttpGet(url); httpRequest.setHeader("Accept", "application/json"); httpRequest.setHeader("Content-type", "application/json; charset=utf-8"); - //At one point, this was required because of connection already + // At one point, this was required because of connection already // bound exceptions on windows :( - //httpPost.setHeader("Connection", "close"); + // httpPost.setHeader("Connection", "close"); - //try (CloseableHttpClient httpClient = HttpClients.createDefault()) { + // try (CloseableHttpClient httpClient = HttpClients.createDefault()) { HttpResponse response = null; try { response = httpClient.execute(httpRequest); int status = response.getStatusLine().getStatusCode(); if (status == 200) { - try (Reader reader = new BufferedReader( - new InputStreamReader(response.getEntity().getContent(), - StandardCharsets.UTF_8))) { + try (Reader reader = new BufferedReader(new InputStreamReader( + response.getEntity().getContent(), StandardCharsets.UTF_8))) { ObjectMapper mapper = new ObjectMapper(); JsonNode node = mapper.readTree(reader); return new JsonResponse(200, node); } } else { return new JsonResponse(status, - new String(EntityUtils.toByteArray(response.getEntity()), - StandardCharsets.UTF_8)); + new String(EntityUtils.toByteArray(response.getEntity()), + StandardCharsets.UTF_8)); } } finally { if (response != null && response instanceof CloseableHttpResponse) { - ((CloseableHttpResponse)response).close(); + ((CloseableHttpResponse) response).close(); } httpRequest.releaseConnection(); } @@ -129,21 +122,20 @@ public JsonResponse deleteIndex(String url) throws IOException { response = httpClient.execute(httpRequest); int 
status = response.getStatusLine().getStatusCode(); if (status == 200) { - try (Reader reader = new BufferedReader( - new InputStreamReader(response.getEntity().getContent(), - StandardCharsets.UTF_8))) { + try (Reader reader = new BufferedReader(new InputStreamReader( + response.getEntity().getContent(), StandardCharsets.UTF_8))) { ObjectMapper mapper = new ObjectMapper(); JsonNode node = mapper.readTree(reader); return new JsonResponse(200, node); } } else { return new JsonResponse(status, - new String(EntityUtils.toByteArray(response.getEntity()), - StandardCharsets.UTF_8)); + new String(EntityUtils.toByteArray(response.getEntity()), + StandardCharsets.UTF_8)); } } finally { if (response != null && response instanceof CloseableHttpResponse) { - ((CloseableHttpResponse)response).close(); + ((CloseableHttpResponse) response).close(); } httpRequest.releaseConnection(); } diff --git a/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/java/org/apache/tika/pipes/s3/tests/PipeIntegrationTests.java b/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/java/org/apache/tika/pipes/s3/tests/PipeIntegrationTests.java index 737041919a..592f3a9f02 100644 --- a/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/java/org/apache/tika/pipes/s3/tests/PipeIntegrationTests.java +++ b/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/java/org/apache/tika/pipes/s3/tests/PipeIntegrationTests.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. 
See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.pipes.s3.tests; @@ -32,17 +30,6 @@ import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; import java.util.concurrent.atomic.AtomicInteger; - -import org.junit.jupiter.api.Disabled; -import org.junit.jupiter.api.Test; -import software.amazon.awssdk.auth.credentials.AwsCredentialsProvider; -import software.amazon.awssdk.auth.credentials.ProfileCredentialsProvider; -import software.amazon.awssdk.regions.Region; -import software.amazon.awssdk.services.s3.S3Client; -import software.amazon.awssdk.services.s3.model.GetObjectRequest; -import software.amazon.awssdk.services.s3.model.ListObjectsV2Request; -import software.amazon.awssdk.services.s3.model.S3Object; - import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.pipes.core.FetchEmitTuple; @@ -53,6 +40,15 @@ import org.apache.tika.pipes.core.pipesiterator.CallablePipesIterator; import org.apache.tika.pipes.core.pipesiterator.PipesIterator; import org.apache.tika.pipes.emitter.s3.S3Emitter; +import org.junit.jupiter.api.Disabled; +import org.junit.jupiter.api.Test; +import software.amazon.awssdk.auth.credentials.AwsCredentialsProvider; +import software.amazon.awssdk.auth.credentials.ProfileCredentialsProvider; +import software.amazon.awssdk.regions.Region; +import software.amazon.awssdk.services.s3.S3Client; +import software.amazon.awssdk.services.s3.model.GetObjectRequest; +import software.amazon.awssdk.services.s3.model.ListObjectsV2Request; +import software.amazon.awssdk.services.s3.model.S3Object; // To enable these tests, fill OUTDIR and bucket, and adjust profile and region if needed. @Disabled("turn these into actual tests with mock s3") @@ -62,22 +58,26 @@ public class PipeIntegrationTests { /** * This downloads files from a specific bucket. 
- * @throws Exception + * + * @throws Exception */ @Test public void testBruteForce() throws Exception { String region = "us-east-1"; String profile = "default"; String bucket = ""; - AwsCredentialsProvider provider = ProfileCredentialsProvider.builder().profileName(profile).build(); - S3Client s3Client = S3Client.builder().credentialsProvider(provider).region(Region.of(region)).build(); + AwsCredentialsProvider provider = + ProfileCredentialsProvider.builder().profileName(profile).build(); + S3Client s3Client = S3Client.builder().credentialsProvider(provider) + .region(Region.of(region)).build(); int cnt = 0; long sz = 0; - ListObjectsV2Request listObjectsV2Request = ListObjectsV2Request.builder().bucket(bucket).prefix("").build(); - List s3ObjectList = s3Client.listObjectsV2Paginator(listObjectsV2Request).stream(). - flatMap(resp -> resp.contents().stream()).toList(); + ListObjectsV2Request listObjectsV2Request = + ListObjectsV2Request.builder().bucket(bucket).prefix("").build(); + List s3ObjectList = s3Client.listObjectsV2Paginator(listObjectsV2Request).stream() + .flatMap(resp -> resp.contents().stream()).toList(); for (S3Object s3Object : s3ObjectList) { String key = s3Object.key(); Path targ = OUTDIR.resolve(key); @@ -88,7 +88,8 @@ public void testBruteForce() throws Exception { Files.createDirectories(targ.getParent()); } System.out.println("id: " + cnt + " :: " + key + " : " + s3Object.size()); - GetObjectRequest objectRequest = GetObjectRequest.builder().bucket(bucket).key(key).build(); + GetObjectRequest objectRequest = + GetObjectRequest.builder().bucket(bucket).key(key).build(); s3Client.getObject(objectRequest, targ); cnt++; sz += s3Object.size(); @@ -108,7 +109,7 @@ public void testS3ToFS() throws Exception { ArrayBlockingQueue queue = new ArrayBlockingQueue<>(1000); completionService.submit( - new CallablePipesIterator(pipesIterator, queue, 60000, numConsumers)); + new CallablePipesIterator(pipesIterator, queue, 60000, numConsumers)); for (int i = 0; 
i < numConsumers; i++) { completionService.submit(new FSFetcherEmitter(queue, fetcher, null)); } @@ -137,8 +138,8 @@ public void testS3ToS3() throws Exception { ExecutorService es = Executors.newFixedThreadPool(numConsumers + 1); ExecutorCompletionService completionService = new ExecutorCompletionService<>(es); ArrayBlockingQueue queue = new ArrayBlockingQueue<>(1000); - completionService.submit(new CallablePipesIterator(pipesIterator, - queue, 60000, numConsumers)); + completionService.submit( + new CallablePipesIterator(pipesIterator, queue, 60000, numConsumers)); for (int i = 0; i < numConsumers; i++) { completionService.submit(new S3FetcherEmitter(queue, fetcher, (S3Emitter) emitter)); } @@ -183,7 +184,7 @@ private static class FSFetcherEmitter implements Callable { private final ArrayBlockingQueue queue; FSFetcherEmitter(ArrayBlockingQueue queue, Fetcher fetcher, - Emitter emitter) { + Emitter emitter) { this.queue = queue; this.fetcher = fetcher; this.emitter = emitter; @@ -209,7 +210,8 @@ private void process(FetchEmitTuple t) throws IOException, TikaException { if (Files.isRegularFile(targ)) { return; } - try (InputStream is = fetcher.fetch(t.getFetchKey().getFetchKey(), t.getMetadata(), t.getParseContext())) { + try (InputStream is = fetcher.fetch(t.getFetchKey().getFetchKey(), t.getMetadata(), + t.getParseContext())) { System.out.println(counter.getAndIncrement() + " : " + t); Files.createDirectories(targ.getParent()); Files.copy(is, targ); @@ -225,7 +227,7 @@ private static class S3FetcherEmitter implements Callable { private final ArrayBlockingQueue queue; S3FetcherEmitter(ArrayBlockingQueue queue, Fetcher fetcher, - S3Emitter emitter) { + S3Emitter emitter) { this.queue = queue; this.fetcher = fetcher; this.emitter = emitter; @@ -250,7 +252,8 @@ private void process(FetchEmitTuple t) throws IOException, TikaException { Metadata userMetadata = t.getMetadata(); userMetadata.set("project", "my-project"); - try (InputStream is = 
fetcher.fetch(t.getFetchKey().getFetchKey(), t.getMetadata(), t.getParseContext())) { + try (InputStream is = fetcher.fetch(t.getFetchKey().getFetchKey(), t.getMetadata(), + t.getParseContext())) { emitter.emit(t.getEmitKey().getEmitKey(), is, userMetadata, t.getParseContext()); } } diff --git a/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/java/org/apache/tika/pipes/s3/tests/S3PipeIntegrationTest.java b/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/java/org/apache/tika/pipes/s3/tests/S3PipeIntegrationTest.java index 407dc0b09b..b8bbdb9b1a 100644 --- a/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/java/org/apache/tika/pipes/s3/tests/S3PipeIntegrationTest.java +++ b/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/java/org/apache/tika/pipes/s3/tests/S3PipeIntegrationTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.pipes.s3.tests; @@ -26,9 +24,10 @@ import java.time.temporal.ChronoUnit; import java.util.HashSet; import java.util.Set; - import org.apache.commons.io.FileUtils; import org.apache.commons.io.IOUtils; +import org.apache.tika.cli.TikaCLI; +import org.apache.tika.pipes.core.HandlerConfig; import org.jetbrains.annotations.NotNull; import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.Assertions; @@ -52,19 +51,17 @@ import software.amazon.awssdk.services.s3.model.GetObjectResponse; import software.amazon.awssdk.services.s3.model.PutObjectRequest; -import org.apache.tika.cli.TikaCLI; -import org.apache.tika.pipes.core.HandlerConfig; - @TestInstance(TestInstance.Lifecycle.PER_CLASS) @Testcontainers(disabledWithoutDocker = true) class S3PipeIntegrationTest { private static final Logger LOG = LoggerFactory.getLogger(S3PipeIntegrationTest.class); public static final int MAX_STARTUP_TIMEOUT = 120; - private static final ComposeContainer minioContainer = new ComposeContainer( - new File("src/test/resources/docker-compose.yml")).withStartupTimeout( - Duration.of(MAX_STARTUP_TIMEOUT, ChronoUnit.SECONDS)) - .withExposedService("minio-service", 9000); + private 
static final ComposeContainer minioContainer = + new ComposeContainer(new File("src/test/resources/docker-compose.yml")) + .withStartupTimeout(Duration.of(MAX_STARTUP_TIMEOUT, + ChronoUnit.SECONDS)) + .withExposedService("minio-service", 9000); private static final String MINIO_ENDPOINT = "http://localhost:9000"; private static final String ACCESS_KEY = "minio"; private static final String SECRET_KEY = "minio123"; @@ -88,7 +85,8 @@ private void createTestFiles() throws NoSuchAlgorithmException { testFiles.add(nextFileName); String s = "body-of-" + nextFileName + ""; byte[] bytes = s.getBytes(StandardCharsets.US_ASCII); - PutObjectRequest request = PutObjectRequest.builder().bucket(FETCH_BUCKET).key(nextFileName).build(); + PutObjectRequest request = PutObjectRequest.builder().bucket(FETCH_BUCKET) + .key(nextFileName).build(); RequestBody requestBody = RequestBody.fromBytes(bytes); s3Client.putObject(request, requestBody); } @@ -110,10 +108,11 @@ private void initializeS3Client() throws URISyntaxException { // https://github.com/aws/aws-sdk-java-v2/discussions/3536 StaticCredentialsProvider credentialsProvider = StaticCredentialsProvider.create(awsCreds); S3Configuration s3c = S3Configuration.builder().pathStyleAccessEnabled(true).build(); // SO11228792 - s3Client = S3Client.builder(). - requestChecksumCalculation(RequestChecksumCalculation.WHEN_REQUIRED). // https://stackoverflow.com/a/79488850/535646 - serviceConfiguration(s3c).region(REGION). - credentialsProvider(credentialsProvider).endpointOverride(new URI(MINIO_ENDPOINT)).build(); + s3Client = S3Client.builder() + .requestChecksumCalculation(RequestChecksumCalculation.WHEN_REQUIRED). 
// https://stackoverflow.com/a/79488850/535646 + serviceConfiguration(s3c).region(REGION) + .credentialsProvider(credentialsProvider) + .endpointOverride(new URI(MINIO_ENDPOINT)).build(); } @Test @@ -130,46 +129,48 @@ void s3PipelineIteratorS3FetcherAndS3Emitter() throws Exception { File tikaConfigFile = new File("target", "ta.xml"); File log4jPropFile = new File("target", "tmp-log4j2.xml"); try (InputStream is = this.getClass() - .getResourceAsStream("/pipes-fork-server-custom-log4j2.xml")) { + .getResourceAsStream("/pipes-fork-server-custom-log4j2.xml")) { Assertions.assertNotNull(is); FileUtils.copyInputStreamToFile(is, log4jPropFile); } String tikaConfigTemplateXml; try (InputStream is = this.getClass() - .getResourceAsStream("/tika-config-s3-integration-test.xml")) { + .getResourceAsStream("/tika-config-s3-integration-test.xml")) { assert is != null; tikaConfigTemplateXml = IOUtils.toString(is, StandardCharsets.UTF_8); } try { - String tikaConfigXml = - createTikaConfigXml(tikaConfigFile, log4jPropFile, tikaConfigTemplateXml); + String tikaConfigXml = createTikaConfigXml(tikaConfigFile, log4jPropFile, + tikaConfigTemplateXml); FileUtils.writeStringToFile(tikaConfigFile, tikaConfigXml, StandardCharsets.UTF_8); - TikaCLI.main(new String[]{"-a", "-c", tikaConfigFile.getAbsolutePath()}); + TikaCLI.main(new String[] {"-a", "-c", tikaConfigFile.getAbsolutePath()}); } catch (Exception e) { throw new RuntimeException(e); } for (String testFile : testFiles) { - GetObjectRequest objectRequest = GetObjectRequest.builder().bucket(EMIT_BUCKET).key(testFile + ".json").build(); - ResponseBytes objectAsBytes = s3Client.getObjectAsBytes(objectRequest); + GetObjectRequest objectRequest = GetObjectRequest.builder().bucket(EMIT_BUCKET) + .key(testFile + ".json").build(); + ResponseBytes objectAsBytes = + s3Client.getObjectAsBytes(objectRequest); String data = objectAsBytes.asString(StandardCharsets.UTF_8); Assertions.assertTrue(data.contains("body-of-" + testFile), - "Should be 
able to read the parsed body of the HTML file as the body of the document"); + "Should be able to read the parsed body of the HTML file as the body of the document"); } } - @NotNull - private String createTikaConfigXml(File tikaConfigFile, File log4jPropFile, - String tikaConfigTemplateXml) { + @NotNull private String createTikaConfigXml(File tikaConfigFile, File log4jPropFile, + String tikaConfigTemplateXml) { return tikaConfigTemplateXml.replace("{TIKA_CONFIG}", tikaConfigFile.getAbsolutePath()) - .replace("{LOG4J_PROPERTIES_FILE}", log4jPropFile.getAbsolutePath()) - .replace("{PATH_TO_DOCS}", testFileFolder.getAbsolutePath()) - .replace("{PARSE_MODE}", HandlerConfig.PARSE_MODE.RMETA.name()) - .replace("{PIPE_ITERATOR_BUCKET}", FETCH_BUCKET) - .replace("{EMIT_BUCKET}", EMIT_BUCKET).replace("{FETCH_BUCKET}", FETCH_BUCKET) - .replace("{ACCESS_KEY}", ACCESS_KEY).replace("{SECRET_KEY}", SECRET_KEY) - .replace("{ENDPOINT_CONFIGURATION_SERVICE}", MINIO_ENDPOINT) - .replace("{REGION}", REGION.id()); + .replace("{LOG4J_PROPERTIES_FILE}", log4jPropFile.getAbsolutePath()) + .replace("{PATH_TO_DOCS}", testFileFolder.getAbsolutePath()) + .replace("{PARSE_MODE}", HandlerConfig.PARSE_MODE.RMETA.name()) + .replace("{PIPE_ITERATOR_BUCKET}", FETCH_BUCKET) + .replace("{EMIT_BUCKET}", EMIT_BUCKET) + .replace("{FETCH_BUCKET}", FETCH_BUCKET).replace("{ACCESS_KEY}", ACCESS_KEY) + .replace("{SECRET_KEY}", SECRET_KEY) + .replace("{ENDPOINT_CONFIGURATION_SERVICE}", MINIO_ENDPOINT) + .replace("{REGION}", REGION.id()); } } diff --git a/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolr8Test.java b/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolr8Test.java index 8b87db6a17..732c8d047e 100644 --- a/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolr8Test.java +++ 
b/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolr8Test.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.pipes.solr.tests; diff --git a/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolr8ZkTest.java b/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolr8ZkTest.java index 91293fe783..7b1017b26b 100644 --- a/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolr8ZkTest.java +++ b/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolr8ZkTest.java @@ -1,36 +1,32 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
+ * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.pipes.solr.tests; import static org.junit.jupiter.api.Assumptions.assumeTrue; +import org.apache.tika.utils.SystemUtils; import org.junit.jupiter.api.BeforeAll; import org.testcontainers.junit.jupiter.Testcontainers; -import org.apache.tika.utils.SystemUtils; - @Testcontainers(disabledWithoutDocker = true) public class TikaPipesSolr8ZkTest extends TikaPipesSolr8Test { @BeforeAll public static void setUp() { - assumeTrue( - SystemUtils.IS_OS_UNIX && !SystemUtils.IS_OS_MAC_OSX, - "zk test only works on linux (and not mac os x)"); + assumeTrue(SystemUtils.IS_OS_UNIX && !SystemUtils.IS_OS_MAC_OSX, + "zk test only works on linux (and not mac os x)"); } diff --git a/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolr9Test.java b/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolr9Test.java index 1a6d165959..232f62c63d 100644 --- a/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolr9Test.java +++ b/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolr9Test.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.pipes.solr.tests; diff --git a/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolr9ZkTest.java b/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolr9ZkTest.java index fa6571f396..dd8fc0b8d3 100644 --- a/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolr9ZkTest.java +++ b/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolr9ZkTest.java @@ -1,38 +1,34 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.pipes.solr.tests; import static org.junit.jupiter.api.Assumptions.assumeTrue; +import org.apache.tika.utils.SystemUtils; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Disabled; import org.testcontainers.junit.jupiter.Testcontainers; -import org.apache.tika.utils.SystemUtils; - @Disabled("until we can fix SessionExpiredException") @Testcontainers(disabledWithoutDocker = true) public class TikaPipesSolr9ZkTest extends TikaPipesSolr9Test { @BeforeAll public static void setUp() { - assumeTrue( - SystemUtils.IS_OS_UNIX && !SystemUtils.IS_OS_MAC_OSX, - "zk test only works on linux (and not mac os x)"); + assumeTrue(SystemUtils.IS_OS_UNIX && !SystemUtils.IS_OS_MAC_OSX, + "zk test only works on linux (and not mac os x)"); } @Override diff --git a/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolrTestBase.java b/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolrTestBase.java index 10d53ecfbe..869ab7ad92 100644 --- a/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolrTestBase.java +++ b/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolrTestBase.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. 
See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.pipes.solr.tests; @@ -22,7 +20,6 @@ import java.io.IOException; import java.io.InputStream; import java.nio.charset.StandardCharsets; - import org.apache.commons.io.FileUtils; import org.apache.commons.io.IOUtils; import org.apache.http.client.methods.CloseableHttpResponse; @@ -34,6 +31,10 @@ import org.apache.solr.client.solrj.SolrQuery; import org.apache.solr.client.solrj.impl.Http2SolrClient; import org.apache.solr.common.SolrInputDocument; +import org.apache.tika.cli.TikaCLI; +import org.apache.tika.pipes.core.HandlerConfig; +import org.apache.tika.pipes.emitter.solr.SolrEmitter; +import org.apache.tika.utils.SystemUtils; import org.jetbrains.annotations.NotNull; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; @@ -43,11 +44,6 @@ import org.testcontainers.junit.jupiter.Container; import org.testcontainers.utility.DockerImageName; -import org.apache.tika.cli.TikaCLI; -import org.apache.tika.pipes.core.HandlerConfig; -import org.apache.tika.pipes.emitter.solr.SolrEmitter; -import org.apache.tika.utils.SystemUtils; - public abstract class TikaPipesSolrTestBase { @@ -66,7 +62,7 @@ public TikaPipesSolrTestBase() { try { init(); } catch (InterruptedException e) { - //swallow + // swallow } } @@ -80,15 +76,16 @@ public boolean handlesParentChild() { private void init() throws InterruptedException { if (SystemUtils.IS_OS_MAC_OSX || SystemUtils.IS_OS_VERSION_WSL) { - // Networking on these operating systems needs fixed ports and localhost to be passed for the SolrCloud + // Networking on these operating systems needs fixed ports and localhost to be passed + // for the SolrCloud // with Zookeeper tests to succeed. 
This means stopping and starting needs solr = new FixedHostPortGenericContainer<>( - DockerImageName.parse(getSolrImageName()).toString()).withFixedExposedPort(8983, - 8983).withFixedExposedPort(9983, 9983).withCommand("-DzkRun -Dhost=localhost"); + DockerImageName.parse(getSolrImageName()).toString()) + .withFixedExposedPort(8983, 8983).withFixedExposedPort(9983, 9983) + .withCommand("-DzkRun -Dhost=localhost"); } else { - solr = new GenericContainer<>( - DockerImageName.parse(getSolrImageName())).withExposedPorts(8983, 9983) - .withCommand("-DzkRun"); + solr = new GenericContainer<>(DockerImageName.parse(getSolrImageName())) + .withExposedPorts(8983, 9983).withCommand("-DzkRun"); } solr.start(); @@ -127,11 +124,12 @@ private void createTestFiles(String bodyContent) throws Exception { testFileFolder.mkdirs(); for (int i = 0; i < numDocs; ++i) { FileUtils.writeStringToFile(new File(testFileFolder, "test-" + i + ".html"), - "" + bodyContent + "", StandardCharsets.UTF_8); + "" + bodyContent + "", + StandardCharsets.UTF_8); } FileUtils.copyInputStreamToFile( - this.getClass().getResourceAsStream("/embedded/embedded.docx"), - new File(testFileFolder, "test-embedded.docx")); + this.getClass().getResourceAsStream("/embedded/embedded.docx"), + new File(testFileFolder, "test-embedded.docx")); } protected void setupSolr() throws Exception { @@ -167,17 +165,16 @@ private void addBasicSchemaFields(String solrUrl) throws IOException { try (CloseableHttpClient client = HttpClients.createMinimal()) { HttpPost postAddRoot = new HttpPost(solrUrl + "/schema"); postAddRoot.setHeader("Content-Type", "application/json"); - postAddRoot.setEntity(new StringEntity( - """ - { - "add-field":{ - "name":"path", - "type":"string", - "indexed":true, - "stored":true, - "docValues":false - } - }""")); + postAddRoot.setEntity(new StringEntity(""" + { + "add-field":{ + "name":"path", + "type":"string", + "indexed":true, + "stored":true, + "docValues":false + } + }""")); CloseableHttpResponse resp 
= client.execute(postAddRoot); assertEquals(200, resp.getStatusLine().getStatusCode()); } @@ -187,17 +184,16 @@ private void addSchemaFieldsForNestedDocs(String solrUrl) throws IOException { try (CloseableHttpClient client = HttpClients.createMinimal()) { HttpPost postAddRoot = new HttpPost(solrUrl + "/schema"); postAddRoot.setHeader("Content-Type", "application/json"); - postAddRoot.setEntity(new StringEntity( - """ - { - "replace-field":{ - "name":"_root_", - "type":"string", - "indexed":true, - "stored":true, - "docValues":false - } - }""")); + postAddRoot.setEntity(new StringEntity(""" + { + "replace-field":{ + "name":"_root_", + "type":"string", + "indexed":true, + "stored":true, + "docValues":false + } + }""")); CloseableHttpResponse resp = client.execute(postAddRoot); assertEquals(200, resp.getStatusLine().getStatusCode()); } @@ -210,7 +206,7 @@ protected void runTikaAsyncSolrPipeIteratorFileFetcherSolrEmitter() throws Excep File tikaConfigFile = new File("target", "ta.xml"); File log4jPropFile = new File("target", "tmp-log4j2.xml"); try (InputStream is = this.getClass() - .getResourceAsStream("/pipes-fork-server-custom-log4j2.xml")) { + .getResourceAsStream("/pipes-fork-server-custom-log4j2.xml")) { FileUtils.copyInputStreamToFile(is, log4jPropFile); } String tikaConfigTemplateXml; @@ -218,28 +214,28 @@ protected void runTikaAsyncSolrPipeIteratorFileFetcherSolrEmitter() throws Excep tikaConfigTemplateXml = IOUtils.toString(is, StandardCharsets.UTF_8); } - String tikaConfigXml = - createTikaConfigXml(useZk(), tikaConfigFile, log4jPropFile, tikaConfigTemplateXml, - SolrEmitter.UpdateStrategy.ADD, SolrEmitter.AttachmentStrategy.PARENT_CHILD, + String tikaConfigXml = createTikaConfigXml(useZk(), tikaConfigFile, log4jPropFile, + tikaConfigTemplateXml, SolrEmitter.UpdateStrategy.ADD, + SolrEmitter.AttachmentStrategy.PARENT_CHILD, HandlerConfig.PARSE_MODE.RMETA); FileUtils.writeStringToFile(tikaConfigFile, tikaConfigXml, StandardCharsets.UTF_8); - TikaCLI.main(new 
String[]{"-a", "-c", tikaConfigFile.getAbsolutePath()}); + TikaCLI.main(new String[] {"-a", "-c", tikaConfigFile.getAbsolutePath()}); try (SolrClient solrClient = new Http2SolrClient.Builder(solrEndpoint).build()) { solrClient.commit(collection, true, true); - assertEquals(numDocs, solrClient.query(collection, - new SolrQuery("mime_s:\"text/html; charset=ISO-8859-1\"")).getResults() - .getNumFound()); assertEquals(numDocs, - solrClient.query(collection, new SolrQuery("content_s:*initial*")).getResults() - .getNumFound()); + solrClient.query(collection, new SolrQuery( + "mime_s:\"text/html; charset=ISO-8859-1\"")) + .getResults().getNumFound()); + assertEquals(numDocs, solrClient.query(collection, new SolrQuery("content_s:*initial*")) + .getResults().getNumFound()); if (handlesParentChild()) { - assertEquals(3, - solrClient.query(collection, new SolrQuery("_root_:\"test-embedded.docx\"")) + assertEquals(3, solrClient + .query(collection, new SolrQuery("_root_:\"test-embedded.docx\"")) .getResults().getNumFound()); } - //clean up test-embedded.docx so that the iterator won't try to update its children - //in the next test + // clean up test-embedded.docx so that the iterator won't try to update its children + // in the next test solrClient.deleteByQuery(collection, "_root_:\"test-embedded.docx\""); @@ -247,37 +243,35 @@ protected void runTikaAsyncSolrPipeIteratorFileFetcherSolrEmitter() throws Excep } - // update the documents with "update must exist" and run tika async again with "UPDATE_MUST_EXIST". + // update the documents with "update must exist" and run tika async again with + // "UPDATE_MUST_EXIST". // It should not fail, and docs should be updated. 
createTestFiles("updated"); - tikaConfigXml = - createTikaConfigXml(useZk(), tikaConfigFile, log4jPropFile, tikaConfigTemplateXml, - SolrEmitter.UpdateStrategy.UPDATE_MUST_EXIST, + tikaConfigXml = createTikaConfigXml(useZk(), tikaConfigFile, log4jPropFile, + tikaConfigTemplateXml, SolrEmitter.UpdateStrategy.UPDATE_MUST_EXIST, SolrEmitter.AttachmentStrategy.PARENT_CHILD, HandlerConfig.PARSE_MODE.RMETA); FileUtils.writeStringToFile(tikaConfigFile, tikaConfigXml, StandardCharsets.UTF_8); - TikaCLI.main(new String[]{"-a", "-c", tikaConfigFile.getAbsolutePath()}); + TikaCLI.main(new String[] {"-a", "-c", tikaConfigFile.getAbsolutePath()}); try (SolrClient solrClient = new Http2SolrClient.Builder(solrEndpoint).build()) { solrClient.commit(collection, true, true); - assertEquals(numDocs, solrClient.query(collection, - new SolrQuery("mime_s:\"text/html; charset=ISO-8859-1\"")).getResults() - .getNumFound()); assertEquals(numDocs, - solrClient.query(collection, new SolrQuery("content_s:*updated*")).getResults() - .getNumFound()); + solrClient.query(collection, new SolrQuery( + "mime_s:\"text/html; charset=ISO-8859-1\"")) + .getResults().getNumFound()); + assertEquals(numDocs, solrClient.query(collection, new SolrQuery("content_s:*updated*")) + .getResults().getNumFound()); } } - @NotNull - private String createTikaConfigXml(boolean useZk, File tikaConfigFile, File log4jPropFile, - String tikaConfigTemplateXml, - SolrEmitter.UpdateStrategy updateStrategy, - SolrEmitter.AttachmentStrategy attachmentStrategy, - HandlerConfig.PARSE_MODE parseMode) { - String res = - tikaConfigTemplateXml.replace("{TIKA_CONFIG}", tikaConfigFile.getAbsolutePath()) + @NotNull private String createTikaConfigXml(boolean useZk, File tikaConfigFile, File log4jPropFile, + String tikaConfigTemplateXml, SolrEmitter.UpdateStrategy updateStrategy, + SolrEmitter.AttachmentStrategy attachmentStrategy, + HandlerConfig.PARSE_MODE parseMode) { + String res = tikaConfigTemplateXml + .replace("{TIKA_CONFIG}", 
tikaConfigFile.getAbsolutePath()) .replace("{UPDATE_STRATEGY}", updateStrategy.toString()) .replace("{ATTACHMENT_STRATEGY}", attachmentStrategy.toString()) .replace("{LOG4J_PROPERTIES_FILE}", log4jPropFile.getAbsolutePath()) @@ -285,12 +279,12 @@ private String createTikaConfigXml(boolean useZk, File tikaConfigFile, File log4 .replace("{PARSE_MODE}", parseMode.name()); if (useZk) { res = res.replace("{SOLR_CONNECTION}", - "\n" + " " + solrHost + ":" + zkPort + - "\n" + " \n"); + "\n" + " " + solrHost + ":" + zkPort + + "\n" + " \n"); } else { res = res.replace("{SOLR_CONNECTION}", - "\n" + " http://" + solrHost + ":" + solrPort + - "/solr\n" + " \n"); + "\n" + " http://" + solrHost + ":" + solrPort + + "/solr\n" + " \n"); } return res; } diff --git a/tika-integration-tests/tika-resource-loading-tests/src/test/java/org/apache/custom/parser/CustomParserTest.java b/tika-integration-tests/tika-resource-loading-tests/src/test/java/org/apache/custom/parser/CustomParserTest.java index 3096375d4a..ee067eaff7 100644 --- a/tika-integration-tests/tika-resource-loading-tests/src/test/java/org/apache/custom/parser/CustomParserTest.java +++ b/tika-integration-tests/tika-resource-loading-tests/src/test/java/org/apache/custom/parser/CustomParserTest.java @@ -1,32 +1,28 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. 
The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.custom.parser; import static org.junit.jupiter.api.Assertions.assertEquals; import java.util.Map; - -import org.junit.jupiter.api.Test; - import org.apache.tika.TikaTest; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.DefaultParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; +import org.junit.jupiter.api.Test; public class CustomParserTest extends TikaTest { diff --git a/tika-integration-tests/tika-resource-loading-tests/src/test/java/org/apache/custom/parser/MyCustomParser.java b/tika-integration-tests/tika-resource-loading-tests/src/test/java/org/apache/custom/parser/MyCustomParser.java index bddf0afe0a..db1a300b30 100644 --- a/tika-integration-tests/tika-resource-loading-tests/src/test/java/org/apache/custom/parser/MyCustomParser.java +++ b/tika-integration-tests/tika-resource-loading-tests/src/test/java/org/apache/custom/parser/MyCustomParser.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * 
contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.custom.parser; @@ -20,16 +18,14 @@ import java.io.InputStream; import java.util.Collections; import java.util.Set; - -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; public class MyCustomParser implements Parser { @@ -40,7 +36,7 @@ public Set getSupportedTypes(ParseContext context) { @Override public void parse(InputStream stream, ContentHandler handler, Metadata metadata, - ParseContext context) throws IOException, SAXException, TikaException { + ParseContext context) throws IOException, SAXException, TikaException { XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); xhtml.startDocument(); xhtml.startElement("p"); diff --git a/tika-integration-tests/tika-woodstox-tests/src/test/java/org/apache/tika/woodstox/WoodstoxXMLReaderUtilsTest.java b/tika-integration-tests/tika-woodstox-tests/src/test/java/org/apache/tika/woodstox/WoodstoxXMLReaderUtilsTest.java index c0d56cd7ce..7c1faa872a 100644 --- a/tika-integration-tests/tika-woodstox-tests/src/test/java/org/apache/tika/woodstox/WoodstoxXMLReaderUtilsTest.java +++ b/tika-integration-tests/tika-woodstox-tests/src/test/java/org/apache/tika/woodstox/WoodstoxXMLReaderUtilsTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.woodstox; @@ -28,7 +26,10 @@ import javax.xml.stream.XMLEventReader; import javax.xml.stream.XMLInputFactory; import javax.xml.stream.XMLStreamException; - +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.ToTextContentHandler; +import org.apache.tika.utils.ExceptionUtils; +import org.apache.tika.utils.XMLReaderUtils; import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.Test; import org.w3c.dom.Document; @@ -36,11 +37,6 @@ import org.w3c.dom.NodeList; import org.xml.sax.SAXException; -import org.apache.tika.parser.ParseContext; -import org.apache.tika.sax.ToTextContentHandler; -import org.apache.tika.utils.ExceptionUtils; -import org.apache.tika.utils.XMLReaderUtils; - /** * This confirms that XML parsing still works with woodstox on the classpath */ @@ -48,27 +44,34 @@ public class WoodstoxXMLReaderUtilsTest { private static final Locale defaultLocale = Locale.getDefault(); static { - //tests on content of Exception msgs require specifying locale. - //even this, though is not sufficient for the billion laughs tests ?! + // tests on content of Exception msgs require specifying locale. + // even this, though is not sufficient for the billion laughs tests ?! 
Locale.setDefault(Locale.US); } - private static final String EXTERNAL_DTD_SIMPLE_FILE = ""; - private static final String EXTERNAL_DTD_SIMPLE_URL = ""; - private static final String EXTERNAL_ENTITY = "" + - " ]>&bar;"; - private static final String EXTERNAL_LOCAL_DTD = "" + - "%local_dtd;]>"; + private static final String EXTERNAL_DTD_SIMPLE_FILE = + ""; + private static final String EXTERNAL_DTD_SIMPLE_URL = + ""; + private static final String EXTERNAL_ENTITY = + "" + + " ]>&bar;"; + private static final String EXTERNAL_LOCAL_DTD = "" + + "%local_dtd;]>"; - private static final String BILLION_LAUGHS_CLASSICAL = "\n" + "\n" + " \n" + - " \n" + " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + "]>\n" + "&lol9;"; + private static final String BILLION_LAUGHS_CLASSICAL = "\n" + + "\n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + "]>\n" + "&lol9;"; private static String BILLION_LAUGHS_VARIANT; @@ -88,23 +91,25 @@ public class WoodstoxXMLReaderUtilsTest { BILLION_LAUGHS_VARIANT = xml.toString(); } - private static final String[] EXTERNAL_ENTITY_XMLS = new String[]{ EXTERNAL_DTD_SIMPLE_FILE, EXTERNAL_DTD_SIMPLE_URL, - EXTERNAL_ENTITY, EXTERNAL_LOCAL_DTD }; + private static final String[] EXTERNAL_ENTITY_XMLS = new String[] {EXTERNAL_DTD_SIMPLE_FILE, + EXTERNAL_DTD_SIMPLE_URL, EXTERNAL_ENTITY, EXTERNAL_LOCAL_DTD}; - private static final String[] BILLION_LAUGHS = new String[]{ BILLION_LAUGHS_CLASSICAL, BILLION_LAUGHS_VARIANT }; + private static final String[] BILLION_LAUGHS = + new String[] {BILLION_LAUGHS_CLASSICAL, BILLION_LAUGHS_VARIANT}; @AfterAll public static void tearDown() { Locale.setDefault(defaultLocale); } - //make sure that parseSAX actually defends against external entities + // make sure that parseSAX actually defends against external entities @Test public void testSAX() throws Exception { for (String xml : EXTERNAL_ENTITY_XMLS) { try { - XMLReaderUtils.parseSAX(new 
ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)), - new ToTextContentHandler(), new ParseContext()); + XMLReaderUtils.parseSAX( + new ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)), + new ToTextContentHandler(), new ParseContext()); } catch (ConnectException e) { fail("Parser tried to access resource: " + xml, e); } @@ -115,7 +120,9 @@ public void testSAX() throws Exception { public void testDOM() throws Exception { for (String xml : EXTERNAL_ENTITY_XMLS) { try { - XMLReaderUtils.buildDOM(new ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)), new ParseContext()); + XMLReaderUtils.buildDOM( + new ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)), + new ParseContext()); } catch (ConnectException e) { fail("Parser tried to access resource: " + xml, e); } @@ -126,8 +133,10 @@ public void testDOM() throws Exception { public void testStax() throws Exception { for (String xml : EXTERNAL_ENTITY_XMLS) { try { - XMLInputFactory xmlInputFactory = XMLReaderUtils.getXMLInputFactory(new ParseContext()); - XMLEventReader reader = xmlInputFactory.createXMLEventReader(new ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8))); + XMLInputFactory xmlInputFactory = + XMLReaderUtils.getXMLInputFactory(new ParseContext()); + XMLEventReader reader = xmlInputFactory.createXMLEventReader( + new ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8))); assertTrue(reader.getClass().getName().contains("com.ctc.wstx")); StringBuilder sb = new StringBuilder(); while (reader.hasNext()) { @@ -147,7 +156,7 @@ public void testStax() throws Exception { } } } catch (RuntimeException e) { - //woodstox + // woodstox String fullStack = ExceptionUtils.getStackTrace(e); if (fullStack.contains("Undeclared general entity")) { continue; @@ -161,8 +170,9 @@ public void testStax() throws Exception { public void testSAXBillionLaughs() throws Exception { for (String xml : BILLION_LAUGHS) { try { - XMLReaderUtils.parseSAX(new 
ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)), - new ToTextContentHandler(), new ParseContext()); + XMLReaderUtils.parseSAX( + new ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)), + new ToTextContentHandler(), new ParseContext()); } catch (SAXException e) { limitCheck(e); } @@ -171,15 +181,18 @@ public void testSAXBillionLaughs() throws Exception { @Test public void testDOMBillionLaughs() throws Exception { - //confirm that ExpandEntityReferences has been set to false. + // confirm that ExpandEntityReferences has been set to false. - //some implementations ignore the expandEntityReferences=false, and we are still - //protected by the "The parser has encountered more than "20" entity expansions" SAXException. - //We need to check for either: empty content and no exception, or this SAXException + // some implementations ignore the expandEntityReferences=false, and we are still + // protected by the "The parser has encountered more than "20" entity expansions" + // SAXException. 
+ // We need to check for either: empty content and no exception, or this SAXException for (String xml : BILLION_LAUGHS) { Document doc = null; try { - doc = XMLReaderUtils.buildDOM(new ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)), new ParseContext()); + doc = XMLReaderUtils.buildDOM( + new ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)), + new ParseContext()); } catch (SAXException e) { limitCheck(e); continue; @@ -187,10 +200,7 @@ public void testDOMBillionLaughs() throws Exception { NodeList nodeList = doc.getChildNodes(); StringBuilder sb = new StringBuilder(); dumpChildren(nodeList, sb); - assertEquals(0, sb - .toString() - .trim() - .length(), sb.toString()); + assertEquals(0, sb.toString().trim().length(), sb.toString()); } } @@ -207,16 +217,18 @@ private void dumpChildren(NodeList nodeList, StringBuilder sb) { @Test public void testStaxBillionLaughs() throws Exception { /* - Turning off dtd support of the XMLInputFactory in XMLReaderUtils turns off entity expansions and - causes a "NoSuchElementException" with the "'lol9' was referenced but not declared" message with this line: - tryToSetStaxProperty(factory, XMLInputFactory.SUPPORT_DTD, false); - If that line doesn't exist, then we get a - NoSuchElementException with: "The parser has encountered more than "20" entity expansions in this document; this is the limit imposed by the JDK." + * Turning off dtd support of the XMLInputFactory in XMLReaderUtils turns off entity + * expansions and causes a "NoSuchElementException" with the + * "'lol9' was referenced but not declared" message with this line: + * tryToSetStaxProperty(factory, XMLInputFactory.SUPPORT_DTD, false); If that line doesn't + * exist, then we get a NoSuchElementException with: "The parser has encountered more than " + * 20" entity expansions in this document; this is the limit imposed by the JDK." 
*/ for (String xml : BILLION_LAUGHS) { XMLInputFactory xmlInputFactory = XMLReaderUtils.getXMLInputFactory(new ParseContext()); - XMLEventReader reader = xmlInputFactory.createXMLEventReader(new ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8))); + XMLEventReader reader = xmlInputFactory.createXMLEventReader( + new ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8))); assertTrue(reader.getClass().getName().contains("com.ctc.wstx")); try { while (reader.hasNext()) { @@ -224,15 +236,16 @@ public void testStaxBillionLaughs() throws Exception { } } catch (NoSuchElementException e) { String msg = e.getLocalizedMessage(); - //full message on temurin-17: The entity "lol9" was referenced, but not declared. + // full message on temurin-17: The entity "lol9" was referenced, but not declared. if (msg != null) { - if (msg.contains("referenced") && msg.contains("not declared")) { //standard Java + if (msg.contains("referenced") && msg.contains("not declared")) { // standard + // Java continue; } } throw e; } catch (RuntimeException e) { - //woodstox + // woodstox String fullTrace = ExceptionUtils.getStackTrace(e); if (fullTrace.contains("Undeclared general entity")) { continue; @@ -250,14 +263,14 @@ private void limitCheck(SAXException e) throws SAXException { throw e; } - //depending on the flavor/version of the jdk, entity expansions may be triggered + // depending on the flavor/version of the jdk, entity expansions may be triggered // OR entitySizeLimit may be triggered - //See TIKA-4471 - if (msg.contains("JAXP00010001") || //entity expansions - msg.contains("JAXP00010003") || //max entity size limit - msg.contains("JAXP00010004") || //TotalEntitySizeLimit - msg.contains("entity expansions") || - e.getMessage().contains("maxGeneralEntitySizeLimit")) { + // See TIKA-4471 + if (msg.contains("JAXP00010001") || // entity expansions + msg.contains("JAXP00010003") || // max entity size limit + msg.contains("JAXP00010004") || // TotalEntitySizeLimit + 
msg.contains("entity expansions") + || e.getMessage().contains("maxGeneralEntitySizeLimit")) { return; } throw e; diff --git a/tika-java7/src/main/java/org/apache/tika/filetypedetector/TikaFileTypeDetector.java b/tika-java7/src/main/java/org/apache/tika/filetypedetector/TikaFileTypeDetector.java index 8d31459b2e..4821a5fa5c 100644 --- a/tika-java7/src/main/java/org/apache/tika/filetypedetector/TikaFileTypeDetector.java +++ b/tika-java7/src/main/java/org/apache/tika/filetypedetector/TikaFileTypeDetector.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
+ * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.filetypedetector; @@ -45,10 +43,9 @@ public String probeContentType(Path path) throws IOException { return fileContentDetect; } - // Specification says to return null if we could not + // Specification says to return null if we could not // conclusively determine the file type return null; } } - diff --git a/tika-java7/src/main/java/org/apache/tika/filetypedetector/package-info.java b/tika-java7/src/main/java/org/apache/tika/filetypedetector/package-info.java index a7a82cd5a0..6ebe9263e0 100644 --- a/tika-java7/src/main/java/org/apache/tika/filetypedetector/package-info.java +++ b/tika-java7/src/main/java/org/apache/tika/filetypedetector/package-info.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ /** diff --git a/tika-java7/src/test/java/org/apache/tika/filetypedetector/TikaFileTypeDetectorTest.java b/tika-java7/src/test/java/org/apache/tika/filetypedetector/TikaFileTypeDetectorTest.java index 7188945207..416576a13b 100644 --- a/tika-java7/src/test/java/org/apache/tika/filetypedetector/TikaFileTypeDetectorTest.java +++ b/tika-java7/src/test/java/org/apache/tika/filetypedetector/TikaFileTypeDetectorTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. 
The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.filetypedetector; @@ -54,8 +52,8 @@ public void setUp() throws Exception { @Test public final void testDirectAccess() throws Exception { - String contentType = - new TikaFileTypeDetector().probeContentType(testDirectory.resolve(TEST_HTML)); + String contentType = new TikaFileTypeDetector() + .probeContentType(testDirectory.resolve(TEST_HTML)); assertNotNull(contentType); assertEquals("text/html", contentType); } @@ -70,7 +68,7 @@ public final void testFilesProbeContentTypePathExtension() throws Exception { @Test public final void testFilesProbeContentTypePathUnrecognised() throws Exception { String contentType = - Files.probeContentType(testDirectory.resolve(TEST_UNRECOGNISED_EXTENSION)); + Files.probeContentType(testDirectory.resolve(TEST_UNRECOGNISED_EXTENSION)); assertNotNull(contentType); assertEquals("text/html", contentType); } @@ -89,8 +87,8 @@ public final void testMetaInfServicesLoad() throws Exception { foundTika = true; } } - //o.a.sis.internal.storage.StoreTypeDetector appears with latest upgrade 
- //check that TikaFileTypeDetector appears at all + // o.a.sis.internal.storage.StoreTypeDetector appears with latest upgrade + // check that TikaFileTypeDetector appears at all assertTrue(foundTika); } } diff --git a/tika-langdetect/tika-langdetect-lingo24/src/main/java/org/apache/tika/langdetect/lingo24/Lingo24LangDetector.java b/tika-langdetect/tika-langdetect-lingo24/src/main/java/org/apache/tika/langdetect/lingo24/Lingo24LangDetector.java index 1fc32e35d0..6d12b8a5d1 100644 --- a/tika-langdetect/tika-langdetect-lingo24/src/main/java/org/apache/tika/langdetect/lingo24/Lingo24LangDetector.java +++ b/tika-langdetect/tika-langdetect-lingo24/src/main/java/org/apache/tika/langdetect/lingo24/Lingo24LangDetector.java @@ -1,21 +1,25 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.langdetect.lingo24; +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import jakarta.ws.rs.core.Form; +import jakarta.ws.rs.core.MediaType; +import jakarta.ws.rs.core.Response; import java.io.CharArrayWriter; import java.io.IOException; import java.util.ArrayList; @@ -24,27 +28,20 @@ import java.util.Map; import java.util.Properties; import java.util.Set; - -import com.fasterxml.jackson.core.JsonProcessingException; -import com.fasterxml.jackson.databind.JsonNode; -import com.fasterxml.jackson.databind.ObjectMapper; -import jakarta.ws.rs.core.Form; -import jakarta.ws.rs.core.MediaType; -import jakarta.ws.rs.core.Response; import org.apache.cxf.jaxrs.client.WebClient; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - import org.apache.tika.language.detect.LanguageConfidence; import org.apache.tika.language.detect.LanguageDetector; import org.apache.tika.language.detect.LanguageResult; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * An implementation of a Language Detector using the * Premium MT API v1. *
    - * You can sign up for an access plan online on the Lingo24 Developer Portal - * and set your Application's User Key in the langdetect.lingo24.properties file. + * You can sign up for an access plan online on the + * Lingo24 Developer Portal and set your + * Application's User Key in the langdetect.lingo24.properties file. */ public class Lingo24LangDetector extends LanguageDetector { @@ -64,19 +61,19 @@ public class Lingo24LangDetector extends LanguageDetector { private CharArrayWriter writer; /** - * Default constructor which first checks for the presence of - * the langdetect.lingo24.properties file to set the API Key. + * Default constructor which first checks for the presence of the + * langdetect.lingo24.properties file to set the API Key. *

    - * If a key is available, it sets the detector as available and also loads - * the languages supported by the detector. + * If a key is available, it sets the detector as available and also loads the languages + * supported by the detector. */ public Lingo24LangDetector() { this.client = WebClient.create(LINGO24_TRANSLATE_URL_BASE + LINGO24_LANGID_ACTION); this.isAvailable = true; Properties config = new Properties(); try { - config.load( - Lingo24LangDetector.class.getResourceAsStream("langdetect.lingo24.properties")); + config.load(Lingo24LangDetector.class + .getResourceAsStream("langdetect.lingo24.properties")); this.userKey = config.getProperty("api.user-key"); @@ -167,7 +164,8 @@ private String detect(String content) { } /** - * Load the supported languages from the Premium MT API. + * Load the supported languages from the + * Premium MT API. * Support is continually expanding. * * @return Set of supported languages. @@ -182,8 +180,8 @@ private Set getAllLanguages() { WebClient _client = null; try { _client = WebClient.create(LINGO24_TRANSLATE_URL_BASE + LINGO24_SOURCELANG_ACTION); - Response response = - _client.accept(MediaType.APPLICATION_JSON).query("user_key", userKey).get(); + Response response = _client.accept(MediaType.APPLICATION_JSON) + .query("user_key", userKey).get(); String json = response.readEntity(String.class); JsonNode jsonArray = new ObjectMapper().readTree(json).get("source_langs"); diff --git a/tika-langdetect/tika-langdetect-lingo24/src/test/java/org/apache/tika/langdetect/lingo24/Lingo24LangDetectorTest.java b/tika-langdetect/tika-langdetect-lingo24/src/test/java/org/apache/tika/langdetect/lingo24/Lingo24LangDetectorTest.java index ec5501bc66..d4a6d953e6 100644 --- a/tika-langdetect/tika-langdetect-lingo24/src/test/java/org/apache/tika/langdetect/lingo24/Lingo24LangDetectorTest.java +++ b/tika-langdetect/tika-langdetect-lingo24/src/test/java/org/apache/tika/langdetect/lingo24/Lingo24LangDetectorTest.java @@ -1,18 +1,16 @@ /* - * 
Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.langdetect.lingo24; @@ -24,14 +22,12 @@ import java.io.InputStreamReader; import java.nio.charset.StandardCharsets; import java.util.List; - import org.apache.commons.io.IOUtils; -import org.junit.jupiter.api.Test; - import org.apache.tika.langdetect.LanguageDetectorTest; import org.apache.tika.language.detect.LanguageDetector; import org.apache.tika.language.detect.LanguageResult; import org.apache.tika.language.detect.LanguageWriter; +import org.junit.jupiter.api.Test; /** * Test harness for the {@link org.apache.tika.langdetect.lingo24.Lingo24LangDetector}. @@ -54,8 +50,8 @@ public void testLanguageDetection() throws Exception { // Reusing the test data from OptimaizeLangDetectorTest // Test taht we can at least read the test file List lines = IOUtils.readLines(new InputStreamReader( - LanguageDetectorTest.class.getResourceAsStream("text-test.tsv"), - StandardCharsets.UTF_8)); + LanguageDetectorTest.class.getResourceAsStream("text-test.tsv"), + StandardCharsets.UTF_8)); assertEquals(18, lines.size()); LanguageDetector detector = new Lingo24LangDetector(); diff --git a/tika-langdetect/tika-langdetect-mitll-text/src/main/java/org/apache/tika/langdetect/mitll/TextLangDetector.java b/tika-langdetect/tika-langdetect-mitll-text/src/main/java/org/apache/tika/langdetect/mitll/TextLangDetector.java index 4d84fab2b0..e8824f885d 100644 --- a/tika-langdetect/tika-langdetect-mitll-text/src/main/java/org/apache/tika/langdetect/mitll/TextLangDetector.java +++ b/tika-langdetect/tika-langdetect-mitll-text/src/main/java/org/apache/tika/langdetect/mitll/TextLangDetector.java @@ -1,21 +1,22 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.langdetect.mitll; +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import jakarta.ws.rs.core.Response; import java.io.CharArrayWriter; import java.io.IOException; import java.util.ArrayList; @@ -23,17 +24,12 @@ import java.util.List; import java.util.Map; import java.util.Set; - -import com.fasterxml.jackson.databind.JsonNode; -import com.fasterxml.jackson.databind.ObjectMapper; -import jakarta.ws.rs.core.Response; import org.apache.cxf.jaxrs.client.WebClient; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - import org.apache.tika.language.detect.LanguageConfidence; import org.apache.tika.language.detect.LanguageDetector; import org.apache.tika.language.detect.LanguageResult; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** diff --git a/tika-langdetect/tika-langdetect-mitll-text/src/test/java/org/apache/tika/langdetect/mitll/TextLangDetectorTest.java b/tika-langdetect/tika-langdetect-mitll-text/src/test/java/org/apache/tika/langdetect/mitll/TextLangDetectorTest.java index 09b4eb07da..058f82e0d0 100644 --- a/tika-langdetect/tika-langdetect-mitll-text/src/test/java/org/apache/tika/langdetect/mitll/TextLangDetectorTest.java +++ b/tika-langdetect/tika-langdetect-mitll-text/src/test/java/org/apache/tika/langdetect/mitll/TextLangDetectorTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. 
See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.langdetect.mitll; @@ -22,14 +20,12 @@ import java.nio.charset.StandardCharsets; import java.util.List; - import org.apache.commons.io.IOUtils; -import org.junit.jupiter.api.Test; - import org.apache.tika.langdetect.LanguageDetectorTest; import org.apache.tika.language.detect.LanguageDetector; import org.apache.tika.language.detect.LanguageResult; import org.apache.tika.language.detect.LanguageWriter; +import org.junit.jupiter.api.Test; /** * Created by trevorlewis on 3/7/16. 
@@ -38,8 +34,8 @@ public class TextLangDetectorTest { @Test public void test() throws Exception { - List lines = - IOUtils.readLines(LanguageDetectorTest.class.getResourceAsStream("text-test.tsv"), + List lines = IOUtils.readLines( + LanguageDetectorTest.class.getResourceAsStream("text-test.tsv"), StandardCharsets.UTF_8); assertEquals(18, lines.size()); diff --git a/tika-langdetect/tika-langdetect-opennlp/src/main/java/org/apache/tika/langdetect/opennlp/OpenNLPDetector.java b/tika-langdetect/tika-langdetect-opennlp/src/main/java/org/apache/tika/langdetect/opennlp/OpenNLPDetector.java index 405c654654..2a4a0f805e 100644 --- a/tika-langdetect/tika-langdetect-opennlp/src/main/java/org/apache/tika/langdetect/opennlp/OpenNLPDetector.java +++ b/tika-langdetect/tika-langdetect-opennlp/src/main/java/org/apache/tika/langdetect/opennlp/OpenNLPDetector.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.langdetect.opennlp; @@ -23,7 +21,6 @@ import java.util.Map; import java.util.Set; import java.util.regex.Pattern; - import opennlp.tools.langdetect.Language; import opennlp.tools.langdetect.LanguageDetectorModel; import opennlp.tools.util.normalizer.CharSequenceNormalizer; @@ -31,51 +28,42 @@ import opennlp.tools.util.normalizer.NumberCharSequenceNormalizer; import opennlp.tools.util.normalizer.ShrinkCharSequenceNormalizer; import opennlp.tools.util.normalizer.TwitterCharSequenceNormalizer; - import org.apache.tika.language.detect.LanguageConfidence; import org.apache.tika.language.detect.LanguageDetector; import org.apache.tika.language.detect.LanguageResult; /** *

    - * This is based on OpenNLP's language detector. However, - * we've built our own ProbingLanguageDetector and our own language - * models. + * This is based on OpenNLP's language detector. However, we've built our own + * ProbingLanguageDetector and our own language models. *

    * To build our model, we followed OpenNLP's lead by using the - * (Leipzig corpus) - * as gathered and preprocessed - * ( - * big-data corpus - * ). We removed azj, plt, sun - * and zsm because our models couldn't sufficiently well distinguish - * them from related languages. We removed cmn in favor of the - * finer-grained zho-trad and zho-simp. + * (Leipzig corpus) as gathered and + * preprocessed ( big-data + * corpus ). We removed azj, plt, sun and zsm because our models couldn't sufficiently well + * distinguish them from related languages. We removed cmn in favor of the finer-grained zho-trad + * and zho-simp. *

    * We then added the following languages from cc-100: - * ben-rom (Bengali Romanized), ful, gla, gug, hau, hin-rom, ibo, ful, linm - * mya-zaw, nso, orm, quz, roh, srd, ssw, tam-rom, tel-rom, tsn, urd-rom, - * wol, yor. + * ben-rom (Bengali Romanized), ful, gla, gug, hau, hin-rom, ibo, ful, linm mya-zaw, nso, orm, quz, + * roh, srd, ssw, tam-rom, tel-rom, tsn, urd-rom, wol, yor. *

    - * We ran our own train/devtest/test code because OpenNLPs required - * more sentences/data than were available for some languages. + * We ran our own train/devtest/test code because OpenNLPs required more sentences/data than were + * available for some languages. *

    - * Please open an issue on our JIRA if we made mistakes and/or had - * misunderstandings in our design choices or if you need to have other - * languages added. + * Please open an issue on our JIRA if we made mistakes and/or had misunderstandings in our design + * choices or if you need to have other languages added. *

    * Citations for the cc-100 corpus: *

    - * Unsupervised Cross-lingual Representation Learning at Scale, Alexis Conneau, - * Kartikay Khandelwal, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, - * Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer, Veselin Stoyanov, - * Proceedings of the 58th Annual Meeting of the Association for Computational - * Linguistics (ACL), p. 8440-8451, July 2020, pdf, bib. + * Unsupervised Cross-lingual Representation Learning at Scale, Alexis Conneau, Kartikay Khandelwal, + * Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke + * Zettlemoyer, Veselin Stoyanov, Proceedings of the 58th Annual Meeting of the Association for + * Computational Linguistics (ACL), p. 8440-8451, July 2020, pdf, bib. *

    - * CCNet: Extracting High Quality Monolingual Datasets from Web Crawl Data, - * Guillaume Wenzek, Marie-Anne Lachaux, Alexis Conneau, Vishrav Chaudhary, - * Francisco Guzmán, Armand Joulin, Edouard Grave, Proceedings of the 12th - * Language Resources and Evaluation Conference (LREC), p. 4003-4012, + * CCNet: Extracting High Quality Monolingual Datasets from Web Crawl Data, Guillaume Wenzek, + * Marie-Anne Lachaux, Alexis Conneau, Vishrav Chaudhary, Francisco Guzmán, Armand Joulin, Edouard + * Grave, Proceedings of the 12th Language Resources and Evaluation Conference (LREC), p. 4003-4012, * May 2020, pdf, bib. */ public class OpenNLPDetector extends LanguageDetector { @@ -91,7 +79,7 @@ public class OpenNLPDetector extends LanguageDetector { } private final ProbingLanguageDetector detector = - new ProbingLanguageDetector(LANG_MODEL, getNormalizers()); + new ProbingLanguageDetector(LANG_MODEL, getNormalizers()); private final StringBuilder buffer = new StringBuilder(); public OpenNLPDetector() { @@ -100,22 +88,22 @@ public OpenNLPDetector() { static void loadBuiltInModels() throws IOException { try (InputStream is = OpenNLPDetector.class - .getResourceAsStream("/opennlp-langdetect-20210413.bin")) { + .getResourceAsStream("/opennlp-langdetect-20210413.bin")) { LANG_MODEL = new LanguageDetectorModel(is); } } private static CharSequenceNormalizer[] getNormalizers() { - return new CharSequenceNormalizer[]{TikaUrlCharSequenceNormalizer.getInstance(), - AlphaIdeographSequenceNormalizer.getInstance(), - EmojiCharSequenceNormalizer.getInstance(), - TwitterCharSequenceNormalizer.getInstance(), - NumberCharSequenceNormalizer.getInstance(), - ShrinkCharSequenceNormalizer.getInstance()}; + return new CharSequenceNormalizer[] {TikaUrlCharSequenceNormalizer.getInstance(), + AlphaIdeographSequenceNormalizer.getInstance(), + EmojiCharSequenceNormalizer.getInstance(), + TwitterCharSequenceNormalizer.getInstance(), + NumberCharSequenceNormalizer.getInstance(), + 
ShrinkCharSequenceNormalizer.getInstance()}; } private static LanguageConfidence getConfidence(double confidence) { - //COMPLETELY heuristic + // COMPLETELY heuristic if (confidence > 0.9) { return LanguageConfidence.HIGH; } else if (confidence > 0.85) { @@ -147,7 +135,7 @@ public LanguageDetector loadModels() throws IOException { @Override public LanguageDetector loadModels(Set languages) throws IOException { throw new UnsupportedOperationException( - "This lang detector doesn't allow subsetting models"); + "This lang detector doesn't allow subsetting models"); } @Override @@ -178,12 +166,11 @@ public void reset() { } /** - * This will buffer up to {@link #setMaxLength(int)} and then - * ignore the rest of the text. + * This will buffer up to {@link #setMaxLength(int)} and then ignore the rest of the text. * * @param cbuf Character buffer - * @param off Offset into cbuf to first character in the run of text - * @param len Number of characters in the run of text. + * @param off Offset into cbuf to first character in the run of text + * @param len Number of characters in the run of text. 
*/ @Override public void addText(char[] cbuf, int off, int len) { @@ -200,9 +187,8 @@ public List detectAll() { Language[] langs = detector.predictLanguages(buffer.toString()); List results = new ArrayList<>(); for (Language lang : langs) { - LanguageResult r = - new LanguageResult(lang.getLang(), getConfidence(lang.getConfidence()), - (float) lang.getConfidence()); + LanguageResult r = new LanguageResult(lang.getLang(), + getConfidence(lang.getConfidence()), (float) lang.getConfidence()); results.add(r); } return results; @@ -217,17 +203,16 @@ public String[] getSupportedLanguages() { } private static class TikaUrlCharSequenceNormalizer implements CharSequenceNormalizer { - //use this custom copy/paste of opennlp to avoid long, long hang with mail_regex - //TIKA-2777 + // use this custom copy/paste of opennlp to avoid long, long hang with mail_regex + // TIKA-2777 private static final Pattern URL_REGEX = - Pattern.compile("https?://[-_.?&~;+=/#0-9A-Za-z]{10,10000}"); - private static final Pattern MAIL_REGEX = - Pattern.compile("[-_.0-9A-Za-z]{1,100}@[-_0-9A-Za-z]{1,100}[-_.0-9A-Za-z]{1,100}"); + Pattern.compile("https?://[-_.?&~;+=/#0-9A-Za-z]{10,10000}"); + private static final Pattern MAIL_REGEX = Pattern + .compile("[-_.0-9A-Za-z]{1,100}@[-_0-9A-Za-z]{1,100}[-_.0-9A-Za-z]{1,100}"); private static final TikaUrlCharSequenceNormalizer INSTANCE = - new TikaUrlCharSequenceNormalizer(); + new TikaUrlCharSequenceNormalizer(); - private TikaUrlCharSequenceNormalizer() { - } + private TikaUrlCharSequenceNormalizer() {} public static TikaUrlCharSequenceNormalizer getInstance() { return INSTANCE; @@ -242,12 +227,11 @@ public CharSequence normalize(CharSequence charSequence) { private static class AlphaIdeographSequenceNormalizer implements CharSequenceNormalizer { private static final Pattern REGEX = - Pattern.compile("[^\\p{IsAlphabetic}\\p{IsIdeographic}]+"); + Pattern.compile("[^\\p{IsAlphabetic}\\p{IsIdeographic}]+"); private static final 
AlphaIdeographSequenceNormalizer INSTANCE = - new AlphaIdeographSequenceNormalizer(); + new AlphaIdeographSequenceNormalizer(); - private AlphaIdeographSequenceNormalizer() { - } + private AlphaIdeographSequenceNormalizer() {} public static AlphaIdeographSequenceNormalizer getInstance() { return INSTANCE; diff --git a/tika-langdetect/tika-langdetect-opennlp/src/main/java/org/apache/tika/langdetect/opennlp/ProbingLanguageDetector.java b/tika-langdetect/tika-langdetect-opennlp/src/main/java/org/apache/tika/langdetect/opennlp/ProbingLanguageDetector.java index 8cc6ecca62..6103f5c232 100644 --- a/tika-langdetect/tika-langdetect-opennlp/src/main/java/org/apache/tika/langdetect/opennlp/ProbingLanguageDetector.java +++ b/tika-langdetect/tika-langdetect-opennlp/src/main/java/org/apache/tika/langdetect/opennlp/ProbingLanguageDetector.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.langdetect.opennlp; @@ -21,7 +19,6 @@ import java.util.Iterator; import java.util.LinkedList; import java.util.Map; - import opennlp.tools.langdetect.LanguageDetector; import opennlp.tools.langdetect.LanguageDetectorModel; import opennlp.tools.util.normalizer.AggregateCharSequenceNormalizer; @@ -30,41 +27,36 @@ /** * Implements learnable Language Detector. *

    - * Starts at the beginning of the charsequence and runs language - * detection on chunks of text. If the end of the - * string is reached or there are {@link #minConsecImprovements} - * consecutive predictions for the best language and the confidence - * increases over those last predictions and if the difference - * in confidence between the highest confidence language - * and the second highest confidence language is greater than {@link #minDiff}, - * the language detector will stop and report the results. + * Starts at the beginning of the charsequence and runs language detection on chunks of text. If the + * end of the string is reached or there are {@link #minConsecImprovements} consecutive predictions + * for the best language and the confidence increases over those last predictions and if the + * difference in confidence between the highest confidence language and the second highest + * confidence language is greater than {@link #minDiff}, the language detector will stop and report + * the results. *

    *

    * The authors wish to thank Ken Krugler and - * Yalder} - * for the inspiration for many of the design - * components of this detector. + * Yalder} for the inspiration for many of the + * design components of this detector. *

    */ class ProbingLanguageDetector implements LanguageDetector { /** - * Default chunk size (in codepoints) to take from the - * initial String + * Default chunk size (in codepoints) to take from the initial String */ public static final int DEFAULT_CHUNK_SIZE = 300; /** - * Default minimum consecutive improvements in confidence. - * If the best language is the same over this many consecutive - * probes, and if the confidence did not go down over those probes, - * the detector stops early. + * Default minimum consecutive improvements in confidence. If the best language is the same over + * this many consecutive probes, and if the confidence did not go down over those probes, the + * detector stops early. */ public static final int DEFAULT_MIN_CONSEC_IMPROVEMENTS = 2; /** - * Default minimum difference in confidence between the language with - * the highest confidence and the language with the second highest confidence. + * Default minimum difference in confidence between the language with the highest confidence and + * the language with the second highest confidence. */ public static final double DEFAULT_MIN_DIFF = 0.20; @@ -75,16 +67,16 @@ class ProbingLanguageDetector implements LanguageDetector { private static final String SPACE = " "; - //size at which to break strings for detection (in codepoints) + // size at which to break strings for detection (in codepoints) private int chunkSize = DEFAULT_CHUNK_SIZE; - //require that the "best" language be the same - //and that the confidence in that language increase over - //this number of probes. + // require that the "best" language be the same + // and that the confidence in that language increase over + // this number of probes. 
private int minConsecImprovements = DEFAULT_MIN_CONSEC_IMPROVEMENTS; - //Minimum difference in confidence between the best candidate - //and the second best candidate + // Minimum difference in confidence between the best candidate + // and the second best candidate private double minDiff = DEFAULT_MIN_DIFF; /** @@ -97,13 +89,13 @@ class ProbingLanguageDetector implements LanguageDetector { private LanguageDetectorModel model; /** - * Initializes the current instance with a language detector model. Default feature - * generation is used. + * Initializes the current instance with a language detector model. Default feature generation + * is used. * * @param model the language detector model */ public ProbingLanguageDetector(LanguageDetectorModel model, - CharSequenceNormalizer... normalizers) { + CharSequenceNormalizer... normalizers) { this.model = model; this.normalizer = new AggregateCharSequenceNormalizer(normalizers); } @@ -115,12 +107,12 @@ public opennlp.tools.langdetect.Language predictLanguage(CharSequence content) { @Override public opennlp.tools.langdetect.Language[] predictLanguages(CharSequence content) { - //list of the languages that received the highest - //confidence over the last n chunk detections + // list of the languages that received the highest + // confidence over the last n chunk detections LinkedList predictions = new LinkedList(); - int start = 0;//where to start the next chunk in codepoints + int start = 0;// where to start the next chunk in codepoints opennlp.tools.langdetect.Language[] currPredictions = null; - //cache ngram counts across chunks + // cache ngram counts across chunks Map ngramCounts = new HashMap<>(); CharIntNGrammer ngrammer = new CharIntNGrammer(1, 3); int nGrams = 0; @@ -173,10 +165,10 @@ private opennlp.tools.langdetect.Language[] predict(Map ngra } double[] eval = model.getMaxentModel().eval(allGrams, counts); opennlp.tools.langdetect.Language[] arr = - new opennlp.tools.langdetect.Language[eval.length]; + new 
opennlp.tools.langdetect.Language[eval.length]; for (int j = 0; j < eval.length; j++) { arr[j] = new opennlp.tools.langdetect.Language(model.getMaxentModel().getOutcome(j), - eval[j]); + eval[j]); } Arrays.sort(arr, (o1, o2) -> Double.compare(o2.getConfidence(), o1.getConfidence())); @@ -184,8 +176,7 @@ private opennlp.tools.langdetect.Language[] predict(Map ngra } /** - * Size in codepoints at which to chunk the - * text for detection. + * Size in codepoints at which to chunk the text for detection. * * @return the chunk size in codepoints */ @@ -194,8 +185,7 @@ public int getChunkSize() { } /** - * Size in codepoints at which to chunk the - * text for detection. + * Size in codepoints at which to chunk the text for detection. * * @param chunkSize */ @@ -204,9 +194,8 @@ public void setChunkSize(int chunkSize) { } /** - * Number of consecutive improvements in the - * confidence of the most likely language required - * for this language detector to stop. + * Number of consecutive improvements in the confidence of the most likely language required for + * this language detector to stop. * * @return the minimum consecutive improvements */ @@ -215,9 +204,8 @@ public int getMinConsecImprovements() { } /** - * Number of consecutive improvements in the - * confidence of the most likely language required - * for this language detector to stop. + * Number of consecutive improvements in the confidence of the most likely language required for + * this language detector to stop. * * @param minConsecImprovements minimum consecutive improvements */ @@ -226,8 +214,8 @@ public void setMinConsecImprovements(int minConsecImprovements) { } /** - * The minimum difference between the highest confidence and the - * second highest confidence required to stop. + * The minimum difference between the highest confidence and the second highest confidence + * required to stop. 
* * @return the minimum difference required */ @@ -236,8 +224,8 @@ public double getMinDiff() { } /** - * The minimum difference between the highest confidence and the - * second highest confidence required to stop. + * The minimum difference between the highest confidence and the second highest confidence + * required to stop. *

    * Throws {@link IllegalArgumentException} if < 0.0 * @@ -251,19 +239,16 @@ public void setMinDiff(double minDiff) { } /** - * The absolute maximum length of the string (in codepoints) - * to be processed. + * The absolute maximum length of the string (in codepoints) to be processed. * - * @return the absolute maximum length of the string (in codepoints) - * to be processed. + * @return the absolute maximum length of the string (in codepoints) to be processed. */ public int getMaxLength() { return maxLength; } /** - * The absolute maximum length of the string (in codepoints) - * to be processed. + * The absolute maximum length of the string (in codepoints) to be processed. * * @param maxLength */ @@ -291,8 +276,8 @@ public String[] getSupportedLanguages() { } /** - * Override this for different behavior to determine if there is enough - * confidence in the predictions to stop. + * Override this for different behavior to determine if there is enough confidence in the + * predictions to stop. 
* * @param predictionsQueue * @param newPredictions @@ -300,8 +285,8 @@ public String[] getSupportedLanguages() { * @return */ boolean seenEnough(LinkedList predictionsQueue, - opennlp.tools.langdetect.Language[] newPredictions, - Map ngramCounts) { + opennlp.tools.langdetect.Language[] newPredictions, + Map ngramCounts) { if (predictionsQueue.size() < minConsecImprovements) { predictionsQueue.add(newPredictions); @@ -310,16 +295,16 @@ boolean seenEnough(LinkedList predictionsQu predictionsQueue.removeFirst(); } predictionsQueue.add(newPredictions); - if (minDiff > 0.0 && - newPredictions[0].getConfidence() - newPredictions[1].getConfidence() < minDiff) { + if (minDiff > 0.0 && newPredictions[0].getConfidence() + - newPredictions[1].getConfidence() < minDiff) { return false; } String lastLang = null; double lastConf = -1.0; - //iterate through the last predictions - //and check that the lang with the highest confidence - //hasn't changed, and that the confidence in it - //hasn't decreased + // iterate through the last predictions + // and check that the lang with the highest confidence + // hasn't changed, and that the confidence in it + // hasn't decreased for (opennlp.tools.langdetect.Language[] predictions : predictionsQueue) { if (lastLang == null) { lastLang = predictions[0].getLang(); @@ -386,8 +371,8 @@ public String next() { currGram = minGram; pos++; if (pos + maxGram < buffer.length) { - //lowercase the last character; we've already - //lowercased all previous chars + // lowercase the last character; we've already + // lowercased all previous chars buffer[pos + maxGram] = Character.toLowerCase(buffer[pos + maxGram]); } } @@ -405,10 +390,9 @@ public String next() { } /** - * @param chunk this is the chunk that will be ngrammed. Note: - * The ngrammer will lowercase the codepoints in place! - * If you don't want the original data transformed, - * copy it before calling this! + * @param chunk this is the chunk that will be ngrammed. 
Note: The ngrammer will lowercase + * the codepoints in place! If you don't want the original data transformed, copy it + * before calling this! */ void reset(int[] chunk) { next = null; diff --git a/tika-langdetect/tika-langdetect-opennlp/src/main/java/org/apache/tika/langdetect/opennlp/metadatafilter/OpenNLPMetadataFilter.java b/tika-langdetect/tika-langdetect-opennlp/src/main/java/org/apache/tika/langdetect/opennlp/metadatafilter/OpenNLPMetadataFilter.java index e0f88023e6..91d9d68361 100644 --- a/tika-langdetect/tika-langdetect-opennlp/src/main/java/org/apache/tika/langdetect/opennlp/metadatafilter/OpenNLPMetadataFilter.java +++ b/tika-langdetect/tika-langdetect-opennlp/src/main/java/org/apache/tika/langdetect/opennlp/metadatafilter/OpenNLPMetadataFilter.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.langdetect.opennlp.metadatafilter; @@ -43,7 +41,8 @@ public void filter(Metadata metadata) throws TikaException { } LanguageResult r = detector.detect(content); metadata.set(TikaCoreProperties.TIKA_DETECTED_LANGUAGE, r.getLanguage()); - metadata.set(TikaCoreProperties.TIKA_DETECTED_LANGUAGE_CONFIDENCE, r.getConfidence().name()); + metadata.set(TikaCoreProperties.TIKA_DETECTED_LANGUAGE_CONFIDENCE, + r.getConfidence().name()); metadata.set(TikaCoreProperties.TIKA_DETECTED_LANGUAGE_CONFIDENCE_RAW, r.getRawScore()); } } diff --git a/tika-langdetect/tika-langdetect-opennlp/src/test/java/org/apache/tika/langdetect/opennlp/OpenNLPDetectorTest.java b/tika-langdetect/tika-langdetect-opennlp/src/test/java/org/apache/tika/langdetect/opennlp/OpenNLPDetectorTest.java index b84fba9ef3..369f5d3378 100644 --- a/tika-langdetect/tika-langdetect-opennlp/src/test/java/org/apache/tika/langdetect/opennlp/OpenNLPDetectorTest.java +++ b/tika-langdetect/tika-langdetect-opennlp/src/test/java/org/apache/tika/langdetect/opennlp/OpenNLPDetectorTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.langdetect.opennlp; @@ -25,13 +23,11 @@ import java.util.HashMap; import java.util.List; import java.util.Map; - import org.apache.commons.io.IOUtils; -import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.api.Test; - import org.apache.tika.langdetect.LanguageDetectorTest; import org.apache.tika.language.detect.LanguageResult; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; public class OpenNLPDetectorTest { @@ -71,8 +67,9 @@ public void languageTests() throws Exception { private CharSequence getLangText(String lang) throws IOException { try (Reader reader = new InputStreamReader( - LanguageDetectorTest.class.getResourceAsStream("language-tests/" + lang + ".test"), - StandardCharsets.UTF_8)) { + LanguageDetectorTest.class + .getResourceAsStream("language-tests/" + lang + ".test"), + StandardCharsets.UTF_8)) { return IOUtils.toString(reader); } } diff --git a/tika-langdetect/tika-langdetect-optimaize/src/main/java/org/apache/tika/langdetect/optimaize/OptimaizeLangDetector.java b/tika-langdetect/tika-langdetect-optimaize/src/main/java/org/apache/tika/langdetect/optimaize/OptimaizeLangDetector.java index a4ff3174b1..08ad529aec 100644 --- a/tika-langdetect/tika-langdetect-optimaize/src/main/java/org/apache/tika/langdetect/optimaize/OptimaizeLangDetector.java +++ b/tika-langdetect/tika-langdetect-optimaize/src/main/java/org/apache/tika/langdetect/optimaize/OptimaizeLangDetector.java @@ -1,30 +1,19 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.langdetect.optimaize; -import java.io.CharArrayWriter; -import java.io.IOException; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.HashSet; -import java.util.List; -import java.util.Map; -import java.util.Set; - import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableSet; import com.optimaize.langdetect.DetectedLanguage; @@ -34,7 +23,14 @@ import com.optimaize.langdetect.profiles.BuiltInLanguages; import com.optimaize.langdetect.profiles.LanguageProfile; import com.optimaize.langdetect.profiles.LanguageProfileReader; - +import java.io.CharArrayWriter; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; import org.apache.tika.language.detect.LanguageConfidence; import org.apache.tika.language.detect.LanguageDetector; import org.apache.tika.language.detect.LanguageNames; @@ -55,7 +51,7 @@ public class OptimaizeLangDetector extends LanguageDetector { static { try { DEFAULT_LANGUAGE_PROFILES = - ImmutableList.copyOf(new LanguageProfileReader().readAllBuiltIn()); + ImmutableList.copyOf(new LanguageProfileReader().readAllBuiltIn()); ImmutableSet.Builder builder = new ImmutableSet.Builder<>(); for (LanguageProfile profile : DEFAULT_LANGUAGE_PROFILES) { @@ -85,16 +81,17 @@ public OptimaizeLangDetector(int maxCharsForDetection) { private static String makeLanguageName(LdLocale locale) { return LanguageNames.makeName(locale.getLanguage(), locale.getScript().orNull(), - locale.getRegion().orNull()); + locale.getRegion().orNull()); } private static com.optimaize.langdetect.LanguageDetector createDetector( - List languageProfiles, Map languageProbabilities) { - // FUTURE currently the short text algorithm doesn't normalize probabilities until the end, which + List languageProfiles, + Map languageProbabilities) { + // FUTURE currently the short text algorithm 
doesn't normalize probabilities until the end, + // which // means you can often get 0 probabilities. So we pick a very short length for this limit. - LanguageDetectorBuilder builder = - LanguageDetectorBuilder.create(NgramExtractors.standard()).shortTextAlgorithm(30) - .withProfiles(languageProfiles); + LanguageDetectorBuilder builder = LanguageDetectorBuilder.create(NgramExtractors.standard()) + .shortTextAlgorithm(30).withProfiles(languageProfiles); if (languageProbabilities != null) { Map languageWeights = new HashMap<>(languageProbabilities.size()); @@ -145,7 +142,7 @@ public LanguageDetector loadModels(Set languages) throws IOException { } detector = createDetector(new LanguageProfileReader().readBuiltIn(locales), - languageProbabilities); + languageProbabilities); return this; } @@ -188,14 +185,14 @@ public void addText(char[] cbuf, int off, int len) { * {@inheritDoc} * * @return the detected list of languages - * @throws IllegalStateException if no models have been loaded with - * {@link #loadModels() } or {@link #loadModels(java.util.Set) } + * @throws IllegalStateException if no models have been loaded with {@link #loadModels() } or + * {@link #loadModels(java.util.Set) } */ @Override public List detectAll() { if (detector == null) { throw new IllegalStateException( - "models haven't been loaded yet (forgot to call loadModels?)"); + "models haven't been loaded yet (forgot to call loadModels?)"); } List result = new ArrayList<>(); @@ -204,10 +201,10 @@ public List detectAll() { for (DetectedLanguage rawResult : rawResults) { // TODO figure out right level for confidence brackets. LanguageConfidence confidence = - rawResult.getProbability() > 0.9 ? LanguageConfidence.HIGH : - LanguageConfidence.MEDIUM; + rawResult.getProbability() > 0.9 ? 
LanguageConfidence.HIGH + : LanguageConfidence.MEDIUM; result.add(new LanguageResult(makeLanguageName(rawResult.getLocale()), confidence, - (float) rawResult.getProbability())); + (float) rawResult.getProbability())); } if (result.isEmpty()) { diff --git a/tika-langdetect/tika-langdetect-optimaize/src/main/java/org/apache/tika/langdetect/optimaize/metadatafilter/OptimaizeMetadataFilter.java b/tika-langdetect/tika-langdetect-optimaize/src/main/java/org/apache/tika/langdetect/optimaize/metadatafilter/OptimaizeMetadataFilter.java index a0e3dd6c75..6568a9ff0f 100644 --- a/tika-langdetect/tika-langdetect-optimaize/src/main/java/org/apache/tika/langdetect/optimaize/metadatafilter/OptimaizeMetadataFilter.java +++ b/tika-langdetect/tika-langdetect-optimaize/src/main/java/org/apache/tika/langdetect/optimaize/metadatafilter/OptimaizeMetadataFilter.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.langdetect.optimaize.metadatafilter; @@ -43,7 +41,8 @@ public void filter(Metadata metadata) throws TikaException { } LanguageResult r = detector.detect(content); metadata.set(TikaCoreProperties.TIKA_DETECTED_LANGUAGE, r.getLanguage()); - metadata.set(TikaCoreProperties.TIKA_DETECTED_LANGUAGE_CONFIDENCE, r.getConfidence().name()); + metadata.set(TikaCoreProperties.TIKA_DETECTED_LANGUAGE_CONFIDENCE, + r.getConfidence().name()); metadata.set(TikaCoreProperties.TIKA_DETECTED_LANGUAGE_CONFIDENCE_RAW, r.getRawScore()); } } diff --git a/tika-langdetect/tika-langdetect-optimaize/src/test/java/org/apache/tika/langdetect/optimaize/OptimaizeLangDetectorTest.java b/tika-langdetect/tika-langdetect-optimaize/src/test/java/org/apache/tika/langdetect/optimaize/OptimaizeLangDetectorTest.java index 903bdef6ad..258e6cc609 100644 --- a/tika-langdetect/tika-langdetect-optimaize/src/test/java/org/apache/tika/langdetect/optimaize/OptimaizeLangDetectorTest.java +++ b/tika-langdetect/tika-langdetect-optimaize/src/test/java/org/apache/tika/langdetect/optimaize/OptimaizeLangDetectorTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * 
contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.langdetect.optimaize; @@ -27,101 +25,37 @@ import java.util.List; import java.util.Locale; import java.util.Map; - import org.apache.commons.io.IOUtils; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.Timeout; - import org.apache.tika.langdetect.LanguageDetectorTest; import org.apache.tika.language.detect.LanguageConfidence; import org.apache.tika.language.detect.LanguageDetector; import org.apache.tika.language.detect.LanguageResult; import org.apache.tika.language.detect.LanguageWriter; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.Timeout; public class OptimaizeLangDetectorTest extends LanguageDetectorTest { /* - * The complete list of supported languages (as of 0.5) is below. - * The ones we have tests for have '*' after the name. + * The complete list of supported languages (as of 0.5) is below. The ones we have tests for + * have '*' after the name. * - af Afrikaans - an Aragonese - ar Arabic - ast Asturian - be Belarusian - br Breton - ca Catalan - bg Bulgarian - bn Bengali - cs Czech - cy Welsh - da Danish * - de German * - el Greek * - en English * - es Spanish * - et Estonian - eu Basque - fa Persian - fi Finnish * - fr French * - ga Irish - gl Galician - gu Gujarati - he Hebrew - hi Hindi - hr Croatian - ht Haitian - hu Hungarian - id Indonesian - is Icelandic - it Italian * - ja Japanese * - km Khmer - kn Kannada - ko Korean - lt Lithuanian * - lv Latvian - mk Macedonian - ml Malayalam - mr Marathi - ms Malay - mt Maltese - ne Nepali - nl Dutch * - no Norwegian - oc Occitan - pa Punjabi - pl Polish - pt Portuguese * - ro Romanian - ru Russian - sk Slovak - sl Slovene - so Somali - sq Albanian - sr Serbian - sv Swedish * - sw Swahili - ta Tamil - te Telugu - th Thai * - tl Tagalog - tr Turkish - uk Ukrainian - ur Urdu - vi Vietnamese - yi Yiddish - zh-CN Simplified Chinese * (just generic Chinese) - zh-TW Traditional Chinese * (just generic Chinese) - */ + * af Afrikaans an Aragonese ar 
Arabic ast Asturian be Belarusian br Breton ca Catalan bg + * Bulgarian bn Bengali cs Czech cy Welsh da Danish * de German * el Greek * en English * es + * Spanish * et Estonian eu Basque fa Persian fi Finnish * fr French * ga Irish gl Galician gu + * Gujarati he Hebrew hi Hindi hr Croatian ht Haitian hu Hungarian id Indonesian is Icelandic it + * Italian * ja Japanese * km Khmer kn Kannada ko Korean lt Lithuanian * lv Latvian mk + * Macedonian ml Malayalam mr Marathi ms Malay mt Maltese ne Nepali nl Dutch * no Norwegian oc + * Occitan pa Punjabi pl Polish pt Portuguese * ro Romanian ru Russian sk Slovak sl Slovene so + * Somali sq Albanian sr Serbian sv Swedish * sw Swahili ta Tamil te Telugu th Thai * tl Tagalog + * tr Turkish uk Ukrainian ur Urdu vi Vietnamese yi Yiddish zh-CN Simplified Chinese * (just + * generic Chinese) zh-TW Traditional Chinese * (just generic Chinese) + */ /** - * Test correct detection for the many (short) translations of the - * "Universal Declaration of Human Rights (Article 1)", at - * http://www.omniglot.com/udhr + * Test correct detection for the many (short) translations of the "Universal Declaration of + * Human Rights (Article 1)", at http://www.omniglot.com/udhr *

    - * Also make sure we get uncertain results for some set of unsupported - * languages. + * Also make sure we get uncertain results for some set of unsupported languages. * * @throws Exception */ @@ -202,9 +136,8 @@ public void testMixedLanguages() throws IOException { if (results.size() > 0) { LanguageResult result = results.get(0); - assertFalse(result.isReasonablyCertain(), - "mix of " + language + " and " + other + " incorrectly detected as " + - result); + assertFalse(result.isReasonablyCertain(), "mix of " + language + " and " + other + + " incorrectly detected as " + result); } } } @@ -236,15 +169,15 @@ public void testShortText() throws IOException { writeTo(language, writer, 300); LanguageResult result = detector.detect(); - assertNotNull(result, String.format(Locale.US, "Language '%s' wasn't detected", - language)); + assertNotNull(result, + String.format(Locale.US, "Language '%s' wasn't detected", language)); - assertTrue(result.isLanguage(language), String.format(Locale.US, "Language '%s' was " + - "detected as '%s'", language, - result.getLanguage())); + assertTrue(result.isLanguage(language), + String.format(Locale.US, "Language '%s' was " + "detected as '%s'", + language, result.getLanguage())); assertTrue(result.isReasonablyCertain(), - String.format(Locale.US, "Language '%s' isn't reasonably certain: %s", language, - result.getConfidence())); + String.format(Locale.US, "Language '%s' isn't reasonably certain: %s", + language, result.getConfidence())); } writer.close(); @@ -252,8 +185,8 @@ public void testShortText() throws IOException { private Map getTestLanguages(String resourceName) throws IOException { Map result = new HashMap<>(); - List languages = - IOUtils.readLines(OptimaizeLangDetectorTest.class.getResourceAsStream(resourceName), + List languages = IOUtils.readLines( + OptimaizeLangDetectorTest.class.getResourceAsStream(resourceName), StandardCharsets.UTF_8); for (String line : languages) { line = line.trim(); @@ -275,7 +208,7 @@ 
private Map getTestLanguages(String resourceName) throws IOExcep @Test @Timeout(5000) public void testOptimaizeRegexBug() throws Exception { - //confirm TIKA-2777 doesn't affect langdetect's Optimaize + // confirm TIKA-2777 doesn't affect langdetect's Optimaize LanguageDetector detector = new OptimaizeLangDetector().setShortText(false).loadModels(); StringBuilder sb = new StringBuilder(); for (int i = 0; i < 50000; i++) { diff --git a/tika-langdetect/tika-langdetect-test-commons/src/main/java/org/apache/tika/langdetect/LanguageDetectorTest.java b/tika-langdetect/tika-langdetect-test-commons/src/main/java/org/apache/tika/langdetect/LanguageDetectorTest.java index be91f0968f..9756b5b1a5 100644 --- a/tika-langdetect/tika-langdetect-test-commons/src/main/java/org/apache/tika/langdetect/LanguageDetectorTest.java +++ b/tika-langdetect/tika-langdetect-test-commons/src/main/java/org/apache/tika/langdetect/LanguageDetectorTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.langdetect; @@ -25,7 +23,6 @@ import java.io.Writer; import java.util.ArrayList; import java.util.List; - import org.apache.commons.io.IOUtils; public abstract class LanguageDetectorTest { @@ -54,7 +51,7 @@ protected String[] getTestLanguages() throws IOException { protected boolean hasTestLanguage(String language) { InputStream stream = LanguageDetectorTest.class - .getResourceAsStream("/language-tests/" + language + ".test"); + .getResourceAsStream("/language-tests/" + language + ".test"); if (stream != null) { IOUtils.closeQuietly(stream); return true; @@ -69,7 +66,7 @@ protected void writeTo(String language, Writer writer) throws IOException { protected void writeTo(String language, Writer writer, int limit) throws IOException { try (InputStream stream = LanguageDetectorTest.class - .getResourceAsStream("/language-tests/" + language + ".test")) { + .getResourceAsStream("/language-tests/" + language + ".test")) { copyAtMost(new InputStreamReader(stream, UTF_8), writer, limit); } } diff --git a/tika-langdetect/tika-langdetect-tika/src/main/java/org/apache/tika/langdetect/tika/LanguageIdentifier.java 
b/tika-langdetect/tika-langdetect-tika/src/main/java/org/apache/tika/langdetect/tika/LanguageIdentifier.java index 81c01a56c8..e9bbe14d7b 100644 --- a/tika-langdetect/tika-langdetect-tika/src/main/java/org/apache/tika/langdetect/tika/LanguageIdentifier.java +++ b/tika-langdetect/tika-langdetect-tika/src/main/java/org/apache/tika/langdetect/tika/LanguageIdentifier.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.langdetect.tika; @@ -28,14 +26,13 @@ import java.util.Set; /** - * Identifier of the language that best matches a given content profile. - * The content profile is compared to generic language profiles based on - * material from various sources. + * Identifier of the language that best matches a given content profile. The content profile is + * compared to generic language profiles based on material from various sources. * - * @see - * Europarl: A Parallel Corpus for Statistical Machine Translation - * @see - * ISO 639 Language Codes + * @see Europarl: A Parallel + * Corpus for Statistical Machine Translation + * @see ISO 639 Language + * Codes * @since Apache Tika 0.5 */ public class LanguageIdentifier { @@ -43,8 +40,7 @@ public class LanguageIdentifier { /** * The available language profiles. */ - private static final Map PROFILES = - new HashMap<>(); + private static final Map PROFILES = new HashMap<>(); private static final String PROFILE_SUFFIX = ".ngp"; private static final String PROPERTIES_OVERRIDE_FILE = "tika.language.override.properties"; private static final String PROPERTIES_FILE = "tika.language.properties"; @@ -100,14 +96,14 @@ private static void addProfile(String language) throws Exception { LanguageProfile profile = new LanguageProfile(); try (InputStream stream = LanguageIdentifier.class - .getResourceAsStream(language + PROFILE_SUFFIX)) { + .getResourceAsStream(language + PROFILE_SUFFIX)) { BufferedReader reader = new BufferedReader(new InputStreamReader(stream, UTF_8)); String line = reader.readLine(); while (line != null) { if (line.length() > 0 && !line.startsWith("#")) { int space = line.indexOf(' '); profile.add(line.substring(0, space), - Long.parseLong(line.substring(space + 1))); + Long.parseLong(line.substring(space + 1))); } line = reader.readLine(); } @@ -115,9 +111,8 @@ private static void addProfile(String language) throws Exception { addProfile(language, profile); } catch (Throwable t) { - throw new 
Exception( - "Failed trying to load language profile for language \"" + language + - "\". Error: " + t.getMessage()); + throw new Exception("Failed trying to load language profile for language \"" + language + + "\". Error: " + t.getMessage()); } } @@ -125,18 +120,17 @@ private static void addProfile(String language) throws Exception { * Adds a single language profile * * @param language an ISO 639 code representing language - * @param profile the language profile + * @param profile the language profile */ public static void addProfile(String language, LanguageProfile profile) { PROFILES.put(language, profile); } /** - * Builds the language profiles. - * The list of languages are fetched from a property file named "tika.language.properties" - * If a file called "tika.language.override.properties" is found on classpath, this is - * used instead The property file contains a key "languages" with values being - * comma-separated language codes + * Builds the language profiles. The list of languages are fetched from a property file named + * "tika.language.properties" If a file called "tika.language.override.properties" is found on + * classpath, this is used instead The property file contains a key "languages" with values + * being comma-separated language codes */ public static void initProfiles() { clearProfiles(); @@ -154,8 +148,7 @@ public static void initProfiles() { props.load(stream); } catch (IOException e) { stringBuilder.append("IOException while trying to load property file. Message: ") - .append(e.getMessage()) - .append("\n"); + .append(e.getMessage()).append("\n"); } } @@ -167,17 +160,17 @@ public static void initProfiles() { addProfile(language); } catch (Exception e) { stringBuilder.append("Language ").append(language).append(" (").append(name) - .append(") not initialized. Message: ") - .append(e.getMessage()).append("\n"); + .append(") not initialized. 
Message: ").append(e.getMessage()) + .append("\n"); } } errors = stringBuilder.toString(); } /** - * Initializes the language profiles from a user supplied initialized Map. - * This overrides the default set of profiles initialized at startup, - * and provides an alternative to configuring profiles through property file + * Initializes the language profiles from a user supplied initialized Map. This overrides the + * default set of profiles initialized at startup, and provides an alternative to configuring + * profiles through property file * * @param profilesMap map of language profiles */ @@ -241,12 +234,11 @@ public float getRawScore() { } /** - * Tries to judge whether the identification is certain enough - * to be trusted. - * WARNING: Will never return true for small amount of input texts. + * Tries to judge whether the identification is certain enough to be trusted. WARNING: Will + * never return true for small amount of input texts. * * @return true if the distance is smaller then - * {@value LanguageIdentifier#CERTAINTY_LIMIT}, false otherwise + * {@value LanguageIdentifier#CERTAINTY_LIMIT}, false otherwise */ public boolean isReasonablyCertain() { return distance < CERTAINTY_LIMIT; diff --git a/tika-langdetect/tika-langdetect-tika/src/main/java/org/apache/tika/langdetect/tika/LanguageProfile.java b/tika-langdetect/tika-langdetect-tika/src/main/java/org/apache/tika/langdetect/tika/LanguageProfile.java index e039247d8b..67e6a5a368 100644 --- a/tika-langdetect/tika-langdetect-tika/src/main/java/org/apache/tika/langdetect/tika/LanguageProfile.java +++ b/tika-langdetect/tika-langdetect-tika/src/main/java/org/apache/tika/langdetect/tika/LanguageProfile.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.langdetect.tika; @@ -25,7 +23,6 @@ import java.util.List; import java.util.Map; import java.util.Set; - import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -48,8 +45,7 @@ public class LanguageProfile { */ private final Interleaved interleaved = new Interleaved(); /** - * The sum of all ngram counts in this profile. - * Used to calculate relative ngram frequency. + * The sum of all ngram counts in this profile. Used to calculate relative ngram frequency. 
*/ private long count = 0; @@ -108,9 +104,8 @@ public void add(String ngram) { */ public void add(String ngram, long count) { if (length != ngram.length()) { - throw new IllegalArgumentException( - "Unable to add an ngram of incorrect length: " + ngram.length() + " != " + - length); + throw new IllegalArgumentException("Unable to add an ngram of incorrect length: " + + ngram.length() + " != " + length); } Counter counter = ngrams.get(ngram); @@ -123,8 +118,7 @@ public void add(String ngram, long count) { } /** - * Calculates the geometric distance between this and the given - * other language profile. + * Calculates the geometric distance between this and the given other language profile. * * @param that the other language profile * @return distance between the profiles @@ -135,8 +129,8 @@ public double distance(LanguageProfile that) { private double distanceStandard(LanguageProfile that) { if (length != that.length) { - throw new IllegalArgumentException("Unable to calculage distance of language profiles" + - " with different ngram lengths: " + that.length + " != " + length); + throw new IllegalArgumentException("Unable to calculage distance of language profiles" + + " with different ngram lengths: " + that.length + " != " + length); } double sumOfSquares = 0.0; @@ -163,8 +157,8 @@ public String toString() { private double distanceInterleaved(LanguageProfile that) { if (length != that.length) { - throw new IllegalArgumentException("Unable to calculage distance of language profiles" + - " with different ngram lengths: " + that.length + " != " + length); + throw new IllegalArgumentException("Unable to calculage distance of language profiles" + + " with different ngram lengths: " + that.length + " != " + length); } double sumOfSquares = 0.0; @@ -265,8 +259,7 @@ public Entry firstEntry() { } private List> getSortedNgrams() { - List> entries = - new ArrayList<>(ngrams.size()); + List> entries = new ArrayList<>(ngrams.size()); entries.addAll(ngrams.entrySet()); 
entries.sort(Map.Entry.comparingByKey()); return entries; diff --git a/tika-langdetect/tika-langdetect-tika/src/main/java/org/apache/tika/langdetect/tika/LanguageProfilerBuilder.java b/tika-langdetect/tika-langdetect-tika/src/main/java/org/apache/tika/langdetect/tika/LanguageProfilerBuilder.java index b09fe987d6..cd30ff06c6 100644 --- a/tika-langdetect/tika-langdetect-tika/src/main/java/org/apache/tika/langdetect/tika/LanguageProfilerBuilder.java +++ b/tika-langdetect/tika-langdetect-tika/src/main/java/org/apache/tika/langdetect/tika/LanguageProfilerBuilder.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
+ * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.langdetect.tika; @@ -36,15 +34,13 @@ import java.util.Iterator; import java.util.List; import java.util.Map; - +import org.apache.tika.exception.TikaException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.apache.tika.exception.TikaException; - /** - * This class runs a ngram analysis over submitted text, results might be used - * for automatic language identification. + * This class runs a ngram analysis over submitted text, results might be used for automatic + * language identification. *

    * The similarity calculation is at experimental level. You have been warned. *

    @@ -86,7 +82,7 @@ public class LanguageProfilerBuilder { /** * The String form of the separator char */ - private final static String SEP_CHARSEQ = new String(new char[]{SEPARATOR}); + private final static String SEP_CHARSEQ = new String(new char[] {SEPARATOR}); /** * A StringBuffer used during analysis */ @@ -119,7 +115,7 @@ public class LanguageProfilerBuilder { /** * Constructs a new ngram profile * - * @param name is the name of the profile + * @param name is the name of the profile * @param minlen is the min length of ngram sequences * @param maxlen is the max length of ngram sequences */ @@ -142,20 +138,18 @@ public LanguageProfilerBuilder(String name) { } /** - * Creates a new Language profile from (preferably quite large - 5-10k of - * lines) text file + * Creates a new Language profile from (preferably quite large - 5-10k of lines) text file * - * @param name to be given for the profile - * @param is a stream to be read + * @param name to be given for the profile + * @param is a stream to be read * @param encoding is the encoding of stream * @throws TikaException if could not create a language profile */ public static LanguageProfilerBuilder create(String name, InputStream is, String encoding) - throws TikaException { + throws TikaException { - LanguageProfilerBuilder newProfile = - new LanguageProfilerBuilder(name, ABSOLUTE_MIN_NGRAM_LENGTH, - ABSOLUTE_MAX_NGRAM_LENGTH); + LanguageProfilerBuilder newProfile = new LanguageProfilerBuilder(name, + ABSOLUTE_MIN_NGRAM_LENGTH, ABSOLUTE_MAX_NGRAM_LENGTH); BufferedInputStream bis = new BufferedInputStream(is); byte[] buffer = new byte[4096]; @@ -196,8 +190,8 @@ public static void main(String[] args) { // -create he sample_he.txt utf-8 - String usage = "Usage: NGramProfile " + "[-create profilename filename encoding] " + - "[-similarity file1 file2] " + "[-score profile-name filename encoding]"; + String usage = "Usage: NGramProfile " + "[-create profilename filename encoding] " + + "[-similarity file1 file2] 
" + "[-score profile-name filename encoding]"; int command = 0; final int CREATE = 1; @@ -246,13 +240,13 @@ public static void main(String[] args) { File f = new File(filename); FileInputStream fis = new FileInputStream(f); LanguageProfilerBuilder newProfile = - LanguageProfilerBuilder.create(profilename, fis, encoding); + LanguageProfilerBuilder.create(profilename, fis, encoding); fis.close(); f = new File(profilename + "." + FILE_EXTENSION); FileOutputStream fos = new FileOutputStream(f); newProfile.save(fos); - System.out.println( - "new profile " + profilename + "." + FILE_EXTENSION + " was created."); + System.out.println("new profile " + profilename + "." + FILE_EXTENSION + + " was created."); break; case SIMILARITY: @@ -265,7 +259,7 @@ public static void main(String[] args) { f = new File(filename2); fis = new FileInputStream(f); LanguageProfilerBuilder newProfile2 = - LanguageProfilerBuilder.create(filename2, fis, encoding); + LanguageProfilerBuilder.create(filename2, fis, encoding); newProfile2.normalize(); System.out.println("Similarity is " + newProfile.getSimilarity(newProfile2)); break; @@ -277,9 +271,8 @@ public static void main(String[] args) { f = new File(profilename + "." 
+ FILE_EXTENSION); fis = new FileInputStream(f); - LanguageProfilerBuilder compare = - new LanguageProfilerBuilder(profilename, DEFAULT_MIN_NGRAM_LENGTH, - DEFAULT_MAX_NGRAM_LENGTH); + LanguageProfilerBuilder compare = new LanguageProfilerBuilder(profilename, + DEFAULT_MIN_NGRAM_LENGTH, DEFAULT_MAX_NGRAM_LENGTH); compare.load(fis); System.out.println("Score is " + compare.getSimilarity(newProfile)); break; @@ -378,7 +371,7 @@ public void analyze(StringBuilder text) { /** * @param word - * @param n sequence length + * @param n sequence length */ private void add(StringBuffer word, int n) { for (int i = 0; i <= word.length() - n; i++) { @@ -435,7 +428,7 @@ public String toString() { for (NGramEntry entry : getSorted()) { s.append("[").append(entry.seq).append("/").append(entry.count).append("/") - .append(entry.frequency).append("]\n"); + .append(entry.frequency).append("]\n"); } return s.toString(); } @@ -465,15 +458,15 @@ public float getSimilarity(LanguageProfilerBuilder another) throws TikaException while (i.hasNext()) { NGramEntry other = i.next(); if (another.ngrams.containsKey(other.seq)) { - sum += Math.abs((other.frequency - another.ngrams.get(other.seq).frequency)) / - 2; + sum += Math.abs((other.frequency - another.ngrams.get(other.seq).frequency)) + / 2; } else { sum += other.frequency; } } } catch (Exception e) { throw new TikaException( - "Could not calculate a score how well NGramProfiles match each other"); + "Could not calculate a score how well NGramProfiles match each other"); } return sum; } @@ -509,15 +502,14 @@ public void load(InputStream is) throws IOException { } /** - * Writes NGramProfile content into OutputStream, content is outputted with - * UTF-8 encoding + * Writes NGramProfile content into OutputStream, content is outputted with UTF-8 encoding * * @param os the Stream to output to * @throws IOException */ public void save(OutputStream os) throws IOException { - os.write(("# NgramProfile generated at " + new Date() + - " for Apache 
Tika Language Identification\n").getBytes(UTF_8)); + os.write(("# NgramProfile generated at " + new Date() + + " for Apache Tika Language Identification\n").getBytes(UTF_8)); // And then each ngram @@ -582,7 +574,7 @@ public NGramEntry(CharSequence seq) { /** * Constructs a new NGramEntry * - * @param seq is the sequence of characters of the ngram + * @param seq is the sequence of characters of the ngram * @param count is the number of occurrences of this ngram */ public NGramEntry(String seq, int count) { diff --git a/tika-langdetect/tika-langdetect-tika/src/main/java/org/apache/tika/langdetect/tika/ProfilingWriter.java b/tika-langdetect/tika-langdetect-tika/src/main/java/org/apache/tika/langdetect/tika/ProfilingWriter.java index 81daae7d41..e67bebe53e 100644 --- a/tika-langdetect/tika-langdetect-tika/src/main/java/org/apache/tika/langdetect/tika/ProfilingWriter.java +++ b/tika-langdetect/tika-langdetect-tika/src/main/java/org/apache/tika/langdetect/tika/ProfilingWriter.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.langdetect.tika; @@ -28,7 +26,7 @@ public class ProfilingWriter extends Writer { private final LanguageProfile profile; - private final char[] buffer = new char[]{0, 0, '_'}; + private final char[] buffer = new char[] {0, 0, '_'}; private int n = 1; @@ -41,10 +39,9 @@ public ProfilingWriter() { } /** - * Returns the language profile being built by this writer. Note that - * the returned profile gets updated whenever new characters are written. - * Use the {@link #getLanguage()} method to get the language that best - * matches the current state of the profile. + * Returns the language profile being built by this writer. Note that the returned profile gets + * updated whenever new characters are written. Use the {@link #getLanguage()} method to get the + * language that best matches the current state of the profile. * * @return language profile */ @@ -53,8 +50,7 @@ public LanguageProfile getProfile() { } /** - * Returns the language that best matches the current state of the - * language profile. + * Returns the language that best matches the current state of the language profile. 
* * @return language that best matches the current profile */ @@ -97,7 +93,6 @@ public void close() throws IOException { * Ignored. */ @Override - public void flush() { - } + public void flush() {} } diff --git a/tika-langdetect/tika-langdetect-tika/src/main/java/org/apache/tika/langdetect/tika/TikaLanguageDetector.java b/tika-langdetect/tika-langdetect-tika/src/main/java/org/apache/tika/langdetect/tika/TikaLanguageDetector.java index 81c6f607ef..36a5628e88 100644 --- a/tika-langdetect/tika-langdetect-tika/src/main/java/org/apache/tika/langdetect/tika/TikaLanguageDetector.java +++ b/tika-langdetect/tika-langdetect-tika/src/main/java/org/apache/tika/langdetect/tika/TikaLanguageDetector.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
+ * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.langdetect.tika; @@ -21,22 +19,18 @@ import java.util.List; import java.util.Map; import java.util.Set; - import org.apache.tika.language.detect.LanguageConfidence; import org.apache.tika.language.detect.LanguageDetector; import org.apache.tika.language.detect.LanguageResult; /** - * This is Tika's original legacy, homegrown language detector. - * As it is currently implemented, it computes vector distance - * of trigrams between input string and language models. + * This is Tika's original legacy, homegrown language detector. As it is currently implemented, it + * computes vector distance of trigrams between input string and language models. *

    - * Because it works only on trigrams, it is not suitable for short - * texts. + * Because it works only on trigrams, it is not suitable for short texts. *

    - * There are better performing language detectors. This module is still - * here in the hopes that we'll get around to improving it, because - * it is elegant and could be fairly trivially improved. + * There are better performing language detectors. This module is still here in the hopes that we'll + * get around to improving it, because it is elegant and could be fairly trivially improved. */ public class TikaLanguageDetector extends LanguageDetector { @@ -84,8 +78,8 @@ public List detectAll() { LanguageIdentifier langIder = new LanguageIdentifier(sb.toString()); String lang = langIder.getLanguage(); if (langIder.isReasonablyCertain()) { - return Collections.singletonList( - new LanguageResult(lang, LanguageConfidence.MEDIUM, langIder.getRawScore())); + return Collections.singletonList(new LanguageResult(lang, LanguageConfidence.MEDIUM, + langIder.getRawScore())); } return Collections.EMPTY_LIST; } diff --git a/tika-langdetect/tika-langdetect-tika/src/test/java/org/apache/tika/langdetect/tika/LanguageIdentifierTest.java b/tika-langdetect/tika-langdetect-tika/src/test/java/org/apache/tika/langdetect/tika/LanguageIdentifierTest.java index 15c190cc00..22e24b31d2 100644 --- a/tika-langdetect/tika-langdetect-tika/src/test/java/org/apache/tika/langdetect/tika/LanguageIdentifierTest.java +++ b/tika-langdetect/tika-langdetect-tika/src/test/java/org/apache/tika/langdetect/tika/LanguageIdentifierTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. 
See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.langdetect.tika; @@ -27,7 +25,6 @@ import java.io.Writer; import java.util.HashMap; import java.util.Locale; - import org.apache.commons.io.IOUtils; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; @@ -40,10 +37,11 @@ */ public class LanguageIdentifierTest { - private static final String[] languages = new String[]{ - // TODO - currently Estonian and Greek fail these tests. - // Enable when language detection works better. - "da", "de", /* "et", "el", */ "en", "es", "fi", "fr", "it", "lt", "nl", "pt", "sv"}; + private static final String[] languages = new String[] { + // TODO - currently Estonian and Greek fail these tests. + // Enable when language detection works better. 
+ "da", "de", /* "et", "el", */ "en", "es", "fi", "fr", "it", "lt", "nl", "pt", + "sv"}; @BeforeEach public void setUp() { @@ -112,8 +110,8 @@ public void testPerformance() throws IOException { int detected = 0; // To avoid code removal by JVM or compiler String lastResult = null; for (int m = 0; m < MRUNS; m++) { - LanguageProfile.useInterleaved = - (m & 1) == 1; // Alternate between standard and interleaved + LanguageProfile.useInterleaved = (m & 1) == 1; // Alternate between standard and + // interleaved StringBuilder currentResult = new StringBuilder(); final long start = System.nanoTime(); for (int i = 0; i < IRUNS; i++) { @@ -128,19 +126,20 @@ public void testPerformance() throws IOException { } } System.out.printf(Locale.ROOT, - "Performed %d detections at %2d ms/test with interleaved=%b%n", - languages.length * IRUNS, - (System.nanoTime() - start) / 1000000 / (languages.length * IRUNS), - LanguageProfile.useInterleaved); - if (lastResult != - null) { // Might as well test that they behave the same while we're at it - assertEquals("This result should be equal to the last", lastResult, currentResult.toString()); + "Performed %d detections at %2d ms/test with interleaved=%b%n", + languages.length * IRUNS, + (System.nanoTime() - start) / 1000000 / (languages.length * IRUNS), + LanguageProfile.useInterleaved); + if (lastResult != null) { // Might as well test that they behave the same while we're at + // it + assertEquals("This result should be equal to the last", lastResult, + currentResult.toString()); } lastResult = currentResult.toString(); } if (detected == -1) { System.out.println( - "Never encountered but keep it to guard against over-eager optimization"); + "Never encountered but keep it to guard against over-eager optimization"); } } @@ -157,9 +156,8 @@ public void testMixedLanguages() throws IOException { writeTo(other, writer); LanguageIdentifier identifier = null; identifier = new LanguageIdentifier(writer.getProfile()); - 
assertFalse(identifier.isReasonablyCertain(), - "mix of " + language + " and " + other + " incorrectly detected as " + - identifier); + assertFalse(identifier.isReasonablyCertain(), "mix of " + language + " and " + + other + " incorrectly detected as " + identifier); } } } @@ -176,8 +174,8 @@ public void testEstonia() throws Exception { } private void writeTo(String language, Writer writer) throws IOException { - try (InputStream stream = LanguageIdentifierTest.class - .getResourceAsStream(language + ".test")) { + try (InputStream stream = + LanguageIdentifierTest.class.getResourceAsStream(language + ".test")) { IOUtils.copy(new InputStreamReader(stream, UTF_8), writer); } } diff --git a/tika-langdetect/tika-langdetect-tika/src/test/java/org/apache/tika/langdetect/tika/LanguageProfileTest.java b/tika-langdetect/tika-langdetect-tika/src/test/java/org/apache/tika/langdetect/tika/LanguageProfileTest.java index 612df7c52f..2b60289d8c 100644 --- a/tika-langdetect/tika-langdetect-tika/src/test/java/org/apache/tika/langdetect/tika/LanguageProfileTest.java +++ b/tika-langdetect/tika-langdetect-tika/src/test/java/org/apache/tika/langdetect/tika/LanguageProfileTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.langdetect.tika; @@ -20,7 +18,6 @@ import static org.junit.jupiter.api.Assertions.assertTrue; import java.io.IOException; - import org.junit.jupiter.api.Test; public class LanguageProfileTest { diff --git a/tika-langdetect/tika-langdetect-tika/src/test/java/org/apache/tika/langdetect/tika/LanguageProfilerBuilderTest.java b/tika-langdetect/tika-langdetect-tika/src/test/java/org/apache/tika/langdetect/tika/LanguageProfilerBuilderTest.java index 9f3e907dc8..915e5dd141 100644 --- a/tika-langdetect/tika-langdetect-tika/src/test/java/org/apache/tika/langdetect/tika/LanguageProfilerBuilderTest.java +++ b/tika-langdetect/tika-langdetect-tika/src/test/java/org/apache/tika/langdetect/tika/LanguageProfilerBuilderTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.langdetect.tika; @@ -29,13 +27,11 @@ import java.net.URISyntaxException; import java.nio.file.Files; import java.nio.file.Path; - +import org.apache.tika.exception.TikaException; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; -import org.apache.tika.exception.TikaException; - public class LanguageProfilerBuilderTest { private final String corpusName = "langbuilder/welsh_corpus.txt"; private final String FILE_EXTENSION = "ngp"; @@ -49,7 +45,7 @@ public void setUp() throws Exception { tmpProfileModel = Files.createTempFile("tika-lang", ".ngp"); try (InputStream is = LanguageProfilerBuilderTest.class.getResourceAsStream(corpusName)) { LanguageProfilerBuilder ngramProfileBuilder = - LanguageProfilerBuilder.create(profileName, is, UTF_8.name()); + LanguageProfilerBuilder.create(profileName, is, UTF_8.name()); try (OutputStream os = Files.newOutputStream(tmpProfileModel)) { ngramProfileBuilder.save(os);; assertEquals(maxlen, ngramProfileBuilder.getSorted().size()); @@ -87,7 +83,7 @@ private LanguageProfile loadProfile() throws IOException, TikaException, URISynt // header/comment int space = line.indexOf(' '); langProfile.add(line.substring(0, space), - Long.parseLong(line.substring(space + 1))); + Long.parseLong(line.substring(space + 1))); } line = reader.readLine(); } diff --git a/tika-langdetect/tika-langdetect-tika/src/test/java/org/apache/tika/langdetect/tika/ProfilingHandler.java b/tika-langdetect/tika-langdetect-tika/src/test/java/org/apache/tika/langdetect/tika/ProfilingHandler.java index 6f1b6805d3..0fafecdf1c 100644 --- a/tika-langdetect/tika-langdetect-tika/src/test/java/org/apache/tika/langdetect/tika/ProfilingHandler.java +++ b/tika-langdetect/tika-langdetect-tika/src/test/java/org/apache/tika/langdetect/tika/ProfilingHandler.java @@ -1,26 +1,23 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.langdetect.tika; import org.apache.tika.sax.WriteOutContentHandler; /** - * SAX content handler that builds a language profile based on all the - * received character content. + * SAX content handler that builds a language profile based on all the received character content. * * @since Apache Tika 0.5 */ @@ -42,11 +39,10 @@ public ProfilingHandler() { } /** - * Returns the language profile being built by this content handler. 
- * Note that the returned profile gets updated whenever new SAX events - * are received by this content handler. Use the {@link #getLanguage()} - * method to get the language that best matches the current state of - * the profile. + * Returns the language profile being built by this content handler. Note that the returned + * profile gets updated whenever new SAX events are received by this content handler. Use the + * {@link #getLanguage()} method to get the language that best matches the current state of the + * profile. * * @return language profile */ @@ -55,8 +51,7 @@ public LanguageProfile getProfile() { } /** - * Returns the language that best matches the current state of the - * language profile. + * Returns the language that best matches the current state of the language profile. * * @return language that best matches the current profile */ diff --git a/tika-langdetect/tika-langdetect-tika/src/test/java/org/apache/tika/langdetect/tika/ProfilingWriterTest.java b/tika-langdetect/tika-langdetect-tika/src/test/java/org/apache/tika/langdetect/tika/ProfilingWriterTest.java index beb7c2c667..6bf9dd52ec 100644 --- a/tika-langdetect/tika-langdetect-tika/src/test/java/org/apache/tika/langdetect/tika/ProfilingWriterTest.java +++ b/tika-langdetect/tika-langdetect-tika/src/test/java/org/apache/tika/langdetect/tika/ProfilingWriterTest.java @@ -1,25 +1,22 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. 
See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.langdetect.tika; import static org.junit.jupiter.api.Assertions.assertEquals; import java.io.IOException; - import org.junit.jupiter.api.Test; @Deprecated diff --git a/tika-parent/checkstyle.xml b/tika-parent/checkstyle.xml index 55dc19fe74..cf9928edf4 100644 --- a/tika-parent/checkstyle.xml +++ b/tika-parent/checkstyle.xml @@ -40,7 +40,7 @@ + value="^.*$\n^\W*Licensed to the Apache Software Foundation \(ASF\) under one or more"/> @@ -77,16 +77,6 @@ - - - - - - - - - - @@ -119,25 +109,9 @@ - - - - - - - - - - - - - - - diff --git a/tika-parent/intellij-code-style.xml b/tika-parent/intellij-code-style.xml deleted file mode 100644 index 1026b72560..0000000000 --- a/tika-parent/intellij-code-style.xml +++ /dev/null @@ -1,75 +0,0 @@ - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml index 09b78338c0..73cd6d276b 100644 --- a/tika-parent/pom.xml +++ b/tika-parent/pom.xml @@ -1445,7 +1445,40 @@ - + com.diffplug.spotless + spotless-maven-plugin + 3.0.0 + + + + + 4.26.0 + + ${maven.multiModuleProjectDirectory}/tika-parent/google-style-eclipse.xml + + + + true + 4 + + + + + java,javax,org,com + + + + + + + + apply + + process-sources + + + + org.apache.maven.plugins maven-checkstyle-plugin ${checkstyle.plugin.version} diff --git a/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/envi/EnviHeaderParser.java b/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/envi/EnviHeaderParser.java index 483e258d71..8eeeee9d0f 100644 --- a/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/envi/EnviHeaderParser.java +++ b/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/envi/EnviHeaderParser.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * 
contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
* */ package org.apache.tika.parser.envi; @@ -24,13 +22,7 @@ import java.util.Collections; import java.util.List; import java.util.Set; - import org.apache.commons.io.input.CloseShieldInputStream; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.detect.AutoDetectReader; import org.apache.tika.detect.EncodingDetector; import org.apache.tika.exception.TikaException; @@ -39,6 +31,10 @@ import org.apache.tika.parser.AbstractEncodingDetectorParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.sax.XHTMLContentHandler; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; public class EnviHeaderParser extends AbstractEncodingDetectorParser { @@ -46,7 +42,7 @@ public class EnviHeaderParser extends AbstractEncodingDetectorParser { private static final long serialVersionUID = -1479368523072408091L; private static final Logger LOG = LoggerFactory.getLogger(EnviHeaderParser.class); private static final Set SUPPORTED_TYPES = - Collections.singleton(MediaType.application("envi.hdr")); + Collections.singleton(MediaType.application("envi.hdr")); private List multiLineFieldValueList = new ArrayList<>(); @@ -67,7 +63,7 @@ public Set getSupportedTypes(ParseContext context) { @Override public void parse(InputStream stream, ContentHandler handler, Metadata metadata, - ParseContext context) throws IOException, SAXException, TikaException { + ParseContext context) throws IOException, SAXException, TikaException { // Only outputting the MIME type as metadata metadata.set(Metadata.CONTENT_TYPE, ENVI_MIME_TYPE); @@ -75,7 +71,7 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, // The following code was taken from the TXTParser // Automatically detect the character encoding try (AutoDetectReader reader = new AutoDetectReader(CloseShieldInputStream.wrap(stream), 
- metadata, getEncodingDetector(context))) { + metadata, getEncodingDetector(context))) { Charset charset = reader.getCharset(); // deprecated, see TIKA-431 metadata.set(Metadata.CONTENT_ENCODING, charset.name()); @@ -90,7 +86,7 @@ metadata, getEncodingDetector(context))) { } private void readLines(AutoDetectReader reader, Metadata metadata) - throws IOException, SAXException { + throws IOException, SAXException { // text contents of the xhtml String line; while ((line = reader.readLine()) != null) { @@ -116,21 +112,21 @@ private void writeParagraphAndSetMetadata(String line, Metadata metadata) throws String[] mapInfoValues = parseMapInfoContents(keyValue[1]); if (mapInfoValues[0].equals("UTM")) { metadata.set("envi." + keyValue[0].trim().replace(" ", "."), - keyValue[1].trim()); - String[] latLonStringArray = - convertMapInfoValuesToLatLngAndSetMetadata(mapInfoValues, metadata); - String xhtmlLatLongLine = "lat/lon = { " + latLonStringArray[0] + ", " + - latLonStringArray[1] + " }"; + keyValue[1].trim()); + String[] latLonStringArray = convertMapInfoValuesToLatLngAndSetMetadata( + mapInfoValues, metadata); + String xhtmlLatLongLine = "lat/lon = { " + latLonStringArray[0] + ", " + + latLonStringArray[1] + " }"; xhtml.startElement("p"); xhtml.characters(xhtmlLatLongLine); xhtml.endElement("p"); } else { metadata.set("envi." + keyValue[0].trim().replace(" ", "."), - keyValue[1].trim()); + keyValue[1].trim()); } } else { metadata.set("envi." 
+ keyValue[0].trim().replace(" ", "."), - keyValue[1].trim()); + keyValue[1].trim()); } } } @@ -142,17 +138,18 @@ private void writeParagraphAndSetMetadata(String line, Metadata metadata) throws private String[] parseMapInfoContents(String mapInfoValue) { StringBuilder mapInfoValueStringBuilder = new StringBuilder(); for (int i = 0; i < mapInfoValue.length(); ++i) { - if (mapInfoValue.charAt(i) != '{' && mapInfoValue.charAt(i) != '}' && - mapInfoValue.charAt(i) != ' ') { + if (mapInfoValue.charAt(i) != '{' && mapInfoValue.charAt(i) != '}' + && mapInfoValue.charAt(i) != ' ') { mapInfoValueStringBuilder.append(mapInfoValue.charAt(i)); } } return mapInfoValueStringBuilder.toString().split(","); } - // Conversion logic taken from https://stackoverflow.com/questions/343865/how-to-convert-from-utm-to-latlng-in-python-or-javascript/344083#344083 + // Conversion logic taken from + // https://stackoverflow.com/questions/343865/how-to-convert-from-utm-to-latlng-in-python-or-javascript/344083#344083 private String[] convertMapInfoValuesToLatLngAndSetMetadata(String[] mapInfoValues, - Metadata metadata) { + Metadata metadata) { // Based on the map info data, pixelEasting is at index 3 and pixelNorthing is at index 4 double pixelEasting = Double.parseDouble(mapInfoValues[3].trim()); double pixelNorthing = Double.parseDouble(mapInfoValues[4].trim()); @@ -167,25 +164,24 @@ private String[] convertMapInfoValuesToLatLngAndSetMetadata(String[] mapInfoValu double k0 = 0.9996; double arc = pixelNorthing / k0; - double mu = arc / (a * (1.0 - Math.pow(e, 2.0) / 4.0 - 3.0 * Math.pow(e, 4.0) / 64.0 - - 5.0 * Math.pow(e, 6.0) / 256.0)); + double mu = arc / (a * (1.0 - Math.pow(e, 2.0) / 4.0 - 3.0 * Math.pow(e, 4.0) / 64.0 + - 5.0 * Math.pow(e, 6.0) / 256.0)); - double ei = (1.0 - Math.pow((1.0 - e * e), (1.0 / 2.0))) / - (1.0 + Math.pow((1.0 - e * e), (1.0 / 2.0))); + double ei = (1.0 - Math.pow((1.0 - e * e), (1.0 / 2.0))) + / (1.0 + Math.pow((1.0 - e * e), (1.0 / 2.0))); double ca = 3.0 
* ei / 2.0 - 27.0 * Math.pow(ei, 3.0) / 32.0; double cb = 21.0 * Math.pow(ei, 2.0) / 16.0 - 55.0 * Math.pow(ei, 4.0) / 32.0; double cc = 151.0 * Math.pow(ei, 3.0) / 96.0; double cd = 1097.0 * Math.pow(ei, 4.0) / 512.0; - double phi1 = - mu + ca * Math.sin(2.0 * mu) + cb * Math.sin(4.0 * mu) + cc * Math.sin(6.0 * mu) + - cd * Math.sin(8.0 * mu); + double phi1 = mu + ca * Math.sin(2.0 * mu) + cb * Math.sin(4.0 * mu) + + cc * Math.sin(6.0 * mu) + cd * Math.sin(8.0 * mu); double n0 = a / Math.pow((1.0 - Math.pow((e * Math.sin(phi1)), 2.0)), (1.0 / 2.0)); - double r0 = a * (1.0 - e * e) / - Math.pow((1.0 - Math.pow((e * Math.sin(phi1)), 2.0)), (3.0 / 2.0)); + double r0 = a * (1.0 - e * e) + / Math.pow((1.0 - Math.pow((e * Math.sin(phi1)), 2.0)), (3.0 / 2.0)); double fact1 = n0 * Math.tan(phi1) / r0; double _a1 = 500000.0 - pixelEasting; @@ -193,16 +189,14 @@ private String[] convertMapInfoValuesToLatLngAndSetMetadata(String[] mapInfoValu double fact2 = dd0 * dd0 / 2.0; double t0 = Math.pow(Math.tan(phi1), 2.0); double Q0 = e1sq * Math.pow(Math.cos(phi1), 2.0); - double fact3 = - (5.0 + 3.0 * t0 + 10.0 * Q0 - 4.0 * Q0 * Q0 - 9.0 * e1sq) * Math.pow(dd0, 4.0) / - 24.0; - double fact4 = - (61.0 + 90.0 * t0 + 298.0 * Q0 + 45.0 * t0 * t0 - 252.0 * e1sq - 3.0 * Q0 * Q0) * - Math.pow(dd0, 6.0) / 720.0; + double fact3 = (5.0 + 3.0 * t0 + 10.0 * Q0 - 4.0 * Q0 * Q0 - 9.0 * e1sq) + * Math.pow(dd0, 4.0) / 24.0; + double fact4 = (61.0 + 90.0 * t0 + 298.0 * Q0 + 45.0 * t0 * t0 - 252.0 * e1sq + - 3.0 * Q0 * Q0) * Math.pow(dd0, 6.0) / 720.0; double lof1 = _a1 / (n0 * k0); double lof2 = (1.0 + 2.0 * t0 + Q0) * Math.pow(dd0, 3.0) / 6.0; - double lof3 = (5.0 - 2.0 * Q0 + 28.0 * t0 - 3.0 * Math.pow(Q0, 2.0) + 8.0 * e1sq + - 24.0 * Math.pow(t0, 2.0)) * Math.pow(dd0, 5.0) / 120.0; + double lof3 = (5.0 - 2.0 * Q0 + 28.0 * t0 - 3.0 * Math.pow(Q0, 2.0) + 8.0 * e1sq + + 24.0 * Math.pow(t0, 2.0)) * Math.pow(dd0, 5.0) / 120.0; double _a2 = (lof1 - lof2 + lof3) / Math.cos(phi1); double _a3 = _a2 * 
180.0 / Math.PI; double zoneCM = (zone > 0) ? 6 * zone - 183.0 : 3.0; @@ -210,22 +204,20 @@ private String[] convertMapInfoValuesToLatLngAndSetMetadata(String[] mapInfoValu double longitude = zoneCM - _a3; metadata.set("envi.lat/lon", latitude + ", " + longitude); - return new String[]{Double.toString(latitude), Double.toString(longitude)}; + return new String[] {Double.toString(latitude), Double.toString(longitude)}; } /* - * Enables correct extraction of fiel values which span more - * than one line. Essentially, multi-line fiel values are - * typically enclosed within curly braces, so a primitive - * check it made to ensure the multi-line contents are contained in - * opening and closing braces. + * Enables correct extraction of fiel values which span more than one line. Essentially, + * multi-line fiel values are typically enclosed within curly braces, so a primitive check it + * made to ensure the multi-line contents are contained in opening and closing braces. */ private String parseMultiLineFieldValue(String line) { multiLineFieldValueList.add(line); if (line.endsWith("}")) { return String.join("", multiLineFieldValueList); } else { - //do nothing + // do nothing } return null; diff --git a/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/gdal/GDALParser.java b/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/gdal/GDALParser.java index 1ba1969420..127e5ae21b 100644 --- a/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/gdal/GDALParser.java +++ b/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/gdal/GDALParser.java @@ -1,23 +1,21 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.parser.gdal; -//JDK imports +// JDK imports import static org.apache.tika.parser.external.ExternalParser.INPUT_FILE_TOKEN; @@ -34,12 +32,6 @@ import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.config.Field; import org.apache.tika.config.TikaTaskTimeout; import org.apache.tika.exception.TikaException; @@ -54,24 +46,26 @@ import org.apache.tika.sax.XHTMLContentHandler; import org.apache.tika.utils.FileProcessResult; import org.apache.tika.utils.ProcessUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; -//Tika imports -//SAX imports +// Tika imports +// SAX imports /** - * Wraps execution of the Geospatial Data Abstraction - * Library (GDAL) gdalinfo tool used to extract geospatial - * information out of hundreds of geo file formats. + * Wraps execution of the Geospatial Data Abstraction Library (GDAL) + * gdalinfo tool used to extract geospatial information out of hundreds of geo file + * formats. *

    - * The parser requires the installation of GDAL and for gdalinfo to - * be located on the path. + * The parser requires the installation of GDAL and for gdalinfo to be located on the + * path. *

    - * Basic information (Size, Coordinate System, Bounding Box, Driver, and - * resource info) are extracted as metadata, and the remaining metadata patterns - * are extracted and added. + * Basic information (Size, Coordinate System, Bounding Box, Driver, and resource info) are + * extracted as metadata, and the remaining metadata patterns are extracted and added. *

    - * The output of the command is available from the provided - * {@link ContentHandler} in the + * The output of the command is available from the provided {@link ContentHandler} in the * {@link #parse(InputStream, ContentHandler, Metadata, ParseContext)} method. */ public class GDALParser implements Parser { @@ -81,66 +75,89 @@ public class GDALParser implements Parser { public static final long DEFAULT_TIMEOUT_MS = 60000; - private static final Set SUPPORTED_TYPES = Collections.unmodifiableSet(new HashSet<>( - Arrays.asList(MediaType.application("x-netcdf"), MediaType.application("vrt"), - MediaType.image("geotiff"), MediaType.image("nitf"), - MediaType.application("x-rpf-toc"), MediaType.application("x-ecrg-toc"), - MediaType.image("hfa"), MediaType.image("sar-ceos"), MediaType.image("ceos"), - MediaType.application("jaxa-pal-sar"), MediaType.application("gff"), - MediaType.application("elas"), MediaType.application("aig"), - MediaType.application("aaigrid"), MediaType.application("grass-ascii-grid"), - MediaType.application("sdts-raster"), MediaType.application("dted"), - MediaType.image("png"), MediaType.image("jpeg"), MediaType.image("raster"), - MediaType.application("jdem"), MediaType.image("gif"), - MediaType.image("big-gif"), MediaType.image("envisat"), MediaType.image("fits"), - MediaType.application("fits"), MediaType.image("bsb"), - MediaType.application("xpm"), MediaType.image("bmp"), - MediaType.image("x-dimap"), MediaType.image("x-airsar"), - MediaType.application("x-rs2"), MediaType.application("x-pcidsk"), - MediaType.application("pcisdk"), MediaType.image("x-pcraster"), - MediaType.image("ilwis"), MediaType.image("sgi"), - MediaType.application("x-srtmhgt"), MediaType.application("leveller"), - MediaType.application("terragen"), MediaType.application("x-gmt"), - MediaType.application("x-isis3"), MediaType.application("x-isis2"), - MediaType.application("x-pds"), MediaType.application("x-til"), - MediaType.application("x-ers"), 
MediaType.application("x-l1b"), - MediaType.image("fit"), MediaType.application("x-grib"), MediaType.image("jp2"), - MediaType.application("x-rmf"), MediaType.application("x-wcs"), - MediaType.application("x-wms"), MediaType.application("x-msgn"), - MediaType.application("x-wms"), MediaType.application("x-wms"), - MediaType.application("x-rst"), MediaType.application("x-ingr"), - MediaType.application("x-gsag"), MediaType.application("x-gsbg"), - MediaType.application("x-gs7bg"), MediaType.application("x-cosar"), - MediaType.application("x-tsx"), MediaType.application("x-coasp"), - MediaType.application("x-r"), MediaType.application("x-map"), - MediaType.application("x-pnm"), MediaType.application("x-doq1"), - MediaType.application("x-doq2"), MediaType.application("x-envi"), - MediaType.application("x-envi-hdr"), MediaType.application("x-generic-bin"), - MediaType.application("x-p-aux"), MediaType.image("x-mff"), - MediaType.image("x-mff2"), MediaType.image("x-fujibas"), - MediaType.application("x-gsc"), MediaType.application("x-fast"), - MediaType.application("x-bt"), MediaType.application("x-lan"), - MediaType.application("x-cpg"), MediaType.image("ida"), - MediaType.application("x-ndf"), MediaType.image("eir"), - MediaType.application("x-dipex"), MediaType.application("x-lcp"), - MediaType.application("x-gtx"), MediaType.application("x-los-las"), - MediaType.application("x-ntv2"), MediaType.application("x-ctable2"), - MediaType.application("x-ace2"), MediaType.application("x-snodas"), - MediaType.application("x-kro"), MediaType.image("arg"), - MediaType.application("x-rik"), MediaType.application("x-usgs-dem"), - MediaType.application("x-gxf"), MediaType.application("x-dods"), - MediaType.application("x-http"), MediaType.application("x-bag"), - MediaType.application("x-hdf"), MediaType.image("x-hdf5-image"), - MediaType.application("x-nwt-grd"), MediaType.application("x-nwt-grc"), - MediaType.image("adrg"), MediaType.image("x-srp"), - 
MediaType.application("x-blx"), MediaType.application("x-rasterlite"), - MediaType.application("x-epsilon"), MediaType.application("x-sdat"), - MediaType.application("x-kml"), MediaType.application("x-xyz"), - MediaType.application("x-geo-pdf"), MediaType.image("x-ozi"), - MediaType.application("x-ctg"), MediaType.application("x-e00-grid"), - MediaType.application("x-zmap"), MediaType.application("x-webp"), - MediaType.application("x-ngs-geoid"), MediaType.application("x-mbtiles"), - MediaType.application("x-ppi"), MediaType.application("x-cappi")))); + private static final Set SUPPORTED_TYPES = Collections + .unmodifiableSet(new HashSet<>(Arrays.asList(MediaType.application("x-netcdf"), + MediaType.application("vrt"), MediaType.image("geotiff"), + MediaType.image("nitf"), MediaType.application("x-rpf-toc"), + MediaType.application("x-ecrg-toc"), MediaType.image("hfa"), + MediaType.image("sar-ceos"), MediaType.image("ceos"), + MediaType.application("jaxa-pal-sar"), + MediaType.application("gff"), MediaType.application("elas"), + MediaType.application("aig"), MediaType.application("aaigrid"), + MediaType.application("grass-ascii-grid"), + MediaType.application("sdts-raster"), + MediaType.application("dted"), MediaType.image("png"), + MediaType.image("jpeg"), MediaType.image("raster"), + MediaType.application("jdem"), MediaType.image("gif"), + MediaType.image("big-gif"), MediaType.image("envisat"), + MediaType.image("fits"), MediaType.application("fits"), + MediaType.image("bsb"), MediaType.application("xpm"), + MediaType.image("bmp"), MediaType.image("x-dimap"), + MediaType.image("x-airsar"), MediaType.application("x-rs2"), + MediaType.application("x-pcidsk"), + MediaType.application("pcisdk"), MediaType.image("x-pcraster"), + MediaType.image("ilwis"), MediaType.image("sgi"), + MediaType.application("x-srtmhgt"), + MediaType.application("leveller"), + MediaType.application("terragen"), + MediaType.application("x-gmt"), + MediaType.application("x-isis3"), + 
MediaType.application("x-isis2"), + MediaType.application("x-pds"), MediaType.application("x-til"), + MediaType.application("x-ers"), MediaType.application("x-l1b"), + MediaType.image("fit"), MediaType.application("x-grib"), + MediaType.image("jp2"), MediaType.application("x-rmf"), + MediaType.application("x-wcs"), MediaType.application("x-wms"), + MediaType.application("x-msgn"), MediaType.application("x-wms"), + MediaType.application("x-wms"), MediaType.application("x-rst"), + MediaType.application("x-ingr"), + MediaType.application("x-gsag"), + MediaType.application("x-gsbg"), + MediaType.application("x-gs7bg"), + MediaType.application("x-cosar"), + MediaType.application("x-tsx"), + MediaType.application("x-coasp"), MediaType.application("x-r"), + MediaType.application("x-map"), MediaType.application("x-pnm"), + MediaType.application("x-doq1"), + MediaType.application("x-doq2"), + MediaType.application("x-envi"), + MediaType.application("x-envi-hdr"), + MediaType.application("x-generic-bin"), + MediaType.application("x-p-aux"), MediaType.image("x-mff"), + MediaType.image("x-mff2"), MediaType.image("x-fujibas"), + MediaType.application("x-gsc"), MediaType.application("x-fast"), + MediaType.application("x-bt"), MediaType.application("x-lan"), + MediaType.application("x-cpg"), MediaType.image("ida"), + MediaType.application("x-ndf"), MediaType.image("eir"), + MediaType.application("x-dipex"), + MediaType.application("x-lcp"), MediaType.application("x-gtx"), + MediaType.application("x-los-las"), + MediaType.application("x-ntv2"), + MediaType.application("x-ctable2"), + MediaType.application("x-ace2"), + MediaType.application("x-snodas"), + MediaType.application("x-kro"), MediaType.image("arg"), + MediaType.application("x-rik"), + MediaType.application("x-usgs-dem"), + MediaType.application("x-gxf"), MediaType.application("x-dods"), + MediaType.application("x-http"), MediaType.application("x-bag"), + MediaType.application("x-hdf"), MediaType.image("x-hdf5-image"), + 
MediaType.application("x-nwt-grd"), + MediaType.application("x-nwt-grc"), MediaType.image("adrg"), + MediaType.image("x-srp"), MediaType.application("x-blx"), + MediaType.application("x-rasterlite"), + MediaType.application("x-epsilon"), + MediaType.application("x-sdat"), MediaType.application("x-kml"), + MediaType.application("x-xyz"), + MediaType.application("x-geo-pdf"), MediaType.image("x-ozi"), + MediaType.application("x-ctg"), + MediaType.application("x-e00-grid"), + MediaType.application("x-zmap"), + MediaType.application("x-webp"), + MediaType.application("x-ngs-geoid"), + MediaType.application("x-mbtiles"), + MediaType.application("x-ppi"), + MediaType.application("x-cappi")))); private String command; @@ -184,7 +201,7 @@ public Set getSupportedTypes(ParseContext context) { @Override public void parse(InputStream stream, ContentHandler handler, Metadata metadata, - ParseContext context) throws IOException, SAXException, TikaException { + ParseContext context) throws IOException, SAXException, TikaException { if (!ExternalParser.check("gdalinfo")) { return; @@ -199,7 +216,7 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, long localTimeoutMillis = TikaTaskTimeout.getTimeoutMillis(context, timeoutMs); FileProcessResult result = ProcessUtils.execute(new ProcessBuilder(runCommand), - localTimeoutMillis, maxStdOut, maxStdErr); + localTimeoutMillis, maxStdOut, maxStdErr); metadata.set(ExternalProcess.IS_TIMEOUT, result.isTimeout()); metadata.set(ExternalProcess.EXIT_VALUE, result.getExitValue()); @@ -254,7 +271,8 @@ private void addPatternWithIs(String name, Map patterns) { private void addBoundingBoxPattern(String name, Map patterns) { patterns.put(Pattern.compile( - name + "\\s*\\(\\s*([0-9]+\\.[0-9]+\\s*,\\s*[0-9]+\\.[0-9]+\\s*)\\)\\s*"), name); + name + "\\s*\\(\\s*([0-9]+\\.[0-9]+\\s*,\\s*[0-9]+\\.[0-9]+\\s*)\\)\\s*"), + name); } private void extractMetFromOutput(String output, Metadata met) { @@ -298,14 +316,15 @@ private 
boolean hasHeadings(String line, String[] headings) { } private void applyPatternsToOutput(String output, Metadata metadata, - Map metadataPatterns) { + Map metadataPatterns) { try (Scanner scanner = new Scanner(output)) { while (scanner.hasNextLine()) { String line = scanner.nextLine(); for (Pattern p : metadataPatterns.keySet()) { Matcher m = p.matcher(line); if (m.find()) { - if (metadataPatterns.get(p) != null && !metadataPatterns.get(p).equals("")) { + if (metadataPatterns.get(p) != null + && !metadataPatterns.get(p).equals("")) { metadata.add(metadataPatterns.get(p), m.group(1)); } else { metadata.add(m.group(1), m.group(2)); @@ -317,7 +336,7 @@ private void applyPatternsToOutput(String output, Metadata metadata, } private void processOutput(ContentHandler handler, Metadata metadata, String output) - throws SAXException, IOException { + throws SAXException, IOException { XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); try (Reader reader = new StringReader(output)) { xhtml.startDocument(); diff --git a/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/geoinfo/GeographicInformationParser.java b/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/geoinfo/GeographicInformationParser.java index beabe65935..aed84a586b 100644 --- a/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/geoinfo/GeographicInformationParser.java +++ b/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/geoinfo/GeographicInformationParser.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.parser.geoinfo; @@ -27,7 +25,6 @@ import java.util.Locale; import java.util.Map; import java.util.Set; - import org.apache.sis.metadata.iso.DefaultMetadata; import org.apache.sis.metadata.iso.DefaultMetadataScope; import org.apache.sis.metadata.iso.constraint.DefaultLegalConstraints; @@ -39,6 +36,15 @@ import org.apache.sis.storage.DataStores; import org.apache.sis.storage.UnsupportedStorageException; import org.apache.sis.util.collection.CodeListSet; +import org.apache.tika.exception.TikaException; +import org.apache.tika.io.TemporaryResources; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.apache.tika.sax.XHTMLContentHandler; +import org.apache.tika.utils.DateUtils; import org.opengis.metadata.Identifier; import org.opengis.metadata.citation.Citation; import org.opengis.metadata.citation.CitationDate; @@ -61,23 +67,13 @@ import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; -import org.apache.tika.exception.TikaException; -import org.apache.tika.io.TemporaryResources; -import org.apache.tika.io.TikaInputStream; -import org.apache.tika.metadata.Metadata; -import org.apache.tika.mime.MediaType; -import org.apache.tika.parser.ParseContext; -import org.apache.tika.parser.Parser; -import org.apache.tika.sax.XHTMLContentHandler; -import org.apache.tika.utils.DateUtils; - public class GeographicInformationParser implements Parser { public static final String geoInfoType = "text/iso19139+xml"; private static final Logger LOG = LoggerFactory.getLogger(GeographicInformationParser.class); private final Set SUPPORTED_TYPES = - Collections.singleton(MediaType.text("iso19139+xml")); + Collections.singleton(MediaType.text("iso19139+xml")); @Override @@ -87,12 +83,12 @@ public Set getSupportedTypes(ParseContext parseContext) { @Override public void parse(InputStream 
inputStream, ContentHandler contentHandler, Metadata metadata, - ParseContext parseContext) throws IOException, SAXException, TikaException { + ParseContext parseContext) throws IOException, SAXException, TikaException { metadata.set(Metadata.CONTENT_TYPE, geoInfoType); XHTMLContentHandler xhtmlContentHandler = new XHTMLContentHandler(contentHandler, metadata); - TemporaryResources tmp = - TikaInputStream.isTikaInputStream(inputStream) ? null : new TemporaryResources(); + TemporaryResources tmp = TikaInputStream.isTikaInputStream(inputStream) ? null + : new TemporaryResources(); try (TikaInputStream tikaInputStream = TikaInputStream.get(inputStream, tmp, metadata)) { File file = tikaInputStream.getFile(); try (DataStore dataStore = DataStores.open(file)) { @@ -112,7 +108,7 @@ public void parse(InputStream inputStream, ContentHandler contentHandler, Metada } private void extract(XHTMLContentHandler xhtmlContentHandler, Metadata metadata, - DefaultMetadata defaultMetadata) throws SAXException { + DefaultMetadata defaultMetadata) throws SAXException { getMetaDataCharacterSet(metadata, defaultMetadata); getMetaDataContact(metadata, defaultMetadata); getMetaDataIdentificationInfo(metadata, defaultMetadata); @@ -126,28 +122,28 @@ private void extract(XHTMLContentHandler xhtmlContentHandler, Metadata metadata, } private void extractContent(XHTMLContentHandler xhtmlContentHandler, - DefaultMetadata defaultMetadata) throws SAXException { + DefaultMetadata defaultMetadata) throws SAXException { xhtmlContentHandler.startDocument(); xhtmlContentHandler.newline(); xhtmlContentHandler.newline(); ArrayList identifications = - (ArrayList) defaultMetadata.getIdentificationInfo(); + (ArrayList) defaultMetadata.getIdentificationInfo(); for (Identification i : identifications) { xhtmlContentHandler.startElement("h1"); xhtmlContentHandler.characters(i.getCitation().getTitle().toString()); xhtmlContentHandler.endElement("h1"); xhtmlContentHandler.newline(); - ArrayList 
responsiblePartyArrayList = - (ArrayList) i.getCitation().getCitedResponsibleParties(); + ArrayList responsiblePartyArrayList = (ArrayList) i + .getCitation().getCitedResponsibleParties(); for (ResponsibleParty r : responsiblePartyArrayList) { xhtmlContentHandler.startElement("h3"); xhtmlContentHandler.newline(); xhtmlContentHandler - .characters("CitedResponsiblePartyRole " + r.getRole().toString()); + .characters("CitedResponsiblePartyRole " + r.getRole().toString()); xhtmlContentHandler - .characters("CitedResponsiblePartyName " + r.getIndividualName()); + .characters("CitedResponsiblePartyName " + r.getIndividualName()); xhtmlContentHandler.endElement("h3"); xhtmlContentHandler.newline(); } @@ -155,13 +151,13 @@ private void extractContent(XHTMLContentHandler xhtmlContentHandler, xhtmlContentHandler.startElement("p"); xhtmlContentHandler.newline(); xhtmlContentHandler - .characters("IdentificationInfoAbstract " + i.getAbstract().toString()); + .characters("IdentificationInfoAbstract " + i.getAbstract().toString()); xhtmlContentHandler.endElement("p"); xhtmlContentHandler.newline(); Collection extentList = ((DefaultDataIdentification) i).getExtents(); for (Extent e : extentList) { ArrayList geoElements = - (ArrayList) e.getGeographicElements(); + (ArrayList) e.getGeographicElements(); for (GeographicExtent g : geoElements) { if (g instanceof DefaultGeographicBoundingBox) { @@ -170,8 +166,9 @@ private void extractContent(XHTMLContentHandler xhtmlContentHandler, xhtmlContentHandler.characters("GeographicElementWestBoundLatitude"); xhtmlContentHandler.endElement("td"); xhtmlContentHandler.startElement("td"); - xhtmlContentHandler.characters(String.valueOf( - ((DefaultGeographicBoundingBox) g).getWestBoundLongitude())); + xhtmlContentHandler.characters( + String.valueOf(((DefaultGeographicBoundingBox) g) + .getWestBoundLongitude())); xhtmlContentHandler.endElement("td"); xhtmlContentHandler.endElement("tr"); xhtmlContentHandler.startElement("tr"); @@ -179,8 +176,9 
@@ private void extractContent(XHTMLContentHandler xhtmlContentHandler, xhtmlContentHandler.characters("GeographicElementEastBoundLatitude"); xhtmlContentHandler.endElement("td"); xhtmlContentHandler.startElement("td"); - xhtmlContentHandler.characters(String.valueOf( - ((DefaultGeographicBoundingBox) g).getEastBoundLongitude())); + xhtmlContentHandler.characters( + String.valueOf(((DefaultGeographicBoundingBox) g) + .getEastBoundLongitude())); xhtmlContentHandler.endElement("td"); xhtmlContentHandler.endElement("tr"); xhtmlContentHandler.startElement("tr"); @@ -188,8 +186,9 @@ private void extractContent(XHTMLContentHandler xhtmlContentHandler, xhtmlContentHandler.characters("GeographicElementNorthBoundLatitude"); xhtmlContentHandler.endElement("td"); xhtmlContentHandler.startElement("td"); - xhtmlContentHandler.characters(String.valueOf( - ((DefaultGeographicBoundingBox) g).getNorthBoundLatitude())); + xhtmlContentHandler.characters( + String.valueOf(((DefaultGeographicBoundingBox) g) + .getNorthBoundLatitude())); xhtmlContentHandler.endElement("td"); xhtmlContentHandler.endElement("tr"); xhtmlContentHandler.startElement("tr"); @@ -197,8 +196,9 @@ private void extractContent(XHTMLContentHandler xhtmlContentHandler, xhtmlContentHandler.characters("GeographicElementSouthBoundLatitude"); xhtmlContentHandler.endElement("td"); xhtmlContentHandler.startElement("td"); - xhtmlContentHandler.characters(String.valueOf( - ((DefaultGeographicBoundingBox) g).getSouthBoundLatitude())); + xhtmlContentHandler.characters( + String.valueOf(((DefaultGeographicBoundingBox) g) + .getSouthBoundLatitude())); xhtmlContentHandler.endElement("td"); xhtmlContentHandler.endElement("tr"); } @@ -219,7 +219,7 @@ private void getMetaDataCharacterSet(Metadata metadata, DefaultMetadata defaultM private void getMetaDataContact(Metadata metadata, DefaultMetadata defaultMetaData) { Collection contactSet = - (Collection) defaultMetaData.getContacts(); + (Collection) defaultMetaData.getContacts(); for 
(ResponsibleParty rparty : contactSet) { if (rparty.getRole() != null) { metadata.add("ContactRole", rparty.getRole().name()); @@ -232,24 +232,24 @@ private void getMetaDataContact(Metadata metadata, DefaultMetadata defaultMetaDa private void getMetaDataIdentificationInfo(Metadata metadata, DefaultMetadata defaultMetaData) { ArrayList identifications = - (ArrayList) defaultMetaData.getIdentificationInfo(); + (ArrayList) defaultMetaData.getIdentificationInfo(); for (Identification i : identifications) { DefaultDataIdentification defaultDataIdentification = (DefaultDataIdentification) i; if (i.getCitation() != null && i.getCitation().getTitle() != null) { metadata.add("IdentificationInfoCitationTitle ", - i.getCitation().getTitle().toString()); + i.getCitation().getTitle().toString()); } ArrayList dateArrayList = - (ArrayList) i.getCitation().getDates(); + (ArrayList) i.getCitation().getDates(); for (CitationDate d : dateArrayList) { if (d.getDateType() != null) { String date = DateUtils.formatDate(d.getDate()); metadata.add("CitationDate ", d.getDateType().name() + "-->" + date); } } - ArrayList responsiblePartyArrayList = - (ArrayList) i.getCitation().getCitedResponsibleParties(); + ArrayList responsiblePartyArrayList = (ArrayList) i + .getCitation().getCitedResponsibleParties(); for (ResponsibleParty r : responsiblePartyArrayList) { if (r.getRole() != null) { metadata.add("CitedResponsiblePartyRole ", r.getRole().toString()); @@ -259,11 +259,11 @@ private void getMetaDataIdentificationInfo(Metadata metadata, DefaultMetadata de } if (r.getOrganisationName() != null) { metadata.add("CitedResponsiblePartyOrganizationName ", - r.getOrganisationName().toString()); + r.getOrganisationName().toString()); } if (r.getPositionName() != null) { metadata.add("CitedResponsiblePartyPositionName ", - r.getPositionName().toString()); + r.getPositionName().toString()); } if (r.getContactInfo() != null) { @@ -282,16 +282,17 @@ private void getMetaDataIdentificationInfo(Metadata 
metadata, DefaultMetadata de for (Format f : formatArrayList) { if (f.getName() != null) { metadata.add("ResourceFormatSpecificationAlternativeTitle ", - f.getName().toString()); + f.getName().toString()); } } Map localeCharsetMap = - defaultDataIdentification.getLocalesAndCharsets(); + defaultDataIdentification.getLocalesAndCharsets(); for (Locale l : localeCharsetMap.keySet()) { metadata.add("IdentificationInfoLanguage-->", l.getDisplayLanguage(Locale.ENGLISH)); } CodeListSet categoryList = - (CodeListSet) defaultDataIdentification.getTopicCategories(); + (CodeListSet) defaultDataIdentification + .getTopicCategories(); for (TopicCategory t : categoryList) { metadata.add("IdentificationInfoTopicCategory-->", t.name()); } @@ -300,7 +301,7 @@ private void getMetaDataIdentificationInfo(Metadata metadata, DefaultMetadata de for (Keywords k : keywordList) { j++; ArrayList stringList = - (ArrayList) k.getKeywords(); + (ArrayList) k.getKeywords(); for (InternationalString s : stringList) { metadata.add("Keywords " + j, s.toString()); } @@ -309,16 +310,16 @@ private void getMetaDataIdentificationInfo(Metadata metadata, DefaultMetadata de } if (k.getThesaurusName() != null && k.getThesaurusName().getTitle() != null) { metadata.add("ThesaurusNameTitle " + j, - k.getThesaurusName().getTitle().toString()); + k.getThesaurusName().getTitle().toString()); } - if (k.getThesaurusName() != null && - k.getThesaurusName().getAlternateTitles() != null) { + if (k.getThesaurusName() != null + && k.getThesaurusName().getAlternateTitles() != null) { metadata.add("ThesaurusNameAlternativeTitle " + j, - k.getThesaurusName().getAlternateTitles().toString()); + k.getThesaurusName().getAlternateTitles().toString()); } ArrayList citationDates = - (ArrayList) k.getThesaurusName().getDates(); + (ArrayList) k.getThesaurusName().getDates(); for (CitationDate cd : citationDates) { if (cd.getDateType() != null) { String date = DateUtils.formatDate(cd.getDate()); @@ -327,7 +328,7 @@ private void 
getMetaDataIdentificationInfo(Metadata metadata, DefaultMetadata de } } ArrayList constraintList = - (ArrayList) i.getResourceConstraints(); + (ArrayList) i.getResourceConstraints(); for (DefaultLegalConstraints c : constraintList) { for (Restriction r : c.getAccessConstraints()) { @@ -344,38 +345,43 @@ private void getMetaDataIdentificationInfo(Metadata metadata, DefaultMetadata de Collection extentList = ((DefaultDataIdentification) i).getExtents(); for (Extent e : extentList) { ArrayList geoElements = - (ArrayList) e.getGeographicElements(); + (ArrayList) e.getGeographicElements(); for (GeographicExtent g : geoElements) { if (g instanceof DefaultGeographicDescription) { - if (((DefaultGeographicDescription) g).getGeographicIdentifier() != null && - ((DefaultGeographicDescription) g).getGeographicIdentifier() - .getCode() != null) { + if (((DefaultGeographicDescription) g).getGeographicIdentifier() != null + && ((DefaultGeographicDescription) g) + .getGeographicIdentifier() + .getCode() != null) { metadata.add("GeographicIdentifierCode ", - ((DefaultGeographicDescription) g).getGeographicIdentifier() - .getCode()); + ((DefaultGeographicDescription) g) + .getGeographicIdentifier().getCode()); } - if (((DefaultGeographicDescription) g).getGeographicIdentifier() != null && - ((DefaultGeographicDescription) g).getGeographicIdentifier() - .getAuthority() != null && - ((DefaultGeographicDescription) g).getGeographicIdentifier() - .getAuthority().getTitle() != null) { + if (((DefaultGeographicDescription) g).getGeographicIdentifier() != null + && ((DefaultGeographicDescription) g) + .getGeographicIdentifier() + .getAuthority() != null + && ((DefaultGeographicDescription) g) + .getGeographicIdentifier().getAuthority() + .getTitle() != null) { metadata.add("GeographicIdentifierAuthorityTitle ", - ((DefaultGeographicDescription) g).getGeographicIdentifier() - .getAuthority().getTitle().toString()); + ((DefaultGeographicDescription) g) + .getGeographicIdentifier() + 
.getAuthority().getTitle().toString()); } for (InternationalString s : ((DefaultGeographicDescription) g) - .getGeographicIdentifier().getAuthority().getAlternateTitles()) { + .getGeographicIdentifier().getAuthority() + .getAlternateTitles()) { metadata.add("GeographicIdentifierAuthorityAlternativeTitle ", - s.toString()); + s.toString()); } for (CitationDate cd : ((DefaultGeographicDescription) g) - .getGeographicIdentifier().getAuthority().getDates()) { + .getGeographicIdentifier().getAuthority().getDates()) { if (cd.getDateType() != null && cd.getDate() != null) { String date = DateUtils.formatDate(cd.getDate()); metadata.add("GeographicIdentifierAuthorityDate ", - cd.getDateType().name() + " " + date); + cd.getDateType().name() + " " + date); } } } @@ -387,31 +393,31 @@ private void getMetaDataIdentificationInfo(Metadata metadata, DefaultMetadata de private void getMetaDataDistributionInfo(Metadata metadata, DefaultMetadata defaultMetaData) { Distribution distribution = defaultMetaData.getDistributionInfo(); ArrayList distributionFormat = - (ArrayList) distribution.getDistributionFormats(); + (ArrayList) distribution.getDistributionFormats(); for (Format f : distributionFormat) { if (f.getName() != null) { metadata.add("DistributionFormatSpecificationAlternativeTitle ", - f.getName().toString()); + f.getName().toString()); } } ArrayList distributorList = - (ArrayList) distribution.getDistributors(); + (ArrayList) distribution.getDistributors(); for (Distributor d : distributorList) { - if (d != null && d.getDistributorContact() != null && - d.getDistributorContact().getRole() != null) { + if (d != null && d.getDistributorContact() != null + && d.getDistributorContact().getRole() != null) { metadata.add("Distributor Contact ", d.getDistributorContact().getRole().name()); } - if (d != null && d.getDistributorContact() != null && - d.getDistributorContact().getOrganisationName() != null) { + if (d != null && d.getDistributorContact() != null + && 
d.getDistributorContact().getOrganisationName() != null) { metadata.add("Distributor Organization Name ", - d.getDistributorContact().getOrganisationName().toString()); + d.getDistributorContact().getOrganisationName().toString()); } } ArrayList transferOptionsList = - (ArrayList) distribution.getTransferOptions(); + (ArrayList) distribution.getTransferOptions(); for (DigitalTransferOptions d : transferOptionsList) { ArrayList onlineResourceList = - (ArrayList) d.getOnLines(); + (ArrayList) d.getOnLines(); for (OnlineResource or : onlineResourceList) { if (or.getLinkage() != null) { metadata.add("TransferOptionsOnlineLinkage ", or.getLinkage().toString()); @@ -427,7 +433,7 @@ private void getMetaDataDistributionInfo(Metadata metadata, DefaultMetadata defa } if (or.getDescription() != null) { metadata.add("TransferOptionsOnlineDescription ", - or.getDescription().toString()); + or.getDescription().toString()); } if (or.getFunction() != null) { metadata.add("TransferOptionsOnlineFunction ", or.getFunction().name()); @@ -439,7 +445,7 @@ private void getMetaDataDistributionInfo(Metadata metadata, DefaultMetadata defa private void getMetaDataDateInfo(Metadata metadata, DefaultMetadata defaultMetaData) { ArrayList citationDateList = - (ArrayList) defaultMetaData.getDateInfo(); + (ArrayList) defaultMetaData.getDateInfo(); for (CitationDate c : citationDateList) { if (c.getDateType() != null) { String date = DateUtils.formatDate(c.getDate()); @@ -450,7 +456,7 @@ private void getMetaDataDateInfo(Metadata metadata, DefaultMetadata defaultMetaD private void getMetaDataResourceScope(Metadata metadata, DefaultMetadata defaultMetaData) { ArrayList scopeList = - (ArrayList) defaultMetaData.getMetadataScopes(); + (ArrayList) defaultMetaData.getMetadataScopes(); for (DefaultMetadataScope d : scopeList) { if (d.getResourceScope() != null) { metadata.add("MetaDataResourceScope ", d.getResourceScope().name()); @@ -459,7 +465,7 @@ private void getMetaDataResourceScope(Metadata 
metadata, DefaultMetadata default } private void getMetaDataParentMetaDataTitle(Metadata metadata, - DefaultMetadata defaultMetaData) { + DefaultMetadata defaultMetaData) { Citation parentMetaData = defaultMetaData.getParentMetadata(); if (parentMetaData != null && parentMetaData.getTitle() != null) { metadata.add("ParentMetaDataTitle", parentMetaData.getTitle().toString()); @@ -475,7 +481,7 @@ private void getMetaDataIdetifierCode(Metadata metadata, DefaultMetadata default private void getMetaDataStandard(Metadata metadata, DefaultMetadata defaultMetaData) { ArrayList citationList = - (ArrayList) defaultMetaData.getMetadataStandards(); + (ArrayList) defaultMetaData.getMetadataStandards(); for (Citation c : citationList) { if (c.getTitle() != null) { metadata.add("MetaDataStandardTitle ", c.getTitle().toString()); diff --git a/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/grib/GribParser.java b/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/grib/GribParser.java index b974a481e6..283f2d0399 100644 --- a/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/grib/GribParser.java +++ b/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/grib/GribParser.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. 
See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.parser.grib; @@ -24,16 +22,7 @@ import java.nio.file.StandardCopyOption; import java.util.Collections; import java.util.Set; - import org.apache.commons.io.FileUtils; -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; -import ucar.nc2.Attribute; -import ucar.nc2.Dimension; -import ucar.nc2.NetcdfFile; -import ucar.nc2.Variable; -import ucar.nc2.dataset.NetcdfDataset; - import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.Property; @@ -42,26 +31,33 @@ import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; +import ucar.nc2.Attribute; +import ucar.nc2.Dimension; +import ucar.nc2.NetcdfFile; +import ucar.nc2.Variable; +import ucar.nc2.dataset.NetcdfDataset; public class GribParser implements Parser { public static final String GRIB_MIME_TYPE = "application/x-grib2"; private static final long serialVersionUID = 7855458954474247655L; private final Set SUPPORTED_TYPES = - Collections.singleton(MediaType.application("x-grib2")); + Collections.singleton(MediaType.application("x-grib2")); public Set getSupportedTypes(ParseContext context) { return SUPPORTED_TYPES; } public void parse(InputStream stream, ContentHandler handler, Metadata metadata, - ParseContext context) throws IOException, SAXException, TikaException { + ParseContext context) throws IOException, SAXException, TikaException { - //Set MIME type as grib2 + // Set MIME type as grib2 metadata.set(Metadata.CONTENT_TYPE, GRIB_MIME_TYPE); - //grib was not cleaning up its temp files no matter what we tried - //this is a work around the creates a temp directory then copies the full input file - //into that tmp directory. We then delete the directory in the finally statement. 
+ // grib was not cleaning up its temp files no matter what we tried + // this is a work around the creates a temp directory then copies the full input file + // into that tmp directory. We then delete the directory in the finally statement. Path tmpDir = Files.createTempDirectory("tika-grib-"); try { @@ -92,8 +88,8 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, xhtml.newline(); for (Dimension dim : ncFile.getDimensions()) { - xhtml.element("li", - dim.getFullName() + "=" + String.valueOf(dim.getLength()) + ";"); + xhtml.element("li", dim.getFullName() + "=" + String.valueOf(dim.getLength()) + + ";"); xhtml.newline(); } @@ -102,8 +98,8 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, xhtml.newline(); for (Variable var : ncFile.getVariables()) { - xhtml.element("p", - String.valueOf(var.getDataType()) + var.getNameAndDimensions() + ";"); + xhtml.element("p", String.valueOf(var.getDataType()) + + var.getNameAndDimensions() + ";"); for (Attribute element : var.getAttributes()) { xhtml.element("li", " :" + element + ";"); xhtml.newline(); diff --git a/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/hdf/HDFParser.java b/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/hdf/HDFParser.java index 14098f3b67..e87ce97f75 100644 --- a/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/hdf/HDFParser.java +++ b/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/hdf/HDFParser.java @@ -1,37 +1,28 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.parser.hdf; -//JDK imports +// JDK imports import java.io.IOException; import java.io.InputStream; import java.util.Collections; import java.util.Set; - import org.apache.commons.io.IOUtils; import org.apache.commons.io.output.UnsynchronizedByteArrayOutputStream; -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; -import ucar.nc2.Attribute; -import ucar.nc2.Group; -import ucar.nc2.NetcdfFile; - import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; @@ -40,13 +31,18 @@ import org.apache.tika.parser.Parser; import org.apache.tika.parser.netcdf.NetCDFParser; import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; +import ucar.nc2.Attribute; +import ucar.nc2.Group; +import ucar.nc2.NetcdfFile; /** - * Since the {@link NetCDFParser} depends on the NetCDF-Java API, - * we are able to use it to parse HDF files as well. See this link for more information. + * Since the {@link NetCDFParser} depends on the + * NetCDF-Java API, we are able to + * use it to parse HDF files as well. See + * this link + * for more information. 
*/ public class HDFParser implements Parser { @@ -56,13 +52,12 @@ public class HDFParser implements Parser { private static final long serialVersionUID = 1091208208003437549L; private static final Set SUPPORTED_TYPES = - Collections.singleton(MediaType.application("x-hdf")); + Collections.singleton(MediaType.application("x-hdf")); /* * (non-Javadoc) * - * @see - * org.apache.tika.parser.netcdf.NetCDFParser#getSupportedTypes(org.apache + * @see org.apache.tika.parser.netcdf.NetCDFParser#getSupportedTypes(org.apache * .tika.parser.ParseContext) */ public Set getSupportedTypes(ParseContext context) { @@ -72,14 +67,14 @@ public Set getSupportedTypes(ParseContext context) { /* * (non-Javadoc) * - * @see - * org.apache.tika.parser.netcdf.NetCDFParser#parse(java.io.InputStream, + * @see org.apache.tika.parser.netcdf.NetCDFParser#parse(java.io.InputStream, * org.xml.sax.ContentHandler, org.apache.tika.metadata.Metadata, * org.apache.tika.parser.ParseContext) */ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, - ParseContext context) throws IOException, SAXException, TikaException { - UnsynchronizedByteArrayOutputStream os = UnsynchronizedByteArrayOutputStream.builder().get(); + ParseContext context) throws IOException, SAXException, TikaException { + UnsynchronizedByteArrayOutputStream os = + UnsynchronizedByteArrayOutputStream.builder().get(); IOUtils.copy(stream, os); String name = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY); diff --git a/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/isatab/ISATabUtils.java b/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/isatab/ISATabUtils.java index 7577b32174..5e9fb93d94 100644 --- a/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/isatab/ISATabUtils.java +++ 
b/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/isatab/ISATabUtils.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.parser.isatab; @@ -24,13 +22,10 @@ import java.util.Iterator; import java.util.Locale; import java.util.Map; - import org.apache.commons.csv.CSVFormat; import org.apache.commons.csv.CSVParser; import org.apache.commons.csv.CSVRecord; import org.apache.commons.io.input.CloseShieldInputStream; -import org.xml.sax.SAXException; - import org.apache.tika.config.TikaConfig; import org.apache.tika.detect.AutoDetectReader; import org.apache.tika.detect.DefaultEncodingDetector; @@ -40,6 +35,7 @@ import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.SAXException; public class ISATabUtils { @@ -48,9 +44,8 @@ public class ISATabUtils { */ // Investigation section. - private static final String[] sections = - {"ONTOLOGY SOURCE REFERENCE", "INVESTIGATION", "INVESTIGATION PUBLICATIONS", - "INVESTIGATION CONTACTS"}; + private static final String[] sections = {"ONTOLOGY SOURCE REFERENCE", "INVESTIGATION", + "INVESTIGATION PUBLICATIONS", "INVESTIGATION CONTACTS"}; // STUDY section (inside the Study section) private static final String studySectionField = "STUDY"; @@ -59,32 +54,31 @@ public class ISATabUtils { private static final String studyFileNameField = "Study File Name"; public static void parseInvestigation(InputStream stream, XHTMLContentHandler handler, - Metadata metadata, ParseContext context, - String studyFileName) - throws IOException, TikaException, SAXException { + Metadata metadata, ParseContext context, String studyFileName) + throws IOException, TikaException, SAXException { // Automatically detect the character encoding - try (AutoDetectReader reader = new AutoDetectReader(CloseShieldInputStream.wrap(stream), - metadata)) { + try (AutoDetectReader reader = + new AutoDetectReader(CloseShieldInputStream.wrap(stream), metadata)) { extractMetadata(reader, metadata, studyFileName); } } public static void parseInvestigation(InputStream stream, 
XHTMLContentHandler handler, - Metadata metadata, ParseContext context) - throws IOException, TikaException, SAXException { + Metadata metadata, ParseContext context) + throws IOException, TikaException, SAXException { parseInvestigation(stream, handler, metadata, context, null); } public static void parseStudy(InputStream stream, XHTMLContentHandler xhtml, Metadata metadata, - ParseContext context) - throws IOException, TikaException, SAXException { + ParseContext context) throws IOException, TikaException, SAXException { TikaInputStream tis = TikaInputStream.get(stream); // Automatically detect the character encoding EncodingDetector encodingDetector = getEncodingDetector(context); try (AutoDetectReader reader = new AutoDetectReader(CloseShieldInputStream.wrap(tis), - metadata, encodingDetector); - CSVParser csvParser = CSVParser.builder().setReader(reader).setFormat(CSVFormat.TDF).get()) { + metadata, encodingDetector); + CSVParser csvParser = CSVParser.builder().setReader(reader) + .setFormat(CSVFormat.TDF).get()) { Iterator iterator = csvParser.iterator(); xhtml.startElement("table"); @@ -126,15 +120,15 @@ private static EncodingDetector getEncodingDetector(ParseContext context) { } public static void parseAssay(InputStream stream, XHTMLContentHandler xhtml, Metadata metadata, - ParseContext context) - throws IOException, TikaException, SAXException { + ParseContext context) throws IOException, TikaException, SAXException { TikaInputStream tis = TikaInputStream.get(stream); // Automatically detect the character encoding EncodingDetector encodingDetector = getEncodingDetector(context); try (AutoDetectReader reader = new AutoDetectReader(CloseShieldInputStream.wrap(tis), - metadata, encodingDetector); - CSVParser csvParser = CSVParser.builder().setReader(reader).setFormat(CSVFormat.TDF).get()) { + metadata, encodingDetector); + CSVParser csvParser = CSVParser.builder().setReader(reader) + .setFormat(CSVFormat.TDF).get()) { xhtml.startElement("table"); Iterator 
iterator = csvParser.iterator(); @@ -168,14 +162,15 @@ public static void parseAssay(InputStream stream, XHTMLContentHandler xhtml, Met } private static void extractMetadata(Reader reader, Metadata metadata, String studyFileName) - throws IOException { + throws IOException { boolean investigationSection = false; boolean studySection = false; boolean studyTarget = false; Map map = new HashMap<>(); - try (CSVParser csvParser = CSVParser.builder().setReader(reader).setFormat(CSVFormat.TDF).get()) { + try (CSVParser csvParser = + CSVParser.builder().setReader(reader).setFormat(CSVFormat.TDF).get()) { for (CSVRecord record : csvParser) { String field = record.get(0); @@ -191,8 +186,8 @@ private static void extractMetadata(Reader reader, Metadata metadata, String stu } String value = record.get(1); map.put(field, value); - studyTarget = - (field.equals(studyFileNameField)) && (value.equals(studyFileName)); + studyTarget = (field.equals(studyFileNameField)) + && (value.equals(studyFileName)); if (studyTarget) { mapStudyToMetadata(map, metadata); studySection = false; diff --git a/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/isatab/ISArchiveParser.java b/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/isatab/ISArchiveParser.java index 6c7975f82b..3b8bf59e6e 100644 --- a/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/isatab/ISArchiveParser.java +++ b/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/isatab/ISArchiveParser.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.parser.isatab; @@ -21,10 +19,6 @@ import java.io.InputStream; import java.util.Collections; import java.util.Set; - -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.exception.TikaException; import org.apache.tika.io.TemporaryResources; import org.apache.tika.io.TikaInputStream; @@ -33,6 +27,8 @@ import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; public class ISArchiveParser implements Parser { @@ -42,7 +38,7 @@ public class ISArchiveParser implements Parser { private static final long serialVersionUID = 3640809327541300229L; private static String studyAssayFileNameField = "Study Assay File Name"; private final Set SUPPORTED_TYPES = - Collections.singleton(MediaType.application("x-isatab")); + Collections.singleton(MediaType.application("x-isatab")); private String location = null; private String studyFileName = null; @@ -73,10 +69,10 @@ public Set getSupportedTypes(ParseContext context) { @Override public void parse(InputStream stream, ContentHandler handler, Metadata metadata, - ParseContext context) throws IOException, SAXException, TikaException { + ParseContext context) throws IOException, SAXException, TikaException { TemporaryResources tmp = - TikaInputStream.isTikaInputStream(stream) ? null : new TemporaryResources(); + TikaInputStream.isTikaInputStream(stream) ? 
null : new TemporaryResources(); TikaInputStream tis = TikaInputStream.get(stream, tmp, metadata); try { @@ -86,7 +82,8 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, this.studyFileName = tis.getFile().getName(); File locationFile = new File(location); - String[] investigationList = locationFile.list((dir, name) -> name.matches("i_.+\\.txt")); + String[] investigationList = + locationFile.list((dir, name) -> name.matches("i_.+\\.txt")); XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); xhtml.startDocument(); @@ -104,8 +101,8 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, } private void parseInvestigation(String[] investigationList, XHTMLContentHandler xhtml, - Metadata metadata, ParseContext context) - throws IOException, SAXException, TikaException { + Metadata metadata, ParseContext context) + throws IOException, SAXException, TikaException { if ((investigationList == null) || (investigationList.length == 0)) { // TODO warning return; @@ -116,27 +113,28 @@ private void parseInvestigation(String[] investigationList, XHTMLContentHandler } String investigation = investigationList[0]; // TODO add to metadata? 
- try (InputStream stream = TikaInputStream.get(new File(this.location + investigation).toPath())) { + try (InputStream stream = + TikaInputStream.get(new File(this.location + investigation).toPath())) { ISATabUtils.parseInvestigation(stream, xhtml, metadata, context, this.studyFileName); } xhtml.element("h1", "INVESTIGATION " + metadata.get("Investigation Identifier")); } private void parseStudy(InputStream stream, XHTMLContentHandler xhtml, Metadata metadata, - ParseContext context) throws IOException, SAXException, TikaException { + ParseContext context) throws IOException, SAXException, TikaException { xhtml.element("h2", "STUDY " + metadata.get("Study Identifier")); ISATabUtils.parseStudy(stream, xhtml, metadata, context); } private void parseAssay(XHTMLContentHandler xhtml, Metadata metadata, ParseContext context) - throws IOException, SAXException, TikaException { + throws IOException, SAXException, TikaException { for (String assayFileName : metadata.getValues(studyAssayFileNameField)) { xhtml.startElement("div"); xhtml.element("h3", "ASSAY " + assayFileName); // location starts with "/C:" on windows, can't use Paths.get() - try (InputStream stream = TikaInputStream.get(new File(this.location + assayFileName).toPath())) - { + try (InputStream stream = + TikaInputStream.get(new File(this.location + assayFileName).toPath())) { ISATabUtils.parseAssay(stream, xhtml, metadata, context); } xhtml.endElement("div"); diff --git a/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/netcdf/NetCDFParser.java b/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/netcdf/NetCDFParser.java index 9f4ef020c6..348b325735 100644 --- a/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/netcdf/NetCDFParser.java +++ 
b/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/netcdf/NetCDFParser.java @@ -1,36 +1,26 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.parser.netcdf; -//JDK imports +// JDK imports import java.io.IOException; import java.io.InputStream; import java.util.Collections; import java.util.List; import java.util.Set; - -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; -import ucar.nc2.Attribute; -import ucar.nc2.Dimension; -import ucar.nc2.NetcdfFile; -import ucar.nc2.Variable; - import org.apache.tika.exception.TikaException; import org.apache.tika.io.TemporaryResources; import org.apache.tika.io.TikaInputStream; @@ -41,13 +31,17 @@ import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; +import ucar.nc2.Attribute; +import ucar.nc2.Dimension; +import ucar.nc2.NetcdfFile; +import ucar.nc2.Variable; /** - * A {@link Parser} for NetCDF - * files using the UCAR, MIT-licensed NetCDF for Java - * API. + * A {@link Parser} for NetCDF + * files using the UCAR, MIT-licensed + * NetCDF for Java API. 
*/ public class NetCDFParser implements Parser { @@ -57,14 +51,12 @@ public class NetCDFParser implements Parser { private static final long serialVersionUID = -5940938274907708665L; private final Set SUPPORTED_TYPES = - Collections.singleton(MediaType.application("x-netcdf")); + Collections.singleton(MediaType.application("x-netcdf")); /* * (non-Javadoc) * - * @see - * org.apache.tika.parser.Parser#getSupportedTypes(org.apache.tika.parser - * .ParseContext) + * @see org.apache.tika.parser.Parser#getSupportedTypes(org.apache.tika.parser .ParseContext) */ public Set getSupportedTypes(ParseContext context) { return SUPPORTED_TYPES; @@ -73,15 +65,14 @@ public Set getSupportedTypes(ParseContext context) { /* * (non-Javadoc) * - * @see org.apache.tika.parser.Parser#parse(java.io.InputStream, - * org.xml.sax.ContentHandler, org.apache.tika.metadata.Metadata, - * org.apache.tika.parser.ParseContext) + * @see org.apache.tika.parser.Parser#parse(java.io.InputStream, org.xml.sax.ContentHandler, + * org.apache.tika.metadata.Metadata, org.apache.tika.parser.ParseContext) */ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, - ParseContext context) throws IOException, SAXException, TikaException { + ParseContext context) throws IOException, SAXException, TikaException { TemporaryResources tmp = - TikaInputStream.isTikaInputStream(stream) ? null : new TemporaryResources(); + TikaInputStream.isTikaInputStream(stream) ? 
null : new TemporaryResources(); TikaInputStream tis = TikaInputStream.get(stream, tmp, metadata); try (NetcdfFile ncFile = NetcdfFile.open(tis.getFile().getAbsolutePath())) { metadata.set("File-Type-Description", ncFile.getFileTypeDescription()); diff --git a/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/envi/EnviHeaderParserTest.java b/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/envi/EnviHeaderParserTest.java index 06b8285272..6b31cb20e2 100644 --- a/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/envi/EnviHeaderParserTest.java +++ b/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/envi/EnviHeaderParserTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.envi; @@ -23,16 +21,14 @@ import java.io.ByteArrayOutputStream; import java.io.InputStream; import java.nio.charset.StandardCharsets; - import org.apache.commons.io.IOUtils; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; - import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.sax.ToXMLContentHandler; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; /** * Test cases to exercise the {@link EnviHeaderParser}. @@ -61,7 +57,7 @@ public void tearDown() { public void testParseGlobalMetadata() throws Exception { try (InputStream stream = EnviHeaderParser.class - .getResourceAsStream("/test-documents/envi_test_header.hdr")) { + .getResourceAsStream("/test-documents/envi_test_header.hdr")) { assertNotNull(stream, "Test ENVI file 'envi_test_header.hdr' not found"); parser.parse(stream, handler, metadata, new ParseContext()); } @@ -71,28 +67,26 @@ public void testParseGlobalMetadata() throws Exception { assertContains("

    ENVI

    ", content); assertContains("

    samples = 2400

    ", content); assertContains("

    lines = 2400

    ", content); - assertContains( - "

    map info = {Sinusoidal, 1.5000, 1.5000, -10007091.3643, " + - "5559289.2856, 4.6331271653e+02, 4.6331271653e+02, , units=Meters}

    ", - content); + assertContains("

    map info = {Sinusoidal, 1.5000, 1.5000, -10007091.3643, " + + "5559289.2856, 4.6331271653e+02, 4.6331271653e+02, , units=Meters}

    ", + content); assertContains("content=\"application/envi.hdr\"", content); - assertContains( - "projection info = {16, 6371007.2, 0.000000, 0.0, 0.0, Sinusoidal, units=Meters}", - content); + assertContains("projection info = {16, 6371007.2, 0.000000, 0.0, 0.0, Sinusoidal, units=Meters}", + content); } @Test public void testParseGlobalMetadataMultiLineMetadataValues() throws Exception { ByteArrayOutputStream bos = new ByteArrayOutputStream(); - IOUtils.copy(EnviHeaderParserTest.class.getResourceAsStream("/ground-truth" + - "/EnviHeaderGroundTruth.txt"), bos); + IOUtils.copy(EnviHeaderParserTest.class + .getResourceAsStream("/ground-truth" + "/EnviHeaderGroundTruth.txt"), bos); String expected = new String(bos.toByteArray(), StandardCharsets.UTF_8).trim(); Parser parser = new EnviHeaderParser(); ToXMLContentHandler handler = new ToXMLContentHandler(); Metadata metadata = new Metadata(); - try (InputStream stream = EnviHeaderParser.class - .getResourceAsStream("/test-documents/ang20150420t182050_corr_v1e_img.hdr")) { + try (InputStream stream = EnviHeaderParser.class.getResourceAsStream( + "/test-documents/ang20150420t182050_corr_v1e_img.hdr")) { assertNotNull(stream, "Test ENVI file 'ang20150420t182050_corr_v1e_img.hdr' not found"); parser.parse(stream, handler, metadata, new ParseContext()); } @@ -100,20 +94,14 @@ public void testParseGlobalMetadataMultiLineMetadataValues() throws Exception { // Check content of test file String content = handler.toString(); assertContains("

    ENVI

    ", content); - assertContains( - "

    description = { Georeferenced Image built from input GLT. " + - "[Wed Jun 10 04:37:54 2015] [Wed Jun 10 04:48:52 2015]}

    ", - content); + assertContains("

    description = { Georeferenced Image built from input GLT. " + + "[Wed Jun 10 04:37:54 2015] [Wed Jun 10 04:48:52 2015]}

    ", content); assertContains("

    samples = 739

    ", content); assertContains("

    lat/lon = { 36.79077627261556, -108.48370867914815 }

    ", content); - assertContains( - "

    map info = { UTM , 1.000 , 1.000 , 724522.127 , 4074620.759 , " + - "1.1000000000e+00 , 1.1000000000e+00 , 12 , North , " + - "WGS-84 , units=Meters , " + - "rotation=75.00000000 }

    ", - content); - assertContains( expected, - content); + assertContains("

    map info = { UTM , 1.000 , 1.000 , 724522.127 , 4074620.759 , " + + "1.1000000000e+00 , 1.1000000000e+00 , 12 , North , " + + "WGS-84 , units=Meters , " + "rotation=75.00000000 }

    ", content); + assertContains(expected, content); } /** diff --git a/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/gdal/TestGDALParser.java b/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/gdal/TestGDALParser.java index 456e9be10c..ea02f4da74 100644 --- a/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/gdal/TestGDALParser.java +++ b/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/gdal/TestGDALParser.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
+ * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.gdal; @@ -24,14 +22,12 @@ import static org.junit.jupiter.api.Assumptions.assumeTrue; import java.io.InputStream; - -import org.junit.jupiter.api.Test; - import org.apache.tika.TikaTest; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.external.ExternalParser; import org.apache.tika.sax.BodyContentHandler; +import org.junit.jupiter.api.Test; /** * Test harness for the GDAL parser. @@ -57,7 +53,7 @@ public void testParseBasicInfo() { GDALParser parser = new GDALParser(); InputStream stream = TestGDALParser.class - .getResourceAsStream("/test-documents/sresa1b_ncar_ccsm3_0_run1_200001.nc"); + .getResourceAsStream("/test-documents/sresa1b_ncar_ccsm3_0_run1_200001.nc"); Metadata met = new Metadata(); BodyContentHandler handler = new BodyContentHandler(); try { @@ -71,10 +67,10 @@ public void testParseBasicInfo() { assertNotNull(met.get("Driver")); assertEquals(expectedDriver, met.get("Driver")); assumeTrue(met.get("Files") != null); - //recent version of gdalinfo doesn't include "Coordinate System": + // recent version of gdalinfo doesn't include "Coordinate System": // GDAL 3.7.1, released 2023/07/06 - //assertNotNull(met.get("Coordinate System")); - //assertEquals(expectedCoordinateSystem, met.get("Coordinate System")); + // assertNotNull(met.get("Coordinate System")); + // assertEquals(expectedCoordinateSystem, met.get("Coordinate System")); assertNotNull(met.get("Size")); assertEquals(expectedSize, met.get("Size")); assertNotNull(met.get("Upper Right")); @@ -92,7 +88,7 @@ public void testParseBasicInfo() { public void testParseMetadata() { 
assumeTrue(canRun()); final String expectedNcInst = - "NCAR (National Center for Atmospheric Research, Boulder, CO, USA)"; + "NCAR (National Center for Atmospheric Research, Boulder, CO, USA)"; final String expectedModelNameEnglish = "NCAR CCSM"; final String expectedProgramId = "Source file unknown Version unknown Date unknown"; final String expectedProjectId = "IPCC Fourth Assessment"; @@ -103,7 +99,7 @@ public void testParseMetadata() { GDALParser parser = new GDALParser(); InputStream stream = TestGDALParser.class - .getResourceAsStream("/test-documents/sresa1b_ncar_ccsm3_0_run1_200001.nc"); + .getResourceAsStream("/test-documents/sresa1b_ncar_ccsm3_0_run1_200001.nc"); Metadata met = new Metadata(); BodyContentHandler handler = new BodyContentHandler(); try { diff --git a/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/geoinfo/GeographicInformationParserTest.java b/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/geoinfo/GeographicInformationParserTest.java index 8a46330c20..4e64249827 100644 --- a/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/geoinfo/GeographicInformationParserTest.java +++ b/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/geoinfo/GeographicInformationParserTest.java @@ -1,28 +1,25 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. 
See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.parser.geoinfo; import static org.junit.jupiter.api.Assertions.assertEquals; -import org.junit.jupiter.api.Test; - import org.apache.tika.TikaTest; import org.apache.tika.metadata.Metadata; +import org.junit.jupiter.api.Test; public class GeographicInformationParserTest extends TikaTest { diff --git a/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/grib/GribParserTest.java b/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/grib/GribParserTest.java index 78758093ff..5a72898aef 100644 --- a/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/grib/GribParserTest.java +++ b/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/grib/GribParserTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.grib; @@ -21,14 +19,12 @@ import static org.junit.jupiter.api.Assertions.assertTrue; import java.io.InputStream; - -import org.junit.jupiter.api.Test; -import org.xml.sax.ContentHandler; - import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.sax.BodyContentHandler; +import org.junit.jupiter.api.Test; +import org.xml.sax.ContentHandler; /** * Test cases to exercise the {@link org.apache.tika.parser.grib.GribParser}. 
@@ -42,7 +38,7 @@ public void testParseGlobalMetadata() throws Exception { Metadata metadata = new Metadata(); ContentHandler handler = new BodyContentHandler(); try (InputStream stream = GribParser.class - .getResourceAsStream("/test-documents/gdas1.forecmwf.2014062612.grib2")) { + .getResourceAsStream("/test-documents/gdas1.forecmwf.2014062612.grib2")) { parser.parse(stream, handler, metadata, new ParseContext()); } assertNotNull(metadata); @@ -51,4 +47,3 @@ public void testParseGlobalMetadata() throws Exception { assertTrue(content.contains("variables:")); } } - diff --git a/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/hdf/HDFParserTest.java b/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/hdf/HDFParserTest.java index 3aaea28c1a..5d4aabb007 100644 --- a/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/hdf/HDFParserTest.java +++ b/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/hdf/HDFParserTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.hdf; @@ -20,14 +18,12 @@ import static org.junit.jupiter.api.Assertions.assertNotNull; import java.io.InputStream; - -import org.junit.jupiter.api.Test; -import org.xml.sax.ContentHandler; - import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.sax.BodyContentHandler; +import org.junit.jupiter.api.Test; +import org.xml.sax.ContentHandler; /** * Test suite for the {@link HDFParser}. 
diff --git a/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/isatab/ISArchiveParserTest.java b/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/isatab/ISArchiveParserTest.java index 08f9308c4d..0344a8c84e 100644 --- a/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/isatab/ISArchiveParserTest.java +++ b/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/isatab/ISArchiveParserTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
+ * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.isatab; @@ -20,14 +18,12 @@ import static org.junit.jupiter.api.Assertions.assertEquals; import java.io.InputStream; - -import org.junit.jupiter.api.Test; -import org.xml.sax.ContentHandler; - import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.sax.BodyContentHandler; +import org.junit.jupiter.api.Test; +import org.xml.sax.ContentHandler; public class ISArchiveParserTest { @@ -35,10 +31,9 @@ public class ISArchiveParserTest { public void testParseArchive() throws Exception { String path = "/test-documents/testISATab_BII-I-1/s_BII-S-1.txt"; - Parser parser = new ISArchiveParser( - ISArchiveParserTest.class.getResource("/test-documents/testISATab_BII-I-1/").toURI() - .getPath()); - //Parser parser = new AutoDetectParser(); + Parser parser = new ISArchiveParser(ISArchiveParserTest.class + .getResource("/test-documents/testISATab_BII-I-1/").toURI().getPath()); + // Parser parser = new AutoDetectParser(); ContentHandler handler = new BodyContentHandler(); Metadata metadata = new Metadata(); @@ -49,22 +44,20 @@ public void testParseArchive() throws Exception { // INVESTIGATION assertEquals("BII-I-1", metadata.get("Investigation Identifier"), - "Invalid Investigation Identifier"); + "Invalid Investigation Identifier"); assertEquals("Growth control of the eukaryote cell: a systems biology study in yeast", - metadata.get("Investigation Title"), - "Invalid Investigation Title"); + metadata.get("Investigation Title"), "Invalid Investigation Title"); // INVESTIGATION PUBLICATIONS assertEquals("17439666", metadata.get("Investigation 
PubMed ID"), - "Invalid Investigation PubMed ID"); - assertEquals("doi:10.1186/jbiol54", - metadata.get("Investigation Publication DOI"), - "Invalid Investigation Publication DOI"); + "Invalid Investigation PubMed ID"); + assertEquals("doi:10.1186/jbiol54", metadata.get("Investigation Publication DOI"), + "Invalid Investigation Publication DOI"); // INVESTIGATION CONTACTS - assertEquals( "Oliver", metadata.get("Investigation Person Last Name"), - "Invalid Investigation Person Last Name"); + assertEquals("Oliver", metadata.get("Investigation Person Last Name"), + "Invalid Investigation Person Last Name"); assertEquals("Stephen", metadata.get("Investigation Person First Name"), - "Invalid Investigation Person First Name"); + "Invalid Investigation Person First Name"); } } diff --git a/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/netcdf/NetCDFParserTest.java b/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/netcdf/NetCDFParserTest.java index 92dc0733e4..dbbb450567 100644 --- a/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/netcdf/NetCDFParserTest.java +++ b/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/netcdf/NetCDFParserTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. 
See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.netcdf; @@ -20,15 +18,13 @@ import static org.junit.jupiter.api.Assertions.assertEquals; import java.io.InputStream; - -import org.junit.jupiter.api.Test; -import org.xml.sax.ContentHandler; - import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.sax.BodyContentHandler; +import org.junit.jupiter.api.Test; +import org.xml.sax.ContentHandler; /** * Test cases to exercise the {@link NetCDFParser}. 
@@ -41,8 +37,8 @@ public void testParseGlobalMetadata() throws Exception { ContentHandler handler = new BodyContentHandler(); Metadata metadata = new Metadata(); - try (InputStream stream = NetCDFParser.class - .getResourceAsStream("/test-documents/sresa1b_ncar_ccsm3_0_run1_200001.nc")) { + try (InputStream stream = NetCDFParser.class.getResourceAsStream( + "/test-documents/sresa1b_ncar_ccsm3_0_run1_200001.nc")) { parser.parse(stream, handler, metadata, new ParseContext()); } @@ -52,7 +48,7 @@ public void testParseGlobalMetadata() throws Exception { assertEquals(metadata.get(Metadata.CONVENTIONS), "CF-1.0"); assertEquals(metadata.get(Metadata.REALIZATION), "1"); assertEquals(metadata.get(Metadata.EXPERIMENT_ID), - "720 ppm stabilization experiment (SRESA1B)"); + "720 ppm stabilization experiment (SRESA1B)"); assertEquals(metadata.get("File-Type-Description"), "NetCDF-3/CDM"); String content = handler.toString(); diff --git a/tika-parsers/tika-parsers-extended/tika-parser-scientific-package/src/test/java/org/apache/tika/parser/scientific/integration/TestParsers.java b/tika-parsers/tika-parsers-extended/tika-parser-scientific-package/src/test/java/org/apache/tika/parser/scientific/integration/TestParsers.java index 7123e2edb9..7ebd8b5a10 100644 --- a/tika-parsers/tika-parsers-extended/tika-parser-scientific-package/src/test/java/org/apache/tika/parser/scientific/integration/TestParsers.java +++ b/tika-parsers/tika-parsers-extended/tika-parser-scientific-package/src/test/java/org/apache/tika/parser/scientific/integration/TestParsers.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. * */ package org.apache.tika.parser.scientific.integration; @@ -26,9 +24,6 @@ import java.nio.charset.StandardCharsets; import java.util.HashMap; import java.util.Map; - -import org.junit.jupiter.api.Test; - import org.apache.tika.exception.TikaException; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.CompositeParser; @@ -36,16 +31,15 @@ import org.apache.tika.parser.Parser; import org.apache.tika.parser.external.CompositeExternalParser; import org.apache.tika.parser.ocr.TesseractOCRParser; +import org.junit.jupiter.api.Test; /** - * We fixed parser ordering in 2.4.1. This confirms going forward that the integration - * of tika-parsers-standard with the tika-parser-scientific package maintains - * parser order. + * We fixed parser ordering in 2.4.1. 
This confirms going forward that the integration of + * tika-parsers-standard with the tika-parser-scientific package maintains parser order. * * This does not currently test parsers added after 2.4.1. * - * We included 2.4.0 for historical reasons to show what the behavior was - * before the fix. + * We included 2.4.0 for historical reasons to show what the behavior was before the fix. */ public class TestParsers { @@ -59,15 +53,13 @@ public void testDiffsFrom241() throws Exception { } int checked = 0; - //The initial lists were developed with exiftool installed. We have since - //modified the 2.4.1-* files to act as if no exiftool is installed. - //However, on systems with ffmpeg or exiftool installed, we need - //to override those file formats + // The initial lists were developed with exiftool installed. We have since + // modified the 2.4.1-* files to act as if no exiftool is installed. + // However, on systems with ffmpeg or exiftool installed, we need + // to override those file formats CompositeParser externalParser = (CompositeParser) new CompositeExternalParser(); - try (BufferedReader reader = - new BufferedReader(new InputStreamReader( - getClass().getResourceAsStream(path241), - StandardCharsets.UTF_8))) { + try (BufferedReader reader = new BufferedReader(new InputStreamReader( + getClass().getResourceAsStream(path241), StandardCharsets.UTF_8))) { String line = reader.readLine(); while (line != null) { String[] data = line.split("\t"); @@ -79,7 +71,7 @@ public void testDiffsFrom241() throws Exception { parserClass = externalParser.getClass().toString(); } assertEquals(parserClass, currentDefault.get(mediaType), - "for mediaType '" + mediaType + "'"); + "for mediaType '" + mediaType + "'"); checked++; line = reader.readLine(); } diff --git a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgDBParser.java 
b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgDBParser.java index d4b56127d2..e3711fc34a 100644 --- a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgDBParser.java +++ b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgDBParser.java @@ -1,24 +1,21 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. 
See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.geopkg; import java.sql.Connection; import java.util.Set; - import org.apache.tika.extractor.EmbeddedDocumentUtil; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.jdbc.JDBCTableReader; @@ -27,8 +24,8 @@ /** * This is the implementation of the db parser for SQLite. *

    - * This parser is internal only; it should not be registered in the services - * file or configured in the TikaConfig xml file. + * This parser is internal only; it should not be registered in the services file or configured in + * the TikaConfig xml file. */ class GeoPkgDBParser extends SQLite3DBParser { @@ -40,15 +37,15 @@ class GeoPkgDBParser extends SQLite3DBParser { @Override public JDBCTableReader getTableReader(Connection connection, String tableName, - ParseContext context) { + ParseContext context) { return new GeoPkgTableReader(connection, tableName, new EmbeddedDocumentUtil(context), - ignoreBlobColumns); + ignoreBlobColumns); } @Override protected JDBCTableReader getTableReader(Connection connection, String tableName, - EmbeddedDocumentUtil embeddedDocumentUtil) { + EmbeddedDocumentUtil embeddedDocumentUtil) { return new GeoPkgTableReader(connection, tableName, embeddedDocumentUtil, - ignoreBlobColumns); + ignoreBlobColumns); } } diff --git a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgParser.java b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgParser.java index e157a09c9f..1c78c5da75 100644 --- a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgParser.java +++ b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgParser.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.geopkg; @@ -24,10 +22,6 @@ import java.util.List; import java.util.Map; import java.util.Set; - -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.config.Field; import org.apache.tika.config.InitializableProblemHandler; import org.apache.tika.config.Param; @@ -37,20 +31,22 @@ import org.apache.tika.mime.MediaType; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.sqlite3.SQLite3Parser; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; /** * Customization of sqlite parser to skip certain common blob columns. *

    - * The motivation is that "geom" and "data" columns are intrinsic to geopkg - * and are not regular embedded files. Tika treats all blob columns as, potentially, - * embedded files -- this can add dramatically to the time to parse geopkg - * files, which might have hundreds of thousands of uninteresting blobs. + * The motivation is that "geom" and "data" columns are intrinsic to geopkg and are not regular + * embedded files. Tika treats all blob columns as, potentially, embedded files -- this can add + * dramatically to the time to parse geopkg files, which might have hundreds of thousands of + * uninteresting blobs. *

    - * Users may modify which columns are ignored or turn off "ignoring" - * of all solumns. + * Users may modify which columns are ignored or turn off "ignoring" of all columns. *

    * To add a column to the default "ignore blob columns" via tika-config.xml: - *

    {@code}
    + * 
    + * 
    {@code}
      *   
      *     
      *     
    @@ -63,7 +59,7 @@
      *   
      *   }
    *

    - * Or use an empty list to parse all columns. + * Or use an empty list to parse all columns. */ public class GeoPkgParser extends SQLite3Parser { @@ -83,10 +79,11 @@ public class GeoPkgParser extends SQLite3Parser { private static final Set DEFAULT_IGNORE_BLOB_COLUMNS = Set.of("geom", "data"); private Set ignoreBlobColumns = new HashSet<>(DEFAULT_IGNORE_BLOB_COLUMNS); + /** * Checks to see if class is available for org.sqlite.JDBC. *

    - * If not, this class will return an EMPTY_SET for getSupportedTypes() + * If not, this class will return an EMPTY_SET for getSupportedTypes() */ public GeoPkgParser() { @@ -99,7 +96,7 @@ public Set getSupportedTypes(ParseContext context) { @Override public void parse(InputStream stream, ContentHandler handler, Metadata metadata, - ParseContext context) throws IOException, SAXException, TikaException { + ParseContext context) throws IOException, SAXException, TikaException { GeoPkgDBParser p = new GeoPkgDBParser(ignoreBlobColumns); p.parse(stream, handler, metadata, context); } @@ -109,6 +106,7 @@ public void setIgnoreBlobColumns(List ignoreBlobColumns) { this.ignoreBlobColumns.clear(); this.ignoreBlobColumns.addAll(ignoreBlobColumns); } + /** * No-op * @@ -122,6 +120,5 @@ public void initialize(Map params) throws TikaConfigException { @Override public void checkInitialization(InitializableProblemHandler problemHandler) - throws TikaConfigException { - } + throws TikaConfigException {} } diff --git a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgTableReader.java b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgTableReader.java index 48256c2a58..5ee9aa4a73 100644 --- a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgTableReader.java +++ b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/geopkg/GeoPkgTableReader.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.geopkg; @@ -22,30 +20,27 @@ import java.sql.ResultSet; import java.sql.SQLException; import java.util.Set; - +import org.apache.tika.extractor.EmbeddedDocumentUtil; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.sqlite3.SQLite3TableReader; import org.xml.sax.Attributes; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; import org.xml.sax.helpers.AttributesImpl; -import org.apache.tika.extractor.EmbeddedDocumentUtil; -import org.apache.tika.parser.ParseContext; -import org.apache.tika.parser.sqlite3.SQLite3TableReader; - /** - * Concrete class for GeoPkg parsing. This overrides blob handling to skip "geom" and "data" - * columns + * Concrete class for GeoPkg parsing. 
This overrides blob handling to skip "geom" and "data" columns *

    - * For now, this silently skips cells of type CLOB, because xerial's jdbc connector - * does not currently support them. + * For now, this silently skips cells of type CLOB, because xerial's jdbc connector does not + * currently support them. */ class GeoPkgTableReader extends SQLite3TableReader { private final Set ignoreBlobColumns; public GeoPkgTableReader(Connection connection, String tableName, - EmbeddedDocumentUtil embeddedDocumentUtil, Set ignoreBlobColumns) { + EmbeddedDocumentUtil embeddedDocumentUtil, Set ignoreBlobColumns) { super(connection, tableName, embeddedDocumentUtil); this.ignoreBlobColumns = ignoreBlobColumns; } @@ -54,15 +49,15 @@ public GeoPkgTableReader(Connection connection, String tableName, @Override protected void handleBlob(String tableName, String columnName, int rowNum, ResultSet resultSet, - int columnIndex, ContentHandler handler, ParseContext context) - throws SQLException, IOException, SAXException { + int columnIndex, ContentHandler handler, ParseContext context) + throws SQLException, IOException, SAXException { if (ignoreBlobColumns.contains(columnName)) { Attributes attrs = new AttributesImpl(); ((AttributesImpl) attrs).addAttribute("", "type", "type", "CDATA", "blob"); - ((AttributesImpl) attrs) - .addAttribute("", "column_name", "column_name", "CDATA", columnName); + ((AttributesImpl) attrs).addAttribute("", "column_name", "column_name", "CDATA", + columnName); ((AttributesImpl) attrs).addAttribute("", "row_number", "row_number", "CDATA", - Integer.toString(rowNum)); + Integer.toString(rowNum)); handler.startElement("", "span", "span", attrs); handler.endElement("", "span", "span"); return; diff --git a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3DBParser.java b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3DBParser.java index fd8c2e8a43..c66dd55ca2 100644 --- 
a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3DBParser.java +++ b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3DBParser.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.parser.sqlite3; @@ -29,9 +27,6 @@ import java.util.List; import java.util.Map; import java.util.Set; - -import org.sqlite.SQLiteConfig; - import org.apache.tika.extractor.EmbeddedDocumentUtil; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; @@ -40,23 +35,25 @@ import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.jdbc.AbstractDBParser; import org.apache.tika.parser.jdbc.JDBCTableReader; +import org.sqlite.SQLiteConfig; /** * This is the implementation of the db parser for SQLite. *

    - * This parser is internal only; it should not be registered in the services - * file or configured in the TikaConfig xml file. + * This parser is internal only; it should not be registered in the services file or configured in + * the TikaConfig xml file. */ public class SQLite3DBParser extends AbstractDBParser { protected static final String SQLITE_CLASS_NAME = "org.sqlite.JDBC"; - protected static final Map METADATA_KEYS = Map.of( - SQLite3Parser.SQLITE_APPLICATION_ID, "select application_id from pragma_application_id", - SQLite3Parser.SQLITE_USER_VERSION, "select user_version from pragma_user_version" - ); + protected static final Map METADATA_KEYS = + Map.of(SQLite3Parser.SQLITE_APPLICATION_ID, + "select application_id from pragma_application_id", + SQLite3Parser.SQLITE_USER_VERSION, + "select user_version from pragma_user_version"); - //If the InputStream wasn't a TikaInputStream, copy to this tmp file + // If the InputStream wasn't a TikaInputStream, copy to this tmp file Path tmpFile = null; /** @@ -70,7 +67,7 @@ public Set getSupportedTypes(ParseContext context) { @Override protected Connection getConnection(InputStream stream, Metadata metadata, ParseContext context) - throws IOException { + throws IOException { String connectionString = getConnectionString(stream, metadata, context); Connection connection = null; @@ -82,7 +79,7 @@ protected Connection getConnection(InputStream stream, Metadata metadata, ParseC try { SQLiteConfig config = new SQLiteConfig(); - //good habit, but effectively meaningless here + // good habit, but effectively meaningless here config.setReadOnly(true); connection = config.createConnection(connectionString); @@ -94,15 +91,15 @@ protected Connection getConnection(InputStream stream, Metadata metadata, ParseC @Override protected String getConnectionString(InputStream is, Metadata metadata, ParseContext context) - throws IOException { + throws IOException { TikaInputStream tis = TikaInputStream.cast(is); - //if this is a 
TikaInputStream, use that to spool is to disk or - //use original underlying file. + // if this is a TikaInputStream, use that to spool is to disk or + // use original underlying file. if (tis != null) { Path dbFile = tis.getPath(); return "jdbc:sqlite:" + dbFile.toAbsolutePath().toString(); } else { - //if not TikaInputStream, create own tmpResources. + // if not TikaInputStream, create own tmpResources. tmpFile = Files.createTempFile("tika-sqlite-tmp", ""); Files.copy(is, tmpFile, StandardCopyOption.REPLACE_EXISTING); return "jdbc:sqlite:" + tmpFile.toAbsolutePath().toString(); @@ -127,7 +124,7 @@ protected String getJDBCClassName() { @Override protected List getTableNames(Connection connection, Metadata metadata, - ParseContext context) throws SQLException { + ParseContext context) throws SQLException { List tableNames = new LinkedList<>(); try (Statement st = connection.createStatement()) { @@ -143,36 +140,36 @@ protected List getTableNames(Connection connection, Metadata metadata, @Override public JDBCTableReader getTableReader(Connection connection, String tableName, - ParseContext context) { + ParseContext context) { return new SQLite3TableReader(connection, tableName, new EmbeddedDocumentUtil(context)); } @Override protected JDBCTableReader getTableReader(Connection connection, String tableName, - EmbeddedDocumentUtil embeddedDocumentUtil) { + EmbeddedDocumentUtil embeddedDocumentUtil) { return new SQLite3TableReader(connection, tableName, embeddedDocumentUtil); } @Override protected void extractMetadata(Connection connection, Metadata metadata) { - //TODO -- figure out how to get the version of sqlite3 that last modified this file and + // TODO -- figure out how to get the version of sqlite3 that last modified this file and // version-valid-for. 
// version-valid-for is at offset 92, last modified by app version isat offset 96 -- // not clear how to get this info via sql - //'file' extracts this info; we should to :\ - //See: https://www.sqlite.org/fileformat.html + // 'file' extracts this info; we should to :\ + // See: https://www.sqlite.org/fileformat.html for (Map.Entry e : METADATA_KEYS.entrySet()) { try (Statement st = connection.createStatement()) { try (ResultSet rs = st.executeQuery(e.getValue())) { if (rs.next()) { int val = rs.getInt(1); - if (! rs.wasNull()) { + if (!rs.wasNull()) { metadata.set(e.getKey(), Integer.toString(val, 16)); } } } } catch (SQLException ex) { - //swallow + // swallow } } } diff --git a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3Parser.java b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3Parser.java index 34aab4d10f..2dac502535 100644 --- a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3Parser.java +++ b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3Parser.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. 
The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.sqlite3; @@ -22,10 +20,6 @@ import java.util.Collections; import java.util.Map; import java.util.Set; - -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.config.Initializable; import org.apache.tika.config.InitializableProblemHandler; import org.apache.tika.config.Param; @@ -36,22 +30,21 @@ import org.apache.tika.mime.MediaType; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; /** - * This is the main class for parsing SQLite3 files. When {@link #parse} is called, - * this creates a new {@link SQLite3DBParser}. + * This is the main class for parsing SQLite3 files. When {@link #parse} is called, this creates a + * new {@link SQLite3DBParser}. *

    - * Given potential conflicts of native libraries in web servers, users will - * need to add org.xerial's sqlite-jdbc jar to the class path for this parser - * to work. For development and testing, this jar is specified in tika-parsers' - * pom.xml, but it is currently set to "provided." + * Given potential conflicts of native libraries in web servers, users will need to add org.xerial's + * sqlite-jdbc jar to the class path for this parser to work. For development and testing, this jar + * is specified in tika-parsers' pom.xml, but it is currently set to "provided." *

    - * Note that this family of jdbc parsers is designed to treat each CLOB and each BLOB - * as an embedded document; i.e. it will recursively process documents that are stored - * in a sqlite db as "bytes". + * Note that this family of jdbc parsers is designed to treat each CLOB and each BLOB as an embedded + * document; i.e. it will recursively process documents that are stored in a sqlite db as "bytes". *

    - * If using a TikaInputStream, make sure to close it to delete the temp file - * that has to be created. + * If using a TikaInputStream, make sure to close it to delete the temp file that has to be created. */ public class SQLite3Parser implements Parser, Initializable { @@ -61,13 +54,13 @@ public class SQLite3Parser implements Parser, Initializable { * Base16 encoded integer representing the "application id" */ public static final Property SQLITE_APPLICATION_ID = - Property.internalText(SQLITE3_PREFIX + "application_id"); + Property.internalText(SQLITE3_PREFIX + "application_id"); /** * Base16 encoded integer representing the "user version" */ public static final Property SQLITE_USER_VERSION = - Property.internalText(SQLITE3_PREFIX + "user_version"); + Property.internalText(SQLITE3_PREFIX + "user_version"); /** * Serial version UID @@ -85,7 +78,7 @@ public class SQLite3Parser implements Parser, Initializable { /** * Checks to see if class is available for org.sqlite.JDBC. *

    - * If not, this class will return an EMPTY_SET for getSupportedTypes() + * If not, this class will return an EMPTY_SET for getSupportedTypes() */ public SQLite3Parser() { @@ -98,7 +91,7 @@ public Set getSupportedTypes(ParseContext context) { @Override public void parse(InputStream stream, ContentHandler handler, Metadata metadata, - ParseContext context) throws IOException, SAXException, TikaException { + ParseContext context) throws IOException, SAXException, TikaException { SQLite3DBParser p = new SQLite3DBParser(); p.parse(stream, handler, metadata, context); } @@ -116,6 +109,5 @@ public void initialize(Map params) throws TikaConfigException { @Override public void checkInitialization(InitializableProblemHandler problemHandler) - throws TikaConfigException { - } + throws TikaConfigException {} } diff --git a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3TableReader.java b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3TableReader.java index e0b5f0b271..ff07644d5d 100644 --- a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3TableReader.java +++ b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3TableReader.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. 
See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.sqlite3; @@ -23,28 +21,26 @@ import java.sql.ResultSet; import java.sql.SQLException; import javax.sql.rowset.serial.SerialBlob; - -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.extractor.EmbeddedDocumentUtil; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.jdbc.JDBCTableReader; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; /** - * Concrete class for SQLLite table parsing. This overrides - * column type handling from JDBCRowHandler. + * Concrete class for SQLLite table parsing. This overrides column type handling from + * JDBCRowHandler. *

    - * For now, this silently skips cells of type CLOB, because xerial's jdbc connector - * does not currently support them. + * For now, this silently skips cells of type CLOB, because xerial's jdbc connector does not + * currently support them. */ public class SQLite3TableReader extends JDBCTableReader { public SQLite3TableReader(Connection connection, String tableName, - EmbeddedDocumentUtil embeddedDocumentUtil) { + EmbeddedDocumentUtil embeddedDocumentUtil) { super(connection, tableName, embeddedDocumentUtil); } @@ -65,9 +61,9 @@ public SQLite3TableReader(Connection connection, String tableName, */ @Override protected void handleClob(String tableName, String fieldName, int rowNum, ResultSet resultSet, - int columnIndex, ContentHandler handler, ParseContext context) - throws SQLException, IOException, SAXException { - //no-op for now. + int columnIndex, ContentHandler handler, ParseContext context) + throws SQLException, IOException, SAXException { + // no-op for now. } @Override diff --git a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/test/java/org/apache/tika/parser/sqlite3/SQLite3ParserTest.java b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/test/java/org/apache/tika/parser/sqlite3/SQLite3ParserTest.java index 49dfcd0bab..0405ec1805 100644 --- a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/test/java/org/apache/tika/parser/sqlite3/SQLite3ParserTest.java +++ b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/test/java/org/apache/tika/parser/sqlite3/SQLite3ParserTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.parser.sqlite3; @@ -20,22 +18,20 @@ import static org.junit.jupiter.api.Assertions.assertEquals; import java.io.InputStream; - -import org.junit.jupiter.api.Test; -import org.xml.sax.ContentHandler; - import org.apache.tika.TikaTest; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.parser.ParseContext; import org.apache.tika.sax.BodyContentHandler; +import org.junit.jupiter.api.Test; +import org.xml.sax.ContentHandler; public class SQLite3ParserTest extends TikaTest { private final static String TEST_FILE_NAME = "testSqlite3b.db"; private final static String TEST_FILE1 = "/test-documents/" + TEST_FILE_NAME; - //make sure that table cells and rows are properly marked to - //yield \t and \n at the appropriate places + // make sure that table cells and rows are properly marked to + // yield \t and \n at the appropriate places @Test public void testSpacesInBodyContentHandler() throws Exception { Metadata metadata = new Metadata(); @@ -55,125 +51,65 @@ public void testSpacesInBodyContentHandler() throws Exception { @Test public void testNulls() throws Exception { String xml = getXML(TEST_FILE_NAME).xml.replaceAll("\\s+", ""); - //everything except for the first key column should be empty + // everything except for the first key column should be empty TikaTest.assertContains("2", - xml); - } - - //code used for creating the test file -/* - private Connection getConnection(String dbFileName) throws Exception { - File testDirectory = new File(this.getClass().getResource("/test-documents").toURI()); - System.out.println("Writing to: " + testDirectory.getAbsolutePath()); - File testDB = new File(testDirectory, dbFileName); - Connection c = null; - try { - Class.forName("org.sqlite.JDBC"); - c = DriverManager.getConnection("jdbc:sqlite:" + testDB.getAbsolutePath()); - } catch ( Exception e ) { - System.err.println( e.getClass().getName() + ": " + e.getMessage() ); - System.exit(0); 
- } - return c; - } - - @Test - public void testCreateDB() throws Exception { - Connection c = getConnection("testSqlite3d.db"); - Statement st = c.createStatement(); - String sql = "DROP TABLE if exists my_table1"; - st.execute(sql); - sql = "CREATE TABLE my_table1 (" + - "PK INT PRIMARY KEY, "+ - "INT_COL INTEGER, "+ - "FLOAT_COL FLOAT, " + - "DOUBLE_COL DOUBLE, " + - "CHAR_COL CHAR(30), "+ - "VARCHAR_COL VARCHAR(30), "+ - "BOOLEAN_COL BOOLEAN,"+ - "DATE_COL DATE,"+ - "TIME_STAMP_COL TIMESTAMP,"+ - "CLOB_COL CLOB, "+ - "BYTES_COL BYTES" + - ")"; - st.execute(sql); - sql = "insert into my_table1 (PK, INT_COL, FLOAT_COL, DOUBLE_COL, CHAR_COL, " + - "VARCHAR_COL, BOOLEAN_COL, DATE_COL, TIME_STAMP_COL, CLOB_COL, BYTES_COL) " + - "values (?,?,?,?,?,?,?,?,?,?,?)"; - SimpleDateFormat f = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - java.util.Date d = f.parse("2015-01-03 15:17:03"); - System.out.println(d.getTime()); - long d1Long = 1420229823000L;// 2015-01-02 15:17:03 - long d2Long = 1420316223000L;// 2015-01-03 15:17:03 - PreparedStatement ps = c.prepareStatement(sql); - ps.setInt(1, 0); - ps.setInt(2, 10); - ps.setFloat(3, 2.3f); - ps.setDouble(4, 2.4d); - ps.setString(5, "lorem"); - ps.setString(6, "普林斯顿大学"); - ps.setBoolean(7, true); - ps.setString(8, "2015-01-02"); - ps.setString(9, "2015-01-03 15:17:03"); -// ps.setClob(10, new StringReader(sql)); - ps.setBytes(10, getByteArray(this.getClass() - .getResourceAsStream("/test-documents/testWORD_1img.doc")));//contains "quick brown fox" - ps.executeUpdate(); - ps.clearParameters(); - - ps.setInt(1, 1); - ps.setInt(2, 20); - ps.setFloat(3, 4.6f); - ps.setDouble(4, 4.8d); - ps.setString(5, "dolor"); - ps.setString(6, "sit"); - ps.setBoolean(7, false); - ps.setString(8, "2015-01-04"); - ps.setString(9, "2015-01-03 15:17:03"); - //ps.setClob(9, new StringReader("consectetur adipiscing elit")); - ps.setBytes(10, getByteArray(this.getClass() - .getResourceAsStream("/test-documents/testWORD_1img.docx")));//contains "The 
end!" - - ps.executeUpdate(); - //now add a fully null row - ps.clearParameters(); - ps.setInt(1, 2); - ps.setNull(2, Types.INTEGER); - ps.setNull(3, Types.FLOAT); - ps.setNull(4, Types.DOUBLE); - ps.setNull(5, Types.CHAR); - ps.setNull(6, Types.VARCHAR); - ps.setNull(7, Types.BOOLEAN); - ps.setNull(8, Types.DATE); - ps.setNull(9, Types.TIMESTAMP); - ps.setNull(10, Types.BLOB); - ps.executeUpdate(); - - //build table2 - sql = "DROP TABLE if exists my_table2"; - st.execute(sql); - - sql = "CREATE TABLE my_table2 (" + - "INT_COL2 INT PRIMARY KEY, "+ - "VARCHAR_COL2 VARCHAR(64))"; - st.execute(sql); - sql = "INSERT INTO my_table2 values(0,'sed, do eiusmod tempor')"; - st.execute(sql); - sql = "INSERT INTO my_table2 values(1,'incididunt \nut labore')"; - st.execute(sql); - - c.close(); - } - - private byte[] getByteArray(InputStream is) throws IOException { - ByteArrayOutputStream bos = new ByteArrayOutputStream(); - byte[] buff = new byte[1024]; - for (int bytesRead; (bytesRead = is.read(buff)) != -1;) { - bos.write(buff, 0, bytesRead); - } - return bos.toByteArray(); + xml); } -*/ + // code used for creating the test file + /* + * private Connection getConnection(String dbFileName) throws Exception { File testDirectory = + * new File(this.getClass().getResource("/test-documents").toURI()); + * System.out.println("Writing to: " + testDirectory.getAbsolutePath()); File testDB = new + * File(testDirectory, dbFileName); Connection c = null; try { Class.forName("org.sqlite.JDBC"); + * c = DriverManager.getConnection("jdbc:sqlite:" + testDB.getAbsolutePath()); } catch ( + * Exception e ) { System.err.println( e.getClass().getName() + ": " + e.getMessage() ); + * System.exit(0); } return c; } + * + * @Test public void testCreateDB() throws Exception { Connection c = + * getConnection("testSqlite3d.db"); Statement st = c.createStatement(); String sql = + * "DROP TABLE if exists my_table1"; st.execute(sql); sql = "CREATE TABLE my_table1 (" + + * "PK INT PRIMARY KEY, "+ 
"INT_COL INTEGER, "+ "FLOAT_COL FLOAT, " + "DOUBLE_COL DOUBLE, " + + * "CHAR_COL CHAR(30), "+ "VARCHAR_COL VARCHAR(30), "+ "BOOLEAN_COL BOOLEAN,"+ "DATE_COL DATE,"+ + * "TIME_STAMP_COL TIMESTAMP,"+ "CLOB_COL CLOB, "+ "BYTES_COL BYTES" + ")"; st.execute(sql); sql + * = "insert into my_table1 (PK, INT_COL, FLOAT_COL, DOUBLE_COL, CHAR_COL, " + + * "VARCHAR_COL, BOOLEAN_COL, DATE_COL, TIME_STAMP_COL, CLOB_COL, BYTES_COL) " + + * "values (?,?,?,?,?,?,?,?,?,?,?)"; SimpleDateFormat f = new + * SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); java.util.Date d = f.parse("2015-01-03 15:17:03"); + * System.out.println(d.getTime()); long d1Long = 1420229823000L;// 2015-01-02 15:17:03 long + * d2Long = 1420316223000L;// 2015-01-03 15:17:03 PreparedStatement ps = + * c.prepareStatement(sql); ps.setInt(1, 0); ps.setInt(2, 10); ps.setFloat(3, 2.3f); + * ps.setDouble(4, 2.4d); ps.setString(5, "lorem"); ps.setString(6, "普林斯顿大学"); ps.setBoolean(7, + * true); ps.setString(8, "2015-01-02"); ps.setString(9, "2015-01-03 15:17:03"); // + * ps.setClob(10, new StringReader(sql)); ps.setBytes(10, getByteArray(this.getClass() + * .getResourceAsStream("/test-documents/testWORD_1img.doc")));//contains "quick brown fox" + * ps.executeUpdate(); ps.clearParameters(); + * + * ps.setInt(1, 1); ps.setInt(2, 20); ps.setFloat(3, 4.6f); ps.setDouble(4, 4.8d); + * ps.setString(5, "dolor"); ps.setString(6, "sit"); ps.setBoolean(7, false); ps.setString(8, + * "2015-01-04"); ps.setString(9, "2015-01-03 15:17:03"); //ps.setClob(9, new + * StringReader("consectetur adipiscing elit")); ps.setBytes(10, getByteArray(this.getClass() + * .getResourceAsStream("/test-documents/testWORD_1img.docx")));//contains "The end!" 
+ * + * ps.executeUpdate(); //now add a fully null row ps.clearParameters(); ps.setInt(1, 2); + * ps.setNull(2, Types.INTEGER); ps.setNull(3, Types.FLOAT); ps.setNull(4, Types.DOUBLE); + * ps.setNull(5, Types.CHAR); ps.setNull(6, Types.VARCHAR); ps.setNull(7, Types.BOOLEAN); + * ps.setNull(8, Types.DATE); ps.setNull(9, Types.TIMESTAMP); ps.setNull(10, Types.BLOB); + * ps.executeUpdate(); + * + * //build table2 sql = "DROP TABLE if exists my_table2"; st.execute(sql); + * + * sql = "CREATE TABLE my_table2 (" + "INT_COL2 INT PRIMARY KEY, "+ "VARCHAR_COL2 VARCHAR(64))"; + * st.execute(sql); sql = "INSERT INTO my_table2 values(0,'sed, do eiusmod tempor')"; + * st.execute(sql); sql = "INSERT INTO my_table2 values(1,'incididunt \nut labore')"; + * st.execute(sql); + * + * c.close(); } + * + * private byte[] getByteArray(InputStream is) throws IOException { ByteArrayOutputStream bos = + * new ByteArrayOutputStream(); byte[] buff = new byte[1024]; for (int bytesRead; (bytesRead = + * is.read(buff)) != -1;) { bos.write(buff, 0, bytesRead); } return bos.toByteArray(); } + * + */ } diff --git a/tika-parsers/tika-parsers-extended/tika-parsers-extended-integration-tests/src/test/java/org/apache/tika/mime/TestMimeTypesExtended.java b/tika-parsers/tika-parsers-extended/tika-parsers-extended-integration-tests/src/test/java/org/apache/tika/mime/TestMimeTypesExtended.java index 88c3e3f3b2..ca4de28460 100644 --- a/tika-parsers/tika-parsers-extended/tika-parsers-extended-integration-tests/src/test/java/org/apache/tika/mime/TestMimeTypesExtended.java +++ b/tika-parsers/tika-parsers-extended/tika-parsers-extended-integration-tests/src/test/java/org/apache/tika/mime/TestMimeTypesExtended.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.mime; @@ -21,13 +19,11 @@ import java.io.IOException; import java.io.InputStream; - -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; - import org.apache.tika.config.TikaConfig; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; public class TestMimeTypesExtended { @@ -46,7 +42,7 @@ public void testNetCDF() throws Exception { private void assertTypeByData(String expected, String filename) throws IOException { try (InputStream stream = TikaInputStream.get(TestMimeTypesExtended.class - .getResourceAsStream("/test-documents/" + filename))) { + .getResourceAsStream("/test-documents/" + filename))) { assertNotNull(stream, "Test file not found: " + filename); Metadata metadata = new Metadata(); assertEquals(expected, repo.detect(stream, metadata).toString()); diff --git a/tika-parsers/tika-parsers-extended/tika-parsers-extended-integration-tests/src/test/java/org/apache/tika/parser/ocr/TestOCR.java b/tika-parsers/tika-parsers-extended/tika-parsers-extended-integration-tests/src/test/java/org/apache/tika/parser/ocr/TestOCR.java index 5655bd72df..3810d20e07 100644 --- a/tika-parsers/tika-parsers-extended/tika-parsers-extended-integration-tests/src/test/java/org/apache/tika/parser/ocr/TestOCR.java +++ b/tika-parsers/tika-parsers-extended/tika-parsers-extended-integration-tests/src/test/java/org/apache/tika/parser/ocr/TestOCR.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.parser.ocr; @@ -26,11 +24,6 @@ import java.util.List; import java.util.Map; import java.util.Set; - -import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.api.Test; -import org.xml.sax.SAXException; - import org.apache.tika.TikaTest; import org.apache.tika.config.TikaConfig; import org.apache.tika.exception.TikaException; @@ -43,6 +36,9 @@ import org.apache.tika.parser.Parser; import org.apache.tika.parser.ParserDecorator; import org.apache.tika.parser.gdal.GDALParser; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; +import org.xml.sax.SAXException; public class TestOCR extends TikaTest { @@ -66,16 +62,16 @@ public void testPNG() throws Exception { @Test public void testPNGProgrammatically() throws Exception { - //remove the GDAL parser from the default parser + // remove the GDAL parser from the default parser Parser defaultParser = new DefaultParser(); List parsers = new ArrayList<>(); - for (Parser p : ((CompositeParser)defaultParser).getAllComponentParsers()) { - if (! 
(p instanceof GDALParser)) { + for (Parser p : ((CompositeParser) defaultParser).getAllComponentParsers()) { + if (!(p instanceof GDALParser)) { parsers.add(p); } } - //decorate the gdal parser to exclude these image formats + // decorate the gdal parser to exclude these image formats Set exclude = new HashSet<>(); exclude.add(MediaType.image("png")); exclude.add(MediaType.image("jpeg")); @@ -95,27 +91,27 @@ public void testPNGProgrammatically() throws Exception { public void testOthers() throws Exception { Parser p = loadParser(); if (p instanceof CompositeParser) { - Map parsers = ((CompositeParser)p).getParsers(); + Map parsers = ((CompositeParser) p).getParsers(); Class clz = getParser(MediaType.application("x-netcdf"), parsers); assertEquals(GDALParser.class, clz); } } private Class getParser(MediaType mediaType, Map parsers) { - //this is fragile, but works well enough for a unit test + // this is fragile, but works well enough for a unit test Parser p = parsers.get(mediaType); if (p instanceof CompositeParser) { - return getParser(mediaType, ((CompositeParser)p).getParsers()); + return getParser(mediaType, ((CompositeParser) p).getParsers()); } else if (p instanceof ParserDecorator) { - Parser decorated = ((ParserDecorator)p).getWrappedParser(); + Parser decorated = ((ParserDecorator) p).getWrappedParser(); return decorated.getClass(); } return p.getClass(); } private Parser loadParser() throws IOException, TikaException, SAXException { - try (InputStream is = TestOCR.class.getResourceAsStream( - "/config/tika-config-restricted-gdal.xml")) { + try (InputStream is = TestOCR.class + .getResourceAsStream("/config/tika-config-restricted-gdal.xml")) { TikaConfig tikaConfig = new TikaConfig(is); return new AutoDetectParser(tikaConfig); } diff --git a/tika-parsers/tika-parsers-extended/tika-parsers-extended-integration-tests/src/test/java/org/apache/tika/parser/sqlite3/SQLite3ParserTest.java 
b/tika-parsers/tika-parsers-extended/tika-parsers-extended-integration-tests/src/test/java/org/apache/tika/parser/sqlite3/SQLite3ParserTest.java index 65e752bc1f..08323d4460 100644 --- a/tika-parsers/tika-parsers-extended/tika-parsers-extended-integration-tests/src/test/java/org/apache/tika/parser/sqlite3/SQLite3ParserTest.java +++ b/tika-parsers/tika-parsers-extended/tika-parsers-extended-integration-tests/src/test/java/org/apache/tika/parser/sqlite3/SQLite3ParserTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. 
See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.sqlite3; @@ -25,11 +23,7 @@ import java.io.InputStream; import java.util.ArrayList; import java.util.List; - import org.apache.commons.io.IOUtils; -import org.junit.jupiter.api.Test; -import org.xml.sax.ContentHandler; - import org.apache.tika.TikaTest; import org.apache.tika.extractor.EmbeddedResourceHandler; import org.apache.tika.extractor.ParserContainerExtractor; @@ -45,6 +39,8 @@ import org.apache.tika.sax.BasicContentHandlerFactory; import org.apache.tika.sax.RecursiveParserWrapperHandler; import org.apache.tika.sax.ToXMLContentHandler; +import org.junit.jupiter.api.Test; +import org.xml.sax.ContentHandler; public class SQLite3ParserTest extends TikaTest { @@ -54,14 +50,14 @@ public class SQLite3ParserTest extends TikaTest { @Test public void testBasic() throws Exception { - //test different types of input streams - //actual inputstream, memory buffered bytearray and literal file + // test different types of input streams + // actual inputstream, memory buffered bytearray and literal file try (InputStream stream = getResourceAsStream(TEST_FILE1)) { _testBasic(stream); } try (InputStream is = getResourceAsStream(TEST_FILE1); - ByteArrayOutputStream bos = new ByteArrayOutputStream()) { + ByteArrayOutputStream bos = new ByteArrayOutputStream()) { IOUtils.copy(is, bos); try (InputStream stream = new ByteArrayInputStream(bos.toByteArray())) { _testBasic(stream); @@ -77,25 +73,25 @@ public void testBasic() throws Exception { private void _testBasic(InputStream stream) throws Exception { Metadata metadata = new Metadata(); metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, TEST_FILE_NAME); - //1) getXML closes the stream - //2) getXML runs recursively on the contents, so the embedded docs should show up + // 1) getXML closes the stream + // 2) getXML runs recursively on the contents, so the embedded docs should show up XMLResult 
result = getXML(stream, AUTO_DETECT_PARSER, metadata); String x = result.xml; - //first table name + // first table name assertContains("\t", x); - //non-ascii + // non-ascii assertContains("", x); - //boolean + // boolean assertContains("\t", x); - //date test + // date test assertContains("2015-01-04", x); - //timestamp test + // timestamp test assertContains("2015-01-03 15:17:03", x); - //first embedded doc's image tag + // first embedded doc's image tag assertContains("alt=\"image1.png\"", x); - //second embedded doc's image tag + // second embedded doc's image tag assertContains("alt=\"A description...\"", x); - //second table name + // second table name assertContains("
    PK普林斯顿大学true2015-01-02
    \t", x); Metadata post = result.metadata; @@ -105,7 +101,7 @@ private void _testBasic(InputStream stream) throws Exception { assertEquals("my_table2", tableNames[1]); } - //test what happens if the user does not want embedded docs handled + // test what happens if the user does not want embedded docs handled @Test public void testNotAddingEmbeddedParserToParseContext() throws Exception { ContentHandler handler = new ToXMLContentHandler(); @@ -117,16 +113,14 @@ public void testNotAddingEmbeddedParserToParseContext() throws Exception { AUTO_DETECT_PARSER.parse(is, handler, metadata, parseContext); } String xml = handler.toString(); - //just includes headers for embedded documents + // just includes headers for embedded documents assertContains("
    INT_COL2
    ", xml); - assertContains( - "
    " + - "

    BYTES_COL_0.doc

    ", - xml); - //but no other content + assertContains("
    " + + "

    BYTES_COL_0.doc

    ", xml); + // but no other content assertNotContained("dog", xml); assertNotContained("alt=\"image1.png\"", xml); - //second embedded doc's image tag + // second embedded doc's image tag assertNotContained("alt=\"A description...\"", xml); } @@ -135,8 +129,9 @@ public void testRecursiveParserWrapper() throws Exception { RecursiveParserWrapper wrapper = new RecursiveParserWrapper(AUTO_DETECT_PARSER); Metadata metadata = new Metadata(); - RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler( - new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.BODY, -1)); + RecursiveParserWrapperHandler handler = + new RecursiveParserWrapperHandler(new BasicContentHandlerFactory( + BasicContentHandlerFactory.HANDLER_TYPE.BODY, -1)); try (InputStream is = getResourceAsStream(TEST_FILE1)) { metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, TEST_FILE_NAME); @@ -144,32 +139,32 @@ public void testRecursiveParserWrapper() throws Exception { } List metadataList = handler.getMetadataList(); assertEquals(5, metadataList.size()); - //make sure the \t are inserted in a body handler + // make sure the \t are inserted in a body handler String table = metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT); assertContains("0\t2.3\t2.4\tlorem", table); assertContains("普林斯顿大学", table); - //make sure the \n is inserted + // make sure the \n is inserted String table2 = metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT); assertContains("do eiusmod tempor\n", table2); assertContains("The quick brown fox", - metadataList.get(2).get(TikaCoreProperties.TIKA_CONTENT)); + metadataList.get(2).get(TikaCoreProperties.TIKA_CONTENT)); assertContains("The quick brown fox", - metadataList.get(4).get(TikaCoreProperties.TIKA_CONTENT)); + metadataList.get(4).get(TikaCoreProperties.TIKA_CONTENT)); - //confirm .doc was added to blob + // confirm .doc was added to blob assertEquals("/BYTES_COL_0.doc/image1.png", - 
metadataList.get(1).get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH)); + metadataList.get(1).get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH)); } @Test public void testParserContainerExtractor() throws Exception { - //There should be 6 embedded documents: - //2x tables -- UTF-8 csv representations of the tables - //2x word files, one doc and one docx - //2x png files, the same image embedded in each of the doc and docx + // There should be 6 embedded documents: + // 2x tables -- UTF-8 csv representations of the tables + // 2x word files, one doc and one docx + // 2x png files, the same image embedded in each of the doc and docx ParserContainerExtractor ex = new ParserContainerExtractor(); ByteCopyingHandler byteCopier = new ByteCopyingHandler(); @@ -185,10 +180,9 @@ public void testParserContainerExtractor() throws Exception { String s = new String(byteArr, 0, Math.min(byteArr.length, 1000), UTF_8); strings[i] = s; } - byte[] oleBytes = - new byte[]{(byte) -48, (byte) -49, (byte) 17, (byte) -32, (byte) -95, (byte) -79, - (byte) 26, (byte) -31, (byte) 0, (byte) 0,}; - //test OLE + byte[] oleBytes = new byte[] {(byte) -48, (byte) -49, (byte) 17, (byte) -32, (byte) -95, + (byte) -79, (byte) 26, (byte) -31, (byte) 0, (byte) 0,}; + // test OLE for (int i = 0; i < 10; i++) { assertEquals(oleBytes[i], byteCopier.bytes.get(0)[i]); } @@ -197,13 +191,13 @@ public void testParserContainerExtractor() throws Exception { assertContains("PNG", strings[3]); } - //This confirms that reading the stream twice is not - //quadrupling the number of attachments. + // This confirms that reading the stream twice is not + // quadrupling the number of attachments. 
@Test public void testInputStreamReset() throws Exception { - //There should be 8 embedded documents: - //4x word files, two docs and two docxs - //4x png files, the same image embedded in each of the doc and docx + // There should be 8 embedded documents: + // 4x word files, two docs and two docxs + // 4x png files, the same image embedded in each of the doc and docx ParserContainerExtractor ex = new ParserContainerExtractor(); InputStreamResettingHandler byteCopier = new InputStreamResettingHandler(); @@ -234,13 +228,13 @@ public void handle(String filename, MediaType mediaType, InputStream stream) { IOUtils.copy(stream, os); bytes.add(os.toByteArray()); stream.reset(); - //now try again + // now try again os.reset(); IOUtils.copy(stream, os); bytes.add(os.toByteArray()); stream.reset(); } catch (IOException e) { - //swallow + // swallow } } } diff --git a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESAnnotationProperty.java b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESAnnotationProperty.java index caf32004c0..eedc9f16fe 100644 --- a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESAnnotationProperty.java +++ b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESAnnotationProperty.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. 
See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.ctakes; @@ -22,9 +20,12 @@ * This enumeration includes the properties that an {@link IdentifiedAnnotation} object can provide. 
*/ public enum CTAKESAnnotationProperty { - BEGIN("start"), END("end"), CONDITIONAL("conditional"), CONFIDENCE("confidence"), - DISCOVERY_TECNIQUE("discoveryTechnique"), GENERIC("generic"), HISTORY_OF("historyOf"), ID("id"), - ONTOLOGY_CONCEPT_ARR("ontologyConceptArr"), POLARITY("polarity"); + BEGIN("start"), END("end"), CONDITIONAL("conditional"), CONFIDENCE( + "confidence"), DISCOVERY_TECNIQUE("discoveryTechnique"), GENERIC( + "generic"), HISTORY_OF("historyOf"), ID( + "id"), ONTOLOGY_CONCEPT_ARR( + "ontologyConceptArr"), POLARITY( + "polarity"); private String name; diff --git a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESConfig.java b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESConfig.java index 761e4d6e38..3877ad24ec 100644 --- a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESConfig.java +++ b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESConfig.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.ctakes; @@ -21,7 +19,6 @@ import java.io.OutputStream; import java.io.Serializable; import java.util.Properties; - import org.apache.commons.io.output.NullOutputStream; /** @@ -37,7 +34,7 @@ public class CTAKESConfig implements Serializable { // Path to XML descriptor for AnalysisEngine private String aeDescriptorPath = - "/ctakes-core/desc/analysis_engine/SentencesAndTokensAggregate.xml"; + "/ctakes-core/desc/analysis_engine/SentencesAndTokensAggregate.xml"; // UMLS username private String UMLSUser = ""; @@ -108,10 +105,10 @@ private void init(InputStream stream) { setUMLSPass(props.getProperty("UMLSPass", getUMLSPass())); setText(Boolean.parseBoolean(props.getProperty("text", Boolean.toString(isText())))); setMetadata(props.getProperty("metadata", getMetadataAsString()).split(",")); - setAnnotationProps( - props.getProperty("annotationProps", getAnnotationPropsAsString()).split(",")); + setAnnotationProps(props.getProperty("annotationProps", getAnnotationPropsAsString()) + .split(",")); setSeparatorChar(props.getProperty("separatorChar", Character.toString(getSeparatorChar())) - .charAt(0)); + .charAt(0)); } /** @@ -277,11 +274,11 @@ public void 
setMetadata(String[] metadata) { } /** - * Returns a string containing a comma-separated list of metadata whose - * values will be analyzed using cTAKES. + * Returns a string containing a comma-separated list of metadata whose values will be analyzed + * using cTAKES. * - * @return a string containing a comma-separated list of metadata whose - * values will be analyzed using cTAKES. + * @return a string containing a comma-separated list of metadata whose values will be analyzed + * using cTAKES. */ public String getMetadataAsString() { if (metadata == null) { @@ -298,11 +295,11 @@ public String getMetadataAsString() { } /** - * Returns an array of {@link CTAKESAnnotationProperty}'s that will be - * included into cTAKES metadata. + * Returns an array of {@link CTAKESAnnotationProperty}'s that will be included into cTAKES + * metadata. * - * @return an array of {@link CTAKESAnnotationProperty}'s that will be - * included into cTAKES metadata. + * @return an array of {@link CTAKESAnnotationProperty}'s that will be included into cTAKES + * metadata. */ public CTAKESAnnotationProperty[] getAnnotationProps() { return annotationProps; @@ -311,8 +308,8 @@ public CTAKESAnnotationProperty[] getAnnotationProps() { /** * Sets the {@link CTAKESAnnotationProperty}'s that will be included into cTAKES metadata. * - * @param annotationProps the {@link CTAKESAnnotationProperty}'s that will - * be included into cTAKES metadata. + * @param annotationProps the {@link CTAKESAnnotationProperty}'s that will be included into + * cTAKES metadata. */ public void setAnnotationProps(CTAKESAnnotationProperty[] annotationProps) { this.annotationProps = annotationProps; @@ -321,12 +318,12 @@ public void setAnnotationProps(CTAKESAnnotationProperty[] annotationProps) { /** * ets the {@link CTAKESAnnotationProperty}'s that will be included into cTAKES metadata. * - * @param annotationProps the {@link CTAKESAnnotationProperty}'s that will be - * included into cTAKES metadata. 
+ * @param annotationProps the {@link CTAKESAnnotationProperty}'s that will be included into + * cTAKES metadata. */ public void setAnnotationProps(String[] annotationProps) { CTAKESAnnotationProperty[] properties = - new CTAKESAnnotationProperty[annotationProps.length]; + new CTAKESAnnotationProperty[annotationProps.length]; for (int i = 0; i < annotationProps.length; i++) { properties[i] = CTAKESAnnotationProperty.valueOf(annotationProps[i]); } @@ -334,8 +331,8 @@ public void setAnnotationProps(String[] annotationProps) { } /** - * Returns a string containing a comma-separated list of {@link CTAKESAnnotationProperty} - * names that will be included into cTAKES metadata. + * Returns a string containing a comma-separated list of {@link CTAKESAnnotationProperty} names + * that will be included into cTAKES metadata. * * @return */ diff --git a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESContentHandler.java b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESContentHandler.java index 1ee5476060..95205c6d0f 100644 --- a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESContentHandler.java +++ b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESContentHandler.java @@ -1,24 +1,23 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. 
See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.ctakes; import java.util.Collection; - import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.sax.ContentHandlerDecorator; import org.apache.uima.analysis_engine.AnalysisEngine; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; @@ -26,16 +25,13 @@ import org.xml.sax.SAXException; import org.xml.sax.helpers.DefaultHandler; -import org.apache.tika.metadata.Metadata; -import org.apache.tika.sax.ContentHandlerDecorator; - /** * Class used to extract biomedical information while parsing. * *

    - * This class relies on Apache cTAKES - * that is a natural language processing system for extraction of information - * from electronic medical record clinical free-text. + * This class relies on Apache cTAKES that is a natural + * language processing system for extraction of information from electronic medical record clinical + * free-text. *

    */ public class CTAKESContentHandler extends ContentHandlerDecorator { @@ -58,13 +54,13 @@ public class CTAKESContentHandler extends ContentHandlerDecorator { private JCas jcas = null; /** - * Creates a new {@link CTAKESContentHandler} for the given {@link ContentHandler} - * and Metadata objects. + * Creates a new {@link CTAKESContentHandler} for the given {@link ContentHandler} and Metadata + * objects. * - * @param handler the {@link ContentHandler} object to be decorated. - * @param metadata the {@link Metadata} object that will be populated using - * biomedical information extracted by cTAKES. - * @param config the {@link CTAKESConfig} object used to configure the handler. + * @param handler the {@link ContentHandler} object to be decorated. + * @param metadata the {@link Metadata} object that will be populated using biomedical + * information extracted by cTAKES. + * @param config the {@link CTAKESConfig} object used to configure the handler. */ public CTAKESContentHandler(ContentHandler handler, Metadata metadata, CTAKESConfig config) { super(handler); @@ -74,12 +70,12 @@ public CTAKESContentHandler(ContentHandler handler, Metadata metadata, CTAKESCon } /** - * Creates a new {@link CTAKESContentHandler} for the given {@link - * ContentHandler} and Metadata objects. + * Creates a new {@link CTAKESContentHandler} for the given {@link ContentHandler} and Metadata + * objects. * - * @param handler the {@link ContentHandler} object to be decorated. - * @param metadata the {@link Metadata} object that will be populated using - * biomedical information extracted by cTAKES. + * @param handler the {@link ContentHandler} object to be decorated. + * @param metadata the {@link Metadata} object that will be populated using biomedical + * information extracted by cTAKES. 
*/ public CTAKESContentHandler(ContentHandler handler, Metadata metadata) { this(handler, metadata, new CTAKESConfig()); @@ -105,9 +101,8 @@ public void endDocument() throws SAXException { try { // create an Analysis Engine if (ae == null) { - ae = CTAKESUtils - .getAnalysisEngine(config.getAeDescriptorPath(), config.getUMLSUser(), - config.getUMLSPass()); + ae = CTAKESUtils.getAnalysisEngine(config.getAeDescriptorPath(), + config.getUMLSUser(), config.getUMLSPass()); } // create a JCas, given an AE @@ -135,25 +130,25 @@ public void endDocument() throws SAXException { metadata.add(CTAKES_META_PREFIX + "schema", config.getAnnotationPropsAsString()); CTAKESAnnotationProperty[] annotationPros = config.getAnnotationProps(); Collection collection = - JCasUtil.select(jcas, IdentifiedAnnotation.class); + JCasUtil.select(jcas, IdentifiedAnnotation.class); for (IdentifiedAnnotation annotation : collection) { StringBuilder annotationBuilder = new StringBuilder(); annotationBuilder.append(annotation.getCoveredText()); if (annotationPros != null) { for (CTAKESAnnotationProperty property : annotationPros) { annotationBuilder.append(config.getSeparatorChar()); - annotationBuilder - .append(CTAKESUtils.getAnnotationProperty(annotation, property)); + annotationBuilder.append( + CTAKESUtils.getAnnotationProperty(annotation, property)); } } metadata.add(CTAKES_META_PREFIX + annotation.getType().getShortName(), - annotationBuilder.toString()); + annotationBuilder.toString()); } if (config.isSerialize()) { // serialize data CTAKESUtils.serialize(jcas, config.getSerializerType(), config.isPrettyPrint(), - config.getOutputStream()); + config.getOutputStream()); } } catch (Exception e) { throw new SAXException(e.getMessage()); diff --git a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESParser.java b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESParser.java index 
09d7af8aae..33ebbd44f7 100644 --- a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESParser.java +++ b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESParser.java @@ -1,27 +1,21 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.parser.ctakes; import java.io.IOException; import java.io.InputStream; - -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.config.TikaConfig; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; @@ -29,16 +23,17 @@ import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.parser.ParserDecorator; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; /** - * CTAKESParser decorates a {@link Parser} and leverages on - * {@link CTAKESContentHandler} to extract biomedical information from - * clinical text using Apache cTAKES. - *

    It is normally called by supplying an instance to - * {@link AutoDetectParser}, such as: + * CTAKESParser decorates a {@link Parser} and leverages on {@link CTAKESContentHandler} to extract + * biomedical information from clinical text using Apache cTAKES. + *

    + * It is normally called by supplying an instance to {@link AutoDetectParser}, such as: * AutoDetectParser parser = new AutoDetectParser(new CTAKESParser()); - *

    It can also be used by giving a Tika Config file similar to: - * + *

    + * It can also be used by giving a Tika Config file similar to: * * * @@ -47,8 +42,9 @@ * * * - *

    Because this is a Parser Decorator, and not a normal Parser in - * it's own right, it isn't normally selected via the Parser Service Loader. + *

    + * Because this is a Parser Decorator, and not a normal Parser in it's own right, it isn't normally + * selected via the Parser Service Loader. */ public class CTAKESParser extends ParserDecorator { /** @@ -79,13 +75,13 @@ public CTAKESParser(Parser parser) { @Override public void parse(InputStream stream, ContentHandler handler, Metadata metadata, - ParseContext context) throws IOException, SAXException, TikaException { + ParseContext context) throws IOException, SAXException, TikaException { CTAKESConfig config = context.get(CTAKESConfig.class, new CTAKESConfig()); CTAKESContentHandler ctakesHandler = new CTAKESContentHandler(handler, metadata, config); super.parse(stream, ctakesHandler, metadata, context); } - //@Override + // @Override public String getDecorationName() { return "CTakes"; } diff --git a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESSerializer.java b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESSerializer.java index 1968628668..5bd4088ec1 100644 --- a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESSerializer.java +++ b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESSerializer.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. 
See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.ctakes; @@ -26,8 +24,8 @@ * A CAS serializer writes a CAS in the given format. 
*/ public enum CTAKESSerializer { - XCAS(XCASSerializer.class.getName()), XMI(XmiCasSerializer.class.getName()), - XML(XmlCasSerializer.class.getName()); + XCAS(XCASSerializer.class.getName()), XMI(XmiCasSerializer.class.getName()), XML( + XmlCasSerializer.class.getName()); private final String className; diff --git a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESUtils.java b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESUtils.java index 593193c5c8..0018527303 100644 --- a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESUtils.java +++ b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/ctakes/CTAKESUtils.java @@ -1,25 +1,22 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.ctakes; import java.io.IOException; import java.io.OutputStream; import java.net.URISyntaxException; - import org.apache.ctakes.typesystem.type.refsem.UmlsConcept; import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation; import org.apache.uima.UIMAFramework; @@ -37,13 +34,12 @@ import org.xml.sax.SAXException; /** - * This class provides methods to extract biomedical information from plain text - * using {@link CTAKESContentHandler} that relies on Apache cTAKES. + * This class provides methods to extract biomedical information from plain text using + * {@link CTAKESContentHandler} that relies on Apache cTAKES. * *

    - * Apache cTAKES is built on top of Apache - * UIMA framework and OpenNLP - * toolkit. + * Apache cTAKES is built on top of Apache UIMA framework and + * OpenNLP toolkit. *

    */ public class CTAKESUtils { @@ -54,49 +50,45 @@ public class CTAKESUtils { private final static String CTAKES_UMLS_PASS = "ctakes.umlspw"; /** - * Returns a new UIMA Analysis Engine (AE). This method ensures that only - * one instance of an AE is created. + * Returns a new UIMA Analysis Engine (AE). This method ensures that only one instance of an AE + * is created. * *

    - * An Analysis Engine is a component responsible for analyzing unstructured - * information, discovering and representing semantic content. Unstructured - * information includes, but is not restricted to, text documents. + * An Analysis Engine is a component responsible for analyzing unstructured information, + * discovering and representing semantic content. Unstructured information includes, but is not + * restricted to, text documents. *

    * - * @param aeDescriptor pathname for XML file including an AnalysisEngineDescription - * that contains all of the information needed to instantiate and - * use an AnalysisEngine. - * @param umlsUser UMLS username for NLM database - * @param umlsPass UMLS password for NLM database + * @param aeDescriptor pathname for XML file including an AnalysisEngineDescription that + * contains all of the information needed to instantiate and use an AnalysisEngine. + * @param umlsUser UMLS username for NLM database + * @param umlsPass UMLS password for NLM database * @return an Analysis Engine for analyzing unstructured information. - * @throws IOException if any I/O error occurs. - * @throws InvalidXMLException if the input XML is not valid or does not - * specify a valid ResourceSpecifier. + * @throws IOException if any I/O error occurs. + * @throws InvalidXMLException if the input XML is not valid or does not specify a valid + * ResourceSpecifier. * @throws ResourceInitializationException if a failure occurred during production of the - * resource. - * @throws URISyntaxException if URL of the resource is not formatted - * strictly according to RFC2396 and cannot be - * converted to a URI. + * resource. + * @throws URISyntaxException if URL of the resource is not formatted strictly according to + * RFC2396 and cannot be converted to a URI. */ public static AnalysisEngine getAnalysisEngine(String aeDescriptor, String umlsUser, - String umlsPass) - throws IOException, InvalidXMLException, ResourceInitializationException, - URISyntaxException { + String umlsPass) throws IOException, InvalidXMLException, + ResourceInitializationException, URISyntaxException { // UMLS user ID and password. 
String aeDescriptorPath = CTAKESUtils.class.getResource(aeDescriptor).toURI().getPath(); // get Resource Specifier from XML XMLInputSource aeIputSource = new XMLInputSource(aeDescriptorPath); ResourceSpecifier aeSpecifier = - UIMAFramework.getXMLParser().parseResourceSpecifier(aeIputSource); + UIMAFramework.getXMLParser().parseResourceSpecifier(aeIputSource); // UMLS user ID and password - if ((umlsUser != null) && (!umlsUser.isEmpty()) && (umlsPass != null) && - (!umlsPass.isEmpty())) { + if ((umlsUser != null) && (!umlsUser.isEmpty()) && (umlsPass != null) + && (!umlsPass.isEmpty())) { /* - * It is highly recommended that you change UMLS credentials in the - * XML configuration file instead of giving user and password using - * CTAKESConfig. + * It is highly recommended that you change UMLS credentials in the XML configuration + * file instead of giving user and password using CTAKESConfig. */ System.setProperty(CTAKES_UMLS_USER, umlsUser); System.setProperty(CTAKES_UMLS_PASS, umlsPass); @@ -108,24 +100,21 @@ public static AnalysisEngine getAnalysisEngine(String aeDescriptor, String umlsU } /** - * Returns a new JCas () appropriate for the given Analysis Engine. This - * method ensures that only one instance of a JCas is created. A Jcas is a - * Java Cover Classes based Object-oriented CAS (Common Analysis System) - * API. + * Returns a new JCas () appropriate for the given Analysis Engine. This method ensures that + * only one instance of a JCas is created. A Jcas is a Java Cover Classes based Object-oriented + * CAS (Common Analysis System) API. * *

    - * Important: It is highly recommended that you reuse CAS objects rather - * than creating new CAS objects prior to each analysis. This is because CAS - * objects may be expensive to create and may consume a significant amount - * of memory. + * Important: It is highly recommended that you reuse CAS objects rather than creating new CAS + * objects prior to each analysis. This is because CAS objects may be expensive to create and + * may consume a significant amount of memory. *

    * * @param ae AnalysisEngine used to create an appropriate JCas object. * @return a JCas object appropriate for the given AnalysisEngine. * @throws ResourceInitializationException if a CAS could not be created because this - * AnalysisEngine's CAS metadata (type system, type - * priorities, or FS indexes) - * are invalid. + * AnalysisEngine's CAS metadata (type system, type priorities, or FS indexes) are + * invalid. */ public static JCas getJCas(AnalysisEngine ae) throws ResourceInitializationException { @@ -135,21 +124,21 @@ public static JCas getJCas(AnalysisEngine ae) throws ResourceInitializationExcep /** * Serializes a CAS in the given format. * - * @param jcas CAS (Common Analysis System) to be serialized. - * @param type type of cTAKES (UIMA) serializer used to write CAS. + * @param jcas CAS (Common Analysis System) to be serialized. + * @param type type of cTAKES (UIMA) serializer used to write CAS. * @param prettyPrint {@code true} to do pretty printing of output. - * @param stream {@link OutputStream} object used to print out information - * extracted by using cTAKES. + * @param stream {@link OutputStream} object used to print out information extracted by using + * cTAKES. * @throws SAXException if there was a SAX exception. - * @throws IOException if any I/O error occurs. + * @throws IOException if any I/O error occurs. 
*/ public static void serialize(JCas jcas, CTAKESSerializer type, boolean prettyPrint, - OutputStream stream) throws SAXException, IOException { + OutputStream stream) throws SAXException, IOException { if (type == CTAKESSerializer.XCAS) { XCASSerializer.serialize(jcas.getCas(), stream, prettyPrint); } else if (type == CTAKESSerializer.XMI) { XmiCasSerializer.serialize(jcas.getCas(), jcas.getTypeSystem(), stream, prettyPrint, - new XmiSerializationSharedData()); + new XmiSerializationSharedData()); } else { XmlCasSerializer.serialize(jcas.getCas(), jcas.getTypeSystem(), stream); } @@ -159,12 +148,11 @@ public static void serialize(JCas jcas, CTAKESSerializer type, boolean prettyPri * Returns the annotation value based on the given annotation type. * * @param annotation {@link IdentifiedAnnotation} object. - * @param property {@link CTAKESAnnotationProperty} enum used to identify the - * annotation type. + * @param property {@link CTAKESAnnotationProperty} enum used to identify the annotation type. * @return the annotation value. */ public static String getAnnotationProperty(IdentifiedAnnotation annotation, - CTAKESAnnotationProperty property) { + CTAKESAnnotationProperty property) { String value = null; if (property == CTAKESAnnotationProperty.BEGIN) { value = Integer.toString(annotation.getBegin()); @@ -189,9 +177,9 @@ public static String getAnnotationProperty(IdentifiedAnnotation annotation, for (int i = 0; i < mentions.size(); i++) { if (mentions.get(i) instanceof UmlsConcept) { UmlsConcept concept = (UmlsConcept) mentions.get(i); - sb.append("cui=").append(concept.getCui()).append(","). - append(concept.getCodingScheme()).append("="). 
- append(concept.getCode()); + sb.append("cui=").append(concept.getCui()).append(",") + .append(concept.getCodingScheme()).append("=") + .append(concept.getCode()); if (i < mentions.size() - 1) { sb.append(","); } @@ -201,18 +189,17 @@ public static String getAnnotationProperty(IdentifiedAnnotation annotation, value = sb.toString(); } else if (property == CTAKESAnnotationProperty.POLARITY) { String polarity_pref = "POLARITY"; - value = new StringBuilder(polarity_pref).append("="). - append(Integer.toString(annotation.getPolarity())).toString(); + value = new StringBuilder(polarity_pref).append("=") + .append(Integer.toString(annotation.getPolarity())).toString(); } return value; } /** - * Resets cTAKES objects, if created. This method ensures that new cTAKES - * objects (a.k.a., Analysis Engine and JCas) will be created if getters of - * this class are called. + * Resets cTAKES objects, if created. This method ensures that new cTAKES objects (a.k.a., + * Analysis Engine and JCas) will be created if getters of this class are called. * - * @param ae UIMA Analysis Engine + * @param ae UIMA Analysis Engine * @param jcas JCas object */ public static void reset(AnalysisEngine ae, JCas jcas) { @@ -236,8 +223,7 @@ public static void resetCAS(JCas jcas) { } /** - * Resets the AE (AnalysisEngine), releasing all resources held by the - * current AE. + * Resets the AE (AnalysisEngine), releasing all resources held by the current AE. 
* * @param ae UIMA Analysis Engine */ diff --git a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java index cfa52e7e25..f1b5c480c0 100644 --- a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java +++ b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright owlocationNameEntitieship. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright owlocationNameEntitieship. The ASF licenses this file to You under the Apache License, + * Version 2.0 (the "License"); you may not use this file except in compliance with the License. You + * may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
+ * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.geo.topic; @@ -27,14 +25,8 @@ import java.util.List; import java.util.Map; import java.util.Set; - import opennlp.tools.namefind.NameFinderME; import opennlp.tools.namefind.TokenNameFinderModel; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.config.Field; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; @@ -43,6 +35,10 @@ import org.apache.tika.parser.Parser; import org.apache.tika.parser.geo.topic.gazetteer.GeoGazetteerClient; import org.apache.tika.parser.geo.topic.gazetteer.Location; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; public class GeoParser implements Parser { private static final long serialVersionUID = -2241391757440215491L; @@ -81,7 +77,7 @@ public void initialize(GeoParserConfig geoParserConfig) { gazetteerClient = new GeoGazetteerClient(geoParserConfig); // Check if the NER model is available, and if the - // lucene-geo-gazetteer is available + // lucene-geo-gazetteer is available this.available = modelUrl != null && gazetteerClient.checkAvail(); if (this.available) { @@ -98,7 +94,7 @@ public void initialize(GeoParserConfig geoParserConfig) { @Override public void parse(InputStream stream, ContentHandler handler, Metadata metadata, - ParseContext context) throws IOException, SAXException, TikaException { + ParseContext context) throws IOException, SAXException, TikaException { /*----------------configure this parser by ParseContext Object---------------------*/ diff 
--git a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/geo/topic/GeoParserConfig.java b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/geo/topic/GeoParserConfig.java index 107f10bdb9..3dc00fe1a3 100644 --- a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/geo/topic/GeoParserConfig.java +++ b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/geo/topic/GeoParserConfig.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
+ * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.geo.topic; @@ -24,7 +22,6 @@ import java.net.MalformedURLException; import java.net.URL; import java.util.Properties; - import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/geo/topic/GeoTag.java b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/geo/topic/GeoTag.java index f0dd7131e5..5d5afc6e2d 100644 --- a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/geo/topic/GeoTag.java +++ b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/geo/topic/GeoTag.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.geo.topic; @@ -20,7 +18,6 @@ import java.util.ArrayList; import java.util.List; import java.util.Map; - import org.apache.tika.parser.geo.topic.gazetteer.Location; public class GeoTag { @@ -43,8 +40,7 @@ public void addAlternative(GeoTag geotag) { * * @param resolvedGeonames resolved entities * - * @param bestNER best name entity among all the extracted entities for the - * input stream + * @param bestNER best name entity among all the extracted entities for the input stream */ public void toGeoTag(Map> resolvedGeonames, String bestNER) { diff --git a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/geo/topic/NameEntityExtractor.java b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/geo/topic/NameEntityExtractor.java index 8ba75eb428..5d98e93f03 100644 --- a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/geo/topic/NameEntityExtractor.java +++ b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/geo/topic/NameEntityExtractor.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation 
(ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.geo.topic; @@ -28,7 +26,6 @@ import java.util.HashMap; import java.util.List; import java.util.Map; - import opennlp.tools.namefind.NameFinderME; import opennlp.tools.util.Span; import org.apache.commons.io.IOUtils; @@ -47,9 +44,8 @@ public NameEntityExtractor(NameFinderME nameFinder) throws IOException { } /* - * Use OpenNLP to extract location names that's appearing in the steam. 
- * OpenNLP's default Name Finder accuracy is not very good, please refer to - * its documentation. + * Use OpenNLP to extract location names that's appearing in the steam. OpenNLP's default Name + * Finder accuracy is not very good, please refer to its documentation. * * @param stream stream that passed from this.parse() */ @@ -57,12 +53,12 @@ public void getAllNameEntitiesfromInput(InputStream stream) throws IOException { String[] in = IOUtils.toString(stream, UTF_8).split(" "); Span[] nameE; - //name finder is not thread safe - //https://opennlp.apache.org/documentation/1.5.2-incubating/manual/ + // name finder is not thread safe + // https://opennlp.apache.org/documentation/1.5.2-incubating/manual/ // opennlp.html#tools.namefind synchronized (nameFinder) { nameE = nameFinder.find(in); - //the same name finder is reused, so clear adaptive data + // the same name finder is reused, so clear adaptive data nameFinder.clearAdaptiveData(); } @@ -79,12 +75,11 @@ public void getAllNameEntitiesfromInput(InputStream stream) throws IOException { } /* - * Get the best location entity extracted from the input stream. Simply - * return the most frequent entity, If there several highest frequent - * entity, pick one randomly. May not be the optimal solution, but works. + * Get the best location entity extracted from the input stream. Simply return the most frequent + * entity, If there several highest frequent entity, pick one randomly. May not be the optimal + * solution, but works. 
* - * @param locationNameEntities OpenNLP name finder's results, stored in - * ArrayList + * @param locationNameEntities OpenNLP name finder's results, stored in ArrayList */ public void getBestNameEntity() { if (this.locationNameEntities.size() == 0) { diff --git a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/geo/topic/gazetteer/GeoGazetteerClient.java b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/geo/topic/gazetteer/GeoGazetteerClient.java index a09af950f6..d81cbdf0bd 100644 --- a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/geo/topic/gazetteer/GeoGazetteerClient.java +++ b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/geo/topic/gazetteer/GeoGazetteerClient.java @@ -1,40 +1,36 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.geo.topic.gazetteer; +import com.google.gson.Gson; +import com.google.gson.reflect.TypeToken; import java.lang.reflect.Type; import java.nio.charset.StandardCharsets; import java.util.List; import java.util.Map; - -import com.google.gson.Gson; -import com.google.gson.reflect.TypeToken; import org.apache.commons.io.IOUtils; import org.apache.http.HttpResponse; import org.apache.http.client.methods.HttpGet; import org.apache.http.client.utils.URIBuilder; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClientBuilder; +import org.apache.tika.parser.geo.topic.GeoParserConfig; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.apache.tika.parser.geo.topic.GeoParserConfig; - public class GeoGazetteerClient { private static final String SEARCH_API = "/api/search"; @@ -73,11 +69,11 @@ public Map> getLocations(List locations) { HttpGet httpGet = new HttpGet(uri.build()); HttpResponse resp = httpClient.execute(httpGet); - String respJson = IOUtils.toString(resp.getEntity().getContent(), StandardCharsets.UTF_8); + String respJson = + IOUtils.toString(resp.getEntity().getContent(), StandardCharsets.UTF_8); 
- @SuppressWarnings("serial") Type typeDef = - new TypeToken>>() { - }.getType(); + @SuppressWarnings("serial") + Type typeDef = new TypeToken>>() {}.getType(); return new Gson().fromJson(respJson, typeDef); diff --git a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/geo/topic/gazetteer/Location.java b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/geo/topic/gazetteer/Location.java index b5a165e914..511e11a460 100644 --- a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/geo/topic/gazetteer/Location.java +++ b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/geo/topic/gazetteer/Location.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.geo.topic.gazetteer; diff --git a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/journal/GrobidRESTParser.java b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/journal/GrobidRESTParser.java index 08968989eb..c7906ae10c 100644 --- a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/journal/GrobidRESTParser.java +++ b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/journal/GrobidRESTParser.java @@ -1,41 +1,37 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.journal; +import jakarta.ws.rs.core.MediaType; +import jakarta.ws.rs.core.Response; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.util.Properties; - -import jakarta.ws.rs.core.MediaType; -import jakarta.ws.rs.core.Response; import org.apache.cxf.jaxrs.client.WebClient; import org.apache.cxf.jaxrs.ext.multipart.Attachment; import org.apache.cxf.jaxrs.ext.multipart.ContentDisposition; import org.apache.cxf.jaxrs.ext.multipart.MultipartBody; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.xml.sax.ContentHandler; - import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.xml.sax.ContentHandler; public class GrobidRESTParser { @@ -66,8 +62,8 @@ public GrobidRESTParser() { private static String readRestUrl() throws IOException { Properties grobidProperties = new Properties(); - grobidProperties - .load(GrobidRESTParser.class.getResourceAsStream("GrobidExtractor.properties")); + grobidProperties.load( + 
GrobidRESTParser.class.getResourceAsStream("GrobidExtractor.properties")); return grobidProperties.getProperty("grobid.server.url"); } @@ -79,27 +75,27 @@ protected static boolean canRun() { String resp = response.readEntity(String.class); return resp != null && !resp.equals("") && resp.startsWith("true"); } catch (Exception e) { - //swallow...can't run + // swallow...can't run return false; } } public void parse(String filePath, ContentHandler handler, Metadata metadata, - ParseContext context) throws FileNotFoundException { + ParseContext context) throws FileNotFoundException { File pdfFile = new File(filePath); ContentDisposition cd = new ContentDisposition( - "form-data; name=\"input\"; filename=\"" + pdfFile.getName() + "\""); + "form-data; name=\"input\"; filename=\"" + pdfFile.getName() + "\""); Attachment att = new Attachment("input", new FileInputStream(pdfFile), cd); MultipartBody body = new MultipartBody(att); try { checkMode(); - Response response = WebClient.create(restHostUrlStr + - (legacyMode ? GROBID_LEGACY_PROCESSHEADER_PATH : GROBID_PROCESSHEADER_PATH)) - .accept(MediaType.APPLICATION_XML) - .type(MediaType.MULTIPART_FORM_DATA) - .post(body); + Response response = WebClient + .create(restHostUrlStr + (legacyMode ? 
GROBID_LEGACY_PROCESSHEADER_PATH + : GROBID_PROCESSHEADER_PATH)) + .accept(MediaType.APPLICATION_XML).type(MediaType.MULTIPART_FORM_DATA) + .post(body); String resp = response.readEntity(String.class); diff --git a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/journal/JournalParser.java b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/journal/JournalParser.java index 92f3fffaef..da74ed4f49 100644 --- a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/journal/JournalParser.java +++ b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/journal/JournalParser.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.journal; @@ -23,10 +21,6 @@ import java.io.InputStream; import java.util.Collections; import java.util.Set; - -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.exception.TikaException; import org.apache.tika.io.TemporaryResources; import org.apache.tika.io.TikaInputStream; @@ -35,6 +29,8 @@ import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.parser.pdf.PDFParser; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; public class JournalParser implements Parser { @@ -52,7 +48,7 @@ public Set getSupportedTypes(ParseContext context) { } public void parse(InputStream stream, ContentHandler handler, Metadata metadata, - ParseContext context) throws IOException, SAXException, TikaException { + ParseContext context) throws IOException, SAXException, TikaException { TikaInputStream tis = TikaInputStream.get(stream, new TemporaryResources(), metadata); File tmpFile = tis.getFile(); diff --git a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/journal/TEIDOMParser.java b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/journal/TEIDOMParser.java index 46dca7dd7c..b45b57e76b 100644 --- a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/journal/TEIDOMParser.java +++ b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/journal/TEIDOMParser.java @@ -1,18 
+1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.parser.journal; @@ -22,7 +20,10 @@ import java.util.ArrayList; import java.util.List; import java.util.Objects; - +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.utils.XMLReaderUtils; import org.w3c.dom.Document; import org.w3c.dom.Element; import org.w3c.dom.NamedNodeMap; @@ -30,17 +31,11 @@ import org.w3c.dom.NodeList; import org.xml.sax.SAXException; -import org.apache.tika.exception.TikaException; -import org.apache.tika.metadata.Metadata; -import org.apache.tika.parser.ParseContext; -import org.apache.tika.utils.XMLReaderUtils; - public class TEIDOMParser { - public TEIDOMParser() { - } + public TEIDOMParser() {} - //returns first child with this name, null otherwise + // returns first child with this name, null otherwise private static Node getFirstChild(NodeList childNodes, String name) { for (int i = 0; i < childNodes.getLength(); i++) { Node n = childNodes.item(i); @@ -76,10 +71,9 @@ private static List getChildNodes(NodeList childNodes, String localName) { } public Metadata parse(String source, ParseContext parseContext) - throws TikaException, SAXException, IOException { + throws TikaException, SAXException, IOException { - Document root = XMLReaderUtils - .buildDOM(new StringReader(source), parseContext); + Document root = XMLReaderUtils.buildDOM(new StringReader(source), parseContext); Metadata metadata = new Metadata(); createGrobidMetadata(source, root.getDocumentElement(), metadata); @@ -111,8 +105,8 @@ private void createGrobidMetadata(String source, Element root, Metadata metadata private void addStaticMet(String source, Element obj, Metadata metadata) { metadata.add("Class", Metadata.class.getName()); - //no longer available after we got rid of json.org's and its .toJSONObject() -// metadata.add("TEIJSONSource", obj.toString()); + // no longer available after we got rid of json.org's and its .toJSONObject() + 
// metadata.add("TEIJSONSource", obj.toString()); metadata.add("TEIXMLSource", source); } @@ -508,9 +502,9 @@ public void setAffiliations(List affiliations) { */ @Override public String toString() { - return "Author [surName=" + surName + ", middleName=" + (middleName != null ? - middleName : - "") + ", firstName=" + firstName + ", affiliations=" + affiliations + "]"; + return "Author [surName=" + surName + ", middleName=" + + (middleName != null ? middleName : "") + ", firstName=" + firstName + + ", affiliations=" + affiliations + "]"; } } @@ -565,8 +559,8 @@ public boolean equals(Object obj) { return false; } Affiliation otherA = (Affiliation) obj; - return this.getAddress().equals(otherA.getAddress()) && - this.getOrgName().equals(otherA.getOrgName()); + return this.getAddress().equals(otherA.getAddress()) + && this.getOrgName().equals(otherA.getOrgName()); } @@ -708,8 +702,8 @@ public boolean equals(Object obj) { return false; } OrgTypeName otherOrgName = (OrgTypeName) obj; - return this.type.equals(otherOrgName.getType()) && - this.name.equals(otherOrgName.getName()); + return this.type.equals(otherOrgName.getType()) + && this.name.equals(otherOrgName.getName()); } @Override @@ -809,10 +803,10 @@ public boolean equals(Object obj) { return otherA.getRegion() == null; } - return this.settlment.equals(otherA.getSettlment()) && - this.country.equals(otherA.getCountry()) && - this.postCode.equals(otherA.getPostCode()) && - this.region.equals(otherA.getRegion()); + return this.settlment.equals(otherA.getSettlment()) + && this.country.equals(otherA.getCountry()) + && this.postCode.equals(otherA.getPostCode()) + && this.region.equals(otherA.getRegion()); } @Override @@ -910,8 +904,8 @@ public boolean equals(Object obj) { return this.key.equals(otherC.getKey()); } } else { - return this.key.equals(otherC.getKey()) && - this.content.equals(otherC.getContent()); + return this.key.equals(otherC.getKey()) + && this.content.equals(otherC.getContent()); } } } diff --git 
a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/ner/NERecogniser.java b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/ner/NERecogniser.java index 453cc2869b..f8b5444f56 100644 --- a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/ner/NERecogniser.java +++ b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/ner/NERecogniser.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. 
See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.ner; @@ -26,7 +24,7 @@ */ public interface NERecogniser { - //the common named entity classes + // the common named entity classes String LOCATION = "LOCATION"; String PERSON = "PERSON"; String ORGANIZATION = "ORGANIZATION"; diff --git a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/ner/NamedEntityParser.java b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/ner/NamedEntityParser.java index dae56b2502..3f37cb16a8 100644 --- a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/ner/NamedEntityParser.java +++ b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/ner/NamedEntityParser.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.ner; @@ -27,13 +25,7 @@ import java.util.List; import java.util.Map; import java.util.Set; - import org.apache.commons.io.IOUtils; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.Tika; import org.apache.tika.config.TikaConfig; import org.apache.tika.exception.TikaException; @@ -44,14 +36,22 @@ import org.apache.tika.parser.ner.opennlp.OpenNLPNERecogniser; import org.apache.tika.parser.ner.regex.RegexNERecogniser; import org.apache.tika.sax.XHTMLContentHandler; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; /** - * This implementation of {@link org.apache.tika.parser.Parser} extracts - * entity names from text content and adds it to the metadata. - *

    All the metadata keys will have a common suffix {@value #MD_KEY_PREFIX}

    - *

    The Named Entity recogniser implementation can be changed by setting the - * system property {@value #SYS_PROP_NER_IMPL} value to a name of class that - * implements {@link NERecogniser} contract

    + * This implementation of {@link org.apache.tika.parser.Parser} extracts entity names from text + * content and adds it to the metadata. + *

    + * All the metadata keys will have a common suffix {@value #MD_KEY_PREFIX} + *

    + *

    + * The Named Entity recogniser implementation can be changed by setting the system property + * {@value #SYS_PROP_NER_IMPL} value to a name of class that implements {@link NERecogniser} + * contract + *

    * * @see OpenNLPNERecogniser * @see NERecogniser @@ -61,7 +61,7 @@ public class NamedEntityParser implements Parser { public static final Set MEDIA_TYPES = new HashSet<>(); public static final String MD_KEY_PREFIX = "NER_"; public static final String DEFAULT_NER_IMPL = - OpenNLPNERecogniser.class.getName() + "," + RegexNERecogniser.class.getName(); + OpenNLPNERecogniser.class.getName() + "," + RegexNERecogniser.class.getName(); public static final String SYS_PROP_NER_IMPL = "ner.impl.class"; static { @@ -79,8 +79,8 @@ private synchronized void initialize(ParseContext context) { } initialized = true; - //TODO: read class name from context or config - //There can be multiple classes in the form of comma separated class names; + // TODO: read class name from context or config + // There can be multiple classes in the form of comma separated class names; String classNamesString = System.getProperty(SYS_PROP_NER_IMPL, DEFAULT_NER_IMPL); String[] classNames = classNamesString.split(","); this.nerChain = new ArrayList<>(classNames.length); @@ -88,8 +88,8 @@ private synchronized void initialize(ParseContext context) { className = className.trim(); LOG.info("going to load, instantiate and bind the instance of {}", className); try { - NERecogniser recogniser = - (NERecogniser) Class.forName(className).getDeclaredConstructor().newInstance(); + NERecogniser recogniser = (NERecogniser) Class.forName(className) + .getDeclaredConstructor().newInstance(); LOG.info("{} is available ? 
{}", className, recogniser.isAvailable()); if (recogniser.isAvailable()) { nerChain.add(recogniser); @@ -114,7 +114,7 @@ public Set getSupportedTypes(ParseContext parseContext) { } public void parse(InputStream inputStream, ContentHandler contentHandler, Metadata metadata, - ParseContext parseContext) throws IOException, SAXException, TikaException { + ParseContext parseContext) throws IOException, SAXException, TikaException { if (!initialized) { initialize(parseContext); @@ -123,10 +123,9 @@ public void parse(InputStream inputStream, ContentHandler contentHandler, Metada return; } - Reader reader = - MediaType.TEXT_PLAIN.toString().equals(metadata.get(Metadata.CONTENT_TYPE)) ? - new InputStreamReader(inputStream, StandardCharsets.UTF_8) : - secondaryParser.parse(inputStream); + Reader reader = MediaType.TEXT_PLAIN.toString().equals(metadata.get(Metadata.CONTENT_TYPE)) + ? new InputStreamReader(inputStream, StandardCharsets.UTF_8) + : secondaryParser.parse(inputStream); String text = IOUtils.toString(reader); IOUtils.closeQuietly(reader); @@ -149,11 +148,10 @@ public void parse(InputStream inputStream, ContentHandler contentHandler, Metada } /** - * writes the content to the given XHTML - * content handler + * writes the content to the given XHTML content handler * * @param content the content which needs to be written - * @param xhtml XHTML content handler + * @param xhtml XHTML content handler * @throws SAXException if the XHTML SAX events could not be handled */ private void extractOutput(String content, XHTMLContentHandler xhtml) throws SAXException { diff --git a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/ner/corenlp/CoreNLPNERecogniser.java b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/ner/corenlp/CoreNLPNERecogniser.java index 8855fda037..826a51c2dc 100644 --- 
a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/ner/corenlp/CoreNLPNERecogniser.java +++ b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/ner/corenlp/CoreNLPNERecogniser.java @@ -1,21 +1,21 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.parser.ner.corenlp; +import com.github.openjson.JSONException; +import com.github.openjson.JSONObject; import java.io.FileInputStream; import java.io.IOException; import java.lang.reflect.Field; @@ -27,33 +27,28 @@ import java.util.Map; import java.util.Properties; import java.util.Set; - -import com.github.openjson.JSONException; -import com.github.openjson.JSONObject; import org.apache.commons.io.IOUtils; +import org.apache.tika.parser.ner.NERecogniser; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.apache.tika.parser.ner.NERecogniser; - /** - * This class offers an implementation of {@link NERecogniser} based on - * CRF classifiers from Stanford CoreNLP. This NER requires additional setup, - * due to runtime binding to Stanford CoreNLP. - * See - * Tika NER Wiki for configuring this recogniser. + * This class offers an implementation of {@link NERecogniser} based on CRF classifiers from + * Stanford CoreNLP. This NER requires additional setup, due to runtime binding to Stanford CoreNLP. + * See Tika NER Wiki for configuring + * this recogniser. 
* * @see NERecogniser */ public class CoreNLPNERecogniser implements NERecogniser { - //default model paths + // default model paths public static final String NER_3CLASS_MODEL = - "edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz"; + "edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz"; public static final String NER_4CLASS_MODEL = - "edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz"; + "edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz"; public static final String NER_7CLASS_MODEL = - "edu/stanford/nlp/models/ner/english.muc.7class.distsim.crf.ser.gz"; + "edu/stanford/nlp/models/ner/english.muc.7class.distsim.crf.ser.gz"; /** * default Model path */ @@ -93,12 +88,12 @@ public CoreNLPNERecogniser(String modelPath) { try { Properties props = new Properties(); Class classifierClass = Class.forName(CLASSIFIER_CLASS_NAME); - Method loadMethod = - classifierClass.getMethod("getClassifier", String.class, Properties.class); + Method loadMethod = classifierClass.getMethod("getClassifier", String.class, + Properties.class); classifierInstance = loadMethod.invoke(classifierClass, modelPath, props); classifyMethod = classifierClass.getMethod("classifyToCharacterOffsets", String.class); - //these fields are for accessing result + // these fields are for accessing result Class tripleClass = Class.forName("edu.stanford.nlp.util.Triple"); this.firstField = tripleClass.getField("first"); this.secondField = tripleClass.getField("second"); @@ -129,7 +124,7 @@ public static void main(String[] args) throws IOException, JSONException { /** * @return {@code true} if model was available, valid and was able to initialise the classifier. - * returns {@code false} when this recogniser is not available for service. + * returns {@code false} when this recogniser is not available for service. 
*/ public boolean isAvailable() { return available; @@ -163,7 +158,7 @@ public Map> recognise(String text) { Integer start = (Integer) secondField.get(entry); Integer end = (Integer) thirdField.get(entry); String name = text.substring(start, end); - //Clean repeating spaces, replace line breaks and tabs with single space + // Clean repeating spaces, replace line breaks and tabs with single space name = name.trim().replaceAll("(\\s\\s+)|\n|\t", " "); if (!name.isEmpty()) { names.get(entityType).add(name); diff --git a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/ner/grobid/GrobidNERecogniser.java b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/ner/grobid/GrobidNERecogniser.java index 4e314f4322..5074f73386 100644 --- a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/ner/grobid/GrobidNERecogniser.java +++ b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/ner/grobid/GrobidNERecogniser.java @@ -1,39 +1,35 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.ner.grobid; +import jakarta.ws.rs.core.MediaType; +import jakarta.ws.rs.core.Response; import java.io.IOException; import java.util.HashMap; import java.util.HashSet; import java.util.Map; import java.util.Properties; import java.util.Set; - -import jakarta.ws.rs.core.MediaType; -import jakarta.ws.rs.core.Response; import org.apache.cxf.jaxrs.client.WebClient; +import org.apache.tika.parser.ner.NERecogniser; import org.json.simple.JSONArray; import org.json.simple.JSONObject; import org.json.simple.parser.JSONParser; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.apache.tika.parser.ner.NERecogniser; - public class GrobidNERecogniser implements NERecogniser { /* @@ -81,9 +77,7 @@ public GrobidNERecogniser() { private static boolean isServerAlive(String restHostUrlStr) { boolean available = false; try { - Response response = - WebClient.create(restHostUrlStr + ISALIVE_URL) - .get(); + Response response = WebClient.create(restHostUrlStr + ISALIVE_URL).get(); int responseCode = response.getStatus(); if (responseCode == 200) { available = true; @@ -98,30 +92,28 @@ private static boolean isServerAlive(String restHostUrlStr) { } /** - * Reads 
the GROBID REST URL from the properties file - * returns the GROBID REST URL + * Reads the GROBID REST URL from the properties file returns the GROBID REST URL */ private static String readRestUrl() throws IOException { Properties grobidProperties = new Properties(); - grobidProperties - .load(GrobidNERecogniser.class.getResourceAsStream("GrobidServer.properties")); + grobidProperties.load( + GrobidNERecogniser.class.getResourceAsStream("GrobidServer.properties")); return grobidProperties.getProperty("grobid.server.url"); } /** - * Reads the GROBID REST Endpoint from the properties file - * returns the GROBID REST Endpoint + * Reads the GROBID REST Endpoint from the properties file returns the GROBID REST Endpoint */ private static String readRestEndpoint() throws IOException { Properties grobidProperties = new Properties(); - grobidProperties - .load(GrobidNERecogniser.class.getResourceAsStream("GrobidServer.properties")); + grobidProperties.load( + GrobidNERecogniser.class.getResourceAsStream("GrobidServer.properties")); return grobidProperties.getProperty("grobid.endpoint.text"); } /** - * @return {@code true} if server endpoint is available. - * returns {@code false} if server endpoint is not avaliable for service. + * @return {@code true} if server endpoint is available. returns {@code false} if server + * endpoint is not avaliable for service. 
*/ public boolean isAvailable() { return available; @@ -184,8 +176,7 @@ public Map> recognise(String text) { try { String url = restHostUrlStr + readRestEndpoint(); - try (Response response = - WebClient.create(url).accept(MediaType.APPLICATION_JSON) + try (Response response = WebClient.create(url).accept(MediaType.APPLICATION_JSON) .post("text=" + text)) { int responseCode = response.getStatus(); @@ -198,13 +189,14 @@ public Map> recognise(String text) { StringBuilder measurementString = new StringBuilder(); StringBuilder normalizedMeasurementString = new StringBuilder(); - JSONObject quantity = (JSONObject) convertToJSONObject(measurement.toString()) - .get("quantity"); + JSONObject quantity = + (JSONObject) convertToJSONObject(measurement.toString()) + .get("quantity"); if (quantity != null) { if (quantity.containsKey("rawValue")) { String measurementNumber = - (String) convertToJSONObject(quantity.toString()) - .get("rawValue"); + (String) convertToJSONObject(quantity.toString()) + .get("rawValue"); measurementString.append(measurementNumber); measurementString.append(" "); measurementNumberSet.add(measurementNumber); @@ -212,32 +204,35 @@ public Map> recognise(String text) { if (quantity.containsKey("normalizedQuantity")) { String normalizedMeasurementNumber = - convertToJSONObject(quantity.toString()) - .get("normalizedQuantity").toString(); + convertToJSONObject(quantity.toString()) + .get("normalizedQuantity") + .toString(); normalizedMeasurementString.append(normalizedMeasurementNumber); normalizedMeasurementString.append(" "); } if (quantity.containsKey("type")) { String measurementType = - (String) convertToJSONObject(quantity.toString()).get("type"); + (String) convertToJSONObject(quantity.toString()) + .get("type"); measurementTypeSet.add(measurementType); } - JSONObject jsonObj = (JSONObject) convertToJSONObject(quantity.toString()); + JSONObject jsonObj = + (JSONObject) convertToJSONObject(quantity.toString()); if (jsonObj.containsKey("rawUnit")) { 
JSONObject rawUnit = (JSONObject) jsonObj.get("rawUnit"); - String unitName = - (String) convertToJSONObject(rawUnit.toString()).get("name"); + String unitName = (String) convertToJSONObject(rawUnit.toString()) + .get("name"); unitSet.add(unitName); measurementString.append(unitName); } if (jsonObj.containsKey("normalizedUnit")) { - JSONObject normalizedUnit = (JSONObject) jsonObj.get("normalizedUnit"); - String normalizedUnitName = - (String) convertToJSONObject(normalizedUnit.toString()) - .get("name"); + JSONObject normalizedUnit = + (JSONObject) jsonObj.get("normalizedUnit"); + String normalizedUnitName = (String) convertToJSONObject( + normalizedUnit.toString()).get("name"); normalizedMeasurementString.append(normalizedUnitName); } @@ -246,7 +241,8 @@ public Map> recognise(String text) { } if (!normalizedMeasurementString.toString().equals("")) { - normalizedMeasurementSet.add(normalizedMeasurementString.toString()); + normalizedMeasurementSet + .add(normalizedMeasurementString.toString()); } } diff --git a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/ner/mitie/MITIENERecogniser.java b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/ner/mitie/MITIENERecogniser.java index 8686476420..d604d49b15 100644 --- a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/ner/mitie/MITIENERecogniser.java +++ b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/ner/mitie/MITIENERecogniser.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.ner.mitie; @@ -24,19 +22,16 @@ import java.util.HashSet; import java.util.Map; import java.util.Set; - +import org.apache.tika.parser.ner.NERecogniser; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.apache.tika.parser.ner.NERecogniser; - /** - * This class offers an implementation of {@link NERecogniser} based on - * trained models using state-of-the-art information extraction tools. \This NER requires - * additional setup, - * due to runtime binding to MIT Information Extraction. - * See - * Tika MITIE Wiki for configuring this recogniser. + * This class offers an implementation of {@link NERecogniser} based on trained models using + * state-of-the-art information extraction tools. 
\This NER requires additional setup, due to + * runtime binding to MIT Information Extraction. See + * Tika MITIE Wiki for configuring this + * recogniser. * * @see NERecogniser */ @@ -52,7 +47,7 @@ public class MITIENERecogniser implements NERecogniser { }; private static final Logger LOG = LoggerFactory.getLogger(MITIENERecogniser.class); private static final String NamedEntityExtractor_Class = - "edu.mit.ll.mitie.NamedEntityExtractor"; + "edu.mit.ll.mitie.NamedEntityExtractor"; private boolean available = false; private Object extractorInstance; @@ -71,8 +66,8 @@ public MITIENERecogniser(String modelPath) { LOG.warn("{} does not exist", modelPath); } else { Class namedEntityExtractorClass = Class.forName(NamedEntityExtractor_Class); - extractorInstance = - namedEntityExtractorClass.getDeclaredConstructor(new Class[]{String.class}) + extractorInstance = namedEntityExtractorClass + .getDeclaredConstructor(new Class[] {String.class}) .newInstance(modelPath); this.available = true; } @@ -84,7 +79,7 @@ public MITIENERecogniser(String modelPath) { /** * @return {@code true} if model was available, valid and was able to initialise the classifier. - * returns {@code false} when this recogniser is not available for service. + * returns {@code false} when this recogniser is not available for service. 
*/ public boolean isAvailable() { return available; @@ -112,17 +107,17 @@ public Map> recognise(String text) { Class stringVectorClass = Class.forName("edu.mit.ll.mitie.StringVector"); Class entityMentionVectorClass = - Class.forName("edu.mit.ll.mitie.EntityMentionVector"); + Class.forName("edu.mit.ll.mitie.EntityMentionVector"); Class entityMentionClass = Class.forName("edu.mit.ll.mitie.EntityMention"); Object entityMentionObject = null; Class globalClass = Class.forName("edu.mit.ll.mitie.global"); Object stringVectorObject = extractorInstance.getClass().getMethod("getPossibleNerTags") - .invoke(extractorInstance); + .invoke(extractorInstance); long size = (Long) stringVectorClass.getMethod("size").invoke(stringVectorObject); ArrayList possibleTags = new ArrayList<>(); for (long i = 0; i < size; i++) { String t = (String) stringVectorClass.getMethod("get", Integer.TYPE) - .invoke(stringVectorObject, (int) i); + .invoke(stringVectorObject, (int) i); possibleTags.add(t); } Method tokenize = globalClass.getMethod("tokenize", String.class); @@ -132,18 +127,18 @@ public Map> recognise(String text) { size = (Long) stringVectorClass.getMethod("size").invoke(stringVectorObject); for (long i = 0; i < size; i++) { String t = (String) stringVectorClass.getMethod("get", Integer.TYPE) - .invoke(stringVectorObject, (int) i); + .invoke(stringVectorObject, (int) i); stringVector.add(t); } - Method extractEntities = - extractorInstance.getClass().getMethod("extractEntities", stringVectorClass); + Method extractEntities = extractorInstance.getClass().getMethod("extractEntities", + stringVectorClass); Object entities = extractEntities.invoke(extractorInstance, stringVectorObject); size = (Long) entityMentionVectorClass.getMethod("size").invoke(entities); for (long i = 0; i < size; i++) { entityMentionObject = entityMentionVectorClass.getMethod("get", Integer.TYPE) - .invoke(entities, (int) i); + .invoke(entities, (int) i); int tag_index = (Integer) 
entityMentionClass.getMethod("getTag") - .invoke(entityMentionObject); + .invoke(entityMentionObject); String tag = possibleTags.get(tag_index); Set x = new HashSet<>(); if (names.containsKey(tag)) { @@ -152,9 +147,9 @@ public Map> recognise(String text) { names.put(tag, x); } int start = (Integer) entityMentionClass.getMethod("getStart") - .invoke(entityMentionObject); + .invoke(entityMentionObject); int end = (Integer) entityMentionClass.getMethod("getEnd") - .invoke(entityMentionObject); + .invoke(entityMentionObject); StringBuilder match = new StringBuilder(); while (start < end) { match.append(stringVector.get(start)).append(" "); diff --git a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java index 15cdda86cc..0ee2c2ef54 100644 --- a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java +++ b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java @@ -1,21 +1,21 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. 
The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.ner.nltk; +import jakarta.ws.rs.core.MediaType; +import jakarta.ws.rs.core.Response; import java.io.IOException; import java.util.Collection; import java.util.HashMap; @@ -24,21 +24,16 @@ import java.util.Map; import java.util.Properties; import java.util.Set; - -import jakarta.ws.rs.core.MediaType; -import jakarta.ws.rs.core.Response; import org.apache.cxf.jaxrs.client.WebClient; +import org.apache.tika.parser.ner.NERecogniser; import org.json.simple.JSONObject; import org.json.simple.parser.JSONParser; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.apache.tika.parser.ner.NERecogniser; - /** - * This class offers an implementation of {@link NERecogniser} based on - * ne_chunk() module of NLTK. This NER requires additional setup, - * due to Http requests to an endpoint server that runs NLTK. + * This class offers an implementation of {@link NERecogniser} based on ne_chunk() module of NLTK. 
+ * This NER requires additional setup, due to Http requests to an endpoint server that runs NLTK. * See */ public class NLTKNERecogniser implements NERecogniser { @@ -95,8 +90,8 @@ private static String readRestUrl() throws IOException { } /** - * @return {@code true} if server endpoint is available. - * returns {@code false} if server endpoint is not avaliable for service. + * @return {@code true} if server endpoint is available. returns {@code false} if server + * endpoint is not avaliable for service. */ public boolean isAvailable() { return available; @@ -132,7 +127,7 @@ public Map> recognise(String text) { if (!key.equals("result")) { ENTITY_TYPES.add(key); entities.put(key.toUpperCase(Locale.ENGLISH), - new HashSet((Collection) j.get(key))); + new HashSet((Collection) j.get(key))); } } } diff --git a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/ner/opennlp/OpenNLPNERecogniser.java b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/ner/opennlp/OpenNLPNERecogniser.java index 7cba32642b..dce784989a 100644 --- a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/ner/opennlp/OpenNLPNERecogniser.java +++ b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/ner/opennlp/OpenNLPNERecogniser.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. 
See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.ner.opennlp; @@ -24,42 +22,48 @@ import java.util.List; import java.util.Map; import java.util.Set; - import org.apache.tika.parser.ner.NERecogniser; /** - * This implementation of {@link NERecogniser} chains an array of - * {@link OpenNLPNameFinder}s for which NER models are - * available in classpath. + * This implementation of {@link NERecogniser} chains an array of {@link OpenNLPNameFinder}s for + * which NER models are available in classpath. *

    * The following models are scanned during initialization via class loader.: * * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * *
    Entity TypePath
    {@value PERSON} {@value PERSON_FILE}
    {@value LOCATION}{@value LOCATION_FILE}
    {@value ORGANIZATION}{@value ORGANIZATION_FILE}
    {@value TIME}{@value TIME_FILE}
    {@value DATE}{@value DATE_FILE}
    {@value PERCENT}{@value PERCENT_FILE}
    {@value MONEY}{@value MONEY_FILE}
    Entity TypePath
    {@value PERSON}{@value PERSON_FILE}
    {@value LOCATION}{@value LOCATION_FILE}
    {@value ORGANIZATION}{@value ORGANIZATION_FILE}
    {@value TIME}{@value TIME_FILE}
    {@value DATE}{@value DATE_FILE}
    {@value PERCENT}{@value PERCENT_FILE}
    {@value MONEY}{@value MONEY_FILE}
    * * @see org.apache.tika.parser.ner.NamedEntityParser#DEFAULT_NER_IMPL @@ -67,7 +71,7 @@ public class OpenNLPNERecogniser implements NERecogniser { public static final String MODELS_DIR = - OpenNLPNERecogniser.class.getPackage().getName().replace(".", "/"); + OpenNLPNERecogniser.class.getPackage().getName().replace(".", "/"); public static final String PERSON_FILE = "ner-person.bin"; public static final String LOCATION_FILE = "ner-location.bin"; public static final String ORGANIZATION_FILE = "ner-organization.bin"; @@ -77,7 +81,7 @@ public class OpenNLPNERecogniser implements NERecogniser { public static final String MONEY_FILE = "ner-money.bin"; - //Default (English) Models for the common 7 classes of named types + // Default (English) Models for the common 7 classes of named types public static final String NER_PERSON_MODEL = MODELS_DIR + "/" + PERSON_FILE; public static final String NER_LOCATION_MODEL = MODELS_DIR + "/" + LOCATION_FILE; public static final String NER_ORGANIZATION_MODEL = MODELS_DIR + "/" + ORGANIZATION_FILE; @@ -96,7 +100,7 @@ public class OpenNLPNERecogniser implements NERecogniser { put(PERCENT, NER_PERCENT_MODEL); put(MONEY, NER_MONEY_MODEL); } - }; + }; private Set entityTypes; private List nameFinders; @@ -112,8 +116,8 @@ public OpenNLPNERecogniser() { /** * Creates a chain of Named Entity recognisers * - * @param models map of entityType -> model path - * NOTE: the model path should be known to class loader. + * @param models map of entityType -> model path NOTE: the model path should be known to + * class loader. 
*/ public OpenNLPNERecogniser(Map models) { this.nameFinders = new ArrayList<>(); @@ -126,7 +130,7 @@ public OpenNLPNERecogniser(Map models) { } } this.entityTypes = Collections.unmodifiableSet(this.entityTypes); - this.available = nameFinders.size() > 0; //at least one finder is present + this.available = nameFinders.size() > 0; // at least one finder is present } @Override diff --git a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/ner/opennlp/OpenNLPNameFinder.java b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/ner/opennlp/OpenNLPNameFinder.java index cc26e3b61a..845cdbd0a8 100644 --- a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/ner/opennlp/OpenNLPNameFinder.java +++ b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/ner/opennlp/OpenNLPNameFinder.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.ner.opennlp; @@ -25,20 +23,18 @@ import java.util.HashSet; import java.util.Map; import java.util.Set; - import opennlp.tools.namefind.NameFinderME; import opennlp.tools.namefind.TokenNameFinderModel; import opennlp.tools.util.Span; import org.apache.commons.io.IOUtils; +import org.apache.tika.parser.ner.NERecogniser; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.apache.tika.parser.ner.NERecogniser; - /** - * An implementation of {@link NERecogniser} that finds names in text using Open NLP Model. - * This implementation works with only one entity type. For chain this name finder instances, - * see {@link OpenNLPNERecogniser} + * An implementation of {@link NERecogniser} that finds names in text using Open NLP Model. This + * implementation works with only one entity type. 
For chain this name finder instances, see + * {@link OpenNLPNERecogniser} */ public class OpenNLPNameFinder implements NERecogniser { @@ -51,7 +47,7 @@ public class OpenNLPNameFinder implements NERecogniser { /** * Creates OpenNLP name finder * - * @param nameType the entity type recognised by the given NER model + * @param nameType the entity type recognised by the given NER model * @param nerModelPath path to ner model */ public OpenNLPNameFinder(String nameType, String nerModelPath) { @@ -75,8 +71,8 @@ public OpenNLPNameFinder(String nameType, String nerModelPath) { } public static String[] tokenize(String text) { - //NOTE: replace this with a NLP tokenizer tool - //clean + split + // NOTE: replace this with a NLP tokenizer tool + // clean + split return text.trim().replaceAll("(\\s\\s+)", " ").split("\\s"); } diff --git a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/ner/regex/RegexNERecogniser.java b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/ner/regex/RegexNERecogniser.java index 89568ee0f7..08422e4505 100644 --- a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/ner/regex/RegexNERecogniser.java +++ b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/ner/regex/RegexNERecogniser.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. 
See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.ner.regex; @@ -26,30 +24,30 @@ import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; - import org.apache.commons.io.IOUtils; +import org.apache.tika.parser.ner.NERecogniser; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.apache.tika.parser.ner.NERecogniser; - /** - * This class offers an implementation of {@link NERecogniser} based on - * Regular Expressions. + * This class offers an implementation of {@link NERecogniser} based on Regular Expressions. *

    - * The default configuration file {@value NER_REGEX_FILE} is used when no - * argument constructor is used to instantiate this class. The regex file is - * loaded via {@link Class#getResourceAsStream(String)}, so the file should be - * placed in the same package path as of this class. + * The default configuration file {@value NER_REGEX_FILE} is used when no argument constructor is + * used to instantiate this class. The regex file is loaded via + * {@link Class#getResourceAsStream(String)}, so the file should be placed in the same package path + * as of this class. *

    * The format of regex configuration as follows: + * *
      * ENTITY_TYPE1=REGEX1
      * ENTITY_TYPE2=REGEX2
      * 
    * * For example, to extract week day from text: - *
    WEEK_DAY=(?i)((sun)|(mon)|(tues)|(thurs)|(fri)|((sat)(ur)?))(day)?
    + * 
    + * 
    + * WEEK_DAY=(?i)((sun)|(mon)|(tues)|(thurs)|(fri)|((sat)(ur)?))(day)?
      * 
    * * @since Nov. 7, 2015 @@ -74,13 +72,13 @@ public RegexNERecogniser(InputStream stream) { IOUtils.closeQuietly(stream); for (String line : lines) { line = line.trim(); - if (line.isEmpty() || line.startsWith("#")) { //empty or comment - continue; //skip + if (line.isEmpty() || line.startsWith("#")) { // empty or comment + continue; // skip } int delim = line.indexOf('='); - if (delim < 0) { //delim not found - //skip + if (delim < 0) { // delim not found + // skip LOG.error("Skipped : Invalid config : {} ", line); continue; } @@ -115,7 +113,7 @@ public Set getEntityTypes() { /** * finds matching sub groups in text * - * @param text text containing interesting sub strings + * @param text text containing interesting sub strings * @param pattern pattern to find sub strings * @return set of sub strings if any found, or null if none found */ diff --git a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/sentiment/SentimentAnalysisParser.java b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/sentiment/SentimentAnalysisParser.java index c6fd5597cf..dd06deddb4 100644 --- a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/sentiment/SentimentAnalysisParser.java +++ b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/sentiment/SentimentAnalysisParser.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. 
See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.sentiment; @@ -24,15 +22,9 @@ import java.util.Collections; import java.util.Map; import java.util.Set; - import opennlp.tools.sentiment.SentimentME; import opennlp.tools.sentiment.SentimentModel; import org.apache.commons.io.IOUtils; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.config.Field; import org.apache.tika.config.Initializable; import org.apache.tika.config.InitializableProblemHandler; @@ -43,17 +35,21 @@ import org.apache.tika.mime.MediaType; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; /** - * This parser classifies documents based on the sentiment of document. 
- * The classifier is powered by Apache OpenNLP's Maximum Entropy Classifier + * This parser classifies documents based on the sentiment of document. The classifier is powered by + * Apache OpenNLP's Maximum Entropy Classifier */ public class SentimentAnalysisParser implements Parser, Initializable { public static final String DEF_MODEL = - "https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin"; + "https://raw.githubusercontent.com/USCDataScience/SentimentAnalysisParser/master/sentiment-models/src/main/resources/edu/usc/irds/sentiment/en-netflix-sentiment.bin"; private static final Set SUPPORTED_TYPES = - Collections.singleton(MediaType.application("sentiment")); + Collections.singleton(MediaType.application("sentiment")); private static final Logger LOG = LoggerFactory.getLogger(SentimentAnalysisParser.class); private SentimentME classifier; @@ -63,15 +59,14 @@ public class SentimentAnalysisParser implements Parser, Initializable { *
    * The path could be one of the following: *
      - *
    • a HTTP or HTTPS URL (Not recommended for production use since no caching is - * implemented)
    • + *
    • a HTTP or HTTPS URL (Not recommended for production use since no caching is implemented) + *
    • *
    • an absolute or relative path on local file system (recommended for production use in * standalone mode)
    • *
    • a relative path known to class loader (Especially useful in distributed environments, * recommended for advanced users
    • *
    - * Note: on conflict: the model from local file system gets the priority - * over classpath + * Note: on conflict: the model from local file system gets the priority over classpath */ @Field private String modelPath = DEF_MODEL; @@ -110,8 +105,8 @@ public void initialize(Map params) throws TikaConfigException { @Override public void checkInitialization(InitializableProblemHandler handler) - throws TikaConfigException { - //TODO -- what do we want to check? + throws TikaConfigException { + // TODO -- what do we want to check? } /** @@ -128,14 +123,14 @@ public Set getSupportedTypes(ParseContext context) { /** * Performs the parse * - * @param stream the input - * @param handler the content handler + * @param stream the input + * @param handler the content handler * @param metadata the metadata passed - * @param context the context for the parser + * @param context the context for the parser */ @Override public void parse(InputStream stream, ContentHandler handler, Metadata metadata, - ParseContext context) throws IOException, SAXException, TikaException { + ParseContext context) throws IOException, SAXException, TikaException { if (classifier == null) { LOG.warn(getClass().getSimpleName() + " is not configured properly."); return; diff --git a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/test/java/org/apache/tika/parser/geo/topic/GeoParserTest.java b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/test/java/org/apache/tika/parser/geo/topic/GeoParserTest.java index 308dfdceb0..aa70396384 100644 --- a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/test/java/org/apache/tika/parser/geo/topic/GeoParserTest.java +++ b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/test/java/org/apache/tika/parser/geo/topic/GeoParserTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.parser.geo.topic; @@ -26,10 +24,6 @@ import java.io.IOException; import java.io.InputStream; import java.net.URI; - -import org.junit.jupiter.api.Test; -import org.xml.sax.SAXException; - import org.apache.tika.TikaTest; import org.apache.tika.config.TikaConfig; import org.apache.tika.exception.TikaException; @@ -37,29 +31,30 @@ import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.sax.BodyContentHandler; +import org.junit.jupiter.api.Test; +import org.xml.sax.SAXException; public class GeoParserTest extends TikaTest { private Parser geoparser = new GeoParser(); @Test public void testFunctions() throws IOException, SAXException, TikaException { - String text = - "The millennial-scale cooling trend that followed the HTM coincides with " + - "the decrease in China " + - "summer insolation driven by slow changes in Earth's orbit. Despite " + - "the nearly linear forcing, the transition from the HTM to " + - "the Little Ice Age (1500-1900 AD) was neither gradual nor uniform. " + - "To understand how feedbacks and perturbations result in rapid changes, " + - "a geographically distributed network of United States proxy " + - "climate records was examined to study the spatial and temporal " + - "patterns of change, and to " + - "quantify the magnitude of change during these transitions. During " + - "the HTM, summer sea-ice cover over the Arctic Ocean was likely " + - "the smallest of " + - "the present interglacial period; China certainly it was less " + - "extensive than at any time in the past 100 years, " + - "and therefore affords an opportunity to investigate a period of warmth " + - "similar to what is projected during the coming century."; + String text = "The millennial-scale cooling trend that followed the HTM coincides with " + + "the decrease in China " + + "summer insolation driven by slow changes in Earth's orbit. 
Despite " + + "the nearly linear forcing, the transition from the HTM to " + + "the Little Ice Age (1500-1900 AD) was neither gradual nor uniform. " + + "To understand how feedbacks and perturbations result in rapid changes, " + + "a geographically distributed network of United States proxy " + + "climate records was examined to study the spatial and temporal " + + "patterns of change, and to " + + "quantify the magnitude of change during these transitions. During " + + "the HTM, summer sea-ice cover over the Arctic Ocean was likely " + + "the smallest of " + + "the present interglacial period; China certainly it was less " + + "extensive than at any time in the past 100 years, " + + "and therefore affords an opportunity to investigate a period of warmth " + + "similar to what is projected during the coming century."; Metadata metadata = new Metadata(); ParseContext context = new ParseContext(); @@ -95,7 +90,7 @@ public void testNulls() throws IOException, SAXException, TikaException { GeoParserConfig config = new GeoParserConfig(); context.set(GeoParserConfig.class, config); geoparser.parse(new ByteArrayInputStream(text.getBytes(UTF_8)), new BodyContentHandler(), - metadata, context); + metadata, context); assertNull(metadata.get("Geographic_NAME")); assertNull(metadata.get("Geographic_LONGITUDE")); assertNull(metadata.get("Geographic_LATITUDE")); @@ -104,13 +99,13 @@ public void testNulls() throws IOException, SAXException, TikaException { @Test public void testConfig() throws Exception { - TikaConfig config = new TikaConfig( - getResourceAsStream("/org/apache/tika/config/TIKA-3078-geo.topic.GeoParser.xml")); + TikaConfig config = new TikaConfig(getResourceAsStream( + "/org/apache/tika/config/TIKA-3078-geo.topic.GeoParser.xml")); Parser p = config.getParser(); GeoParser geoParser = (GeoParser) findParser(p, GeoParser.class); assertNotNull(geoParser); assertEquals("http://localhost/gazetteerRestEndpoint", - geoParser.getGazetteerRestEndpoint()); + 
geoParser.getGazetteerRestEndpoint()); assertEquals(new URI("file:/ner/model/url").toURL(), geoParser.getNerModelUrl()); } } diff --git a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/test/java/org/apache/tika/parser/journal/JournalParserTest.java b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/test/java/org/apache/tika/parser/journal/JournalParserTest.java index aabe657554..d9eed0d72d 100644 --- a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/test/java/org/apache/tika/parser/journal/JournalParserTest.java +++ b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/test/java/org/apache/tika/parser/journal/JournalParserTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
+ * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.journal; @@ -23,13 +21,11 @@ import static org.junit.jupiter.api.Assumptions.assumeTrue; import java.io.InputStream; - -import org.junit.jupiter.api.Test; -import org.xml.sax.ContentHandler; - import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; import org.apache.tika.sax.BodyContentHandler; +import org.junit.jupiter.api.Test; +import org.xml.sax.ContentHandler; public class JournalParserTest { diff --git a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/test/java/org/apache/tika/parser/journal/TEITest.java b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/test/java/org/apache/tika/parser/journal/TEITest.java index e79a13eb0a..83c4720c31 100644 --- a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/test/java/org/apache/tika/parser/journal/TEITest.java +++ b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/test/java/org/apache/tika/parser/journal/TEITest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. 
The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.journal; @@ -23,13 +21,11 @@ import java.io.ByteArrayOutputStream; import java.io.InputStream; import java.nio.charset.StandardCharsets; - import org.apache.commons.io.IOUtils; -import org.junit.jupiter.api.Test; - import org.apache.tika.TikaTest; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; +import org.junit.jupiter.api.Test; public class TEITest extends TikaTest { @@ -43,33 +39,32 @@ public void testBasic() throws Exception { } String xml = new String(bos.toByteArray(), StandardCharsets.UTF_8); Metadata metadata = teiParser.parse(xml, new ParseContext()); - assertEquals("Montbonnot Saint-Martin, Montbonnot Saint-Martin, " + - "Montbonnot Saint-Martin, " + - "Montbonnot Saint-Martin, null 38330, 38330, 38330, 38330 " + - "France, France, France, France ", metadata.get("Address").replaceAll("\\s+", " ")); - String[] keywords = new String[]{ - "F22 [Analysis of Algorithms and Problem Complexity]: Nonnumerical Algorithms " + - "and Problems\u2014Sequencing", - 
"and scheduling; D41 [Operating Systems]: Process management\u2014Scheduling, " + - "Concurrency", - "Keywords", "Parallel Computing, Algorithms, Scheduling, Parallel Tasks,", - "Moldable Tasks, Bi-criteria"}; + assertEquals("Montbonnot Saint-Martin, Montbonnot Saint-Martin, " + + "Montbonnot Saint-Martin, " + + "Montbonnot Saint-Martin, null 38330, 38330, 38330, 38330 " + + "France, France, France, France ", + metadata.get("Address").replaceAll("\\s+", " ")); + String[] keywords = new String[] { + "F22 [Analysis of Algorithms and Problem Complexity]: Nonnumerical Algorithms " + + "and Problems\u2014Sequencing", + "and scheduling; D41 [Operating Systems]: Process management\u2014Scheduling, " + + "Concurrency", + "Keywords", "Parallel Computing, Algorithms, Scheduling, Parallel Tasks,", + "Moldable Tasks, Bi-criteria"}; assertArrayEquals(keywords, metadata.getValues("Keyword")); - assertEquals( - "Pierre-François Dutot 1 Lionel Eyraud 1 Grégory Gr´ 1 Grégory " + - "Mouní 1 Denis Trystram 1 ", - metadata.get("Authors")); + assertEquals("Pierre-François Dutot 1 Lionel Eyraud 1 Grégory Gr´ 1 Grégory " + + "Mouní 1 Denis Trystram 1 ", metadata.get("Authors")); assertEquals("Bi-criteria Algorithm for Scheduling Jobs on Cluster Platforms *", - metadata.get("Title")); + metadata.get("Title")); assertEquals("1 ID-IMAG ID-IMAG ID-IMAG ID-IMAG", metadata.get("Affiliation")); - assertEquals("[Affiliation {orgName=ID-IMAG ID-IMAG ID-IMAG ID-IMAG , " + - "address=Montbonnot Saint-Martin, Montbonnot Saint-Martin, " + - "Montbonnot Saint-Martin, Montbonnot Saint-Martin, " + - "null 38330, 38330, 38330, 38330 France, France, France, France}" + - "[Affiliation {orgName=ID-IMAG ID-IMAG ID-IMAG ID-IMAG , " + - "address=Montbonnot Saint-Martin, Montbonnot Saint-Martin, " + - "Montbonnot Saint-Martin, Montbonnot Saint-Martin, " + - "null 38330, 38330, 38330, 38330 France, France, France, France}]", - metadata.get("FullAffiliations")); + assertEquals("[Affiliation {orgName=ID-IMAG 
ID-IMAG ID-IMAG ID-IMAG , " + + "address=Montbonnot Saint-Martin, Montbonnot Saint-Martin, " + + "Montbonnot Saint-Martin, Montbonnot Saint-Martin, " + + "null 38330, 38330, 38330, 38330 France, France, France, France}" + + "[Affiliation {orgName=ID-IMAG ID-IMAG ID-IMAG ID-IMAG , " + + "address=Montbonnot Saint-Martin, Montbonnot Saint-Martin, " + + "Montbonnot Saint-Martin, Montbonnot Saint-Martin, " + + "null 38330, 38330, 38330, 38330 France, France, France, France}]", + metadata.get("FullAffiliations")); } } diff --git a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/test/java/org/apache/tika/parser/ner/NamedEntityParserTest.java b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/test/java/org/apache/tika/parser/ner/NamedEntityParserTest.java index 12ef7b764d..43c78782f1 100644 --- a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/test/java/org/apache/tika/parser/ner/NamedEntityParserTest.java +++ b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/test/java/org/apache/tika/parser/ner/NamedEntityParserTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.ner; @@ -24,10 +22,7 @@ import java.nio.charset.StandardCharsets; import java.util.Arrays; import java.util.HashSet; - import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream; -import org.junit.jupiter.api.Test; - import org.apache.tika.Tika; import org.apache.tika.TikaTest; import org.apache.tika.config.TikaConfig; @@ -36,6 +31,7 @@ import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.ner.opennlp.OpenNLPNERecogniser; import org.apache.tika.parser.ner.regex.RegexNERecogniser; +import org.junit.jupiter.api.Test; /** * Test case for {@link NamedEntityParser} @@ -47,18 +43,18 @@ public class NamedEntityParserTest extends TikaTest { @Test public void testParse() throws Exception { - //test config is added to resources directory + // test config is added to resources directory try (InputStream is = getResourceAsStream(CONFIG_FILE)) { TikaConfig config = new TikaConfig(is); Tika tika = new Tika(config); - String text = "I am student at University of Southern California (USC)," + - " located in Los Angeles . USC's football team is called by name Trojans." + - " Mr. 
John McKay was a head coach of the team from 1960 - 1975"; + String text = "I am student at University of Southern California (USC)," + + " located in Los Angeles . USC's football team is called by name Trojans." + + " Mr. John McKay was a head coach of the team from 1960 - 1975"; Metadata md = new Metadata(); tika.parse(new ByteArrayInputStream(text.getBytes(StandardCharsets.UTF_8)), md); HashSet set = new HashSet<>( - Arrays.asList(md.getValues(TikaCoreProperties.TIKA_PARSED_BY))); + Arrays.asList(md.getValues(TikaCoreProperties.TIKA_PARSED_BY))); assumeTrue(set.contains(NamedEntityParser.class.getName())); set.clear(); @@ -81,16 +77,16 @@ public void testParse() throws Exception { @Test public void testNerChain() throws Exception { - String classNames = - OpenNLPNERecogniser.class.getName() + "," + RegexNERecogniser.class.getName(); + String classNames = OpenNLPNERecogniser.class.getName() + "," + + RegexNERecogniser.class.getName(); System.setProperty(NamedEntityParser.SYS_PROP_NER_IMPL, classNames); try (InputStream is = getResourceAsStream(CONFIG_FILE)) { TikaConfig config = new TikaConfig(is); - String text = "University of Southern California (USC), is located in Los Angeles ." + - " Campus is busy from monday to saturday"; - Metadata md = getXML( - UnsynchronizedByteArrayInputStream.builder().setByteArray(text.getBytes(StandardCharsets.UTF_8)).get(), - new AutoDetectParser(config), new Metadata()).metadata; + String text = "University of Southern California (USC), is located in Los Angeles ." 
+ + " Campus is busy from monday to saturday"; + Metadata md = getXML(UnsynchronizedByteArrayInputStream.builder() + .setByteArray(text.getBytes(StandardCharsets.UTF_8)).get(), + new AutoDetectParser(config), new Metadata()).metadata; HashSet keys = new HashSet<>(Arrays.asList(md.names())); assertTrue(keys.contains("NER_WEEK_DAY")); assumeTrue(keys.contains("NER_LOCATION")); diff --git a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java index ae9ee396eb..8d8ea872a8 100644 --- a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java +++ b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright owlocationNameEntitieship. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright owlocationNameEntitieship. The ASF licenses this file to You under the Apache License, + * Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You + * may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.ner.nltk; @@ -23,13 +21,11 @@ import java.util.Arrays; import java.util.HashSet; import java.util.Set; - -import org.junit.jupiter.api.Test; - import org.apache.tika.Tika; import org.apache.tika.config.TikaConfig; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ner.NamedEntityParser; +import org.junit.jupiter.api.Test; public class NLTKNERecogniserTest { @@ -38,8 +34,8 @@ public void testGetEntityTypes() throws Exception { String text = "America is a big country."; System.setProperty(NamedEntityParser.SYS_PROP_NER_IMPL, NLTKNERecogniser.class.getName()); - Tika tika = new Tika( - new TikaConfig(NamedEntityParser.class.getResourceAsStream("tika-config.xml"))); + Tika tika = new Tika(new TikaConfig( + NamedEntityParser.class.getResourceAsStream("tika-config.xml"))); Metadata md = new Metadata(); tika.parse(new ByteArrayInputStream(text.getBytes(StandardCharsets.UTF_8)), md); diff --git a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/test/java/org/apache/tika/parser/ner/regex/RegexNERecogniserTest.java 
b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/test/java/org/apache/tika/parser/ner/regex/RegexNERecogniserTest.java index e6610de3ea..f0cffd3613 100644 --- a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/test/java/org/apache/tika/parser/ner/regex/RegexNERecogniserTest.java +++ b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/test/java/org/apache/tika/parser/ner/regex/RegexNERecogniserTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright owlocationNameEntitieship. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright owlocationNameEntitieship. The ASF licenses this file to You under the Apache License, + * Version 2.0 (the "License"); you may not use this file except in compliance with the License. You + * may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. 
See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.ner.regex; @@ -23,13 +21,11 @@ import java.util.Arrays; import java.util.HashSet; import java.util.Set; - -import org.junit.jupiter.api.Test; - import org.apache.tika.Tika; import org.apache.tika.config.TikaConfig; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ner.NamedEntityParser; +import org.junit.jupiter.api.Test; public class RegexNERecogniserTest { @@ -39,8 +35,8 @@ public void testGetEntityTypes() throws Exception { String text = "Hey, Lets meet on this Sunday or MONDAY because i am busy on Saturday"; System.setProperty(NamedEntityParser.SYS_PROP_NER_IMPL, RegexNERecogniser.class.getName()); - Tika tika = new Tika( - new TikaConfig(NamedEntityParser.class.getResourceAsStream("tika-config.xml"))); + Tika tika = new Tika(new TikaConfig( + NamedEntityParser.class.getResourceAsStream("tika-config.xml"))); Metadata md = new Metadata(); tika.parse(new ByteArrayInputStream(text.getBytes(StandardCharsets.UTF_8)), md); @@ -48,7 +44,7 @@ public void testGetEntityTypes() throws Exception { assertTrue(days.contains("Sunday")); assertTrue(days.contains("MONDAY")); assertTrue(days.contains("Saturday")); - assertTrue(days.size() == 3); //and nothing else + assertTrue(days.size() == 3); // and nothing else } diff --git a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/test/java/org/apache/tika/parser/sentiment/SentimentAnalysisParserTest.java b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/test/java/org/apache/tika/parser/sentiment/SentimentAnalysisParserTest.java index 0e1ba4bf9a..c10a3c1706 100644 --- a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/test/java/org/apache/tika/parser/sentiment/SentimentAnalysisParserTest.java +++ b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/test/java/org/apache/tika/parser/sentiment/SentimentAnalysisParserTest.java @@ -1,18 +1,16 @@ /* - * 
Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.parser.sentiment; @@ -23,15 +21,13 @@ import java.io.IOException; import java.io.InputStream; import java.nio.charset.StandardCharsets; - -import org.junit.jupiter.api.Test; -import org.xml.sax.SAXException; - import org.apache.tika.Tika; import org.apache.tika.config.TikaConfig; import org.apache.tika.exception.TikaConfigException; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; +import org.junit.jupiter.api.Test; +import org.xml.sax.SAXException; /** * Test case for {@link SentimentAnalysisParser} @@ -46,10 +42,10 @@ public void endToEndTest() throws Exception { return; } - String text = "What a wonderful thought it is that" + - " some of the best days of our lives haven't happened yet."; + String text = "What a wonderful thought it is that" + + " some of the best days of our lives haven't happened yet."; ByteArrayInputStream stream = - new ByteArrayInputStream(text.getBytes(StandardCharsets.UTF_8)); + new ByteArrayInputStream(text.getBytes(StandardCharsets.UTF_8)); Metadata md = new Metadata(); tika.parse(stream, md); String sentiment = md.get("Sentiment"); @@ -66,7 +62,7 @@ public void testCategorical() throws Exception { } String text = "Whatever, I need some cooling off time!"; ByteArrayInputStream stream = - new ByteArrayInputStream(text.getBytes(StandardCharsets.UTF_8)); + new ByteArrayInputStream(text.getBytes(StandardCharsets.UTF_8)); Metadata md = new Metadata(); tika.parse(stream, md); String sentiment = md.get("Sentiment"); @@ -81,7 +77,7 @@ private Tika getTika(String configXml) throws TikaException, SAXException, IOExc TikaConfig config = new TikaConfig(confStream); return new Tika(config); } catch (TikaConfigException e) { - //if can't connect to pull sentiment model...ignore test + // if can't connect to pull sentiment model...ignore test if (e.getCause() instanceof IOException) { return null; } diff --git 
a/tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/main/java/org/apache/tika/parser/transcribe/aws/AmazonTranscribe.java b/tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/main/java/org/apache/tika/parser/transcribe/aws/AmazonTranscribe.java index 2d5215cd6c..42f8c6992e 100644 --- a/tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/main/java/org/apache/tika/parser/transcribe/aws/AmazonTranscribe.java +++ b/tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/main/java/org/apache/tika/parser/transcribe/aws/AmazonTranscribe.java @@ -1,22 +1,22 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
+ * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.transcribe.aws; +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; import java.io.IOException; import java.io.InputStream; import java.util.Arrays; @@ -26,9 +26,17 @@ import java.util.Set; import java.util.UUID; import java.util.concurrent.CompletableFuture; - -import com.fasterxml.jackson.databind.JsonNode; -import com.fasterxml.jackson.databind.ObjectMapper; +import org.apache.tika.config.Field; +import org.apache.tika.config.Initializable; +import org.apache.tika.config.InitializableProblemHandler; +import org.apache.tika.config.Param; +import org.apache.tika.exception.TikaConfigException; +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.apache.tika.sax.XHTMLContentHandler; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.xml.sax.ContentHandler; @@ -63,27 +71,14 @@ import software.amazon.awssdk.services.transcribe.model.TranscriptionJob; import software.amazon.awssdk.services.transcribe.model.TranscriptionJobStatus; -import org.apache.tika.config.Field; -import org.apache.tika.config.Initializable; -import org.apache.tika.config.InitializableProblemHandler; -import org.apache.tika.config.Param; -import org.apache.tika.exception.TikaConfigException; -import org.apache.tika.exception.TikaException; -import org.apache.tika.metadata.Metadata; -import org.apache.tika.mime.MediaType; -import org.apache.tika.parser.ParseContext; -import org.apache.tika.parser.Parser; -import 
org.apache.tika.sax.XHTMLContentHandler; - /** - *
    Amazon Transcribe - * implementation. See Javadoc for configuration options. + * Amazon Transcribe implementation. See Javadoc + * for configuration options. *

    * Silently becomes unavailable when client keys are unavailable. * - * N.B. it is not necessary to create the bucket before hand. - * This implementation will automatically create the bucket if one - * does not already exist, per the name defined above. + * N.B. it is not necessary to create the bucket before hand. This implementation will + * automatically create the bucket if one does not already exist, per the name defined above. * * @since Tika 2.0 */ @@ -99,12 +94,13 @@ public class AmazonTranscribe implements Parser, Initializable { private String clientSecret; // Keys used for the API calls. private StaticCredentialsProvider credsProvider; - //https://docs.aws.amazon.com/transcribe/latest/dg/input.html - protected static final Set SUPPORTED_TYPES = Collections.unmodifiableSet( - new HashSet<>(Arrays.asList(MediaType.audio("x-flac"), MediaType.audio("mp3"), - MediaType.audio("mpeg"), MediaType.video("ogg"), MediaType.audio("vnd.wave"), - MediaType.audio("mp4"), MediaType.video("mp4"), MediaType.application("mp4"), - MediaType.video("quicktime")))); + // https://docs.aws.amazon.com/transcribe/latest/dg/input.html + protected static final Set SUPPORTED_TYPES = Collections + .unmodifiableSet(new HashSet<>(Arrays.asList(MediaType.audio("x-flac"), + MediaType.audio("mp3"), MediaType.audio("mpeg"), + MediaType.video("ogg"), MediaType.audio("vnd.wave"), + MediaType.audio("mp4"), MediaType.video("mp4"), + MediaType.application("mp4"), MediaType.video("quicktime")))); @Override @@ -118,19 +114,19 @@ public Set getSupportedTypes(ParseContext context) { /** * Starts AWS Transcribe Job with language specification. * - * @param stream the source input stream. - * @param handler handler to use + * @param stream the source input stream. 
+ * @param handler handler to use * @param metadata - * @param context -- set the {@link LanguageCode} in the ParseContext if known + * @param context -- set the {@link LanguageCode} in the ParseContext if known * @throws TikaException When there is an error transcribing. - * @throws IOException If an I/O exception of some sort has occurred. + * @throws IOException If an I/O exception of some sort has occurred. * @see AWS - * Language Code + * "https://docs.aws.amazon.com/AWSJavaSDK/latest/javadoc/com/amazonaws/services/transcribe/model/LanguageCode.html">AWS + * Language Code */ @Override public void parse(InputStream stream, ContentHandler handler, Metadata metadata, - ParseContext context) throws IOException, SAXException, TikaException { + ParseContext context) throws IOException, SAXException, TikaException { if (!isAvailable) { return; @@ -143,16 +139,19 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, LanguageCode languageCode = context.get(LanguageCode.class); uploadFileToBucket(stream, jobName); StartTranscriptionJobRequest startTranscriptionJobRequest = - StartTranscriptionJobRequest.builder() - .build(); - Media media = Media.builder().mediaFileUri(amazonS3.utilities().getUrl(GetUrlRequest.builder().bucket(bucketName).key(jobName).build()).toString()).build(); - startTranscriptionJobRequest = startTranscriptionJobRequest.toBuilder().media(media).outputBucketName(bucketName) - .transcriptionJobName(jobName).build(); + StartTranscriptionJobRequest.builder().build(); + Media media = Media.builder().mediaFileUri(amazonS3.utilities() + .getUrl(GetUrlRequest.builder().bucket(bucketName).key(jobName).build()) + .toString()).build(); + startTranscriptionJobRequest = startTranscriptionJobRequest.toBuilder().media(media) + .outputBucketName(bucketName).transcriptionJobName(jobName).build(); if (languageCode != null) { - startTranscriptionJobRequest = startTranscriptionJobRequest.toBuilder().languageCode(languageCode).build(); + 
startTranscriptionJobRequest = startTranscriptionJobRequest.toBuilder() + .languageCode(languageCode).build(); } else { - startTranscriptionJobRequest = startTranscriptionJobRequest.toBuilder().identifyLanguage(true).build(); + startTranscriptionJobRequest = + startTranscriptionJobRequest.toBuilder().identifyLanguage(true).build(); } amazonTranscribeAsync.startTranscriptionJob(startTranscriptionJobRequest); XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); @@ -168,8 +167,7 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, /** - * @return true if this Transcriber is probably able to transcribe right - * now. + * @return true if this Transcriber is probably able to transcribe right now. * @since Tika 2.1 */ public boolean isAvailable() { @@ -234,19 +232,20 @@ private String getJobKey() { } /** - * Constructs a new {@link PutObjectRequest} object to upload a file to the - * specified bucket and jobName. After constructing the request, users may - * optionally specify object metadata or a canned ACL as well. + * Constructs a new {@link PutObjectRequest} object to upload a file to the specified bucket and + * jobName. After constructing the request, users may optionally specify object metadata or a + * canned ACL as well. * - * @param inputStream, null - * The file to upload to Amazon S3. - * @param jobName The unique job name for each job(UUID). + * @param inputStream, null The file to upload to Amazon S3. + * @param jobName The unique job name for each job(UUID). 
*/ private void uploadFileToBucket(InputStream inputStream, String jobName) throws TikaException { - PutObjectRequest request = PutObjectRequest.builder().bucket(bucketName).key(jobName).build(); + PutObjectRequest request = + PutObjectRequest.builder().bucket(bucketName).key(jobName).build(); try { @SuppressWarnings("unused") - PutObjectResponse response = amazonS3.putObject(request, RequestBody.fromInputStream(inputStream, inputStream.available())); + PutObjectResponse response = amazonS3.putObject(request, + RequestBody.fromInputStream(inputStream, inputStream.available())); } catch (SdkClientException | IOException e) { throw new TikaException("File upload to AWS failed: " + e.getMessage(), e); } @@ -254,12 +253,13 @@ private void uploadFileToBucket(InputStream inputStream, String jobName) throws private void deleteFilesFromBucket(String jobName) throws TikaException { try { - amazonS3.deleteObject(DeleteObjectRequest.builder().bucket(bucketName).key(jobName) - .build()); - amazonS3.deleteObject(DeleteObjectRequest.builder().bucket(bucketName).key(jobName + ".json") - .build()); + amazonS3.deleteObject( + DeleteObjectRequest.builder().bucket(bucketName).key(jobName).build()); + amazonS3.deleteObject(DeleteObjectRequest.builder().bucket(bucketName) + .key(jobName + ".json").build()); } catch (SdkClientException e) { - LOG.error("Failed to delete {} and/or {} from {}", jobName, jobName + ".json", bucketName, e); + LOG.error("Failed to delete {} and/or {} from {}", jobName, jobName + ".json", + bucketName, e); } } @@ -268,33 +268,29 @@ private void deleteFilesFromBucket(String jobName) throws TikaException { * * @param fileNameS3 The path of the file to upload to Amazon S3. * @return The transcribed string result, NULL if the job failed. 
- * @throws IOException possible reasons include (i) an End Event is not received - * from AWS S3 SelectObjectContentResult operation and (ii) a parse exception - * whilst processing JSON from the AWS S3 SelectObjectContentResult operation. - * @throws SdkClientException a AWS-specific exception related to SelectObjectContentResult - * operation. + * @throws IOException possible reasons include (i) an End Event is not received from AWS S3 + * SelectObjectContentResult operation and (ii) a parse exception whilst processing JSON + * from the AWS S3 SelectObjectContentResult operation. + * @throws SdkClientException a AWS-specific exception related to SelectObjectContentResult + * operation. * @throws AwsServiceException possibly thrown if there is an issue selecting object content - * from AWS S3 objects. + * from AWS S3 objects. */ private String getTranscriptText(String fileNameS3) - throws AwsServiceException, SdkClientException, IOException { + throws AwsServiceException, SdkClientException, IOException { TranscriptionJob transcriptionJob = retrieveObjectWhenJobCompleted(fileNameS3); String text = ""; if (transcriptionJob != null && !TranscriptionJobStatus.FAILED - .equals(transcriptionJob.transcriptionJobStatus())) { - ResponseInputStream s3Object = amazonS3.getObject(GetObjectRequest.builder().bucket(bucketName).key(fileNameS3 + ".json") - .build()); + .equals(transcriptionJob.transcriptionJobStatus())) { + ResponseInputStream s3Object = amazonS3.getObject(GetObjectRequest + .builder().bucket(bucketName).key(fileNameS3 + ".json").build()); try (s3Object) { ObjectMapper mapper = new ObjectMapper(); JsonNode root = mapper.readTree(s3Object); - text = root - .path("results") - .path("transcripts") - .get(0) - .path("transcript") - .asText(); + text = root.path("results").path("transcripts").get(0).path("transcript").asText(); // could also be done with json.simple: - // ((JSONObject)((JSONArray)((JSONObject) 
obj.get("results")).get("transcripts")).get(0)).get("transcript") + // ((JSONObject)((JSONArray)((JSONObject) + // obj.get("results")).get("transcripts")).get(0)).get("transcript") } } return text; @@ -307,21 +303,21 @@ private String getTranscriptText(String fileNameS3) * @return TranscriptionJob object */ private TranscriptionJob retrieveObjectWhenJobCompleted(String jobName) { - GetTranscriptionJobRequest transcriptionJobRequest = GetTranscriptionJobRequest.builder().transcriptionJobName(jobName).build(); + GetTranscriptionJobRequest transcriptionJobRequest = + GetTranscriptionJobRequest.builder().transcriptionJobName(jobName).build(); while (true) { - CompletableFuture transcriptionJob = amazonTranscribeAsync.getTranscriptionJob(transcriptionJobRequest); + CompletableFuture transcriptionJob = + amazonTranscribeAsync.getTranscriptionJob(transcriptionJobRequest); GetTranscriptionJobResponse transcriptionJobResponse = transcriptionJob.join(); - TranscriptionJobStatus status = transcriptionJobResponse.transcriptionJob().transcriptionJobStatus(); - if (TranscriptionJobStatus.COMPLETED.equals(status) || - TranscriptionJobStatus.FAILED.equals(status)) { + TranscriptionJobStatus status = + transcriptionJobResponse.transcriptionJob().transcriptionJobStatus(); + if (TranscriptionJobStatus.COMPLETED.equals(status) + || TranscriptionJobStatus.FAILED.equals(status)) { return transcriptionJobResponse.transcriptionJob(); } - try - { + try { Thread.sleep(1000); - } - catch (InterruptedException ex) - { + } catch (InterruptedException ex) { LOG.warn("interrupted"); } } @@ -334,36 +330,37 @@ public void initialize(Map params) throws TikaConfigException { } try { - AwsBasicCredentials creds = AwsBasicCredentials.create(this.clientId, this.clientSecret); + AwsBasicCredentials creds = + AwsBasicCredentials.create(this.clientId, this.clientSecret); this.credsProvider = StaticCredentialsProvider.create(creds); if (region != null) { this.amazonS3 = 
S3Client.builder().credentialsProvider(credsProvider) - .region(Region.of(this.region)).build(); + .region(Region.of(this.region)).build(); } else { - this.amazonS3 = - S3Client.builder().credentialsProvider(credsProvider).build(); - this.region = amazonS3.serviceClientConfiguration().region().id(); // not sure if this works at all + this.amazonS3 = S3Client.builder().credentialsProvider(credsProvider).build(); + this.region = amazonS3.serviceClientConfiguration().region().id(); // not sure if + // this works at + // all } // for debugging - StsClient stsClient = StsClient.builder() - .credentialsProvider(credsProvider).region(Region.of(region)) - .build(); - GetCallerIdentityResponse identity = stsClient.getCallerIdentity(GetCallerIdentityRequest.builder() - .build()); + StsClient stsClient = StsClient.builder().credentialsProvider(credsProvider) + .region(Region.of(region)).build(); + GetCallerIdentityResponse identity = + stsClient.getCallerIdentity(GetCallerIdentityRequest.builder().build()); LOG.debug("Authenticated as: {}", identity.arn()); if (!doesBucketExistV2(amazonS3, bucketName)) { // returns true if no access try { - amazonS3.createBucket(CreateBucketRequest.builder().bucket(this.bucketName) - .build()); + amazonS3.createBucket( + CreateBucketRequest.builder().bucket(this.bucketName).build()); } catch (S3Exception e) { throw new TikaConfigException("couldn't create bucket", e); } } this.amazonTranscribeAsync = - TranscribeAsyncClient.builder().credentialsProvider(credsProvider) - .region(Region.of(this.region)).build(); + TranscribeAsyncClient.builder().credentialsProvider(credsProvider) + .region(Region.of(this.region)).build(); } catch (Exception e) { LOG.warn("Exception reading config file", e); isAvailable = false; @@ -373,11 +370,11 @@ public void initialize(Map params) throws TikaConfigException { @Override public void checkInitialization(InitializableProblemHandler problemHandler) - throws TikaConfigException { - //TODO alert user if they've 
gotten 1 or 2 out of three? + throws TikaConfigException { + // TODO alert user if they've gotten 1 or 2 out of three? this.isAvailable = checkAvailable(); } - + // Thanks, ChatGPT private boolean doesBucketExistV2(S3Client s3, String bucketName) { try { diff --git a/tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/test/java/org/apache/tika/parser/transcribe/aws/AmazonTranscribeTest.java b/tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/test/java/org/apache/tika/parser/transcribe/aws/AmazonTranscribeTest.java index 18224b306d..aa20ab0e88 100644 --- a/tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/test/java/org/apache/tika/parser/transcribe/aws/AmazonTranscribeTest.java +++ b/tika-parsers/tika-parsers-ml/tika-transcribe-aws/src/test/java/org/apache/tika/parser/transcribe/aws/AmazonTranscribeTest.java @@ -1,48 +1,43 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.transcribe.aws; import java.io.InputStream; - -import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.api.Disabled; -import org.junit.jupiter.api.Test; -import software.amazon.awssdk.services.transcribe.model.LanguageCode; - import org.apache.tika.TikaTest; import org.apache.tika.config.TikaConfig; import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Disabled; +import org.junit.jupiter.api.Test; +import software.amazon.awssdk.services.transcribe.model.LanguageCode; -//TODO: Check the ACTUAL output of Amazon Transcribe. +// TODO: Check the ACTUAL output of Amazon Transcribe. /** - * Tests tika-trancribe by creating an AmazonTranscribe() object. - * 1) Tests that transcribe functions properly when it is given just a filepath. - * 2) Both audio (mp3) and video (mp4) files are used in these tests. + * Tests tika-trancribe by creating an AmazonTranscribe() object. 1) Tests that transcribe functions + * properly when it is given just a filepath. 
2) Both audio (mp3) and video (mp4) files are used in + * these tests. * - * How to get this to work: - * 1) remove "@Disabled" - * 2) Get an amazon aws account (preferably the free tier) - * 3) include access key (clientid), secret and bucket name - * in "tika-config-aws-transcribe.xml" (do not commit this file!). The bucket name must be unique worldwide. - * 4) Make sure you have the needed permissions policies, AmazonS3FullAccess and AmazonTranscribeFullAccess. - * 5) Be aware that as of 6/2025, you can get only 60 min / month free transscripts. + * How to get this to work: 1) remove "@Disabled" 2) Get an amazon aws account (preferably the free + * tier) 3) include access key (clientid), secret and bucket name in + * "tika-config-aws-transcribe.xml" (do not commit this file!). The bucket name must be unique + * worldwide. 4) Make sure you have the needed permissions policies, AmazonS3FullAccess and + * AmazonTranscribeFullAccess. 5) Be aware that as of 6/2025, you can get only 60 min / month free + * transscripts. 
*/ @Disabled("Ignore until finalize AmazonTranscribe Interface & build Tika") public class AmazonTranscribeTest extends TikaTest { @@ -52,14 +47,14 @@ public class AmazonTranscribeTest extends TikaTest { @BeforeAll public static void setUp() throws Exception { try (InputStream is = AmazonTranscribeTest.class - .getResourceAsStream("/tika-config-aws-transcribe.xml")) { + .getResourceAsStream("/tika-config-aws-transcribe.xml")) { PARSER = new AutoDetectParser(new TikaConfig(is)); } } /** - * Tests transcribe with an audio file given the source language - * The source language of the file is en-US (English - United States) + * Tests transcribe with an audio file given the source language The source language of the file + * is en-US (English - United States) */ @Test public void testAmazonTranscribeAudio_enUS() throws Exception { @@ -71,8 +66,8 @@ public void testAmazonTranscribeAudio_enUS() throws Exception { } /** - * Tests transcribe with an audio file without passing in the source language. - * The source language of the file is en-US (English - United States) + * Tests transcribe with an audio file without passing in the source language. The source + * language of the file is en-US (English - United States) */ @Test public void testAmazonTranscribeUnknownAudio_enUS() throws Exception { @@ -82,8 +77,8 @@ public void testAmazonTranscribeUnknownAudio_enUS() throws Exception { } /** - * Tests transcribe with an audio file given the source language - * The source language of the file is en-US (English - United States) + * Tests transcribe with an audio file given the source language The source language of the file + * is en-US (English - United States) */ @Test public void testAmazonTranscribeVideo_enUS() throws Exception { @@ -95,8 +90,8 @@ public void testAmazonTranscribeVideo_enUS() throws Exception { } /** - * Tests transcribe with a video file without passing in the source language. 
- * The source language of the file is en-US (English - United States) + * Tests transcribe with a video file without passing in the source language. The source + * language of the file is en-US (English - United States) */ @Test public void testAmazonTranscribeUnknownVideo_enUS() throws Exception { @@ -106,8 +101,8 @@ public void testAmazonTranscribeUnknownVideo_enUS() throws Exception { } /** - * Tests transcribe with an audio file given the source language - * The source language of the file is en-GB (English - Great Britain) + * Tests transcribe with an audio file given the source language The source language of the file + * is en-GB (English - Great Britain) */ @Test public void testAmazonTranscribeAudio_enGB() throws Exception { @@ -120,8 +115,8 @@ public void testAmazonTranscribeAudio_enGB() throws Exception { } /** - * Tests transcribe with an audio file without passing in the source language. - * The source language of the file is en-GB (English - Great Britain) + * Tests transcribe with an audio file without passing in the source language. The source + * language of the file is en-GB (English - Great Britain) */ @Test public void testAmazonTranscribeUnknownAudio_enGB() throws Exception { @@ -132,8 +127,8 @@ public void testAmazonTranscribeUnknownAudio_enGB() throws Exception { } /** - * Tests transcribe with an audio file given the source language - * The source language of the file is en-AU (English - Australia) + * Tests transcribe with an audio file given the source language The source language of the file + * is en-AU (English - Australia) */ @Test public void testAmazonTranscribeAudio_enAU() throws Exception { @@ -146,8 +141,8 @@ public void testAmazonTranscribeAudio_enAU() throws Exception { } /** - * Tests transcribe with an audio file without passing in the source language. - * The source language of the file is en-AU (English - Australian) + * Tests transcribe with an audio file without passing in the source language. 
The source + * language of the file is en-AU (English - Australian) */ @Test public void testAmazonTranscribeUnknownAudio_enAU() throws Exception { @@ -158,8 +153,8 @@ public void testAmazonTranscribeUnknownAudio_enAU() throws Exception { } /** - * Tests transcribe with an audio file given the source language - * The source language of the file is de-DE (German) + * Tests transcribe with an audio file given the source language The source language of the file + * is de-DE (German) */ @Test public void testAmazonTranscribeAudio_deDE() throws Exception { @@ -172,8 +167,8 @@ public void testAmazonTranscribeAudio_deDE() throws Exception { } /** - * Tests transcribe with an audio file without passing in the source language. - * The source language of the file is de-DE (German) + * Tests transcribe with an audio file without passing in the source language. The source + * language of the file is de-DE (German) */ @Test public void testAmazonTranscribeUnknownAudio_deDE() throws Exception { @@ -184,8 +179,8 @@ public void testAmazonTranscribeUnknownAudio_deDE() throws Exception { } /** - * Tests transcribe with an audio file given the source language - * The source language of the file is it-IT (Italian) + * Tests transcribe with an audio file given the source language The source language of the file + * is it-IT (Italian) */ @Test public void testAmazonTranscribeAudio_itIT() throws Exception { @@ -198,8 +193,8 @@ public void testAmazonTranscribeAudio_itIT() throws Exception { } /** - * Tests transcribe with an audio file without passing in the source language. - * The source language of the file is it-IT (Italian) + * Tests transcribe with an audio file without passing in the source language. 
The source + * language of the file is it-IT (Italian) */ @Test public void testAmazonTranscribeUnknownAudio_itIT() throws Exception { @@ -210,13 +205,13 @@ public void testAmazonTranscribeUnknownAudio_itIT() throws Exception { } /** - * Tests transcribe with an audio file given the source language - * The source language of the file is ja-JP (Japanese) + * Tests transcribe with an audio file given the source language The source language of the file + * is ja-JP (Japanese) */ @Test public void testAmazonTranscribeAudio_jaJP() throws Exception { String file = "ja-JP_(We_Are_At_School).mp3"; - String expected = "私達は学校にいます"; //TODO or Watashitachi wa gakkō ni imasu + String expected = "私達は学校にいます"; // TODO or Watashitachi wa gakkō ni imasu ParseContext context = new ParseContext(); context.set(LanguageCode.class, LanguageCode.JA_JP); String xml = getXML(file, PARSER, context).xml; @@ -225,25 +220,25 @@ public void testAmazonTranscribeAudio_jaJP() throws Exception { } /** - * Tests transcribe with an audio file without passing in the source language. - * The source language of the file is ja-JP (Japanese) + * Tests transcribe with an audio file without passing in the source language. 
The source + * language of the file is ja-JP (Japanese) */ @Test public void testAmazonTranscribeUnknownAudio_jaJP() throws Exception { String file = "ja-JP_(We_Are_At_School).mp3"; - String expected = "私達は学校にいます"; //TODO or Watashitachi wa gakkō ni imasu + String expected = "私達は学校にいます"; // TODO or Watashitachi wa gakkō ni imasu String xml = getXML(file, PARSER).xml; assertContains(expected, xml); } /** - * Tests transcribe with an audio file given the source language - * The source language of the file is ko-KR (Korean) + * Tests transcribe with an audio file given the source language The source language of the file + * is ko-KR (Korean) */ @Test public void testAmazonTranscribeAudio_koKR() throws Exception { String file = "ko-KR_(We_Are_Having_Class_x2).mp3"; - String expected = "우리는 수업을하고있다"; //TODO or ulineun sueob-eulhagoissda + String expected = "우리는 수업을하고있다"; // TODO or ulineun sueob-eulhagoissda ParseContext context = new ParseContext(); context.set(LanguageCode.class, LanguageCode.KO_KR); String xml = getXML(file, PARSER, context).xml; @@ -251,25 +246,25 @@ public void testAmazonTranscribeAudio_koKR() throws Exception { } /** - * Tests transcribe with an audio file without passing in the source language. - * The source language of the file is ko-KR (Korean) + * Tests transcribe with an audio file without passing in the source language. 
The source + * language of the file is ko-KR (Korean) */ @Test public void testAmazonTranscribeUnknownAudio_koKR() throws Exception { String file = "ko-KR_(We_Are_Having_Class_x2).mp3"; - String expected = "우리는 수업을하고있다"; //TODO or ulineun sueob-eulhagoissda + String expected = "우리는 수업을하고있다"; // TODO or ulineun sueob-eulhagoissda String xml = getXML(file, PARSER).xml; assertContains(expected, xml); } /** - * Tests transcribe with a video file given the source language - * The source language of the file is ko-KR (Korean) + * Tests transcribe with a video file given the source language The source language of the file + * is ko-KR (Korean) */ @Test public void testAmazonTranscribeVideo_koKR() throws Exception { String file = "ko-KR_(Annyeonghaseyo).mp4"; - //TODO: Check whether output is Annyeonghaseyo or 안녕하세요 + // TODO: Check whether output is Annyeonghaseyo or 안녕하세요 String expected = "Annyeonghaseyo"; ParseContext context = new ParseContext(); context.set(LanguageCode.class, LanguageCode.KO_KR); @@ -278,21 +273,21 @@ public void testAmazonTranscribeVideo_koKR() throws Exception { } /** - * Tests transcribe with an video file without passing in the source language. - * The source language of the file is ko-KR (Korean) + * Tests transcribe with an video file without passing in the source language. 
The source + * language of the file is ko-KR (Korean) */ @Test public void testAmazonTranscribeUnknownVideo_koKR() throws Exception { String file = "ko-KR_(Annyeonghaseyo).mp4"; - //TODO: Check whether output is Annyeonghaseyo or 안녕하세요 + // TODO: Check whether output is Annyeonghaseyo or 안녕하세요 String expected = "Annyeonghaseyo"; String xml = getXML(file, PARSER).xml; assertContains(expected, xml); } /** - * Tests transcribe with an audio file given the source language - * The source language of the file is pt-BR (Portuguese - Brazil) + * Tests transcribe with an audio file given the source language The source language of the file + * is pt-BR (Portuguese - Brazil) */ @Test public void testAmazonTranscribeAudio_ptBR() throws Exception { @@ -305,8 +300,8 @@ public void testAmazonTranscribeAudio_ptBR() throws Exception { } /** - * Tests transcribe with an audio file without passing in the source language. - * The source language of the file is pt-BR (Portuguese - Brazil) + * Tests transcribe with an audio file without passing in the source language. 
The source + * language of the file is pt-BR (Portuguese - Brazil) */ @Test public void testAmazonTranscribeUnknownAudio_ptBR() throws Exception { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/detect/apple/BPListDetector.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/detect/apple/BPListDetector.java index bdbd19ccbf..e95da67cd9 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/detect/apple/BPListDetector.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/detect/apple/BPListDetector.java @@ -1,21 +1,23 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.detect.apple; +import com.dd.plist.NSDictionary; +import com.dd.plist.NSObject; +import com.dd.plist.PropertyListFormatException; +import com.dd.plist.PropertyListParser; import java.io.IOException; import java.io.InputStream; import java.text.ParseException; @@ -23,38 +25,32 @@ import java.util.Map; import java.util.Set; import javax.xml.parsers.ParserConfigurationException; - -import com.dd.plist.NSDictionary; -import com.dd.plist.NSObject; -import com.dd.plist.PropertyListFormatException; -import com.dd.plist.PropertyListParser; import org.apache.commons.io.IOUtils; -import org.xml.sax.SAXException; - import org.apache.tika.detect.Detector; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; +import org.xml.sax.SAXException; /** * Detector for BPList with utility functions for PList. *

    - * Without significant refactoring, this can't easily work as a true - * detector on plist subtypes. Rather, for now, we require the file to be - * parsed and then the parser adds the subtype for xml-based plists. + * Without significant refactoring, this can't easily work as a true detector on plist subtypes. + * Rather, for now, we require the file to be parsed and then the parser adds the subtype for + * xml-based plists. * * @since 1.25 */ public class BPListDetector implements Detector { - //xml versions + // xml versions public static MediaType MEMGRAPH = MediaType.application("x-plist-memgraph"); public static MediaType WEBARCHIVE = MediaType.application("x-plist-webarchive"); public static MediaType PLIST = MediaType.application("x-plist"); public static MediaType ITUNES = MediaType.application("x-plist-itunes"); - //binary versions + // binary versions public static MediaType BMEMGRAPH = MediaType.application("x-bplist-memgraph"); public static MediaType BWEBARCHIVE = MediaType.application("x-bplist-webarchive"); public static MediaType BPLIST = MediaType.application("x-bplist"); @@ -70,17 +66,17 @@ public class BPListDetector implements Detector { } public static MediaType detectOnKeys(Set keySet) { - if (keySet.contains("nodes") && keySet.contains("edges") && - keySet.contains("graphEncodingVersion")) { + if (keySet.contains("nodes") && keySet.contains("edges") + && keySet.contains("graphEncodingVersion")) { return BMEMGRAPH; - } else if (keySet.contains( - "WebMainResource")) { //&& keySet.contains ("WebSubresources") should we require + } else if (keySet.contains("WebMainResource")) { // && keySet.contains ("WebSubresources") + // should we require // this? 
return BWEBARCHIVE; - } else if (keySet.contains("Playlists") && keySet.contains("Tracks") && - keySet.contains("Music Folder")) { + } else if (keySet.contains("Playlists") && keySet.contains("Tracks") + && keySet.contains("Music Folder")) { return BITUNES; - } //if it contains $archiver and $objects, it is a bplist inside a webarchive + } // if it contains $archiver and $objects, it is a bplist inside a webarchive return BPLIST; } @@ -89,7 +85,7 @@ public static MediaType detectXMLOnKeys(Set keySet) { } /** - * @param input input stream must support reset + * @param input input stream must support reset * @param metadata input metadata for the document * @return * @throws IOException @@ -114,11 +110,11 @@ public MediaType detect(InputStream input, Metadata metadata) throws IOException } int i = 0; - if (bytes[i++] != 'b' || bytes[i++] != 'p' || bytes[i++] != 'l' || bytes[i++] != 'i' || - bytes[i++] != 's' || bytes[i++] != 't') { + if (bytes[i++] != 'b' || bytes[i++] != 'p' || bytes[i++] != 'l' || bytes[i++] != 'i' + || bytes[i++] != 's' || bytes[i++] != 't') { return MediaType.OCTET_STREAM; } - //TODO: extract the version with the next two bytes if they were read + // TODO: extract the version with the next two bytes if they were read NSObject rootObj = null; try { if (input instanceof TikaInputStream && ((TikaInputStream) input).hasFile()) { @@ -129,8 +125,8 @@ public MediaType detect(InputStream input, Metadata metadata) throws IOException if (input instanceof TikaInputStream) { ((TikaInputStream) input).setOpenContainer(rootObj); } - } catch (PropertyListFormatException | ParseException | - ParserConfigurationException | SAXException e) { + } catch (PropertyListFormatException | ParseException | ParserConfigurationException + | SAXException e) { throw new IOException("problem parsing root", e); } if (rootObj instanceof NSDictionary) { diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/detect/apple/IWorkDetector.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/detect/apple/IWorkDetector.java index 29287da8b5..71e76c533c 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/detect/apple/IWorkDetector.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/detect/apple/IWorkDetector.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
+ * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.detect.apple; @@ -20,10 +18,8 @@ import java.io.InputStream; import java.util.HashSet; import java.util.Set; - import org.apache.commons.compress.archivers.zip.ZipArchiveEntry; import org.apache.commons.compress.archivers.zip.ZipFile; - import org.apache.tika.detect.zip.StreamingDetectContext; import org.apache.tika.detect.zip.ZipContainerDetector; import org.apache.tika.io.TikaInputStream; @@ -53,7 +49,7 @@ private static MediaType detectIWork(ZipFile zip) { // the correct type of the keynote container. for (String entryName : IWorkPackageParser.IWORK_CONTENT_ENTRIES) { IWorkPackageParser.IWORKDocumentType type = IWorkPackageParser.IWORKDocumentType - .detectType(zip.getEntry(entryName), zip); + .detectType(zip.getEntry(entryName), zip); if (type != null) { return type.getType(); } @@ -81,7 +77,7 @@ public MediaType detect(ZipFile zipFile, TikaInputStream tis) throws IOException @Override public MediaType streamingDetectUpdate(ZipArchiveEntry zae, InputStream zis, - StreamingDetectContext detectContext) { + StreamingDetectContext detectContext) { String name = zae.getName(); EntryNames entryNames = detectContext.get(EntryNames.class); if (entryNames == null) { @@ -91,7 +87,7 @@ public MediaType streamingDetectUpdate(ZipArchiveEntry zae, InputStream zis, entryNames.names.add(name); if (IWorkPackageParser.IWORK_CONTENT_ENTRIES.contains(name)) { IWorkPackageParser.IWORKDocumentType type = - IWorkPackageParser.IWORKDocumentType.detectType(zis); + IWorkPackageParser.IWORKDocumentType.detectType(zis); if (type != null) { return type.getType(); } @@ -117,7 +113,7 @@ public MediaType streamingDetectFinal(StreamingDetectContext 
detectContext) { if (entryNames.names.contains(IWork13PackageParser.IWORK13_MAIN_ENTRY)) { return IWork13PackageParser.IWork13DocumentType.UNKNOWN13.getType(); } - //general iworks + // general iworks if (entryNames.names.contains(IWorkPackageParser.IWORK_COMMON_ENTRY)) { return MediaType.application("vnd.apple.iwork"); } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/apple/AppleSingleFileParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/apple/AppleSingleFileParser.java index d97ed1ba79..bfdb290f69 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/apple/AppleSingleFileParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/apple/AppleSingleFileParser.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.apple; @@ -24,11 +22,7 @@ import java.util.Comparator; import java.util.List; import java.util.Set; - import org.apache.commons.io.IOUtils; -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.exception.TikaException; import org.apache.tika.exception.TikaMemoryLimitException; import org.apache.tika.extractor.EmbeddedDocumentExtractor; @@ -42,10 +36,11 @@ import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; /** - * Parser that strips the header off of AppleSingle and AppleDouble - * files. + * Parser that strips the header off of AppleSingle and AppleDouble files. *

    * See spec document. */ @@ -61,7 +56,7 @@ public class AppleSingleFileParser implements Parser { private static final int COMMENT = 4; private static final int ICON_BW = 5; private static final int ICON_COLOR = 6; - //7?! + // 7?! private static final int FILE_DATES_INFO = 8; private static final int FINDER_INFO = 9; private static final int MACINTOSH_FILE_INFO = 10; @@ -72,7 +67,7 @@ public class AppleSingleFileParser implements Parser { private static final int DIRECTORY_ID = 15; private static final Set SUPPORTED_TYPES = - Collections.singleton(MediaType.application("applefile")); + Collections.singleton(MediaType.application("applefile")); public Set getSupportedTypes(ParseContext context) { return SUPPORTED_TYPES; @@ -80,7 +75,7 @@ public Set getSupportedTypes(ParseContext context) { @Override public void parse(InputStream stream, ContentHandler handler, Metadata metadata, - ParseContext context) throws IOException, SAXException, TikaException { + ParseContext context) throws IOException, SAXException, TikaException { EmbeddedDocumentExtractor ex = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context); @@ -129,12 +124,11 @@ private FieldInfo getContentFieldInfo(List fieldInfoList) { } private long processFieldEntries(InputStream stream, List fieldInfoList, - Metadata embeddedMetadata, long bytesRead) - throws IOException, TikaException { + Metadata embeddedMetadata, long bytesRead) throws IOException, TikaException { byte[] buffer = null; for (FieldInfo f : fieldInfoList) { long diff = f.offset - bytesRead; - //just in case + // just in case IOUtils.skipFully(stream, diff); bytesRead += diff; if (f.entryId == REAL_NAME) { @@ -145,7 +139,7 @@ private long processFieldEntries(InputStream stream, List fieldInfoLi IOUtils.readFully(stream, buffer); bytesRead += f.length; String originalFileName = - new String(buffer, 0, buffer.length, StandardCharsets.US_ASCII); + new String(buffer, 0, buffer.length, StandardCharsets.US_ASCII); 
embeddedMetadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, originalFileName); } else if (f.entryId != DATA_FORK) { IOUtils.skipFully(stream, f.length); @@ -157,36 +151,36 @@ private long processFieldEntries(InputStream stream, List fieldInfoLi private List getSortedFieldInfoList(InputStream stream, short numEntries) - throws IOException, TikaException { - //this is probably overkill. I'd hope that these were already - //in order. This ensures it. + throws IOException, TikaException { + // this is probably overkill. I'd hope that these were already + // in order. This ensures it. List fieldInfoList = new ArrayList<>(numEntries); for (int i = 0; i < numEntries; i++) { - //convert 32-bit unsigned ints to longs - fieldInfoList.add(new FieldInfo(EndianUtils.readUIntBE(stream), //entry id - EndianUtils.readUIntBE(stream), //offset - EndianUtils.readUIntBE(stream) //length + // convert 32-bit unsigned ints to longs + fieldInfoList.add(new FieldInfo(EndianUtils.readUIntBE(stream), // entry id + EndianUtils.readUIntBE(stream), // offset + EndianUtils.readUIntBE(stream) // length )); } if (fieldInfoList.size() == 0) { throw new TikaException("AppleSingleFile missing field info"); } - //make absolutely sure these are in order! + // make absolutely sure these are in order! 
fieldInfoList.sort(Comparator.comparingLong(fieldInfo -> fieldInfo.offset)); return fieldInfoList; } - //read through header until you hit the number of entries + // read through header until you hit the number of entries private short readThroughNumEntries(InputStream stream) throws TikaException, IOException { - //mime + // mime EndianUtils.readIntBE(stream); - //version + // version long version = EndianUtils.readIntBE(stream); if (version != 0x00020000) { throw new TikaException("Version should have been 0x00020000, but was:" + version); } - IOUtils.skipFully(stream, 16);//filler - return EndianUtils.readShortBE(stream);//number of entries + IOUtils.skipFully(stream, 16);// filler + return EndianUtils.readShortBE(stream);// number of entries } private static class FieldInfo { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/apple/PListParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/apple/PListParser.java index e90324ba22..ac6d0516be 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/apple/PListParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/apple/PListParser.java @@ -1,21 +1,30 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.parser.apple; +import com.dd.plist.NSArray; +import com.dd.plist.NSData; +import com.dd.plist.NSDate; +import com.dd.plist.NSDictionary; +import com.dd.plist.NSNumber; +import com.dd.plist.NSObject; +import com.dd.plist.NSSet; +import com.dd.plist.NSString; +import com.dd.plist.PropertyListFormatException; +import com.dd.plist.PropertyListParser; +import com.dd.plist.UID; import java.io.IOException; import java.io.InputStream; import java.text.DateFormat; @@ -28,21 +37,6 @@ import java.util.Map; import java.util.Set; import javax.xml.parsers.ParserConfigurationException; - -import com.dd.plist.NSArray; -import com.dd.plist.NSData; -import com.dd.plist.NSDate; -import com.dd.plist.NSDictionary; -import com.dd.plist.NSNumber; -import com.dd.plist.NSObject; -import com.dd.plist.NSSet; -import com.dd.plist.NSString; -import com.dd.plist.PropertyListFormatException; -import com.dd.plist.PropertyListParser; -import com.dd.plist.UID; -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.detect.apple.BPListDetector; import org.apache.tika.exception.TikaException; import org.apache.tika.extractor.EmbeddedDocumentExtractor; @@ -53,13 +47,14 @@ import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; /** - * Parser for Apple's plist and bplist. This is a wrapper around - * com.googlecode.plist:dd-plist + * Parser for Apple's plist and bplist. This is a wrapper around com.googlecode.plist:dd-plist *

    - * As of 1.25, Tika does not have detection for the text based plist, - * so those files will not be directed to this parser + * As of 1.25, Tika does not have detection for the text based plist, so those files will not be + * directed to this parser * * @since 1.25 */ @@ -77,9 +72,10 @@ public class PListParser implements Parser { private static final String UID = "uid"; - private static final Set SUPPORTED_TYPES = Collections.unmodifiableSet(new HashSet<>( - Arrays.asList(BPListDetector.BITUNES, BPListDetector.BMEMGRAPH, BPListDetector.BPLIST, - BPListDetector.BWEBARCHIVE, BPListDetector.PLIST))); + private static final Set SUPPORTED_TYPES = + Collections.unmodifiableSet(new HashSet<>(Arrays.asList(BPListDetector.BITUNES, + BPListDetector.BMEMGRAPH, BPListDetector.BPLIST, + BPListDetector.BWEBARCHIVE, BPListDetector.PLIST))); @Override public Set getSupportedTypes(ParseContext context) { @@ -88,14 +84,14 @@ public Set getSupportedTypes(ParseContext context) { @Override public void parse(InputStream stream, ContentHandler handler, Metadata metadata, - ParseContext context) throws IOException, SAXException, TikaException { + ParseContext context) throws IOException, SAXException, TikaException { EmbeddedDocumentExtractor embeddedDocumentExtractor = - EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context); + EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context); DateFormat df = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.US); NSObject rootObj = null; - //if this already went through the PListDetector, - //there should be an NSObject in the open container + // if this already went through the PListDetector, + // there should be an NSObject in the open container if (stream instanceof TikaInputStream) { rootObj = (NSObject) ((TikaInputStream) stream).getOpenContainer(); } @@ -107,8 +103,8 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, } else { rootObj = PropertyListParser.parse(stream); } - } catch 
(PropertyListFormatException | ParseException | - ParserConfigurationException e) { + } catch (PropertyListFormatException | ParseException + | ParserConfigurationException e) { throw new TikaException("problem parsing root", e); } } @@ -116,7 +112,7 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, if (BPListDetector.PLIST.toString().equals(contentType)) { if (rootObj instanceof NSDictionary) { MediaType subtype = - BPListDetector.detectXMLOnKeys(((NSDictionary) rootObj).keySet()); + BPListDetector.detectXMLOnKeys(((NSDictionary) rootObj).keySet()); metadata.set(Metadata.CONTENT_TYPE, subtype.toString()); } } @@ -162,12 +158,11 @@ private void parseObject(NSObject obj, State state) throws SAXException, IOExcep parseSet((NSSet) obj, state); state.xhtml.endElement(SET); } else if (obj instanceof UID) { - //do we want to do anything with obj.getBytes() + // do we want to do anything with obj.getBytes() state.xhtml.element(UID, ((UID) obj).getName()); } else { - throw new UnsupportedOperationException( - "don't yet support this type of object: " + obj.getClass() + - " Please open an issue on our tracker"); + throw new UnsupportedOperationException("don't yet support this type of object: " + + obj.getClass() + " Please open an issue on our tracker"); } } @@ -198,8 +193,7 @@ private void handleData(NSData value, State state) throws IOException, SAXExcept } try (TikaInputStream tis = TikaInputStream.get(value.bytes())) { - state.embeddedDocumentExtractor - .parseEmbedded(tis, state.xhtml, embeddedMetadata, true); + state.embeddedDocumentExtractor.parseEmbedded(tis, state.xhtml, embeddedMetadata, true); } } @@ -210,7 +204,7 @@ private static class State { final DateFormat dateFormat; public State(XHTMLContentHandler xhtml, Metadata metadata, - EmbeddedDocumentExtractor embeddedDocumentExtractor, DateFormat df) { + EmbeddedDocumentExtractor embeddedDocumentExtractor, DateFormat df) { this.xhtml = xhtml; this.metadata = metadata; 
this.embeddedDocumentExtractor = embeddedDocumentExtractor; diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/AutoPageNumberUtils.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/AutoPageNumberUtils.java index 2cbb2b5742..d32035a9b4 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/AutoPageNumberUtils.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/AutoPageNumberUtils.java @@ -1,32 +1,29 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.iwork; import java.util.Locale; /** - * Utility class to allow for conversion from an integer to Roman numerals - * or alpha-numeric symbols in line with Pages auto numbering formats. + * Utility class to allow for conversion from an integer to Roman numerals or alpha-numeric symbols + * in line with Pages auto numbering formats. */ class AutoPageNumberUtils { - private static final String[] ALPHABET = - {"A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", - "R", "S", "T", "U", "V", "W", "X", "Y", "Z"}; + private static final String[] ALPHABET = {"A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", + "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z"}; private static final int MAX = 26; @@ -52,6 +49,7 @@ public static String asAlphaNumericLower(int i) { /* * Code copied from jena.apache.org. 
+ * * @see com.hp.hpl.jena.sparql.util.RomanNumeral */ public static String asRomanNumerals(int i) { @@ -81,7 +79,7 @@ public static String asRomanNumeralsLower(int i) { } private static int i2r(StringBuffer sbuff, int i, String tens, int iTens, String nines, - int iNines, String fives, int iFives, String fours, int iFours) { + int iNines, String fives, int iFives, String fours, int iFours) { while (i >= iTens) { sbuff.append(tens); i -= iTens; diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java index 87074304f0..dd133d34a6 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.iwork; @@ -24,7 +22,6 @@ import java.util.HashSet; import java.util.Set; import javax.xml.namespace.QName; - import org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException; import org.apache.commons.compress.archivers.zip.ZipArchiveEntry; import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream; @@ -32,9 +29,6 @@ import org.apache.commons.io.IOUtils; import org.apache.commons.io.input.CloseShieldInputStream; import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream; -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.detect.XmlRootExtractor; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; @@ -43,10 +37,12 @@ import org.apache.tika.parser.Parser; import org.apache.tika.sax.XHTMLContentHandler; import org.apache.tika.utils.XMLReaderUtils; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; /** - * A parser for the IWork container files. This includes *.key, *.pages and *.numbers files. - * This parser delegates the relevant entries to a {@link ContentHandler} that parsers the content. 
+ * A parser for the IWork container files. This includes *.key, *.pages and *.numbers files. This + * parser delegates the relevant entries to a {@link ContentHandler} that parsers the content. *

    * Currently supported formats: *

      @@ -61,7 +57,7 @@ public class IWorkPackageParser implements Parser { * Which files within an iWork file contain the actual content? */ public final static Set IWORK_CONTENT_ENTRIES = Collections.unmodifiableSet( - new HashSet<>(Arrays.asList("index.apxl", "index.xml", "presentation.apxl"))); + new HashSet<>(Arrays.asList("index.apxl", "index.xml", "presentation.apxl"))); /** * All iWork files contain one of these, so we can detect based on it */ @@ -75,16 +71,17 @@ public class IWorkPackageParser implements Parser { * This parser handles all iWorks formats. */ private final static Set supportedTypes = Collections.unmodifiableSet( - new HashSet<>(Arrays.asList(MediaType.application("vnd.apple.iwork"), - IWORKDocumentType.KEYNOTE.getType(), IWORKDocumentType.NUMBERS.getType(), - IWORKDocumentType.PAGES.getType()))); + new HashSet<>(Arrays.asList(MediaType.application("vnd.apple.iwork"), + IWORKDocumentType.KEYNOTE.getType(), + IWORKDocumentType.NUMBERS.getType(), + IWORKDocumentType.PAGES.getType()))); public Set getSupportedTypes(ParseContext context) { return supportedTypes; } public void parse(InputStream stream, ContentHandler handler, Metadata metadata, - ParseContext context) throws IOException, SAXException, TikaException { + ParseContext context) throws IOException, SAXException, TikaException { ZipArchiveInputStream zip = new ZipArchiveInputStream(stream); ZipArchiveEntry entry = zip.getNextEntry(); @@ -125,7 +122,7 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, xhtml.startDocument(); if (contentHandler != null) { XMLReaderUtils.parseSAX(CloseShieldInputStream.wrap(entryStream), - contentHandler, context); + contentHandler, context); } xhtml.endDocument(); } @@ -135,18 +132,19 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, // Don't close the zip InputStream (TIKA-1117). 
} - private IWORKDocumentType detectType(InputStream entryStream, int markLimit) throws IOException { + private IWORKDocumentType detectType(InputStream entryStream, int markLimit) + throws IOException { byte[] bytes = new byte[markLimit]; try { int read = IOUtils.read(entryStream, bytes, 0, markLimit); try (InputStream bis = UnsynchronizedByteArrayInputStream.builder().setByteArray(bytes) - .setOffset(0).setLength(read).get()) { + .setOffset(0).setLength(read).get()) { return IWORKDocumentType.detectType(bis); } } catch (UnsupportedZipFeatureException e) { // There was a problem with extracting the root type // Password Protected iWorks files are funny, but we can usually - // spot them because they encrypt part of the zip stream + // spot them because they encrypt part of the zip stream // Compression field was likely encrypted return IWORKDocumentType.ENCRYPTED; @@ -156,12 +154,16 @@ private IWORKDocumentType detectType(InputStream entryStream, int markLimit) thr public enum IWORKDocumentType { KEYNOTE("http://developer.apple.com/namespaces/keynote2", "presentation", - MediaType.application("vnd.apple.keynote")), - NUMBERS("http://developer.apple.com/namespaces/ls", "document", - MediaType.application("vnd.apple.numbers")), - PAGES("http://developer.apple.com/namespaces/sl", "document", - MediaType.application("vnd.apple.pages")), - ENCRYPTED(null, null, MediaType.application("x-tika-iworks-protected")); + MediaType.application("vnd.apple.keynote")), NUMBERS( + "http://developer.apple.com/namespaces/ls", "document", + MediaType.application("vnd.apple.numbers")), PAGES( + "http://developer.apple.com/namespaces/sl", + "document", + MediaType.application( + "vnd.apple.pages")), ENCRYPTED( + null, null, + MediaType.application( + "x-tika-iworks-protected")); private final String namespace; private final String part; @@ -188,7 +190,7 @@ public static IWORKDocumentType detectType(ZipArchiveEntry entry, ZipFile zip) { } public static IWORKDocumentType 
detectType(ZipArchiveEntry entry, - ZipArchiveInputStream zip) { + ZipArchiveInputStream zip) { if (entry == null) { return null; } @@ -204,7 +206,7 @@ public static IWORKDocumentType detectType(InputStream stream) { for (IWORKDocumentType type : values()) { if (ENCRYPTED == type) { - //namespace and part are null for ENCRYPTED. + // namespace and part are null for ENCRYPTED. continue; } if (type.getNamespace().equals(uri) && type.getPart().equals(local)) { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/KeynoteContentHandler.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/KeynoteContentHandler.java index a3ad3db0a8..76a94473a2 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/KeynoteContentHandler.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/KeynoteContentHandler.java @@ -1,29 +1,26 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.iwork; -import org.xml.sax.Attributes; -import org.xml.sax.SAXException; -import org.xml.sax.helpers.DefaultHandler; - import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.Office; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.Attributes; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; class KeynoteContentHandler extends DefaultHandler { @@ -61,7 +58,7 @@ public void endDocument() throws SAXException { @Override public void startElement(String uri, String localName, String qName, Attributes attributes) - throws SAXException { + throws SAXException { if ("key:theme".equals(qName)) { inTheme = true; } else if ("key:slide".equals(qName)) { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/NumbersContentHandler.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/NumbersContentHandler.java index bda113df79..e78e26b64d 100644 --- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/NumbersContentHandler.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/NumbersContentHandler.java @@ -1,33 +1,29 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.parser.iwork; import java.util.HashMap; import java.util.Map; - -import org.xml.sax.Attributes; -import org.xml.sax.SAXException; -import org.xml.sax.helpers.DefaultHandler; - import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.Office; import org.apache.tika.metadata.Property; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.Attributes; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; class NumbersContentHandler extends DefaultHandler { @@ -63,7 +59,7 @@ public void endDocument() throws SAXException { @Override public void startElement(String uri, String localName, String qName, Attributes attributes) - throws SAXException { + throws SAXException { if ("ls:workspace".equals(qName)) { inSheet = true; numberOfSheets++; diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/PagesContentHandler.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/PagesContentHandler.java index 1ca168cc8c..486ffce78b 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/PagesContentHandler.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/PagesContentHandler.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.parser.iwork; @@ -20,16 +18,14 @@ import java.util.HashMap; import java.util.List; import java.util.Map; - -import org.xml.sax.Attributes; -import org.xml.sax.SAXException; -import org.xml.sax.helpers.DefaultHandler; - import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.Office; import org.apache.tika.metadata.Property; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.Attributes; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; class PagesContentHandler extends DefaultHandler { @@ -51,6 +47,7 @@ class PagesContentHandler extends DefaultHandler { private List activeRow = new ArrayList<>(); private String metaDataLocalName; private String metaDataQName; + PagesContentHandler(XHTMLContentHandler xhtml, Metadata metadata) { this.xhtml = xhtml; this.metadata = metadata; @@ -67,7 +64,7 @@ public void endDocument() throws SAXException { @Override public void startElement(String uri, String localName, String qName, Attributes attributes) - throws SAXException { + throws SAXException { if (parseProperty) { String value = parsePrimitiveElementValue(qName, attributes); if (value != null) { @@ -121,8 +118,8 @@ public void startElement(String uri, String localName, String qName, Attributes } else if ("sf:footer".equals(qName)) { inPart = footers.identifyPart(attributes.getValue("sf:name")); } else if ("sf:page-number".equals(qName)) { - if (inPart == DocumentPart.FOOTER_ODD || inPart == DocumentPart.FOOTER_FIRST || - inPart == DocumentPart.FOOTER_EVEN) { + if (inPart == DocumentPart.FOOTER_ODD || inPart == DocumentPart.FOOTER_FIRST + || inPart == DocumentPart.FOOTER_EVEN) { // We are in a footer footers.hasAutoPageNumber = true; footers.autoPageNumberFormat = attributes.getValue("sf:format"); @@ -270,9 +267,9 @@ private void outputTable(String idRef) throws SAXException { } /** - * Returns a resolved key that is common in other 
document types or - * returns the specified metaDataLocalName if no common key could be found. - * The key could be a simple String key, or could be a {@link Property} + * Returns a resolved key that is common in other document types or returns the specified + * metaDataLocalName if no common key could be found. The key could be a simple String key, or + * could be a {@link Property} * * @param metaDataLocalName The localname of the element containing metadata * @return a resolved key that is common in other document types @@ -294,17 +291,13 @@ private Object resolveMetaDataKey(String metaDataLocalName) { } /** - * Returns the value of a primitive element e.g.: - * <sl:number sfa:number="0" sfa:type="f"/> - the number attribute - * <sl:string sfa:string="en"/> = the string attribute + * Returns the value of a primitive element e.g.: <sl:number sfa:number="0" sfa:type="f"/> + * - the number attribute <sl:string sfa:string="en"/> = the string attribute *

      - * Returns null if the value could not be extracted from - * the list of attributes. + * Returns null if the value could not be extracted from the list of attributes. * - * @param qName The fully qualified name of the element containing - * the value to extract - * @param attributes The list of attributes of which one contains the - * value to be extracted + * @param qName The fully qualified name of the element containing the value to extract + * @param attributes The list of attributes of which one contains the value to be extracted * @return the value of a primitive element */ private String parsePrimitiveElementValue(String qName, Attributes attributes) { @@ -335,13 +328,11 @@ private void doFooter() throws SAXException { * The (interesting) part of the document we're in. Should be more structured... */ private enum DocumentPart { - METADATA, PARSABLE_TEXT, HEADERS, HEADER_ODD, HEADER_EVEN, HEADER_FIRST, FOOTERS, - FOOTER_ODD, FOOTER_EVEN, FOOTER_FIRST, FOOTNOTES, ANNOTATIONS + METADATA, PARSABLE_TEXT, HEADERS, HEADER_ODD, HEADER_EVEN, HEADER_FIRST, FOOTERS, FOOTER_ODD, FOOTER_EVEN, FOOTER_FIRST, FOOTNOTES, ANNOTATIONS } /** - * Represents Footnotes in a document. The way these work - * in the file format isn't very clean... + * Represents Footnotes in a document. The way these work in the file format isn't very clean... 
*/ private static class Footnotes { /** @@ -427,7 +418,7 @@ private void output(String what) throws SAXException { xhtml.characters("\t" + AutoPageNumberUtils.asRomanNumerals(pageCount)); } else if (autoPageNumberFormat.equals("lower-roman")) { xhtml.characters( - "\t" + AutoPageNumberUtils.asRomanNumeralsLower(pageCount)); + "\t" + AutoPageNumberUtils.asRomanNumeralsLower(pageCount)); } else if (autoPageNumberFormat.equals("upper-alpha")) { xhtml.characters("\t" + AutoPageNumberUtils.asAlphaNumeric(pageCount)); } else if (autoPageNumberFormat.equals("lower-alpha")) { @@ -440,8 +431,7 @@ private void output(String what) throws SAXException { } /** - * Represents Annotations in a document. We currently - * just grab all the sf:p text in each one + * Represents Annotations in a document. We currently just grab all the sf:p text in each one */ private static class Annotations { /** diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/iwana/IWork13PackageParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/iwana/IWork13PackageParser.java index ac65ff5871..3c7a4b456d 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/iwana/IWork13PackageParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/iwana/IWork13PackageParser.java @@ -1,22 +1,24 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.parser.iwork.iwana; +import com.dd.plist.NSArray; +import com.dd.plist.NSDictionary; +import com.dd.plist.NSObject; +import com.dd.plist.PropertyListParser; import java.io.IOException; import java.io.InputStream; import java.nio.charset.StandardCharsets; @@ -28,17 +30,9 @@ import java.util.Set; import java.util.zip.ZipEntry; import java.util.zip.ZipInputStream; - -import com.dd.plist.NSArray; -import com.dd.plist.NSDictionary; -import com.dd.plist.NSObject; -import com.dd.plist.PropertyListParser; import org.apache.commons.compress.archivers.zip.ZipArchiveEntry; import org.apache.commons.compress.archivers.zip.ZipFile; import org.apache.commons.io.IOUtils; -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.exception.TikaException; import org.apache.tika.extractor.EmbeddedDocumentExtractor; import org.apache.tika.extractor.EmbeddedDocumentUtil; @@ -51,6 +45,8 @@ import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; public class IWork13PackageParser implements Parser { @@ -62,16 +58,16 @@ public class IWork13PackageParser implements Parser { public static final String IWORKS_PREFIX = "iworks:"; public static final Property IWORKS_DOC_ID = - Property.externalText(IWORKS_PREFIX + "document-id"); + Property.externalText(IWORKS_PREFIX + "document-id"); public static final Property IWORKS_BUILD_VERSION_HISTORY = - Property.externalTextBag(IWORKS_PREFIX + "build-version-history"); + Property.externalTextBag(IWORKS_PREFIX + "build-version-history"); private final static Set supportedTypes = Collections.unmodifiableSet( - new HashSet<>(Arrays.asList(IWork13DocumentType.KEYNOTE13.getType(), - IWork13DocumentType.NUMBERS13.getType(), - IWork13DocumentType.PAGES13.getType(), - IWork13DocumentType.UNKNOWN13.getType()))); + new 
HashSet<>(Arrays.asList(IWork13DocumentType.KEYNOTE13.getType(), + IWork13DocumentType.NUMBERS13.getType(), + IWork13DocumentType.PAGES13.getType(), + IWork13DocumentType.UNKNOWN13.getType()))); @Override public Set getSupportedTypes(ParseContext context) { @@ -80,7 +76,7 @@ public Set getSupportedTypes(ParseContext context) { @Override public void parse(InputStream stream, ContentHandler handler, Metadata metadata, - ParseContext context) throws IOException, SAXException, TikaException { + ParseContext context) throws IOException, SAXException, TikaException { // Open the Zip stream // Use a File if we can, and an already open zip is even better ZipFile zipFile = null; @@ -118,12 +114,12 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, } private MediaType processZipStream(ZipInputStream zipStream, Metadata metadata, - XHTMLContentHandler xhtml, ParseContext parseContext) - throws TikaException, IOException, SAXException { + XHTMLContentHandler xhtml, ParseContext parseContext) + throws TikaException, IOException, SAXException { MediaType type = null; ZipEntry entry = zipStream.getNextEntry(); EmbeddedDocumentExtractor embeddedDocumentExtractor = - EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(parseContext); + EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(parseContext); while (entry != null) { if (type == null) { type = IWork13DocumentType.detectIfPossible(entry); @@ -131,7 +127,8 @@ private MediaType processZipStream(ZipInputStream zipStream, Metadata metadata, try (TemporaryResources tmp = new TemporaryResources()) { TikaInputStream tis = TikaInputStream.get(zipStream, tmp, new Metadata()); - processZipEntry(entry, tis, metadata, xhtml, parseContext, embeddedDocumentExtractor); + processZipEntry(entry, tis, metadata, xhtml, parseContext, + embeddedDocumentExtractor); } entry = zipStream.getNextEntry(); } @@ -141,11 +138,11 @@ private MediaType processZipStream(ZipInputStream zipStream, Metadata metadata, return type; } - 
private MediaType processZipFile(ZipFile zipFile, Metadata metadata, - XHTMLContentHandler xhtml, ParseContext parseContext) throws TikaException { + private MediaType processZipFile(ZipFile zipFile, Metadata metadata, XHTMLContentHandler xhtml, + ParseContext parseContext) throws TikaException { MediaType type = null; EmbeddedDocumentExtractor embeddedDocumentExtractor = - EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(parseContext); + EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(parseContext); Enumeration entries = zipFile.getEntries(); Exception ex = null; @@ -156,7 +153,8 @@ private MediaType processZipFile(ZipFile zipFile, Metadata metadata, type = IWork13DocumentType.detectIfPossible(entry); } try (TikaInputStream tis = TikaInputStream.get(zipFile.getInputStream(entry))) { - processZipEntry(entry, tis, metadata, xhtml, parseContext, embeddedDocumentExtractor); + processZipEntry(entry, tis, metadata, xhtml, parseContext, + embeddedDocumentExtractor); } catch (SecurityException e) { throw e; } catch (Exception e) { @@ -172,12 +170,10 @@ private MediaType processZipFile(ZipFile zipFile, Metadata metadata, return type; } - private void processZipEntry(ZipEntry entry, - TikaInputStream tis, - Metadata metadata, XHTMLContentHandler xhtml, - ParseContext parseContext, - EmbeddedDocumentExtractor embeddedDocumentExtractor) - throws TikaException, IOException, SAXException { + private void processZipEntry(ZipEntry entry, TikaInputStream tis, Metadata metadata, + XHTMLContentHandler xhtml, ParseContext parseContext, + EmbeddedDocumentExtractor embeddedDocumentExtractor) + throws TikaException, IOException, SAXException { String streamName = entry.getName(); if (streamName == null) { return; @@ -189,16 +185,15 @@ private void processZipEntry(ZipEntry entry, } else if ("Metadata/DocumentIdentifier".equals(streamName)) { extractDocumentIdentifier(tis, metadata); } else if ("preview.jpg".equals(streamName)) { - //process thumbnail + // process thumbnail Metadata 
embeddedMetadata = new Metadata(); embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, - TikaCoreProperties.EmbeddedResourceType.THUMBNAIL.toString()); + TikaCoreProperties.EmbeddedResourceType.THUMBNAIL.toString()); embeddedMetadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, streamName); handleEmbedded(tis, embeddedMetadata, xhtml, embeddedDocumentExtractor); - } else if (streamName.equals("preview-micro.jpg") || - streamName.equals("preview-web.jpg") - || streamName.endsWith(".iwa")) { - //do nothing + } else if (streamName.equals("preview-micro.jpg") || streamName.equals("preview-web.jpg") + || streamName.endsWith(".iwa")) { + // do nothing } else { Metadata embeddedMetadata = new Metadata(); embeddedMetadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, streamName); @@ -209,19 +204,19 @@ private void processZipEntry(ZipEntry entry, private void handleEmbedded(TikaInputStream tis, Metadata embeddedMetadata, - XHTMLContentHandler xhtml, - EmbeddedDocumentExtractor embeddedDocumentExtractor) - throws IOException, SAXException { + XHTMLContentHandler xhtml, EmbeddedDocumentExtractor embeddedDocumentExtractor) + throws IOException, SAXException { if (embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) { embeddedDocumentExtractor.parseEmbedded(tis, xhtml, embeddedMetadata, true); } } - private void extractVersionHistory(InputStream inputStream, Metadata metadata) throws TikaException { + private void extractVersionHistory(InputStream inputStream, Metadata metadata) + throws TikaException { try { NSObject rootObj = PropertyListParser.parse(inputStream); if (rootObj instanceof NSArray) { - for (NSObject obj : ((NSArray)rootObj).getArray()) { + for (NSObject obj : ((NSArray) rootObj).getArray()) { metadata.add(IWORKS_BUILD_VERSION_HISTORY, obj.toString()); } } @@ -233,11 +228,12 @@ private void extractVersionHistory(InputStream inputStream, Metadata metadata) t } - private void extractProperties(InputStream inputStream, Metadata metadata) throws 
TikaException { + private void extractProperties(InputStream inputStream, Metadata metadata) + throws TikaException { try { NSObject rootObj = PropertyListParser.parse(inputStream); if (rootObj instanceof NSDictionary) { - NSDictionary dict = (NSDictionary)rootObj; + NSDictionary dict = (NSDictionary) rootObj; for (String k : dict.keySet()) { String v = dict.get(k).toString(); metadata.set(IWORKS_PREFIX + k, v); @@ -251,7 +247,7 @@ private void extractProperties(InputStream inputStream, Metadata metadata) throw } private void extractDocumentIdentifier(InputStream inputStream, Metadata metadata) - throws IOException { + throws IOException { byte[] bytes = new byte[36]; int read = IOUtils.read(inputStream, bytes); if (read == 36) { @@ -275,10 +271,11 @@ private MediaType guessTypeByExtension(Metadata metadata) { } public enum IWork13DocumentType { - KEYNOTE13(MediaType.application("vnd.apple.keynote.13")), - NUMBERS13(MediaType.application("vnd.apple.numbers.13")), - PAGES13(MediaType.application("vnd.apple.pages.13")), - UNKNOWN13(MediaType.application("vnd.apple.unknown.13")); + KEYNOTE13(MediaType.application("vnd.apple.keynote.13")), NUMBERS13( + MediaType.application("vnd.apple.numbers.13")), PAGES13( + MediaType.application("vnd.apple.pages.13")), UNKNOWN13( + MediaType.application( + "vnd.apple.unknown.13")); private final MediaType mediaType; @@ -321,7 +318,7 @@ public static MediaType detectIfPossible(ZipEntry entry) { // Is it the main document? 
if (name.equals(IWORK13_MAIN_ENTRY)) { // TODO Decode the snappy stream, and check for the Message Type - // = 2 (TN::SheetArchive), it is a numbers file; + // = 2 (TN::SheetArchive), it is a numbers file; // = 10000 (TP::DocumentArchive), that's a pages file return null; } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/iwana/IWork18PackageParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/iwana/IWork18PackageParser.java index 28a699039e..0a3b74de98 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/iwana/IWork18PackageParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/iwana/IWork18PackageParser.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.iwork.iwana; @@ -26,28 +24,26 @@ import java.util.Set; import java.util.zip.ZipEntry; import java.util.zip.ZipInputStream; - import org.apache.commons.compress.archivers.zip.ZipFile; -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.exception.TikaException; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; /** - * For now, this parser isn't even registered. It contains - * code that will detect the newer 2018 .keynote, .numbers, .pages files. + * For now, this parser isn't even registered. It contains code that will detect the newer 2018 + * .keynote, .numbers, .pages files. 
*/ public class IWork18PackageParser implements Parser { private final static Set supportedTypes = Collections.unmodifiableSet( - new HashSet<>(Arrays.asList(IWork18DocumentType.KEYNOTE18.getType(), - IWork18DocumentType.NUMBERS18.getType(), - IWork18DocumentType.PAGES18.getType()))); + new HashSet<>(Arrays.asList(IWork18DocumentType.KEYNOTE18.getType(), + IWork18DocumentType.NUMBERS18.getType(), + IWork18DocumentType.PAGES18.getType()))); @Override public Set getSupportedTypes(ParseContext context) { @@ -56,7 +52,7 @@ public Set getSupportedTypes(ParseContext context) { @Override public void parse(InputStream stream, ContentHandler handler, Metadata metadata, - ParseContext context) throws IOException, SAXException, TikaException { + ParseContext context) throws IOException, SAXException, TikaException { // Open the Zip stream // Use a File if we can, and an already open zip is even better ZipFile zipFile = null; @@ -100,9 +96,9 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, } public enum IWork18DocumentType { - KEYNOTE18(MediaType.application("vnd.apple.keynote.18")), - NUMBERS18(MediaType.application("vnd.apple.numbers.18")), - PAGES18(MediaType.application("vnd.apple.pages.18")); + KEYNOTE18(MediaType.application("vnd.apple.keynote.18")), NUMBERS18( + MediaType.application("vnd.apple.numbers.18")), PAGES18( + MediaType.application("vnd.apple.pages.18")); private final MediaType mediaType; diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/test/java/org/apache/tika/detect/apple/IWorkDetectorTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/test/java/org/apache/tika/detect/apple/IWorkDetectorTest.java index c1d1742908..e4688a0dc2 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/test/java/org/apache/tika/detect/apple/IWorkDetectorTest.java +++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/test/java/org/apache/tika/detect/apple/IWorkDetectorTest.java @@ -1,31 +1,28 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.detect.apple; import static org.junit.jupiter.api.Assertions.assertEquals; import org.apache.commons.compress.archivers.zip.ZipFile; -import org.junit.jupiter.api.Test; - import org.apache.tika.TikaTest; import org.apache.tika.io.TikaInputStream; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.iwork.iwana.IWork13PackageParser.IWork13DocumentType; import org.apache.tika.parser.iwork.iwana.IWork18PackageParser.IWork18DocumentType; +import org.junit.jupiter.api.Test; public class IWorkDetectorTest extends TikaTest { @@ -34,7 +31,7 @@ public void testDetectKeynote13() throws Exception { String testFile = "/test-documents/testKeynote2013.detect"; IWorkDetector detector = new IWorkDetector(); try (TikaInputStream tis = TikaInputStream.get(getResourceAsStream(testFile)); - ZipFile zipFile = ZipFile.builder().setFile(tis.getFile()).get()) { + ZipFile zipFile = ZipFile.builder().setFile(tis.getFile()).get()) { MediaType result = detector.detect(zipFile, tis); assertEquals(IWork13DocumentType.KEYNOTE13.getType(), result); } @@ -45,7 +42,7 @@ public void testDetectKeynote18() throws Exception { String testFile = "/test-documents/testKeynote2018.key"; IWorkDetector detector = new IWorkDetector(); try (TikaInputStream tis = TikaInputStream.get(getResourceAsStream(testFile)); - ZipFile zipFile = ZipFile.builder().setFile(tis.getFile()).get()) { + ZipFile zipFile = ZipFile.builder().setFile(tis.getFile()).get()) { MediaType result = detector.detect(zipFile, tis); assertEquals(IWork18DocumentType.KEYNOTE18.getType(), result); } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/test/java/org/apache/tika/parser/apple/PListParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/test/java/org/apache/tika/parser/apple/PListParserTest.java index 420f90e325..ff4f94f06d 100644 --- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/test/java/org/apache/tika/parser/apple/PListParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/test/java/org/apache/tika/parser/apple/PListParserTest.java @@ -1,37 +1,33 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.parser.apple; import static org.junit.jupiter.api.Assertions.assertEquals; import java.util.List; - -import org.junit.jupiter.api.Test; - import org.apache.tika.TikaTest; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; +import org.junit.jupiter.api.Test; public class PListParserTest extends TikaTest { @Test public void testBasicBinaryPList() throws Exception { - //test file is MIT licensed: + // test file is MIT licensed: // https://github.com/joeferner/node-bplist-parser/blob/master/test/iTunes-small.bplist List metadataList = getRecursiveMetadata("testBPList.bplist"); assertEquals(21, metadataList.size()); @@ -40,11 +36,11 @@ public void testBasicBinaryPList() throws Exception { String content = m.get(TikaCoreProperties.TIKA_CONTENT); assertContains("Application Version9.0", content); - //TODO -- bad encoding right after this...smart quote? + // TODO -- bad encoding right after this...smart quote? assertContains("90", content); } - //TODO -- add unit tests for memgraph - //TODO -- convert existing unit tests to xml plist and add unit tests. + // TODO -- add unit tests for memgraph + // TODO -- convert existing unit tests to xml plist and add unit tests. 
} diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/test/java/org/apache/tika/parser/iwork/AutoPageNumberUtilsTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/test/java/org/apache/tika/parser/iwork/AutoPageNumberUtilsTest.java index f01c1bcd03..4a4e6fd6fe 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/test/java/org/apache/tika/parser/iwork/AutoPageNumberUtilsTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/test/java/org/apache/tika/parser/iwork/AutoPageNumberUtilsTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
+ * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.iwork; @@ -26,8 +24,7 @@ public class AutoPageNumberUtilsTest { /** - * Check upper-case alpha-numeric numbers are generated based on the - * input page number. + * Check upper-case alpha-numeric numbers are generated based on the input page number. */ @Test public void testAlphaUpper() { @@ -40,8 +37,7 @@ public void testAlphaUpper() { } /** - * Check lower-case alpha-numeric numbers are generated based on the - * input page number. + * Check lower-case alpha-numeric numbers are generated based on the input page number. */ @Test public void testAlphaLower() { @@ -54,8 +50,7 @@ public void testAlphaLower() { } /** - * Check upper-case Roman numerals numbers are generated based on the - * input page number. + * Check upper-case Roman numerals numbers are generated based on the input page number. */ @Test public void testRomanUpper() { @@ -65,8 +60,7 @@ public void testRomanUpper() { } /** - * Check lower-case Roman numerals numbers are generated based on the - * input page number. + * Check lower-case Roman numerals numbers are generated based on the input page number. 
*/ @Test public void testRomanLower() { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java index fffc38f9a8..93f8315ab4 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.iwork; @@ -23,11 +21,6 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.List; - -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; -import org.xml.sax.ContentHandler; - import org.apache.tika.TikaTest; import org.apache.tika.detect.CompositeDetector; import org.apache.tika.detect.Detector; @@ -40,6 +33,9 @@ import org.apache.tika.mime.MediaType; import org.apache.tika.parser.ParseContext; import org.apache.tika.sax.BodyContentHandler; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.xml.sax.ContentHandler; /** * Tests if the IWork parser parses the content and metadata properly of the supported formats. @@ -64,7 +60,7 @@ public void testStreamNotClosed() throws Exception { Metadata metadata = new Metadata(); ContentHandler handler = new BodyContentHandler(); iWorkParser.parse(input, handler, metadata, new ParseContext()); - input.read(); // Will throw an Exception if the stream was already closed. + input.read(); // Will throw an Exception if the stream was already closed. 
} @Test @@ -77,15 +73,15 @@ public void testParseKeynote() throws Exception { assertTrue(metadata.size() >= 6, "Insufficient metadata found " + metadata.size()); List metadataKeys = Arrays.asList(metadata.names()); assertTrue(metadataKeys.contains(Metadata.CONTENT_TYPE), - "Metadata not found in " + metadataKeys); + "Metadata not found in " + metadataKeys); assertTrue(metadataKeys.contains(Office.SLIDE_COUNT.getName()), - "Metadata not found in " + metadataKeys); -// assertTrue("Metadata not found in " + metadataKeys, -// metadataKeys.contains(Office.SLIDE_COUNT.getName())); + "Metadata not found in " + metadataKeys); + // assertTrue("Metadata not found in " + metadataKeys, + // metadataKeys.contains(Office.SLIDE_COUNT.getName())); assertTrue(metadataKeys.contains(TikaCoreProperties.CREATOR.getName()), - "Metadata not found in " + metadataKeys); + "Metadata not found in " + metadataKeys); assertTrue(metadataKeys.contains(TikaCoreProperties.TITLE.getName()), - "Metadata not found in " + metadataKeys); + "Metadata not found in " + metadataKeys); // Check the metadata values assertEquals("application/vnd.apple.keynote", metadata.get(Metadata.CONTENT_TYPE)); @@ -126,7 +122,7 @@ public void testKeynoteTextBoxes() throws Exception { public void testKeynoteBulletPoints() throws Exception { String content = getText("testBulletPoints.key", iWorkParser); assertTrue(content.replaceAll("\\s+", " ") - .contains("bullet point 1 bullet point 2 bullet point 3")); + .contains("bullet point 1 bullet point 2 bullet point 3")); } // TIKA-923 @@ -156,17 +152,17 @@ public void testParsePages() throws Exception { assertTrue(metadata.size() >= 50, "Insufficient metadata found " + metadata.size()); List metadataKeys = Arrays.asList(metadata.names()); assertTrue(metadataKeys.contains(Metadata.CONTENT_TYPE), - "Metadata not found in " + metadataKeys); + "Metadata not found in " + metadataKeys); assertTrue(metadataKeys.contains(Office.PAGE_COUNT.getName()), - "Metadata not found in " + 
metadataKeys); + "Metadata not found in " + metadataKeys); assertTrue(metadataKeys.contains(TikaCoreProperties.CREATOR.getName()), - "Metadata not found in " + metadataKeys); + "Metadata not found in " + metadataKeys); assertTrue(metadataKeys.contains(TikaCoreProperties.TITLE.getName()), - "Metadata not found in " + metadataKeys); + "Metadata not found in " + metadataKeys); assertTrue(metadataKeys.contains(TikaCoreProperties.MODIFIED.getName()), - "Metadata not found in " + metadataKeys); + "Metadata not found in " + metadataKeys); assertTrue(metadataKeys.contains(TikaCoreProperties.LANGUAGE.getName()), - "Metadata not found in " + metadataKeys); + "Metadata not found in " + metadataKeys); // Check the metadata values assertEquals("application/vnd.apple.pages", metadata.get(Metadata.CONTENT_TYPE)); @@ -217,17 +213,17 @@ public void testParseNumbers() throws Exception { assertTrue(metadata.size() >= 8, "Insufficient metadata found " + metadata.size()); List metadataKeys = Arrays.asList(metadata.names()); assertTrue(metadataKeys.contains(Metadata.CONTENT_TYPE), - "Metadata not found in " + metadataKeys); + "Metadata not found in " + metadataKeys); assertTrue(metadataKeys.contains(Office.PAGE_COUNT.getName()), - "Metadata not found in " + metadataKeys); + "Metadata not found in " + metadataKeys); assertTrue(metadataKeys.contains(TikaCoreProperties.CREATOR.getName()), - "Metadata not found in " + metadataKeys); + "Metadata not found in " + metadataKeys); assertTrue(metadataKeys.contains(TikaCoreProperties.COMMENTS.getName()), - "Metadata not found in " + metadataKeys); + "Metadata not found in " + metadataKeys); assertTrue(metadataKeys.contains(TikaCoreProperties.TITLE.getName()), - "Metadata not found in " + metadataKeys); + "Metadata not found in " + metadataKeys); assertTrue(metadataKeys.contains(TikaCoreProperties.TITLE.getName()), - "Metadata not found in " + metadataKeys); + "Metadata not found in " + metadataKeys); // Check the metadata values assertEquals("2", 
metadata.get(Office.PAGE_COUNT)); @@ -267,9 +263,8 @@ public void testParseNumbersTableHeaders() throws Exception { } /** - * We don't currently support password protected Pages files, as - * we don't know how the encryption works (it's not regular Zip - * Encryption). See TIKA-903 for details + * We don't currently support password protected Pages files, as we don't know how the + * encryption works (it's not regular Zip Encryption). See TIKA-903 for details */ @Test public void testParsePagesPasswordProtected() throws Exception { @@ -403,14 +398,14 @@ public void testNumbersExtractChartNames() throws Exception { assertContains("Chart 2", content); } - //TIKA-3020 + // TIKA-3020 @Test public void testKeyNoteTableMarkup() throws Exception { String expected = - "\t\t\t" + - "\t\t\t" + - "\t\t\t" + - "
      Cell oneCell twoCell three
      Cell fourCell 5Cell six
      7Cell eight5/5/1985
      "; + "\t\t\t" + + "\t\t\t" + + "\t\t\t" + + "
      Cell oneCell twoCell three
      Cell fourCell 5Cell six
      7Cell eight5/5/1985
      "; String xml = getXML("testKeynote.key", iWorkParser).xml; xml = xml.replaceAll("[\r\n]", ""); assertContains(expected, xml); @@ -418,12 +413,12 @@ public void testKeyNoteTableMarkup() throws Exception { @Test public void testNPEInDetection() throws Exception { - //TIKA-3639 + // TIKA-3639 List zips = new ArrayList<>(); zips.add(new IWorkDetector()); Detector d = new CompositeDetector(new DefaultZipContainerDetector(zips)); try (InputStream is = this.getClass() - .getResourceAsStream("/test-documents/testIWorksNPEDetector.zip")) { + .getResourceAsStream("/test-documents/testIWorksNPEDetector.zip")) { MediaType mt = d.detect(is, new Metadata()); assertEquals(MediaType.application("zip"), mt); } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/test/java/org/apache/tika/parser/iwork/iwana/IWork13ParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/test/java/org/apache/tika/parser/iwork/iwana/IWork13ParserTest.java index c36260a696..39d6a5f461 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/test/java/org/apache/tika/parser/iwork/iwana/IWork13ParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/test/java/org/apache/tika/parser/iwork/iwana/IWork13ParserTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. 
See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.iwork.iwana; @@ -20,22 +18,19 @@ import java.io.InputStream; import java.util.List; - -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; -import org.xml.sax.ContentHandler; - import org.apache.tika.TikaTest; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.sax.BodyContentHandler; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.xml.sax.ContentHandler; /** - * Limited testing for the iWorks 13 format parser, which - * currently doesn't do much more than detection and handling - * some embedded files.... + * Limited testing for the iWorks 13 format parser, which currently doesn't do much more than + * detection and handling some embedded files.... 
*/ public class IWork13ParserTest extends TikaTest { private IWork13PackageParser iWorkParser; @@ -57,7 +52,7 @@ public void testParseKeynote13() throws Exception { assertEquals(9, metadata.size()); assertEquals(IWork13PackageParser.IWork13DocumentType.KEYNOTE13.getType().toString(), - metadata.get(Metadata.CONTENT_TYPE)); + metadata.get(Metadata.CONTENT_TYPE)); } @Test @@ -68,11 +63,10 @@ public void testParseNumbers13() throws Exception { iWorkParser.parse(input, handler, metadata, parseContext); // Currently parsing is a no-op, and we can't get the type without - // decoding the Snappy stream + // decoding the Snappy stream // TODO Test properly when a full Parser is added - assertEquals( - IWork13PackageParser.IWork13DocumentType.UNKNOWN13.getType().toString(), - metadata.get(Metadata.CONTENT_TYPE)); + assertEquals(IWork13PackageParser.IWork13DocumentType.UNKNOWN13.getType().toString(), + metadata.get(Metadata.CONTENT_TYPE)); assertEquals("preview.jpg", handler.toString().trim()); } @@ -84,11 +78,10 @@ public void testParsePages13() throws Exception { iWorkParser.parse(input, handler, metadata, parseContext); // Currently parsing is a no-op, and we can't get the type without - // decoding the Snappy stream + // decoding the Snappy stream // TODO Test properly when a full Parser is added - assertEquals( - IWork13PackageParser.IWork13DocumentType.UNKNOWN13.getType().toString(), - metadata.get(Metadata.CONTENT_TYPE)); + assertEquals(IWork13PackageParser.IWork13DocumentType.UNKNOWN13.getType().toString(), + metadata.get(Metadata.CONTENT_TYPE)); assertEquals("preview.jpg", handler.toString().trim()); } @@ -99,12 +92,12 @@ public void testNumbers13WFileName() throws Exception { List metadataList = getRecursiveMetadata("testNumbers2013.numbers", metadata); assertEquals(2, metadataList.size()); assertEquals("application/vnd.apple.numbers.13", - metadataList.get(0).get(Metadata.CONTENT_TYPE)); + metadataList.get(0).get(Metadata.CONTENT_TYPE)); assertEquals("true", 
metadataList.get(0).get("iworks:isMultiPage")); assertEquals("C5ED6463-575C-43B9-8FDA-1957B186C422", - metadataList.get(0).get("iworks:versionUUID")); + metadataList.get(0).get("iworks:versionUUID")); assertEquals("image/jpeg", metadataList.get(1).get(Metadata.CONTENT_TYPE)); assertEquals(TikaCoreProperties.EmbeddedResourceType.THUMBNAIL.toString(), - metadataList.get(1).get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE)); + metadataList.get(1).get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE)); } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/detect/MatroskaDetector.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/detect/MatroskaDetector.java index b2b25f6060..f5bf9a4631 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/detect/MatroskaDetector.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/detect/MatroskaDetector.java @@ -1,27 +1,23 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an "AS IS" - * BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express - * or implied. See the License for the specific language governing - * permissions and limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.detect; import java.io.IOException; import java.io.InputStream; - import org.apache.commons.io.IOUtils; - import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; @@ -33,19 +29,16 @@ public class MatroskaDetector implements Detector { /** For serialization compatibility. */ private static final long serialVersionUID = 1L; - private static final MediaType MATROSKA = - MediaType.application("x-matroska"); + private static final MediaType MATROSKA = MediaType.application("x-matroska"); - private static final MediaType WEBM = - MediaType.video("webm"); + private static final MediaType WEBM = MediaType.video("webm"); - private static final byte[] EBML_HEADER = - new byte[]{0x1A, 0x45, (byte) 0xDF, (byte) 0xA3}; + private static final byte[] EBML_HEADER = new byte[] {0x1A, 0x45, (byte) 0xDF, (byte) 0xA3}; /** * Detects the media type of the input stream by inspecting EBML headers. 
* - * @param input the input stream + * @param input the input stream * @param metadata the metadata to populate * @return detected MediaType (WEBM, Matroska, or OCTET_STREAM) * @throws IOException if an I/O error occurs @@ -76,16 +69,12 @@ public MediaType detect(InputStream input, Metadata metadata) throws IOException } for (int i = 4; i < bytesRead - 4; i++) { - if (header[i] == 'w' - && header[i + 1] == 'e' - && header[i + 2] == 'b' - && header[i + 3] == 'm') { + if (header[i] == 'w' && header[i + 1] == 'e' && header[i + 2] == 'b' + && header[i + 3] == 'm') { return WEBM; } - if (header[i] == 'm' - && header[i + 1] == 'a' - && header[i + 2] == 't' - && header[i + 3] == 'r') { + if (header[i] == 'm' && header[i + 1] == 'a' && header[i + 2] == 't' + && header[i + 3] == 'r') { return MATROSKA; } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/audio/AudioParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/audio/AudioParser.java index 64596b9acf..0c6e5764b1 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/audio/AudioParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/audio/AudioParser.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.parser.audio; @@ -30,12 +28,8 @@ import javax.sound.sampled.AudioFormat; import javax.sound.sampled.AudioSystem; import javax.sound.sampled.UnsupportedAudioFileException; - import org.apache.commons.io.IOUtils; import org.apache.commons.io.input.ProxyInputStream; -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; @@ -44,6 +38,8 @@ import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; public class AudioParser implements Parser { @@ -52,23 +48,22 @@ public class AudioParser implements Parser { */ private static final long serialVersionUID = -6015684081240882695L; - private static final String UNSUPPORTED_AUDIO_FILE_EXCEPTION = "An " + - "UnsupportedAudioFileException was thrown. This could mean that the underlying " + - "parser hit an EndOfFileException or that the file is unsupported. ¯\\_(ツ)_/¯"; + private static final String UNSUPPORTED_AUDIO_FILE_EXCEPTION = "An " + + "UnsupportedAudioFileException was thrown. This could mean that the underlying " + + "parser hit an EndOfFileException or that the file is unsupported. 
¯\\_(ツ)_/¯"; - private static final Set SUPPORTED_TYPES = Collections.unmodifiableSet( - new HashSet<>( + private static final Set SUPPORTED_TYPES = Collections.unmodifiableSet(new HashSet<>( Arrays.asList(MediaType.audio("basic"), MediaType.audio("vnd.wave"), - // Official, fixed in Tika 1.16 - MediaType.audio("x-wav"), // Older, used until Tika 1.16 - MediaType.audio("x-aiff")))); + // Official, fixed in Tika 1.16 + MediaType.audio("x-wav"), // Older, used until Tika 1.16 + MediaType.audio("x-aiff")))); public Set getSupportedTypes(ParseContext context) { return SUPPORTED_TYPES; } public void parse(InputStream stream, ContentHandler handler, Metadata metadata, - ParseContext context) throws IOException, SAXException, TikaException { + ParseContext context) throws IOException, SAXException, TikaException { // AudioSystem expects the stream to support the mark feature if (!stream.markSupported()) { stream = new BufferedInputStream(stream); @@ -128,7 +123,7 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, // In Java 8, the AIFFReader throws an EOF, but // in Java 11, that EOF is swallowed and an UAFE is thrown. 
metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, - UNSUPPORTED_AUDIO_FILE_EXCEPTION); + UNSUPPORTED_AUDIO_FILE_EXCEPTION); } XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/audio/MidiParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/audio/MidiParser.java index d7622e97a2..4b6dd5ac8d 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/audio/MidiParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/audio/MidiParser.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.audio; @@ -32,16 +30,14 @@ import javax.sound.midi.Patch; import javax.sound.midi.Sequence; import javax.sound.midi.Track; - -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; public class MidiParser implements Parser { @@ -50,8 +46,7 @@ public class MidiParser implements Parser { */ private static final long serialVersionUID = 6343278584336189432L; - private static final Set SUPPORTED_TYPES = Collections.unmodifiableSet( - new HashSet<>( + private static final Set SUPPORTED_TYPES = Collections.unmodifiableSet(new HashSet<>( Arrays.asList(MediaType.application("x-midi"), MediaType.audio("midi")))); public Set getSupportedTypes(ParseContext context) { @@ -59,7 +54,7 @@ public Set getSupportedTypes(ParseContext context) { } public void parse(InputStream stream, ContentHandler handler, Metadata metadata, - 
ParseContext context) throws IOException, SAXException, TikaException { + ParseContext context) throws IOException, SAXException, TikaException { metadata.set(Metadata.CONTENT_TYPE, "audio/midi"); XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/AudioFrame.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/AudioFrame.java index c9a9b21e6c..5f17f968b7 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/AudioFrame.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/AudioFrame.java @@ -1,32 +1,28 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.mp3; import java.io.IOException; import java.io.InputStream; - +import org.apache.tika.exception.TikaException; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; -import org.apache.tika.exception.TikaException; - /** - * An Audio Frame in an MP3 file. These come after the ID3v2 tags in the file. - * Currently, only the header is processed, not the raw audio data. + * An Audio Frame in an MP3 file. These come after the ID3v2 tags in the file. Currently, only the + * header is processed, not the raw audio data. */ public class AudioFrame implements MP3Frame { /** @@ -73,7 +69,7 @@ public class AudioFrame implements MP3Frame { */ @Deprecated public AudioFrame(InputStream stream, ContentHandler handler) - throws IOException, SAXException, TikaException { + throws IOException, SAXException, TikaException { this(-2, -2, -2, -2, stream); } @@ -132,15 +128,15 @@ public AudioFrame(int h1, int h2, int h3, int h4, InputStream in) throws IOExcep * Creates a new instance of {@code AudioFrame} and initializes all properties. 
* * @param mpegVersion the code for the MPEG version - * @param layer the code for the layer - * @param bitRate the bit rate (in bps) - * @param sampleRate the sample rate (in samples per second) - * @param channels the number of channels - * @param length the frame length (in bytes) - * @param duration the duration of this frame (in milliseconds) + * @param layer the code for the layer + * @param bitRate the bit rate (in bps) + * @param sampleRate the sample rate (in samples per second) + * @param channels the number of channels + * @param length the frame length (in bytes) + * @param duration the duration of this frame (in milliseconds) */ public AudioFrame(int mpegVersion, int layer, int bitRate, int sampleRate, int channels, - int length, float duration) { + int length, float duration) { versionCode = mpegVersion; this.layer = layer; this.bitRate = bitRate; @@ -170,7 +166,7 @@ public static boolean isAudioHeader(int h1, int h2, int h3, int h4) { * Generates a string for the version of this audio frame. 
* * @param version the code for the MPEG version - * @param layer the code for the layer + * @param layer the code for the layer * @return a string for the version */ private static String generateVersionStr(int version, int layer) { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/CompositeTagHandler.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/CompositeTagHandler.java index b7d2d75427..aacf3f8266 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/CompositeTagHandler.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/CompositeTagHandler.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.mp3; @@ -20,8 +18,8 @@ import java.util.List; /** - * Takes an array of {@link ID3Tags} in preference order, and when asked for - * a given tag, will return it from the first {@link ID3Tags} that has it. + * Takes an array of {@link ID3Tags} in preference order, and when asked for a given tag, will + * return it from the first {@link ID3Tags} that has it. 
*/ public class CompositeTagHandler implements ID3Tags { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/ID3Tags.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/ID3Tags.java index 9217156f6c..0a4df2d1c9 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/ID3Tags.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/ID3Tags.java @@ -1,32 +1,28 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.mp3; import java.util.List; /** - * Interface that defines the common interface for ID3 tag parsers, - * such as ID3v1 and ID3v2.3. - * Implementations should return NULL if the file lacks a given - * tag, or if the tag isn't defined for the version. + * Interface that defines the common interface for ID3 tag parsers, such as ID3v1 and ID3v2.3. + * Implementations should return NULL if the file lacks a given tag, or if the tag isn't defined for + * the version. *

      - * Note that so far, only the ID3v1 core tags are listed here. In - * future, we may wish to add more to cover the extra tags that - * our ID3v2 handlers can produce. + * Note that so far, only the ID3v1 core tags are listed here. In future, we may wish to add more to + * cover the extra tags that our ID3v2 handlers can produce. */ public interface ID3Tags { /** @@ -34,134 +30,45 @@ public interface ID3Tags { *

      * See http://www.id3.org/id3v2-00 */ - String[] GENRES = new String[]{ - /* 0 */ "Blues", - /* 1 */ "Classic Rock", - /* 2 */ "Country", - /* 3 */ "Dance", - /* 4 */ "Disco", - /* 5 */ "Funk", - /* 6 */ "Grunge", - /* 7 */ "Hip-Hop", - /* 8 */ "Jazz", - /* 9 */ "Metal", - /* 10 */ "New Age", - /* 11 */ "Oldies", - /* 12 */ "Other", - /* 13 */ "Pop", - /* 14 */ "R&B", - /* 15 */ "Rap", - /* 16 */ "Reggae", - /* 17 */ "Rock", - /* 18 */ "Techno", - /* 19 */ "Industrial", - /* 20 */ "Alternative", - /* 21 */ "Ska", - /* 22 */ "Death Metal", - /* 23 */ "Pranks", - /* 24 */ "Soundtrack", - /* 25 */ "Euro-Techno", - /* 26 */ "Ambient", - /* 27 */ "Trip-Hop", - /* 28 */ "Vocal", - /* 29 */ "Jazz+Funk", - /* 30 */ "Fusion", - /* 31 */ "Trance", - /* 32 */ "Classical", - /* 33 */ "Instrumental", - /* 34 */ "Acid", - /* 35 */ "House", - /* 36 */ "Game", - /* 37 */ "Sound Clip", - /* 38 */ "Gospel", - /* 39 */ "Noise", - /* 40 */ "AlternRock", - /* 41 */ "Bass", - /* 42 */ "Soul", - /* 43 */ "Punk", - /* 44 */ "Space", - /* 45 */ "Meditative", - /* 46 */ "Instrumental Pop", - /* 47 */ "Instrumental Rock", - /* 48 */ "Ethnic", - /* 49 */ "Gothic", - /* 50 */ "Darkwave", - /* 51 */ "Techno-Industrial", - /* 52 */ "Electronic", - /* 53 */ "Pop-Folk", - /* 54 */ "Eurodance", - /* 55 */ "Dream", - /* 56 */ "Southern Rock", - /* 57 */ "Comedy", - /* 58 */ "Cult", - /* 59 */ "Gangsta", - /* 60 */ "Top 40", - /* 61 */ "Christian Rap", - /* 62 */ "Pop/Funk", - /* 63 */ "Jungle", - /* 64 */ "Native American", - /* 65 */ "Cabaret", - /* 66 */ "New Wave", - /* 67 */ "Psychadelic", - /* 68 */ "Rave", - /* 69 */ "Showtunes", - /* 70 */ "Trailer", - /* 71 */ "Lo-Fi", - /* 72 */ "Tribal", - /* 73 */ "Acid Punk", - /* 74 */ "Acid Jazz", - /* 75 */ "Polka", - /* 76 */ "Retro", - /* 77 */ "Musical", - /* 78 */ "Rock & Roll", - /* 79 */ "Hard Rock", - /* 80 */ "Folk", - /* 81 */ "Folk-Rock", - /* 82 */ "National Folk", - /* 83 */ "Swing", - /* 84 */ "Fast Fusion", - /* 85 */ "Bebob", - /* 
86 */ "Latin", - /* 87 */ "Revival", - /* 88 */ "Celtic", - /* 89 */ "Bluegrass", - /* 90 */ "Avantgarde", - /* 91 */ "Gothic Rock", - /* 92 */ "Progressive Rock", - /* 93 */ "Psychedelic Rock", - /* 94 */ "Symphonic Rock", - /* 95 */ "Slow Rock", - /* 96 */ "Big Band", - /* 97 */ "Chorus", - /* 98 */ "Easy Listening", - /* 99 */ "Acoustic", - /* 100 */ "Humour", - /* 101 */ "Speech", - /* 102 */ "Chanson", - /* 103 */ "Opera", - /* 104 */ "Chamber Music", - /* 105 */ "Sonata", - /* 106 */ "Symphony", - /* 107 */ "Booty Bass", - /* 108 */ "Primus", - /* 109 */ "Porn Groove", - /* 110 */ "Satire", - /* 111 */ "Slow Jam", - /* 112 */ "Club", - /* 113 */ "Tango", - /* 114 */ "Samba", - /* 115 */ "Folklore", - /* 116 */ "Ballad", - /* 117 */ "Power Ballad", - /* 118 */ "Rhythmic Soul", - /* 119 */ "Freestyle", - /* 120 */ "Duet", - /* 121 */ "Punk Rock", - /* 122 */ "Drum Solo", - /* 123 */ "A capella", - /* 124 */ "Euro-House", - /* 125 */ "Dance Hall", - /* sentinel */ ""}; + String[] GENRES = new String[] {/* 0 */ "Blues", /* 1 */ "Classic Rock", /* 2 */ "Country", + /* 3 */ "Dance", /* 4 */ "Disco", /* 5 */ "Funk", /* 6 */ "Grunge", + /* 7 */ "Hip-Hop", /* 8 */ "Jazz", /* 9 */ "Metal", /* 10 */ "New Age", + /* 11 */ "Oldies", /* 12 */ "Other", /* 13 */ "Pop", /* 14 */ "R&B", + /* 15 */ "Rap", /* 16 */ "Reggae", /* 17 */ "Rock", /* 18 */ "Techno", + /* 19 */ "Industrial", /* 20 */ "Alternative", /* 21 */ "Ska", + /* 22 */ "Death Metal", /* 23 */ "Pranks", /* 24 */ "Soundtrack", + /* 25 */ "Euro-Techno", /* 26 */ "Ambient", /* 27 */ "Trip-Hop", + /* 28 */ "Vocal", /* 29 */ "Jazz+Funk", /* 30 */ "Fusion", /* 31 */ "Trance", + /* 32 */ "Classical", /* 33 */ "Instrumental", /* 34 */ "Acid", + /* 35 */ "House", /* 36 */ "Game", /* 37 */ "Sound Clip", /* 38 */ "Gospel", + /* 39 */ "Noise", /* 40 */ "AlternRock", /* 41 */ "Bass", /* 42 */ "Soul", + /* 43 */ "Punk", /* 44 */ "Space", /* 45 */ "Meditative", + /* 46 */ "Instrumental Pop", /* 47 */ "Instrumental Rock", /* 48 
*/ "Ethnic", + /* 49 */ "Gothic", /* 50 */ "Darkwave", /* 51 */ "Techno-Industrial", + /* 52 */ "Electronic", /* 53 */ "Pop-Folk", /* 54 */ "Eurodance", + /* 55 */ "Dream", /* 56 */ "Southern Rock", /* 57 */ "Comedy", /* 58 */ "Cult", + /* 59 */ "Gangsta", /* 60 */ "Top 40", /* 61 */ "Christian Rap", + /* 62 */ "Pop/Funk", /* 63 */ "Jungle", /* 64 */ "Native American", + /* 65 */ "Cabaret", /* 66 */ "New Wave", /* 67 */ "Psychadelic", + /* 68 */ "Rave", /* 69 */ "Showtunes", /* 70 */ "Trailer", /* 71 */ "Lo-Fi", + /* 72 */ "Tribal", /* 73 */ "Acid Punk", /* 74 */ "Acid Jazz", /* 75 */ "Polka", + /* 76 */ "Retro", /* 77 */ "Musical", /* 78 */ "Rock & Roll", + /* 79 */ "Hard Rock", /* 80 */ "Folk", /* 81 */ "Folk-Rock", + /* 82 */ "National Folk", /* 83 */ "Swing", /* 84 */ "Fast Fusion", + /* 85 */ "Bebob", /* 86 */ "Latin", /* 87 */ "Revival", /* 88 */ "Celtic", + /* 89 */ "Bluegrass", /* 90 */ "Avantgarde", /* 91 */ "Gothic Rock", + /* 92 */ "Progressive Rock", /* 93 */ "Psychedelic Rock", + /* 94 */ "Symphonic Rock", /* 95 */ "Slow Rock", /* 96 */ "Big Band", + /* 97 */ "Chorus", /* 98 */ "Easy Listening", /* 99 */ "Acoustic", + /* 100 */ "Humour", /* 101 */ "Speech", /* 102 */ "Chanson", /* 103 */ "Opera", + /* 104 */ "Chamber Music", /* 105 */ "Sonata", /* 106 */ "Symphony", + /* 107 */ "Booty Bass", /* 108 */ "Primus", /* 109 */ "Porn Groove", + /* 110 */ "Satire", /* 111 */ "Slow Jam", /* 112 */ "Club", /* 113 */ "Tango", + /* 114 */ "Samba", /* 115 */ "Folklore", /* 116 */ "Ballad", + /* 117 */ "Power Ballad", /* 118 */ "Rhythmic Soul", /* 119 */ "Freestyle", + /* 120 */ "Duet", /* 121 */ "Punk Rock", /* 122 */ "Drum Solo", + /* 123 */ "A capella", /* 124 */ "Euro-House", /* 125 */ "Dance Hall", + /* sentinel */ ""}; /** * Does the file contain this kind of tags? @@ -187,9 +94,8 @@ public interface ID3Tags { String getCompilation(); /** - * Retrieves the comments, if any. 
- * Files may have more than one comment, but normally only - * one with any language/description pair. + * Retrieves the comments, if any. Files may have more than one comment, but normally only one + * with any language/description pair. */ List getComments(); @@ -208,8 +114,7 @@ public interface ID3Tags { String getDisc(); /** - * Represents a comments in ID3 (especially ID3 v2), where are - * made up of several parts + * Represents a comments in ID3 (especially ID3 v2), where are made up of several parts */ class ID3Comment { private String language; diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/ID3v1Handler.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/ID3v1Handler.java index 7029d41d77..d074e12a44 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/ID3v1Handler.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/ID3v1Handler.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. 
The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.mp3; @@ -22,15 +20,12 @@ import java.io.InputStream; import java.util.Collections; import java.util.List; - +import org.apache.tika.exception.TikaException; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; -import org.apache.tika.exception.TikaException; - /** - * This is used to parse ID3 Version 1 Tag information from an MP3 file, - * if available. + * This is used to parse ID3 Version 1 Tag information from an MP3 file, if available. * * @see MP3 ID3 Version 1 specification */ @@ -45,7 +40,7 @@ public class ID3v1Handler implements ID3Tags { private String trackNumber; public ID3v1Handler(InputStream stream, ContentHandler handler) - throws IOException, SAXException, TikaException { + throws IOException, SAXException, TikaException { this(LyricsHandler.getSuffix(stream, 128)); } @@ -80,15 +75,14 @@ protected ID3v1Handler(byte[] tagData) throws IOException, SAXException, TikaExc } /** - * Returns the identified ISO-8859-1 substring from the given byte buffer. 
- * The return value is the zero-terminated substring retrieved from - * between the given start and end positions in the given byte buffer. - * Extra whitespace (and control characters) from the beginning and the - * end of the substring is removed. + * Returns the identified ISO-8859-1 substring from the given byte buffer. The return value is + * the zero-terminated substring retrieved from between the given start and end positions in the + * given byte buffer. Extra whitespace (and control characters) from the beginning and the end + * of the substring is removed. * * @param buffer byte buffer - * @param start start index of the substring - * @param end end index of the substring + * @param start start index of the substring + * @param end end index of the substring * @return the identified substring * @throws TikaException if the ISO-8859-1 encoding is not available */ @@ -147,32 +141,28 @@ public String getTrackNumber() { } /** - * ID3v1 doesn't have composers, - * so returns null; + * ID3v1 doesn't have composers, so returns null; */ public String getComposer() { return null; } /** - * ID3v1 doesn't have album-wide artists, - * so returns null; + * ID3v1 doesn't have album-wide artists, so returns null; */ public String getAlbumArtist() { return null; } /** - * ID3v1 doesn't have disc numbers, - * so returns null; + * ID3v1 doesn't have disc numbers, so returns null; */ public String getDisc() { return null; } /** - * ID3v1 doesn't have compilations, - * so returns null; + * ID3v1 doesn't have compilations, so returns null; */ public String getCompilation() { return null; diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/ID3v22Handler.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/ID3v22Handler.java index 830f4d4431..7d3c13526c 100644 --- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/ID3v22Handler.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/ID3v22Handler.java @@ -1,36 +1,32 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.parser.mp3; import java.io.IOException; import java.util.ArrayList; import java.util.List; - -import org.xml.sax.SAXException; - import org.apache.tika.exception.TikaException; import org.apache.tika.parser.mp3.ID3v2Frame.RawTag; import org.apache.tika.parser.mp3.ID3v2Frame.RawTagIterator; +import org.xml.sax.SAXException; /** - * This is used to parse ID3 Version 2.2 Tag information from an MP3 file, - * if available. + * This is used to parse ID3 Version 2.2 Tag information from an MP3 file, if available. * - * @see MP3 ID3 Version 2.2 specification + * @see MP3 ID3 Version 2.2 + * specification */ public class ID3v22Handler implements ID3Tags { private String title; @@ -154,8 +150,7 @@ public String getDisc() { } /** - * ID3v22 doesn't have compilations, - * so returns null; + * ID3v22 doesn't have compilations, so returns null; */ public String getCompilation() { return null; diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/ID3v23Handler.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/ID3v23Handler.java index 23284d6773..b1daedf2e7 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/ID3v23Handler.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/ID3v23Handler.java @@ -1,36 +1,32 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.mp3; import java.io.IOException; import java.util.ArrayList; import java.util.List; - -import org.xml.sax.SAXException; - import org.apache.tika.exception.TikaException; import org.apache.tika.parser.mp3.ID3v2Frame.RawTag; import org.apache.tika.parser.mp3.ID3v2Frame.RawTagIterator; +import org.xml.sax.SAXException; /** - * This is used to parse ID3 Version 2.3 Tag information from an MP3 file, - * if available. + * This is used to parse ID3 Version 2.3 Tag information from an MP3 file, if available. 
* - * @see MP3 ID3 Version 2.3 specification + * @see MP3 ID3 Version 2.3 + * specification */ public class ID3v23Handler implements ID3Tags { private String title; diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/ID3v24Handler.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/ID3v24Handler.java index 9e53b77409..dbc3fbbef0 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/ID3v24Handler.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/ID3v24Handler.java @@ -1,34 +1,29 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.mp3; import java.io.IOException; import java.util.ArrayList; import java.util.List; - -import org.xml.sax.SAXException; - import org.apache.tika.exception.TikaException; import org.apache.tika.parser.mp3.ID3v2Frame.RawTag; import org.apache.tika.parser.mp3.ID3v2Frame.RawTagIterator; +import org.xml.sax.SAXException; /** - * This is used to parse ID3 Version 2.4 Tag information from an MP3 file, - * if available. + * This is used to parse ID3 Version 2.4 Tag information from an MP3 file, if available. 
* * @see MP3 ID3 Version 2.4 specification * @see MP3 ID3 Version 2.4 frames/tags diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/ID3v2Frame.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/ID3v2Frame.java index 1dddd1410b..f9bc5532de 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/ID3v2Frame.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/ID3v2Frame.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.mp3; @@ -23,18 +21,16 @@ import java.io.PushbackInputStream; import java.io.UnsupportedEncodingException; import java.util.Iterator; - import org.apache.tika.parser.mp3.ID3Tags.ID3Comment; /** - * A frame of ID3v2 data, which is then passed to a handler to - * be turned into useful data. + * A frame of ID3v2 data, which is then passed to a handler to be turned into useful data. */ public class ID3v2Frame implements MP3Frame { - protected static final TextEncoding[] encodings = - new TextEncoding[]{new TextEncoding("ISO-8859-1", false), - new TextEncoding("UTF-16", true), // With BOM + protected static final TextEncoding[] encodings = new TextEncoding[] { + new TextEncoding("ISO-8859-1", false), new TextEncoding("UTF-16", true), // With + // BOM new TextEncoding("UTF-16BE", true), // Without BOM new TextEncoding("UTF-8", false)}; private static int MAX_RECORD_SIZE = 50_000_000; @@ -63,7 +59,7 @@ private ID3v2Frame(int majorVersion, int minorVersion, InputStream inp) throws I } // Get the frame's data, or at least as much - // of it as we could do + // of it as we could do data = readFully(inp, length, false); } @@ -72,13 +68,10 @@ public static void setMaxRecordSize(int maxRecordSize) { } /** - * Returns the next ID3v2 Frame in - * the file, or null if the next batch of data - * doesn't correspond to either an ID3v2 header. - * If no ID3v2 frame could be detected and the passed in input stream is a - * {@code PushbackInputStream}, the bytes read so far are pushed back so - * that they can be read again. 
- * ID3v2 Frames should come before all Audio ones. + * Returns the next ID3v2 Frame in the file, or null if the next batch of data doesn't + * correspond to either an ID3v2 header. If no ID3v2 frame could be detected and the passed in + * input stream is a {@code PushbackInputStream}, the bytes read so far are pushed back so that + * they can be read again. ID3v2 Frames should come before all Audio ones. */ public static MP3Frame createFrameIfPresent(InputStream inp) throws IOException { int h1 = inp.read(); @@ -102,10 +95,10 @@ public static MP3Frame createFrameIfPresent(InputStream inp) throws IOException } /** - * Pushes bytes back into the stream if possible. This method is called if - * no ID3v2 header could be found at the current stream position. + * Pushes bytes back into the stream if possible. This method is called if no ID3v2 header could + * be found at the current stream position. * - * @param inp the input stream + * @param inp the input stream * @param bytes the bytes to be pushed back * @throws IOException if an error occurs */ @@ -145,9 +138,8 @@ protected static int getInt2(byte[] data, int offset) { } /** - * AKA a Synchsafe integer. - * 4 bytes hold a 28 bit number. The highest - * bit in each byte is always 0 and always ignored. + * AKA a Synchsafe integer. 4 bytes hold a 28 bit number. The highest bit in each byte is always + * 0 and always ignored. 
*/ protected static int get7BitsInt(byte[] data, int offset) { int b0 = data[offset + 0] & 0x7F; @@ -162,11 +154,10 @@ protected static byte[] readFully(InputStream inp, int length) throws IOExceptio } protected static byte[] readFully(InputStream inp, int length, boolean shortDataIsFatal) - throws IOException { + throws IOException { if (MAX_RECORD_SIZE > 0 && length > MAX_RECORD_SIZE) { - throw new IOException( - "Record size (" + length + " bytes) is larger than the allowed record size: " + - MAX_RECORD_SIZE); + throw new IOException("Record size (" + length + + " bytes) is larger than the allowed record size: " + MAX_RECORD_SIZE); } byte[] b = new byte[length]; @@ -176,8 +167,8 @@ protected static byte[] readFully(InputStream inp, int length, boolean shortData read = inp.read(b, pos, length - pos); if (read == -1) { if (shortDataIsFatal) { - throw new IOException("Tried to read " + length + " bytes, but only " + pos + - " bytes present"); + throw new IOException("Tried to read " + length + " bytes, but only " + pos + + " bytes present"); } else { // Give them what we found // TODO Log the short read @@ -191,8 +182,8 @@ protected static byte[] readFully(InputStream inp, int length, boolean shortData } /** - * Returns the (possibly null padded) String at the given offset and - * length. String encoding is held in the first byte; + * Returns the (possibly null padded) String at the given offset and length. 
String encoding is + * held in the first byte; */ protected static String getTagString(byte[] data, int offset, int length) { int actualLength = length; @@ -214,8 +205,8 @@ protected static String getTagString(byte[] data, int offset, int length) { } // Trim off null termination / padding (as present) - while (encoding.doubleByte && actualLength >= 2 && data[offset + actualLength - 1] == 0 && - data[offset + actualLength - 2] == 0) { + while (encoding.doubleByte && actualLength >= 2 && data[offset + actualLength - 1] == 0 + && data[offset + actualLength - 2] == 0) { actualLength -= 2; } while (!encoding.doubleByte && actualLength >= 1 && data[offset + actualLength - 1] == 0) { @@ -229,9 +220,10 @@ protected static String getTagString(byte[] data, int offset, int length) { // have is a naked BOM then short-circuit here // (return empty string), because new String(..) // gives different results on different JVMs - if (encoding.encoding.equals("UTF-16") && actualLength == 2 && - ((data[offset] == (byte) 0xff && data[offset + 1] == (byte) 0xfe) || - (data[offset] == (byte) 0xfe && data[offset + 1] == (byte) 0xff))) { + if (encoding.encoding.equals("UTF-16") && actualLength == 2 + && ((data[offset] == (byte) 0xff && data[offset + 1] == (byte) 0xfe) + || (data[offset] == (byte) 0xfe + && data[offset + 1] == (byte) 0xff))) { return ""; } @@ -240,13 +232,13 @@ protected static String getTagString(byte[] data, int offset, int length) { return new String(data, offset, actualLength, encoding.encoding); } catch (UnsupportedEncodingException e) { throw new RuntimeException("Core encoding " + encoding.encoding + " is not available", - e); + e); } } /** - * Builds up the ID3 comment, by parsing and extracting - * the comment string parts from the given data. + * Builds up the ID3 comment, by parsing and extracting the comment string parts from the given + * data. 
*/ protected static ID3Comment getComment(byte[] data, int offset, int length) { // Comments must have an encoding @@ -300,13 +292,12 @@ protected static ID3Comment getComment(byte[] data, int offset, int length) { return new ID3Comment(lang, description, text); } catch (UnsupportedEncodingException e) { throw new RuntimeException("Core encoding " + encoding.encoding + " is not available", - e); + e); } } /** - * Returns the String at the given - * offset and length. Strings are ISO-8859-1 + * Returns the String at the given offset and length. Strings are ISO-8859-1 */ protected static String getString(byte[] data, int offset, int length) { return new String(data, offset, length, ISO_8859_1); @@ -357,7 +348,7 @@ protected static class RawTag { private int headerSize; private RawTag(int nameLength, int sizeLength, int sizeMultiplier, int flagLength, - byte[] frameData, int offset) { + byte[] frameData, int offset) { headerSize = nameLength + sizeLength + flagLength; // Name, normally 3 or 4 bytes @@ -383,8 +374,9 @@ private RawTag(int nameLength, int sizeLength, int sizeMultiplier, int flagLengt // Now data int copyFrom = offset + nameLength + sizeLength + flagLength; - size = Math.max(0, Math.min(size, frameData.length - - copyFrom)); // TIKA-1218, prevent negative size for malformed files. + size = Math.max(0, Math.min(size, frameData.length - copyFrom)); // TIKA-1218, prevent + // negative size for + // malformed files. data = new byte[size]; System.arraycopy(frameData, copyFrom, data, 0, size); } @@ -396,9 +388,8 @@ protected int getSize() { } /** - * Iterates over id3v2 raw tags. - * Create an instance of this that configures the - * various length and multipliers. + * Iterates over id3v2 raw tags. Create an instance of this that configures the various length + * and multipliers. 
*/ protected class RawTagIterator implements Iterator { private int nameLength; @@ -409,7 +400,7 @@ protected class RawTagIterator implements Iterator { private int offset = 0; protected RawTagIterator(int nameLength, int sizeLength, int sizeMultiplier, - int flagLength) { + int flagLength) { this.nameLength = nameLength; this.sizeLength = sizeLength; this.sizeMultiplier = sizeMultiplier; @@ -422,14 +413,13 @@ public boolean hasNext() { } public RawTag next() { - RawTag tag = - new RawTag(nameLength, sizeLength, sizeMultiplier, flagLength, data, offset); + RawTag tag = new RawTag(nameLength, sizeLength, sizeMultiplier, flagLength, data, + offset); offset += tag.getSize(); return tag; } - public void remove() { - } + public void remove() {} } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/LyricsHandler.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/LyricsHandler.java index 2c6e3b73a8..eee37ae2be 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/LyricsHandler.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/LyricsHandler.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. 
See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.mp3; @@ -21,18 +19,14 @@ import java.io.IOException; import java.io.InputStream; - +import org.apache.tika.exception.TikaException; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; -import org.apache.tika.exception.TikaException; - /** - * This is used to parse Lyrics3 tag information - * from an MP3 file, if available. - * Handles lyrics tags of up to 10kb in size. - * Will process any ID3v1 tag data if present. - * Ignores extended ID3v1 data in the lyrics block + * This is used to parse Lyrics3 tag information from an MP3 file, if available. Handles lyrics tags + * of up to 10kb in size. Will process any ID3v1 tag data if present. 
Ignores extended ID3v1 data in + * the lyrics block * * @see Lyrics3 v2.0 specification */ @@ -42,17 +36,14 @@ public class LyricsHandler { ID3v1Handler id3v1 = null; public LyricsHandler(InputStream stream, ContentHandler handler) - throws IOException, SAXException, TikaException { + throws IOException, SAXException, TikaException { this(getSuffix(stream, 10240 + 128)); } /** - * Looks for the Lyrics data, which will be - * just before the ID3v1 data (if present), - * and process it. - * Also sets things up for the ID3v1 - * processing if required. - * Creates from the last 128 bytes of a stream. + * Looks for the Lyrics data, which will be just before the ID3v1 data (if present), and process + * it. Also sets things up for the ID3v1 processing if required. Creates from the last 128 bytes + * of a stream. */ protected LyricsHandler(byte[] tagData) throws IOException, SAXException, TikaException { if (tagData.length < 128) { @@ -69,21 +60,20 @@ protected LyricsHandler(byte[] tagData) throws IOException, SAXException, TikaEx } // Are there lyrics? Look for the closing Lyrics tag - // at the end to decide if there is any + // at the end to decide if there is any int lookat = tagData.length - 9; if (id3v1.found) { lookat -= 128; } - if (tagData[lookat + 0] == 'L' && tagData[lookat + 1] == 'Y' && - tagData[lookat + 2] == 'R' && tagData[lookat + 3] == 'I' && - tagData[lookat + 4] == 'C' && tagData[lookat + 5] == 'S' && - tagData[lookat + 6] == '2' && tagData[lookat + 7] == '0' && - tagData[lookat + 8] == '0') { + if (tagData[lookat + 0] == 'L' && tagData[lookat + 1] == 'Y' && tagData[lookat + 2] == 'R' + && tagData[lookat + 3] == 'I' && tagData[lookat + 4] == 'C' + && tagData[lookat + 5] == 'S' && tagData[lookat + 6] == '2' + && tagData[lookat + 7] == '0' && tagData[lookat + 8] == '0') { foundLyrics = true; // The length (6 bytes) comes just before LYRICS200, and is the - // size including the LYRICSBEGIN but excluding the - // length+LYRICS200 at the end. 
+ // size including the LYRICSBEGIN but excluding the + // length+LYRICS200 at the end. int length = Integer.parseInt(new String(tagData, lookat - 6, 6, UTF_8)); String lyrics = new String(tagData, lookat - length + 5, length - 11, US_ASCII); @@ -94,7 +84,7 @@ protected LyricsHandler(byte[] tagData) throws IOException, SAXException, TikaEx String tagName = lyrics.substring(pos, pos + 3); int tagLen = Integer.parseInt(lyrics.substring(pos + 3, pos + 8)); if (tagLen < 1 || tagLen > lyrics.length()) { - //something went wrong + // something went wrong break; } int startPos = pos + 8; @@ -110,8 +100,7 @@ protected LyricsHandler(byte[] tagData) throws IOException, SAXException, TikaEx } /** - * Reads and returns the last length bytes from the - * given stream. + * Reads and returns the last length bytes from the given stream. * * @param stream input stream * @param length number of bytes from the end to read and return diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/MP3Frame.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/MP3Frame.java index 8ea7c40c1d..f8f61b6145 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/MP3Frame.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/MP3Frame.java @@ -1,25 +1,22 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.mp3; /** - * A frame in an MP3 file, such as ID3v2 Tags or some - * audio. + * A frame in an MP3 file, such as ID3v2 Tags or some audio. 
*/ public interface MP3Frame { } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java index a8ff69efa3..b853cff4f7 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
+ * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.mp3; @@ -22,10 +20,6 @@ import java.util.Collections; import java.util.List; import java.util.Set; - -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.config.Field; import org.apache.tika.exception.TikaException; import org.apache.tika.io.TailStream; @@ -37,13 +31,16 @@ import org.apache.tika.parser.Parser; import org.apache.tika.parser.mp3.ID3Tags.ID3Comment; import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; /** - * The Mp3Parser is used to parse ID3 Version 1 Tag information - * from an MP3 file, if available. + * The Mp3Parser is used to parse ID3 Version 1 Tag information from an MP3 file, if + * available. * * @see MP3 ID3 Version 1 specification - * @see MP3 ID3 Version 2.4 Structure Specification + * @see MP3 ID3 Version 2.4 Structure + * Specification * @see MP3 ID3 Version 2.4 Frames Specification */ public class Mp3Parser implements Parser { @@ -54,14 +51,14 @@ public class Mp3Parser implements Parser { private static final long serialVersionUID = 8537074922934844370L; private static final Set SUPPORTED_TYPES = - Collections.singleton(MediaType.audio("mpeg")); + Collections.singleton(MediaType.audio("mpeg")); /** - * Scans the MP3 frames for ID3 tags, and creates ID3Tag Handlers - * for each supported set of tags. + * Scans the MP3 frames for ID3 tags, and creates ID3Tag Handlers for each supported set of + * tags. 
*/ protected static ID3TagsAndAudio getAllTagHandlers(InputStream stream, ContentHandler handler) - throws IOException, SAXException, TikaException { + throws IOException, SAXException, TikaException { ID3v24Handler v24 = null; ID3v23Handler v23 = null; ID3v22Handler v22 = null; @@ -140,7 +137,7 @@ public Set getSupportedTypes(ParseContext context) { } public void parse(InputStream stream, ContentHandler handler, Metadata metadata, - ParseContext context) throws IOException, SAXException, TikaException { + ParseContext context) throws IOException, SAXException, TikaException { metadata.set(Metadata.CONTENT_TYPE, "audio/mpeg"); metadata.set(XMPDM.AUDIO_COMPRESSOR, "MP3"); @@ -150,7 +147,7 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ID3TagsAndAudio audioAndTags = getAllTagHandlers(stream, handler); // Before we start on the XHTML output, process and store - // as much metadata as possible + // as much metadata as possible if (audioAndTags.duration > 0) { metadata.set(XMPDM.DURATION, audioAndTags.durationSeconds()); } @@ -161,7 +158,7 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, metadata.set("version", audioAndTags.audio.getVersion()); metadata.set(XMPDM.AUDIO_SAMPLE_RATE, - Integer.toString(audioAndTags.audio.getSampleRate())); + Integer.toString(audioAndTags.audio.getSampleRate())); if (audioAndTags.audio.getChannels() == 1) { metadata.set(XMPDM.AUDIO_CHANNEL_TYPE, "Mono"); } else if (audioAndTags.audio.getChannels() == 2) { @@ -257,6 +254,7 @@ public void setMaxRecordSize(int maxRecordSize) { public int getMaxRecordSize() { return ID3v2Frame.getMaxRecordSize(); } + protected static class ID3TagsAndAudio { private ID3Tags[] tags; private AudioFrame audio; diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/MpegStream.java 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/MpegStream.java index 28b7a71fe6..39e8ff8534 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/MpegStream.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp3/MpegStream.java @@ -1,74 +1,64 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
+ * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.mp3; import java.io.IOException; import java.io.InputStream; import java.io.PushbackInputStream; - import org.apache.commons.io.IOUtils; /** *

      - * A specialized stream class which can be used to extract single frames of MPEG - * audio files. + * A specialized stream class which can be used to extract single frames of MPEG audio files. *

      *

      - * Instances of this class are constructed with an underlying stream which - * should point to an audio file. Read operations are possible in the usual way. - * However, there are special methods for searching and extracting headers of - * MPEG frames. Some meta information of frames can be queried. + * Instances of this class are constructed with an underlying stream which should point to an audio + * file. Read operations are possible in the usual way. However, there are special methods for + * searching and extracting headers of MPEG frames. Some meta information of frames can be queried. *

      */ class MpegStream extends PushbackInputStream { /** * Bit rate table for MPEG V1, layer 1. */ - private static final int[] BIT_RATE_MPEG1_L1 = - {0, 32000, 64000, 96000, 128000, 160000, 192000, 224000, 256000, 288000, 320000, 352000, - 384000, 416000, 448000}; + private static final int[] BIT_RATE_MPEG1_L1 = {0, 32000, 64000, 96000, 128000, 160000, 192000, + 224000, 256000, 288000, 320000, 352000, 384000, 416000, 448000}; /** * Bit rate table for MPEG V1, layer 2. */ - private static final int[] BIT_RATE_MPEG1_L2 = - {0, 32000, 48000, 56000, 64000, 80000, 96000, 112000, 128000, 160000, 192000, 224000, - 256000, 320000, 384000}; + private static final int[] BIT_RATE_MPEG1_L2 = {0, 32000, 48000, 56000, 64000, 80000, 96000, + 112000, 128000, 160000, 192000, 224000, 256000, 320000, 384000}; /** * Bit rate table for MPEG V1, layer 3. */ - private static final int[] BIT_RATE_MPEG1_L3 = - {0, 32000, 40000, 48000, 56000, 64000, 80000, 96000, 112000, 128000, 160000, 192000, - 224000, 256000, 320000}; + private static final int[] BIT_RATE_MPEG1_L3 = {0, 32000, 40000, 48000, 56000, 64000, 80000, + 96000, 112000, 128000, 160000, 192000, 224000, 256000, 320000}; /** * Bit rate table for MPEG V2/V2.5, layer 1. */ - private static final int[] BIT_RATE_MPEG2_L1 = - {0, 32000, 48000, 56000, 64000, 80000, 96000, 112000, 128000, 144000, 160000, 176000, - 192000, 224000, 256000}; + private static final int[] BIT_RATE_MPEG2_L1 = {0, 32000, 48000, 56000, 64000, 80000, 96000, + 112000, 128000, 144000, 160000, 176000, 192000, 224000, 256000}; /** * Bit rate table for MPEG V2/V2.5, layer 2 and 3. */ - private static final int[] BIT_RATE_MPEG2_L2 = - {0, 8000, 16000, 24000, 32000, 40000, 48000, 56000, 64000, 80000, 96000, 112000, 128000, - 144000, 160000}; + private static final int[] BIT_RATE_MPEG2_L2 = {0, 8000, 16000, 24000, 32000, 40000, 48000, + 56000, 64000, 80000, 96000, 112000, 128000, 144000, 160000}; /** * Sample rate table for MPEG V1. 
@@ -116,8 +106,7 @@ class MpegStream extends PushbackInputStream { private boolean endOfStream; /** - * Creates a new instance of {@code MpegStream} and initializes it with the - * underlying stream. + * Creates a new instance of {@code MpegStream} and initializes it with the underlying stream. * * @param in the underlying audio stream */ @@ -129,8 +118,8 @@ public MpegStream(InputStream in) { * Calculates the bit rate based on the given parameters. * * @param mpegVer the MPEG version - * @param layer the layer - * @param code the code for the bit rate + * @param layer the layer + * @param code the code for the bit rate * @return the bit rate in bits per second */ private static int calculateBitRate(int mpegVer, int layer, int code) { @@ -162,7 +151,7 @@ private static int calculateBitRate(int mpegVer, int layer, int code) { * Calculates the sample rate based on the given parameters. * * @param mpegVer the MPEG version - * @param code the code for the sample rate + * @param code the code for the sample rate * @return the sample rate in samples per second */ private static int calculateSampleRate(int mpegVer, int code) { @@ -172,10 +161,10 @@ private static int calculateSampleRate(int mpegVer, int code) { /** * Calculates the length of an MPEG frame based on the given parameters. * - * @param layer the layer - * @param bitRate the bit rate + * @param layer the layer + * @param bitRate the bit rate * @param sampleRate the sample rate - * @param padding the padding flag + * @param padding the padding flag * @return the length of the frame in bytes */ private static int calculateFrameLength(int layer, int bitRate, int sampleRate, int padding) { @@ -189,7 +178,7 @@ private static int calculateFrameLength(int layer, int bitRate, int sampleRate, /** * Calculates the duration of a MPEG frame based on the given parameters. 
* - * @param layer the layer + * @param layer the layer * @param sampleRate the sample rate * @return the duration of this frame in milliseconds */ @@ -222,12 +211,10 @@ private static int[][] createSampleRateTable() { } /** - * Searches for the next MPEG frame header from the current stream position - * on. This method advances the underlying input stream until it finds a - * valid frame header or the end of the stream is reached. In the former - * case a corresponding {@code AudioFrame} object is created. In the latter - * case there are no more headers, so the end of the stream is probably - * reached. + * Searches for the next MPEG frame header from the current stream position on. This method + * advances the underlying input stream until it finds a valid frame header or the end of the + * stream is reached. In the former case a corresponding {@code AudioFrame} object is created. + * In the latter case there are no more headers, so the end of the stream is probably reached. * * @return the next {@code AudioFrame} or null * @throws IOException if an IO error occurs @@ -252,10 +239,9 @@ public AudioFrame nextFrame() throws IOException { } /** - * Skips the current MPEG frame. This method can be called after a valid - * MPEG header has been retrieved using {@code nextFrame()}. In this case - * the underlying stream is advanced to the end of the associated MPEG - * frame or until the EOF is reached. The return value indicates + * Skips the current MPEG frame. This method can be called after a valid MPEG header has been + * retrieved using {@code nextFrame()}. In this case the underlying stream is advanced to the + * end of the associated MPEG frame or until the EOF is reached. The return value indicates * whether the full frame could be skipped. * * @return true if a frame could be skipped, false otherwise, perhaps EOF? 
@@ -275,8 +261,7 @@ public boolean skipFrame() throws IOException { } /** - * Advances the underlying stream until the first byte of frame sync is - * found. + * Advances the underlying stream until the first byte of frame sync is found. * * @throws IOException if an error occurs */ @@ -304,8 +289,8 @@ private HeaderBitField createHeaderField() throws IOException { } /** - * Creates an {@code AudioFrame} object based on the given header field. If - * the header field contains invalid values, result is null. + * Creates an {@code AudioFrame} object based on the given header field. If the header field + * contains invalid values, result is null. * * @param bits the header bit field * @return the {@code AudioFrame} @@ -321,8 +306,8 @@ private AudioFrame createHeader(HeaderBitField bits) { int sampleRateCode = bits.get(10, 11); int padding = bits.get(9); - if (mpegVer == 1 || layer == 0 || bitRateCode == 0 || bitRateCode == 15 || - sampleRateCode == 3) { + if (mpegVer == 1 || layer == 0 || bitRateCode == 0 || bitRateCode == 15 + || sampleRateCode == 3) { // invalid header values return null; } @@ -353,9 +338,9 @@ private int nextByte() throws IOException { } /** - * Pushes the given header field back in the stream so that the bytes are - * read again. This method is called if an invalid header was detected. Then - * search has to continue at the next byte after the frame sync byte. + * Pushes the given header field back in the stream so that the bytes are read again. This + * method is called if an invalid header was detected. Then search has to continue at the next + * byte after the frame sync byte. * * @param field the header bit field with the invalid frame header * @throws IOException if an error occurs @@ -365,8 +350,8 @@ private void pushBack(HeaderBitField field) throws IOException { } /** - * A class representing the bit field of an MPEG header. It allows - * convenient access to specific bit groups. + * A class representing the bit field of an MPEG header. 
It allows convenient access to specific + * bit groups. */ private static class HeaderBitField { /** @@ -385,12 +370,11 @@ public void add(int b) { } /** - * Returns the value of the bit group from the given start and end - * index. E.g. ''from'' = 0, ''to'' = 3 will return the value of the - * first 4 bits. + * Returns the value of the bit group from the given start and end index. E.g. ''from'' = 0, + * ''to'' = 3 will return the value of the first 4 bits. * * @param from index - * @param to the to index + * @param to the to index * @return the value of this group of bits */ public int get(int from, int to) { @@ -400,8 +384,8 @@ public int get(int from, int to) { } /** - * Returns the value of the bit with the given index. The bit index is - * 0-based. Result is either 0 or 1, depending on the value of this bit. + * Returns the value of the bit with the given index. The bit index is 0-based. Result is + * either 0 or 1, depending on the value of this bit. * * @param bit the bit index * @return the value of this bit @@ -411,8 +395,7 @@ public int get(int bit) { } /** - * Returns the internal value of this field as an array. The array - * contains 3 bytes. + * Returns the internal value of this field as an array. The array contains 3 bytes. 
* * @return the internal value of this field as int array */ diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp4/MP4Parser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp4/MP4Parser.java index 0b7e30b2aa..bddc6d4308 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp4/MP4Parser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp4/MP4Parser.java @@ -1,21 +1,26 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.mp4; +import com.drew.imaging.mp4.Mp4Reader; +import com.drew.metadata.Directory; +import com.drew.metadata.MetadataException; +import com.drew.metadata.mp4.Mp4BoxHandler; +import com.drew.metadata.mp4.Mp4Directory; +import com.drew.metadata.mp4.media.Mp4SoundDirectory; +import com.drew.metadata.mp4.media.Mp4VideoDirectory; import java.io.IOException; import java.io.InputStream; import java.nio.file.Files; @@ -32,17 +37,6 @@ import java.util.Map; import java.util.Optional; import java.util.Set; - -import com.drew.imaging.mp4.Mp4Reader; -import com.drew.metadata.Directory; -import com.drew.metadata.MetadataException; -import com.drew.metadata.mp4.Mp4BoxHandler; -import com.drew.metadata.mp4.Mp4Directory; -import com.drew.metadata.mp4.media.Mp4SoundDirectory; -import com.drew.metadata.mp4.media.Mp4VideoDirectory; -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.exception.RuntimeSAXException; import org.apache.tika.exception.TikaException; import org.apache.tika.io.TemporaryResources; @@ -56,10 +50,12 @@ import org.apache.tika.parser.Parser; import org.apache.tika.sax.XHTMLContentHandler; import org.apache.tika.utils.StringUtils; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; /** - * Parser for the MP4 media container format, as well as the older - * QuickTime format that MP4 is based on. + * Parser for the MP4 media container format, as well as the older QuickTime format that MP4 is + * based on. *

      * This uses Drew Noakes' metadata-extractor: https://github.com/drewnoakes/metadata-extractor */ @@ -70,7 +66,7 @@ public class MP4Parser implements Parser { private static final long serialVersionUID = 84011216792285L; private static final Map> typesMap = new HashMap<>(); private static final Set SUPPORTED_TYPES = - Collections.unmodifiableSet(typesMap.keySet()); + Collections.unmodifiableSet(typesMap.keySet()); private static final MediaType APPLICATION_MP4 = MediaType.application("mp4"); private static final MediaType AUDIO_MP4 = MediaType.audio("mp4"); @@ -79,9 +75,8 @@ public class MP4Parser implements Parser { static { // All types should be 4 bytes long, space padded as needed typesMap.put(MediaType.audio("mp4"), Arrays.asList("M4A ", "M4B ", "F4A ", "F4B ")); - typesMap.put(MediaType.video("3gpp"), - Arrays.asList("3ge6", "3ge7", "3gg6", "3gp1", "3gp2", "3gp3", "3gp4", "3gp5", - "3gp6", "3gs7")); + typesMap.put(MediaType.video("3gpp"), Arrays.asList("3ge6", "3ge7", "3gg6", "3gp1", "3gp2", + "3gp3", "3gp4", "3gp5", "3gp6", "3gs7")); typesMap.put(MediaType.video("3gpp2"), Arrays.asList("3g2a", "3g2b", "3g2c")); typesMap.put(MediaType.video("mp4"), Arrays.asList("mp41", "mp42")); typesMap.put(MediaType.video("x-m4v"), Arrays.asList("M4V ", "M4VH", "M4VP")); @@ -95,7 +90,7 @@ public Set getSupportedTypes(ParseContext context) { } public void parse(InputStream stream, ContentHandler handler, Metadata metadata, - ParseContext context) throws IOException, SAXException, TikaException { + ParseContext context) throws IOException, SAXException, TikaException { TemporaryResources tmp = new TemporaryResources(); TikaInputStream tstream = TikaInputStream.get(stream, tmp, metadata); @@ -111,10 +106,10 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, } catch (RuntimeSAXException e) { throw (SAXException) e.getCause(); } - //TODO -- figure out how to get IOExceptions out of boxhandler. Mp4Reader - //currently swallows IOExceptions. 
+ // TODO -- figure out how to get IOExceptions out of boxhandler. Mp4Reader + // currently swallows IOExceptions. final Collection mp4Directories = - mp4Metadata.getDirectoriesOfType(Mp4Directory.class); + mp4Metadata.getDirectoriesOfType(Mp4Directory.class); final Set errorMessages = processMp4Directories(mp4Directories, metadata); // Despite the brand, if we ONLY have audio streams with no video @@ -133,7 +128,7 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, } private Set processMp4Directories(Collection mp4Directories, - Metadata metadata) { + Metadata metadata) { Set errorMsgs = new HashSet<>(); for (Mp4Directory mp4Directory : mp4Directories) { for (String m : mp4Directory.getErrors()) { @@ -143,10 +138,10 @@ private Set processMp4Directories(Collection mp4Directorie break; } } -/* for (Tag t : mp4Directory.getTags()) { - System.out.println(mp4Directory.getClass() + " : " + t.getTagName() - + " : " + mp4Directory.getString(t.getTagType())); - }*/ + /* + * for (Tag t : mp4Directory.getTags()) { System.out.println(mp4Directory.getClass() + + * " : " + t.getTagName() + " : " + mp4Directory.getString(t.getTagType())); } + */ if (mp4Directory instanceof Mp4SoundDirectory) { processMp4SoundDirectory((Mp4SoundDirectory) mp4Directory, metadata); } else if (mp4Directory instanceof Mp4VideoDirectory) { @@ -170,7 +165,8 @@ private void processMp4VideoDirectory(Mp4VideoDirectory mp4Directory, Metadata m /** * Check we have only audio with no video metadata. *

      - * Other non-video metadata can exist - as long as there's at least one {@link Mp4SoundDirectory}. + * Other non-video metadata can exist - as long as there's at least one + * {@link Mp4SoundDirectory}. * * @param directories from MP4 file * @return whether we can classify the file audio/mp4 @@ -191,10 +187,9 @@ static boolean isAudioOnly(final Collection directories) { return containsSound; } - private void processMp4SoundDirectory(Mp4SoundDirectory mp4SoundDirectory, - Metadata metadata) { + private void processMp4SoundDirectory(Mp4SoundDirectory mp4SoundDirectory, Metadata metadata) { addInt(mp4SoundDirectory, metadata, Mp4SoundDirectory.TAG_AUDIO_SAMPLE_RATE, - XMPDM.AUDIO_SAMPLE_RATE); + XMPDM.AUDIO_SAMPLE_RATE); try { int numChannels = mp4SoundDirectory.getInt(Mp4SoundDirectory.TAG_NUMBER_OF_CHANNELS); @@ -204,28 +199,27 @@ private void processMp4SoundDirectory(Mp4SoundDirectory mp4SoundDirectory, } else if (numChannels == 2) { metadata.set(XMPDM.AUDIO_CHANNEL_TYPE, "Stereo"); } else { - //??? log + // ??? log } } catch (MetadataException e) { - //log + // log } } - private void addInt(Mp4Directory mp4Directory, Metadata metadata, int tag, - Property property) { + private void addInt(Mp4Directory mp4Directory, Metadata metadata, int tag, Property property) { try { int val = mp4Directory.getInt(tag); metadata.set(property, val); } catch (MetadataException e) { - //log + // log } } private void processActualMp4Directory(Mp4Directory mp4Directory, Metadata metadata) { addDate(mp4Directory, metadata, Mp4Directory.TAG_CREATION_TIME, TikaCoreProperties.CREATED); addDate(mp4Directory, metadata, Mp4Directory.TAG_MODIFICATION_TIME, - TikaCoreProperties.MODIFIED); + TikaCoreProperties.MODIFIED); handleBrands(mp4Directory, metadata); handleDurationInSeconds(mp4Directory, metadata); @@ -239,15 +233,14 @@ private void handleDurationInSeconds(Mp4Directory mp4Directory, Metadata metadat if (durationInSeconds == null) { return; } - if (! 
durationInSeconds.contains("/")) { + if (!durationInSeconds.contains("/")) { try { double d = Double.parseDouble(durationInSeconds); - DecimalFormat df = - (DecimalFormat) NumberFormat.getNumberInstance(Locale.ROOT); + DecimalFormat df = (DecimalFormat) NumberFormat.getNumberInstance(Locale.ROOT); df.applyPattern("0.0#"); metadata.set(XMPDM.DURATION, df.format(d)); } catch (NumberFormatException e) { - //swallow + // swallow } return; } @@ -262,16 +255,15 @@ private void handleDurationInSeconds(Mp4Directory mp4Directory, Metadata metadat if (denominator != 0) { durationSeconds = (double) numerator / (double) denominator; // Get the duration - //TODO Replace this with a 2dp Duration Property Converter - //avoid thread safety issues by creating a new decimal format for every call - //threadlocal doesn't play well in long running processes. - DecimalFormat df = - (DecimalFormat) NumberFormat.getNumberInstance(Locale.ROOT); + // TODO Replace this with a 2dp Duration Property Converter + // avoid thread safety issues by creating a new decimal format for every call + // threadlocal doesn't play well in long running processes. 
+ DecimalFormat df = (DecimalFormat) NumberFormat.getNumberInstance(Locale.ROOT); df.applyPattern("0.0#"); metadata.set(XMPDM.DURATION, df.format(durationSeconds)); } } catch (NumberFormatException e) { - //log + // log return; } } @@ -281,34 +273,32 @@ private void handleBrands(Mp4Directory mp4Directory, Metadata metadata) { String majorBrand = mp4Directory.getString(Mp4Directory.TAG_MAJOR_BRAND); // Identify the type based on the major brand - Optional typeHolder = typesMap.entrySet().stream() - .filter(e -> e.getValue().contains(majorBrand)).findFirst() - .map(Map.Entry::getKey); + Optional typeHolder = + typesMap.entrySet().stream().filter(e -> e.getValue().contains(majorBrand)) + .findFirst().map(Map.Entry::getKey); if (!typeHolder.isPresent()) { - String compatibleBrands = - mp4Directory.getString(Mp4Directory.TAG_COMPATIBLE_BRANDS); + String compatibleBrands = mp4Directory.getString(Mp4Directory.TAG_COMPATIBLE_BRANDS); if (compatibleBrands != null) { // If no match for major brand, see if any of the compatible brands match - typeHolder = typesMap.entrySet().stream().filter(e -> - e.getValue().stream().anyMatch(compatibleBrands::contains)) - .findFirst().map(Map.Entry::getKey); + typeHolder = typesMap.entrySet().stream().filter( + e -> e.getValue().stream().anyMatch(compatibleBrands::contains)) + .findFirst().map(Map.Entry::getKey); } } MediaType type = typeHolder.orElse(MediaType.application("mp4")); if (metadata.getValues(Metadata.CONTENT_TYPE) == null) { metadata.set(Metadata.CONTENT_TYPE, type.toString()); - } else if (! type.equals(APPLICATION_MP4)) { //todo check for specialization? + } else if (!type.equals(APPLICATION_MP4)) { // todo check for specialization? metadata.set(Metadata.CONTENT_TYPE, type.toString()); } - if (type.getType().equals("audio") && ! 
StringUtils.isBlank(majorBrand)) { + if (type.getType().equals("audio") && !StringUtils.isBlank(majorBrand)) { metadata.set(XMPDM.AUDIO_COMPRESSOR, majorBrand.trim()); } } - private void addDate(Mp4Directory mp4Directory, Metadata metadata, int tag, - Property property) { + private void addDate(Mp4Directory mp4Directory, Metadata metadata, int tag, Property property) { Date d = mp4Directory.getDate(tag); if (d == null) { return; @@ -317,13 +307,12 @@ private void addDate(Mp4Directory mp4Directory, Metadata metadata, int tag, } - private void addDouble(Directory mp4Directory, Metadata metadata, int tag, - Property property) { + private void addDouble(Directory mp4Directory, Metadata metadata, int tag, Property property) { try { double val = mp4Directory.getDouble(tag); metadata.set(property, val); } catch (MetadataException e) { - //log + // log return; } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp4/TikaMp4BoxHandler.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp4/TikaMp4BoxHandler.java index ca399ab8b9..815afd54e7 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp4/TikaMp4BoxHandler.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp4/TikaMp4BoxHandler.java @@ -1,40 +1,37 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.parser.mp4; -import java.io.IOException; - import com.drew.imaging.mp4.Mp4Handler; import com.drew.lang.annotations.NotNull; import com.drew.lang.annotations.Nullable; import com.drew.metadata.Metadata; import com.drew.metadata.mp4.Mp4BoxHandler; import com.drew.metadata.mp4.Mp4Context; -import org.xml.sax.SAXException; - +import java.io.IOException; import org.apache.tika.parser.mp4.boxes.TikaUserDataBox; import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.SAXException; public class TikaMp4BoxHandler extends Mp4BoxHandler { org.apache.tika.metadata.Metadata tikaMetadata; final XHTMLContentHandler xhtml; + public TikaMp4BoxHandler(Metadata metadata, org.apache.tika.metadata.Metadata tikaMetadata, - XHTMLContentHandler xhtml) { + XHTMLContentHandler xhtml) { super(metadata); this.tikaMetadata = tikaMetadata; this.xhtml = xhtml; @@ -54,9 +51,8 @@ public boolean shouldAcceptContainer(@NotNull String box) { } @Override - public Mp4Handler processBox(@NotNull String box, @Nullable byte[] payload, - long size, Mp4Context context) - throws IOException { + public Mp4Handler processBox(@NotNull String box, @Nullable byte[] payload, long size, + Mp4Context context) throws IOException { if (box.equals("udta")) { return processUserData(box, payload, context); } @@ -65,7 +61,8 @@ public Mp4Handler processBox(@NotNull String box, @Nullable byte[] payload, } - private Mp4Handler processUserData(String box, byte[] payload, Mp4Context context) throws IOException { + private Mp4Handler processUserData(String box, byte[] payload, Mp4Context context) + throws IOException { if (payload == null) { return this; } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp4/boxes/TikaUserDataBox.java 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp4/boxes/TikaUserDataBox.java index 9324431b28..49241b62b3 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp4/boxes/TikaUserDataBox.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp4/boxes/TikaUserDataBox.java @@ -1,39 +1,35 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
+ * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.mp4.boxes; -import java.io.IOException; -import java.nio.charset.StandardCharsets; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - import com.drew.lang.SequentialByteArrayReader; import com.drew.lang.SequentialReader; import com.drew.lang.annotations.NotNull; import com.drew.lang.annotations.Nullable; import com.drew.metadata.mp4.Mp4Directory; -import org.xml.sax.SAXException; - +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import org.apache.tika.exception.RuntimeSAXException; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.metadata.XMP; import org.apache.tika.metadata.XMPDM; import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.SAXException; public class TikaUserDataBox { @@ -42,18 +38,18 @@ public class TikaUserDataBox { private static final String ILST = "ilst"; private static final String MDTA = "mdta"; private static final String HDLR = "hdlr"; - private static final String MDIR = "mdir";//apple metadata itunes reader + private static final String MDIR = "mdir";// apple metadata itunes reader private static final Pattern COORDINATE_PATTERN = - Pattern.compile("([+-]\\d+\\.\\d+)([+-]\\d+\\.\\d+)"); + Pattern.compile("([+-]\\d+\\.\\d+)([+-]\\d+\\.\\d+)"); - @Nullable - private String coordinateString; + @Nullable private String coordinateString; private boolean isQuickTime = false; private final Metadata metadata; private final XHTMLContentHandler xhtml; + public TikaUserDataBox(@NotNull String box, byte[] payload, 
Metadata metadata, - XHTMLContentHandler xhtml) throws IOException, SAXException { + XHTMLContentHandler xhtml) throws IOException, SAXException { this.metadata = metadata; this.xhtml = xhtml; int length = payload.length; @@ -69,8 +65,8 @@ public TikaUserDataBox(@NotNull String box, byte[] payload, Metadata metadata, reader.skip(2L); this.coordinateString = reader.getString(xyzLength, "UTF-8"); } else if (META.equals(kindName)) { - reader.getUInt32();//not sure what this is - long lengthToStartOfList = reader.getUInt32() - 4;//this is the length to + reader.getUInt32();// not sure what this is + long lengthToStartOfList = reader.getUInt32() - 4;// this is the length to // 'ilst', but the length of the ilist is defined in the 4 bytes before ilist if (lengthToStartOfList < 0 || lengthToStartOfList > Integer.MAX_VALUE) { return; @@ -86,8 +82,8 @@ public TikaUserDataBox(@NotNull String box, byte[] payload, Metadata metadata, if (HDLR.equals(hdlr) && MDTA.equals(subtype)) { isQuickTime = true; } - int read = 16;//bytes read so far - parseUserDataBox(reader, subtype, read, (int)lengthToStartOfList); + int read = 16;// bytes read so far + parseUserDataBox(reader, subtype, read, (int) lengthToStartOfList); } else { if (size < 8L) { return; @@ -99,9 +95,8 @@ public TikaUserDataBox(@NotNull String box, byte[] payload, Metadata metadata, } - private void parseUserDataBox(SequentialReader reader, String handlerType, - int read, int lengthToStartOfList) - throws IOException { + private void parseUserDataBox(SequentialReader reader, String handlerType, int read, + int lengthToStartOfList) throws IOException { if (!MDIR.equals(handlerType)) { return; } @@ -112,13 +107,13 @@ private void parseUserDataBox(SequentialReader reader, String handlerType, reader.skip(toSkip); long len = reader.getUInt32(); if (len >= Integer.MAX_VALUE || len <= 0) { - //log + // log return; } String subType = reader.getString(4, StandardCharsets.ISO_8859_1); - //this handles "free" types...not sure if 
there are others? - //will throw IOException if no ilist is found - while (! subType.equals(ILST)) { + // this handles "free" types...not sure if there are others? + // will throw IOException if no ilist is found + while (!subType.equals(ILST)) { reader.skip(len - 8); len = reader.getUInt32(); subType = reader.getString(4, StandardCharsets.ISO_8859_1); @@ -131,38 +126,37 @@ private void parseUserDataBox(SequentialReader reader, String handlerType, - private void processIList(SequentialReader reader, long totalLen) - throws IOException { + private void processIList(SequentialReader reader, long totalLen) throws IOException { long totalRead = 0; while (totalRead < totalLen) { long recordLen = reader.getUInt32(); String fieldName = reader.getString(4, StandardCharsets.ISO_8859_1); long fieldLen = reader.getUInt32(); - String typeName = reader.getString(4, StandardCharsets.ISO_8859_1);//data + String typeName = reader.getString(4, StandardCharsets.ISO_8859_1);// data totalRead += 16; if ("data".equals(typeName)) { - reader.skip(8);//not sure what these are + reader.skip(8);// not sure what these are totalRead += 8; int toRead = (int) fieldLen - 16; if (toRead <= 0) { - //log? + // log? return; } if ("covr".equals(fieldName)) { - //covr can be an image file, e.g. png or jpeg - //skip this for now + // covr can be an image file, e.g. 
png or jpeg + // skip this for now reader.skip(toRead); } else if ("cpil".equals(fieldName)) { - int compilationId = (int)reader.getByte(); + int compilationId = (int) reader.getByte(); metadata.set(XMPDM.COMPILATION, compilationId); } else if ("trkn".equals(fieldName)) { if (toRead == 8) { long numA = reader.getUInt32(); long numB = reader.getUInt32(); - metadata.set(XMPDM.TRACK_NUMBER, (int)numA); + metadata.set(XMPDM.TRACK_NUMBER, (int) numA); } else { - //log + // log reader.skip(toRead); } } else if ("disk".equals(fieldName)) { @@ -174,7 +168,7 @@ private void processIList(SequentialReader reader, long totalLen) try { addMetadata(fieldName, val); } catch (SAXException e) { - //need to punch through IOException catching in MP4Reader + // need to punch through IOException catching in MP4Reader throw new RuntimeSAXException(e); } } @@ -183,7 +177,7 @@ private void processIList(SequentialReader reader, long totalLen) } else { int toSkip = (int) recordLen - 16; if (toSkip <= 0) { - //log? + // log? 
return; } reader.skip(toSkip); @@ -202,12 +196,12 @@ private void addMetadata(String key, String value) throws SAXException { case "\u00A9too": metadata.set(XMP.CREATOR_TOOL, value); break; - case "\u00A9ART" : + case "\u00A9ART": metadata.set(XMPDM.ARTIST, value); metadata.set(TikaCoreProperties.CREATOR, value); xhtml.element("p", value); break; - case "aART" : + case "aART": metadata.set(XMPDM.ALBUM_ARTIST, value); xhtml.element("p", value); break; @@ -219,46 +213,46 @@ private void addMetadata(String key, String value) throws SAXException { metadata.set(XMPDM.ALBUM, value); xhtml.element("p", value); break; - case "\u00A9gen" : + case "\u00A9gen": metadata.set(XMPDM.GENRE, value); xhtml.element("p", value); break; - case "\u00A9day" : - //this can be a year "2008" or a date "2017-04-26T07:00:00Z" + case "\u00A9day": + // this can be a year "2008" or a date "2017-04-26T07:00:00Z" metadata.set(XMPDM.RELEASE_DATE, value); xhtml.element("p", value); break; - case "\u00A9cmt" : + case "\u00A9cmt": metadata.set(XMPDM.LOG_COMMENT, value); xhtml.element("p", value); break; - case "cprt" : + case "cprt": metadata.set(XMPDM.COPYRIGHT, value); xhtml.element("p", value); break; - case "keyw" : + case "keyw": metadata.set(TikaCoreProperties.SUBJECT, value); xhtml.element("p", value); break; - case "\u00A9lyr" : + case "\u00A9lyr": xhtml.element("p", value); break; - case "ldes" : //intentional fall through - case "desc" : + case "ldes": // intentional fall through + case "desc": metadata.set(TikaCoreProperties.DESCRIPTION, value); xhtml.element("p", value); break; - case "xid " : - //not sure this is the right use of this key + case "xid ": + // not sure this is the right use of this key metadata.set(XMP.IDENTIFIER, value); break; - //purd date? - //xid ? e.g. SonyBMG:isrc:KRA031208874 - //cprt copyright - //ownr ? and apID - //flvr ? - //son = nam, soal = (c)alb soar = aART? - //(C)ART + // purd date? + // xid ? e.g. SonyBMG:isrc:KRA031208874 + // cprt copyright + // ownr ? 
and apID + // flvr ? + // son = nam, soal = (c)alb soar = aART? + // (C)ART } } @@ -274,5 +268,3 @@ public void addMetadata(Mp4Directory directory) { } } } - - diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/video/FLVParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/video/FLVParser.java index 0040989fd8..771c2ca689 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/video/FLVParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/video/FLVParser.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.video; @@ -28,26 +26,23 @@ import java.util.Map; import java.util.Map.Entry; import java.util.Set; - import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream; -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; /** *

      - * Parser for metadata contained in Flash Videos (.flv). Resources: - * http://osflash.org/flv and for AMF: - * http://download.macromedia.com/pub/labs/amf/amf0_spec_121207.pdf + * Parser for metadata contained in Flash Videos (.flv). Resources: http://osflash.org/flv and for + * AMF: http://download.macromedia.com/pub/labs/amf/amf0_spec_121207.pdf *

      - * This parser is capable of extracting the general metadata from header as well - * as embedded metadata. + * This parser is capable of extracting the general metadata from header as well as embedded + * metadata. *

      * Known keys for metadata (from file header): *

        @@ -55,13 +50,12 @@ *
      1. hasSound: true|false *
      *

      - * In addition to the above values also metadata that is inserted in to the - * actual stream will be picked. Usually there are keys like: - * hasKeyframes, lastkeyframetimestamp, audiocodecid, keyframes, filepositions, - * hasMetadata, audiosamplerate, videodatarate metadatadate, videocodecid, - * metadatacreator, audiosize, hasVideo, height, audiosamplesize, framerate, - * hasCuePoints width, cuePoints, lasttimestamp, canSeekToEnd, datasize, - * duration, videosize, filesize, audiodatarate, hasAudio, stereo audiodelay + * In addition to the above values also metadata that is inserted in to the actual stream will be + * picked. Usually there are keys like: hasKeyframes, lastkeyframetimestamp, audiocodecid, + * keyframes, filepositions, hasMetadata, audiosamplerate, videodatarate metadatadate, videocodecid, + * metadatacreator, audiosize, hasVideo, height, audiosamplesize, framerate, hasCuePoints width, + * cuePoints, lasttimestamp, canSeekToEnd, datasize, duration, videosize, filesize, audiodatarate, + * hasAudio, stereo audiodelay */ public class FLVParser implements Parser { @@ -70,7 +64,7 @@ public class FLVParser implements Parser { */ private static final long serialVersionUID = -8718013155719197679L; private static final Set SUPPORTED_TYPES = - Collections.singleton(MediaType.video("x-flv")); + Collections.singleton(MediaType.video("x-flv")); private static int TYPE_METADATA = 0x12; private static byte MASK_AUDIO = 1; private static byte MASK_VIDEO = 4; @@ -84,7 +78,7 @@ private long readUInt32(DataInputStream input) throws IOException { } private int readUInt24(DataInputStream input) throws IOException { - //readUnsignedByte ensures EOFException + // readUnsignedByte ensures EOFException int uint = input.readUnsignedByte() << 16; uint += input.readUnsignedByte() << 8; uint += input.readUnsignedByte(); @@ -165,7 +159,7 @@ private boolean checkSignature(DataInputStream fis) throws IOException { } public void parse(InputStream stream, ContentHandler 
handler, Metadata metadata, - ParseContext context) throws IOException, SAXException, TikaException { + ParseContext context) throws IOException, SAXException, TikaException { DataInputStream datainput = new DataInputStream(stream); if (!checkSignature(datainput)) { throw new TikaException("FLV signature not detected"); @@ -207,14 +201,14 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, break; } - final int datalen = readUInt24(datainput); //body length + final int datalen = readUInt24(datainput); // body length readUInt32(datainput); // timestamp readUInt24(datainput); // streamid if (type == TYPE_METADATA) { // found metadata Tag, read content to buffer byte[] metaBytes = new byte[datalen]; - for (int readCount = 0; readCount < datalen; ) { + for (int readCount = 0; readCount < datalen;) { int r = stream.read(metaBytes, readCount, datalen - readCount); if (r != -1) { readCount += r; @@ -224,11 +218,9 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, } } - try ( - UnsynchronizedByteArrayInputStream is = - UnsynchronizedByteArrayInputStream.builder().setByteArray(metaBytes).get(); - DataInputStream dis = new DataInputStream(is); - ) { + try (UnsynchronizedByteArrayInputStream is = UnsynchronizedByteArrayInputStream + .builder().setByteArray(metaBytes).get(); + DataInputStream dis = new DataInputStream(is);) { Object data = null; for (int i = 0; i < 2; i++) { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/detect/MatroskaDetectorTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/detect/MatroskaDetectorTest.java index a893055a05..6f9b71c0d7 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/detect/MatroskaDetectorTest.java +++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/detect/MatroskaDetectorTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an "AS IS" - * BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express - * or implied. See the License for the specific language governing - * permissions and limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.detect; @@ -21,12 +19,10 @@ import java.io.IOException; import java.io.InputStream; - import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream; -import org.junit.jupiter.api.Test; - import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; +import org.junit.jupiter.api.Test; public class MatroskaDetectorTest { @@ -38,36 +34,34 @@ private InputStream getResourceAsStream(String resourcePath) { @Test public void testDetectMKV() throws IOException { - assertEquals(MediaType.application("x-matroska"), - detector.detect(getResourceAsStream("/test-documents/sample-mkv.noext"), - new Metadata())); + assertEquals(MediaType.application("x-matroska"), detector.detect( + getResourceAsStream("/test-documents/sample-mkv.noext"), new Metadata())); - assertEquals(MediaType.application("x-matroska"), - detector.detect(getResourceAsStream("/test-documents/testMKV.mkv"), - new Metadata())); + assertEquals(MediaType.application("x-matroska"), detector.detect( + getResourceAsStream("/test-documents/testMKV.mkv"), new Metadata())); } @Test public void testDetectWEBM() throws IOException { - assertEquals(MediaType.video("webm"), - detector.detect(getResourceAsStream("/test-documents/sample-webm.noext"), - new Metadata())); + assertEquals(MediaType.video("webm"), detector.detect( + getResourceAsStream("/test-documents/sample-webm.noext"), new Metadata())); } @Test public void testNullAndShort() throws Exception { - assertEquals(MediaType.OCTET_STREAM, - detector.detect(null, new Metadata())); + assertEquals(MediaType.OCTET_STREAM, detector.detect(null, new Metadata())); byte[] bytes = new byte[10]; - assertEquals(MediaType.OCTET_STREAM, - detector.detect(UnsynchronizedByteArrayInputStream.builder().setByteArray(bytes).get(), new Metadata())); + assertEquals(MediaType.OCTET_STREAM, detector.detect( + UnsynchronizedByteArrayInputStream.builder().setByteArray(bytes).get(), + new Metadata())); bytes = new byte[0]; - 
assertEquals(MediaType.OCTET_STREAM, - detector.detect(UnsynchronizedByteArrayInputStream.builder().setByteArray(bytes).get(), new Metadata())); + assertEquals(MediaType.OCTET_STREAM, detector.detect( + UnsynchronizedByteArrayInputStream.builder().setByteArray(bytes).get(), + new Metadata())); } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/parser/audio/AudioParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/parser/audio/AudioParserTest.java index 12f753356d..cdc09f4cb7 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/parser/audio/AudioParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/parser/audio/AudioParserTest.java @@ -1,27 +1,24 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.audio; import static org.junit.jupiter.api.Assertions.assertEquals; -import org.junit.jupiter.api.Test; - import org.apache.tika.Tika; import org.apache.tika.metadata.Metadata; +import org.junit.jupiter.api.Test; public class AudioParserTest { @@ -29,8 +26,8 @@ public class AudioParserTest { public void testWAV() throws Exception { String path = "/test-documents/testWAV.wav"; Metadata metadata = new Metadata(); - String content = - new Tika().parseToString(AudioParserTest.class.getResourceAsStream(path), metadata); + String content = new Tika().parseToString(AudioParserTest.class.getResourceAsStream(path), + metadata); assertEquals("audio/vnd.wave", metadata.get(Metadata.CONTENT_TYPE)); assertEquals("44100.0", metadata.get("samplerate")); @@ -45,8 +42,8 @@ public void testWAV() throws Exception { public void testAIFF() throws Exception { String path = "/test-documents/testAIFF.aif"; Metadata metadata = new Metadata(); - String content = - new Tika().parseToString(AudioParserTest.class.getResourceAsStream(path), metadata); + String content = new Tika().parseToString(AudioParserTest.class.getResourceAsStream(path), + metadata); assertEquals("audio/x-aiff", 
metadata.get(Metadata.CONTENT_TYPE)); assertEquals("44100.0", metadata.get("samplerate")); @@ -61,8 +58,8 @@ public void testAIFF() throws Exception { public void testAU() throws Exception { String path = "/test-documents/testAU.au"; Metadata metadata = new Metadata(); - String content = - new Tika().parseToString(AudioParserTest.class.getResourceAsStream(path), metadata); + String content = new Tika().parseToString(AudioParserTest.class.getResourceAsStream(path), + metadata); assertEquals("audio/basic", metadata.get(Metadata.CONTENT_TYPE)); assertEquals("44100.0", metadata.get("samplerate")); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/parser/audio/MidiParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/parser/audio/MidiParserTest.java index c1c00d8e4f..943e219ba2 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/parser/audio/MidiParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/parser/audio/MidiParserTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. 
The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.audio; @@ -27,8 +25,8 @@ public class MidiParserTest { public void testMID() throws Exception { String path = "/test-documents/testMID.mid"; Metadata metadata = new Metadata(); - String content = - new Tika().parseToString(MidiParserTest.class.getResourceAsStream(path), metadata); + String content = new Tika().parseToString(MidiParserTest.class.getResourceAsStream(path), + metadata); assertEquals("audio/midi", metadata.get(Metadata.CONTENT_TYPE)); assertEquals("2", metadata.get("tracks")); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/parser/mp3/Mp3ParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/parser/mp3/Mp3ParserTest.java index 7f54bd2b9c..b87b4acbff 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/parser/mp3/Mp3ParserTest.java 
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/parser/mp3/Mp3ParserTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.parser.mp3; @@ -20,13 +18,11 @@ import static org.junit.jupiter.api.Assumptions.assumeTrue; import java.io.ByteArrayInputStream; - -import org.junit.jupiter.api.Test; - import org.apache.tika.TikaTest; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.metadata.XMPDM; +import org.junit.jupiter.api.Test; /** * Test case for parsing mp3 files. @@ -40,9 +36,8 @@ public class Mp3ParserTest extends TikaTest { * @param expected the expected duration, rounded as seconds */ private static void checkDuration(Metadata metadata, int expected) { - assertEquals(expected, - Math.round(Float.parseFloat(metadata.get(XMPDM.DURATION))), - "wrong duration"); + assertEquals(expected, Math.round(Float.parseFloat(metadata.get(XMPDM.DURATION))), + "wrong duration"); } /** @@ -72,8 +67,7 @@ public void testMp3ParsingID3v1() throws Exception { } /** - * Test that with only ID3v2 tags, we get the full - * set of information out. + * Test that with only ID3v2 tags, we get the full set of information out. 
*/ @Test public void testMp3ParsingID3v2() throws Exception { @@ -108,7 +102,7 @@ public void testMp3ParsingID3v2() throws Exception { assertEquals("2008", metadata.get(XMPDM.RELEASE_DATE)); assertEquals("Rock", metadata.get(XMPDM.GENRE)); assertEquals("XXX - ID3v1 Comment\nTest Comment", - metadata.get(XMPDM.LOG_COMMENT.getName())); + metadata.get(XMPDM.LOG_COMMENT.getName())); assertEquals("1", metadata.get(XMPDM.TRACK_NUMBER)); assertEquals("1/1", metadata.get(XMPDM.DISC_NUMBER)); assertEquals("1", metadata.get(XMPDM.COMPILATION)); @@ -120,8 +114,8 @@ public void testMp3ParsingID3v2() throws Exception { } /** - * Test that metadata is added before xhtml content - * is written...so that more metadata shows up in the xhtml + * Test that metadata is added before xhtml content is written...so that more metadata shows up + * in the xhtml */ @Test public void testAddingToMetadataBeforeWriting() throws Exception { @@ -132,8 +126,7 @@ public void testAddingToMetadataBeforeWriting() throws Exception { } /** - * Test that with both id3v2 and id3v1, we prefer the - * details from id3v2 + * Test that with both id3v2 and id3v1, we prefer the details from id3v2 */ @Test public void testMp3ParsingID3v1v2() throws Exception { @@ -158,8 +151,7 @@ public void testMp3ParsingID3v1v2() throws Exception { } /** - * Test that with only ID3v2 tags, of version 2.4, we get the full - * set of information out. + * Test that with only ID3v2 tags, of version 2.4, we get the full set of information out. 
*/ @Test public void testMp3ParsingID3v24() throws Exception { @@ -197,8 +189,7 @@ public void testMp3ParsingID3v24() throws Exception { } /** - * Tests that a file with characters not in the ISO 8859-1 - * range is correctly handled + * Tests that a file with characters not in the ISO 8859-1 range is correctly handled */ @Test public void testMp3ParsingID3i18n() throws Exception { @@ -212,7 +203,7 @@ public void testMp3ParsingID3i18n() throws Exception { assertEquals("Test Album \u2460\u2468", metadata.get(XMPDM.ALBUM)); assertEquals("Eng - Comment Desc\nThis is a \u1357\u2468\u2460 Comment", - metadata.get(XMPDM.LOG_COMMENT)); + metadata.get(XMPDM.LOG_COMMENT)); assertEquals("MPEG 3 Layer III Version 1", metadata.get("version")); assertEquals("44100", metadata.get("samplerate")); @@ -221,9 +212,8 @@ public void testMp3ParsingID3i18n() throws Exception { } /** - * Tests that a file with the last frame slightly - * truncated does not cause an EOF and does - * not lead to an infinite loop. + * Tests that a file with the last frame slightly truncated does not cause an EOF and does not + * lead to an infinite loop. 
*/ @Test public void testMp3ParsingID3i18nTruncated() throws Exception { @@ -237,7 +227,7 @@ public void testMp3ParsingID3i18nTruncated() throws Exception { assertEquals("Test Album \u2460\u2468", metadata.get(XMPDM.ALBUM)); assertEquals("Eng - Comment Desc\nThis is a \u1357\u2468\u2460 Comment", - metadata.get(XMPDM.LOG_COMMENT)); + metadata.get(XMPDM.LOG_COMMENT)); assertEquals("MPEG 3 Layer III Version 1", metadata.get("version")); assertEquals("44100", metadata.get("samplerate")); @@ -246,14 +236,13 @@ public void testMp3ParsingID3i18nTruncated() throws Exception { } /** - * Tests that a file with both lyrics and - * ID3v2 tags gets both extracted correctly + * Tests that a file with both lyrics and ID3v2 tags gets both extracted correctly */ @Test public void testMp3ParsingLyrics() throws Exception { // Note - our test file has a lyrics tag, but lacks any - // lyrics in the tags, so we can't test that bit + // lyrics in the tags, so we can't test that bit // TODO Find a better sample file Metadata metadata = new Metadata(); String content = getText("testMP3lyrics.mp3", metadata); @@ -277,13 +266,13 @@ public void testMp3ParsingLyrics() throws Exception { @Test public void testID3v2Frame() throws Exception { - byte[] empty = new byte[]{0x49, 0x44, 0x33, 3, 1, 0, 0, 0, 0, 0}; + byte[] empty = new byte[] {0x49, 0x44, 0x33, 3, 1, 0, 0, 0, 0, 0}; - assertEquals(11, ID3v2Frame.getInt(new byte[]{0, 0, 0, 0x0b})); - assertEquals(257, ID3v2Frame.getInt(new byte[]{0, 0, 1, 1})); + assertEquals(11, ID3v2Frame.getInt(new byte[] {0, 0, 0, 0x0b})); + assertEquals(257, ID3v2Frame.getInt(new byte[] {0, 0, 1, 1})); - ID3v2Frame f = - (ID3v2Frame) ID3v2Frame.createFrameIfPresent(new ByteArrayInputStream(empty)); + ID3v2Frame f = (ID3v2Frame) ID3v2Frame + .createFrameIfPresent(new ByteArrayInputStream(empty)); assertEquals(3, f.getMajorVersion()); assertEquals(1, f.getMinorVersion()); assertEquals(0, f.getFlags()); @@ -291,8 +280,8 @@ public void testID3v2Frame() throws 
Exception { assertEquals(0, f.getData().length); assertEquals("", ID3v2Frame.getTagString(f.getData(), 0, 0)); - assertEquals("", ID3v2Frame.getTagString(new byte[]{0, 0, 0, 0}, 0, 3)); - assertEquals("A", ID3v2Frame.getTagString(new byte[]{(byte) 'A', 0, 0, 0}, 0, 3)); + assertEquals("", ID3v2Frame.getTagString(new byte[] {0, 0, 0, 0}, 0, 3)); + assertEquals("A", ID3v2Frame.getTagString(new byte[] {(byte) 'A', 0, 0, 0}, 0, 3)); } @Test @@ -301,16 +290,13 @@ public void testTIKA1589_noId3ReturnsDurationCorrectly() throws Exception { } /** - * This test will do nothing, unless you've downloaded the - * mp3 file from TIKA-424 - the file cannot be - * distributed with Tika. - * This test will check for the complicated set of ID3v2.4 + * This test will do nothing, unless you've downloaded the mp3 file from TIKA-424 - the file + * cannot be distributed with Tika. This test will check for the complicated set of ID3v2.4 * tags. */ @Test public void testTIKA424() throws Exception { - assumeTrue( - Mp3ParserTest.class.getResourceAsStream("/test-documents/test2.mp3") != null); + assumeTrue(Mp3ParserTest.class.getResourceAsStream("/test-documents/test2.mp3") != null); Metadata metadata = new Metadata(); String content = getText("test2.mp3", metadata); @@ -327,11 +313,10 @@ public void testTIKA424() throws Exception { } /** - * This tests that we can handle without errors (but perhaps not - * all content) a file with a very very large ID3 frame that - * has been truncated before the end of the ID3 tags. - * In this case, it is a file with JPEG data in the ID3, which - * is truncated before the end of the JPEG bit of the ID3 frame. + * This tests that we can handle without errors (but perhaps not all content) a file with a very + * very large ID3 frame that has been truncated before the end of the ID3 tags. In this case, it + * is a file with JPEG data in the ID3, which is truncated before the end of the JPEG bit of the + * ID3 frame. 
*/ @Test public void testTIKA474() throws Exception { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/parser/mp3/MpegStreamTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/parser/mp3/MpegStreamTest.java index bb408fc8fd..25f737cf7a 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/parser/mp3/MpegStreamTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/parser/mp3/MpegStreamTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.mp3; @@ -26,7 +24,6 @@ import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.OutputStream; - import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.Test; @@ -42,7 +39,7 @@ public class MpegStreamTest { /** * Writes the given byte the given number of times into an output stream. * - * @param out the output stream + * @param out the output stream * @param value the value to write * @param count the number of bytes to write * @throws IOException if an error occurs @@ -57,9 +54,9 @@ private static void writeBytes(OutputStream out, int value, int count) throws IO * Writes a frame header in the given output stream. * * @param out the output stream - * @param b2 byte 2 of the header - * @param b3 byte 3 of the header - * @param b4 byte 4 of the header + * @param b2 byte 2 of the header + * @param b3 byte 3 of the header + * @param b4 byte 4 of the header * @throws IOException if an error occurs */ private static void writeFrame(OutputStream out, int b2, int b3, int b4) throws IOException { @@ -124,8 +121,7 @@ public void testSearchNextFrameInvalid() throws IOException { } /** - * Tests a search for another frame which is interrupted because the stream - * ends. + * Tests a search for another frame which is interrupted because the stream ends. 
*/ @Test public void testSeachNextFrameEOS() throws IOException { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/parser/mp4/MP4ParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/parser/mp4/MP4ParserTest.java index ffda6bd815..b777cdd30a 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/parser/mp4/MP4ParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/parser/mp4/MP4ParserTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.mp4; @@ -20,21 +18,16 @@ import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertTrue; +import com.drew.metadata.mp4.Mp4Directory; +import com.drew.metadata.mp4.media.Mp4MetaDirectory; +import com.drew.metadata.mp4.media.Mp4SoundDirectory; +import com.drew.metadata.mp4.media.Mp4VideoDirectory; import java.util.Arrays; import java.util.Collection; import java.util.Collections; import java.util.HashSet; import java.util.List; import java.util.Set; - -import com.drew.metadata.mp4.Mp4Directory; -import com.drew.metadata.mp4.media.Mp4MetaDirectory; -import com.drew.metadata.mp4.media.Mp4SoundDirectory; -import com.drew.metadata.mp4.media.Mp4VideoDirectory; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.Timeout; -import org.xml.sax.ContentHandler; - import org.apache.tika.TikaTest; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; @@ -43,6 +36,9 @@ import org.apache.tika.metadata.XMPDM; import org.apache.tika.parser.ParseContext; import org.apache.tika.sax.BodyContentHandler; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.Timeout; +import org.xml.sax.ContentHandler; /** @@ -54,24 +50,17 @@ public class MP4ParserTest extends TikaTest { Set skipKeysB = new HashSet<>(); /* - @Before - public void setUp() { - - skipKeysB.add("X-TIKA:Parsed-By"); - skipKeysA.add("X-TIKA:parse_time_millis"); - skipKeysB.add("X-TIKA:content_handler"); - skipKeysA.add("X-TIKA:content_handler"); - 
skipKeysB.add("X-TIKA:parse_time_millis"); - skipKeysB.add("xmpDM:videoCompressor"); - //skipKeysB.add("xmpDM:audioChannelType"); - //skipKeysB.add("xmpDM:audioChannelType"); - skipKeysA.add("X-TIKA:content"); - skipKeysB.add("X-TIKA:content"); - skipKeysB.add("xmpDM:copyright"); - }*/ + * @Before public void setUp() { + * + * skipKeysB.add("X-TIKA:Parsed-By"); skipKeysA.add("X-TIKA:parse_time_millis"); + * skipKeysB.add("X-TIKA:content_handler"); skipKeysA.add("X-TIKA:content_handler"); + * skipKeysB.add("X-TIKA:parse_time_millis"); skipKeysB.add("xmpDM:videoCompressor"); + * //skipKeysB.add("xmpDM:audioChannelType"); //skipKeysB.add("xmpDM:audioChannelType"); + * skipKeysA.add("X-TIKA:content"); skipKeysB.add("X-TIKA:content"); + * skipKeysB.add("xmpDM:copyright"); } + */ /** - * Test that we can extract information from - * a M4A MP4 Audio file + * Test that we can extract information from a M4A MP4 Audio file */ @Test public void testMP4ParsingAudio() throws Exception { @@ -114,11 +103,11 @@ public void testMP4ParsingAudio() throws Exception { assertEquals("iTunes 10.5.3.3", metadata.get(XMP.CREATOR_TOOL)); assertContains("org.apache.tika.parser.mp4.MP4Parser", - Arrays.asList(metadata.getValues(TikaCoreProperties.TIKA_PARSED_BY))); + Arrays.asList(metadata.getValues(TikaCoreProperties.TIKA_PARSED_BY))); // Check again by file, rather than stream TikaInputStream tstream = - TikaInputStream.get(getResourceAsStream("/test-documents/testMP4.m4a")); + TikaInputStream.get(getResourceAsStream("/test-documents/testMP4.m4a")); tstream.getFile(); ContentHandler handler = new BodyContentHandler(); try { @@ -126,7 +115,7 @@ public void testMP4ParsingAudio() throws Exception { } finally { tstream.close(); } - //TODO: why don't we check the output here? + // TODO: why don't we check the output here? 
} // TODO Test a MP4 Video file @@ -159,8 +148,8 @@ public void testMetadataWithSoundConsideredAudio() { @Test public void testVideoDirectoriesNotConsideredAudio() { - final Collection directories = - List.of(new Mp4VideoDirectory(), new Mp4VideoDirectory(), new Mp4SoundDirectory()); + final Collection directories = List.of(new Mp4VideoDirectory(), + new Mp4VideoDirectory(), new Mp4SoundDirectory()); assertFalse(MP4Parser.isAudioOnly(directories)); } @@ -170,115 +159,49 @@ public void testNoDirectoriesNotConsideredAudio() { assertFalse(MP4Parser.isAudioOnly(Collections.emptyList())); } -/* - - @Test - public void compareMetadata() throws Exception { - Path dir = Paths.get("/data/mp4s"); - processDir(dir); - - } - - private void processDir(Path dir) { - for (File f : dir.toFile().listFiles()) { - if (f.isDirectory()) { - processDir(f.toPath()); - } else { - - if (! f.getName().contains("MB3EOKALN337SEYQE6WXIGMY5VQ2ZU7M")) { - // continue; - } - System.out.println(f); - processFile(f.toPath()); - System.out.println(""); - } - } - } - - private void processFile(Path p) { - - Metadata a; - Metadata b; - try { - List metadataList = getRecursiveMetadata(p, new LegacyMP4Parser(), true); - if (metadataList.size() > 0) { - a = metadataList.get(0); - } else { - System.out.println("a is empty"); - return; - } - } catch (AssertionError | Exception e) { - e.printStackTrace(); - return; - } - - try { - List metadataList = getRecursiveMetadata(p); - if (metadataList.size() > 0) { - b = metadataList.get(0); - } else { - System.out.println("b is empty"); - return; - } - } catch (Exception e) { - e.printStackTrace(); - return; - } - compare(p, a, b); - } - - private void compare(Path p, Metadata a, Metadata b) { - /* System.out.println("A"); - debug(a); - System.out.println("B"); - debug(b); - Set aKeys = getKeys(a, skipKeysA); - Set bKeys = getKeys(b, skipKeysB); - for (String k : aKeys) { - if (! 
bKeys.contains(k)) { - System.out.println("not in b: " + k + " : " + a.get(k) + " : " + - p.getFileName().toString()); - } - } - for (String k : bKeys) { - if (!aKeys.contains(k)) { - System.out.println("not in a: " + k + " : " + b.get(k) + " : " + - p.getFileName().toString()); - } - } - for (String k : aKeys) { - if (! bKeys.contains(k)) { - continue; - } - Set aVals = getVals(a, k); - Set bVals = getVals(b, k); - for (String v : aVals) { - if (!bVals.contains(v)) { - System.out.println("b missing value: " + v + " for key " + k + " in " + p.getFileName().toString()); - for (String bVal : bVals) { - System.out.println("\tb has " + bVal); - } - } - } - } - } - - private Set getKeys(Metadata m, Set skipFields) { - Set keys = new HashSet<>(); - for (String n : m.names()) { - if (! skipFields.contains(n)) { - keys.add(n); - } - } - return keys; - - } - - private Set getVals(Metadata m, String k) { - Set vals = new HashSet<>(); - for (String v : m.getValues(k)) { - vals.add(v); - } - return vals; - } */ + /* + * + * @Test public void compareMetadata() throws Exception { Path dir = Paths.get("/data/mp4s"); + * processDir(dir); + * + * } + * + * private void processDir(Path dir) { for (File f : dir.toFile().listFiles()) { if + * (f.isDirectory()) { processDir(f.toPath()); } else { + * + * if (! 
f.getName().contains("MB3EOKALN337SEYQE6WXIGMY5VQ2ZU7M")) { // continue; } + * System.out.println(f); processFile(f.toPath()); System.out.println(""); } } } + * + * private void processFile(Path p) { + * + * Metadata a; Metadata b; try { List metadataList = getRecursiveMetadata(p, new + * LegacyMP4Parser(), true); if (metadataList.size() > 0) { a = metadataList.get(0); } else { + * System.out.println("a is empty"); return; } } catch (AssertionError | Exception e) { + * e.printStackTrace(); return; } + * + * try { List metadataList = getRecursiveMetadata(p); if (metadataList.size() > 0) { b + * = metadataList.get(0); } else { System.out.println("b is empty"); return; } } catch + * (Exception e) { e.printStackTrace(); return; } compare(p, a, b); } + * + * private void compare(Path p, Metadata a, Metadata b) { /* System.out.println("A"); debug(a); + * System.out.println("B"); debug(b); Set aKeys = getKeys(a, skipKeysA); Set + * bKeys = getKeys(b, skipKeysB); for (String k : aKeys) { if (! bKeys.contains(k)) { + * System.out.println("not in b: " + k + " : " + a.get(k) + " : " + p.getFileName().toString()); + * } } for (String k : bKeys) { if (!aKeys.contains(k)) { System.out.println("not in a: " + k + + * " : " + b.get(k) + " : " + p.getFileName().toString()); } } for (String k : aKeys) { if (! + * bKeys.contains(k)) { continue; } Set aVals = getVals(a, k); Set bVals = + * getVals(b, k); for (String v : aVals) { if (!bVals.contains(v)) { + * System.out.println("b missing value: " + v + " for key " + k + " in " + + * p.getFileName().toString()); for (String bVal : bVals) { System.out.println("\tb has " + + * bVal); } } } } } + * + * private Set getKeys(Metadata m, Set skipFields) { Set keys = new + * HashSet<>(); for (String n : m.names()) { if (! 
skipFields.contains(n)) { keys.add(n); } } + * return keys; + * + * } + * + * private Set getVals(Metadata m, String k) { Set vals = new HashSet<>(); for + * (String v : m.getValues(k)) { vals.add(v); } return vals; } + */ } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/parser/video/FLVParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/parser/video/FLVParserTest.java index 2259f8a3d0..370eb377d5 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/parser/video/FLVParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/parser/video/FLVParserTest.java @@ -1,27 +1,24 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.video; import static org.junit.jupiter.api.Assertions.assertEquals; -import org.junit.jupiter.api.Test; - import org.apache.tika.Tika; import org.apache.tika.metadata.Metadata; +import org.junit.jupiter.api.Test; public class FLVParserTest { @@ -30,8 +27,8 @@ public void testFLV() throws Exception { String path = "/test-documents/testFLV.flv"; Metadata metadata = new Metadata(); - String content = - new Tika().parseToString(FLVParserTest.class.getResourceAsStream(path), metadata); + String content = new Tika().parseToString(FLVParserTest.class.getResourceAsStream(path), + metadata); assertEquals("", content); assertEquals("video/x-flv", metadata.get(Metadata.CONTENT_TYPE)); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dgn/DGN8Parser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dgn/DGN8Parser.java index a0eba36c19..5bce8e860b 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dgn/DGN8Parser.java 
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dgn/DGN8Parser.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.parser.dgn; @@ -20,14 +18,10 @@ import java.io.InputStream; import java.util.Collections; import java.util.Set; - import org.apache.commons.io.IOUtils; import org.apache.commons.io.input.CloseShieldInputStream; import org.apache.poi.poifs.filesystem.DirectoryNode; import org.apache.poi.poifs.filesystem.POIFSFileSystem; -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.exception.TikaException; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; @@ -36,6 +30,8 @@ import org.apache.tika.parser.Parser; import org.apache.tika.parser.microsoft.SummaryExtractor; import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; /** * This is a VERY LIMITED parser. It parses metadata out of dgn8 files. @@ -51,7 +47,7 @@ public Set getSupportedTypes(ParseContext context) { @Override public void parse(InputStream stream, ContentHandler handler, Metadata metadata, - ParseContext context) throws IOException, SAXException, TikaException { + ParseContext context) throws IOException, SAXException, TikaException { XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); xhtml.startDocument(); SummaryExtractor summaryExtractor = new SummaryExtractor(metadata); @@ -77,7 +73,8 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, } // tstream will close the fs, no need to close this below tstream.setOpenContainer(fs); - root = fs.getRoot(); } + root = fs.getRoot(); + } } summaryExtractor.parseSummaries(root); } finally { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/AbstractDWGParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/AbstractDWGParser.java index e1c4ae3ed0..18ffc52031 100644 --- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/AbstractDWGParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/AbstractDWGParser.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.parser.dwg; @@ -32,7 +30,8 @@ public abstract class AbstractDWGParser implements Parser { private final DWGParserConfig defaultDwgParserConfig = new DWGParserConfig(); public void configure(ParseContext parseContext) { - DWGParserConfig dwgParserConfig = parseContext.get(DWGParserConfig.class, defaultDwgParserConfig); + DWGParserConfig dwgParserConfig = + parseContext.get(DWGParserConfig.class, defaultDwgParserConfig); parseContext.set(DWGParserConfig.class, dwgParserConfig); } @@ -40,16 +39,16 @@ public void configure(ParseContext parseContext) { public String getDwgReadExecutable() { return defaultDwgParserConfig.getDwgReadExecutable(); } - + @Field public void setDwgReadExecutable(String dwgReadExecutable) { defaultDwgParserConfig.setDwgReadExecutable(dwgReadExecutable); } - + public boolean isCleanDwgReadOutput() { return defaultDwgParserConfig.isCleanDwgReadOutput(); } - + @Field public void setCleanDwgReadOutput(boolean cleanDwgReadOutput) { defaultDwgParserConfig.setCleanDwgReadOutput(cleanDwgReadOutput); @@ -58,27 +57,30 @@ public void setCleanDwgReadOutput(boolean cleanDwgReadOutput) { public int getCleanDwgReadOutputBatchSize() { return defaultDwgParserConfig.getCleanDwgReadOutputBatchSize(); } - + @Field public void setCleanDwgReadOutputBatchSize(int cleanDwgReadOutputBatchSize) { defaultDwgParserConfig.setCleanDwgReadOutputBatchSize(cleanDwgReadOutputBatchSize); } + public String getCleanDwgReadRegexToReplace() { return defaultDwgParserConfig.getCleanDwgReadRegexToReplace(); } - + @Field public void setCleanDwgReadRegexToReplace(String cleanDwgReadRegexToReplace) { defaultDwgParserConfig.setCleanDwgReadRegexToReplace(cleanDwgReadRegexToReplace); } + public String getCleanDwgReadReplaceWith() { return defaultDwgParserConfig.getCleanDwgReadReplaceWith(); } - + @Field public void setCleanDwgReadReplaceWith(String cleanDwgReadReplaceWith) { defaultDwgParserConfig.setCleanDwgReadReplaceWith(cleanDwgReadReplaceWith); } + public long 
getDwgReadTimeout() { return defaultDwgParserConfig.getDwgReadTimeout(); } @@ -87,5 +89,5 @@ public long getDwgReadTimeout() { public void setDwgReadTimeout(long dwgReadTimeout) { defaultDwgParserConfig.setDwgReadtimeout(dwgReadTimeout); } - + } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGParser.java index a433cb2581..3b419f1230 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGParser.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.dwg; @@ -20,12 +18,8 @@ import java.io.InputStream; import java.util.Collections; import java.util.Set; - import org.apache.commons.io.IOUtils; import org.apache.poi.util.StringUtil; -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.exception.TikaException; import org.apache.tika.io.EndianUtils; import org.apache.tika.metadata.Metadata; @@ -34,12 +28,13 @@ import org.apache.tika.mime.MediaType; import org.apache.tika.parser.ParseContext; import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; /** - * DWG (CAD Drawing) parser. This is a very basic parser, which just - * looks for bits of the headers. - * Note that we use Apache POI for various parts of the processing, as - * lots of the low level string/int/short concepts are the same. + * DWG (CAD Drawing) parser. This is a very basic parser, which just looks for bits of the headers. + * Note that we use Apache POI for various parts of the processing, as lots of the low level + * string/int/short concepts are the same. 
*/ public class DWGParser extends AbstractDWGParser { public static String DWG_CUSTOM_META_PREFIX = "dwg-custom:"; @@ -50,45 +45,49 @@ public class DWGParser extends AbstractDWGParser { /** * The order of the fields in the header */ - private static final Property[] HEADER_PROPERTIES_ENTRIES = { TikaCoreProperties.TITLE, - TikaCoreProperties.DESCRIPTION, TikaCoreProperties.CREATOR, TikaCoreProperties.SUBJECT, - TikaCoreProperties.COMMENTS, TikaCoreProperties.MODIFIER, null, // Unknown? - TikaCoreProperties.RELATION, // Hyperlink - }; + private static final Property[] HEADER_PROPERTIES_ENTRIES = + {TikaCoreProperties.TITLE, TikaCoreProperties.DESCRIPTION, + TikaCoreProperties.CREATOR, TikaCoreProperties.SUBJECT, + TikaCoreProperties.COMMENTS, TikaCoreProperties.MODIFIER, null, // Unknown? + TikaCoreProperties.RELATION, // Hyperlink + }; /** * For the 2000 file, they're indexed */ - private static final Property[] HEADER_2000_PROPERTIES_ENTRIES = { null, TikaCoreProperties.RELATION, // 0x01 - TikaCoreProperties.TITLE, // 0x02 - TikaCoreProperties.DESCRIPTION, // 0x03 - TikaCoreProperties.CREATOR, // 0x04 - null, TikaCoreProperties.COMMENTS, // 0x06 - TikaCoreProperties.SUBJECT, // 0x07 - TikaCoreProperties.MODIFIER, // 0x08 - }; + private static final Property[] HEADER_2000_PROPERTIES_ENTRIES = + {null, TikaCoreProperties.RELATION, // 0x01 + TikaCoreProperties.TITLE, // 0x02 + TikaCoreProperties.DESCRIPTION, // 0x03 + TikaCoreProperties.CREATOR, // 0x04 + null, TikaCoreProperties.COMMENTS, // 0x06 + TikaCoreProperties.SUBJECT, // 0x07 + TikaCoreProperties.MODIFIER, // 0x08 + }; private static final String HEADER_2000_PROPERTIES_MARKER_STR = "DWGPROPS COOKIE"; - private static final byte[] HEADER_2000_PROPERTIES_MARKER = new byte[HEADER_2000_PROPERTIES_MARKER_STR.length()]; + private static final byte[] HEADER_2000_PROPERTIES_MARKER = + new byte[HEADER_2000_PROPERTIES_MARKER_STR.length()]; /** - * How far to skip after the last standard property, before we find any 
custom - * properties that might be there. + * How far to skip after the last standard property, before we find any custom properties that + * might be there. */ private static final int CUSTOM_PROPERTIES_SKIP = 20; /** * The value of padding bytes other than 0 in some DWG files. */ - private static final int[] CUSTOM_PROPERTIES_ALT_PADDING_VALUES = new int[] { 0x2, 0, 0, 0 }; + private static final int[] CUSTOM_PROPERTIES_ALT_PADDING_VALUES = new int[] {0x2, 0, 0, 0}; private static MediaType TYPE = MediaType.image("vnd.dwg"); static { - StringUtil.putCompressedUnicode(HEADER_2000_PROPERTIES_MARKER_STR, HEADER_2000_PROPERTIES_MARKER, 0); + StringUtil.putCompressedUnicode(HEADER_2000_PROPERTIES_MARKER_STR, + HEADER_2000_PROPERTIES_MARKER, 0); } public Set getSupportedTypes(ParseContext context) { return Collections.singleton(TYPE); } - public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) - throws IOException, TikaException, SAXException { + public void parse(InputStream stream, ContentHandler handler, Metadata metadata, + ParseContext context) throws IOException, TikaException, SAXException { configure(context); DWGParserConfig dwgc = context.get(DWGParserConfig.class); @@ -139,7 +138,7 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, * Stored as US-ASCII */ private void get2004Props(InputStream stream, Metadata metadata, XHTMLContentHandler xhtml) - throws IOException, TikaException, SAXException { + throws IOException, TikaException, SAXException { // Standard properties for (int i = 0; i < HEADER_PROPERTIES_ENTRIES.length; i++) { String headerValue = read2004String(stream); @@ -173,8 +172,8 @@ private String read2004String(InputStream stream) throws IOException, TikaExcept /** * Stored as UCS2, so 16 bit "unicode" */ - private void get2007and2010Props(InputStream stream, Metadata metadata, XHTMLContentHandler xhtml) - throws IOException, TikaException, SAXException { + private 
void get2007and2010Props(InputStream stream, Metadata metadata, + XHTMLContentHandler xhtml) throws IOException, TikaException, SAXException { // Standard properties for (int i = 0; i < HEADER_PROPERTIES_ENTRIES.length; i++) { String headerValue = read2007and2010String(stream); @@ -208,7 +207,7 @@ private String read2007and2010String(InputStream stream) throws IOException, Tik } private void get2000Props(InputStream stream, Metadata metadata, XHTMLContentHandler xhtml) - throws IOException, TikaException, SAXException { + throws IOException, TikaException, SAXException { int propCount = 0; while (propCount < 30) { int propIdx = EndianUtils.readUShortLE(stream); @@ -249,8 +248,8 @@ private void get2000Props(InputStream stream, Metadata metadata, XHTMLContentHan } } - private void handleHeader(int headerNumber, String value, Metadata metadata, XHTMLContentHandler xhtml) - throws SAXException { + private void handleHeader(int headerNumber, String value, Metadata metadata, + XHTMLContentHandler xhtml) throws SAXException { if (value == null || value.isEmpty()) { return; } @@ -266,7 +265,8 @@ private void handleHeader(int headerNumber, String value, Metadata metadata, XHT /** * Grab the offset, then skip there */ - private boolean skipToPropertyInfoSection(InputStream stream, byte[] header) throws IOException, TikaException { + private boolean skipToPropertyInfoSection(InputStream stream, byte[] header) + throws IOException, TikaException { // The offset is stored in the header from 0x20 onwards long offsetToSection = EndianUtils.getLongLE(header, 0x20); @@ -291,7 +291,8 @@ private boolean skipToPropertyInfoSection(InputStream stream, byte[] header) thr /** * We think it can be anywhere... 
*/ - private boolean skipTo2000PropertyInfoSection(InputStream stream, byte[] header) throws IOException { + private boolean skipTo2000PropertyInfoSection(InputStream stream, byte[] header) + throws IOException { int val = 0; while (val != -1) { val = stream.read(); @@ -317,10 +318,10 @@ private int skipToCustomProperties(InputStream stream) throws IOException, TikaE byte[] padding = new byte[4]; IOUtils.readFully(stream, padding); if ((padding[0] == 0 && padding[1] == 0 && padding[2] == 0 && padding[3] == 0) - || (padding[0] == CUSTOM_PROPERTIES_ALT_PADDING_VALUES[0] - && padding[1] == CUSTOM_PROPERTIES_ALT_PADDING_VALUES[1] - && padding[2] == CUSTOM_PROPERTIES_ALT_PADDING_VALUES[2] - && padding[3] == CUSTOM_PROPERTIES_ALT_PADDING_VALUES[3])) { + || (padding[0] == CUSTOM_PROPERTIES_ALT_PADDING_VALUES[0] + && padding[1] == CUSTOM_PROPERTIES_ALT_PADDING_VALUES[1] + && padding[2] == CUSTOM_PROPERTIES_ALT_PADDING_VALUES[2] + && padding[3] == CUSTOM_PROPERTIES_ALT_PADDING_VALUES[3])) { // Looks hopeful, skip on padding = new byte[CUSTOM_PROPERTIES_SKIP]; diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGParserConfig.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGParserConfig.java index 35300080bb..0f5bdf3ca4 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGParserConfig.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGParserConfig.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.parser.dwg; @@ -24,14 +22,12 @@ import java.nio.file.Paths; import java.util.Arrays; import java.util.Map; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - import org.apache.tika.config.Param; import org.apache.tika.exception.TikaConfigException; import org.apache.tika.parser.external.ExternalParser; import org.apache.tika.utils.StringUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; public class DWGParserConfig implements Serializable { @@ -44,7 +40,7 @@ public class DWGParserConfig implements Serializable { // we need to remove non UTF chars and Nan's (dwgread outputs these as nan) private String cleanDwgReadRegexToReplace = "[^\\x20-\\x7e]"; private String cleanDwgReadReplaceWith = ""; - @SuppressWarnings("unused") + @SuppressWarnings("unused") private boolean hasDwgRead; private static final Logger LOG = LoggerFactory.getLogger(DWGParserConfig.class); @@ -62,7 +58,7 @@ public boolean hasDwgRead() throws TikaConfigException { } // Try running DWGRead from there, and see if it exists + works - String[] checkCmd = { dwgRead }; + String[] checkCmd = {dwgRead}; boolean hasDwgRead = ExternalParser.check(checkCmd); LOG.debug("hasDwgRead (path: " + Arrays.toString(checkCmd) + "): " + hasDwgRead); return hasDwgRead; @@ -96,9 +92,9 @@ public String getCleanDwgReadReplaceWith() { public void setDwgReadExecutable(String dwgReadExecutable) { if (!Paths.get(dwgReadExecutable).isAbsolute()) try { - dwgReadExecutable = new File(dwgReadExecutable).getCanonicalFile().toString(); + dwgReadExecutable = new File(dwgReadExecutable).getCanonicalFile().toString(); } catch (IOException e) { - //do nothing as the error will be picked up by the DWG Parser + // do nothing as the error will be picked up by the DWG Parser } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGReadFormatRemover.java 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGReadFormatRemover.java index 9ba7a594a7..fea1265ad2 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGReadFormatRemover.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGReadFormatRemover.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
+ * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.dwg; @@ -21,9 +19,8 @@ import java.util.regex.Pattern; /** - * DWGReadFormatRemover removes the formatting from the text from libredwg files so only - * the raw text remains. - * What needs to be cleaned has been found on the following websites: + * DWGReadFormatRemover removes the formatting from the text from libredwg files so only the raw + * text remains. What needs to be cleaned has been found on the following websites: *

      * * https://www.cadforum.cz/en/text-formatting-codes-in-mtext-objects-tip8640 @@ -35,36 +32,38 @@ public class DWGReadFormatRemover { private static final String underlineStrikeThrough = "((?:\\\\\\\\)+|\\\\[LlOoKk])"; - private static final String endMarks = "((?:\\\\\\\\)+|\\\\(?:A|H|pi|pxt|pxi|pt|X|Q|f|F|W|C|T)[^;]{0,100};)"; + private static final String endMarks = + "((?:\\\\\\\\)+|\\\\(?:A|H|pi|pxt|pxi|pt|X|Q|f|F|W|C|T)[^;]{0,100};)"; private static final String newLine = "((?:\\\\\\\\)+|\\\\P)"; - private static final String stackFrac = "(\\\\\\\\)+|\\\\S([^/^#]{1,20})[/^#]([^;]{1,20});"; + private static final String stackFrac = "(\\\\\\\\)+|\\\\S([^/^#]{1,20})[/^#]([^;]{1,20});"; private static final String curlyBraces = "(\\\\)+[{}]|([{}])"; private static final String escapeChars = "(? * https://github.com/LibreDWG/libredwg *

      - * DWGRead outputs json which we then loop through extracting the text elements - * The required configuration is dwgReadExecutable. The other settings which can be - * overwritten are: + * DWGRead outputs json which we then loop through extracting the text elements The required + * configuration is dwgReadExecutable. The other settings which can be overwritten are: *

      - * boolean : cleanDwgReadOutput - whether to clean the json output + * boolean : cleanDwgReadOutput - whether to clean the json output *

      - * int : cleanDwgReadOutputBatchSize - clean output batch size to process + * int : cleanDwgReadOutputBatchSize - clean output batch size to process *

      * long : dwgReadTimeout -timeout in milliseconds before killing the dwgread process *

      - * String : cleanDwgReadRegexToReplace - characters to replace in the json + * String : cleanDwgReadRegexToReplace - characters to replace in the json *

      * String : cleanDwgReadReplaceWith - * replacement characters dwgReadExecutable */ @@ -96,8 +89,8 @@ public Set getSupportedTypes(ParseContext context) { } @Override - public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) - throws IOException, SAXException, TikaException { + public void parse(InputStream stream, ContentHandler handler, Metadata metadata, + ParseContext context) throws IOException, SAXException, TikaException { configure(context); DWGParserConfig dwgc = context.get(DWGParserConfig.class); @@ -109,45 +102,43 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, File tmpFileOutCleaned = Files.createTempFile(uuid + "dwgreadoutclean", ".json").toFile(); File tmpFileIn = Files.createTempFile(uuid + "dwgreadin", ".dwg").toFile(); try { - + FileUtils.copyInputStreamToFile(stream, tmpFileIn); List command = Arrays.asList(dwgc.getDwgReadExecutable(), "-O", "JSON", "-o", - tmpFileOut.getCanonicalPath(), tmpFileIn.getCanonicalPath()); + tmpFileOut.getCanonicalPath(), tmpFileIn.getCanonicalPath()); ProcessBuilder pb = new ProcessBuilder().command(command); LOG.info("About to call DWGRead: " + command.toString()); - FileProcessResult fpr = ProcessUtils.execute(pb, dwgc.getDwgReadTimeout(), 10000, 10000); + FileProcessResult fpr = + ProcessUtils.execute(pb, dwgc.getDwgReadTimeout(), 10000, 10000); LOG.info("DWGRead Exit code is: " + fpr.getExitValue()); if (fpr.getExitValue() == 0) { if (dwgc.isCleanDwgReadOutput()) { // dwgread sometimes creates strings with invalid utf-8 sequences or invalid // json (nan instead of NaN). replace them // with empty string. 
- LOG.debug("Cleaning Json Output - Replace: " + dwgc.getCleanDwgReadRegexToReplace() - + " with: " + dwgc.getCleanDwgReadReplaceWith()); - try ( BufferedReader br = new BufferedReader( - new InputStreamReader( - Files.newInputStream(tmpFileOut.toPath()), - StandardCharsets.UTF_8)); - - BufferedWriter out = new BufferedWriter( - new OutputStreamWriter( - new FileOutputStream(tmpFileOutCleaned, true), - StandardCharsets.UTF_8),32768)) - { + LOG.debug("Cleaning Json Output - Replace: " + + dwgc.getCleanDwgReadRegexToReplace() + " with: " + + dwgc.getCleanDwgReadReplaceWith()); + try (BufferedReader br = new BufferedReader( + new InputStreamReader(Files.newInputStream(tmpFileOut.toPath()), + StandardCharsets.UTF_8)); + + BufferedWriter out = new BufferedWriter(new OutputStreamWriter( + new FileOutputStream(tmpFileOutCleaned, true), + StandardCharsets.UTF_8), 32768)) { String sCurrentLine; - while ((sCurrentLine = br.readLine()) != null) - { + while ((sCurrentLine = br.readLine()) != null) { sCurrentLine = sCurrentLine - .replaceAll( dwgc.getCleanDwgReadRegexToReplace(), - dwgc.getCleanDwgReadReplaceWith()) + .replaceAll(dwgc.getCleanDwgReadRegexToReplace(), + dwgc.getCleanDwgReadReplaceWith()) .replaceAll("\\bnan\\b", " 0,") .replaceAll("\\.,", " \\. 
,") + "\n"; out.write(sCurrentLine); - } - + } + } finally { FileUtils.deleteQuietly(tmpFileIn); FileUtils.deleteQuietly(tmpFileOut); @@ -155,37 +146,38 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, } } else { - LOG.debug( - "Json wasn't cleaned, " - + "if json parsing fails consider reviewing dwgread json output to check it's valid"); + LOG.debug("Json wasn't cleaned, " + + "if json parsing fails consider reviewing dwgread json output to check it's valid"); } } else if (fpr.isTimeout()) { throw new TikaException( - "DWGRead Failed - Timeout setting exceeded current setting of " + dwgc.getDwgReadTimeout() ); - } - else { - throw new TikaException( - "DWGRead Failed - Exit Code is:" + fpr.getExitValue() + " Exe error is: " + fpr.getStderr() ); + "DWGRead Failed - Timeout setting exceeded current setting of " + + dwgc.getDwgReadTimeout()); + } else { + throw new TikaException("DWGRead Failed - Exit Code is:" + fpr.getExitValue() + + " Exe error is: " + fpr.getStderr()); } // we can't guarantee the json output is correct so we try to ignore as many // errors as we can JsonFactory jfactory = JsonFactory.builder() - .enable(JsonReadFeature.ALLOW_MISSING_VALUES, - JsonReadFeature.ALLOW_UNESCAPED_CONTROL_CHARS, - JsonReadFeature.ALLOW_BACKSLASH_ESCAPING_ANY_CHARACTER, - JsonReadFeature.ALLOW_UNQUOTED_FIELD_NAMES, - JsonReadFeature.ALLOW_TRAILING_COMMA, - JsonReadFeature.ALLOW_NON_NUMERIC_NUMBERS, - JsonReadFeature.ALLOW_LEADING_ZEROS_FOR_NUMBERS) - .build(); + .enable(JsonReadFeature.ALLOW_MISSING_VALUES, + JsonReadFeature.ALLOW_UNESCAPED_CONTROL_CHARS, + JsonReadFeature.ALLOW_BACKSLASH_ESCAPING_ANY_CHARACTER, + JsonReadFeature.ALLOW_UNQUOTED_FIELD_NAMES, + JsonReadFeature.ALLOW_TRAILING_COMMA, + JsonReadFeature.ALLOW_NON_NUMERIC_NUMBERS, + JsonReadFeature.ALLOW_LEADING_ZEROS_FOR_NUMBERS) + .build(); JsonParser jParser; try { jParser = jfactory.createParser(tmpFileOut); } catch (JsonParseException e1) { - throw new 
TikaException("Failed to parse Json: " + ExceptionUtils.getStackTrace(e1)); + throw new TikaException( + "Failed to parse Json: " + ExceptionUtils.getStackTrace(e1)); } catch (IOException e1) { - throw new TikaException("Failed to read json file: " + ExceptionUtils.getStackTrace(e1)); + throw new TikaException( + "Failed to read json file: " + ExceptionUtils.getStackTrace(e1)); } // read json token in a stream using jackson, iterate over each token. We only // support OBJECTS, FILEHEADER and SummaryInfo @@ -205,10 +197,12 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, parseDwgObject(jParser, (nextTextValue) -> { try { - xhtml.characters(dwgReadFormatRemover.cleanupDwgString(nextTextValue)); + xhtml.characters(dwgReadFormatRemover + .cleanupDwgString(nextTextValue)); xhtml.newline(); } catch (SAXException e) { - LOG.error("Could not write next text value {} to xhtml stream", nextTextValue); + LOG.error("Could not write next text value {} to xhtml stream", + nextTextValue); } }); } @@ -233,7 +227,8 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, xhtml.endDocument(); } - private void parseDwgObject(JsonParser jsonParser, Consumer textConsumer) throws IOException { + private void parseDwgObject(JsonParser jsonParser, Consumer textConsumer) + throws IOException { JsonToken nextToken; while ((nextToken = jsonParser.nextToken()) != JsonToken.END_OBJECT) { if (nextToken == JsonToken.FIELD_NAME) { @@ -284,7 +279,8 @@ private void parseSummaryInfo(JsonParser jsonParser, Metadata metadata) throws I nextToken = jsonParser.nextToken(); if (nextToken.isStructStart()) { if ("TDCREATE".equals(nextFieldName) || "TDUPDATE".equals(nextFieldName)) { - // timestamps are represented by an integer array of format with 2 values in the + // timestamps are represented by an integer array of format with 2 values in + // the // array: // [julianDate, millisecondOfDay] jsonParser.nextToken(); // start array diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/JulianDateUtil.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/JulianDateUtil.java index 522df0883a..b8e9c59935 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/JulianDateUtil.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/JulianDateUtil.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
+ * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.dwg; @@ -24,7 +22,8 @@ class JulianDateUtil { private static final double NANOS_PER_DAY = 24.0 * 60.0 * 60.0 * 1000000000.0; - public static final Instant REDUCED_JD = ZonedDateTime.of(1858, 11, 16, 12, 0, 0, 0, ZoneOffset.UTC).toInstant(); + public static final Instant REDUCED_JD = + ZonedDateTime.of(1858, 11, 16, 12, 0, 0, 0, ZoneOffset.UTC).toInstant(); public static final Instant JULIAN_DATE = REDUCED_JD.minus(2400000, ChronoUnit.DAYS); private final Instant epoch; @@ -41,7 +40,7 @@ private Instant toInstant(double day) { public static Instant toInstant(long julianDay, long millisecondsIntoDay) { return new JulianDateUtil(JulianDateUtil.JULIAN_DATE) - .toInstant(Double.parseDouble(julianDay + "." + millisecondsIntoDay)); - + .toInstant(Double.parseDouble(julianDay + "." + millisecondsIntoDay)); + } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/prt/PRTParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/prt/PRTParser.java index 4e978329e1..e86f7fda64 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/prt/PRTParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/prt/PRTParser.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.parser.prt; @@ -23,11 +21,7 @@ import java.io.UnsupportedEncodingException; import java.util.Collections; import java.util.Set; - import org.apache.commons.io.IOUtils; -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.exception.TikaException; import org.apache.tika.io.EndianUtils; import org.apache.tika.metadata.Metadata; @@ -36,10 +30,12 @@ import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; /** - * A basic text extracting parser for the CADKey PRT (CAD Drawing) - * format. It outputs text from note entries. + * A basic text extracting parser for the CADKey PRT (CAD Drawing) format. It outputs text from note + * entries. */ public class PRTParser implements Parser { @@ -50,10 +46,10 @@ public class PRTParser implements Parser { */ private static final long serialVersionUID = 4659638314375035178L; private static final Set SUPPORTED_TYPES = - Collections.singleton(MediaType.application("x-prt")); + Collections.singleton(MediaType.application("x-prt")); /** - * How long do we allow a text run to claim to be, before we - * decide we're confused and it's not really text after all? + * How long do we allow a text run to claim to be, before we decide we're confused and it's not + * really text after all? 
*/ private static final int MAX_TEXT_LENGTH = 0x0800; @@ -62,17 +58,15 @@ public Set getSupportedTypes(ParseContext context) { } /* - * Text types: - * 00 00 00 00 f0 [3b]f sz sz TEXT *view name* - * 00 00 00 00 f0 3f 00 00 00 00 00 00 00 00 sz sz TEXT *view name* - * (anything) e0 3f sz sz TEXT *view name* - * 3x 33 33 33 33 33 e3 3f 0x 00 00 0x 00 00 0x 0x 1f sz sz TEXT *note entries* + * Text types: 00 00 00 00 f0 [3b]f sz sz TEXT *view name* 00 00 00 00 f0 3f 00 00 00 00 00 00 + * 00 00 sz sz TEXT *view name* (anything) e0 3f sz sz TEXT *view name* 3x 33 33 33 33 33 e3 3f + * 0x 00 00 0x 00 00 0x 0x 1f sz sz TEXT *note entries* * - * Note - all text is null terminated + * Note - all text is null terminated */ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, - ParseContext context) throws IOException, SAXException, TikaException { + ParseContext context) throws IOException, SAXException, TikaException { XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); Last5 l5 = new Last5(); @@ -86,9 +80,9 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, String dateStr = new String(date, US_ASCII); if (dateStr.startsWith("19") || dateStr.startsWith("20")) { - String formattedDate = dateStr.substring(0, 4) + "-" + dateStr.substring(4, 6) + "-" + - dateStr.substring(6, 8) + "T" + dateStr.substring(8, 10) + ":" + - dateStr.substring(10, 12) + ":00"; + String formattedDate = dateStr.substring(0, 4) + "-" + dateStr.substring(4, 6) + "-" + + dateStr.substring(6, 8) + "T" + dateStr.substring(8, 10) + ":" + + dateStr.substring(10, 12) + ":00"; metadata.set(TikaCoreProperties.CREATED, formattedDate); // TODO Metadata.DATE is used as modified, should it be here? 
metadata.set(TikaCoreProperties.CREATED, formattedDate); @@ -126,7 +120,7 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, } private void handleNoteText(InputStream stream, XHTMLContentHandler xhtml) - throws IOException, SAXException, TikaException { + throws IOException, SAXException, TikaException { // Ensure we have the right padding text int read; for (int i = 0; i < 10; i++) { @@ -152,7 +146,7 @@ private void handleNoteText(InputStream stream, XHTMLContentHandler xhtml) } private void handleViewName(int typeA, int typeB, InputStream stream, XHTMLContentHandler xhtml, - Last5 l5) throws IOException, SAXException, TikaException { + Last5 l5) throws IOException, SAXException, TikaException { // Is it 8 byte zero padded? int maybeLength = EndianUtils.readUShortLE(stream); if (maybeLength == 0) { @@ -185,7 +179,7 @@ private void handleViewName(int typeA, int typeB, InputStream stream, XHTMLConte } private void handleText(int length, InputStream stream, XHTMLContentHandler xhtml) - throws IOException, SAXException, TikaException { + throws IOException, SAXException, TikaException { byte[] str = new byte[length]; IOUtils.readFully(stream, str); if (str[length - 1] != 0) { @@ -205,7 +199,7 @@ private void handleText(int length, InputStream stream, XHTMLContentHandler xhtm */ private String extractText(byte[] data, boolean trim) throws TikaException { // The text is always stored null terminated, but sometimes - // may have extra null padding too + // may have extra null padding too int length = data.length - 1; if (trim) { for (int i = 0; i < data.length; i++) { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/dgn/DGN8ParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/dgn/DGN8ParserTest.java index 89a610dfb9..3939d99eaa 100644 --- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/dgn/DGN8ParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/dgn/DGN8ParserTest.java @@ -1,31 +1,27 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.parser.dgn; import static org.junit.jupiter.api.Assertions.assertEquals; import java.util.Arrays; - -import org.junit.jupiter.api.Test; - import org.apache.tika.TikaTest; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.OfficeOpenXMLExtended; import org.apache.tika.metadata.TikaCoreProperties; +import org.junit.jupiter.api.Test; public class DGN8ParserTest extends TikaTest { /** @@ -37,7 +33,7 @@ public void testBasics() throws Exception { assertEquals("John.Frampton", metadata.get(TikaCoreProperties.MODIFIER)); assertEquals("MicroStation v8.11.0.0", metadata.get(OfficeOpenXMLExtended.APPLICATION)); assertContains("org.apache.tika.parser.dgn.DGN8Parser", - Arrays.asList(metadata.getValues(TikaCoreProperties.TIKA_PARSED_BY))); + Arrays.asList(metadata.getValues(TikaCoreProperties.TIKA_PARSED_BY))); } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/dwg/DWGParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/dwg/DWGParserTest.java index 077e7700f0..83dc331ae0 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/dwg/DWGParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/dwg/DWGParserTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.parser.dwg; @@ -29,11 +27,6 @@ import java.nio.file.Files; import java.nio.file.Paths; import java.util.Arrays; - -import org.junit.jupiter.api.Test; -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.TikaTest; import org.apache.tika.config.TikaConfig; import org.apache.tika.exception.TikaException; @@ -44,9 +37,12 @@ import org.apache.tika.parser.external.ExternalParser; import org.apache.tika.sax.BodyContentHandler; import org.apache.tika.utils.StringUtils; +import org.junit.jupiter.api.Test; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; public class DWGParserTest extends TikaTest { - public boolean canRun(DWGParser parser) { + public boolean canRun(DWGParser parser) { String dwgRead = parser.getDwgReadExecutable(); if (!StringUtils.isBlank(dwgRead) && !Files.isRegularFile(Paths.get(dwgRead))) { @@ -54,42 +50,43 @@ public boolean canRun(DWGParser parser) { } // Try running DWGRead from there, and see if it exists + works - String[] checkCmd = { dwgRead }; + String[] checkCmd = {dwgRead}; return ExternalParser.check(checkCmd); } + @Test public void testDWG2000Parser() throws Exception { InputStream input = - DWGParserTest.class.getResourceAsStream("/test-documents/testDWG2000.dwg"); + DWGParserTest.class.getResourceAsStream("/test-documents/testDWG2000.dwg"); testParserAlt(input); } @Test public void testDWG2004Parser() throws Exception { InputStream input = - DWGParserTest.class.getResourceAsStream("/test-documents/testDWG2004.dwg"); + DWGParserTest.class.getResourceAsStream("/test-documents/testDWG2004.dwg"); testParser(input); } @Test public void testDWG2004ParserNoHeaderAddress() throws Exception { InputStream input = DWGParserTest.class - .getResourceAsStream("/test-documents/testDWG2004_no_header.dwg"); + .getResourceAsStream("/test-documents/testDWG2004_no_header.dwg"); testParserNoHeader(input); } @Test public void testDWG2007Parser() throws Exception { 
InputStream input = - DWGParserTest.class.getResourceAsStream("/test-documents/testDWG2007.dwg"); + DWGParserTest.class.getResourceAsStream("/test-documents/testDWG2007.dwg"); testParser(input); } @Test public void testDWG2010Parser() throws Exception { InputStream input = - DWGParserTest.class.getResourceAsStream("/test-documents/testDWG2010.dwg"); + DWGParserTest.class.getResourceAsStream("/test-documents/testDWG2010.dwg"); testParser(input); } @@ -97,31 +94,30 @@ public void testDWG2010Parser() throws Exception { public void testDWG2010CustomPropertiesParser() throws Exception { // Check that standard parsing works InputStream testInput = DWGParserTest.class - .getResourceAsStream("/test-documents/testDWG2010_custom_props.dwg"); + .getResourceAsStream("/test-documents/testDWG2010_custom_props.dwg"); testParser(testInput); // Check that custom properties with alternate padding work try (InputStream input = DWGParserTest.class - .getResourceAsStream("/test-documents/testDWG2010_custom_props.dwg")) { + .getResourceAsStream("/test-documents/testDWG2010_custom_props.dwg")) { Metadata metadata = new Metadata(); ContentHandler handler = new BodyContentHandler(); - new DWGParser().parse(input, handler, metadata,new ParseContext()); + new DWGParser().parse(input, handler, metadata, new ParseContext()); assertEquals("valueforcustomprop1", - metadata.get(DWGParser.DWG_CUSTOM_META_PREFIX + "customprop1")); + metadata.get(DWGParser.DWG_CUSTOM_META_PREFIX + "customprop1")); assertEquals("valueforcustomprop2", - metadata.get(DWGParser.DWG_CUSTOM_META_PREFIX + "customprop2")); + metadata.get(DWGParser.DWG_CUSTOM_META_PREFIX + "customprop2")); } } @Test public void testDWGMechParser() throws Exception { - String[] types = - new String[]{"6", "2004", "2004DX", "2005", "2006", "2007", "2008", "2009", "2010", - "2011"}; + String[] types = new String[] {"6", "2004", "2004DX", "2005", "2006", "2007", "2008", + "2009", "2010", "2011"}; for (String type : types) { InputStream input = 
DWGParserTest.class - .getResourceAsStream("/test-documents/testDWGmech" + type + ".dwg"); + .getResourceAsStream("/test-documents/testDWGmech" + type + ".dwg"); testParserAlt(input); } } @@ -131,17 +127,17 @@ private void testParser(InputStream input) throws Exception { try { Metadata metadata = new Metadata(); ContentHandler handler = new BodyContentHandler(); - new DWGParser().parse(input, handler, metadata,new ParseContext()); + new DWGParser().parse(input, handler, metadata, new ParseContext()); assertEquals("image/vnd.dwg", metadata.get(Metadata.CONTENT_TYPE)); assertEquals("The quick brown fox jumps over the lazy dog", - metadata.get(TikaCoreProperties.TITLE)); + metadata.get(TikaCoreProperties.TITLE)); assertEquals("Gym class featuring a brown fox and lazy dog", - metadata.get(TikaCoreProperties.DESCRIPTION)); + metadata.get(TikaCoreProperties.DESCRIPTION)); assertEquals("Nevin Nollop", metadata.get(TikaCoreProperties.CREATOR)); assertContains("Pangram, fox, dog", - Arrays.asList(metadata.getValues(TikaCoreProperties.SUBJECT))); + Arrays.asList(metadata.getValues(TikaCoreProperties.SUBJECT))); assertEquals("Lorem ipsum", metadata.get(TikaCoreProperties.COMMENTS).substring(0, 11)); assertEquals("http://www.alfresco.com", metadata.get(TikaCoreProperties.RELATION)); @@ -159,7 +155,7 @@ private void testParserNoHeader(InputStream input) throws Exception { try { Metadata metadata = new Metadata(); ContentHandler handler = new BodyContentHandler(); - new DWGParser().parse(input, handler, metadata,new ParseContext()); + new DWGParser().parse(input, handler, metadata, new ParseContext()); assertEquals("image/vnd.dwg", metadata.get(Metadata.CONTENT_TYPE)); @@ -193,7 +189,7 @@ private void testParserAlt(InputStream input) throws Exception { assertEquals("bejanpol", metadata.get(TikaCoreProperties.MODIFIER)); assertEquals("http://mycompany/drawings", metadata.get(TikaCoreProperties.RELATION)); assertEquals("MyCustomPropertyValue", - 
metadata.get(DWGParser.DWG_CUSTOM_META_PREFIX + "MyCustomProperty")); + metadata.get(DWGParser.DWG_CUSTOM_META_PREFIX + "MyCustomProperty")); String content = handler.toString(); assertContains("This is a comment", content); @@ -215,16 +211,16 @@ public void testAC1032() throws Exception { assertEquals("jlakshvi", metadata.get(TikaCoreProperties.MODIFIER)); assertEquals("CUSTOMER'S ADDRESS", metadata.get("dwg-custom:CUSTOMER'S ADDRESS")); } + @Test public void testDWGReadexe() throws Exception { InputStream stream = getResourceAsStream("/test-configs/tika-config-dwgRead.xml"); - DWGParser parser = - (DWGParser) ((CompositeParser) new TikaConfig(stream).getParser()) + DWGParser parser = (DWGParser) ((CompositeParser) new TikaConfig(stream).getParser()) .getAllComponentParsers().get(0); assumeTrue(canRun(parser), "Can't run DWGRead.exe"); String output = getText("architectural_-_annotation_scaling_and_multileaders.dwg", parser); - assertContains("ELEV. 11'-9\" TOP OF SECOND FLR.",output); + assertContains("ELEV. 
11'-9\" TOP OF SECOND FLR.", output); } @Test @@ -232,13 +228,12 @@ public void testDWGReadtimeout() throws TikaException, IOException, SAXException InputStream stream = getResourceAsStream("/test-configs/tika-config-dwgRead-Timeout.xml"); DWGParser parser = (DWGParser) ((CompositeParser) new TikaConfig(stream).getParser()) - .getAllComponentParsers().get(0); + .getAllComponentParsers().get(0); assumeTrue(canRun(parser), "Can't run DWGRead.exe"); - TikaException thrown = assertThrows( - TikaException.class, - () -> getText("architectural_-_annotation_scaling_and_multileaders.dwg", parser), - "Expected getText() to throw TikaException but it failed" - ); + TikaException thrown = assertThrows(TikaException.class, + () -> getText("architectural_-_annotation_scaling_and_multileaders.dwg", + parser), + "Expected getText() to throw TikaException but it failed"); assertTrue(thrown.getMessage().contains("Timeout setting exceeded current setting of")); } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/dwg/DWGReadFormatRemoverTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/dwg/DWGReadFormatRemoverTest.java index d570a6f6d9..d41fb0b92b 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/dwg/DWGReadFormatRemoverTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/dwg/DWGReadFormatRemoverTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.parser.dwg; @@ -23,22 +21,24 @@ public class DWGReadFormatRemoverTest { @Test - public void testBasic() { - String formatted = "\\A1;\\fAIGDT|b0|i0;\\H2.5000;\\ln\\fArial|b0|i0;\\H2.5000;68{\\H1.3;\\S+0,8^+0,1;}"; + public void testBasic() { + String formatted = + "\\A1;\\fAIGDT|b0|i0;\\H2.5000;\\ln\\fArial|b0|i0;\\H2.5000;68{\\H1.3;\\S+0,8^+0,1;}"; DWGReadFormatRemover dwgReadFormatter = new DWGReadFormatRemover(); String expected = "n68+0,8/+0,1"; assertEquals(expected, dwgReadFormatter.cleanupDwgString(formatted)); } @Test - public void testParameterizables() { + public void testParameterizables() { String formatted = "the quick \\A1;\\fAIGDT|b0|i0;\\H2.5000; brown fox"; DWGReadFormatRemover dwgReadFormatter = new DWGReadFormatRemover(); String expected = "the quick brown fox"; assertEquals(expected, dwgReadFormatter.cleanupDwgString(formatted)); } + @Test - public void testEscapedSlashes() { + public void testEscapedSlashes() { String formatted = "the quick \\\\ \\A3;\\fAIGDT|b0|i0;\\H2.5000;brown fox"; DWGReadFormatRemover dwgReadFormatter = new DWGReadFormatRemover(); String expected = "the quick \\ brown fox"; @@ -46,16 +46,16 @@ public void testEscapedSlashes() { } @Test - public void testUnderlineEtc() { - String formatted = "l \\L open cu\\lrly bra\\Kck\\ket \\{ and a close " + - "\\} right?"; + public void testUnderlineEtc() { + String formatted = "l \\L open cu\\lrly bra\\Kck\\ket \\{ and a close " + "\\} right?"; DWGReadFormatRemover dwgReadFormatter = new DWGReadFormatRemover(); String expected = "l open curly bracket { and a close } right?"; assertEquals(expected, dwgReadFormatter.cleanupDwgString(formatted)); } + @Test - public void testEscaped() { + public void testEscaped() { String formatted = "then an actual \\P open curly bracket \\{ and a close \\} right?"; DWGReadFormatRemover dwgReadFormatter = new DWGReadFormatRemover(); String expected = "then an actual \n open curly bracket { and a close } right?"; @@ -63,7 
+63,7 @@ public void testEscaped() { } @Test - public void testStackedFractions() { + public void testStackedFractions() { String formatted = "abc \\S+0,8^+0,1; efg"; DWGReadFormatRemover dwgReadFormatter = new DWGReadFormatRemover(); String expected = "abc +0,8/+0,1 efg"; diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/prt/PRTParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/prt/PRTParserTest.java index d1243fd722..769f089b6e 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/prt/PRTParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/prt/PRTParserTest.java @@ -1,33 +1,29 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.prt; import static org.junit.jupiter.api.Assertions.assertEquals; import java.io.InputStream; - -import org.junit.jupiter.api.Test; -import org.xml.sax.ContentHandler; - import org.apache.tika.TikaTest; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.parser.ParseContext; import org.apache.tika.sax.BodyContentHandler; +import org.junit.jupiter.api.Test; +import org.xml.sax.ContentHandler; public class PRTParserTest extends TikaTest { /** @@ -55,7 +51,7 @@ public void testPRTParserBasics() throws Exception { assertContains("Bottom View", contents); assertContains("Right View", contents); assertContains("Left View", contents); - //assertContains("Isometric View", contents); // Can't detect yet + // assertContains("Isometric View", contents); // Can't detect yet assertContains("Axonometric View", contents); assertContains("You've managed to extract all the text!", contents); @@ -79,7 +75,7 @@ public void testPRTParserComplex() throws Exception { // File has both a date and a description assertEquals("1997-04-01T08:59:00", metadata.get(TikaCoreProperties.CREATED)); assertEquals("TIKA TEST PART 
DESCRIPTION INFORMATION\r\n", - metadata.get(TikaCoreProperties.DESCRIPTION)); + metadata.get(TikaCoreProperties.DESCRIPTION)); String contents = handler.toString(); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/asm/ClassParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/asm/ClassParser.java index 1b42334a23..060834a39c 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/asm/ClassParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/asm/ClassParser.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.asm; @@ -20,15 +18,13 @@ import java.io.InputStream; import java.util.Collections; import java.util.Set; - -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; /** * Parser for Java .class files. @@ -41,14 +37,14 @@ public class ClassParser implements Parser { private static final long serialVersionUID = -3531388963354454357L; private static final Set SUPPORTED_TYPES = - Collections.singleton(MediaType.application("java-vm")); + Collections.singleton(MediaType.application("java-vm")); public Set getSupportedTypes(ParseContext context) { return SUPPORTED_TYPES; } public void parse(InputStream stream, ContentHandler handler, Metadata metadata, - ParseContext context) throws IOException, SAXException, TikaException { + ParseContext context) throws IOException, SAXException, TikaException { new XHTMLClassVisitor(handler, metadata).parse(stream); } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/asm/XHTMLClassVisitor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/asm/XHTMLClassVisitor.java index b78d753863..1d98f8cc90 100644 --- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/asm/XHTMLClassVisitor.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/asm/XHTMLClassVisitor.java @@ -1,24 +1,27 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.parser.asm; import java.io.IOException; import java.io.InputStream; - +import org.apache.tika.exception.RuntimeSAXException; +import org.apache.tika.exception.TikaException; +import org.apache.tika.exception.WriteLimitReachedException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.sax.XHTMLContentHandler; import org.objectweb.asm.AnnotationVisitor; import org.objectweb.asm.Attribute; import org.objectweb.asm.ClassReader; @@ -30,16 +33,8 @@ import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; -import org.apache.tika.exception.RuntimeSAXException; -import org.apache.tika.exception.TikaException; -import org.apache.tika.exception.WriteLimitReachedException; -import org.apache.tika.metadata.Metadata; -import org.apache.tika.metadata.TikaCoreProperties; -import org.apache.tika.sax.XHTMLContentHandler; - /** - * Class visitor that generates XHTML SAX events to describe the - * contents of the visited class. + * Class visitor that generates XHTML SAX events to describe the contents of the visited class. */ class XHTMLClassVisitor extends ClassVisitor { @@ -72,7 +67,7 @@ public void parse(InputStream stream) throws TikaException, SAXException, IOExce } public void visit(int version, int access, String name, String signature, String superName, - String[] interfaces) { + String[] interfaces) { type = Type.getObjectType(name); String className = type.getClassName(); @@ -154,14 +149,12 @@ public void visitEnd() { /** * Ignored. */ - public void visitOuterClass(String owner, String name, String desc) { - } + public void visitOuterClass(String owner, String name, String desc) {} /** * Ignored. */ - public void visitSource(String source, String debug) { - } + public void visitSource(String source, String debug) {} /** * Ignored. @@ -173,20 +166,18 @@ public AnnotationVisitor visitAnnotation(String desc, boolean visible) { /** * Ignored. 
*/ - public void visitAttribute(Attribute attr) { - } + public void visitAttribute(Attribute attr) {} /** * Ignored. */ - public void visitInnerClass(String name, String outerName, String innerName, int access) { - } + public void visitInnerClass(String name, String outerName, String innerName, int access) {} /** * Visits a field. */ public FieldVisitor visitField(int access, String name, String desc, String signature, - Object value) { + Object value) { if (!isSet(access, Opcodes.ACC_SYNTHETIC)) { try { xhtml.characters(" "); @@ -214,7 +205,7 @@ public FieldVisitor visitField(int access, String name, String desc, String sign * Visits a method. */ public MethodVisitor visitMethod(int access, String name, String desc, String signature, - String[] exceptions) { + String[] exceptions) { if (!isSet(access, Opcodes.ACC_SYNTHETIC)) { try { xhtml.characters(" "); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/code/SourceCodeParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/code/SourceCodeParser.java index c11f20d368..9f235f30c7 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/code/SourceCodeParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/code/SourceCodeParser.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.parser.code; @@ -30,8 +28,16 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; import javax.xml.XMLConstants; - import org.apache.commons.io.input.CloseShieldInputStream; +import org.apache.tika.detect.AutoDetectReader; +import org.apache.tika.detect.EncodingDetector; +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AbstractEncodingDetectorParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.XHTMLContentHandler; import org.codelibs.jhighlight.renderer.Renderer; import org.codelibs.jhighlight.renderer.XhtmlRendererFactory; import org.jsoup.Jsoup; @@ -46,19 +52,9 @@ import org.xml.sax.SAXException; import org.xml.sax.helpers.AttributesImpl; -import org.apache.tika.detect.AutoDetectReader; -import org.apache.tika.detect.EncodingDetector; -import org.apache.tika.exception.TikaException; -import org.apache.tika.metadata.Metadata; -import org.apache.tika.metadata.TikaCoreProperties; -import org.apache.tika.mime.MediaType; -import org.apache.tika.parser.AbstractEncodingDetectorParser; -import org.apache.tika.parser.ParseContext; -import org.apache.tika.sax.XHTMLContentHandler; - /** - * Generic Source code parser for Java, Groovy, C++. - * Aware: This parser uses JHightlight library (https://github.com/codelibs/jhighlight) under CDDL/LGPL dual license + * Generic Source code parser for Java, Groovy, C++. 
Aware: This parser uses JHightlight library + * (https://github.com/codelibs/jhighlight) under CDDL/LGPL dual license * * @author Hong-Thai.Nguyen * @since 1.6 @@ -69,15 +65,16 @@ public class SourceCodeParser extends AbstractEncodingDetectorParser { private static final Pattern AUTHORPATTERN = Pattern.compile("(?im)@author (.*) *$"); - private static final Map TYPES_TO_RENDERER = new HashMap() { - private static final long serialVersionUID = -741976157563751152L; + private static final Map TYPES_TO_RENDERER = + new HashMap() { + private static final long serialVersionUID = -741976157563751152L; - { - put(MediaType.text("x-c++src"), CPP); - put(MediaType.text("x-java-source"), JAVA); - put(MediaType.text("x-groovy"), GROOVY); - } - }; + { + put(MediaType.text("x-c++src"), CPP); + put(MediaType.text("x-java-source"), JAVA); + put(MediaType.text("x-groovy"), GROOVY); + } + }; public SourceCodeParser() { super(); @@ -93,10 +90,10 @@ public Set getSupportedTypes(ParseContext context) { } @Override - public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) - throws IOException, SAXException, TikaException { + public void parse(InputStream stream, ContentHandler handler, Metadata metadata, + ParseContext context) throws IOException, SAXException, TikaException { try (AutoDetectReader reader = new AutoDetectReader(CloseShieldInputStream.wrap(stream), - metadata, getEncodingDetector(context))) { + metadata, getEncodingDetector(context))) { Charset charset = reader.getCharset(); String mediaType = metadata.get(Metadata.CONTENT_TYPE); String name = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY); @@ -112,9 +109,7 @@ metadata, getEncodingDetector(context))) { String line; int nbLines = 0; while ((line = reader.readLine()) != null) { - out - .append(line) - .append(System.getProperty("line.separator")); + out.append(line).append(System.getProperty("line.separator")); String author = parserAuthor(line); if (author != null) { 
metadata.add(TikaCoreProperties.CREATOR, author); @@ -152,9 +147,7 @@ private Renderer getRenderer(String mimeType) throws TikaException { private String parserAuthor(String line) { Matcher m = AUTHORPATTERN.matcher(line); if (m.find()) { - return m - .group(1) - .trim(); + return m.group(1).trim(); } return null; @@ -170,7 +163,7 @@ private TikaNodeFilter(ContentHandler handler) { @Override public NodeFilter.FilterResult head(Node node, int i) { - //skip document fragment + // skip document fragment if ("html".equals(node.nodeName())) { ignore = false; } @@ -191,8 +184,8 @@ public NodeFilter.FilterResult head(Node node, int i) { } return NodeFilter.FilterResult.CONTINUE; } else if (node instanceof DataNode) { - //maybe handle script data directly here instead of - //passing it through to the HTMLHandler? + // maybe handle script data directly here instead of + // passing it through to the HTMLHandler? String txt = ((DataNode) node).getWholeData(); if (txt != null) { char[] chars = txt.toCharArray(); @@ -207,12 +200,11 @@ public NodeFilter.FilterResult head(Node node, int i) { return NodeFilter.FilterResult.CONTINUE; } AttributesImpl attributes = new AttributesImpl(); - Iterator jsoupAttrs = node - .attributes() - .iterator(); + Iterator jsoupAttrs = node.attributes().iterator(); while (jsoupAttrs.hasNext()) { Attribute jsoupAttr = jsoupAttrs.next(); - attributes.addAttribute("", jsoupAttr.getKey(), jsoupAttr.getKey(), "", jsoupAttr.getValue()); + attributes.addAttribute("", jsoupAttr.getKey(), jsoupAttr.getKey(), "", + jsoupAttr.getValue()); } try { handler.startElement("", node.nodeName(), node.nodeName(), attributes); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/executable/ExecutableParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/executable/ExecutableParser.java index 
e8ed04aebc..458aadcd5e 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/executable/ExecutableParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/executable/ExecutableParser.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.parser.executable; @@ -23,11 +21,7 @@ import java.util.Collections; import java.util.HashSet; import java.util.Set; - import org.apache.commons.io.IOUtils; -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.exception.TikaException; import org.apache.tika.io.EndianUtils; import org.apache.tika.metadata.MachineMetadata; @@ -37,6 +31,8 @@ import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; /** * Parser for executable files. Currently supports ELF and PE @@ -64,23 +60,22 @@ public class ExecutableParser implements Parser, MachineMetadata { private static final MediaType MACH_O_BUNDLE = MediaType.application("x-mach-o-bundle"); private static final MediaType MACH_O_DYLIB_STUB = MediaType.application("x-mach-o-dylib-stub"); private static final MediaType MACH_O_DSYM = MediaType.application("x-mach-o-dsym"); - private static final MediaType MACH_O_KEXT_BUNDLE = MediaType.application( - "x-mach-o-kext-bundle"); + private static final MediaType MACH_O_KEXT_BUNDLE = + MediaType.application("x-mach-o-kext-bundle"); - private static final Set SUPPORTED_TYPES = Collections.unmodifiableSet( - new HashSet<>( - Arrays.asList(PE_EXE, ELF_GENERAL, ELF_OBJECT, ELF_EXECUTABLE, ELF_SHAREDLIB, - ELF_COREDUMP, MACH_O, MACH_O_OBJECT, MACH_O_EXECUTABLE, - MACH_O_FVMLIB, MACH_O_CORE, MACH_O_PRELOAD, MACH_O_DYLIB, - MACH_O_DYLINKER, MACH_O_BUNDLE, MACH_O_DYLIB_STUB, MACH_O_DSYM, - MACH_O_KEXT_BUNDLE))); + private static final Set SUPPORTED_TYPES = + Collections.unmodifiableSet(new HashSet<>(Arrays.asList(PE_EXE, ELF_GENERAL, + ELF_OBJECT, ELF_EXECUTABLE, ELF_SHAREDLIB, ELF_COREDUMP, MACH_O, + MACH_O_OBJECT, MACH_O_EXECUTABLE, MACH_O_FVMLIB, MACH_O_CORE, + MACH_O_PRELOAD, MACH_O_DYLIB, MACH_O_DYLINKER, MACH_O_BUNDLE, + MACH_O_DYLIB_STUB, MACH_O_DSYM, MACH_O_KEXT_BUNDLE))); 
public Set getSupportedTypes(ParseContext context) { return SUPPORTED_TYPES; } public void parse(InputStream stream, ContentHandler handler, Metadata metadata, - ParseContext context) throws IOException, SAXException, TikaException { + ParseContext context) throws IOException, SAXException, TikaException { // We only do metadata, for now XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); xhtml.startDocument(); @@ -90,15 +85,15 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, if (first4[0] == (byte) 'M' && first4[1] == (byte) 'Z') { parsePE(xhtml, metadata, stream, first4); - } else if (first4[0] == (byte) 0x7f && first4[1] == (byte) 'E' && first4[2] == (byte) 'L' && - first4[3] == (byte) 'F') { + } else if (first4[0] == (byte) 0x7f && first4[1] == (byte) 'E' && first4[2] == (byte) 'L' + && first4[3] == (byte) 'F') { parseELF(xhtml, metadata, stream, first4); - } else if ((first4[0] == (byte) 0xCF || first4[0] == (byte) 0xCE) && - first4[1] == (byte) 0xFA && first4[2] == (byte) 0xED && first4[3] == (byte) 0xFE) { + } else if ((first4[0] == (byte) 0xCF || first4[0] == (byte) 0xCE) + && first4[1] == (byte) 0xFA && first4[2] == (byte) 0xED + && first4[3] == (byte) 0xFE) { parseMachO(xhtml, metadata, stream, first4); - } else if (first4[0] == (byte) 0xFE && first4[1] == (byte) 0xED && - first4[2] == (byte) 0xFA && - (first4[3] == (byte) 0xCF || first4[3] == (byte) 0xCE)) { + } else if (first4[0] == (byte) 0xFE && first4[1] == (byte) 0xED && first4[2] == (byte) 0xFA + && (first4[3] == (byte) 0xCF || first4[3] == (byte) 0xCE)) { parseMachO(xhtml, metadata, stream, first4); } @@ -111,7 +106,7 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, * Parses a DOS or Windows PE file */ public void parsePE(XHTMLContentHandler xhtml, Metadata metadata, InputStream stream, - byte[] first4) throws TikaException, IOException { + byte[] first4) throws TikaException, IOException { 
metadata.set(Metadata.CONTENT_TYPE, PE_EXE.toString()); metadata.set(PLATFORM, PLATFORM_WINDOWS); @@ -128,7 +123,7 @@ public void parsePE(XHTMLContentHandler xhtml, Metadata metadata, InputStream st } // Skip the rest of the MS-DOS stub (if PE), until we reach what should - // be the PE header (if this is a PE executable) + // be the PE header (if this is a PE executable) stream.skip(peOffset - 0x40); // Read the PE header @@ -257,7 +252,7 @@ public void parsePE(XHTMLContentHandler xhtml, Metadata metadata, InputStream st * Parses a Unix ELF file */ public void parseELF(XHTMLContentHandler xhtml, Metadata metadata, InputStream stream, - byte[] first4) throws TikaException, IOException { + byte[] first4) throws TikaException, IOException { // Byte 5 is the architecture int architecture = stream.read(); if (architecture == 1) { @@ -433,7 +428,7 @@ public void parseELF(XHTMLContentHandler xhtml, Metadata metadata, InputStream s * Parses a Mach-O file */ public void parseMachO(XHTMLContentHandler xhtml, Metadata metadata, InputStream stream, - byte[] first4) throws TikaException, IOException { + byte[] first4) throws TikaException, IOException { var isLE = first4[3] == (byte) 0xFE; if (isLE) { metadata.set(ENDIAN, Endian.LITTLE.getName()); @@ -442,9 +437,7 @@ public void parseMachO(XHTMLContentHandler xhtml, Metadata metadata, InputStream } // Bytes 5-8 are the CPU type and architecture bits - var cpuType = isLE - ? EndianUtils.readIntLE(stream) - : EndianUtils.readIntBE(stream); + var cpuType = isLE ? EndianUtils.readIntLE(stream) : EndianUtils.readIntBE(stream); if ((cpuType >> 24) == 1) { metadata.set(ARCHITECTURE_BITS, "64"); } @@ -480,14 +473,10 @@ public void parseMachO(XHTMLContentHandler xhtml, Metadata metadata, InputStream } // Bytes 9-12 are the CPU subtype - var cpuSubtype = isLE - ? EndianUtils.readIntLE(stream) - : EndianUtils.readIntBE(stream); + var cpuSubtype = isLE ? 
EndianUtils.readIntLE(stream) : EndianUtils.readIntBE(stream); // Bytes 13-16 are the file type - var fileType = isLE - ? EndianUtils.readIntLE(stream) - : EndianUtils.readIntBE(stream); + var fileType = isLE ? EndianUtils.readIntLE(stream) : EndianUtils.readIntBE(stream); switch (fileType) { case 0x1: metadata.set(Metadata.CONTENT_TYPE, MACH_O_OBJECT.toString()); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/executable/UniversalExecutableParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/executable/UniversalExecutableParser.java index 20e12a564c..d0996ea678 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/executable/UniversalExecutableParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/executable/UniversalExecutableParser.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.executable; @@ -22,12 +20,8 @@ import java.util.Collections; import java.util.Comparator; import java.util.Set; - import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.tuple.Pair; -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.exception.TikaException; import org.apache.tika.exception.UnsupportedFormatException; import org.apache.tika.extractor.EmbeddedDocumentExtractor; @@ -39,6 +33,8 @@ import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; /** * Parser for universal executable files. 
@@ -47,10 +43,10 @@ public class UniversalExecutableParser implements Parser { private static final long serialVersionUID = 1L; private static final Set SUPPORTED_TYPES = - Collections.singleton(MediaType.application("x-mach-o-universal")); + Collections.singleton(MediaType.application("x-mach-o-universal")); private static final int MAX_ARCHS_COUNT = 1000; - private static final int MAX_ARCH_SIZE = 500_000_000;//arbitrary + private static final int MAX_ARCH_SIZE = 500_000_000;// arbitrary @Override public Set getSupportedTypes(ParseContext arg0) { @@ -59,23 +55,22 @@ public Set getSupportedTypes(ParseContext arg0) { @Override public void parse(InputStream stream, ContentHandler handler, Metadata metadata, - ParseContext context) throws IOException, SAXException, TikaException { + ParseContext context) throws IOException, SAXException, TikaException { XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); xhtml.startDocument(); EmbeddedDocumentExtractor extractor = - EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context); + EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context); byte[] first4 = new byte[4]; IOUtils.readFully(stream, first4); - if ((first4[0] == (byte) 0xBF || first4[0] == (byte) 0xBE) && - first4[1] == (byte) 0xBA && first4[2] == (byte) 0xFE && first4[3] == (byte) 0xCA) { + if ((first4[0] == (byte) 0xBF || first4[0] == (byte) 0xBE) && first4[1] == (byte) 0xBA + && first4[2] == (byte) 0xFE && first4[3] == (byte) 0xCA) { parseMachO(xhtml, extractor, metadata, stream, first4); - } else if (first4[0] == (byte) 0xCA && first4[1] == (byte) 0xFE && - first4[2] == (byte) 0xBA && - (first4[3] == (byte) 0xBF || first4[3] == (byte) 0xBE)) { + } else if (first4[0] == (byte) 0xCA && first4[1] == (byte) 0xFE && first4[2] == (byte) 0xBA + && (first4[3] == (byte) 0xBF || first4[3] == (byte) 0xBE)) { parseMachO(xhtml, extractor, metadata, stream, first4); } else { throw new UnsupportedFormatException("Not a universal executable file"); @@ 
-88,22 +83,22 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, * Parses a Mach-O Universal file */ public void parseMachO(XHTMLContentHandler xhtml, EmbeddedDocumentExtractor extractor, - Metadata metadata, InputStream stream, - byte[] first4) - throws IOException, SAXException, TikaException { + Metadata metadata, InputStream stream, byte[] first4) + throws IOException, SAXException, TikaException { var currentOffset = (long) first4.length; var isLE = first4[3] == (byte) 0xCA; var is64 = first4[isLE ? 0 : 3] == (byte) 0xBF; - int archStructSize = 4 /* cputype */ + 4 /* cpusubtype */ + (is64 - ? 8 /* offset */ + 8 /* size */ + 4 /* align */ + 4 /* reserved */ - : 4 /* offset */ + 4 /* size */ + 4 /* align */); + int archStructSize = 4 /* cputype */ + 4 /* cpusubtype */ + + (is64 ? 8 /* offset */ + 8 /* size */ + 4 /* align */ + 4 /* reserved */ + : 4 /* offset */ + 4 /* size */ + 4 /* align */); int archsCount = isLE ? EndianUtils.readIntLE(stream) : EndianUtils.readIntBE(stream); if (archsCount < 1) { throw new TikaException("Invalid number of architectures: " + archsCount); } if (archsCount > MAX_ARCHS_COUNT) { - throw new TikaException("Number of architectures=" + archsCount + " greater than max allowed=" + MAX_ARCHS_COUNT); + throw new TikaException("Number of architectures=" + archsCount + + " greater than max allowed=" + MAX_ARCHS_COUNT); } currentOffset += 4; @@ -116,20 +111,25 @@ public void parseMachO(XHTMLContentHandler xhtml, EmbeddedDocumentExtractor extr IOUtils.skipFully(stream, 8); long offset = is64 - ? (isLE ? EndianUtils.readLongLE(stream) : EndianUtils.readLongBE(stream)) - : (isLE ? EndianUtils.readIntLE(stream) : EndianUtils.readIntBE(stream)); + ? (isLE ? EndianUtils.readLongLE(stream) + : EndianUtils.readLongBE(stream)) + : (isLE ? 
EndianUtils.readIntLE(stream) + : EndianUtils.readIntBE(stream)); if (offset < 4 + 4 + archsSize) { throw new TikaException("Invalid offset: " + offset); } - if (!unsortedOffsets && archIndex > 0 && offset < (long) offsetAndSizePerArch[archIndex - 1].getLeft()) { + if (!unsortedOffsets && archIndex > 0 + && offset < (long) offsetAndSizePerArch[archIndex - 1].getLeft()) { unsortedOffsets = true; } - long size = is64 - ? (isLE ? EndianUtils.readLongLE(stream) : EndianUtils.readLongBE(stream)) - : (isLE ? EndianUtils.readIntLE(stream) : EndianUtils.readIntBE(stream)); + long size = is64 ? (isLE ? EndianUtils.readLongLE(stream) + : EndianUtils.readLongBE(stream)) + : (isLE ? EndianUtils.readIntLE(stream) + : EndianUtils.readIntBE(stream)); if (size < 0 || size > MAX_ARCH_SIZE) { - throw new TikaException("Arch size=" + size + " must be > 0 and < " + MAX_ARCH_SIZE); + throw new TikaException( + "Arch size=" + size + " must be > 0 and < " + MAX_ARCH_SIZE); } offsetAndSizePerArch[archIndex] = Pair.of(offset, size); @@ -142,16 +142,17 @@ public void parseMachO(XHTMLContentHandler xhtml, EmbeddedDocumentExtractor extr currentOffset += archStructSize; } if (unsortedOffsets) { - Arrays.sort(offsetAndSizePerArch, Comparator.comparingLong(entry -> (long) entry.getLeft())); + Arrays.sort(offsetAndSizePerArch, + Comparator.comparingLong(entry -> (long) entry.getLeft())); } for (int archIndex = 0; archIndex < archsCount; archIndex++) { - long skipUntilStart = (long)offsetAndSizePerArch[archIndex].getLeft() - currentOffset; + long skipUntilStart = (long) offsetAndSizePerArch[archIndex].getLeft() - currentOffset; IOUtils.skipFully(stream, skipUntilStart); currentOffset += skipUntilStart; - long sz = (long)offsetAndSizePerArch[archIndex].getRight(); - //we bounds checked this above. - byte[] perArchMachO = new byte[(int)sz]; + long sz = (long) offsetAndSizePerArch[archIndex].getRight(); + // we bounds checked this above. 
+ byte[] perArchMachO = new byte[(int) sz]; IOUtils.readFully(stream, perArchMachO); currentOffset += perArchMachO.length; diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/mat/MatParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/mat/MatParser.java index 042d1db5ec..6b399d8b36 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/mat/MatParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/mat/MatParser.java @@ -1,38 +1,32 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.mat; -//JDK imports +// JDK imports import static java.nio.charset.StandardCharsets.UTF_8; +import com.jmatio.io.MatFileHeader; +import com.jmatio.io.MatFileReader; +import com.jmatio.types.MLArray; +import com.jmatio.types.MLStructure; import java.io.IOException; import java.io.InputStream; import java.util.Collections; import java.util.Map; import java.util.Set; - -import com.jmatio.io.MatFileHeader; -import com.jmatio.io.MatFileReader; -import com.jmatio.types.MLArray; -import com.jmatio.types.MLStructure; -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.exception.TikaException; import org.apache.tika.io.TemporaryResources; import org.apache.tika.io.TikaInputStream; @@ -41,8 +35,10 @@ import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; -//JMatIO imports +// JMatIO imports public class MatParser implements Parser { @@ -50,37 +46,37 @@ public class MatParser implements Parser { public static final String MATLAB_MIME_TYPE = "application/x-matlab-data"; static { - //make sure that this is set to false + // make sure that this is set to false MatFileReader.setAllowObjectDeserialization(false); } private final Set SUPPORTED_TYPES = - Collections.singleton(MediaType.application("x-matlab-data")); + Collections.singleton(MediaType.application("x-matlab-data")); public Set getSupportedTypes(ParseContext context) { return 
SUPPORTED_TYPES; } public void parse(InputStream stream, ContentHandler handler, Metadata metadata, - ParseContext context) throws IOException, SAXException, TikaException { + ParseContext context) throws IOException, SAXException, TikaException { - //Set MIME type as Matlab + // Set MIME type as Matlab metadata.set(Metadata.CONTENT_TYPE, MATLAB_MIME_TYPE); TemporaryResources tmp = - TikaInputStream.isTikaInputStream(stream) ? null : new TemporaryResources(); + TikaInputStream.isTikaInputStream(stream) ? null : new TemporaryResources(); try { // Use TIS so we can spool a temp file for parsing. TikaInputStream tis = TikaInputStream.get(stream, tmp, metadata); - //Extract information from header file - MatFileReader mfr = new MatFileReader(tis.getFile()); //input .mat file + // Extract information from header file + MatFileReader mfr = new MatFileReader(tis.getFile()); // input .mat file - MatFileHeader hdr = mfr.getMatFileHeader(); //.mat header information + MatFileHeader hdr = mfr.getMatFileHeader(); // .mat header information - // Example header: "MATLAB 5.0 MAT-file, Platform: MACI64, Created on: Sun Mar 2 + // Example header: "MATLAB 5.0 MAT-file, Platform: MACI64, Created on: Sun Mar 2 // 23:41:57 2014" - String[] parts = - hdr.getDescription().split(","); // Break header information into its parts + String[] parts = hdr.getDescription().split(","); // Break header information into its + // parts if (parts[2].contains("Created")) { int lastIndex1 = parts[2].lastIndexOf("Created on:"); @@ -99,17 +95,18 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, } // Get endian indicator from header file - String endianBytes = new String(hdr.getEndianIndicator(), - UTF_8); // Retrieve endian bytes and convert to string - String endianCode = String.valueOf( - endianBytes.toCharArray()); // Convert bytes to characters to string + String endianBytes = new String(hdr.getEndianIndicator(), UTF_8); // Retrieve endian + // bytes and convert + // 
to string + String endianCode = String.valueOf(endianBytes.toCharArray()); // Convert bytes to + // characters to string metadata.set("endian", endianCode); - //Text output + // Text output XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); xhtml.startDocument(); xhtml.newline(); - //Loop through each variable + // Loop through each variable for (Map.Entry entry : mfr.getContent().entrySet()) { String varName = entry.getKey(); MLArray varData = entry.getValue(); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/sas/SAS7BDATParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/sas/SAS7BDATParser.java index 4dab3499dd..c22cb05564 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/sas/SAS7BDATParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/sas/SAS7BDATParser.java @@ -1,21 +1,24 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.sas; +import com.epam.parso.Column; +import com.epam.parso.DataWriterUtil; +import com.epam.parso.SasFileProperties; +import com.epam.parso.SasFileReader; +import com.epam.parso.impl.SasFileReaderImpl; import java.io.IOException; import java.io.InputStream; import java.text.Format; @@ -23,15 +26,6 @@ import java.util.HashMap; import java.util.Map; import java.util.Set; - -import com.epam.parso.Column; -import com.epam.parso.DataWriterUtil; -import com.epam.parso.SasFileProperties; -import com.epam.parso.SasFileReader; -import com.epam.parso.impl.SasFileReaderImpl; -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Database; import org.apache.tika.metadata.HttpHeaders; @@ -44,10 +38,11 @@ import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; /** - * Processes the SAS7BDAT data columnar database file used by SAS and - * other similar languages. 
+ * Processes the SAS7BDAT data columnar database file used by SAS and other similar languages. */ public class SAS7BDATParser implements Parser { private static final long serialVersionUID = -2775485539937983150L; @@ -62,7 +57,7 @@ public Set getSupportedTypes(ParseContext context) { @Override public void parse(InputStream stream, ContentHandler handler, Metadata metadata, - ParseContext context) throws IOException, SAXException, TikaException { + ParseContext context) throws IOException, SAXException, TikaException { metadata.set(Metadata.CONTENT_TYPE, TYPE_SAS7BDAT.toString()); XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); @@ -81,21 +76,21 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, metadata.set(Database.ROW_COUNT, (int) props.getRowCount()); // TODO Can we find more general properties for these / move - // these to more general places? + // these to more general places? metadata.set(HttpHeaders.CONTENT_ENCODING, props.getEncoding()); metadata.set(OfficeOpenXMLExtended.APPLICATION, props.getServerType()); metadata.set(OfficeOpenXMLExtended.APP_VERSION, props.getSasRelease()); metadata.set(MachineMetadata.ARCHITECTURE_BITS, props.isU64() ? "64" : "32"); metadata.set(MachineMetadata.ENDIAN, - props.getEndianness() == 1 ? MachineMetadata.Endian.LITTLE.getName() : - MachineMetadata.Endian.BIG.getName()); + props.getEndianness() == 1 ? 
MachineMetadata.Endian.LITTLE.getName() + : MachineMetadata.Endian.BIG.getName()); // The following SAS Metadata fields are currently ignored: // compressionMethod // sessionEncoding // fileType - // osName - - // osType - + // osName - + // osType - // mixPageRowCount // headerLength // pageLength @@ -132,7 +127,7 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, xhtml.endElement("tr"); xhtml.newline(); - //TODO: initialize this on the first row and then apply + // TODO: initialize this on the first row and then apply Map formatMap = new HashMap<>(); // Process each row in turn @@ -140,8 +135,8 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, while ((row = sas.readNext()) != null) { xhtml.startElement("tr"); for (String val : DataWriterUtil.getRowValues(sas.getColumns(), row, formatMap)) { - // Use explicit start/end, rather than element, to - // ensure that empty cells still get output + // Use explicit start/end, rather than element, to + // ensure that empty cells still get output xhtml.startElement("td"); xhtml.characters(val); xhtml.endElement("td"); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/test/java/org/apache/tika/parser/asm/ClassParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/test/java/org/apache/tika/parser/asm/ClassParserTest.java index 6a2b024470..0913698834 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/test/java/org/apache/tika/parser/asm/ClassParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/test/java/org/apache/tika/parser/asm/ClassParserTest.java @@ -1,29 +1,26 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.asm; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertTrue; -import org.junit.jupiter.api.Test; - import org.apache.tika.TikaTest; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; +import org.junit.jupiter.api.Test; /** * Test case for parsing Java class files. 
@@ -41,19 +38,19 @@ public void testClassParsing() throws Exception { assertTrue(content.contains("class AutoDetectParser extends CompositeParser")); assertTrue(content.contains("private org.apache.tika.mime.MimeTypes types")); assertTrue(content.contains( - "public void parse(" + "java.io.InputStream, org.xml.sax.ContentHandler," + - " org.apache.tika.metadata.Metadata) throws" + - " java.io.IOException, org.xml.sax.SAXException," + - " org.apache.tika.exception.TikaException;")); - assertTrue(content.contains("private byte[] getPrefix(java.io.InputStream, int)" + - " throws java.io.IOException;")); + "public void parse(" + "java.io.InputStream, org.xml.sax.ContentHandler," + + " org.apache.tika.metadata.Metadata) throws" + + " java.io.IOException, org.xml.sax.SAXException," + + " org.apache.tika.exception.TikaException;")); + assertTrue(content.contains("private byte[] getPrefix(java.io.InputStream, int)" + + " throws java.io.IOException;")); } @Test public void testJava11() throws Exception { - //Make sure that this java 11 target .class - //file doesn't throw an exception - //TIKA-2992 + // Make sure that this java 11 target .class + // file doesn't throw an exception + // TIKA-2992 XMLResult xmlResult = getXML("AppleSingleFileParser.class"); assertContains("AppleSingleFileParser", xmlResult.xml); } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/test/java/org/apache/tika/parser/code/SourceCodeParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/test/java/org/apache/tika/parser/code/SourceCodeParserTest.java index e932c066df..cfcdcea2b3 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/test/java/org/apache/tika/parser/code/SourceCodeParserTest.java +++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/test/java/org/apache/tika/parser/code/SourceCodeParserTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.parser.code; @@ -23,10 +21,6 @@ import java.io.ByteArrayInputStream; import java.util.Set; - -import org.junit.jupiter.api.Test; -import org.xml.sax.ContentHandler; - import org.apache.tika.TikaTest; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; @@ -34,6 +28,8 @@ import org.apache.tika.mime.MediaType; import org.apache.tika.parser.ParseContext; import org.apache.tika.sax.ToTextContentHandler; +import org.junit.jupiter.api.Test; +import org.xml.sax.ContentHandler; public class SourceCodeParserTest extends TikaTest { @@ -47,34 +43,31 @@ public void testSupportTypes() throws Exception { assertTrue(supportedTypes.contains(new MediaType("text", "x-c++src"))); assertFalse(sourceCodeParser.getSupportedTypes(new ParseContext()) - .contains(new MediaType("text", "html"))); + .contains(new MediaType("text", "html"))); } @Test public void testHTMLRenderWithReturnLine() throws Exception { - String htmlContent = - getXML(getResourceAsStream("/test-documents/testJAVA.java"), sourceCodeParser, - createMetadata("text/x-java-source")).xml; + String htmlContent = getXML(getResourceAsStream("/test-documents/testJAVA.java"), + sourceCodeParser, createMetadata("text/x-java-source")).xml; assertTrue(htmlContent.indexOf("public") > - 0); + "public") > 0); assertTrue(htmlContent.indexOf("static") > 0); } @Test public void testTextRender() throws Exception { - String textContent = - getText(getResourceAsStream("/test-documents/testJAVA.java"), sourceCodeParser, - createMetadata("text/x-java-source")); + String textContent = getText(getResourceAsStream("/test-documents/testJAVA.java"), + sourceCodeParser, createMetadata("text/x-java-source")); assertTrue(textContent.length() > 0); assertFalse(textContent.contains("html")); - textContent = - getText(new ByteArrayInputStream("public class HelloWorld {}".getBytes(UTF_8)), + textContent = getText( + new ByteArrayInputStream("public class HelloWorld {}".getBytes(UTF_8)), 
sourceCodeParser, createMetadata("text/x-java-source")); assertTrue(textContent.length() > 0); assertFalse(textContent.contains("html")); @@ -84,7 +77,7 @@ public void testTextRender() throws Exception { public void testLoC() throws Exception { Metadata metadata = createMetadata("text/x-groovy"); getText(getResourceAsStream("/test-documents/testGROOVY.groovy"), sourceCodeParser, - metadata); + metadata); assertEquals(metadata.get("LoC"), "9"); } @@ -99,9 +92,8 @@ public void testAuthor() throws Exception { @Test public void testReturnContentAsIsForTextHandler() throws Exception { - String strContent = - getXML(getResourceAsStream("/test-documents/testJAVA.java"), AUTO_DETECT_PARSER, - createMetadata("text/plain")).xml; + String strContent = getXML(getResourceAsStream("/test-documents/testJAVA.java"), + AUTO_DETECT_PARSER, createMetadata("text/plain")).xml; assertTrue(strContent.indexOf("public class HelloWorld {") > 0); } @@ -110,10 +102,10 @@ public void testReturnContentAsIsForTextHandler() throws Exception { public void testNoMarkupInToTextHandler() throws Exception { ContentHandler contentHandler = new ToTextContentHandler(); ParseContext parseContext = new ParseContext(); - try (TikaInputStream tis = TikaInputStream - .get(getResourceAsStream("/test-documents/testJAVA.java"))) { - AUTO_DETECT_PARSER - .parse(tis, contentHandler, createMetadata("text/x-java-source"), parseContext); + try (TikaInputStream tis = + TikaInputStream.get(getResourceAsStream("/test-documents/testJAVA.java"))) { + AUTO_DETECT_PARSER.parse(tis, contentHandler, createMetadata("text/x-java-source"), + parseContext); } String strContent = contentHandler.toString(); assertContains("public class HelloWorld {", strContent); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/test/java/org/apache/tika/parser/executable/ExecutableParserTest.java 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/test/java/org/apache/tika/parser/executable/ExecutableParserTest.java index 357af992a1..3049245188 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/test/java/org/apache/tika/parser/executable/ExecutableParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/test/java/org/apache/tika/parser/executable/ExecutableParserTest.java @@ -1,29 +1,26 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
+ * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.executable; import static org.junit.jupiter.api.Assertions.assertEquals; -import org.junit.jupiter.api.Test; - import org.apache.tika.TikaTest; import org.apache.tika.metadata.MachineMetadata.Endian; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; +import org.junit.jupiter.api.Test; public class ExecutableParserTest extends TikaTest { @@ -39,7 +36,7 @@ public void testWin32Parser() throws Exception { assertEquals("Little", metadata.get(ExecutableParser.ENDIAN)); assertEquals("32", metadata.get(ExecutableParser.ARCHITECTURE_BITS)); assertEquals("Windows", metadata.get(ExecutableParser.PLATFORM)); - assertContains("", r.xml); //no text yet + assertContains("", r.xml); // no text yet } @@ -53,8 +50,8 @@ public void testElfParser_x86_32() throws Exception { assertEquals("Little", metadata.get(ExecutableParser.ENDIAN)); assertEquals("32", metadata.get(ExecutableParser.ARCHITECTURE_BITS)); -// assertEquals("Linux", -// metadata.get(ExecutableParser.PLATFORM)); + // assertEquals("Linux", + // metadata.get(ExecutableParser.PLATFORM)); assertContains("", r.xml); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/test/java/org/apache/tika/parser/executable/UniversalExecutableParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/test/java/org/apache/tika/parser/executable/UniversalExecutableParserTest.java index a123f6c453..49c089c2fe 100644 --- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/test/java/org/apache/tika/parser/executable/UniversalExecutableParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/test/java/org/apache/tika/parser/executable/UniversalExecutableParserTest.java @@ -1,29 +1,25 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.parser.executable; import static org.junit.jupiter.api.Assertions.assertEquals; import java.util.List; - -import org.junit.jupiter.api.Test; - import org.apache.tika.TikaTest; import org.apache.tika.metadata.Metadata; +import org.junit.jupiter.api.Test; /** * Test case for parsing universal executable files. @@ -34,9 +30,11 @@ public class UniversalExecutableParserTest extends TikaTest { public void testMachO() throws Exception { List metadataList = getRecursiveMetadata("testMacOS-x86_64-arm64"); assertEquals(3, metadataList.size()); - assertEquals("application/x-mach-o-universal", metadataList.get(0).get(Metadata.CONTENT_TYPE)); + assertEquals("application/x-mach-o-universal", + metadataList.get(0).get(Metadata.CONTENT_TYPE)); for (int i = 1; i < 3; i++) { - assertEquals("application/x-mach-o-executable", metadataList.get(i).get(Metadata.CONTENT_TYPE)); + assertEquals("application/x-mach-o-executable", + metadataList.get(i).get(Metadata.CONTENT_TYPE)); } } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/test/java/org/apache/tika/parser/mat/MatParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/test/java/org/apache/tika/parser/mat/MatParserTest.java index 6cba02340d..fdd4994597 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/test/java/org/apache/tika/parser/mat/MatParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/test/java/org/apache/tika/parser/mat/MatParserTest.java @@ -1,32 +1,28 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.mat; import static org.junit.jupiter.api.Assertions.assertEquals; import java.io.InputStream; - -import org.junit.jupiter.api.Test; - import org.apache.tika.TikaTest; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.sax.ToXMLContentHandler; +import org.junit.jupiter.api.Test; /** * Test cases to exercise the {@link MatParser}. 
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/test/java/org/apache/tika/parser/sas/SAS7BDATParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/test/java/org/apache/tika/parser/sas/SAS7BDATParserTest.java index 697decbc0f..d2f87f684e 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/test/java/org/apache/tika/parser/sas/SAS7BDATParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/test/java/org/apache/tika/parser/sas/SAS7BDATParserTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
+ * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.sas; @@ -22,10 +20,6 @@ import java.text.DateFormatSymbols; import java.util.Arrays; import java.util.Locale; - -import org.junit.jupiter.api.Test; -import org.xml.sax.ContentHandler; - import org.apache.tika.TikaTest; import org.apache.tika.metadata.Database; import org.apache.tika.metadata.HttpHeaders; @@ -37,10 +31,12 @@ import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.sax.BodyContentHandler; +import org.junit.jupiter.api.Test; +import org.xml.sax.ContentHandler; public class SAS7BDATParserTest extends TikaTest { private static final String[] SHORT_MONTHS = - new DateFormatSymbols(Locale.getDefault()).getShortMonths(); + new DateFormatSymbols(Locale.getDefault()).getShortMonths(); private Parser parser = new SAS7BDATParser(); @Test @@ -68,7 +64,7 @@ public void testSimpleFile() throws Exception { assertEquals("32", metadata.get(MachineMetadata.ARCHITECTURE_BITS)); assertEquals("Little", metadata.get(MachineMetadata.ENDIAN)); assertEquals(Arrays.asList("recnum", "label"), - Arrays.asList(metadata.getValues(Database.COLUMN_NAME))); + Arrays.asList(metadata.getValues(Database.COLUMN_NAME))); String content = handler.toString(); assertContains("TESTING", content); @@ -102,8 +98,9 @@ public void testMultiColumns() throws Exception { assertEquals("32", metadata.get(MachineMetadata.ARCHITECTURE_BITS)); assertEquals("Little", metadata.get(MachineMetadata.ENDIAN)); assertEquals(Arrays.asList("Record Number", "Square of the Record Number", - "Description of the Row", "Percent Done", "Percent Increment", "date", "datetime", - "time"), 
Arrays.asList(metadata.getValues(Database.COLUMN_NAME))); + "Description of the Row", "Percent Done", "Percent Increment", "date", + "datetime", "time"), + Arrays.asList(metadata.getValues(Database.COLUMN_NAME))); String content = handler.toString(); assertContains("TESTING", content); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-crypto-module/src/main/java/org/apache/tika/parser/crypto/Pkcs7Parser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-crypto-module/src/main/java/org/apache/tika/parser/crypto/Pkcs7Parser.java index 0c5ade3681..32a16d0ba0 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-crypto-module/src/main/java/org/apache/tika/parser/crypto/Pkcs7Parser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-crypto-module/src/main/java/org/apache/tika/parser/crypto/Pkcs7Parser.java @@ -1,26 +1,29 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.crypto; import java.io.IOException; import java.io.InputStream; import java.util.Set; - import org.apache.commons.io.input.CloseShieldInputStream; +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.EmptyParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; import org.bouncycastle.cms.CMSException; import org.bouncycastle.cms.CMSSignedDataParser; import org.bouncycastle.cms.CMSTypedStream; @@ -30,13 +33,6 @@ import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; -import org.apache.tika.exception.TikaException; -import org.apache.tika.metadata.Metadata; -import org.apache.tika.mime.MediaType; -import org.apache.tika.parser.EmptyParser; -import org.apache.tika.parser.ParseContext; -import org.apache.tika.parser.Parser; - /** * Basic parser for PKCS7 data. 
*/ @@ -52,24 +48,24 @@ public class Pkcs7Parser implements Parser { private static final MediaType PKCS7_SIGNATURE = MediaType.application("pkcs7-signature"); private static final Set SUPPORTED_TYPES = - MediaType.set(PKCS7_MIME, PKCS7_SIGNATURE); + MediaType.set(PKCS7_MIME, PKCS7_SIGNATURE); public Set getSupportedTypes(ParseContext context) { return SUPPORTED_TYPES; } public void parse(InputStream stream, ContentHandler handler, Metadata metadata, - ParseContext context) throws IOException, SAXException, TikaException { + ParseContext context) throws IOException, SAXException, TikaException { try { DigestCalculatorProvider digestCalculatorProvider = - new JcaDigestCalculatorProviderBuilder().setProvider("BC").build(); + new JcaDigestCalculatorProviderBuilder().setProvider("BC").build(); CMSSignedDataParser parser = new CMSSignedDataParser(digestCalculatorProvider, - CloseShieldInputStream.wrap(stream)); + CloseShieldInputStream.wrap(stream)); try { CMSTypedStream content = parser.getSignedContent(); if (content == null) { throw new TikaException( - "cannot parse detached pkcs7 signature (no signed data to parse)"); + "cannot parse detached pkcs7 signature (no signed data to parse)"); } try (InputStream input = content.getContentStream()) { Parser delegate = context.get(Parser.class, EmptyParser.INSTANCE); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-crypto-module/src/main/java/org/apache/tika/parser/crypto/TSDParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-crypto-module/src/main/java/org/apache/tika/parser/crypto/TSDParser.java index 2a0e4a0f95..3dbd04d656 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-crypto-module/src/main/java/org/apache/tika/parser/crypto/TSDParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-crypto-module/src/main/java/org/apache/tika/parser/crypto/TSDParser.java @@ 
-1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.parser.crypto; @@ -32,7 +30,17 @@ import java.util.Map; import java.util.Set; import java.util.TimeZone; - +import org.apache.tika.exception.TikaException; +import org.apache.tika.exception.WriteLimitReachedException; +import org.apache.tika.extractor.EmbeddedDocumentExtractor; +import org.apache.tika.extractor.EmbeddedDocumentUtil; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.apache.tika.sax.XHTMLContentHandler; +import org.apache.tika.utils.RereadableInputStream; import org.bouncycastle.asn1.cryptopro.CryptoProObjectIdentifiers; import org.bouncycastle.asn1.nist.NISTObjectIdentifiers; import org.bouncycastle.asn1.oiw.OIWObjectIdentifiers; @@ -50,18 +58,6 @@ import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; -import org.apache.tika.exception.TikaException; -import org.apache.tika.exception.WriteLimitReachedException; -import org.apache.tika.extractor.EmbeddedDocumentExtractor; -import org.apache.tika.extractor.EmbeddedDocumentUtil; -import org.apache.tika.io.TikaInputStream; -import org.apache.tika.metadata.Metadata; -import org.apache.tika.mime.MediaType; -import org.apache.tika.parser.ParseContext; -import org.apache.tika.parser.Parser; -import org.apache.tika.sax.XHTMLContentHandler; -import org.apache.tika.utils.RereadableInputStream; - /** * Tika parser for Time Stamped Data Envelope (application/timestamped-data) */ @@ -81,7 +77,7 @@ public class TSDParser implements Parser { private static final String TSD_TSA = "TSA"; private static final String TSD_ALGORITHM = "Algorithm"; private static final Set SUPPORTED_TYPES = - Collections.singleton(MediaType.application("timestamped-data")); + Collections.singleton(MediaType.application("timestamped-data")); @Override public Set getSupportedTypes(ParseContext context) { @@ -90,21 +86,22 @@ public 
Set getSupportedTypes(ParseContext context) { @Override public void parse(InputStream stream, ContentHandler handler, Metadata metadata, - ParseContext context) throws IOException, SAXException, TikaException { + ParseContext context) throws IOException, SAXException, TikaException { - //Try to parse TSD file + // Try to parse TSD file try (RereadableInputStream ris = new RereadableInputStream(stream, 2048, true)) { Metadata TSDAndEmbeddedMetadata = new Metadata(); List tsdMetasList = this.extractMetas(ris); this.buildMetas(tsdMetasList, - metadata != null && metadata.size() > 0 ? TSDAndEmbeddedMetadata : metadata); + metadata != null && metadata.size() > 0 ? TSDAndEmbeddedMetadata + : metadata); XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); xhtml.startDocument(); ris.rewind(); - //Try to parse embedded file in TSD file + // Try to parse embedded file in TSD file this.parseTSDContent(ris, xhtml, TSDAndEmbeddedMetadata, context); xhtml.endDocument(); } @@ -120,10 +117,10 @@ private List extractMetas(InputStream stream) throws SAXException { for (TimeStampToken token : tokens) { TSDMetas tsdMetas = new TSDMetas(true, token.getTimeStampInfo().getGenTime(), - token.getTimeStampInfo().getPolicy().getId(), - token.getTimeStampInfo().getSerialNumber(), - token.getTimeStampInfo().getTsa(), - token.getTimeStampInfo().getHashAlgorithm().getAlgorithm().getId()); + token.getTimeStampInfo().getPolicy().getId(), + token.getTimeStampInfo().getSerialNumber(), + token.getTimeStampInfo().getTsa(), + token.getTimeStampInfo().getHashAlgorithm().getAlgorithm().getId()); tsdMetasList.add(tsdMetas); } @@ -145,16 +142,16 @@ private void buildMetas(List tsdMetasList, Metadata metadata) { for (TSDMetas tsdm : tsdMetasList) { metadata.set(TSD_LOOP_LABEL + count + " - " + Metadata.CONTENT_TYPE, TSD_MIME_TYPE); metadata.set(TSD_LOOP_LABEL + count + " - " + TSD_DESCRIPTION_LABEL, - TSD_DESCRIPTION_VALUE); + TSD_DESCRIPTION_VALUE); metadata.set(TSD_LOOP_LABEL + count + 
" - " + TSD_PARSED_LABEL, - tsdm.getParseBuiltStr()); + tsdm.getParseBuiltStr()); metadata.set(TSD_LOOP_LABEL + count + " - " + TSD_PARSED_DATE, - tsdm.getParsedDateStr() + " " + TSD_DATE_FORMAT); + tsdm.getParsedDateStr() + " " + TSD_DATE_FORMAT); metadata.set(TSD_LOOP_LABEL + count + " - " + TSD_DATE, - tsdm.getEmitDateStr() + " " + TSD_DATE_FORMAT); + tsdm.getEmitDateStr() + " " + TSD_DATE_FORMAT); metadata.set(TSD_LOOP_LABEL + count + " - " + TSD_POLICY_ID, tsdm.getPolicyId()); metadata.set(TSD_LOOP_LABEL + count + " - " + TSD_SERIAL_NUMBER, - tsdm.getSerialNumberFormatted()); + tsdm.getSerialNumberFormatted()); metadata.set(TSD_LOOP_LABEL + count + " - " + TSD_TSA, tsdm.getTsaStr()); metadata.set(TSD_LOOP_LABEL + count + " - " + TSD_ALGORITHM, tsdm.getAlgorithmName()); count++; @@ -162,7 +159,7 @@ private void buildMetas(List tsdMetasList, Metadata metadata) { } private void parseTSDContent(InputStream stream, ContentHandler handler, Metadata metadata, - ParseContext context) throws SAXException { + ParseContext context) throws SAXException { CMSTimeStampedDataParser cmsTimeStampedDataParser = null; EmbeddedDocumentExtractor edx = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context); @@ -171,7 +168,8 @@ private void parseTSDContent(InputStream stream, ContentHandler handler, Metadat try { cmsTimeStampedDataParser = new CMSTimeStampedDataParser(stream); - try (TikaInputStream tis = TikaInputStream.get(cmsTimeStampedDataParser.getContent())) { + try (TikaInputStream tis = + TikaInputStream.get(cmsTimeStampedDataParser.getContent())) { edx.parseEmbedded(tis, handler, metadata, true); } @@ -206,8 +204,8 @@ private static class OIDNameMapper { encryptionAlgs.put(OIWObjectIdentifiers.dsaWithSHA1.getId(), "DSA"); encryptionAlgs.put(PKCSObjectIdentifiers.rsaEncryption.getId(), "RSA"); encryptionAlgs.put(PKCSObjectIdentifiers.sha1WithRSAEncryption.getId(), "RSA"); - encryptionAlgs - .put(TeleTrusTObjectIdentifiers.teleTrusTRSAsignatureAlgorithm.getId(), "RSA"); + 
encryptionAlgs.put(TeleTrusTObjectIdentifiers.teleTrusTRSAsignatureAlgorithm.getId(), + "RSA"); encryptionAlgs.put(X509ObjectIdentifiers.id_ea_rsa.getId(), "RSA"); encryptionAlgs.put(CMSSignedDataGenerator.ENCRYPTION_ECDSA, "ECDSA"); encryptionAlgs.put(X9ObjectIdentifiers.ecdsa_with_SHA2.getId(), "ECDSA"); @@ -260,7 +258,7 @@ public static String getEncryptionAlgName(String encryptionAlgOID) { } public static MessageDigest getDigestInstance(String algorithm, String provider) - throws NoSuchProviderException, NoSuchAlgorithmException { + throws NoSuchProviderException, NoSuchAlgorithmException { if (provider != null) { try { return MessageDigest.getInstance(algorithm, provider); @@ -386,9 +384,9 @@ public String getParsedDateStr() { @Override public String toString() { - return "TSDMetas [parseBuilt=" + parseBuilt + ", emitDate=" + emitDate + ", policyId=" + - policyId + ", serialNumber=" + serialNumber + ", tsa=" + tsa + ", algorithm=" + - algorithm + ", parsedDate=" + parsedDate + "]"; + return "TSDMetas [parseBuilt=" + parseBuilt + ", emitDate=" + emitDate + ", policyId=" + + policyId + ", serialNumber=" + serialNumber + ", tsa=" + tsa + + ", algorithm=" + algorithm + ", parsedDate=" + parsedDate + "]"; } } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-crypto-module/src/test/java/org/apache/tika/parser/crypto/Pkcs7ParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-crypto-module/src/test/java/org/apache/tika/parser/crypto/Pkcs7ParserTest.java index 42761d3de2..6f0d290242 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-crypto-module/src/test/java/org/apache/tika/parser/crypto/Pkcs7ParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-crypto-module/src/test/java/org/apache/tika/parser/crypto/Pkcs7ParserTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under 
one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.parser.crypto; @@ -21,14 +19,12 @@ import static org.junit.jupiter.api.Assertions.fail; import java.io.InputStream; - -import org.xml.sax.ContentHandler; - import org.apache.tika.TikaTest; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; import org.apache.tika.sax.BodyContentHandler; +import org.xml.sax.ContentHandler; public class Pkcs7ParserTest extends TikaTest { public void testDetachedSignature() throws Exception { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-crypto-module/src/test/java/org/apache/tika/parser/crypto/TSDParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-crypto-module/src/test/java/org/apache/tika/parser/crypto/TSDParserTest.java index 02ba1f7a79..ee62f84de6 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-crypto-module/src/test/java/org/apache/tika/parser/crypto/TSDParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-crypto-module/src/test/java/org/apache/tika/parser/crypto/TSDParserTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. 
The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.crypto; @@ -22,26 +20,28 @@ import java.io.InputStream; import java.util.Arrays; import java.util.List; - -import org.junit.jupiter.api.Test; -import org.xml.sax.ContentHandler; - import org.apache.tika.TikaTest; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.parser.ParseContext; import org.apache.tika.sax.BodyContentHandler; +import org.junit.jupiter.api.Test; +import org.xml.sax.ContentHandler; public class TSDParserTest extends TikaTest { @Test public void testTSDFileData() throws Exception { try (InputStream inputXml = getResourceAsStream("/test-documents/MANIFEST.XML.TSD"); - InputStream inputTxt1 = getResourceAsStream("/test-documents/Test1.txt.tsd"); - InputStream inputTxt2 = getResourceAsStream("/test-documents/Test2.txt.tsd"); - InputStream inputDocx = getResourceAsStream("/test-documents/Test3.docx.tsd"); - InputStream inputPdf = getResourceAsStream("/test-documents/Test4.pdf.tsd"); - InputStream inputPng = 
getResourceAsStream("/test-documents/Test5.PNG.tsd")) { + InputStream inputTxt1 = + getResourceAsStream("/test-documents/Test1.txt.tsd"); + InputStream inputTxt2 = + getResourceAsStream("/test-documents/Test2.txt.tsd"); + InputStream inputDocx = + getResourceAsStream("/test-documents/Test3.docx.tsd"); + InputStream inputPdf = getResourceAsStream("/test-documents/Test4.pdf.tsd"); + InputStream inputPng = + getResourceAsStream("/test-documents/Test5.PNG.tsd")) { TSDParser tsdParser = new TSDParser(); @@ -118,7 +118,7 @@ public void testTSDFileDataRecursiveMetadataXML() throws Exception { List list = getRecursiveMetadata("MANIFEST.XML.TSD"); assertEquals(3, list.size()); assertContains(TSDParser.class.getName(), - Arrays.asList(list.get(0).getValues(TikaCoreProperties.TIKA_PARSED_BY))); + Arrays.asList(list.get(0).getValues(TikaCoreProperties.TIKA_PARSED_BY))); } @Test @@ -126,7 +126,7 @@ public void testTSDFileDataRecursiveMetadataTxt1() throws Exception { List list = getRecursiveMetadata("Test1.txt.tsd"); assertEquals(2, list.size()); assertContains(TSDParser.class.getName(), - Arrays.asList(list.get(0).getValues(TikaCoreProperties.TIKA_PARSED_BY))); + Arrays.asList(list.get(0).getValues(TikaCoreProperties.TIKA_PARSED_BY))); } @Test @@ -134,7 +134,7 @@ public void testTSDFileDataRecursiveMetadataTxt2() throws Exception { List list = getRecursiveMetadata("Test2.txt.tsd"); assertEquals(2, list.size()); assertContains(TSDParser.class.getName(), - Arrays.asList(list.get(0).getValues(TikaCoreProperties.TIKA_PARSED_BY))); + Arrays.asList(list.get(0).getValues(TikaCoreProperties.TIKA_PARSED_BY))); } @Test @@ -142,7 +142,7 @@ public void testTSDFileDataRecursiveMetadataDocx() throws Exception { List list = getRecursiveMetadata("Test3.docx.tsd"); assertEquals(2, list.size()); assertContains(TSDParser.class.getName(), - Arrays.asList(list.get(0).getValues(TikaCoreProperties.TIKA_PARSED_BY))); + Arrays.asList(list.get(0).getValues(TikaCoreProperties.TIKA_PARSED_BY))); } @Test 
@@ -150,14 +150,14 @@ public void testTSDFileDataRecursiveMetadataPdf() throws Exception { List list = getRecursiveMetadata("Test4.pdf.tsd"); assertEquals(2, list.size()); assertContains(TSDParser.class.getName(), - Arrays.asList(list.get(0).getValues(TikaCoreProperties.TIKA_PARSED_BY))); + Arrays.asList(list.get(0).getValues(TikaCoreProperties.TIKA_PARSED_BY))); } - //@Test + // @Test public void testTSDFileDataRecursiveMetadataPng() throws Exception { List list = getRecursiveMetadata("Test5.PNG.tsd"); assertEquals(2, list.size()); assertContains(TSDParser.class.getName(), - Arrays.asList(list.get(0).getValues(TikaCoreProperties.TIKA_PARSED_BY))); + Arrays.asList(list.get(0).getValues(TikaCoreProperties.TIKA_PARSED_BY))); } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/BouncyCastleDigester.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/BouncyCastleDigester.java index 2a7ee23bc6..44d1f67098 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/BouncyCastleDigester.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/BouncyCastleDigester.java @@ -1,32 +1,28 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.digestutils; import java.security.MessageDigest; import java.security.NoSuchAlgorithmException; import java.security.Provider; - import org.apache.commons.codec.binary.Base32; -import org.bouncycastle.jce.provider.BouncyCastleProvider; -import org.bouncycastle.util.encoders.Hex; - import org.apache.tika.parser.DigestingParser; import org.apache.tika.parser.digest.CompositeDigester; import org.apache.tika.parser.digest.InputStreamDigester; +import org.bouncycastle.jce.provider.BouncyCastleProvider; +import org.bouncycastle.util.encoders.Hex; /** * Digester that relies on BouncyCastle for MessageDigest implementations. 
@@ -34,9 +30,8 @@ public class BouncyCastleDigester extends CompositeDigester { /** - * Include a string representing the comma-separated algorithms to run: e.g. "md5,sha1". - * If you want base 32 encoding instead of hexadecimal, add ":32" to the algorithm, e.g. - * "md5,sha1:32" + * Include a string representing the comma-separated algorithms to run: e.g. "md5,sha1". If you + * want base 32 encoding instead of hexadecimal, add ":32" to the algorithm, e.g. "md5,sha1:32" *

      * Will throw an IllegalArgumentException if an algorithm isn't supported * @@ -87,7 +82,7 @@ public String encode(byte[] bytes) { private static class BCInputStreamDigester extends InputStreamDigester { public BCInputStreamDigester(int markLimit, String algorithm, - DigestingParser.Encoder encoder) { + DigestingParser.Encoder encoder) { super(markLimit, algorithm, encoder); try { MessageDigest.getInstance(algorithm, getProvider()); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/CommonsDigester.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/CommonsDigester.java index 44617ab345..e9a02c3426 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/CommonsDigester.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/CommonsDigester.java @@ -1,50 +1,46 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.digestutils; import java.util.ArrayList; import java.util.List; import java.util.Locale; - import org.apache.commons.codec.binary.Base32; import org.apache.commons.codec.binary.Base64; import org.apache.commons.codec.binary.Hex; - import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.parser.DigestingParser; import org.apache.tika.parser.digest.CompositeDigester; import org.apache.tika.parser.digest.InputStreamDigester; /** - * Implementation of {@link org.apache.tika.parser.DigestingParser.Digester} - * that relies on commons.codec.digest.DigestUtils to calculate digest hashes. + * Implementation of {@link org.apache.tika.parser.DigestingParser.Digester} that relies on + * commons.codec.digest.DigestUtils to calculate digest hashes. *

      - * This digester tries to use the regular mark/reset protocol on the InputStream. - * However, this wraps an internal BoundedInputStream, and if the InputStream - * is not fully read, then this will reset the stream and - * spool the InputStream to disk (via TikaInputStream) and then digest the file. + * This digester tries to use the regular mark/reset protocol on the InputStream. However, this + * wraps an internal BoundedInputStream, and if the InputStream is not fully read, then this will + * reset the stream and spool the InputStream to disk (via TikaInputStream) and then digest the + * file. */ public class CommonsDigester extends CompositeDigester { /** - * Include a string representing the comma-separated algorithms to run: e.g. "md5,sha1". - * If you want base 32 encoding instead of hexadecimal, add ":32" to the algorithm, e.g. - * "md5,sha1:32". If you want uppercase digests for the hexadecimal encoder, - * use uppercase in the algorithm name, e.g. "MD5". + * Include a string representing the comma-separated algorithms to run: e.g. "md5,sha1". If you + * want base 32 encoding instead of hexadecimal, add ":32" to the algorithm, e.g. "md5,sha1:32". + * If you want uppercase digests for the hexadecimal encoder, use uppercase in the algorithm + * name, e.g. "MD5". *

      * Will throw an IllegalArgumentException if an algorithm isn't supported * @@ -56,8 +52,8 @@ public CommonsDigester(int markLimit, String algorithmString) { } /** - * @param markLimit limit for mark/reset; after this limit is hit, the - * stream is reset and spooled to disk + * @param markLimit limit for mark/reset; after this limit is hit, the stream is reset and + * spooled to disk * @param algorithms algorithms to run */ public CommonsDigester(int markLimit, DigestAlgorithm... algorithms) { @@ -65,21 +61,20 @@ public CommonsDigester(int markLimit, DigestAlgorithm... algorithms) { } private static DigestingParser.Digester[] buildDigesters(int markLimit, - DigestAlgorithm[] algorithms) { + DigestAlgorithm[] algorithms) { DigestingParser.Digester[] digesters = new DigestingParser.Digester[algorithms.length]; int i = 0; for (DigestAlgorithm algorithm : algorithms) { - digesters[i++] = - new InputStreamDigester(markLimit, algorithm.getJavaName(), algorithm.name(), - new HexEncoder(false)); + digesters[i++] = new InputStreamDigester(markLimit, algorithm.getJavaName(), + algorithm.name(), new HexEncoder(false)); } return digesters; } /** - * This returns digest algorithms only. It does not understand the encoding - * syntax, e.g. "MD5:32" (base 32 encoding of MD5). To parse - * those, see {@link #CommonsDigester(int, String)}. + * This returns digest algorithms only. It does not understand the encoding syntax, e.g. + * "MD5:32" (base 32 encoding of MD5). To parse those, see + * {@link #CommonsDigester(int, String)}. * * @param s comma-delimited (no space) list of algorithms to use: md5,sha256. 
* @return @@ -120,7 +115,7 @@ private static DigestAlgorithm getDigestAlgorithm(String algoString) { sb.append(algo.toString()); } throw new IllegalArgumentException( - "Couldn't match " + algoString + " with any of: " + sb.toString()); + "Couldn't match " + algoString + " with any of: " + sb.toString()); } } @@ -133,7 +128,7 @@ private static DigestingParser.Digester[] buildDigesters(int markLimit, String d DigestingParser.Encoder encoder = getEncoder(parts); DigestAlgorithm digestAlgorithm = getDigestAlgorithm(parts[0]); digesters[i++] = new InputStreamDigester(markLimit, digestAlgorithm.getJavaName(), - digestAlgorithm.name(), encoder); + digestAlgorithm.name(), encoder); } return digesters; } @@ -158,9 +153,9 @@ private static DigestingParser.Encoder getEncoder(String[] parts) { } public enum DigestAlgorithm { - //those currently available in commons.digest - MD2("MD2"), MD5("MD5"), SHA1("SHA-1"), SHA256("SHA-256"), SHA384("SHA-384"), - SHA512("SHA-512"); + // those currently available in commons.digest + MD2("MD2"), MD5("MD5"), SHA1("SHA-1"), SHA256("SHA-256"), SHA384("SHA-384"), SHA512( + "SHA-512"); private final String javaName; @@ -173,13 +168,14 @@ String getJavaName() { } String getMetadataKey() { - return TikaCoreProperties.TIKA_META_PREFIX + "digest" + - TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + this.toString(); + return TikaCoreProperties.TIKA_META_PREFIX + "digest" + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + this.toString(); } } private static abstract class CasingEncoderBase implements DigestingParser.Encoder { private final boolean upperCase; + private CasingEncoderBase(boolean upperCase) { this.upperCase = upperCase; } @@ -187,6 +183,7 @@ private CasingEncoderBase(boolean upperCase) { } private static class HexEncoder implements DigestingParser.Encoder { private final boolean upperCase; + private HexEncoder(boolean upperCase) { this.upperCase = upperCase; } @@ -200,7 +197,7 @@ String toCase(String digest) { if (upperCase) { return 
digest.toUpperCase(Locale.ROOT); } else { - //this is redundant, but useful for future proofing? + // this is redundant, but useful for future proofing? return digest.toLowerCase(Locale.ROOT); } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/CommonsDigesterFactory.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/CommonsDigesterFactory.java index d37f7acb10..9687ea8d1d 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/CommonsDigesterFactory.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/CommonsDigesterFactory.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.digestutils; @@ -20,8 +18,7 @@ import org.apache.tika.parser.DigestingParser; /** - * Simple factory for {@link CommonsDigester} with - * default markLimit = 1000000 and md5 digester. + * Simple factory for {@link CommonsDigester} with default markLimit = 1000000 and md5 digester. 
*/ public class CommonsDigesterFactory implements DigestingParser.DigesterFactory { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-font-module/src/main/java/org/apache/tika/parser/font/AdobeFontMetricParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-font-module/src/main/java/org/apache/tika/parser/font/AdobeFontMetricParser.java index aa4c3fed8a..d5ae9b8f25 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-font-module/src/main/java/org/apache/tika/parser/font/AdobeFontMetricParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-font-module/src/main/java/org/apache/tika/parser/font/AdobeFontMetricParser.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.font; @@ -22,12 +20,8 @@ import java.util.Collections; import java.util.List; import java.util.Set; - import org.apache.fontbox.afm.AFMParser; import org.apache.fontbox.afm.FontMetrics; -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.Property; @@ -36,6 +30,8 @@ import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; /** * Parser for AFM Font Files @@ -65,7 +61,7 @@ public Set getSupportedTypes(ParseContext context) { } public void parse(InputStream stream, ContentHandler handler, Metadata metadata, - ParseContext context) throws IOException, SAXException, TikaException { + ParseContext context) throws IOException, SAXException, TikaException { FontMetrics fontMetrics; AFMParser parser = new AFMParser(stream); @@ -74,7 +70,7 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, // Get the comments in the file to display in xhtml List unModifiableComments = fontMetrics.getComments(); - //have to copy because we modify list in extractCreationDate + // have to copy because we modify list in extractCreationDate List comments = new ArrayList<>(unModifiableComments); // Get the creation date extractCreationDate(metadata, comments); @@ -84,7 +80,7 @@ public void parse(InputStream stream, ContentHandler 
handler, Metadata metadata, // Add metadata associated with the font type addMetadataByString(metadata, MET_AVG_CHAR_WIDTH, - Float.toString(fontMetrics.getAverageCharacterWidth())); + Float.toString(fontMetrics.getAverageCharacterWidth())); addMetadataByString(metadata, MET_DOC_VERSION, Float.toString(fontMetrics.getAFMVersion())); addMetadataByString(metadata, MET_FONT_NAME, fontMetrics.getFontName()); addMetadataByString(metadata, MET_FONT_FULL_NAME, fontMetrics.getFullName()); @@ -93,7 +89,7 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, addMetadataByString(metadata, MET_FONT_WEIGHT, fontMetrics.getWeight()); addMetadataByString(metadata, MET_FONT_NOTICE, fontMetrics.getNotice()); addMetadataByString(metadata, MET_FONT_UNDERLINE_THICKNESS, - Float.toString(fontMetrics.getUnderlineThickness())); + Float.toString(fontMetrics.getUnderlineThickness())); // Output the remaining comments as text XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-font-module/src/main/java/org/apache/tika/parser/font/TrueTypeParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-font-module/src/main/java/org/apache/tika/parser/font/TrueTypeParser.java index b88ffdb749..822c2460bc 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-font-module/src/main/java/org/apache/tika/parser/font/TrueTypeParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-font-module/src/main/java/org/apache/tika/parser/font/TrueTypeParser.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.parser.font; @@ -20,7 +18,6 @@ import java.io.InputStream; import java.util.Collections; import java.util.Set; - import org.apache.commons.io.input.CloseShieldInputStream; import org.apache.fontbox.ttf.NameRecord; import org.apache.fontbox.ttf.NamingTable; @@ -29,9 +26,6 @@ import org.apache.pdfbox.io.RandomAccessRead; import org.apache.pdfbox.io.RandomAccessReadBuffer; import org.apache.pdfbox.io.RandomAccessReadBufferedFile; -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - import org.apache.tika.exception.TikaException; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; @@ -40,6 +34,8 @@ import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; /** * Parser for TrueType font files (TTF). @@ -60,7 +56,7 @@ public Set getSupportedTypes(ParseContext context) { } public void parse(InputStream stream, ContentHandler handler, Metadata metadata, - ParseContext context) throws IOException, SAXException, TikaException { + ParseContext context) throws IOException, SAXException, TikaException { TikaInputStream tis = TikaInputStream.cast(stream); // Ask FontBox to parse the file for us @@ -73,7 +69,7 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, } } else { try (RandomAccessRead rar = - new RandomAccessReadBuffer(CloseShieldInputStream.wrap(tis))) { + new RandomAccessReadBuffer(CloseShieldInputStream.wrap(tis))) { font = parser.parse(rar); } } @@ -83,7 +79,7 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, metadata.set(TikaCoreProperties.CREATED, font.getHeader().getCreated()); metadata.set(TikaCoreProperties.MODIFIED, font.getHeader().getModified()); metadata.set(AdobeFontMetricParser.MET_DOC_VERSION, - Float.toString(font.getHeader().getVersion())); + 
Float.toString(font.getHeader().getVersion())); // Pull out the naming info NamingTable fontNaming = font.getNaming(); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-font-module/src/test/java/org/apache/tika/parser/font/FontParsersTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-font-module/src/test/java/org/apache/tika/parser/font/FontParsersTest.java index 94815193f4..cd37bbcb9c 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-font-module/src/test/java/org/apache/tika/parser/font/FontParsersTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-font-module/src/test/java/org/apache/tika/parser/font/FontParsersTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.font; @@ -25,15 +23,14 @@ import static org.apache.tika.parser.font.AdobeFontMetricParser.MET_PS_NAME; import static org.junit.jupiter.api.Assertions.assertEquals; -import org.junit.jupiter.api.Test; -import org.xml.sax.ContentHandler; - import org.apache.tika.TikaTest; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.parser.ParseContext; import org.apache.tika.sax.BodyContentHandler; +import org.junit.jupiter.api.Test; +import org.xml.sax.ContentHandler; /** * Test case for parsing various different font files. @@ -46,8 +43,8 @@ public void testAdobeFontMetricParsing() throws Exception { Metadata metadata = new Metadata(); ParseContext context = new ParseContext(); - try (TikaInputStream stream = TikaInputStream - .get(getResourceAsUrl("/test-documents/testAFM.afm"))) { + try (TikaInputStream stream = + TikaInputStream.get(getResourceAsUrl("/test-documents/testAFM.afm"))) { AUTO_DETECT_PARSER.parse(stream, handler, metadata, context); } @@ -75,12 +72,12 @@ public void testTTFParsing() throws Exception { ContentHandler handler = new BodyContentHandler(); Metadata metadata = new Metadata(); ParseContext context = new ParseContext(); - //Open Sans font is ASL 2.0 according to - //http://www.google.com/fonts/specimen/Open+Sans - //...despite the copyright in the file's metadata. 
+ // Open Sans font is ASL 2.0 according to + // http://www.google.com/fonts/specimen/Open+Sans + // ...despite the copyright in the file's metadata. try (TikaInputStream stream = TikaInputStream - .get(getResourceAsUrl("/test-documents/testTrueType3.ttf"))) { + .get(getResourceAsUrl("/test-documents/testTrueType3.ttf"))) { AUTO_DETECT_PARSER.parse(stream, handler, metadata, context); } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/DataURIScheme.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/DataURIScheme.java index 0533b47676..e3b80eed27 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/DataURIScheme.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/DataURIScheme.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.html; @@ -20,9 +18,7 @@ import java.io.InputStream; import java.util.Arrays; import java.util.Objects; - import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream; - import org.apache.tika.mime.MediaType; public class DataURIScheme { @@ -43,8 +39,8 @@ public InputStream getInputStream() { } /** - * @return parsed media type or null if parse fails or if media type string was - * not specified + * @return parsed media type or null if parse fails or if media type string was not + * specified */ public MediaType getMediaType() { if (rawMediaTypeString != null) { @@ -66,9 +62,9 @@ public boolean equals(Object o) { return false; } DataURIScheme that = (DataURIScheme) o; - return isBase64() == that.isBase64() && - Objects.equals(rawMediaTypeString, that.rawMediaTypeString) && - Arrays.equals(data, that.data); + return isBase64() == that.isBase64() + && Objects.equals(rawMediaTypeString, that.rawMediaTypeString) + && Arrays.equals(data, that.data); } @Override diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/DataURISchemeParseException.java 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/DataURISchemeParseException.java index 9fae974a01..49d748f234 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/DataURISchemeParseException.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/DataURISchemeParseException.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
+ * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.html; diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/DataURISchemeUtil.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/DataURISchemeUtil.java index 5a7e66d99e..593b049b38 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/DataURISchemeUtil.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/DataURISchemeUtil.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.html; @@ -25,13 +23,11 @@ import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; - import org.apache.commons.codec.binary.Base64; - import org.apache.tika.mime.MediaType; /** - * Not thread safe. Create a separate util for each thread. + * Not thread safe. Create a separate util for each thread. 
*/ public class DataURISchemeUtil { @@ -39,7 +35,7 @@ public class DataURISchemeUtil { private static Pattern PARSE_PATTERN = Pattern.compile("(?s)data:([^,]*?)(base64)?,(.*)$"); private static Pattern EXTRACT_PATTERN = - Pattern.compile("(?s)data:([^,]*?)(base64)?,([^\"\']*)[\"\']"); + Pattern.compile("(?s)data:([^,]*?)(base64)?,([^\"\']*)[\"\']"); private final Matcher parseMatcher = PARSE_PATTERN.matcher(""); private final Matcher extractMatcher = EXTRACT_PATTERN.matcher(""); Base64 base64 = new Base64(); @@ -54,7 +50,7 @@ public DataURIScheme parse(String string) throws DataURISchemeParseException { private DataURIScheme build(String mediaTypeString, String isBase64, String dataString) { byte[] data = null; - //strip out back slashes as you might have in css + // strip out back slashes as you might have in css dataString = (dataString != null) ? dataString.replaceAll("\\\\", " ") : dataString; if (dataString == null || dataString.isEmpty()) { @@ -62,7 +58,7 @@ private DataURIScheme build(String mediaTypeString, String isBase64, String data } else if (isBase64 != null) { data = base64.decode(dataString); } else { - //TODO: handle encodings + // TODO: handle encodings MediaType mediaType = MediaType.parse(mediaTypeString); Charset charset = StandardCharsets.UTF_8; if (mediaType.hasParameters()) { @@ -71,7 +67,7 @@ private DataURIScheme build(String mediaTypeString, String isBase64, String data try { charset = Charset.forName(charsetName); } catch (IllegalCharsetNameException e) { - //swallow and default to UTF-8 + // swallow and default to UTF-8 } } } @@ -91,7 +87,7 @@ public List extract(String string) { List list = null; while (extractMatcher.find()) { DataURIScheme dataURIScheme = build(extractMatcher.group(1), extractMatcher.group(2), - extractMatcher.group(3)); + extractMatcher.group(3)); if (list == null) { list = new ArrayList<>(); } diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/DefaultHtmlMapper.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/DefaultHtmlMapper.java index 0056106e87..99153bb25d 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/DefaultHtmlMapper.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/DefaultHtmlMapper.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
+ * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.html; @@ -35,7 +33,8 @@ public class DefaultHtmlMapper implements HtmlMapper { */ public static final HtmlMapper INSTANCE = new DefaultHtmlMapper(); // Based on http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd - private static final Map SAFE_ELEMENTS = new HashMap() {{ + private static final Map SAFE_ELEMENTS = new HashMap() { + { put("H1", "h1"); put("H2", "h2"); put("H3", "h3"); @@ -79,38 +78,48 @@ public class DefaultHtmlMapper implements HtmlMapper { put("PARAM", "param"); put("INS", "ins"); put("DEL", "del"); - }}; - private static final Set DISCARDABLE_ELEMENTS = new HashSet() {{ + } + }; + private static final Set DISCARDABLE_ELEMENTS = new HashSet() { + { add("STYLE"); add("SCRIPT"); - }}; + } + }; // For information on tags & attributes, see: // http://www.w3.org/TR/2002/REC-xhtml1-20020801/dtds.html#a_dtd_XHTML-1.0-Strict // http://www.w3schools.com/TAGS/ private static final Map> SAFE_ATTRIBUTES = - new HashMap>() {{ - put("a", attrSet("charset", "type", "name", "href", "hreflang", "rel", "rev", - "shape", "coords")); - put("img", attrSet("src", "alt", "longdesc", "height", "width", "usemap", - "ismap")); - put("frame", attrSet("longdesc", "name", "src", "frameborder", "marginwidth", - "marginheight", "noresize", "scrolling")); - put("iframe", attrSet("longdesc", "name", "src", "frameborder", "marginwidth", - "marginheight", "scrolling", "align", "height", "width")); - put("link", attrSet("charset", "href", "hreflang", "type", "rel", "rev", "media")); - put("map", attrSet("id", "class", "style", "title", "name")); - put("area", attrSet("shape", "coords", "href", "nohref", "alt")); - put("object", 
attrSet("declare", "classid", "codebase", "data", "type", - "codetype", "archive", "standby", "height", "width", - "usemap", "name", "tabindex", "align", "border", "hspace", "vspace")); - put("param", attrSet("id", "name", "value", "valuetype", "type")); - put("blockquote", attrSet("cite")); - put("ins", attrSet("cite", "datetime")); - put("del", attrSet("cite", "datetime")); - put("q", attrSet("cite")); + new HashMap>() { + { + put("a", attrSet("charset", "type", "name", "href", "hreflang", "rel", + "rev", "shape", "coords")); + put("img", attrSet("src", "alt", "longdesc", "height", "width", + "usemap", "ismap")); + put("frame", attrSet("longdesc", "name", "src", "frameborder", + "marginwidth", "marginheight", "noresize", + "scrolling")); + put("iframe", attrSet("longdesc", "name", "src", "frameborder", + "marginwidth", "marginheight", "scrolling", "align", + "height", "width")); + put("link", attrSet("charset", "href", "hreflang", "type", "rel", "rev", + "media")); + put("map", attrSet("id", "class", "style", "title", "name")); + put("area", attrSet("shape", "coords", "href", "nohref", "alt")); + put("object", attrSet("declare", "classid", "codebase", "data", "type", + "codetype", "archive", "standby", "height", "width", + "usemap", "name", "tabindex", "align", "border", + "hspace", "vspace")); + put("param", attrSet("id", "name", "value", "valuetype", "type")); + put("blockquote", attrSet("cite")); + put("ins", attrSet("cite", "datetime")); + put("del", attrSet("cite", "datetime")); + put("q", attrSet("cite")); - // TODO - fill out this set. Include core, i18n, etc sets where appropriate. - }}; + // TODO - fill out this set. Include core, i18n, etc sets where + // appropriate. + } + }; private static Set attrSet(String... attrs) { return new HashSet<>(Arrays.asList(attrs)); @@ -121,8 +130,7 @@ public String mapSafeElement(String name) { } /** - * Normalizes an attribute name. 
Assumes that the element name - * is valid and normalized + * Normalizes an attribute name. Assumes that the element name is valid and normalized */ public String mapSafeAttribute(String elementName, String attributeName) { Set safeAttrs = SAFE_ATTRIBUTES.get(elementName); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlEncodingDetector.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlEncodingDetector.java index 9f66c9ac06..a0fe3c2b3c 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlEncodingDetector.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlEncodingDetector.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.html; @@ -29,17 +27,15 @@ import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; - import org.apache.tika.config.Field; import org.apache.tika.detect.EncodingDetector; import org.apache.tika.metadata.Metadata; import org.apache.tika.utils.CharsetUtils; /** - * Character encoding detector for determining the character encoding of a - * HTML document based on the potential charset parameter found in a - * Content-Type http-equiv meta tag somewhere near the beginning. Especially - * useful for determining the type among multiple closely related encodings + * Character encoding detector for determining the character encoding of a HTML document based on + * the potential charset parameter found in a Content-Type http-equiv meta tag somewhere near the + * beginning. Especially useful for determining the type among multiple closely related encodings * (ISO-8859-*) for which other types of encoding detection are unreliable. 
* * @since Apache Tika 1.2 @@ -49,34 +45,33 @@ public class HtmlEncodingDetector implements EncodingDetector { // TIKA-357 - use bigger buffer for meta tag sniffing (was 4K) private static final int DEFAULT_MARK_LIMIT = 8192; private static final Pattern HTTP_META_PATTERN = - Pattern.compile("(?is)<\\s*meta(?:/|\\s+)([^<>]+)"); - //this should match both the older: - // - //and - //html5 - //See http://webdesign.about.com/od/metatags/qt/meta-charset.htm - //for the noisiness that one might encounter in charset attrs. - //Chose to go with strict ([-_:\\.a-z0-9]+) to match encodings - //following http://docs.oracle.com/javase/7/docs/api/java/nio/charset/Charset.html - //For a more general "not" matcher, try: - //("(?is)charset\\s*=\\s*['\\\"]?\\s*([^<>\\s'\\\";]+)") + Pattern.compile("(?is)<\\s*meta(?:/|\\s+)([^<>]+)"); + // this should match both the older: + // + // and + // html5 + // See http://webdesign.about.com/od/metatags/qt/meta-charset.htm + // for the noisiness that one might encounter in charset attrs. + // Chose to go with strict ([-_:\\.a-z0-9]+) to match encodings + // following http://docs.oracle.com/javase/7/docs/api/java/nio/charset/Charset.html + // For a more general "not" matcher, try: + // ("(?is)charset\\s*=\\s*['\\\"]?\\s*([^<>\\s'\\\";]+)") private static final Pattern FLEXIBLE_CHARSET_ATTR_PATTERN = - Pattern.compile(("(?is)\\bcharset\\s*=\\s*(?:['\\\"]\\s*)?([-_:\\.a-z0-9]+)")); + Pattern.compile(("(?is)\\bcharset\\s*=\\s*(?:['\\\"]\\s*)?([-_:\\.a-z0-9]+)")); private static final Charset ASCII = Charset.forName("US-ASCII"); /** - * HTML can include non-iana supported charsets that Java - * recognizes, e.g. "unicode". This can lead to incorrect detection/mojibake. - * Ignore charsets in html meta-headers that are not supported by IANA. - * See: TIKA-2592 + * HTML can include non-iana supported charsets that Java recognizes, e.g. "unicode". This can + * lead to incorrect detection/mojibake. 
Ignore charsets in html meta-headers that are not + * supported by IANA. See: TIKA-2592 */ private static Set CHARSETS_UNSUPPORTED_BY_IANA; static { Set unsupported = new HashSet<>(); try (BufferedReader reader = new BufferedReader(new InputStreamReader( - HtmlEncodingDetector.class - .getResourceAsStream("StandardCharsets_unsupported_by_IANA.txt"), - StandardCharsets.UTF_8))) { + HtmlEncodingDetector.class.getResourceAsStream( + "StandardCharsets_unsupported_by_IANA.txt"), + StandardCharsets.UTF_8))) { String line = reader.readLine(); while (line != null) { if (line.startsWith("#")) { @@ -91,7 +86,7 @@ public class HtmlEncodingDetector implements EncodingDetector { } } catch (IOException e) { throw new IllegalArgumentException( - "couldn't find StandardCharsets_unsupported_by_IANA.txt on the class path"); + "couldn't find StandardCharsets_unsupported_by_IANA.txt on the class path"); } CHARSETS_UNSUPPORTED_BY_IANA = Collections.unmodifiableSet(unsupported); } @@ -119,11 +114,11 @@ public Charset detect(InputStream input, Metadata metadata) throws IOException { // a possible character encoding hint String head = ASCII.decode(ByteBuffer.wrap(buffer, 0, n)).toString(); - //strip out comments + // strip out comments String headNoComments = head.replaceAll("|$)", " "); - //try to find the encoding in head without comments + // try to find the encoding in head without comments Charset charset = findCharset(headNoComments); - //if nothing is found, back off to find any encoding + // if nothing is found, back off to find any encoding if (charset == null) { return findCharset(head); } @@ -131,17 +126,17 @@ public Charset detect(InputStream input, Metadata metadata) throws IOException { } - //returns null if no charset was found + // returns null if no charset was found private Charset findCharset(String s) { Matcher equiv = HTTP_META_PATTERN.matcher(s); Matcher charsetMatcher = FLEXIBLE_CHARSET_ATTR_PATTERN.matcher(""); - //iterate through meta tags + // iterate through 
meta tags while (equiv.find()) { String attrs = equiv.group(1); charsetMatcher.reset(attrs); - //iterate through charset= and return the first match - //that is valid + // iterate through charset= and return the first match + // that is valid while (charsetMatcher.find()) { String candCharset = charsetMatcher.group(1); if (CHARSETS_UNSUPPORTED_BY_IANA.contains(candCharset.toLowerCase(Locale.US))) { @@ -155,7 +150,7 @@ private Charset findCharset(String s) { try { return CharsetUtils.forName(candCharset); } catch (IllegalArgumentException e) { - //ignore + // ignore } } } @@ -168,8 +163,7 @@ public int getMarkLimit() { } /** - * How far into the stream to read for charset detection. - * Default is 8192. + * How far into the stream to read for charset detection. Default is 8192. * * @param markLimit */ diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java index b613f39399..a973a954b8 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.parser.html; @@ -29,12 +27,6 @@ import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; - -import org.xml.sax.Attributes; -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; -import org.xml.sax.helpers.AttributesImpl; - import org.apache.tika.extractor.EmbeddedDocumentExtractor; import org.apache.tika.extractor.EmbeddedDocumentUtil; import org.apache.tika.io.TikaInputStream; @@ -48,14 +40,18 @@ import org.apache.tika.sax.TextContentHandler; import org.apache.tika.sax.XHTMLContentHandler; import org.apache.tika.utils.StringUtils; +import org.xml.sax.Attributes; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.AttributesImpl; class HtmlHandler extends TextContentHandler { // List of attributes that need to be resolved. private static final Set URI_ATTRIBUTES = - new HashSet<>(Arrays.asList("src", "href", "longdesc", "cite")); + new HashSet<>(Arrays.asList("src", "href", "longdesc", "cite")); private static final Pattern ICBM = - Pattern.compile("\\s*(-?\\d+\\.\\d+)[,\\s]+(-?\\d+\\.\\d+)\\s*"); + Pattern.compile("\\s*(-?\\d+\\.\\d+)[,\\s]+(-?\\d+\\.\\d+)\\s*"); private static final Map META_HEADER_MAPPINGS = new HashMap<>(); @@ -79,11 +75,11 @@ class HtmlHandler extends TextContentHandler { private int discardLevel = 0; private int titleLevel = 0; private int scriptLevel = 0; - private Attributes scriptAtts = EMPTY_ATTS;//attributes from outermost script element + private Attributes scriptAtts = EMPTY_ATTS;// attributes from outermost script element private boolean isTitleSetToMetadata = false; private HtmlHandler(HtmlMapper mapper, XHTMLContentHandler xhtml, Metadata metadata, - ParseContext context, boolean extractScripts) { + ParseContext context, boolean extractScripts) { super(xhtml); this.mapper = mapper; this.xhtml = xhtml; @@ -106,13 +102,13 @@ private HtmlHandler(HtmlMapper mapper, XHTMLContentHandler xhtml, Metadata metad } public 
HtmlHandler(HtmlMapper mapper, ContentHandler handler, Metadata metadata, - ParseContext context, boolean extractScripts) { + ParseContext context, boolean extractScripts) { this(mapper, new XHTMLContentHandler(handler, metadata), metadata, context, extractScripts); } @Override public void startElement(String uri, String local, String name, Attributes atts) - throws SAXException { + throws SAXException { if ("HTML".equals(name) && atts.getValue("lang") != null) { metadata.set(Metadata.CONTENT_LANGUAGE, atts.getValue("lang")); @@ -142,7 +138,8 @@ public void startElement(String uri, String local, String name, Attributes atts) addHtmlMetadata(atts.getValue("name"), atts.getValue("content")); } else if (atts.getValue("property") != null) { // TIKA-983: Handle tags - metadata.add(HTML.PREFIX_HTML_META + atts.getValue("property"), atts.getValue("content")); + metadata.add(HTML.PREFIX_HTML_META + atts.getValue("property"), + atts.getValue("content")); } } else if ("BASE".equals(name) && atts.getValue("href") != null) { startElementWithSafeAttributes("base", atts); @@ -166,8 +163,8 @@ public void startElement(String uri, String local, String name, Attributes atts) title.setLength(0); String value = atts.getValue("src"); if (value != null && value.startsWith("data:")) { - //don't extract data if we're in a script - //and the user doesn't want to extract scripts + // don't extract data if we're in a script + // and the user doesn't want to extract scripts if (scriptLevel == 0 || extractScripts) { handleDataURIScheme(value); } @@ -181,12 +178,12 @@ public void startElement(String uri, String local, String name, Attributes atts) } /** - * Adds a metadata setting from the HTML to the Tika metadata - * object. The name and value are normalized where possible. + * Adds a metadata setting from the HTML to the Tika metadata object. The name and value + * are normalized where possible. 
*/ private void addHtmlMetadata(String name, String value) { - //note that "name" derives from attributes and is not uppercased - //like the elements by the XHTMLDowngradeHandler + // note that "name" derives from attributes and is not uppercased + // like the elements by the XHTMLDowngradeHandler if (StringUtils.isBlank(name) || StringUtils.isBlank(value)) { return; @@ -205,7 +202,7 @@ private void addHtmlMetadata(String name, String value) { } if (name.equalsIgnoreCase(Metadata.CONTENT_TYPE)) { - //don't overwrite Metadata.CONTENT_TYPE! + // don't overwrite Metadata.CONTENT_TYPE! MediaType type = MediaType.parse(value); if (type != null) { metadata.set(TikaCoreProperties.CONTENT_TYPE_HINT, type.toString()); @@ -219,8 +216,8 @@ private void addHtmlMetadata(String name, String value) { if (META_HEADER_MAPPINGS.containsKey(lcName)) { Property property = META_HEADER_MAPPINGS.get(lcName); if (property.equals(TikaCoreProperties.TITLE) && isTitleSetToMetadata) { - //prefer the title element if it is already set - //do nothing + // prefer the title element if it is already set + // do nothing metadata.add(HTML.PREFIX_HTML_META + TikaCoreProperties.TITLE.getName(), value); } else if (property.isMultiValuePermitted()) { metadata.add(property, value); @@ -263,9 +260,9 @@ private void startElementWithSafeAttributes(String name, Attributes atts) throws // And resolve relative links. Eventually this should be pushed // into the HtmlMapper code. 
if (URI_ATTRIBUTES.contains(normAttrName)) { - //if this is a src="data: " element, - //we've handled that as an embedded file, don't include the full thing - //here + // if this is a src="data: " element, + // we've handled that as an embedded file, don't include the full thing + // here if (normAttrName.equals("src")) { String v = newAttributes.getValue(att); if (v.startsWith("data:")) { @@ -275,8 +272,8 @@ private void startElementWithSafeAttributes(String name, Attributes atts) throws newAttributes.setValue(att, resolve(newAttributes.getValue(att))); } else if (isObject && "codebase".equals(normAttrName)) { newAttributes.setValue(att, codebase); - } else if (isObject && - ("data".equals(normAttrName) || "classid".equals(normAttrName))) { + } else if (isObject && ("data".equals(normAttrName) + || "classid".equals(normAttrName))) { newAttributes.setValue(att, resolve(codebase, newAttributes.getValue(att))); } } @@ -331,16 +328,18 @@ public void endElement(String uri, String local, String name) throws SAXExceptio discardLevel--; } } + private void handleSrcDoc(String string) throws SAXException { Metadata m = new Metadata(); m.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, - TikaCoreProperties.EmbeddedResourceType.INLINE.toString()); + TikaCoreProperties.EmbeddedResourceType.INLINE.toString()); m.set(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE, "text/html"); - //TODO add metadata about iframe content? + // TODO add metadata about iframe content? 
EmbeddedDocumentExtractor embeddedDocumentExtractor = - EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context); + EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context); if (embeddedDocumentExtractor.shouldParseEmbedded(m)) { - try (TikaInputStream tis = TikaInputStream.get(string.getBytes(StandardCharsets.UTF_8))) { + try (TikaInputStream tis = + TikaInputStream.get(string.getBytes(StandardCharsets.UTF_8))) { embeddedDocumentExtractor.parseEmbedded(tis, xhtml, m, true); } catch (IOException e) { EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata); @@ -353,19 +352,19 @@ private void handleDataURIScheme(String string) throws SAXException { try { dataURIScheme = dataURISchemeUtil.parse(string); } catch (DataURISchemeParseException e) { - //swallow + // swallow return; } - //do anything with attrs? + // do anything with attrs? Metadata m = new Metadata(); m.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, - TikaCoreProperties.EmbeddedResourceType.INLINE.toString()); + TikaCoreProperties.EmbeddedResourceType.INLINE.toString()); if (dataURIScheme.getMediaType() != null) { m.set(Metadata.CONTENT_TYPE, dataURIScheme.getMediaType().toString()); } EmbeddedDocumentExtractor embeddedDocumentExtractor = - EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context); + EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context); if (embeddedDocumentExtractor.shouldParseEmbedded(m)) { try (TikaInputStream tis = TikaInputStream.get(dataURIScheme.getInputStream())) { embeddedDocumentExtractor.parseEmbedded(tis, xhtml, m, true); @@ -376,43 +375,43 @@ private void handleDataURIScheme(String string) throws SAXException { } private void writeScript() throws SAXException { - //don't write an attached macro if there is no content - //we may want to revisit this behavior + // don't write an attached macro if there is no content + // we may want to revisit this behavior if (script.toString().isBlank()) { return; } - //do anything with attrs? + // do anything with attrs? 
Metadata m = new Metadata(); m.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, - TikaCoreProperties.EmbeddedResourceType.MACRO.toString()); + TikaCoreProperties.EmbeddedResourceType.MACRO.toString()); String src = scriptAtts.getValue("src"); if (src != null) { m.set(HTML.SCRIPT_SOURCE, src); } EmbeddedDocumentExtractor embeddedDocumentExtractor = - EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context); - //try to scrape dataURISchemes from javascript + EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context); + // try to scrape dataURISchemes from javascript List dataURISchemes = dataURISchemeUtil.extract(script.toString()); for (DataURIScheme dataURIScheme : dataURISchemes) { Metadata dataUriMetadata = new Metadata(); dataUriMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, - TikaCoreProperties.EmbeddedResourceType.INLINE.toString()); + TikaCoreProperties.EmbeddedResourceType.INLINE.toString()); dataUriMetadata.set(Metadata.CONTENT_TYPE, dataURIScheme.getMediaType().toString()); if (embeddedDocumentExtractor.shouldParseEmbedded(dataUriMetadata)) { try (TikaInputStream tis = TikaInputStream.get(dataURIScheme.getInputStream())) { - embeddedDocumentExtractor - .parseEmbedded(tis, xhtml, dataUriMetadata, true); + embeddedDocumentExtractor.parseEmbedded(tis, xhtml, dataUriMetadata, true); } catch (IOException e) { - //swallow + // swallow } } } - try (TikaInputStream tis = TikaInputStream.get(script.toString().getBytes(StandardCharsets.UTF_8))) { + try (TikaInputStream tis = + TikaInputStream.get(script.toString().getBytes(StandardCharsets.UTF_8))) { embeddedDocumentExtractor.parseEmbedded(tis, xhtml, m, true); } catch (IOException e) { - //shouldn't ever happen + // shouldn't ever happen } finally { script.setLength(0); } @@ -449,9 +448,9 @@ private String resolve(String base, String url) { // Return the URL as-is if no base URL is available or if the URL // matches a common non-hierarchical or pseudo URI prefix String lower = 
url.toLowerCase(Locale.ENGLISH); - if (base == null || lower.startsWith("urn:") || lower.startsWith("mailto:") || - lower.startsWith("tel:") || lower.startsWith("data:") || - lower.startsWith("javascript:") || lower.startsWith("about:")) { + if (base == null || lower.startsWith("urn:") || lower.startsWith("mailto:") + || lower.startsWith("tel:") || lower.startsWith("data:") + || lower.startsWith("javascript:") || lower.startsWith("about:")) { return url; } @@ -465,7 +464,7 @@ private String resolve(String base, String url) { String path = baseURL.getPath(); if (url.startsWith("?") && path.length() > 0 && !path.endsWith("/")) { return new URL(baseURL.getProtocol(), baseURL.getHost(), baseURL.getPort(), - baseURL.getPath() + url).toExternalForm(); + baseURL.getPath() + url).toExternalForm(); } else { return new URL(baseURL, url).toExternalForm(); } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlMapper.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlMapper.java index 1ca74345e9..aabc13e7f1 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlMapper.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlMapper.java @@ -1,68 +1,61 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.html; /** - * HTML mapper used to make incoming HTML documents easier to handle by - * Tika clients. The {@link HtmlParser} looks up an optional HTML mapper from - * the parse context and uses it to map parsed HTML to "safe" XHTML. A client - * that wants to customize this mapping can place a custom HtmlMapper instance - * into the parse context. + * HTML mapper used to make incoming HTML documents easier to handle by Tika clients. The + * {@link HtmlParser} looks up an optional HTML mapper from the parse context and uses it to map + * parsed HTML to "safe" XHTML. A client that wants to customize this mapping can place a custom + * HtmlMapper instance into the parse context. 
* * @since Apache Tika 0.6 */ public interface HtmlMapper { /** - * Maps "safe" HTML element names to semantic XHTML equivalents. If the - * given element is unknown or deemed unsafe for inclusion in the parse - * output, then this method returns null and the element - * will be ignored but the content inside it is still processed. See - * the {@link #isDiscardElement(String)} method for a way to discard - * the entire contents of an element. + * Maps "safe" HTML element names to semantic XHTML equivalents. If the given element is unknown + * or deemed unsafe for inclusion in the parse output, then this method returns + * null and the element will be ignored but the content inside it is still + * processed. See the {@link #isDiscardElement(String)} method for a way to discard the entire + * contents of an element. * * @param name HTML element name (upper case) - * @return XHTML element name (lower case), or - * null if the element is unsafe + * @return XHTML element name (lower case), or null if the element is unsafe */ String mapSafeElement(String name); /** - * Checks whether all content within the given HTML element should be - * discarded instead of including it in the parse output. + * Checks whether all content within the given HTML element should be discarded instead of + * including it in the parse output. * * @param name HTML element name (upper case) - * @return true if content inside the named element - * should be ignored, false otherwise + * @return true if content inside the named element should be ignored, + * false otherwise */ boolean isDiscardElement(String name); /** - * Maps "safe" HTML attribute names to semantic XHTML equivalents. If the - * given attribute is unknown or deemed unsafe for inclusion in the parse - * output, then this method returns null and the attribute - * will be ignored. This method assumes that the element name - * is valid and normalised. + * Maps "safe" HTML attribute names to semantic XHTML equivalents. 
If the given attribute is + * unknown or deemed unsafe for inclusion in the parse output, then this method returns + * null and the attribute will be ignored. This method assumes that the element + * name is valid and normalised. * - * @param elementName HTML element name (lower case) + * @param elementName HTML element name (lower case) * @param attributeName HTML attribute name (lower case) - * @return XHTML attribute name (lower case), or - * null if the element is unsafe + * @return XHTML attribute name (lower case), or null if the element is unsafe */ String mapSafeAttribute(String elementName, String attributeName); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/IdentityHtmlMapper.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/IdentityHtmlMapper.java index da046aaae6..65fd0d2cfd 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/IdentityHtmlMapper.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/IdentityHtmlMapper.java @@ -1,26 +1,23 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. 
The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.html; import java.util.Locale; /** - * Alternative HTML mapping rules that pass the input HTML as-is without any - * modifications. + * Alternative HTML mapping rules that pass the input HTML as-is without any modifications. 
* * @since Apache Tika 0.8 */ diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/JSoupParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/JSoupParser.java index 36115a08f2..c2e7a12897 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/JSoupParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/JSoupParser.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
+ * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.html; @@ -28,8 +26,14 @@ import java.util.Iterator; import java.util.Set; import javax.xml.XMLConstants; - import org.apache.commons.io.input.CloseShieldInputStream; +import org.apache.tika.config.Field; +import org.apache.tika.detect.EncodingDetector; +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AbstractEncodingDetectorParser; +import org.apache.tika.parser.ParseContext; import org.jsoup.Jsoup; import org.jsoup.nodes.Attribute; import org.jsoup.nodes.DataNode; @@ -45,19 +49,10 @@ import org.xml.sax.SAXException; import org.xml.sax.helpers.AttributesImpl; -import org.apache.tika.config.Field; -import org.apache.tika.detect.EncodingDetector; -import org.apache.tika.exception.TikaException; -import org.apache.tika.metadata.Metadata; -import org.apache.tika.mime.MediaType; -import org.apache.tika.parser.AbstractEncodingDetectorParser; -import org.apache.tika.parser.ParseContext; - /** - * HTML parser. Uses JSoup to turn the input document to HTML SAX events, - * and post-processes the events to produce XHTML and metadata expected by - * Tika clients. + * HTML parser. Uses JSoup to turn the input document to HTML SAX events, and post-processes the + * events to produce XHTML and metadata expected by Tika clients. 
*/ public class JSoupParser extends AbstractEncodingDetectorParser { @@ -72,14 +67,16 @@ public class JSoupParser extends AbstractEncodingDetectorParser { private static final MediaType WAP_XHTML = MediaType.application("vnd.wap.xhtml+xml"); private static final MediaType X_ASP = MediaType.application("x-asp"); - private static final Set SUPPORTED_TYPES = Collections.unmodifiableSet( - new HashSet(Arrays.asList(MediaType.text("html"), XHTML, WAP_XHTML, X_ASP))); + private static final Set SUPPORTED_TYPES = + Collections.unmodifiableSet(new HashSet(Arrays + .asList(MediaType.text("html"), XHTML, WAP_XHTML, X_ASP))); private static final TagSet SELF_CLOSEABLE_TAGS = TagSet.Html(); static { try (BufferedReader reader = new BufferedReader(new InputStreamReader( - JSoupParser.class.getResourceAsStream("self-closeable-tags.txt"), StandardCharsets.UTF_8))) { + JSoupParser.class.getResourceAsStream("self-closeable-tags.txt"), + StandardCharsets.UTF_8))) { String line = reader.readLine(); while (line != null) { if (line.startsWith("#") || line.trim().isEmpty()) { @@ -115,8 +112,7 @@ public boolean isExtractScripts() { } /** - * Whether or not to extract contents in script entities. - * Default is false + * Whether or not to extract contents in script entities. 
Default is false * * @param extractScripts */ @@ -127,7 +123,7 @@ public void setExtractScripts(boolean extractScripts) { public void parse(InputStream stream, ContentHandler handler, Metadata metadata, - ParseContext context) throws IOException, SAXException, TikaException { + ParseContext context) throws IOException, SAXException, TikaException { EncodingDetector encodingDetector = getEncodingDetector(context); Charset charset = encodingDetector.detect(stream, metadata); @@ -153,19 +149,17 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, HtmlMapper mapper = context.get(HtmlMapper.class, new DefaultHtmlMapper()); TagSet tagSet = new TagSet(SELF_CLOSEABLE_TAGS); - /* TODO -- when we upgrade jsoup to 1.21.1 - .onNewTag(tag -> { - if (!tag.isKnownTag()) - tag.set(Tag.SelfClose); - }); - */ - - //do better with baseUri? + /* + * TODO -- when we upgrade jsoup to 1.21.1 .onNewTag(tag -> { if (!tag.isKnownTag()) + * tag.set(Tag.SelfClose); }); + */ + + // do better with baseUri? 
Document document = Jsoup.parse(CloseShieldInputStream.wrap(stream), charset.name(), "", - Parser.htmlParser().tagSet(tagSet)); + Parser.htmlParser().tagSet(tagSet)); document.quirksMode(Document.QuirksMode.quirks); ContentHandler xhtml = new XHTMLDowngradeHandler( - new HtmlHandler(mapper, handler, metadata, context, extractScripts)); + new HtmlHandler(mapper, handler, metadata, context, extractScripts)); xhtml.startDocument(); try { NodeTraversor.filter(new TikaNodeFilter(xhtml), document); @@ -176,15 +170,16 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, } } - public void parseString(String html, ContentHandler handler, Metadata metadata, ParseContext context) throws SAXException { + public void parseString(String html, ContentHandler handler, Metadata metadata, + ParseContext context) throws SAXException { // Get the HTML mapper from the parse context HtmlMapper mapper = context.get(HtmlMapper.class, new DefaultHtmlMapper()); - //do better with baseUri? + // do better with baseUri? Document document = Jsoup.parse(html, Parser.htmlParser().tagSet(SELF_CLOSEABLE_TAGS)); document.quirksMode(Document.QuirksMode.quirks); ContentHandler xhtml = new XHTMLDowngradeHandler( - new HtmlHandler(mapper, handler, metadata, context, extractScripts)); + new HtmlHandler(mapper, handler, metadata, context, extractScripts)); xhtml.startDocument(); try { NodeTraversor.filter(new TikaNodeFilter(xhtml), document); @@ -219,8 +214,8 @@ public NodeFilter.FilterResult head(Node node, int i) { } return FilterResult.CONTINUE; } else if (node instanceof DataNode) { - //maybe handle script data directly here instead of - //passing it through to the HTMLHandler? + // maybe handle script data directly here instead of + // passing it through to the HTMLHandler? 
String txt = ((DataNode) node).getWholeData(); if (txt != null) { char[] chars = txt.toCharArray(); @@ -239,11 +234,11 @@ public NodeFilter.FilterResult head(Node node, int i) { while (jsoupAttrs.hasNext()) { Attribute jsoupAttr = jsoupAttrs.next(); attributes.addAttribute("", jsoupAttr.getKey(), jsoupAttr.getKey(), "", - jsoupAttr.getValue()); + jsoupAttr.getValue()); } try { handler.startElement(XMLConstants.NULL_NS_URI, node.nodeName(), node.nodeName(), - attributes); + attributes); } catch (SAXException e) { throw new RuntimeSAXException(e); } @@ -277,8 +272,8 @@ SAXException getWrapped() { } /** - * Look for an EncodingDetetor in the ParseContext. If it hasn't been - * passed in, use the original EncodingDetector from initialization. + * Look for an EncodingDetetor in the ParseContext. If it hasn't been passed in, use the + * original EncodingDetector from initialization. * * @param parseContext * @return diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/XHTMLDowngradeHandler.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/XHTMLDowngradeHandler.java index 7ed05d63be..c6500f8c4c 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/XHTMLDowngradeHandler.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/XHTMLDowngradeHandler.java @@ -1,37 +1,32 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.html; import java.util.Locale; import javax.xml.XMLConstants; - +import org.apache.tika.sax.ContentHandlerDecorator; import org.xml.sax.Attributes; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; import org.xml.sax.helpers.AttributesImpl; -import org.apache.tika.sax.ContentHandlerDecorator; - /** - * Content handler decorator that downgrades XHTML elements to - * old-style HTML elements before passing them on to the decorated - * content handler. This downgrading consists of dropping all namespaces - * (and namespaced attributes) and uppercasing all element names. 
- * Used by the {@link HtmlParser} to make all incoming HTML look the same. + * Content handler decorator that downgrades XHTML elements to old-style HTML elements before + * passing them on to the decorated content handler. This downgrading consists of dropping all + * namespaces (and namespaced attributes) and uppercasing all element names. Used by the + * {@link HtmlParser} to make all incoming HTML look the same. */ class XHTMLDowngradeHandler extends ContentHandlerDecorator { @@ -41,7 +36,7 @@ public XHTMLDowngradeHandler(ContentHandler handler) { @Override public void startElement(String uri, String localName, String name, Attributes atts) - throws SAXException { + throws SAXException { String upper = localName.toUpperCase(Locale.ENGLISH); AttributesImpl attributes = new AttributesImpl(); @@ -49,9 +44,8 @@ public void startElement(String uri, String localName, String name, Attributes a String auri = atts.getURI(i); String local = atts.getLocalName(i); String qname = atts.getQName(i); - if (XMLConstants.NULL_NS_URI.equals(auri) && - !local.equals(XMLConstants.XMLNS_ATTRIBUTE) && - !qname.startsWith(XMLConstants.XMLNS_ATTRIBUTE + ":")) { + if (XMLConstants.NULL_NS_URI.equals(auri) && !local.equals(XMLConstants.XMLNS_ATTRIBUTE) + && !qname.startsWith(XMLConstants.XMLNS_ATTRIBUTE + ":")) { attributes.addAttribute(auri, local, qname, atts.getType(i), atts.getValue(i)); } } @@ -66,11 +60,9 @@ public void endElement(String uri, String localName, String name) throws SAXExce } @Override - public void startPrefixMapping(String prefix, String uri) { - } + public void startPrefixMapping(String prefix, String uri) {} @Override - public void endPrefixMapping(String prefix) { - } + public void endPrefixMapping(String prefix) {} } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/charsetdetector/CharsetAliases.java 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/charsetdetector/CharsetAliases.java index fa055bda23..33a6270fa7 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/charsetdetector/CharsetAliases.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/charsetdetector/CharsetAliases.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
+ * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.html.charsetdetector; @@ -24,7 +22,6 @@ import java.util.HashMap; import java.util.Locale; import java.util.Map; - import org.apache.tika.parser.html.charsetdetector.charsets.ReplacementCharset; import org.apache.tika.parser.html.charsetdetector.charsets.XUserDefinedCharset; @@ -36,8 +33,7 @@ final class CharsetAliases { private static final Map charsetsByLabel = new HashMap<>(); - private CharsetAliases() { - } + private CharsetAliases() {} /** * @param label a charset name @@ -61,42 +57,43 @@ private static void addAll() { addCharset(charset("Big5"), "big5", "big5-hkscs", "cn-big5", "csbig5", "x-x-big5"); addCharset(charset("EUC-JP"), "cseucpkdfmtjapanese", "euc-jp", "x-euc-jp"); addCharset(charset("EUC-KR"), "cseuckr", "csksc56011987", "euc-kr", "iso-ir-149", "korean", - "ks_c_5601-1987", "ks_c_5601-1989", "ksc5601", "ksc_5601", "windows-949"); + "ks_c_5601-1987", "ks_c_5601-1989", "ksc5601", "ksc_5601", "windows-949"); addCharset(charset("GBK"), "chinese", "csgb2312", "csiso58gb231280", "gb2312", "gb_2312", - "gb_2312-80", "gbk", "iso-ir-58", "x-gbk"); + "gb_2312-80", "gbk", "iso-ir-58", "x-gbk"); addCharset(charset("IBM866"), "866", "cp866", "csibm866", "ibm866"); addCharset(charset("ISO-2022-JP"), "csiso2022jp", "iso-2022-jp"); addCharset(charset("ISO-8859-10", "ISO-8859-4"), "csisolatin6", "iso-8859-10", "iso-ir-157", - "iso8859-10", "iso885910", "l6", "latin6"); + "iso8859-10", "iso885910", "l6", "latin6"); addCharset(charset("ISO-8859-13"), "iso-8859-13", "iso8859-13", "iso885913"); addCharset(charset("ISO-8859-14", "ISO-8859-1"), "iso-8859-14", "iso8859-14", "iso885914"); addCharset(charset("ISO-8859-15"), 
"csisolatin9", "iso-8859-15", "iso8859-15", "iso885915", - "iso_8859-15", "l9"); + "iso_8859-15", "l9"); addCharset(charset("ISO-8859-16", "ISO-8859-1"), "iso-8859-16"); addCharset(charset("ISO-8859-2"), "csisolatin2", "iso-8859-2", "iso-ir-101", "iso8859-2", - "iso88592", "iso_8859-2", "iso_8859-2:1987", "l2", "latin2"); + "iso88592", "iso_8859-2", "iso_8859-2:1987", "l2", "latin2"); addCharset(charset("ISO-8859-3"), "csisolatin3", "iso-8859-3", "iso-ir-109", "iso8859-3", - "iso88593", "iso_8859-3", "iso_8859-3:1988", "l3", "latin3"); + "iso88593", "iso_8859-3", "iso_8859-3:1988", "l3", "latin3"); addCharset(charset("ISO-8859-4"), "csisolatin4", "iso-8859-4", "iso-ir-110", "iso8859-4", - "iso88594", "iso_8859-4", "iso_8859-4:1988", "l4", "latin4"); + "iso88594", "iso_8859-4", "iso_8859-4:1988", "l4", "latin4"); addCharset(charset("ISO-8859-5"), "csisolatincyrillic", "cyrillic", "iso-8859-5", - "iso-ir-144", "iso8859-5", "iso88595", "iso_8859-5", "iso_8859-5:1988"); + "iso-ir-144", "iso8859-5", "iso88595", "iso_8859-5", "iso_8859-5:1988"); addCharset(charset("ISO-8859-6"), "arabic", "asmo-708", "csiso88596e", "csiso88596i", - "csisolatinarabic", "ecma-114", "iso-8859-6", "iso-8859-6-e", "iso-8859-6-i", - "iso-ir-127", "iso8859-6", "iso88596", "iso_8859-6", "iso_8859-6:1987"); + "csisolatinarabic", "ecma-114", "iso-8859-6", "iso-8859-6-e", + "iso-8859-6-i", "iso-ir-127", "iso8859-6", "iso88596", "iso_8859-6", + "iso_8859-6:1987"); addCharset(charset("ISO-8859-7"), "csisolatingreek", "ecma-118", "elot_928", "greek", - "greek8", "iso-8859-7", "iso-ir-126", "iso8859-7", "iso88597", "iso_8859-7", - "iso_8859-7:1987", "sun_eu_greek"); + "greek8", "iso-8859-7", "iso-ir-126", "iso8859-7", "iso88597", "iso_8859-7", + "iso_8859-7:1987", "sun_eu_greek"); // ISO-8859-8 actually should have an influence on the layout direction // (text should be decoded in the visual order). However, this is not implemented in tika. 
addCharset(charset("ISO-8859-8"), "csiso88598e", "csisolatinhebrew", "hebrew", "iso-8859-8", - "iso-8859-8-e", "iso-ir-138", "iso8859-8", "iso88598", "iso_8859-8", - "iso_8859-8:1988", "visual"); + "iso-8859-8-e", "iso-ir-138", "iso8859-8", "iso88598", "iso_8859-8", + "iso_8859-8:1988", "visual"); addCharset(charset("ISO-8859-8-I", "ISO-8859-8"), "csiso88598i", "iso-8859-8-i", "logical"); addCharset(charset("KOI8-R"), "cskoi8r", "koi", "koi8", "koi8-r", "koi8_r"); addCharset(charset("KOI8-U"), "koi8-ru", "koi8-u"); addCharset(charset("Shift_JIS"), "csshiftjis", "ms932", "ms_kanji", "shift-jis", - "shift_jis", "sjis", "windows-31j", "x-sjis"); + "shift_jis", "sjis", "windows-31j", "x-sjis"); addCharset(charset("UTF-16BE"), "utf-16be"); addCharset(charset("UTF-16LE"), "utf-16", "utf-16le"); addCharset(charset("UTF-8"), "unicode-1-1-utf-8", "utf-8", "utf8"); @@ -104,33 +101,33 @@ private static void addAll() { addCharset(charset("windows-1250"), "cp1250", "windows-1250", "x-cp1250"); addCharset(charset("windows-1251"), "cp1251", "windows-1251", "x-cp1251"); addCharset(charset("windows-1252"), "ansi_x3.4-1968", "ascii", "cp1252", "cp819", - "csisolatin1", "ibm819", "iso-8859-1", "iso-ir-100", "iso8859-1", "iso88591", - "iso_8859-1", "iso_8859-1:1987", "l1", "latin1", "us-ascii", "windows-1252", - "x-cp1252"); + "csisolatin1", "ibm819", "iso-8859-1", "iso-ir-100", "iso8859-1", + "iso88591", "iso_8859-1", "iso_8859-1:1987", "l1", "latin1", "us-ascii", + "windows-1252", "x-cp1252"); addCharset(charset("windows-1253"), "cp1253", "windows-1253", "x-cp1253"); addCharset(charset("windows-1254"), "cp1254", "csisolatin5", "iso-8859-9", "iso-ir-148", - "iso8859-9", "iso88599", "iso_8859-9", "iso_8859-9:1989", "l5", "latin5", - "windows-1254", "x-cp1254"); + "iso8859-9", "iso88599", "iso_8859-9", "iso_8859-9:1989", "l5", "latin5", + "windows-1254", "x-cp1254"); addCharset(charset("windows-1255"), "cp1255", "windows-1255", "x-cp1255"); addCharset(charset("windows-1256"), "cp1256", 
"windows-1256", "x-cp1256"); addCharset(charset("windows-1257"), "cp1257", "windows-1257", "x-cp1257"); addCharset(charset("windows-1258"), "cp1258", "windows-1258", "x-cp1258"); addCharset(charset("windows-874"), "dos-874", "iso-8859-11", "iso8859-11", "iso885911", - "tis-620", "windows-874"); + "tis-620", "windows-874"); addCharset(charset("x-MacCyrillic"), "x-mac-cyrillic", "x-mac-ukrainian"); addCharset(charset("x-MacRoman"), "csmacintosh", "mac", "macintosh", "x-mac-roman"); - // The "replacement" charset is a dummy charset. It is present to mitigate wrong-charset + // The "replacement" charset is a dummy charset. It is present to mitigate wrong-charset // attacks addCharset(new ReplacementCharset(), "csiso2022kr", "hz-gb-2312", "iso-2022-cn", - "iso-2022-cn-ext", "iso-2022-kr", "replacement"); + "iso-2022-cn-ext", "iso-2022-kr", "replacement"); // The x-user-defined charset is not present in java addCharset(new XUserDefinedCharset(), "x-user-defined"); } /** * @param names jvm charset names - * @return the first of the given charsets that exists in the current JVM, - * or ISO_8859_1 if none exists + * @return the first of the given charsets that exists in the current JVM, or ISO_8859_1 if none + * exists */ private static Charset charset(String... names) { for (String name : names) { @@ -146,7 +143,7 @@ private static Charset charset(String... names) { /** * @param charset name of the charset in the JVM - * @param names standard W3C charset names + * @param names standard W3C charset names */ private static void addCharset(Charset charset, String... 
names) { for (String name : names) { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/charsetdetector/CharsetDetectionResult.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/charsetdetector/CharsetDetectionResult.java index 2bb2246c5f..025d99fbf0 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/charsetdetector/CharsetDetectionResult.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/charsetdetector/CharsetDetectionResult.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.html.charsetdetector; diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/charsetdetector/MetaProcessor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/charsetdetector/MetaProcessor.java index c0a3f3737b..7bf4383676 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/charsetdetector/MetaProcessor.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/charsetdetector/MetaProcessor.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. 
The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.html.charsetdetector; @@ -24,10 +22,9 @@ /** - * A class to process the attributes of an HTML meta tag in order to extract a character set. - * The user should repeatedly call {@link #processAttribute} on each attributes of the tag, - * then update its current detection result with - * {@link #updateDetectedCharset(CharsetDetectionResult)} + * A class to process the attributes of an HTML meta tag in order to extract a character set. The + * user should repeatedly call {@link #processAttribute} on each attributes of the tag, then update + * its current detection result with {@link #updateDetectedCharset(CharsetDetectionResult)} *

      * The algorithm implemented is meant to match the one described by the W3C here: * https://html.spec.whatwg.org/multipage/parsing.html#prescan-a-byte-stream-to-determine-its-encoding diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/charsetdetector/PreScanner.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/charsetdetector/PreScanner.java index c1b69ab2b5..08a351e34c 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/charsetdetector/PreScanner.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/charsetdetector/PreScanner.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.html.charsetdetector; @@ -28,17 +26,17 @@ import java.util.regex.Pattern; /** - * A scanner meant to detect charset meta tags in a byte stream - * See: https://html.spec.whatwg.org/multipage/parsing.html#prescan-a-byte-stream-to-determine-its-encoding + * A scanner meant to detect charset meta tags in a byte stream See: + * https://html.spec.whatwg.org/multipage/parsing.html#prescan-a-byte-stream-to-determine-its-encoding */ class PreScanner { private static final Pattern CHARSET_PATTERN = - Pattern.compile("charset\\s*=\\s*([\"']?)([^\"'\\s;]+)\\1"); + Pattern.compile("charset\\s*=\\s*([\"']?)([^\"'\\s;]+)\\1"); private static final byte[] COMMENT_START = {(byte) '<', (byte) '!', (byte) '-', (byte) '-'}; private static final byte[] COMMENT_END = {(byte) '-', (byte) '-', (byte) '>'}; private static final byte[] META_TAG_START = - {(byte) '<', (byte) 'm', (byte) 'e', (byte) 't', (byte) 'a'}; + {(byte) '<', (byte) 'm', (byte) 'e', (byte) 't', (byte) 'a'}; private static final byte SLASH = (byte) '/'; private static final byte EQUAL = (byte) '='; private static final byte TAG_START = (byte) '<'; @@ -66,13 +64,15 @@ class PreScanner { private static BitSet 
bitSet(int... bs) { BitSet bitSet = new BitSet(0xFF); - for (int b : bs) bitSet.set(b); + for (int b : bs) + bitSet.set(b); return bitSet; } private static BitSet bitSet(BitSet base, int... bs) { BitSet bitSet = (BitSet) base.clone(); - for (int b : bs) bitSet.set(b); + for (int b : bs) + bitSet.set(b); return bitSet; } @@ -106,14 +106,15 @@ Charset detectBOM() { } else if (expect(UTF16_LE_BOM)) { return StandardCharsets.UTF_16LE; } - } catch (IOException e) { /* stream could not be read, also return null */ } + } catch (IOException e) { + /* stream could not be read, also return null */ } return null; } private boolean processAtLeastOneByte() { try { - return processComment() || processMeta() || processTag() || processSpecialTag() || - processAny(); + return processComment() || processMeta() || processTag() || processSpecialTag() + || processAny(); } catch (IOException e) { return false; } @@ -136,7 +137,8 @@ private boolean processTag() throws IOException { stream.mark(1); } while (!contains(SPACE_OR_TAG_END, read())); stream.reset(); - while (getAttribute() != null) {/* ignore the attribute*/} + while (getAttribute() != null) { + /* ignore the attribute */} return true; } } @@ -158,8 +160,8 @@ private boolean processMeta() throws IOException { stream.mark(6); // len(" attribute = getAttribute(); attribute != null; - attribute = getAttribute()) { + for (Map.Entry attribute = + getAttribute(); attribute != null; attribute = getAttribute()) { metaProcessor.processAttribute(attribute); } metaProcessor.updateDetectedCharset(detectedCharset); @@ -172,8 +174,8 @@ private boolean processMeta() throws IOException { /** * Read an attribute from the stream * - * @return the attribute as a Map.Entry, where the key is the attribute's name and - * the value is the attribute's value. If there is no attribute, return null + * @return the attribute as a Map.Entry, where the key is the attribute's name and the value is + * the attribute's value. 
If there is no attribute, return null */ private Map.Entry getAttribute() throws IOException { String name = getAttributeName(); @@ -196,8 +198,8 @@ private String getAttributeName() throws IOException { return null; } StringBuilder name = new StringBuilder(); - while (!(peek() == EQUAL && name.length() > 0) && !(peek() == TAG_END || peek() == SLASH) && - !skipAll(WHITESPACE)) { + while (!(peek() == EQUAL && name.length() > 0) && !(peek() == TAG_END || peek() == SLASH) + && !skipAll(WHITESPACE)) { name.append((char) getLowerCaseChar()); } return name.toString(); @@ -213,8 +215,8 @@ private String getAttributeValue() throws IOException { } } else { stream.reset(); - for (byte b = getLowerCaseChar(); !contains(SPACE_OR_TAG_END, b); - b = getLowerCaseChar()) { + for (byte b = getLowerCaseChar(); !contains(SPACE_OR_TAG_END, b); b = + getLowerCaseChar()) { value.append((char) b); stream.mark(1); } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/charsetdetector/StandardHtmlEncodingDetector.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/charsetdetector/StandardHtmlEncodingDetector.java index 90af81e18b..a45f36ec89 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/charsetdetector/StandardHtmlEncodingDetector.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/charsetdetector/StandardHtmlEncodingDetector.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.html.charsetdetector; @@ -21,33 +19,28 @@ import java.io.IOException; import java.io.InputStream; import java.nio.charset.Charset; - import org.apache.commons.io.input.BoundedInputStream; - import org.apache.tika.config.Field; import org.apache.tika.detect.EncodingDetector; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; /** - * An encoding detector that tries to respect the spirit of the HTML spec - * part 12.2.3 "The input byte stream", or at least the part that is compatible with - * the implementation of tika. 
+ * An encoding detector that tries to respect the spirit of the HTML spec part 12.2.3 "The input + * byte stream", or at least the part that is compatible with the implementation of tika. *

      * https://html.spec.whatwg.org/multipage/parsing.html#the-input-byte-stream *

      - * If a resource was fetched over HTTP, then HTTP headers should be added to tika metadata - * when using {@link #detect}, especially {@link Metadata#CONTENT_TYPE}, as it may contain - * charset information. + * If a resource was fetched over HTTP, then HTTP headers should be added to tika metadata when + * using {@link #detect}, especially {@link Metadata#CONTENT_TYPE}, as it may contain charset + * information. *

      - * This encoding detector may return null if no encoding is detected. - * It is meant to be used inside a {@link org.apache.tika.detect.CompositeEncodingDetector}. - * For instance: + * This encoding detector may return null if no encoding is detected. It is meant to be used inside + * a {@link org.apache.tika.detect.CompositeEncodingDetector}. For instance: + * *

       {@code
      - *     EncodingDetector detector = new CompositeEncodingDetector(
      - *       Arrays.asList(
      - *         new StandardHtmlEncodingDetector(),
      - *         new Icu4jEncodingDetector()));
      + * EncodingDetector detector = new CompositeEncodingDetector(
      + *                 Arrays.asList(new StandardHtmlEncodingDetector(), new Icu4jEncodingDetector()));
        * }
      *

      */ @@ -78,7 +71,8 @@ public Charset detect(InputStream input, Metadata metadata) throws IOException { int limit = getMarkLimit(); input.mark(limit); // Never read more than the first META_TAG_BUFFER_SIZE bytes - InputStream limitedStream = BoundedInputStream.builder().setInputStream(input).setMaxCount(limit).get(); + InputStream limitedStream = + BoundedInputStream.builder().setInputStream(input).setMaxCount(limit).get(); PreScanner preScanner = new PreScanner(limitedStream); // The order of priority for detection is: @@ -102,8 +96,7 @@ public int getMarkLimit() { } /** - * How far into the stream to read for charset detection. - * Default is 8192. + * How far into the stream to read for charset detection. Default is 8192. */ @Field public void setMarkLimit(int markLimit) { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/charsetdetector/charsets/ReplacementCharset.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/charsetdetector/charsets/ReplacementCharset.java index 32b96cf4c5..3695468a7d 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/charsetdetector/charsets/ReplacementCharset.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/charsetdetector/charsets/ReplacementCharset.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.html.charsetdetector.charsets; @@ -24,8 +22,8 @@ import java.nio.charset.CoderResult; /** - * An implementation of the standard "replacement" charset defined by the W3C. - * See: https://encoding.spec.whatwg.org/#replacement + * An implementation of the standard "replacement" charset defined by the W3C. 
See: + * https://encoding.spec.whatwg.org/#replacement */ public class ReplacementCharset extends Charset { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/charsetdetector/charsets/XUserDefinedCharset.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/charsetdetector/charsets/XUserDefinedCharset.java index b550cd19a8..b6fc984c96 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/charsetdetector/charsets/XUserDefinedCharset.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/charsetdetector/charsets/XUserDefinedCharset.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. 
You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.html.charsetdetector.charsets; diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/DataURISchemeParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/DataURISchemeParserTest.java index 77508e1779..41601ac741 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/DataURISchemeParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/DataURISchemeParserTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.parser.html; @@ -24,12 +22,10 @@ import java.io.ByteArrayOutputStream; import java.nio.charset.StandardCharsets; - import org.apache.commons.io.IOUtils; -import org.junit.jupiter.api.Test; - import org.apache.tika.TikaTest; import org.apache.tika.mime.MediaType; +import org.junit.jupiter.api.Test; public class DataURISchemeParserTest extends TikaTest { DataURISchemeUtil dataURISchemeUtil = new DataURISchemeUtil(); @@ -56,7 +52,7 @@ public void testNewlines() throws Exception { @Test public void testBackslashNewlines() throws Exception { - //like you'd have in a css fragment + // like you'd have in a css fragment String data = "data:image/png;base64,R0lG\\\nODdh"; DataURIScheme dataURIScheme = dataURISchemeUtil.parse(data); assertTrue(dataURIScheme.isBase64()); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlEncodingDetectorTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlEncodingDetectorTest.java index 0b48d8c60c..4179ff811b 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlEncodingDetectorTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlEncodingDetectorTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
*/ package org.apache.tika.parser.html; @@ -25,12 +23,10 @@ import java.io.InputStream; import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; - +import org.apache.tika.metadata.Metadata; import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; -import org.apache.tika.metadata.Metadata; - public class HtmlEncodingDetectorTest { @Test @@ -61,16 +57,16 @@ public void withSlash() throws IOException { @Test @Disabled("until we do a full parse") public void insideTag() throws IOException { - assertWindows1252("" + - ""); + assertWindows1252("" + + ""); } @Test @Disabled("until we do a full parse") public void missingAttribute() throws IOException { assertWindows1252("" + // missing http-equiv attribute - "" // valid declaration + "" // valid declaration ); } @@ -80,8 +76,8 @@ public void insideSpecialTag() throws IOException { // Content inside " + // inside special tag - "" // real charset declaration + "" + // inside special tag + "" // real charset declaration ); } @@ -89,14 +85,14 @@ public void insideSpecialTag() throws IOException { @Disabled("until we can prove this harms detection") public void spaceBeforeTag() throws IOException { assertWindows1252("< meta charset='UTF-8'>" + // invalid charset declaration - "" // real charset declaration + "" // real charset declaration ); } @Test public void invalidAttribute() throws IOException { assertWindows1252("" // real charset declaration + "charset='WINDOWS-1252'>" // real charset declaration ); } @@ -104,8 +100,8 @@ public void invalidAttribute() throws IOException { @Disabled("until we can prove this harms detection") public void unmatchedQuote() throws IOException { assertWindows1252("" + - // invalid charset declaration - "" // real charset declaration + // invalid charset declaration + "" // real charset declaration ); } @@ -115,10 +111,10 @@ public void unmatchedQuote() throws IOException { public void withCompactComment() throws IOException { // is a valid comment 
assertWindows1252("" + // end comment - "" + // compact comment - "" // outside comment, charset declaration + "" + // inside comment + "-->" + // end comment + "" + // compact comment + "" // outside comment, charset declaration ); } @@ -127,8 +123,7 @@ private void assertWindows1252(String html) throws IOException { } private void assertCharset(String html, Charset charset) throws IOException { - assertEquals(charset, detectCharset(html), - html + " should be detected as " + charset); + assertEquals(charset, detectCharset(html), html + " should be detected as " + charset); } private Charset detectCharset(String test) throws IOException { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java index 25f9ddeb11..d0a2f65b2d 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java @@ -1,18 +1,16 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. 
The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. */ package org.apache.tika.parser.html; @@ -52,16 +50,7 @@ import javax.xml.transform.sax.SAXTransformerFactory; import javax.xml.transform.sax.TransformerHandler; import javax.xml.transform.stream.StreamResult; - import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream; -import org.junit.jupiter.api.Disabled; -import org.junit.jupiter.api.Test; -import org.xml.sax.Attributes; -import org.xml.sax.ContentHandler; -import org.xml.sax.Locator; -import org.xml.sax.SAXException; -import org.xml.sax.helpers.DefaultHandler; - import org.apache.tika.Tika; import org.apache.tika.TikaTest; import org.apache.tika.config.ServiceLoader; @@ -83,6 +72,13 @@ import org.apache.tika.sax.TeeContentHandler; import org.apache.tika.sax.WriteOutContentHandler; import org.apache.tika.utils.XMLReaderUtils; +import org.junit.jupiter.api.Disabled; +import org.junit.jupiter.api.Test; +import org.xml.sax.Attributes; +import org.xml.sax.ContentHandler; +import org.xml.sax.Locator; +import org.xml.sax.SAXException; +import 
org.xml.sax.helpers.DefaultHandler; public class HtmlParserTest extends TikaTest { @@ -97,7 +93,7 @@ public void testParseAscii() throws Exception { ContentHandler link = new DefaultHandler() { @Override public void startElement(String u, String l, String n, Attributes a) - throws SAXException { + throws SAXException { if ("a".equals(l)) { if (a.getValue("href") != null) { href.append(a.getValue("href")); @@ -107,8 +103,8 @@ public void startElement(String u, String l, String n, Attributes a) } } }; - new JSoupParser() - .parse(stream, new TeeContentHandler(body, link), metadata, new ParseContext()); + new JSoupParser().parse(stream, new TeeContentHandler(body, link), metadata, + new ParseContext()); } assertEquals("Title : Test Indexation Html", metadata.get(TikaCoreProperties.TITLE)); @@ -123,9 +119,9 @@ public void startElement(String u, String l, String n, Attributes a) String content = body.toString(); assertTrue(content.contains("Test Indexation Html"), - "Did not contain expected text:" + "Test Indexation Html"); + "Did not contain expected text:" + "Test Indexation Html"); assertTrue(content.contains("Indexation du fichier"), - "Did not contain expected text:" + "Indexation du fichier"); + "Did not contain expected text:" + "Indexation du fichier"); } @Test @@ -136,10 +132,10 @@ public void XtestParseUTF8() throws IOException, SAXException, TikaException { String content = new Tika().parseToString(getResourceAsStream(path), metadata); assertTrue(content.contains("Title : Tilte with UTF-8 chars öäå"), - "Did not contain expected text:" + "Title : Tilte with UTF-8 chars öäå"); + "Did not contain expected text:" + "Title : Tilte with UTF-8 chars öäå"); assertTrue(content.contains("Content with UTF-8 chars"), - "Did not contain expected text:" + "Content with UTF-8 chars"); + "Did not contain expected text:" + "Content with UTF-8 chars"); assertTrue(content.contains("åäö"), "Did not contain expected text:" + "åäö"); } @@ -150,9 +146,9 @@ public void 
testXhtmlParsing() throws Exception { Metadata metadata = new Metadata(); String content = new Tika().parseToString(getResourceAsStream(path), metadata); - //can't specify charset because default differs between OS's - assertTrue( - metadata.get(Metadata.CONTENT_TYPE).startsWith("application/xhtml+xml; charset=")); + // can't specify charset because default differs between OS's + assertTrue(metadata.get(Metadata.CONTENT_TYPE) + .startsWith("application/xhtml+xml; charset=")); assertEquals("XHTML test document", metadata.get(TikaCoreProperties.TITLE)); assertEquals("Tika Developers", metadata.get(TikaCoreProperties.CREATOR)); @@ -166,7 +162,7 @@ public void testXhtmlParsing() throws Exception { public void testParseEmpty() throws Exception { ContentHandler handler = new BodyContentHandler(); new JSoupParser().parse(new ByteArrayInputStream(new byte[0]), handler, new Metadata(), - new ParseContext()); + new ParseContext()); assertEquals("", handler.toString()); } @@ -196,37 +192,38 @@ public void testBaseHref() throws Exception { assertRelativeLink("http://domain.com/file.html", "http://domain.com/path/", "/file.html"); assertRelativeLink("http://domain.com/path/file.html", "http://domain.com/path/", - "./file.html"); + "./file.html"); assertRelativeLink("http://domain.com/path/file.html", "http://domain.com/path/", - "file.html"); + "file.html"); assertRelativeLink("http://domain2.com/newpath", "http://domain.com/path/to/file", - "http://domain2.com/newpath"); + "http://domain2.com/newpath"); - // See http://www.communities.hp.com/securitysoftware/blogs/jeff/archive/2007/12/19/RFC-1808-vs-2396-vs-3986_3A00_-Browsers-vs.-programing-languages.aspx + // See + // http://www.communities.hp.com/securitysoftware/blogs/jeff/archive/2007/12/19/RFC-1808-vs-2396-vs-3986_3A00_-Browsers-vs.-programing-languages.aspx // Also http://www.ietf.org/rfc/rfc3986.txt // Also http://issues.apache.org/jira/browse/NUTCH-566 // Also http://issues.apache.org/jira/browse/NUTCH-436 
assertRelativeLink("http://domain.com/path/?pid=1", "http://domain.com/path/", "?pid=1"); assertRelativeLink("http://domain.com/file?pid=1", "http://domain.com/file", "?pid=1"); assertRelativeLink("http://domain.com/path/d;p?pid=1", "http://domain.com/path/d;p?q#f", - "?pid=1"); + "?pid=1"); } private void assertRelativeLink(String url, String base, String relative) throws Exception { - String test = - "" + "test"; + String test = "" + "test"; final List links = new ArrayList<>(); - new JSoupParser() - .parse(new ByteArrayInputStream(test.getBytes(UTF_8)), new DefaultHandler() { - @Override - public void startElement(String u, String l, String name, Attributes atts) { - if (name.equals("a") && atts.getValue("", "href") != null) { - links.add(atts.getValue("", "href")); - } - } - }, new Metadata(), new ParseContext()); + new JSoupParser().parse(new ByteArrayInputStream(test.getBytes(UTF_8)), + new DefaultHandler() { + @Override + public void startElement(String u, String l, String name, + Attributes atts) { + if (name.equals("a") && atts.getValue("", "href") != null) { + links.add(atts.getValue("", "href")); + } + } + }, new Metadata(), new ParseContext()); assertEquals(1, links.size()); assertEquals(url, links.get(0)); } @@ -252,12 +249,12 @@ public void testWhitespaceBetweenTableCells() throws Exception { */ @Test public void testHttpEquivCharset() throws Exception { - String test = "" + - "the name is \u00e1ndre" + ""; + String test = "" + + "the name is \u00e1ndre" + ""; Metadata metadata = new Metadata(); new JSoupParser().parse(new ByteArrayInputStream(test.getBytes(ISO_8859_1)), - new BodyContentHandler(), metadata, new ParseContext()); + new BodyContentHandler(), metadata, new ParseContext()); assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING)); } @@ -268,11 +265,11 @@ public void testHttpEquivCharset() throws Exception { */ @Test public void testHtml5Charset() throws Exception { - String test = "" + - "the name is \u00e1ndre" + ""; + String 
test = "" + + "the name is \u00e1ndre" + ""; Metadata metadata = new Metadata(); new JSoupParser().parse(new ByteArrayInputStream(test.getBytes(ISO_8859_1)), - new BodyContentHandler(), metadata, new ParseContext()); + new BodyContentHandler(), metadata, new ParseContext()); assertEquals("ISO-8859-15", metadata.get(Metadata.CONTENT_ENCODING)); } @@ -285,9 +282,8 @@ public void testHtml5Charset() throws Exception { public void testDetectOfCharset() throws Exception { String test = "\u017d"; Metadata metadata = new Metadata(); - new JSoupParser() - .parse(new ByteArrayInputStream(test.getBytes(UTF_8)), new BodyContentHandler(), - metadata, new ParseContext()); + new JSoupParser().parse(new ByteArrayInputStream(test.getBytes(UTF_8)), + new BodyContentHandler(), metadata, new ParseContext()); assertEquals("\u017d", metadata.get(TikaCoreProperties.TITLE)); } @@ -298,27 +294,25 @@ public void testDetectOfCharset() throws Exception { */ @Test public void testUsingCharsetInContentTypeHeader() throws Exception { - final String test = - "the name is \u00e1ndre" + ""; + final String test = "the name is \u00e1ndre" + + ""; Metadata metadata = new Metadata(); - new JSoupParser() - .parse(new ByteArrayInputStream(test.getBytes(UTF_8)), new BodyContentHandler(), - metadata, new ParseContext()); + new JSoupParser().parse(new ByteArrayInputStream(test.getBytes(UTF_8)), + new BodyContentHandler(), metadata, new ParseContext()); assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING)); metadata = new Metadata(); metadata.set(Metadata.CONTENT_TYPE, "text/html; charset=ISO-8859-1"); new JSoupParser().parse(new ByteArrayInputStream(test.getBytes(ISO_8859_1)), - new BodyContentHandler(), metadata, new ParseContext()); + new BodyContentHandler(), metadata, new ParseContext()); assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING)); } /** - * Test case for HTML content like - * ">div<foo>br<bar>/div>" that should result - * in three whitespace-separated tokens "foo", 
"bar" and "baz" instead - * of a single token "foobarbaz". + * Test case for HTML content like ">div<foo>br<bar>/div>" that should result + * in three whitespace-separated tokens "foo", "bar" and "baz" instead of a single token + * "foobarbaz". * * @see TIKA-343 */ @@ -343,9 +337,8 @@ public void testIgnoreCharsetDetectorLanguage() throws Exception { String test = "Simple Content"; Metadata metadata = new Metadata(); metadata.add(Metadata.CONTENT_LANGUAGE, "en"); - new JSoupParser() - .parse(new ByteArrayInputStream(test.getBytes(UTF_8)), new BodyContentHandler(), - metadata, new ParseContext()); + new JSoupParser().parse(new ByteArrayInputStream(test.getBytes(UTF_8)), + new BodyContentHandler(), metadata, new ParseContext()); assertEquals("en", metadata.get(Metadata.CONTENT_LANGUAGE)); } @@ -357,21 +350,21 @@ public void testIgnoreCharsetDetectorLanguage() throws Exception { */ @Test public void testHttpEquivCharsetFunkyAttributes() throws Exception { - String test1 = "" + - "the name is \u00e1ndre" + ""; + String test1 = "" + + "the name is \u00e1ndre" + ""; Metadata metadata = new Metadata(); new JSoupParser().parse(new ByteArrayInputStream(test1.getBytes(ISO_8859_1)), - new BodyContentHandler(), metadata, new ParseContext()); + new BodyContentHandler(), metadata, new ParseContext()); assertEquals("ISO-8859-15", metadata.get(Metadata.CONTENT_ENCODING)); // Some HTML pages have errors like ';;' versus '; ' as separator - String test2 = "" + - "the name is \u00e1ndre" + ""; + String test2 = "" + + "the name is \u00e1ndre" + ""; metadata = new Metadata(); new JSoupParser().parse(new ByteArrayInputStream(test2.getBytes(ISO_8859_1)), - new BodyContentHandler(), metadata, new ParseContext()); + new BodyContentHandler(), metadata, new ParseContext()); assertEquals("ISO-8859-15", metadata.get(Metadata.CONTENT_ENCODING)); } @@ -382,19 +375,18 @@ public void testHttpEquivCharsetFunkyAttributes() throws Exception { */ @Test public void 
testUsingFunkyCharsetInContentTypeHeader() throws Exception { - final String test = - "the name is \u00e1ndre" + ""; + final String test = "the name is \u00e1ndre" + + ""; Metadata metadata = new Metadata(); - new JSoupParser() - .parse(new ByteArrayInputStream(test.getBytes(UTF_8)), new BodyContentHandler(), - metadata, new ParseContext()); + new JSoupParser().parse(new ByteArrayInputStream(test.getBytes(UTF_8)), + new BodyContentHandler(), metadata, new ParseContext()); assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING)); metadata = new Metadata(); metadata.set(Metadata.CONTENT_TYPE, "charset=ISO-8859-1;text/html"); new JSoupParser().parse(new ByteArrayInputStream(test.getBytes(ISO_8859_1)), - new BodyContentHandler(), metadata, new ParseContext()); + new BodyContentHandler(), metadata, new ParseContext()); assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING)); } @@ -409,7 +401,7 @@ public void testMetaHttpEquivWithLotsOfPreambleText() throws Exception { String path = "/test-documents/big-preamble.html"; Metadata metadata = new Metadata(); new JSoupParser().parse(getResourceAsStream(path), new BodyContentHandler(), metadata, - new ParseContext()); + new ParseContext()); assertEquals("windows-1251", metadata.get(Metadata.CONTENT_ENCODING)); } @@ -421,15 +413,14 @@ public void testMetaHttpEquivWithLotsOfPreambleText() throws Exception { */ @Test public void testElementOrdering() throws Exception { - final String test = "Title" + - "" + - "" + - "

      Simple Content

      "; + final String test = "Title" + + "" + + "" + + "

      Simple Content

      "; StringWriter sw = new StringWriter(); - new JSoupParser() - .parse(new ByteArrayInputStream(test.getBytes(UTF_8)), makeHtmlTransformer(sw), - new Metadata(), new ParseContext()); + new JSoupParser().parse(new ByteArrayInputStream(test.getBytes(UTF_8)), + makeHtmlTransformer(sw), new Metadata(), new ParseContext()); String result = sw.toString(); @@ -461,13 +452,12 @@ public void testElementOrdering() throws Exception { @Test public void testImgUrlExtraction() throws Exception { final String test = - "Title" + "" + - ""; + "Title" + "" + + ""; StringWriter sw = new StringWriter(); - new JSoupParser() - .parse(new ByteArrayInputStream(test.getBytes(UTF_8)), makeHtmlTransformer(sw), - new Metadata(), new ParseContext()); + new JSoupParser().parse(new ByteArrayInputStream(test.getBytes(UTF_8)), + makeHtmlTransformer(sw), new Metadata(), new ParseContext()); String result = sw.toString(); @@ -482,19 +472,18 @@ public void testImgUrlExtraction() throws Exception { */ @Test public void testFrameSrcExtraction() throws Exception { - final String test = - "Title" + "" + - ""; + final String test = "Title" + + "" + + ""; StringWriter sw = new StringWriter(); - new JSoupParser() - .parse(new ByteArrayInputStream(test.getBytes(UTF_8)), makeHtmlTransformer(sw), - new Metadata(), new ParseContext()); + new JSoupParser().parse(new ByteArrayInputStream(test.getBytes(UTF_8)), + makeHtmlTransformer(sw), new Metadata(), new ParseContext()); String result = sw.toString(); // tag should exist, with fully resolved URL assertTrue(Pattern.matches("(?s).*.*$", - result)); + result)); } /** @@ -504,21 +493,20 @@ public void testFrameSrcExtraction() throws Exception { */ @Test public void testIFrameSrcExtraction() throws Exception { - final String test = - "Title" + "" + - "