diff --git a/contrib/format-pdf/README.md b/contrib/format-pdf/README.md
new file mode 100644
index 00000000000..6dbc2a569e6
--- /dev/null
+++ b/contrib/format-pdf/README.md
@@ -0,0 +1,75 @@
+# Format Plugin for PDF Table Reader
+One of the most annoying tasks is when you are working on a data science project and you get data that is in a PDF file. This plugin endeavours to enable you to query data in PDF tables using Drill's SQL interface.
+
+## Data Model
+Since PDF files generally are not intended to be queried or read by machines, mapping the data to tables and rows is not a perfect process. The PDF reader does support
+provided schemas. You can read about Drill's [provided schema functionality here](https://drill.apache.org/docs/plugin-configuration-basics/#specifying-the-schema-as-table-function-parameter).
+
+
+### Merging Pages
+The PDF reader reads tables from each page of a PDF file. If your PDF file has tables that span multiple pages, you can set the `combinePages` parameter to `true` and Drill
+will merge all the tables in the PDF file. You can also do this at query time with the `table()` function.
+
+## Configuration
+To configure the PDF reader, simply add the information below to the `formats` section of a file-based storage plugin, such as `dfs`, `hdfs` or `s3`.
+
+```json
+"pdf": {
+ "type": "pdf",
+ "extensions": [
+ "pdf"
+ ],
+ "extractionAlgorithm": "spreadsheet",
+ "extractHeaders": true,
+ "combinePages": false
+}
+```
+The available options are:
+* `extractHeaders`: Extracts the first row of any tables as the header row. If set to `false`, Drill will assign column names of `field_0`, `field_1`, etc. to each column.
+* `combinePages`: Merges multi page tables together.
+* `defaultTableIndex`: Allows you to query different tables within the PDF file. Index starts at `1`.
+* `extractionAlgorithm`: Allows you to choose the extraction algorithm used for extracting data from the PDF file. Choices are `spreadsheet` and `basic`. Depending on your data, one may work better than the other.
+
+## Accessing Document Metadata Fields
+PDF files have a considerable amount of metadata which can be useful for analysis. Drill will extract the following fields from every PDF file. Note that these fields are not projected in star queries and must be selected explicitly. The document's creator populates these fields and some or all may be empty. With the exception of `_page_count` which is an `INT` and the two date fields, all the other fields are `VARCHAR` fields.
+
+ The fields are:
+ * `_page_count`
+ * `_title`
+ * `_author`
+ * `_subject`
+ * `_keywords`
+ * `_creator`
+ * `_producer`
+ * `_creation_date`
+ * `_modification_date`
+ * `_trapped`
+
+ The query below will access a document's metadata:
+
+ ```sql
+SELECT _page_count, _title, _author, _subject,
+_keywords, _creator, _producer, _creation_date,
+_modification_date, _trapped
+FROM dfs.`pdf/20.pdf`
+```
+The query below demonstrates how to define a schema at query time:
+
+```sql
+SELECT * FROM table(cp.`pdf/schools.pdf` (type => 'pdf', combinePages => true,
+schema => 'inline=(`Last Name` VARCHAR, `First Name Address` VARCHAR,
+`field_0` VARCHAR, `City` VARCHAR, `State` VARCHAR, `Zip` VARCHAR,
+`field_1` VARCHAR, `Occupation Employer` VARCHAR,
+`Date` VARCHAR, `field_2` DATE properties {`drill.format` = `M/d/yyyy`},
+`Amount` DOUBLE)'))
+LIMIT 5
+```
+
+### Encrypted Files
+If a PDF file is encrypted, you can supply the password to the file via the `table()` function as shown below. Note that the password will be recorded in any query logs that
+may exist.
+
+```sql
+SELECT *
+FROM table(dfs.`encrypted_pdf.pdf`(type => 'pdf', password=> 'your_password'))
+```
diff --git a/contrib/format-pdf/pom.xml b/contrib/format-pdf/pom.xml
new file mode 100644
index 00000000000..3a72156716e
--- /dev/null
+++ b/contrib/format-pdf/pom.xml
@@ -0,0 +1,105 @@
+
+
+
+ 4.0.0
+
+
+ drill-contrib-parent
+ org.apache.drill.contrib
+ 1.20.0-SNAPSHOT
+
+
+ drill-format-pdf
+ Drill : Contrib : Format : PDF
+
+
+
+ org.apache.drill.exec
+ drill-java-exec
+ ${project.version}
+
+
+ technology.tabula
+ tabula
+ 1.0.5
+
+
+ slf4j-simple
+ org.slf4j
+
+
+
+
+ org.apache.pdfbox
+ pdfbox
+ 2.0.25
+
+
+ commons-logging
+ commons-logging
+
+
+
+
+
+ org.apache.drill.exec
+ drill-java-exec
+ tests
+ ${project.version}
+ test
+
+
+ org.apache.drill
+ drill-common
+ tests
+ ${project.version}
+ test
+
+
+
+
+
+ maven-resources-plugin
+
+
+ copy-java-sources
+ process-sources
+
+ copy-resources
+
+
+ ${basedir}/target/classes/org/apache/drill/exec/store/pdf
+
+
+
+ src/main/java/org/apache/drill/exec/store/pdf
+ true
+
+
+
+
+
+
+
+
+
diff --git a/contrib/format-pdf/src/main/java/org/apache/drill/exec/store/pdf/PdfBatchReader.java b/contrib/format-pdf/src/main/java/org/apache/drill/exec/store/pdf/PdfBatchReader.java
new file mode 100644
index 00000000000..48a6bdd5499
--- /dev/null
+++ b/contrib/format-pdf/src/main/java/org/apache/drill/exec/store/pdf/PdfBatchReader.java
@@ -0,0 +1,513 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.drill.exec.store.pdf;
+
+import org.apache.drill.exec.record.MaterializedField;
+import org.apache.drill.shaded.guava.com.google.common.base.Strings;
+import org.apache.drill.common.AutoCloseables;
+import org.apache.drill.common.exceptions.CustomErrorContext;
+import org.apache.drill.common.exceptions.UserException;
+import org.apache.drill.common.types.TypeProtos.MinorType;
+import org.apache.drill.exec.physical.impl.scan.file.FileScanFramework;
+import org.apache.drill.exec.physical.impl.scan.framework.ManagedReader;
+import org.apache.drill.exec.physical.resultSet.ResultSetLoader;
+import org.apache.drill.exec.physical.resultSet.RowSetLoader;
+import org.apache.drill.exec.record.metadata.ColumnMetadata;
+import org.apache.drill.exec.record.metadata.SchemaBuilder;
+import org.apache.drill.exec.record.metadata.TupleMetadata;
+import org.apache.drill.exec.vector.accessor.ScalarWriter;
+import org.apache.hadoop.mapred.FileSplit;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import technology.tabula.RectangularTextContainer;
+import technology.tabula.Table;
+
+import java.io.InputStream;
+import java.text.ParseException;
+import java.text.SimpleDateFormat;
+import java.time.Instant;
+import java.time.LocalDate;
+import java.time.LocalTime;
+import java.time.format.DateTimeFormatter;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Date;
+import java.util.GregorianCalendar;
+import java.util.List;
+
+public class PdfBatchReader implements ManagedReader {
+
+ private static final Logger logger = LoggerFactory.getLogger(PdfBatchReader.class);
+ private static final String NEW_FIELD_PREFIX = "field_";
+ private final int maxRecords;
+
+ private final List writers;
+ private final PdfReaderConfig config;
+ private final int startingTableIndex;
+ private PdfMetadataReader metadataReader;
+ private FileSplit split;
+ private CustomErrorContext errorContext;
+ private RowSetLoader rowWriter;
+ private PDDocument document;
+
+ private SchemaBuilder builder;
+ private List columnHeaders;
+ private Table currentTable;
+ private int currentTableIndex;
+ private List firstRow;
+ private PdfRowIterator rowIterator;
+ private FileScanFramework.FileSchemaNegotiator negotiator;
+ private int unregisteredColumnCount;
+
+ // Tables
+ private List tables;
+
+ static class PdfReaderConfig {
+ final PdfFormatPlugin plugin;
+ PdfReaderConfig(PdfFormatPlugin plugin) {
+ this.plugin = plugin;
+ }
+ }
+
+ public PdfBatchReader(PdfReaderConfig readerConfig, int maxRecords) {
+ this.maxRecords = maxRecords;
+ this.unregisteredColumnCount = 0;
+ this.writers = new ArrayList<>();
+ this.config = readerConfig;
+ this.startingTableIndex = readerConfig.plugin.getConfig().defaultTableIndex() < 0 ? 0 : readerConfig.plugin.getConfig().defaultTableIndex();
+ this.currentTableIndex = this.startingTableIndex;
+ this.columnHeaders = new ArrayList<>();
+ }
+
+ @Override
+ public boolean open(FileScanFramework.FileSchemaNegotiator negotiator) {
+ this.negotiator = negotiator;
+
+ split = negotiator.split();
+ errorContext = negotiator.parentErrorContext();
+ builder = new SchemaBuilder();
+
+ openFile();
+ metadataReader = new PdfMetadataReader(document);
+
+ // Get the tables if the user set the combine pages to true
+ if (config.plugin.getConfig().combinePages() ) {
+ tables = PdfUtils.extractTablesFromPDF(document, config.plugin.getConfig().getAlgorithm());
+ currentTable = tables.get(0);
+ } else {
+ currentTable = PdfUtils.getSpecificTable(document, startingTableIndex, config.plugin.getConfig().getAlgorithm());
+ tables = Collections.singletonList(currentTable);
+
+ // If the user specifies a table index, and that table does not exist, throw an exception.
+ if (currentTable == null && startingTableIndex != 0) {
+ throw UserException.dataReadError()
+ .message("The specified table index " + startingTableIndex + " does not exist in this file. ")
+ .addContext(errorContext)
+ .build(logger);
+ }
+ }
+
+ // Get the row iterator and grab the first row to build the schema
+ rowIterator = new PdfRowIterator(currentTable);
+ if (rowIterator.hasNext()) {
+ firstRow = PdfUtils.convertRowToStringArray(rowIterator.next());
+ }
+
+ // Support provided schema
+ TupleMetadata schema = null;
+ if (negotiator.hasProvidedSchema()) {
+ schema = negotiator.providedSchema();
+ negotiator.tableSchema(schema, false);
+ } else {
+ negotiator.tableSchema(buildSchema(), false);
+ }
+
+ ResultSetLoader loader = negotiator.build();
+ rowWriter = loader.writer();
+ metadataReader.setRowWriter(rowWriter);
+ // Build the schema
+ if (negotiator.hasProvidedSchema()) {
+ buildWriterListFromProvidedSchema(schema);
+ } else {
+ buildWriterList();
+ }
+ metadataReader.addImplicitColumnsToSchema();
+ return true;
+ }
+
+ @Override
+ public boolean next() {
+
+ while(!rowWriter.isFull()) {
+ if (rowWriter.limitReached(maxRecords)) {
+ // Stop reading if the limit has been reached
+ return false;
+ } else if (config.plugin.getConfig().combinePages() &&
+ (!rowIterator.hasNext()) &&
+ currentTableIndex < (tables.size() - 1)) {
+ // Case for end of current page but more tables exist and combinePages is set to true.
+ // Get the next table
+ currentTableIndex++;
+ currentTable = tables.get(currentTableIndex);
+
+ // Update the row iterator
+ rowIterator = new PdfRowIterator(currentTable);
+ // Skip the first row in the new table because it most likely contains headers.
+ if (config.plugin.getConfig().extractHeaders()) {
+ rowIterator.next();
+ }
+ } else if (! rowIterator.hasNext()) {
+ // Special case for document with no tables
+ if (currentTable == null) {
+ rowWriter.start();
+ metadataReader.writeMetadata();
+ rowWriter.save();
+ }
+ return false;
+ }
+
+ // Process the row
+ processRow(rowIterator.next());
+ }
+ return true;
+ }
+
+ private void processRow(List row) {
+ if (row == null || row.size() == 0) {
+ rowWriter.start();
+ metadataReader.writeMetadata();
+ rowWriter.save();
+ return;
+ }
+
+ String value;
+ rowWriter.start();
+ int rowPosition = 0;
+ for (RectangularTextContainer cellValue : row) {
+ value = cellValue.getText();
+
+ if (!Strings.isNullOrEmpty(value)) {
+ writers.get(rowPosition).load(row.get(rowPosition));
+ }
+ rowPosition++;
+ }
+
+ metadataReader.writeMetadata();
+ rowWriter.save();
+ }
+
+ @Override
+ public void close() {
+ if (document != null) {
+ AutoCloseables.closeSilently(document.getDocument());
+ AutoCloseables.closeSilently(document);
+ document = null;
+ }
+ }
+
+ /**
+ * This method opens the PDF file and finds the tables
+ */
+ private void openFile() {
+ try {
+ InputStream fsStream = negotiator.fileSystem().openPossiblyCompressedStream(split.getPath());
+ if (Strings.isNullOrEmpty(config.plugin.getConfig().password())) {
+ document = PDDocument.load(fsStream);
+ } else {
+ // Case for encrypted files
+ document = PDDocument.load(fsStream, config.plugin.getConfig().password());
+ }
+
+ AutoCloseables.closeSilently(fsStream);
+ } catch (Exception e) {
+ throw UserException
+ .dataReadError(e)
+ .addContext("Failed to open open input file: %s", split.getPath().toString())
+ .addContext(errorContext)
+ .build(logger);
+ }
+ }
+
+ private TupleMetadata buildSchema() {
+ // Get column header names
+ columnHeaders = firstRow;
+
+ // Case for file with no tables
+ if (columnHeaders == null) {
+ return builder.buildSchema();
+ }
+
+ // Add columns to table
+ int index = 0;
+ for (String columnName : firstRow) {
+ if (Strings.isNullOrEmpty(columnName) || !config.plugin.getConfig().extractHeaders()) {
+ columnName = NEW_FIELD_PREFIX + unregisteredColumnCount;
+ columnHeaders.set(index, columnName);
+ unregisteredColumnCount++;
+ }
+ builder.addNullable(columnName, MinorType.VARCHAR);
+ index++;
+ }
+
+ return builder.buildSchema();
+ }
+
+ private void buildWriterList() {
+ // Case for file with no tables.
+ if (columnHeaders == null) {
+ return;
+ }
+
+ for (String header : columnHeaders) {
+ writers.add(new StringPdfColumnWriter(columnHeaders.indexOf(header), header, rowWriter));
+ }
+ }
+
+ private void buildWriterListFromProvidedSchema(TupleMetadata schema) {
+ if (schema == null) {
+ buildWriterList();
+ return;
+ }
+ int counter = 0;
+ for (MaterializedField field: schema.toFieldList()) {
+ String fieldName = field.getName();
+ MinorType type = field.getType().getMinorType();
+ columnHeaders.add(fieldName);
+
+ switch (type) {
+ case VARCHAR:
+ writers.add(new StringPdfColumnWriter(counter, fieldName, rowWriter));
+ break;
+ case SMALLINT:
+ case TINYINT:
+ case INT:
+ writers.add(new IntPdfColumnWriter(counter, fieldName, rowWriter));
+ break;
+ case BIGINT:
+ writers.add(new BigIntPdfColumnWriter(counter, fieldName, rowWriter));
+ break;
+ case FLOAT4:
+ case FLOAT8:
+ writers.add(new DoublePdfColumnWriter(counter, fieldName, rowWriter));
+ break;
+ case DATE:
+ writers.add(new DatePdfColumnWriter(counter, fieldName, rowWriter, negotiator));
+ break;
+ case TIME:
+ writers.add(new TimePdfColumnWriter(counter, fieldName, rowWriter, negotiator));
+ break;
+ case TIMESTAMP:
+ writers.add(new TimestampPdfColumnWriter(counter, fieldName, rowWriter, negotiator));
+ break;
+ default:
+ throw UserException.unsupportedError()
+ .message("PDF Reader with provided schema does not support " + type.name() + " data type.")
+ .addContext(errorContext)
+ .build(logger);
+ }
+ }
+ }
+
+ public abstract static class PdfColumnWriter {
+ final String columnName;
+ final ScalarWriter writer;
+ final int columnIndex;
+
+ public PdfColumnWriter(int columnIndex, String columnName, ScalarWriter writer) {
+ this.columnIndex = columnIndex;
+ this.columnName = columnName;
+ this.writer = writer;
+ }
+
+ public abstract void load (RectangularTextContainer> cell);
+
+ public abstract void loadFromValue(Object value);
+ }
+
+ public static class IntPdfColumnWriter extends PdfColumnWriter {
+ IntPdfColumnWriter (int columnIndex, String columnName, RowSetLoader rowWriter) {
+ super(columnIndex, columnName, rowWriter.scalar(columnName));
+ }
+
+ @Override
+ public void load(RectangularTextContainer> cell) {
+ writer.setInt(Integer.parseInt(cell.getText()));
+ }
+
+ @Override
+ public void loadFromValue(Object value) {
+ writer.setInt((Integer) value);
+ }
+ }
+
+ public static class BigIntPdfColumnWriter extends PdfColumnWriter {
+ BigIntPdfColumnWriter (int columnIndex, String columnName, RowSetLoader rowWriter) {
+ super(columnIndex, columnName, rowWriter.scalar(columnName));
+ }
+
+ @Override
+ public void load(RectangularTextContainer> cell) {
+ writer.setLong(Long.parseLong(cell.getText()));
+ }
+
+ @Override
+ public void loadFromValue(Object value) {
+ writer.setLong((Long) value);
+ }
+ }
+
+ public static class DoublePdfColumnWriter extends PdfColumnWriter {
+ DoublePdfColumnWriter (int columnIndex, String columnName, RowSetLoader rowWriter) {
+ super(columnIndex, columnName, rowWriter.scalar(columnName));
+ }
+
+ @Override
+ public void load(RectangularTextContainer> cell) {
+ writer.setDouble(Double.parseDouble(cell.getText()));
+ }
+
+ @Override
+ public void loadFromValue(Object value) {
+ writer.setDouble((Double) value);
+ }
+ }
+
+ public static class StringPdfColumnWriter extends PdfColumnWriter {
+ StringPdfColumnWriter (int columnIndex, String columnName, RowSetLoader rowWriter) {
+ super(columnIndex, columnName, rowWriter.scalar(columnName));
+ }
+
+ @Override
+ public void load(RectangularTextContainer> cell) {
+ writer.setString(cell.getText());
+ }
+
+ @Override
+ public void loadFromValue(Object value) {
+ if (! Strings.isNullOrEmpty((String) value)) {
+ writer.setString((String) value);
+ }
+ }
+ }
+
+ public static class DatePdfColumnWriter extends PdfColumnWriter {
+ private String dateFormat;
+
+ DatePdfColumnWriter (int columnIndex, String columnName, RowSetLoader rowWriter, FileScanFramework.FileSchemaNegotiator negotiator) {
+ super(columnIndex, columnName, rowWriter.scalar(columnName));
+
+ ColumnMetadata metadata = negotiator.providedSchema().metadata(columnName);
+ if (metadata != null) {
+ this.dateFormat = metadata.property("drill.format");
+ }
+ }
+
+ @Override
+ public void load(RectangularTextContainer> cell) {
+ LocalDate localDate;
+ if (Strings.isNullOrEmpty(this.dateFormat)) {
+ localDate = LocalDate.parse(cell.getText());
+ } else {
+ localDate = LocalDate.parse(cell.getText(), DateTimeFormatter.ofPattern(dateFormat));
+ }
+ writer.setDate(localDate);
+ }
+
+ @Override
+ public void loadFromValue(Object value) {
+ if (value != null) {
+ writer.setDate(LocalDate.parse((String) value));
+ }
+ }
+ }
+
+ public static class TimePdfColumnWriter extends PdfColumnWriter {
+ private String dateFormat;
+
+ TimePdfColumnWriter (int columnIndex, String columnName, RowSetLoader rowWriter, FileScanFramework.FileSchemaNegotiator negotiator) {
+ super(columnIndex, columnName, rowWriter.scalar(columnName));
+
+ ColumnMetadata metadata = negotiator.providedSchema().metadata(columnName);
+ if (metadata != null) {
+ this.dateFormat = metadata.property("drill.format");
+ }
+ }
+
+ @Override
+ public void load(RectangularTextContainer> cell) {
+ LocalTime localTime;
+ if (Strings.isNullOrEmpty(this.dateFormat)) {
+ localTime = LocalTime.parse(cell.getText());
+ } else {
+ localTime = LocalTime.parse(cell.getText(), DateTimeFormatter.ofPattern(dateFormat));
+ }
+ writer.setTime(localTime);
+ }
+
+ @Override
+ public void loadFromValue(Object value) {
+ if (value != null) {
+ writer.setTime(LocalTime.parse((String) value));
+ }
+ }
+ }
+
+ public static class TimestampPdfColumnWriter extends PdfColumnWriter {
+ private String dateFormat;
+
+ TimestampPdfColumnWriter(int columnIndex, String columnName, RowSetLoader rowWriter) {
+ super(columnIndex, columnName, rowWriter.scalar(columnName));
+ }
+
+ TimestampPdfColumnWriter (int columnIndex, String columnName, RowSetLoader rowWriter, FileScanFramework.FileSchemaNegotiator negotiator) {
+ super(columnIndex, columnName, rowWriter.scalar(columnName));
+
+ ColumnMetadata metadata = negotiator.providedSchema().metadata(columnName);
+ if (metadata != null) {
+ this.dateFormat = metadata.property("drill.format");
+ }
+ }
+
+ @Override
+ public void load(RectangularTextContainer> cell) {
+ Instant timestamp = null;
+ if (Strings.isNullOrEmpty(this.dateFormat)) {
+ timestamp = Instant.parse(cell.getText());
+ } else {
+ try {
+ SimpleDateFormat simpleDateFormat = new SimpleDateFormat(dateFormat);
+ Date parsedDate = simpleDateFormat.parse(cell.getText());
+ timestamp = Instant.ofEpochMilli(parsedDate.getTime());
+ } catch (ParseException e) {
+ logger.error("Error parsing timestamp: " + e.getMessage());
+ }
+ }
+ writer.setTimestamp(timestamp);
+ }
+
+ @Override
+ public void loadFromValue(Object value) {
+ if (value != null) {
+ GregorianCalendar calendar = (GregorianCalendar) value;
+ writer.setTimestamp(calendar.getTime().toInstant());
+ }
+ }
+ }
+}
diff --git a/contrib/format-pdf/src/main/java/org/apache/drill/exec/store/pdf/PdfFormatConfig.java b/contrib/format-pdf/src/main/java/org/apache/drill/exec/store/pdf/PdfFormatConfig.java
new file mode 100644
index 00000000000..d590a050513
--- /dev/null
+++ b/contrib/format-pdf/src/main/java/org/apache/drill/exec/store/pdf/PdfFormatConfig.java
@@ -0,0 +1,110 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.drill.exec.store.pdf;
+
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonInclude;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonTypeName;
+
+import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
+import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
+import lombok.Builder;
+import lombok.EqualsAndHashCode;
+import lombok.Getter;
+import lombok.Setter;
+import lombok.ToString;
+import lombok.experimental.Accessors;
+import lombok.extern.slf4j.Slf4j;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.drill.common.exceptions.UserException;
+import org.apache.drill.common.logical.FormatPluginConfig;
+import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList;
+import technology.tabula.extractors.BasicExtractionAlgorithm;
+import technology.tabula.extractors.ExtractionAlgorithm;
+import technology.tabula.extractors.SpreadsheetExtractionAlgorithm;
+
+import java.util.Collections;
+import java.util.List;
+
+
+@Slf4j
+@Builder
+@Getter
+@Setter
+@Accessors(fluent = true)
+@EqualsAndHashCode
+@ToString
+@JsonInclude(JsonInclude.Include.NON_DEFAULT)
+@JsonDeserialize(builder = PdfFormatConfig.PdfFormatConfigBuilder.class)
+@JsonTypeName(PdfFormatPlugin.DEFAULT_NAME)
+public class PdfFormatConfig implements FormatPluginConfig {
+
+ @JsonProperty
+ private final List extensions;
+
+ @JsonProperty
+ private final boolean combinePages;
+
+ @JsonProperty
+ private final boolean extractHeaders;
+
+ @JsonProperty
+ private final String extractionAlgorithm;
+
+ @JsonProperty
+ private final String password;
+
+ @JsonProperty
+ private final int defaultTableIndex;
+
+ private PdfFormatConfig(PdfFormatConfig.PdfFormatConfigBuilder builder) {
+ this.extensions = builder.extensions == null ? Collections.singletonList("pdf") : ImmutableList.copyOf(builder.extensions);
+ this.combinePages = builder.combinePages;
+ this.extractHeaders = builder.extractHeaders;
+ this.defaultTableIndex = builder.defaultTableIndex;
+ this.extractionAlgorithm = builder.extractionAlgorithm;
+ this.password = builder.password;
+ }
+
+ @JsonIgnore
+ public PdfBatchReader.PdfReaderConfig getReaderConfig(PdfFormatPlugin plugin) {
+ return new PdfBatchReader.PdfReaderConfig(plugin);
+ }
+
+ @JsonIgnore
+ public ExtractionAlgorithm getAlgorithm() {
+ if (StringUtils.isEmpty(this.extractionAlgorithm) || this.extractionAlgorithm.equalsIgnoreCase("basic")) {
+ return new BasicExtractionAlgorithm();
+ } else if (this.extractionAlgorithm.equalsIgnoreCase("spreadsheet")) {
+ return new SpreadsheetExtractionAlgorithm();
+ } else {
+ throw UserException.validationError()
+ .message(extractionAlgorithm + " is not a valid extraction algorithm. The available choices are basic or spreadsheet.")
+ .build(logger);
+ }
+ }
+
+ @JsonPOJOBuilder(withPrefix = "")
+ public static class PdfFormatConfigBuilder {
+ public PdfFormatConfig build() {
+ return new PdfFormatConfig(this);
+ }
+ }
+}
diff --git a/contrib/format-pdf/src/main/java/org/apache/drill/exec/store/pdf/PdfFormatPlugin.java b/contrib/format-pdf/src/main/java/org/apache/drill/exec/store/pdf/PdfFormatPlugin.java
new file mode 100644
index 00000000000..01ceece2a9e
--- /dev/null
+++ b/contrib/format-pdf/src/main/java/org/apache/drill/exec/store/pdf/PdfFormatPlugin.java
@@ -0,0 +1,91 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.drill.exec.store.pdf;
+
+import org.apache.drill.common.logical.StoragePluginConfig;
+import org.apache.drill.common.types.TypeProtos;
+import org.apache.drill.common.types.Types;
+import org.apache.drill.exec.physical.impl.scan.file.FileScanFramework.FileReaderFactory;
+import org.apache.drill.exec.physical.impl.scan.file.FileScanFramework.FileScanBuilder;
+import org.apache.drill.exec.physical.impl.scan.file.FileScanFramework.FileSchemaNegotiator;
+import org.apache.drill.exec.physical.impl.scan.framework.ManagedReader;
+import org.apache.drill.exec.server.DrillbitContext;
+import org.apache.drill.exec.server.options.OptionManager;
+import org.apache.drill.exec.store.dfs.easy.EasyFormatPlugin;
+import org.apache.drill.exec.store.dfs.easy.EasySubScan;
+import org.apache.hadoop.conf.Configuration;
+
+
+public class PdfFormatPlugin extends EasyFormatPlugin {
+
+ protected static final String DEFAULT_NAME = "pdf";
+
+ private static class PdfReaderFactory extends FileReaderFactory {
+ private final PdfBatchReader.PdfReaderConfig readerConfig;
+ private final int maxRecords;
+
+ public PdfReaderFactory(PdfBatchReader.PdfReaderConfig config, int maxRecords) {
+ readerConfig = config;
+ this.maxRecords = maxRecords;
+ }
+
+ @Override
+ public ManagedReader extends FileSchemaNegotiator> newReader() {
+ return new PdfBatchReader(readerConfig, maxRecords);
+ }
+ }
+
+ public PdfFormatPlugin(String name, DrillbitContext context,
+ Configuration fsConf, StoragePluginConfig storageConfig,
+ PdfFormatConfig formatConfig) {
+ super(name, easyConfig(fsConf, formatConfig), context, storageConfig, formatConfig);
+ }
+
+ private static EasyFormatPlugin.EasyFormatConfig easyConfig(Configuration fsConf, PdfFormatConfig pluginConfig) {
+ return EasyFormatConfig.builder()
+ .readable(true)
+ .writable(false)
+ .blockSplittable(false)
+ .compressible(true)
+ .supportsProjectPushdown(true)
+ .extensions(pluginConfig.extensions())
+ .fsConf(fsConf)
+ .defaultName(DEFAULT_NAME)
+ .useEnhancedScan(true)
+ .supportsLimitPushdown(true)
+ .build();
+ }
+
+ @Override
+ public ManagedReader extends FileSchemaNegotiator> newBatchReader(
+ EasySubScan scan, OptionManager options) {
+ return new PdfBatchReader(formatConfig.getReaderConfig(this), scan.getMaxRecords());
+ }
+
+ @Override
+ protected FileScanBuilder frameworkBuilder(OptionManager options, EasySubScan scan) {
+ FileScanBuilder builder = new FileScanBuilder();
+ PdfBatchReader.PdfReaderConfig readerConfig = new PdfBatchReader.PdfReaderConfig(this);
+ builder.setReaderFactory(new PdfReaderFactory(readerConfig, scan.getMaxRecords()));
+
+ initScanBuilder(builder, scan);
+ builder.nullType(Types.optional(TypeProtos.MinorType.VARCHAR));
+ return builder;
+ }
+}
diff --git a/contrib/format-pdf/src/main/java/org/apache/drill/exec/store/pdf/PdfMetadataReader.java b/contrib/format-pdf/src/main/java/org/apache/drill/exec/store/pdf/PdfMetadataReader.java
new file mode 100644
index 00000000000..297fac93071
--- /dev/null
+++ b/contrib/format-pdf/src/main/java/org/apache/drill/exec/store/pdf/PdfMetadataReader.java
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.drill.exec.store.pdf;
+
+import org.apache.drill.common.types.TypeProtos.MinorType;
+import org.apache.drill.common.types.TypeProtos.DataMode;
+import org.apache.drill.exec.physical.resultSet.RowSetLoader;
+import org.apache.drill.exec.record.metadata.ColumnMetadata;
+import org.apache.drill.exec.record.metadata.MetadataUtils;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.PDDocumentInformation;
+
+import java.util.ArrayList;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+
+public class PdfMetadataReader {
+
+ private final Map metadata;
+ private final List writers;
+ private RowSetLoader rowWriter;
+
+
+ public PdfMetadataReader(PDDocument document) {
+ this.writers = new ArrayList<>();
+ // We are using a LinkedHashMap to preserve the order
+ this.metadata = new LinkedHashMap<>();
+ PDDocumentInformation info = document.getDocumentInformation();
+ metadata.put("pageCount", document.getNumberOfPages());
+ metadata.put("title",info.getTitle());
+ metadata.put("author", info.getAuthor());
+ metadata.put("subject", info.getSubject());
+ metadata.put("keywords", info.getKeywords());
+ metadata.put("creator", info.getCreator());
+ metadata.put("producer", info.getProducer());
+ metadata.put("creationDate", info.getCreationDate());
+ metadata.put("modificationDate", info.getModificationDate());
+ metadata.put("trapped", info.getTrapped());
+ }
+
+ public void setRowWriter(RowSetLoader rowWriter) {
+ this.rowWriter = rowWriter;
+ }
+
+ public void addImplicitColumnsToSchema() {
+ // Add to schema
+ addMetadataColumnToSchema("_page_count", MinorType.INT);
+ addMetadataColumnToSchema("_title", MinorType.VARCHAR);
+ addMetadataColumnToSchema("_author", MinorType.VARCHAR);
+ addMetadataColumnToSchema("_subject", MinorType.VARCHAR);
+ addMetadataColumnToSchema("_keywords", MinorType.VARCHAR);
+ addMetadataColumnToSchema("_creator", MinorType.VARCHAR);
+ addMetadataColumnToSchema("_producer", MinorType.VARCHAR);
+ addMetadataColumnToSchema("_creation_date", MinorType.TIMESTAMP);
+ addMetadataColumnToSchema("_modification_date", MinorType.TIMESTAMP);
+ addMetadataColumnToSchema("_trapped", MinorType.VARCHAR);
+ }
+
+ public void writeMetadata() {
+ int counter = 0;
+ for (Object value : metadata.values()) {
+ writers.get(counter).loadFromValue(value);
+ counter++;
+ }
+ }
+
+ private void addMetadataColumnToSchema(String columnName, MinorType dataType) {
+ int index = rowWriter.tupleSchema().index(columnName);
+ if (index == -1) {
+ ColumnMetadata colSchema = MetadataUtils.newScalar(columnName, dataType, DataMode.OPTIONAL);
+
+ // Exclude from wildcard queries
+ colSchema.setBooleanProperty(ColumnMetadata.EXCLUDE_FROM_WILDCARD, true);
+ index = rowWriter.addColumn(colSchema);
+ }
+ if (dataType == MinorType.VARCHAR) {
+ writers.add(new PdfBatchReader.StringPdfColumnWriter(index, columnName, rowWriter));
+ } else if (dataType == MinorType.TIMESTAMP) {
+ writers.add(new PdfBatchReader.TimestampPdfColumnWriter(index, columnName, rowWriter));
+ } else if (dataType == MinorType.INT) {
+ writers.add(new PdfBatchReader.IntPdfColumnWriter(index, columnName, rowWriter));
+ }
+ }
+}
diff --git a/contrib/format-pdf/src/main/java/org/apache/drill/exec/store/pdf/PdfRowIterator.java b/contrib/format-pdf/src/main/java/org/apache/drill/exec/store/pdf/PdfRowIterator.java
new file mode 100644
index 00000000000..4e90d6beeed
--- /dev/null
+++ b/contrib/format-pdf/src/main/java/org/apache/drill/exec/store/pdf/PdfRowIterator.java
@@ -0,0 +1,50 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.drill.exec.store.pdf;
+
+import technology.tabula.RectangularTextContainer;
+import technology.tabula.Table;
+
+import java.util.Iterator;
+import java.util.List;
+
+/**
+ * Iterator over the rows of a Tabula {@link Table}. Each call to {@link #next()}
+ * returns one row as a list of cell containers. A null table (e.g. a PDF with
+ * no detected tables) yields an empty iteration.
+ */
+public class PdfRowIterator implements Iterator<List<RectangularTextContainer>> {
+  // Source table; may be null when the document contains no tables.
+  private final Table table;
+  // Index of the next row to return.
+  private int rowCounter;
+
+  public PdfRowIterator(Table table) {
+    this.table = table;
+    this.rowCounter = 0;
+  }
+
+  @Override
+  public boolean hasNext() {
+    // A null table has no rows to iterate.
+    if (table == null) {
+      return false;
+    }
+    return rowCounter < table.getRowCount();
+  }
+
+  @Override
+  public List<RectangularTextContainer> next() {
+    List<RectangularTextContainer> nextRow = PdfUtils.getRow(table, rowCounter);
+    rowCounter++;
+    return nextRow;
+  }
+}
diff --git a/contrib/format-pdf/src/main/java/org/apache/drill/exec/store/pdf/PdfUtils.java b/contrib/format-pdf/src/main/java/org/apache/drill/exec/store/pdf/PdfUtils.java
new file mode 100644
index 00000000000..ec72b86ed4f
--- /dev/null
+++ b/contrib/format-pdf/src/main/java/org/apache/drill/exec/store/pdf/PdfUtils.java
@@ -0,0 +1,211 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.drill.exec.store.pdf;
+
+import org.apache.drill.common.exceptions.UserException;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import technology.tabula.ObjectExtractor;
+import technology.tabula.Page;
+import technology.tabula.PageIterator;
+import technology.tabula.Rectangle;
+import technology.tabula.RectangularTextContainer;
+import technology.tabula.Table;
+import technology.tabula.detectors.NurminenDetectionAlgorithm;
+import technology.tabula.extractors.BasicExtractionAlgorithm;
+import technology.tabula.extractors.ExtractionAlgorithm;
+import technology.tabula.extractors.SpreadsheetExtractionAlgorithm;
+
+import java.util.ArrayList;
+import java.util.List;
+
+public class PdfUtils {
+
+ public static final ExtractionAlgorithm DEFAULT_ALGORITHM = new BasicExtractionAlgorithm();
+ private static final Logger logger = LoggerFactory.getLogger(PdfUtils.class);
+
+ /**
+ * Returns a list of tables found in a given PDF document. There are several extraction algorithms
+ * available and this function uses the default Basic Extraction Algorithm.
+ * @param document The input PDF document to search for tables
+ * @return A list of tables found in the document.
+ */
+ public static List extractTablesFromPDF(PDDocument document) {
+ return extractTablesFromPDF(document, DEFAULT_ALGORITHM);
+ }
+
+ /**
+ * Returns a list of tables found in a given PDF document. There are several extraction algorithms
+ * available and this function allows the user to select which to use.
+ * @param document The input PDF document to search for tables
+ * @param algorithm The extraction algorithm
+ * @return A list of tables found in the document.
+ */
+ public static List extractTablesFromPDF(PDDocument document, ExtractionAlgorithm algorithm) {
+ NurminenDetectionAlgorithm detectionAlgorithm = new NurminenDetectionAlgorithm();
+
+ ExtractionAlgorithm algExtractor;
+
+ SpreadsheetExtractionAlgorithm extractor = new SpreadsheetExtractionAlgorithm();
+
+ ObjectExtractor objectExtractor = new ObjectExtractor(document);
+ PageIterator pages = objectExtractor.extract();
+ List tables= new ArrayList<>();
+ while (pages.hasNext()) {
+ Page page = pages.next();
+
+ algExtractor = algorithm;
+ List tablesOnPage = detectionAlgorithm.detect(page);
+
+ for (Rectangle guessRect : tablesOnPage) {
+ Page guess = page.getArea(guessRect);
+ tables.addAll(algExtractor.extract(guess));
+ }
+ }
+
+ try {
+ objectExtractor.close();
+ } catch (Exception e) {
+ throw UserException.parseError(e)
+ .message("Error extracting table: " + e.getMessage())
+ .build(logger);
+ }
+
+ return tables;
+ }
+
+ /**
+ * Returns a specific table from a PDF document. Returns null in the event that
+ * the user requests a table that does not exist. If there is an error with the document
+ * the function will throw a UserException.
+ * @param document The source PDF document
+ * @param tableIndex The index of the desired table
+ * @return The desired Table, null if the table is not valid, or if the document has no tables.
+ */
+ public static Table getSpecificTable(PDDocument document, int tableIndex, ExtractionAlgorithm algorithm) {
+ NurminenDetectionAlgorithm detectionAlgorithm = new NurminenDetectionAlgorithm();
+ ExtractionAlgorithm algExtractor;
+
+ if (algorithm == null) {
+ algExtractor = DEFAULT_ALGORITHM;
+ } else {
+ algExtractor = algorithm;
+ }
+
+ ObjectExtractor objectExtractor = new ObjectExtractor(document);
+ PageIterator pages = objectExtractor.extract();
+
+ Table specificTable;
+ int tableCounter = 0;
+ while (pages.hasNext()) {
+ Page page = pages.next();
+
+ List rectanglesOnPage = detectionAlgorithm.detect(page);
+ List tablesOnPage = new ArrayList<>();
+
+ for (Rectangle guessRect : rectanglesOnPage) {
+ Page guess = page.getArea(guessRect);
+ tablesOnPage.addAll(algExtractor.extract(guess));
+ if (tablesOnPage.size() == 0) {
+ return null;
+ }
+
+ for (Table table : tablesOnPage) {
+ if (tableCounter == tableIndex) {
+ specificTable = table;
+ return specificTable;
+ }
+ tableCounter++;
+ }
+ }
+ }
+ try {
+ objectExtractor.close();
+ } catch (Exception e) {
+ throw UserException.parseError(e)
+ .message("Error extracting table: " + e.getMessage())
+ .build(logger);
+ }
+
+ return null;
+ }
+
+ /**
+ * Returns the values contained in a PDF Table row
+ * @param table The source table
+ * @return A list of the header rows
+ */
+ public static List extractFirstRowValues(Table table) {
+ List values = new ArrayList<>();
+ if (table == null) {
+ return values;
+ }
+ List firstRow = table.getRows().get(0);
+
+ if (firstRow != null) {
+ for (RectangularTextContainer rectangularTextContainer : firstRow) {
+ values.add(rectangularTextContainer.getText());
+ }
+ }
+ return values;
+ }
+
+ /**
+ * This function retuns the contents of a specific row in a PDF table as a list of Strings.
+ * @param table The table containing the data.
+ * @param rowIndex The desired row index
+ * @return A list of Strings with the data.
+ */
+ public static List getRowAsStringList(Table table, int rowIndex) {
+ List values = new ArrayList<>();
+ if (table == null) {
+ return values;
+ }
+
+ List row = table.getRows().get(rowIndex);
+ for (RectangularTextContainer rectangularTextContainer : row) {
+ values.add(rectangularTextContainer.getText());
+ }
+ return values;
+ }
+
+ public static List convertRowToStringArray(List input) {
+ List values = new ArrayList<>();
+ for (RectangularTextContainer rectangularTextContainer : input) {
+ values.add(rectangularTextContainer.getText());
+ }
+ return values;
+ }
+
+
+ /**
+ * This function retuns the contents of a specific row in a PDF table as a list of Strings.
+ * @param table The table containing the data.
+ * @param rowIndex The desired row index
+ * @return A list of Strings with the data.
+ */
+ public static List getRow(Table table, int rowIndex) {
+ List values = new ArrayList<>();
+ if (table == null) {
+ return values;
+ }
+ return table.getRows().get(rowIndex);
+ }
+}
diff --git a/contrib/format-pdf/src/main/resources/bootstrap-format-plugins.json b/contrib/format-pdf/src/main/resources/bootstrap-format-plugins.json
new file mode 100644
index 00000000000..162a66e943c
--- /dev/null
+++ b/contrib/format-pdf/src/main/resources/bootstrap-format-plugins.json
@@ -0,0 +1,46 @@
+{
+ "storage":{
+ "dfs": {
+ "type": "file",
+ "formats": {
+ "pdf": {
+ "type": "pdf",
+ "extensions": [
+ "pdf"
+ ],
+ "extractionAlgorithm": "basic",
+ "extractHeaders": true,
+ "combinePages": false
+ }
+ }
+ },
+ "cp": {
+ "type": "file",
+ "formats": {
+ "pdf": {
+ "type": "pdf",
+ "extensions": [
+ "pdf"
+ ],
+ "extractionAlgorithm": "basic",
+ "extractHeaders": true,
+ "combinePages": false
+ }
+ }
+ },
+ "s3": {
+ "type": "file",
+ "formats": {
+ "pdf": {
+ "type": "pdf",
+ "extensions": [
+ "pdf"
+ ],
+ "extractionAlgorithm": "basic",
+ "extractHeaders": true,
+ "combinePages": false
+ }
+ }
+ }
+ }
+}
diff --git a/contrib/format-pdf/src/main/resources/drill-module.conf b/contrib/format-pdf/src/main/resources/drill-module.conf
new file mode 100644
index 00000000000..5a93feac2af
--- /dev/null
+++ b/contrib/format-pdf/src/main/resources/drill-module.conf
@@ -0,0 +1,23 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# This file tells Drill to consider this module when class path scanning.
+# This file can also include any supplementary configuration information.
+# This file is in HOCON format, see https://github.com/typesafehub/config/blob/master/HOCON.md for more information.
+
+drill.classpath.scanning.packages += "org.apache.drill.exec.store.pdf"
\ No newline at end of file
diff --git a/contrib/format-pdf/src/test/java/org/apache/drill/exec/store/pdf/TestPdfFormat.java b/contrib/format-pdf/src/test/java/org/apache/drill/exec/store/pdf/TestPdfFormat.java
new file mode 100644
index 00000000000..1383448fb6b
--- /dev/null
+++ b/contrib/format-pdf/src/test/java/org/apache/drill/exec/store/pdf/TestPdfFormat.java
@@ -0,0 +1,410 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.drill.exec.store.pdf;
+
+import org.apache.drill.categories.RowSetTests;
+import org.apache.drill.common.types.TypeProtos.MinorType;
+import org.apache.drill.exec.physical.rowSet.RowSet;
+import org.apache.drill.exec.physical.rowSet.RowSetBuilder;
+import org.apache.drill.exec.record.metadata.SchemaBuilder;
+import org.apache.drill.exec.record.metadata.TupleMetadata;
+import org.apache.drill.exec.rpc.RpcException;
+import org.apache.drill.test.ClusterFixture;
+import org.apache.drill.test.ClusterTest;
+import org.apache.drill.test.QueryBuilder;
+import org.apache.drill.test.QueryBuilder.QuerySummary;
+import org.apache.drill.test.QueryTestUtil;
+import org.apache.drill.test.rowSet.RowSetComparison;
+import org.junit.BeforeClass;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+
+import java.nio.file.Paths;
+import java.time.LocalDate;
+
+import static org.apache.drill.test.QueryTestUtil.generateCompressedFile;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+/**
+ * Integration tests for the PDF format plugin, covering wildcard and explicit
+ * projection, format options passed via the table() function (extractHeaders,
+ * combinePages, extractionAlgorithm, defaultTableIndex, password), implicit
+ * metadata columns, provided schema, Unicode content and compressed files.
+ */
+@Category(RowSetTests.class)
+public class TestPdfFormat extends ClusterTest {
+
+  @BeforeClass
+  public static void setup() throws Exception {
+    ClusterTest.startCluster(ClusterFixture.builder(dirTestWatcher));
+
+    // Needed for compressed file unit test
+    dirTestWatcher.copyResourceToRoot(Paths.get("pdf/"));
+  }
+
+  @Test
+  public void testStarQuery() throws RpcException {
+    String sql = "SELECT * FROM cp.`pdf/argentina_diputados_voting_record.pdf` WHERE `Provincia` = 'Rio Negro'";
+
+    QueryBuilder q = client.queryBuilder().sql(sql);
+    RowSet results = q.rowSet();
+
+    // Headers come from the table's first row; the unnamed fourth column
+    // falls back to the generated name field_0.
+    TupleMetadata expectedSchema = new SchemaBuilder()
+      .addNullable("Apellido y Nombre", MinorType.VARCHAR)
+      .addNullable("Bloque político", MinorType.VARCHAR)
+      .addNullable("Provincia", MinorType.VARCHAR)
+      .addNullable("field_0", MinorType.VARCHAR)
+      .buildSchema();
+
+    RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema)
+      .addRow("ALBRIEU, Oscar Edmundo Nicolas", "Frente para la Victoria - PJ", "Rio Negro", "AFIRMATIVO")
+      .addRow("AVOSCAN, Herman Horacio", "Frente para la Victoria - PJ", "Rio Negro", "AFIRMATIVO")
+      .addRow("CEJAS, Jorge Alberto", "Frente para la Victoria - PJ", "Rio Negro", "AFIRMATIVO")
+      .build();
+
+    new RowSetComparison(expected).verifyAndClearAll(results);
+  }
+
+  @Test
+  public void testExplicitQuery() throws RpcException {
+    String sql = "SELECT `Apellido y Nombre`, `Bloque político`, `Provincia`, `field_0` " +
+      "FROM cp.`pdf/argentina_diputados_voting_record.pdf` WHERE `Provincia` = 'Rio Negro'";
+
+    QueryBuilder q = client.queryBuilder().sql(sql);
+    RowSet results = q.rowSet();
+
+    TupleMetadata expectedSchema = new SchemaBuilder()
+      .addNullable("Apellido y Nombre", MinorType.VARCHAR)
+      .addNullable("Bloque político", MinorType.VARCHAR)
+      .addNullable("Provincia", MinorType.VARCHAR)
+      .addNullable("field_0", MinorType.VARCHAR)
+      .buildSchema();
+
+    RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema)
+      .addRow("ALBRIEU, Oscar Edmundo Nicolas", "Frente para la Victoria - PJ", "Rio Negro", "AFIRMATIVO")
+      .addRow("AVOSCAN, Herman Horacio", "Frente para la Victoria - PJ", "Rio Negro", "AFIRMATIVO")
+      .addRow("CEJAS, Jorge Alberto", "Frente para la Victoria - PJ", "Rio Negro", "AFIRMATIVO")
+      .build();
+
+    new RowSetComparison(expected).verifyAndClearAll(results);
+  }
+
+  @Test
+  public void testFullScan() throws Exception {
+    // Same file scanned with and without header extraction returns the same
+    // row count here.
+    String sql = "SELECT * " +
+      "FROM table(cp.`pdf/argentina_diputados_voting_record.pdf` " +
+      "(type => 'pdf', combinePages => false, extractHeaders => false))";
+
+    RowSet results = client.queryBuilder().sql(sql).rowSet();
+    assertEquals(31, results.rowCount());
+    results.clear();
+
+    sql = "SELECT * " +
+      "FROM table(cp.`pdf/argentina_diputados_voting_record.pdf` " +
+      "(type => 'pdf', combinePages => false, extractHeaders => true))";
+
+    results = client.queryBuilder().sql(sql).rowSet();
+    assertEquals(31,results.rowCount());
+    results.clear();
+  }
+
+  @Test
+  public void testEncryptedFile() throws Exception {
+    // The password option decrypts the file at query time.
+    String sql = "SELECT * " +
+      "FROM table(cp.`pdf/encrypted.pdf` " +
+      "(type => 'pdf', combinePages => false, extractHeaders => true, password => 'userpassword'))";
+
+    RowSet results = client.queryBuilder().sql(sql).rowSet();
+
+    TupleMetadata expectedSchema = new SchemaBuilder()
+      .addNullable("FLA Audit Profile", MinorType.VARCHAR)
+      .addNullable("field_0", MinorType.VARCHAR)
+      .buildSchema();
+
+    RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema)
+      .addRow("Country", "China")
+      .addRow("Factory name", "01001523B")
+      .addRow("IEM", "BVCPS (HK), Shen Zhen Office")
+      .addRow("Date of audit", "May 20-22, 2003")
+      .addRow("PC(s)", "adidas-Salomon")
+      .addRow("Number of workers", "243")
+      .addRow("Product(s)", "Scarf, cap, gloves, beanies and headbands")
+      .addRow("Production processes", "Sewing, cutting, packing, embroidery, die-cutting")
+      .build();
+
+    new RowSetComparison(expected).verifyAndClearAll(results);
+  }
+
+  @Test
+  public void testNoHeaders() throws RpcException {
+    // With extractHeaders => false every column gets a generated field_N name.
+    String sql = "SELECT * " +
+      "FROM table(cp.`pdf/argentina_diputados_voting_record.pdf` " +
+      "(type => 'pdf', combinePages => false, extractHeaders => false)) WHERE field_2 = 'Rio Negro'";
+
+    QueryBuilder q = client.queryBuilder().sql(sql);
+    RowSet results = q.rowSet();
+
+    TupleMetadata expectedSchema = new SchemaBuilder()
+      .addNullable("field_0", MinorType.VARCHAR)
+      .addNullable("field_1", MinorType.VARCHAR)
+      .addNullable("field_2", MinorType.VARCHAR)
+      .addNullable("field_3", MinorType.VARCHAR)
+      .buildSchema();
+
+    RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema)
+      .addRow("ALBRIEU, Oscar Edmundo Nicolas", "Frente para la Victoria - PJ", "Rio Negro", "AFIRMATIVO")
+      .addRow("AVOSCAN, Herman Horacio", "Frente para la Victoria - PJ", "Rio Negro", "AFIRMATIVO")
+      .addRow("CEJAS, Jorge Alberto", "Frente para la Victoria - PJ", "Rio Negro", "AFIRMATIVO")
+      .build();
+
+    new RowSetComparison(expected).verifyAndClearAll(results);
+  }
+
+  @Test
+  public void testMetadataQuery() throws RpcException {
+    String sql = "SELECT _page_count, " +
+      "_title, " +
+      "_author, " +
+      "_subject, " +
+      "_keywords, " +
+      "_creator, " +
+      "_producer," +
+      "_creation_date, " +
+      "_modification_date, " +
+      "_trapped " +
+      "FROM cp.`pdf/20.pdf` " +
+      "LIMIT 1";
+
+    QueryBuilder q = client.queryBuilder().sql(sql);
+    RowSet results = q.rowSet();
+
+    TupleMetadata expectedSchema = new SchemaBuilder()
+      .addNullable("_page_count", MinorType.INT)
+      .addNullable("_title", MinorType.VARCHAR)
+      .addNullable("_author", MinorType.VARCHAR)
+      .addNullable("_subject", MinorType.VARCHAR)
+      .addNullable("_keywords", MinorType.VARCHAR)
+      .addNullable("_creator", MinorType.VARCHAR)
+      .addNullable("_producer", MinorType.VARCHAR)
+      .addNullable("_creation_date", MinorType.TIMESTAMP)
+      .addNullable("_modification_date", MinorType.TIMESTAMP)
+      .addNullable("_trapped", MinorType.VARCHAR)
+      .buildSchema();
+
+    // Timestamp columns are expressed as epoch milliseconds.
+    RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema)
+      .addRow(1, "Agricultural Landuse Survey in The Sumas River Watershed Summa",
+        "Vision", "Agricultural Landuse Survey in The Sumas River Watershed Summa",
+        "Agricultural Landuse Survey in The Sumas River Watershed Summa",
+        "PScript5.dll Version 5.2.2",
+        "Acrobat Distiller 7.0.5 (Windows)",
+        857403000000L,
+        1230835135000L,
+        null)
+      .build();
+
+    new RowSetComparison(expected).verifyAndClearAll(results);
+  }
+
+  @Test
+  public void testUnicode() throws Exception {
+    // Verifies that non-Latin (Arabic) headers and cell values survive extraction.
+    String sql = "SELECT * FROM cp.`pdf/arabic.pdf`";
+    RowSet results = client.queryBuilder().sql(sql).rowSet();
+
+    TupleMetadata expectedSchema = new SchemaBuilder()
+      .addNullable("مرحباً", MinorType.VARCHAR)
+      .addNullable("اسمي سلطان", MinorType.VARCHAR)
+      .buildSchema();
+
+    RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema)
+      .addRow("انا من ولاية كارولينا الشمال", "من اين انت؟")
+      .addRow( "1234", "عندي 47 قطط")
+      .addRow("هل انت شباك؟", "اسمي Jeremy في الانجليزية")
+      .addRow("Jeremy is جرمي in Arabic", null)
+      .build();
+
+    new RowSetComparison(expected).verifyAndClearAll(results);
+  }
+
+  @Test
+  public void testSerDe() throws Exception {
+    // Round-trips the physical plan through JSON to verify plugin serialization.
+    String sql = "SELECT COUNT(*) AS cnt FROM " +
+      "table(cp.`pdf/argentina_diputados_voting_record.pdf` (type => 'pdf', combinePages => false))";
+    String plan = queryBuilder().sql(sql).explainJson();
+    long cnt = queryBuilder().physical(plan).singletonLong();
+    assertEquals("Counts should match",31L, cnt);
+  }
+
+  @Test
+  public void testPageMerge() throws Exception {
+    String sql = "SELECT * FROM table(cp.`pdf/schools.pdf` (type => 'pdf', combinePages => true, extractHeaders=> true))";
+    QuerySummary results = client.queryBuilder().sql(sql).run();
+    assertEquals(221, results.recordCount());
+  }
+
+  @Test
+  public void testFileWithNoTables() throws Exception {
+    String sql = "SELECT * FROM table(cp.`pdf/labor.pdf` (type => 'pdf', extractionAlgorithm => 'spreadsheet'))";
+    QuerySummary results = client.queryBuilder().sql(sql).run();
+    assertEquals(1,results.recordCount());
+  }
+
+  @Test
+  public void testMetadataQueryWithFileWithNoTables() throws RpcException {
+    // Metadata columns are still available even when no tables are found.
+    String sql = "SELECT _page_count, " +
+      "_title, " +
+      "_author, " +
+      "_subject, " +
+      "_keywords, " +
+      "_creator, " +
+      "_producer," +
+      "_creation_date, " +
+      "_modification_date, " +
+      "_trapped " +
+      "FROM table(cp.`pdf/labor.pdf` (type => 'pdf', extractionAlgorithm => 'spreadsheet')) LIMIT 1";
+
+    RowSet results = client.queryBuilder().sql(sql).rowSet();
+
+    TupleMetadata expectedSchema = new SchemaBuilder()
+      .addNullable("_page_count", MinorType.INT)
+      .addNullable("_title", MinorType.VARCHAR)
+      .addNullable("_author", MinorType.VARCHAR)
+      .addNullable("_subject", MinorType.VARCHAR)
+      .addNullable("_keywords", MinorType.VARCHAR)
+      .addNullable("_creator", MinorType.VARCHAR)
+      .addNullable("_producer", MinorType.VARCHAR)
+      .addNullable("_creation_date", MinorType.TIMESTAMP)
+      .addNullable("_modification_date", MinorType.TIMESTAMP)
+      .addNullable("_trapped", MinorType.VARCHAR)
+      .buildSchema();
+
+    RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema)
+      .addRow(1, null, null, null, null, "pdftk 2.01 - www.pdftk.com",
+        "itext-paulo-155 (itextpdf.sf.net-lowagie.com)",
+        QueryTestUtil.ConvertDateToLong("2015-04-25T23:09:47Z"),
+        QueryTestUtil.ConvertDateToLong("2015-04-25T23:09:47Z"), null)
+      .build();
+    new RowSetComparison(expected).verifyAndClearAll(results);
+  }
+
+  @Test
+  public void testExtractionAlgorithms() throws Exception {
+    // The two extraction algorithms split the same file into different columns,
+    // so only schema and row counts are verified here.
+    String sql = "SELECT * FROM table(cp.`pdf/schools.pdf` (type => 'pdf', combinePages => true, extractionAlgorithm => 'spreadsheet'))";
+    RowSet results = client.queryBuilder().sql(sql).rowSet();
+    TupleMetadata expectedSchema = new SchemaBuilder()
+      .addNullable("field_0", MinorType.VARCHAR)
+      .addNullable("Last Name", MinorType.VARCHAR)
+      .addNullable("First Name", MinorType.VARCHAR)
+      .addNullable("Address", MinorType.VARCHAR)
+      .addNullable("City", MinorType.VARCHAR)
+      .addNullable("State", MinorType.VARCHAR)
+      .addNullable("Zip", MinorType.VARCHAR)
+      .addNullable("Occupation", MinorType.VARCHAR)
+      .addNullable("Employer", MinorType.VARCHAR)
+      .addNullable("Date", MinorType.VARCHAR)
+      .addNullable("Amount", MinorType.VARCHAR)
+      .buildSchema();
+
+    assertTrue(results.schema().isEquivalent(expectedSchema));
+    assertEquals(216, results.rowCount());
+    results.clear();
+
+    sql = "SELECT * FROM table(cp.`pdf/schools.pdf` (type => 'pdf', combinePages => true, extractionAlgorithm => 'basic'))";
+    results = client.queryBuilder().sql(sql).rowSet();
+
+    expectedSchema = new SchemaBuilder()
+      .addNullable("Last Name", MinorType.VARCHAR)
+      .addNullable("First Name Address", MinorType.VARCHAR)
+      .addNullable("field_0", MinorType.VARCHAR)
+      .addNullable("City", MinorType.VARCHAR)
+      .addNullable("State", MinorType.VARCHAR)
+      .addNullable("Zip", MinorType.VARCHAR)
+      .addNullable("field_1", MinorType.VARCHAR)
+      .addNullable("Occupation Employer", MinorType.VARCHAR)
+      .addNullable("Date", MinorType.VARCHAR)
+      .addNullable("field_2", MinorType.VARCHAR)
+      .addNullable("Amount", MinorType.VARCHAR)
+      .buildSchema();
+
+    assertTrue(results.schema().isEquivalent(expectedSchema));
+    assertEquals(221, results.rowCount());
+    results.clear();
+  }
+
+  @Test
+  public void testProvidedSchema() throws Exception {
+    // Inline provided schema overrides the inferred types (DATE with a format
+    // property and DOUBLE here).
+    String sql = "SELECT * FROM table(cp.`pdf/schools.pdf` (type => 'pdf', combinePages => true, " +
+      "schema => 'inline=(`Last Name` VARCHAR, `First Name Address` VARCHAR, `field_0` VARCHAR, `City` " +
+      "VARCHAR, `State` VARCHAR, `Zip` VARCHAR, `field_1` VARCHAR, `Occupation Employer` VARCHAR, " +
+      "`Date` VARCHAR, `field_2` DATE properties {`drill.format` = `M/d/yyyy`}, `Amount` DOUBLE)')) " +
+      "LIMIT 5";
+    RowSet results = client.queryBuilder().sql(sql).rowSet();
+
+    TupleMetadata expectedSchema = new SchemaBuilder()
+      .addNullable("Last Name", MinorType.VARCHAR)
+      .addNullable("First Name Address", MinorType.VARCHAR)
+      .addNullable("field_0", MinorType.VARCHAR)
+      .addNullable("City", MinorType.VARCHAR)
+      .addNullable("State", MinorType.VARCHAR)
+      .addNullable("Zip", MinorType.VARCHAR)
+      .addNullable("field_1", MinorType.VARCHAR)
+      .addNullable("Occupation Employer", MinorType.VARCHAR)
+      .addNullable("Date", MinorType.VARCHAR)
+      .addNullable("field_2", MinorType.DATE)
+      .addNullable("Amount", MinorType.FLOAT8)
+      .buildSchema();
+
+    RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema)
+      .addRow("Lidstad", "Dick & Peg 62 Mississippi River Blvd N", null, "Saint Paul", "MN", null, "55104", "retired", null, LocalDate.parse("2012-10-12"), 60.0)
+      .addRow("Strom", "Pam 1229 Hague Ave", null, "St. Paul", "MN", null, "55104", null, null, LocalDate.parse("2012-09-12"), 60.0)
+      .addRow("Seeba", "Louise & Paul 1399 Sheldon St", null, "Saint Paul", "MN", null, "55108", "BOE City of Saint Paul", null, LocalDate.parse("2012-10-12"), 60.0)
+      .addRow("Schumacher / Bales", "Douglas L. / Patricia 948 County Rd. D W", null, "Saint Paul", "MN", null, "55126", null, null, LocalDate.parse("2012-10-13"), 60.0)
+      .addRow("Abrams", "Marjorie 238 8th St east", null, "St Paul", "MN", null, "55101", "Retired Retired", null, LocalDate.parse("2012-08-08"), 75.0)
+      .build();
+
+    new RowSetComparison(expected).verifyAndClearAll(results);
+  }
+
+  @Test
+  public void testSpecificTable() throws Exception {
+    String sql = "SELECT COUNT(*) FROM table(cp.`pdf/schools.pdf` (type => 'pdf', defaultTableIndex => 3))";
+    long resultCount = client.queryBuilder().sql(sql).singletonLong();
+    assertEquals(45L, resultCount);
+  }
+
+  @Test
+  public void testWithCompressedFile() throws Exception {
+    generateCompressedFile("pdf/argentina_diputados_voting_record.pdf", "zip", "pdf/compressed.pdf.zip" );
+
+    String sql = "SELECT * FROM dfs.`pdf/compressed.pdf.zip` WHERE `Provincia` = 'Rio Negro'";
+
+    QueryBuilder q = client.queryBuilder().sql(sql);
+    RowSet results = q.rowSet();
+
+    TupleMetadata expectedSchema = new SchemaBuilder()
+      .addNullable("Apellido y Nombre", MinorType.VARCHAR)
+      .addNullable("Bloque político", MinorType.VARCHAR)
+      .addNullable("Provincia", MinorType.VARCHAR)
+      .addNullable("field_0", MinorType.VARCHAR)
+      .buildSchema();
+
+    RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema)
+      .addRow("ALBRIEU, Oscar Edmundo Nicolas", "Frente para la Victoria - PJ", "Rio Negro", "AFIRMATIVO")
+      .addRow("AVOSCAN, Herman Horacio", "Frente para la Victoria - PJ", "Rio Negro", "AFIRMATIVO")
+      .addRow("CEJAS, Jorge Alberto", "Frente para la Victoria - PJ", "Rio Negro", "AFIRMATIVO")
+      .build();
+
+    new RowSetComparison(expected).verifyAndClearAll(results);
+  }
+}
diff --git a/contrib/format-pdf/src/test/java/org/apache/drill/exec/store/pdf/TestPdfUtils.java b/contrib/format-pdf/src/test/java/org/apache/drill/exec/store/pdf/TestPdfUtils.java
new file mode 100644
index 00000000000..d5a4dabfa0b
--- /dev/null
+++ b/contrib/format-pdf/src/test/java/org/apache/drill/exec/store/pdf/TestPdfUtils.java
@@ -0,0 +1,100 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.drill.exec.store.pdf;
+
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.junit.Test;
+import technology.tabula.Table;
+import java.io.File;
+import java.util.List;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertNull;
+
+
+/**
+ * Unit tests for {@link PdfUtils} table detection and extraction helpers.
+ */
+public class TestPdfUtils {
+
+  private static final String DATA_PATH = "src/test/resources/pdf/";
+
+  @Test
+  public void testTableExtractor() throws Exception {
+    PDDocument document = getDocument("argentina_diputados_voting_record.pdf");
+    List<Table> tableList = PdfUtils.extractTablesFromPDF(document);
+    document.close();
+    // JUnit convention: expected value first, actual second.
+    assertEquals(1, tableList.size());
+
+    PDDocument document2 = getDocument("twotables.pdf");
+    List<Table> tableList2 = PdfUtils.extractTablesFromPDF(document2);
+    document2.close();
+    assertEquals(2, tableList2.size());
+  }
+
+  @Test
+  public void testTableExtractorWithNoBoundingFrame() throws Exception {
+    PDDocument document = getDocument("spreadsheet_no_bounding_frame.pdf");
+    List<Table> tableList = PdfUtils.extractTablesFromPDF(document);
+    document.close();
+    assertEquals(1, tableList.size());
+  }
+
+  @Test
+  public void testTableExtractorWitMultipage() throws Exception {
+    PDDocument document = getDocument("us-020.pdf");
+    List<Table> tableList = PdfUtils.extractTablesFromPDF(document);
+    document.close();
+    assertEquals(4, tableList.size());
+  }
+
+  @Test
+  public void testGetSpecificTable() throws Exception {
+    PDDocument document = getDocument("us-020.pdf");
+    Table table = PdfUtils.getSpecificTable(document, 0, null);
+    // Close to avoid leaking a file handle; extracted table data remains
+    // usable after close (see testFirstRowExtractor which does the same).
+    document.close();
+    assertNotNull(table);
+    assertEquals(7, table.getColCount());
+  }
+
+  @Test
+  public void testGetFullPageSpecificTable() throws Exception {
+    PDDocument document = getDocument("schools.pdf");
+    Table table = PdfUtils.getSpecificTable(document, 3, null);
+    document.close();
+    assertNotNull(table);
+  }
+
+  @Test
+  public void testGetSpecificTableOutSideOfBounds() throws Exception {
+    // Requesting an index past the last table returns null rather than throwing.
+    PDDocument document = getDocument("us-020.pdf");
+    Table table = PdfUtils.getSpecificTable(document, 4, null);
+    document.close();
+    assertNull(table);
+  }
+
+  @Test
+  public void testFirstRowExtractor() throws Exception {
+    PDDocument document = getDocument("schools.pdf");
+    List<Table> tableList = PdfUtils.extractTablesFromPDF(document);
+    document.close();
+
+    List<String> values = PdfUtils.extractFirstRowValues(tableList.get(0));
+    assertEquals(11, values.size());
+  }
+
+  // Loads a test PDF from the resource directory; callers are responsible
+  // for closing the returned document.
+  private PDDocument getDocument(String fileName) throws Exception {
+    return PDDocument.load(new File(DATA_PATH + fileName));
+  }
+}
diff --git a/contrib/format-pdf/src/test/resources/pdf/20.pdf b/contrib/format-pdf/src/test/resources/pdf/20.pdf
new file mode 100755
index 00000000000..9aa111d14c0
Binary files /dev/null and b/contrib/format-pdf/src/test/resources/pdf/20.pdf differ
diff --git a/contrib/format-pdf/src/test/resources/pdf/arabic.pdf b/contrib/format-pdf/src/test/resources/pdf/arabic.pdf
new file mode 100644
index 00000000000..87b2d12d64c
Binary files /dev/null and b/contrib/format-pdf/src/test/resources/pdf/arabic.pdf differ
diff --git a/contrib/format-pdf/src/test/resources/pdf/argentina_diputados_voting_record.pdf b/contrib/format-pdf/src/test/resources/pdf/argentina_diputados_voting_record.pdf
new file mode 100644
index 00000000000..847f21be653
Binary files /dev/null and b/contrib/format-pdf/src/test/resources/pdf/argentina_diputados_voting_record.pdf differ
diff --git a/contrib/format-pdf/src/test/resources/pdf/campaign_donors.pdf b/contrib/format-pdf/src/test/resources/pdf/campaign_donors.pdf
new file mode 100644
index 00000000000..40eb8087914
Binary files /dev/null and b/contrib/format-pdf/src/test/resources/pdf/campaign_donors.pdf differ
diff --git a/contrib/format-pdf/src/test/resources/pdf/encrypted.pdf b/contrib/format-pdf/src/test/resources/pdf/encrypted.pdf
new file mode 100644
index 00000000000..d3586d60f88
Binary files /dev/null and b/contrib/format-pdf/src/test/resources/pdf/encrypted.pdf differ
diff --git a/contrib/format-pdf/src/test/resources/pdf/jpeg2000.pdf b/contrib/format-pdf/src/test/resources/pdf/jpeg2000.pdf
new file mode 100644
index 00000000000..815a5010494
Binary files /dev/null and b/contrib/format-pdf/src/test/resources/pdf/jpeg2000.pdf differ
diff --git a/contrib/format-pdf/src/test/resources/pdf/labor.pdf b/contrib/format-pdf/src/test/resources/pdf/labor.pdf
new file mode 100644
index 00000000000..9e322812a33
Binary files /dev/null and b/contrib/format-pdf/src/test/resources/pdf/labor.pdf differ
diff --git a/contrib/format-pdf/src/test/resources/pdf/schools.pdf b/contrib/format-pdf/src/test/resources/pdf/schools.pdf
new file mode 100644
index 00000000000..eef50e25558
Binary files /dev/null and b/contrib/format-pdf/src/test/resources/pdf/schools.pdf differ
diff --git a/contrib/format-pdf/src/test/resources/pdf/spreadsheet_no_bounding_frame.pdf b/contrib/format-pdf/src/test/resources/pdf/spreadsheet_no_bounding_frame.pdf
new file mode 100644
index 00000000000..aa221b981de
Binary files /dev/null and b/contrib/format-pdf/src/test/resources/pdf/spreadsheet_no_bounding_frame.pdf differ
diff --git a/contrib/format-pdf/src/test/resources/pdf/twotables.pdf b/contrib/format-pdf/src/test/resources/pdf/twotables.pdf
new file mode 100644
index 00000000000..42921a972c2
Binary files /dev/null and b/contrib/format-pdf/src/test/resources/pdf/twotables.pdf differ
diff --git a/contrib/format-pdf/src/test/resources/pdf/us-017.pdf b/contrib/format-pdf/src/test/resources/pdf/us-017.pdf
new file mode 100644
index 00000000000..64158f5fe83
Binary files /dev/null and b/contrib/format-pdf/src/test/resources/pdf/us-017.pdf differ
diff --git a/contrib/format-pdf/src/test/resources/pdf/us-020.pdf b/contrib/format-pdf/src/test/resources/pdf/us-020.pdf
new file mode 100644
index 00000000000..39a8546ce0d
Binary files /dev/null and b/contrib/format-pdf/src/test/resources/pdf/us-020.pdf differ
diff --git a/contrib/pom.xml b/contrib/pom.xml
index 78a1dd52000..9aacd68d8b1 100644
--- a/contrib/pom.xml
+++ b/contrib/pom.xml
@@ -48,6 +48,7 @@
format-excel
format-httpd
format-esri
+ format-pdf
format-hdf5
format-sas
format-spss
diff --git a/distribution/pom.xml b/distribution/pom.xml
index 5fc7d18ff84..8c9b6b84809 100644
--- a/distribution/pom.xml
+++ b/distribution/pom.xml
@@ -397,6 +397,11 @@
drill-format-httpd
${project.version}
+
+ org.apache.drill.contrib
+ drill-format-pdf
+ ${project.version}
+
org.apache.drill.contrib
drill-format-hdf5
diff --git a/distribution/src/assemble/component.xml b/distribution/src/assemble/component.xml
index 7421aa3d753..9aa9c3fe571 100644
--- a/distribution/src/assemble/component.xml
+++ b/distribution/src/assemble/component.xml
@@ -51,6 +51,7 @@
org.apache.drill.contrib:drill-format-hdf5:jar
org.apache.drill.contrib:drill-format-ltsv:jar
org.apache.drill.contrib:drill-format-httpd:jar
+ org.apache.drill.contrib:drill-format-pdf:jar
org.apache.drill.contrib:drill-format-excel:jar
org.apache.drill.contrib:drill-format-spss:jar
org.apache.drill.contrib:drill-format-sas:jar
diff --git a/distribution/src/main/resources/drill-config.sh b/distribution/src/main/resources/drill-config.sh
index 66e9f4d7b36..cacfec20d93 100644
--- a/distribution/src/main/resources/drill-config.sh
+++ b/distribution/src/main/resources/drill-config.sh
@@ -302,6 +302,9 @@ export DRILLBIT_CODE_CACHE_SIZE=${DRILLBIT_CODE_CACHE_SIZE:-"1G"}
export DRILLBIT_OPTS="-Xms$DRILL_HEAP -Xmx$DRILL_HEAP -XX:MaxDirectMemorySize=$DRILL_MAX_DIRECT_MEMORY"
export DRILLBIT_OPTS="$DRILLBIT_OPTS -XX:ReservedCodeCacheSize=$DRILLBIT_CODE_CACHE_SIZE -Ddrill.exec.enable-epoll=false"
+# Run the JVM in headless mode so the PDF format plugin cannot open unwanted AWT GUI windows
+export DRILLBIT_OPTS="$DRILLBIT_OPTS -Djava.awt.headless=true"
+
# Check that java is newer than 1.8
"$JAVA" -version 2>&1 | grep "version" | egrep -e "1\.8" > /dev/null
if [ $? -gt 0 ]; then
diff --git a/pom.xml b/pom.xml
index c4f36181e9f..384452b3634 100644
--- a/pom.xml
+++ b/pom.xml
@@ -411,6 +411,7 @@
**/git.properties
**/*.csv
**/*.csvh
+ **/*.pdf
**/*.csvh-test
**/*.tsv
**/*.txt
@@ -726,6 +727,7 @@
**/git.properties
**/*.csv
**/*.csvh
+ **/*.pdf
**/*.csvh-test
**/*.tsv
**/*.txt