Skip to content

Commit

Permalink
Revert "BackPort [ARROW-13572]: ORC support , [ARROW-13797]: column p…
Browse files Browse the repository at this point in the history
…rojection pushdow, and refactor some classes (#36)" (#41)

This reverts commit a2c70e8.
  • Loading branch information
zhztheplayer authored Nov 3, 2021
1 parent ab913d8 commit 74dcc77
Show file tree
Hide file tree
Showing 23 changed files with 18 additions and 806 deletions.
59 changes: 0 additions & 59 deletions cpp/src/arrow/adapters/orc/adapter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -276,15 +276,6 @@ class ORCFileReader::Impl {
return ReadTable(opts, schema, out);
}

/// Read the file as a Table, keeping only the fields named in include_names.
Status Read(const std::vector<std::string>& include_names,
            std::shared_ptr<Table>* out) {
  liborc::RowReaderOptions row_opts;
  // Apply the name selection first; the schema is then read with the same options.
  RETURN_NOT_OK(SelectNames(&row_opts, include_names));
  std::shared_ptr<Schema> selected_schema;
  RETURN_NOT_OK(ReadSchema(row_opts, &selected_schema));
  return ReadTable(row_opts, selected_schema, out);
}

Status Read(const std::shared_ptr<Schema>& schema,
const std::vector<int>& include_indices, std::shared_ptr<Table>* out) {
liborc::RowReaderOptions opts;
Expand All @@ -310,16 +301,6 @@ class ORCFileReader::Impl {
return ReadBatch(opts, schema, stripes_[stripe].num_rows, out);
}

/// Read one stripe as a RecordBatch, keeping only the fields named in include_names.
Status ReadStripe(int64_t stripe, const std::vector<std::string>& include_names,
                  std::shared_ptr<RecordBatch>* out) {
  liborc::RowReaderOptions row_opts;
  RETURN_NOT_OK(SelectNames(&row_opts, include_names));
  // SelectStripe rejects an out-of-bounds index, so stripes_[stripe] below is
  // only reached for a valid stripe.
  RETURN_NOT_OK(SelectStripe(&row_opts, stripe));
  std::shared_ptr<Schema> selected_schema;
  RETURN_NOT_OK(ReadSchema(row_opts, &selected_schema));
  const auto stripe_rows = stripes_[stripe].num_rows;
  return ReadBatch(row_opts, selected_schema, stripe_rows, out);
}

Status SelectStripe(liborc::RowReaderOptions* opts, int64_t stripe) {
ARROW_RETURN_IF(stripe < 0 || stripe >= NumberOfStripes(),
Status::Invalid("Out of bounds stripe: ", stripe));
Expand Down Expand Up @@ -356,13 +337,6 @@ class ORCFileReader::Impl {
return Status::OK();
}

/// Restrict opts to the fields named in include_names. Always returns OK.
Status SelectNames(liborc::RowReaderOptions* opts,
                   const std::vector<std::string>& include_names) {
  // include() is handed a std::list here, so copy the names out of the vector.
  std::list<std::string> names;
  names.assign(include_names.begin(), include_names.end());
  opts->include(names);
  return Status::OK();
}

Status ReadTable(const liborc::RowReaderOptions& row_opts,
const std::shared_ptr<Schema>& schema, std::shared_ptr<Table>* out) {
liborc::RowReaderOptions opts(row_opts);
Expand Down Expand Up @@ -461,23 +435,10 @@ Status ORCFileReader::Open(const std::shared_ptr<io::RandomAccessFile>& file,
return Status::OK();
}

// Result-returning Open: builds a reader, opens its implementation on `file`,
// and hands the reader back, or returns the failure Status from the open.
Result<std::unique_ptr<ORCFileReader>> ORCFileReader::Open(
    const std::shared_ptr<io::RandomAccessFile>& file, MemoryPool* pool) {
  std::unique_ptr<ORCFileReader> reader(new ORCFileReader());
  RETURN_NOT_OK(reader->impl_->Open(file, pool));
  // Keep the explicit move: the local is a move-only unique_ptr that is being
  // converted into the Result return type.
  return std::move(reader);
}

// Out-parameter form: forwards to the implementation object, which fills `out`.
Status ORCFileReader::ReadSchema(std::shared_ptr<Schema>* out) {
return impl_->ReadSchema(out);
}

// Result-returning form: adapts the implementation's out-parameter ReadSchema.
Result<std::shared_ptr<Schema>> ORCFileReader::ReadSchema() {
  std::shared_ptr<Schema> file_schema;
  RETURN_NOT_OK(impl_->ReadSchema(&file_schema));
  return file_schema;
}

// Reads the file as a Table by forwarding to the implementation object.
Status ORCFileReader::Read(std::shared_ptr<Table>* out) { return impl_->Read(out); }

Status ORCFileReader::Read(const std::shared_ptr<Schema>& schema,
Expand All @@ -490,13 +451,6 @@ Status ORCFileReader::Read(const std::vector<int>& include_indices,
return impl_->Read(include_indices, out);
}

// Result-returning read of the fields named in include_names; adapts the
// implementation's out-parameter overload.
Result<std::shared_ptr<Table>> ORCFileReader::Read(
    const std::vector<std::string>& include_names) {
  std::shared_ptr<Table> selected_table;
  RETURN_NOT_OK(impl_->Read(include_names, &selected_table));
  return selected_table;
}

Status ORCFileReader::Read(const std::shared_ptr<Schema>& schema,
const std::vector<int>& include_indices,
std::shared_ptr<Table>* out) {
Expand All @@ -507,24 +461,11 @@ Status ORCFileReader::ReadStripe(int64_t stripe, std::shared_ptr<RecordBatch>* o
return impl_->ReadStripe(stripe, out);
}

// Result-returning read of a single stripe; adapts the implementation's
// out-parameter overload.
Result<std::shared_ptr<RecordBatch>> ORCFileReader::ReadStripe(int64_t stripe) {
  std::shared_ptr<RecordBatch> batch;
  RETURN_NOT_OK(impl_->ReadStripe(stripe, &batch));
  return batch;
}

// Reads one stripe restricted to include_indices by forwarding to the
// implementation object, which fills `out`.
Status ORCFileReader::ReadStripe(int64_t stripe, const std::vector<int>& include_indices,
std::shared_ptr<RecordBatch>* out) {
return impl_->ReadStripe(stripe, include_indices, out);
}

// Result-returning read of a single stripe restricted to the fields named in
// include_names; adapts the implementation's out-parameter overload.
Result<std::shared_ptr<RecordBatch>> ORCFileReader::ReadStripe(
    int64_t stripe, const std::vector<std::string>& include_names) {
  std::shared_ptr<RecordBatch> batch;
  RETURN_NOT_OK(impl_->ReadStripe(stripe, include_names, &batch));
  return batch;
}

// Forwards the seek to the implementation object and returns its Status.
Status ORCFileReader::Seek(int64_t row_number) { return impl_->Seek(row_number); }

Status ORCFileReader::NextStripeReader(int64_t batch_sizes,
Expand Down
35 changes: 0 additions & 35 deletions cpp/src/arrow/adapters/orc/adapter.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,24 +48,11 @@ class ARROW_EXPORT ORCFileReader {
static Status Open(const std::shared_ptr<io::RandomAccessFile>& file, MemoryPool* pool,
std::unique_ptr<ORCFileReader>* reader);

/// \brief Creates a new ORC reader
///
/// Result-returning counterpart of the out-parameter Open() overload.
///
/// \param[in] file the data source
/// \param[in] pool a MemoryPool to use for buffer allocations
/// \return the opened reader, or the error Status if opening fails
static Result<std::unique_ptr<ORCFileReader>> Open(
const std::shared_ptr<io::RandomAccessFile>& file, MemoryPool* pool);

/// \brief Return the schema read from the ORC file
///
/// \param[out] out the returned Schema object
/// \return the Status reported by the reader implementation
Status ReadSchema(std::shared_ptr<Schema>* out);

/// \brief Return the schema read from the ORC file
///
/// Result-returning counterpart of ReadSchema(std::shared_ptr<Schema>*).
///
/// \return the Schema object, or the error Status if reading it fails
Result<std::shared_ptr<Schema>> ReadSchema();

/// \brief Read the file as a Table
///
/// The table will be composed of one record batch per stripe.
Expand All @@ -89,14 +76,6 @@ class ARROW_EXPORT ORCFileReader {
/// \param[out] out the returned Table
Status Read(const std::vector<int>& include_indices, std::shared_ptr<Table>* out);

/// \brief Read the file as a Table, selecting fields by name
///
/// The table will be composed of one record batch per stripe.
///
/// \param[in] include_names the selected field names to read
/// \return the Table, or the error Status if the read fails
Result<std::shared_ptr<Table>> Read(const std::vector<std::string>& include_names);

/// \brief Read the file as a Table
///
/// The table will be composed of one record batch per stripe.
Expand All @@ -113,12 +92,6 @@ class ARROW_EXPORT ORCFileReader {
/// \param[out] out the returned RecordBatch
Status ReadStripe(int64_t stripe, std::shared_ptr<RecordBatch>* out);

/// \brief Read a single stripe as a RecordBatch
///
/// Result-returning counterpart of the out-parameter ReadStripe() overload.
///
/// \param[in] stripe the stripe index
/// \return the RecordBatch, or the error Status if the read fails
Result<std::shared_ptr<RecordBatch>> ReadStripe(int64_t stripe);

/// \brief Read a single stripe as a RecordBatch
///
/// \param[in] stripe the stripe index
Expand All @@ -127,14 +100,6 @@ class ARROW_EXPORT ORCFileReader {
Status ReadStripe(int64_t stripe, const std::vector<int>& include_indices,
std::shared_ptr<RecordBatch>* out);

/// \brief Read a single stripe as a RecordBatch, selecting fields by name
///
/// \param[in] stripe the stripe index; an out-of-bounds index yields an
///            Invalid status
/// \param[in] include_names the selected field names to read
/// \return the RecordBatch, or the error Status if the read fails
Result<std::shared_ptr<RecordBatch>> ReadStripe(
int64_t stripe, const std::vector<std::string>& include_names);

/// \brief Seek to designated row. Invoke NextStripeReader() after seek
/// will return stripe reader starting from designated row.
///
Expand Down
8 changes: 0 additions & 8 deletions cpp/src/arrow/dataset/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -36,10 +36,6 @@ if(ARROW_CSV)
set(ARROW_DATASET_SRCS ${ARROW_DATASET_SRCS} file_csv.cc)
endif()

# Compile the ORC dataset source only when ARROW_ORC is enabled.
if(ARROW_ORC)
set(ARROW_DATASET_SRCS ${ARROW_DATASET_SRCS} file_orc.cc)
endif()

if(ARROW_PARQUET)
set(ARROW_DATASET_LINK_STATIC ${ARROW_DATASET_LINK_STATIC} parquet_static)
set(ARROW_DATASET_LINK_SHARED ${ARROW_DATASET_LINK_SHARED} parquet_shared)
Expand Down Expand Up @@ -120,10 +116,6 @@ if(ARROW_CSV)
add_arrow_dataset_test(file_csv_test)
endif()

# Register the ORC dataset test only when ARROW_ORC is enabled.
if(ARROW_ORC)
add_arrow_dataset_test(file_orc_test)
endif()

# Register the Parquet dataset test only when ARROW_PARQUET is enabled.
if(ARROW_PARQUET)
add_arrow_dataset_test(file_parquet_test)
endif()
Expand Down
1 change: 0 additions & 1 deletion cpp/src/arrow/dataset/api.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,5 @@
#include "arrow/dataset/file_base.h"
#include "arrow/dataset/file_csv.h"
#include "arrow/dataset/file_ipc.h"
#include "arrow/dataset/file_orc.h"
#include "arrow/dataset/file_parquet.h"
#include "arrow/dataset/scanner.h"
11 changes: 0 additions & 11 deletions cpp/src/arrow/dataset/expression.cc
Original file line number Diff line number Diff line change
Expand Up @@ -612,17 +612,6 @@ std::vector<FieldRef> FieldsInExpression(const Expression& expr) {
return fields;
}

bool ExpressionHasFieldRefs(const Expression& expr) {
if (expr.literal()) return false;

if (expr.field_ref()) return true;

for (const Expression& arg : CallNotNull(expr)->arguments) {
if (ExpressionHasFieldRefs(arg)) return true;
}
return false;
}

Result<Expression> FoldConstants(Expression expr) {
return Modify(
std::move(expr), [](Expression expr) { return expr; },
Expand Down
4 changes: 0 additions & 4 deletions cpp/src/arrow/dataset/expression.h
Original file line number Diff line number Diff line change
Expand Up @@ -159,10 +159,6 @@ Expression call(std::string function, std::vector<Expression> arguments,
ARROW_DS_EXPORT
std::vector<FieldRef> FieldsInExpression(const Expression&);

/// Check if the expression references any fields.
ARROW_EXPORT
bool ExpressionHasFieldRefs(const Expression&);

/// Assemble a mapping from field references to known values.
ARROW_DS_EXPORT
Result<std::unordered_map<FieldRef, Datum, FieldRef::Hash>> ExtractKnownFieldValues(
Expand Down
7 changes: 0 additions & 7 deletions cpp/src/arrow/dataset/file_base.cc
Original file line number Diff line number Diff line change
Expand Up @@ -84,13 +84,6 @@ Result<std::shared_ptr<io::InputStream>> FileSource::OpenCompressed(
return io::CompressedInputStream::Make(codec.get(), std::move(file));
}

// Placeholder kept to match the ORC file format's interface: the arguments are
// ignored and an already-finished future holding nullopt (no count) is returned.
Future<util::optional<int64_t>> FileFormat::CountRows(
    const std::shared_ptr<FileFragment>& file, Expression predicate,
    const std::shared_ptr<ScanOptions>& options) {
  using CountFuture = Future<util::optional<int64_t>>;
  return CountFuture::MakeFinished(util::nullopt);
}

// Convenience overload: forwards to the three-argument MakeFragment with
// literal(true) as the partition expression.
Result<std::shared_ptr<FileFragment>> FileFormat::MakeFragment(
FileSource source, std::shared_ptr<Schema> physical_schema) {
return MakeFragment(std::move(source), literal(true), std::move(physical_schema));
}
Expand Down
4 changes: 0 additions & 4 deletions cpp/src/arrow/dataset/file_base.h
Original file line number Diff line number Diff line change
Expand Up @@ -180,10 +180,6 @@ class ARROW_DS_EXPORT FileFormat : public std::enable_shared_from_this<FileForma
const std::shared_ptr<ScanOptions>& options,
const std::shared_ptr<FileFragment>& file);

/// \brief Report a row count for a fragment
///
/// NOTE(review): presumably the count of rows in `file` matching `predicate`;
/// confirm against the overriding formats. The base implementation ignores its
/// arguments and returns an already-finished future holding nullopt.
virtual Future<util::optional<int64_t>> CountRows(
const std::shared_ptr<FileFragment>& file, Expression predicate,
const std::shared_ptr<ScanOptions>& options);

/// \brief Open a fragment
virtual Result<std::shared_ptr<FileFragment>> MakeFragment(
FileSource source, Expression partition_expression,
Expand Down
Loading

0 comments on commit 74dcc77

Please sign in to comment.