Skip to content

Commit

Permalink
Change file reader logic (#103)
Browse files Browse the repository at this point in the history
* Change file reader logic

* Google Java Format

* Updating all dependencies

* Removing hprof files

* Google Java Format

* Adding new line

* Removing commented code

Co-authored-by: GitHub Actions <>
  • Loading branch information
benwatson528 authored Nov 27, 2022
1 parent 1d5ceee commit b9628fb
Show file tree
Hide file tree
Showing 12 changed files with 89 additions and 79 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -52,3 +52,4 @@ core.*
heapdump.*
javacore.*
Snap.*
*.hprof
26 changes: 13 additions & 13 deletions build.gradle.kts
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@ fun properties(key: String) = project.findProperty(key).toString()

plugins {
id("java")
id("org.jetbrains.kotlin.jvm") version "1.6.10"
id("org.jetbrains.intellij") version "1.4.0"
id("org.jetbrains.kotlin.jvm") version "1.7.21"
id("org.jetbrains.intellij") version "1.10.0"
}

group = properties("pluginGroup")
Expand All @@ -17,25 +17,25 @@ repositories {

dependencies {
//Avro dependencies
implementation("org.apache.avro:avro:1.11.0")
implementation("org.apache.avro:avro:1.11.1")
implementation("org.xerial.snappy:snappy-java:1.1.8.4")

//Parquet dependencies
implementation("org.apache.parquet:parquet-avro:1.12.2")
implementation("org.apache.parquet:parquet-column:1.12.2")
implementation("org.apache.parquet:parquet-hadoop:1.12.2")
implementation("org.apache.parquet:parquet-format-structures:1.12.2")
implementation("org.apache.hadoop:hadoop-client:3.3.2")
implementation("org.apache.parquet:parquet-avro:1.12.3")
implementation("org.apache.parquet:parquet-column:1.12.3")
implementation("org.apache.parquet:parquet-hadoop:1.12.3")
implementation("org.apache.parquet:parquet-format-structures:1.12.3")
implementation("org.apache.hadoop:hadoop-client:3.3.4")

//External dependencies
implementation("com.google.code.gson:gson:2.9.0")
implementation("com.google.code.gson:gson:2.10")
implementation("com.google.guava:guava:31.1-jre")
implementation("com.fifesoft:rsyntaxtextarea:3.1.6")
implementation("com.github.wnameless.json:json-flattener:0.13.0")
implementation("com.fifesoft:rsyntaxtextarea:3.3.0")
implementation("com.github.wnameless.json:json-flattener:0.15.1")

//Test dependencies
testImplementation("org.junit.jupiter:junit-jupiter-engine:5.8.2")
testImplementation("org.assertj:assertj-core:3.22.0")
testImplementation("org.junit.jupiter:junit-jupiter-engine:5.9.1")
testImplementation("org.assertj:assertj-core:3.23.1")
}

configurations.implementation {
Expand Down
2 changes: 1 addition & 1 deletion gradle.properties
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,6 @@ platformType=IC
platformVersion=2021.1.3
javaVersion=11

gradleVersion=7.4
gradleVersion=7.6

kotlin.stdlib.default.dependency=false
Binary file modified gradle/wrapper/gradle-wrapper.jar
Binary file not shown.
3 changes: 2 additions & 1 deletion gradle/wrapper/gradle-wrapper.properties
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
distributionBase=GRADLE_USER_HOME
distributionPath=wrapper/dists
distributionUrl=https\://services.gradle.org/distributions/gradle-7.4-bin.zip
distributionUrl=https\://services.gradle.org/distributions/gradle-7.6-bin.zip
networkTimeout=10000
zipStoreBase=GRADLE_USER_HOME
zipStorePath=wrapper/dists
18 changes: 14 additions & 4 deletions gradlew
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@
# Darwin, MinGW, and NonStop.
#
# (3) This script is generated from the Groovy template
# https://github.com/gradle/gradle/blob/master/subprojects/plugins/src/main/resources/org/gradle/api/internal/plugins/unixStartScript.txt
# https://github.com/gradle/gradle/blob/HEAD/subprojects/plugins/src/main/resources/org/gradle/api/internal/plugins/unixStartScript.txt
# within the Gradle project.
#
# You can find Gradle at https://github.com/gradle/gradle/.
Expand All @@ -80,10 +80,10 @@ do
esac
done

APP_HOME=$( cd "${APP_HOME:-./}" && pwd -P ) || exit

APP_NAME="Gradle"
# This is normally unused
# shellcheck disable=SC2034
APP_BASE_NAME=${0##*/}
APP_HOME=$( cd "${APP_HOME:-./}" && pwd -P ) || exit

# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"'
Expand Down Expand Up @@ -143,12 +143,16 @@ fi
if ! "$cygwin" && ! "$darwin" && ! "$nonstop" ; then
case $MAX_FD in #(
max*)
# In POSIX sh, ulimit -H is undefined. That's why the result is checked to see if it worked.
# shellcheck disable=SC3045
MAX_FD=$( ulimit -H -n ) ||
warn "Could not query maximum file descriptor limit"
esac
case $MAX_FD in #(
'' | soft) :;; #(
*)
# In POSIX sh, ulimit -n is undefined. That's why the result is checked to see if it worked.
# shellcheck disable=SC3045
ulimit -n "$MAX_FD" ||
warn "Could not set maximum file descriptor limit to $MAX_FD"
esac
Expand Down Expand Up @@ -205,6 +209,12 @@ set -- \
org.gradle.wrapper.GradleWrapperMain \
"$@"

# Stop when "xargs" is not available.
if ! command -v xargs >/dev/null 2>&1
then
die "xargs is not available"
fi

# Use "xargs" to parse quoted args.
#
# With -n1 it outputs one arg per line, with the quotes and backslashes removed.
Expand Down
15 changes: 9 additions & 6 deletions gradlew.bat
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
@rem limitations under the License.
@rem

@if "%DEBUG%" == "" @echo off
@if "%DEBUG%"=="" @echo off
@rem ##########################################################################
@rem
@rem Gradle startup script for Windows
Expand All @@ -25,7 +25,8 @@
if "%OS%"=="Windows_NT" setlocal

set DIRNAME=%~dp0
if "%DIRNAME%" == "" set DIRNAME=.
if "%DIRNAME%"=="" set DIRNAME=.
@rem This is normally unused
set APP_BASE_NAME=%~n0
set APP_HOME=%DIRNAME%

Expand All @@ -40,7 +41,7 @@ if defined JAVA_HOME goto findJavaFromJavaHome

set JAVA_EXE=java.exe
%JAVA_EXE% -version >NUL 2>&1
if "%ERRORLEVEL%" == "0" goto execute
if %ERRORLEVEL% equ 0 goto execute

echo.
echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
Expand Down Expand Up @@ -75,13 +76,15 @@ set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar

:end
@rem End local scope for the variables with windows NT shell
if "%ERRORLEVEL%"=="0" goto mainEnd
if %ERRORLEVEL% equ 0 goto mainEnd

:fail
rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
rem the _cmd.exe /c_ return code!
if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
exit /b 1
set EXIT_CODE=%ERRORLEVEL%
if %EXIT_CODE% equ 0 set EXIT_CODE=1
if not ""=="%GRADLE_EXIT_CONSOLE%" exit %EXIT_CODE%
exit /b %EXIT_CODE%

:mainEnd
if "%OS%"=="Windows_NT" endlocal
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -105,14 +105,6 @@ public synchronized void drop(DropTargetDropEvent evt) {
File file =
((List<File>) evt.getTransferable().getTransferData(DataFlavor.javaFileListFlavor))
.get(0);
String fileName = file.getName().toLowerCase();
if (!fileName.contains("avro") && !fileName.contains("parquet")) {
JOptionPane.showMessageDialog(
null,
String.format(
"File name \"%s\" must contain either \"avro\" or \"parquet\"", fileName));
return;
}
String path = file.getPath();
schemaTextPane.setText(String.format("Processing file %s", path));
LOGGER.info(String.format("Received file %s", path));
Expand Down Expand Up @@ -186,10 +178,9 @@ private void populatePanes(File file, int numRecords) {
protected Boolean doInBackground() {
schemaTextPane.setText(String.format("Processing file %s...", file.getPath()));
try {
Reader reader =
currentFile.getName().toLowerCase().contains("avro")
? new AvroFileReader(currentFile)
: new ParquetFileReader(currentFile);
Reader reader = detectFileType(currentFile);
LOGGER.info(
String.format("Detected file %s as a %s", currentFile, reader.getClass()));
List<String> records = reader.getRecords(numRecords);
int totalRecords = reader.getNumRecords();
configureDataPanes(records);
Expand All @@ -216,6 +207,29 @@ protected Boolean doInBackground() {
swingWorker.execute();
}

/**
 * Identifies the type of the dropped file by attempting to construct each supported reader in
 * turn - first an {@link AvroFileReader}, then a {@link ParquetFileReader} - and returning the
 * first one whose construction succeeds.
 *
 * <p>If neither reader accepts the file, the failure raised by each attempt is attached to the
 * thrown exception as a suppressed exception, so the reason for each rejection is not lost.
 *
 * @param currentFile the file to be parsed
 * @return the AvroFileReader or ParquetFileReader that was successfully constructed for the file
 * @throws IOException if the file is not recognised by either reader
 */
private Reader detectFileType(File currentFile) throws IOException {
  // NOTE(review): only Exception is intercepted here. If a reader constructor raises an Error
  // (the AvroFileReader constructor declares OutOfMemoryError), it propagates to the caller
  // and the Parquet attempt is never made - confirm this is the intended behaviour.
  Exception avroFailure;
  try {
    return new AvroFileReader(currentFile);
  } catch (Exception e) {
    avroFailure = e;
    LOGGER.debug(String.format("File %s is not an Avro file", currentFile));
  }

  Exception parquetFailure;
  try {
    return new ParquetFileReader(currentFile);
  } catch (Exception e) {
    parquetFailure = e;
    LOGGER.debug(String.format("File %s is not a Parquet file", currentFile));
  }

  // Keep both underlying failures reachable from the exception the caller sees.
  IOException unrecognised =
      new IOException(
          String.format("File %s is not recognised as either Parquet or Avro", currentFile));
  unrecognised.addSuppressed(avroFailure);
  unrecognised.addSuppressed(parquetFailure);
  throw unrecognised;
}

/**
* Populates the raw and table data panes with records and configures the radio buttons. If
* invalid JSON is found, the table pane is disabled and no data is loaded into it.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,11 @@ public class AvroFileReader implements Reader {
private final File file;
private final GenericDatumReader<GenericRecord> datumReader;

public AvroFileReader(File file) throws OutOfMemoryError {
public AvroFileReader(File file) throws OutOfMemoryError, IOException {
this.file = file;
GenericDataConfigurer.configureGenericData();
this.datumReader = new GenericDatumReader<>(null, null, GenericData.get());
getRecords(1);
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,8 @@
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Path;
import java.time.Instant;
import java.util.ArrayList;
import java.util.List;
import org.apache.avro.Conversion;
import org.apache.avro.LogicalType;
import org.apache.avro.Schema;
import org.apache.avro.data.TimeConversions;
import org.apache.avro.generic.GenericData;
Expand All @@ -47,11 +44,12 @@ public class ParquetFileReader implements Reader {
private final Path path;
private final Configuration conf;

public ParquetFileReader(File file) {
public ParquetFileReader(File file) throws IOException {
this.path = file.toPath();
this.conf = new Configuration();
this.conf.set("parquet.avro.readInt96AsFixed", "true");
GenericDataConfigurer.configureGenericData();
getRecords(1);
}

@Override
Expand Down Expand Up @@ -117,7 +115,6 @@ public List<String> getRecords(int numRecords) throws IOException, IllegalArgume
* https://stackoverflow.com/a/52041154/729819.
*/
private GenericRecord deserialize(Schema schema, byte[] data) throws IOException {
GenericData.get().addLogicalTypeConversion(new TimestampMillisConversion());
InputStream is = new ByteArrayInputStream(data);
Decoder decoder = DecoderFactory.get().binaryDecoder(is, null);
DatumReader<GenericRecord> reader = new GenericDatumReader<>(schema, schema, GenericData.get());
Expand All @@ -133,24 +130,4 @@ private byte[] toByteArray(Schema schema, GenericRecord genericRecord) throws IO
encoder.flush();
return baos.toByteArray();
}

public static class TimestampMillisConversion extends Conversion<String> {
public TimestampMillisConversion() {}

public Class<String> getConvertedType() {
return String.class;
}

public String getLogicalTypeName() {
return "timestamp-millis";
}

public String fromLong(Long millisFromEpoch, Schema schema, LogicalType type) {
return Instant.ofEpochMilli(millisFromEpoch).toString();
}

public Long toLong(String timestamp, Schema schema, LogicalType type) {
return new Long(timestamp);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@

import static org.assertj.core.api.Assertions.assertThat;
import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.fail;

import java.io.File;
import java.io.IOException;
Expand Down Expand Up @@ -76,25 +77,28 @@ public void testComplexNesting() throws IOException {

@Test
@DisplayName("Assert that an Avro file with a decimal LogicalType is correctly parsed")
public void testDecimalLogicalType() throws IOException {
AvroFileReader avroFileReader = readRecords(DECIMAL_LOGICAL_TYPE);
int totalRecords = avroFileReader.getNumRecords();
assertThat(totalRecords).isEqualTo(1);
List<String> records = avroFileReader.getRecords(100);
assertThat(records).hasSize(1);
String firstRecord = records.get(0);
assertThat(firstRecord).contains("25.190000");
public void testDecimalLogicalType() {
try {
AvroFileReader avroFileReader = readRecords(DECIMAL_LOGICAL_TYPE);
int totalRecords = avroFileReader.getNumRecords();
assertThat(totalRecords).isEqualTo(1);
List<String> records = avroFileReader.getRecords(100);
assertThat(records).hasSize(1);
String firstRecord = records.get(0);
assertThat(firstRecord).contains("25.190000");
} catch (IOException e) {
fail();
}
}

@Test
@DisplayName("Assert that an invalid Avro file throws an exception")
public void testInvalidFile() {
File file = new File(getClass().getClassLoader().getResource(INVALID_AVRO_FILE).getFile());
AvroFileReader avroFileReader = new AvroFileReader(file);
assertThrows(OutOfMemoryError.class, () -> avroFileReader.getRecords(5));
assertThrows(OutOfMemoryError.class, () -> new AvroFileReader(file));
}

private AvroFileReader readRecords(String fileName) {
private AvroFileReader readRecords(String fileName) throws IOException {
File file = new File(getClass().getClassLoader().getResource(fileName).getFile());
return new AvroFileReader(file);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ public void testList() throws IOException {
}

@Test
@DisplayName("Assert that a Parquet file with an INT96 column can still be displayed")
@DisplayName("Assert that a Parquet file with an INT96 column can be displayed")
public void testInt96File() throws IOException {
ParquetFileReader parquetFileReader = readRecords(INT96_PARQUET_FILE);
int totalRecords = parquetFileReader.getNumRecords();
Expand All @@ -102,7 +102,7 @@ public void testInt96File() throws IOException {
}

@Test
@DisplayName("Assert that a Parquet file with a LogicalType date column can still be displayed")
@DisplayName("Assert that a Parquet file with a LogicalType date column can be displayed")
public void testDateLogicalType() throws IOException {
ParquetFileReader parquetFileReader = readRecords(LOGICAL_DATE_PARQUET_FILE);
int totalRecords = parquetFileReader.getNumRecords();
Expand All @@ -115,8 +115,7 @@ public void testDateLogicalType() throws IOException {
}

@Test
@DisplayName(
"Assert that a Parquet file with a LogicalType decimal column can still be displayed")
@DisplayName("Assert that a Parquet file with a LogicalType decimal column can be displayed")
public void testDecimalLogicalType() throws IOException {
ParquetFileReader parquetFileReader = readRecords(LOGICAL_DECIMAL_PARQUET_FILE);
int totalRecords = parquetFileReader.getNumRecords();
Expand All @@ -127,7 +126,7 @@ public void testDecimalLogicalType() throws IOException {
assertThat(firstRecord).contains("{\"name\": \"ben\", \"score\": 1.15}");
}

private ParquetFileReader readRecords(String fileName) {
private ParquetFileReader readRecords(String fileName) throws IOException {
File file = new File(getClass().getClassLoader().getResource(fileName).getFile());
return new ParquetFileReader(file);
}
Expand Down

0 comments on commit b9628fb

Please sign in to comment.