Commit
* Add Streaming Source Impl
* update build.gradle
* change to hadoop-fs
* exclude FileSystemStreamSource from jacoco
* exclude unnecessary dependency
* Update integ-test dependency
* change from splits to split in batch

Signed-off-by: Peng Huo <penghuo@gmail.com>
Showing 14 changed files with 542 additions and 4 deletions.
core/src/main/java/org/opensearch/sql/executor/streaming/Batch.java (17 additions & 0 deletions)
@@ -0,0 +1,17 @@
/*
 * Copyright OpenSearch Contributors
 * SPDX-License-Identifier: Apache-2.0
 */

package org.opensearch.sql.executor.streaming;

import lombok.Data;
import org.opensearch.sql.storage.split.Split;

/**
 * A batch of streaming execution.
 */
@Data
public class Batch {
  private final Split split;
}
core/src/main/java/org/opensearch/sql/executor/streaming/Offset.java (17 additions & 0 deletions)
@@ -0,0 +1,17 @@
/*
 * Copyright OpenSearch Contributors
 * SPDX-License-Identifier: Apache-2.0
 */

package org.opensearch.sql.executor.streaming;

import lombok.Data;

/**
 * Offset.
 */
@Data
public class Offset {

  private final Long offset;
}
core/src/main/java/org/opensearch/sql/executor/streaming/StreamingSource.java (29 additions & 0 deletions)
@@ -0,0 +1,29 @@
/*
 * Copyright OpenSearch Contributors
 * SPDX-License-Identifier: Apache-2.0
 */

package org.opensearch.sql.executor.streaming;

import java.util.Optional;

/**
 * Streaming source.
 */
public interface StreamingSource {
  /**
   * Get current {@link Offset} of stream data.
   *
   * @return empty if the stream does not have new data.
   */
  Optional<Offset> getLatestOffset();

  /**
   * Get a {@link Batch} from source between (start, end].
   *
   * @param start start offset.
   * @param end end offset.
   * @return {@link Batch}.
   */
  Batch getBatch(Optional<Offset> start, Offset end);
}
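For context, a minimal sketch of how a micro-batch consumer might drive this interface. The MicroBatchLoop class, the polling interval, and the println processing are illustrative assumptions, not part of this commit:

import java.util.Optional;
import org.opensearch.sql.executor.streaming.Batch;
import org.opensearch.sql.executor.streaming.Offset;
import org.opensearch.sql.executor.streaming.StreamingSource;

public class MicroBatchLoop {
  /** Poll the source and fetch each new batch exactly once. */
  public static void run(StreamingSource source) throws InterruptedException {
    Optional<Offset> lastProcessed = Optional.empty();
    while (!Thread.currentThread().isInterrupted()) {
      Optional<Offset> latest = source.getLatestOffset();
      // Fetch only when the source reports an offset beyond the last processed one.
      if (latest.isPresent() && !latest.equals(lastProcessed)) {
        // getBatch is exclusive of start and inclusive of end: (start, end].
        Batch batch = source.getBatch(lastProcessed, latest.get());
        System.out.println("processing split " + batch.getSplit().getSplitId());
        lastProcessed = latest;
      }
      Thread.sleep(1000L); // poll interval, illustrative only
    }
  }
}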
core/src/main/java/org/opensearch/sql/storage/split/Split.java (21 additions & 0 deletions)
@@ -0,0 +1,21 @@
/*
 * Copyright OpenSearch Contributors
 * SPDX-License-Identifier: Apache-2.0
 */

package org.opensearch.sql.storage.split;

import org.opensearch.sql.storage.StorageEngine;

/**
 * Split is a section of a data set. Each {@link StorageEngine} should have a specific
 * implementation of Split.
 */
public interface Split {

  /**
   * Get the split id.
   * @return split id.
   */
  String getSplitId();
}
filesystem/build.gradle (129 additions & 0 deletions)
@@ -0,0 +1,129 @@
/*
 * Copyright OpenSearch Contributors
 * SPDX-License-Identifier: Apache-2.0
 */

plugins {
  id 'java-library'
  id "io.freefair.lombok"
  id 'jacoco'
}

ext {
  hadoop = "3.3.4"
  aws = "1.12.330"
}

configurations.all {
  resolutionStrategy.force "commons-io:commons-io:2.8.0"
}

dependencies {
  implementation project(':core')
  // required by hadoop filesystem https://hadoop.apache.org/docs/stable/hadoop-project-dist/hadoop-common/filesystem/index.html.
  implementation("org.apache.hadoop:hadoop-common:${hadoop}") {
    exclude group: 'org.apache.zookeeper'
    exclude group: 'org.eclipse.jetty'
    exclude group: 'com.sun.jersey'
    exclude group: 'javax.servlet.jsp'
    exclude group: 'javax.servlet'
    exclude group: 'org.apache.kerby'
    exclude group: 'org.apache.curator'
    exclude group: 'com.google.protobuf', module: 'protobuf-java'
    exclude group: 'org.apache.avro', module: 'avro'
    exclude group: 'com.nimbusds', module: 'nimbus-jose-jwt'
    // enforce version.
    exclude group: 'com.fasterxml.woodstox', module: 'woodstox-core'
    exclude group: 'commons-io', module: 'commons-io'
    exclude group: 'ch.qos.reload4j', module: 'reload4j'
    exclude group: 'org.apache.httpcomponents', module: 'httpcore'
  }
  implementation('com.fasterxml.woodstox:woodstox-core')
  constraints {
    implementation('com.fasterxml.woodstox:woodstox-core:6.4.0') {
      because 'https://www.mend.io/vulnerability-database/CVE-2022-40156'
    }
  }
  implementation('commons-io:commons-io')
  constraints {
    implementation('commons-io:commons-io:2.8.0') {
      because 'between versions 2.8.0 and 2.5'
    }
  }
  implementation('ch.qos.reload4j:reload4j')
  constraints {
    implementation('ch.qos.reload4j:reload4j:1.2.22') {
      because 'between versions 1.2.22 and 1.2.19'
    }
  }
  implementation('org.apache.httpcomponents:httpcore')
  constraints {
    implementation('org.apache.httpcomponents:httpcore:4.4.15') {
      because 'between versions 4.4.15 and 4.4.13'
    }
  }

  testImplementation('org.junit.jupiter:junit-jupiter:5.6.2')
  testImplementation group: 'org.hamcrest', name: 'hamcrest-library', version: '2.1'
  testImplementation group: 'org.mockito', name: 'mockito-core', version: '3.12.4'
  testImplementation group: 'org.mockito', name: 'mockito-junit-jupiter', version: '3.12.4'
}

test {
  useJUnitPlatform()
  testLogging {
    events "passed", "skipped", "failed"
    exceptionFormat "full"
  }

  // hadoop-fs depends on a native library which is missing on Windows.
  // https://hadoop.apache.org/docs/r3.3.4/hadoop-project-dist/hadoop-common/NativeLibraries.html#Native_Hadoop_Library
  if ('windows' == project.getProperties().getOrDefault('buildPlatform', 'linux')) {
    excludes = [
      '**/FileSystemStreamSourceTest.class'
    ]
  }
}

jacocoTestReport {
  reports {
    html.enabled true
    xml.enabled true
  }
  afterEvaluate {
    classDirectories.setFrom(files(classDirectories.files.collect {
      fileTree(dir: it)
    }))
  }
}
test.finalizedBy(project.tasks.jacocoTestReport)

jacocoTestCoverageVerification {
  violationRules {
    rule {
      // hadoop-fs depends on a native library which is missing on Windows.
      // https://hadoop.apache.org/docs/r3.3.4/hadoop-project-dist/hadoop-common/NativeLibraries.html#Native_Hadoop_Library
      if ('windows' == project.getProperties().getOrDefault('buildPlatform', 'linux')) {
        excludes = [
          'org.opensearch.sql.filesystem.streaming.FileSystemStreamSource'
        ]
      }
      element = 'CLASS'
      limit {
        counter = 'LINE'
        minimum = 1.0
      }
      limit {
        counter = 'BRANCH'
        minimum = 1.0
      }
    }
  }
  afterEvaluate {
    classDirectories.setFrom(files(classDirectories.files.collect {
      fileTree(dir: it)
    }))
  }
}
check.dependsOn jacocoTestCoverageVerification
jacocoTestCoverageVerification.dependsOn jacocoTestReport
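A usage note on the buildPlatform switch above: the Windows exclusions only take effect when the property is passed explicitly, for example ./gradlew :filesystem:test -PbuildPlatform=windows (the :filesystem module path is an assumption based on the file layout in this commit); with no property set, the default 'linux' value keeps FileSystemStreamSourceTest and its jacoco coverage rule in play.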
filesystem/src/main/java/org/opensearch/sql/filesystem/storage/split/FileSystemSplit.java (24 additions & 0 deletions)
@@ -0,0 +1,24 @@
/*
 * Copyright OpenSearch Contributors
 * SPDX-License-Identifier: Apache-2.0
 */

package org.opensearch.sql.filesystem.storage.split;

import java.util.Set;
import java.util.UUID;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.Getter;
import org.apache.hadoop.fs.Path;
import org.opensearch.sql.storage.split.Split;

@Data
public class FileSystemSplit implements Split {

  @Getter
  @EqualsAndHashCode.Exclude
  private final String splitId = UUID.randomUUID().toString();

  private final Set<Path> paths;
}
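A small sketch of the equality semantics above, worth spelling out because splitId is excluded from equals/hashCode. The SplitEqualityDemo class and the /data/file1 path are illustrative only:

import java.util.Set;
import org.apache.hadoop.fs.Path;
import org.opensearch.sql.filesystem.storage.split.FileSystemSplit;

public class SplitEqualityDemo {
  public static void main(String[] args) {
    FileSystemSplit a = new FileSystemSplit(Set.of(new Path("/data/file1")));
    FileSystemSplit b = new FileSystemSplit(Set.of(new Path("/data/file1")));
    // splitId is random per instance but excluded from equals/hashCode,
    // so two splits over the same paths compare equal...
    System.out.println(a.equals(b)); // true
    // ...while each split still carries a distinct random id.
    System.out.println(a.getSplitId().equals(b.getSplitId())); // false
  }
}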
filesystem/src/main/java/org/opensearch/sql/filesystem/streaming/FileMetaData.java (21 additions & 0 deletions)
@@ -0,0 +1,21 @@
/*
 * Copyright OpenSearch Contributors
 * SPDX-License-Identifier: Apache-2.0
 */

package org.opensearch.sql.filesystem.streaming;

import java.util.Set;
import lombok.Data;
import org.apache.hadoop.fs.Path;

/**
 * File metadata. Batch id associated with the set of {@link Path}.
 */
@Data
public class FileMetaData {

  private final Long batchId;

  private final Set<Path> paths;
}
filesystem/src/main/java/org/opensearch/sql/filesystem/streaming/FileSystemStreamSource.java (105 additions & 0 deletions)
@@ -0,0 +1,105 @@
/*
 * Copyright OpenSearch Contributors
 * SPDX-License-Identifier: Apache-2.0
 */

package org.opensearch.sql.filesystem.streaming;

import com.google.common.collect.Sets;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;
import lombok.SneakyThrows;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.opensearch.sql.executor.streaming.Batch;
import org.opensearch.sql.executor.streaming.DefaultMetadataLog;
import org.opensearch.sql.executor.streaming.MetadataLog;
import org.opensearch.sql.executor.streaming.Offset;
import org.opensearch.sql.executor.streaming.StreamingSource;
import org.opensearch.sql.filesystem.storage.split.FileSystemSplit;

/**
 * FileSystem streaming source backed by Hadoop FileSystem.
 */
public class FileSystemStreamSource implements StreamingSource {

  private static final Logger log = LogManager.getLogger(FileSystemStreamSource.class);

  private final MetadataLog<FileMetaData> fileMetaDataLog;

  private Set<Path> seenFiles;

  private final FileSystem fs;

  private final Path basePath;

  /**
   * Constructor of FileSystemStreamSource.
   */
  public FileSystemStreamSource(FileSystem fs, Path basePath) {
    this.fs = fs;
    this.basePath = basePath;
    // todo, need to add state recovery
    this.fileMetaDataLog = new DefaultMetadataLog<>();
    // todo, need to add state recovery
    this.seenFiles = new HashSet<>();
  }

  @SneakyThrows(value = IOException.class)
  @Override
  public Optional<Offset> getLatestOffset() {
    // list all files. todo: improve list performance.
    Set<Path> allFiles =
        Arrays.stream(fs.listStatus(basePath))
            .filter(status -> !status.isDirectory())
            .map(FileStatus::getPath)
            .collect(Collectors.toSet());

    // find unread files.
    log.debug("all files {}", allFiles);
    Set<Path> unread = Sets.difference(allFiles, seenFiles);

    // update seenFiles.
    seenFiles = allFiles;
    log.debug("seen files {}", seenFiles);

    Optional<Long> latestBatchIdOptional = fileMetaDataLog.getLatest().map(Pair::getKey);
    if (!unread.isEmpty()) {
      long latestBatchId = latestBatchIdOptional.map(id -> id + 1).orElse(0L);
      fileMetaDataLog.add(latestBatchId, new FileMetaData(latestBatchId, unread));
      log.debug("latestBatchId {}", latestBatchId);
      return Optional.of(new Offset(latestBatchId));
    } else {
      log.debug("no unread data");
      Optional<Offset> offset =
          latestBatchIdOptional.isEmpty()
              ? Optional.empty()
              : Optional.of(new Offset(latestBatchIdOptional.get()));
      log.debug("return empty offset {}", offset);
      return offset;
    }
  }

  @Override
  public Batch getBatch(Optional<Offset> start, Offset end) {
    Long startBatchId = start.map(Offset::getOffset).map(id -> id + 1).orElse(0L);
    Long endBatchId = end.getOffset();

    Set<Path> paths =
        fileMetaDataLog.get(Optional.of(startBatchId), Optional.of(endBatchId)).stream()
            .map(FileMetaData::getPaths)
            .flatMap(Set::stream)
            .collect(Collectors.toSet());

    log.debug("fetch files {} with id from: {} to: {}.", paths, start, end);
    return new Batch(new FileSystemSplit(paths));
  }
}
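To see the source end to end, a minimal sketch wiring it to a local Hadoop FileSystem. The LocalStreamDemo class and the /tmp/stream-input directory are assumptions for illustration, not part of this commit:

import java.util.Optional;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.opensearch.sql.executor.streaming.Batch;
import org.opensearch.sql.executor.streaming.Offset;
import org.opensearch.sql.filesystem.streaming.FileSystemStreamSource;

public class LocalStreamDemo {
  public static void main(String[] args) throws Exception {
    // Local filesystem through Hadoop's FileSystem abstraction.
    FileSystem fs = FileSystem.getLocal(new Configuration());
    FileSystemStreamSource source =
        new FileSystemStreamSource(fs, new Path("/tmp/stream-input"));
    // First poll: every file already in the directory becomes batch 0.
    Optional<Offset> offset = source.getLatestOffset();
    if (offset.isPresent()) {
      Batch batch = source.getBatch(Optional.empty(), offset.get());
      System.out.println("batch 0 split id: " + batch.getSplit().getSplitId());
    }
  }
}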