Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Feature]: add ignore missing field to text chunking processors #907

4 changes: 2 additions & 2 deletions DEVELOPER_GUIDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,8 @@ One easy way to get Java 11 on *nix is to use [sdkman](https://sdkman.io/).
```bash
curl -s "https://get.sdkman.io" | bash
source ~/.sdkman/bin/sdkman-init.sh
sdk install java 11.0.2-open
sdk use java 11.0.2-open
sdk install java 21.0.2-open
sdk use java 21.0.2-open
IanMenendez marked this conversation as resolved.
Show resolved Hide resolved
```

JDK versions 14 and 17 were tested and are fully supported for local development.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,10 +46,13 @@ public final class TextChunkingProcessor extends AbstractProcessor {
public static final String FIELD_MAP_FIELD = "field_map";
public static final String ALGORITHM_FIELD = "algorithm";
private static final String DEFAULT_ALGORITHM = FixedTokenLengthChunker.ALGORITHM_NAME;
public static final String IGNORE_MISSING = "ignore_missing";
IanMenendez marked this conversation as resolved.
Show resolved Hide resolved
// Default for the "ignore_missing" processor option. Declared as a primitive so the
// default can never be null and needs no unboxing when handed to readBooleanProperty.
public static final boolean DEFAULT_IGNORE_MISSING = false;
IanMenendez marked this conversation as resolved.
Show resolved Hide resolved
IanMenendez marked this conversation as resolved.
Show resolved Hide resolved

private int maxChunkLimit;
private Chunker chunker;
private final Map<String, Object> fieldMap;
// When true, a source field whose value is null is skipped: no chunking is performed and
// nothing is written to the target key. Primitive to match the constructor's boolean
// parameter, so the flag can never be null when it is evaluated.
private final boolean ignoreMissing;
IanMenendez marked this conversation as resolved.
Show resolved Hide resolved
IanMenendez marked this conversation as resolved.
Show resolved Hide resolved
private final ClusterService clusterService;
private final AnalysisRegistry analysisRegistry;
private final Environment environment;
Expand All @@ -59,12 +62,14 @@ public TextChunkingProcessor(
final String description,
final Map<String, Object> fieldMap,
final Map<String, Object> algorithmMap,
final boolean ignoreMissing,
final Environment environment,
final ClusterService clusterService,
final AnalysisRegistry analysisRegistry
) {
super(tag, description);
this.fieldMap = fieldMap;
this.ignoreMissing = ignoreMissing;
this.environment = environment;
this.clusterService = clusterService;
this.analysisRegistry = analysisRegistry;
Expand Down Expand Up @@ -250,8 +255,11 @@ private void chunkMapType(
} else {
// chunk the object when target key is of leaf type (null, string and list of string)
Object chunkObject = sourceAndMetadataMap.get(originalKey);
List<String> chunkedResult = chunkLeafType(chunkObject, runtimeParameters);
sourceAndMetadataMap.put(String.valueOf(targetKey), chunkedResult);

IanMenendez marked this conversation as resolved.
Show resolved Hide resolved
if (!(ignoreMissing && chunkObject == null)) {
IanMenendez marked this conversation as resolved.
Show resolved Hide resolved
IanMenendez marked this conversation as resolved.
Show resolved Hide resolved
List<String> chunkedResult = chunkLeafType(chunkObject, runtimeParameters);
sourceAndMetadataMap.put(String.valueOf(targetKey), chunkedResult);
}
IanMenendez marked this conversation as resolved.
Show resolved Hide resolved
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,10 @@
import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.TYPE;
import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.FIELD_MAP_FIELD;
import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.ALGORITHM_FIELD;
import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.IGNORE_MISSING;
import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.DEFAULT_IGNORE_MISSING;
import static org.opensearch.ingest.ConfigurationUtils.readMap;
import static org.opensearch.ingest.ConfigurationUtils.readBooleanProperty;

/**
* Factory for chunking ingest processor for ingestion pipeline.
Expand Down Expand Up @@ -45,6 +48,16 @@ public TextChunkingProcessor create(
) throws Exception {
Map<String, Object> fieldMap = readMap(TYPE, processorTag, config, FIELD_MAP_FIELD);
Map<String, Object> algorithmMap = readMap(TYPE, processorTag, config, ALGORITHM_FIELD);
return new TextChunkingProcessor(processorTag, description, fieldMap, algorithmMap, environment, clusterService, analysisRegistry);
boolean ignoreMissing = readBooleanProperty(TYPE, processorTag, config, IGNORE_MISSING, DEFAULT_IGNORE_MISSING);
return new TextChunkingProcessor(
martin-gaievski marked this conversation as resolved.
Show resolved Hide resolved
processorTag,
description,
fieldMap,
algorithmMap,
ignoreMissing,
environment,
clusterService,
analysisRegistry
);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,9 @@ public class TextChunkingProcessorIT extends BaseNeuralSearchIT {
private static final String TEST_DOCUMENT = "processor/chunker/TextChunkingTestDocument.json";

private static final String TEST_LONG_DOCUMENT = "processor/chunker/TextChunkingTestLongDocument.json";
private static final String TEST_DOCUMENT_NO_BODY = "processor/chunker/TextChunkingTestDocumentNoBody.json";
IanMenendez marked this conversation as resolved.
Show resolved Hide resolved

IanMenendez marked this conversation as resolved.
Show resolved Hide resolved
private static final String IGNORE_MISSING_PIPELINE_NAME = "pipeline-with-ignore-missing";

private static final Map<String, String> PIPELINE_CONFIGS_BY_NAME = Map.of(
FIXED_TOKEN_LENGTH_PIPELINE_WITH_STANDARD_TOKENIZER_NAME,
Expand All @@ -59,7 +62,9 @@ public class TextChunkingProcessorIT extends BaseNeuralSearchIT {
DELIMITER_PIPELINE_NAME,
"processor/chunker/PipelineForDelimiterChunker.json",
CASCADE_PIPELINE_NAME,
"processor/chunker/PipelineForCascadedChunker.json"
"processor/chunker/PipelineForCascadedChunker.json",
IGNORE_MISSING_PIPELINE_NAME,
"processor/chunker/PipelineWithIgnoreMissing.json"
);

@Before
Expand Down Expand Up @@ -176,6 +181,36 @@ public void testTextChunkingProcessor_withCascadePipeline_successful() {
}
}

@SneakyThrows
public void testTextChunkingProcessor_withIgnoreMissing() {
IanMenendez marked this conversation as resolved.
Show resolved Hide resolved
try {
createPipelineProcessor(IGNORE_MISSING_PIPELINE_NAME);
createTextChunkingIndex(INDEX_NAME, IGNORE_MISSING_PIPELINE_NAME);
ingestDocument(TEST_DOCUMENT_NO_BODY);

validateIndexIngestResults(INDEX_NAME, OUTPUT_FIELD, null);
IanMenendez marked this conversation as resolved.
Show resolved Hide resolved

validateIndexIngestResults(INDEX_NAME, INTERMEDIATE_FIELD, null);
IanMenendez marked this conversation as resolved.
Show resolved Hide resolved
} finally {
wipeOfTestResources(INDEX_NAME, CASCADE_PIPELINE_NAME, null, null);
IanMenendez marked this conversation as resolved.
Show resolved Hide resolved
}
}

@SneakyThrows
public void testTextChunkingProcessor_withoutIgnoreMissing() {
IanMenendez marked this conversation as resolved.
Show resolved Hide resolved
try {
createPipelineProcessor(FIXED_TOKEN_LENGTH_PIPELINE_WITH_STANDARD_TOKENIZER_NAME);
createTextChunkingIndex(INDEX_NAME, CASCADE_PIPELINE_NAME);
IanMenendez marked this conversation as resolved.
Show resolved Hide resolved
ingestDocument(TEST_DOCUMENT_NO_BODY);

List<String> expectedPassages = new ArrayList<>();
validateIndexIngestResults(INDEX_NAME, OUTPUT_FIELD, expectedPassages);
validateIndexIngestResults(INDEX_NAME, INTERMEDIATE_FIELD, expectedPassages);
IanMenendez marked this conversation as resolved.
Show resolved Hide resolved
} finally {
wipeOfTestResources(INDEX_NAME, CASCADE_PIPELINE_NAME, null, null);
IanMenendez marked this conversation as resolved.
Show resolved Hide resolved
}
}

private void validateIndexIngestResults(String indexName, String fieldName, Object expected) {
assertEquals(1, getDocCount(indexName));
MatchAllQueryBuilder query = new MatchAllQueryBuilder();
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
{
"description": "An example fixed token length chunker pipeline with ignore missing == true",
"processors" : [
{
"text_chunking": {
"ignore_missing": true,
"field_map": {
"body": "body_chunk"
},
"algorithm": {
"fixed_token_length": {
"token_limit": 10,
"tokenizer": "letter"
}
}
}
}
]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{
"name": "OpenSearch"
}
Loading