[Feature]: add ignore missing field to text chunking processors (#907)
* feat: add ignore missing field to text chunking processor

Signed-off-by: Ian Menendez <ianfmenendezd@gmail.com>
Co-authored-by: Ian Menendez <ian.menendez@upstartcommerce.com>
(cherry picked from commit 00e622e)
IanMenendez authored and github-actions[bot] committed Oct 2, 2024
1 parent 072b3c8 commit 11f4701
Showing 4 changed files with 57 additions and 3 deletions.
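For orientation before the diff: the new `ignore_missing` option is read from the processor configuration alongside `field_map` and `algorithm`, and defaults to false. The sketch below builds the kind of configuration map the factory consumes, mirroring the constants introduced in this change; the `body`/`body_chunk` field names and the empty delimiter parameters are made up for illustration and are not the project's documented API. In an ingest pipeline these keys correspond to the processor's JSON configuration.

```java
import java.util.HashMap;
import java.util.Map;

// Minimal sketch (hypothetical field names): the configuration map that the
// text chunking processor factory reads, now including ignore_missing.
public class IgnoreMissingConfigSketch {
    public static Map<String, Object> chunkingProcessorConfig() {
        Map<String, Object> algorithmMap = new HashMap<>();
        algorithmMap.put("delimiter", new HashMap<String, Object>()); // chunking algorithm and its parameters

        Map<String, Object> fieldMap = new HashMap<>();
        fieldMap.put("body", "body_chunk"); // input field -> output field (hypothetical names)

        Map<String, Object> config = new HashMap<>();
        config.put("field_map", fieldMap);     // FIELD_MAP_FIELD
        config.put("algorithm", algorithmMap); // ALGORITHM_FIELD
        config.put("ignore_missing", true);    // IGNORE_MISSING; DEFAULT_IGNORE_MISSING is false
        return config;
    }
}
```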
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -15,6 +15,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
## [Unreleased 2.x](https://github.com/opensearch-project/neural-search/compare/2.17...2.x)
### Features
### Enhancements
+ - Implement `ignore_missing` field in text chunking processors ([#907](https://github.com/opensearch-project/neural-search/pull/907))
### Bug Fixes
### Infrastructure
### Documentation
17 changes: 15 additions & 2 deletions TextChunkingProcessor.java
@@ -46,10 +46,13 @@ public final class TextChunkingProcessor extends AbstractProcessor {
public static final String FIELD_MAP_FIELD = "field_map";
public static final String ALGORITHM_FIELD = "algorithm";
private static final String DEFAULT_ALGORITHM = FixedTokenLengthChunker.ALGORITHM_NAME;
+ public static final String IGNORE_MISSING = "ignore_missing";
+ public static final boolean DEFAULT_IGNORE_MISSING = false;

private int maxChunkLimit;
private Chunker chunker;
private final Map<String, Object> fieldMap;
+ private final boolean ignoreMissing;
private final ClusterService clusterService;
private final AnalysisRegistry analysisRegistry;
private final Environment environment;
@@ -59,12 +62,14 @@ public TextChunkingProcessor(
final String description,
final Map<String, Object> fieldMap,
final Map<String, Object> algorithmMap,
+ final boolean ignoreMissing,
final Environment environment,
final ClusterService clusterService,
final AnalysisRegistry analysisRegistry
) {
super(tag, description);
this.fieldMap = fieldMap;
+ this.ignoreMissing = ignoreMissing;
this.environment = environment;
this.clusterService = clusterService;
this.analysisRegistry = analysisRegistry;
@@ -75,6 +80,11 @@ public String getType() {
return TYPE;
}

+ // If ignore_missing is true, a missing (null) field is skipped and no chunked output is written; if false, a null field is still chunked and yields an empty list
+ private boolean shouldProcessChunk(Object chunkObject) {
+ return !ignoreMissing || Objects.nonNull(chunkObject);
+ }
+
@SuppressWarnings("unchecked")
private void parseAlgorithmMap(final Map<String, Object> algorithmMap) {
if (algorithmMap.size() > 1) {
@@ -250,8 +260,11 @@ private void chunkMapType(
} else {
// chunk the object when target key is of leaf type (null, string and list of string)
Object chunkObject = sourceAndMetadataMap.get(originalKey);
- List<String> chunkedResult = chunkLeafType(chunkObject, runtimeParameters);
- sourceAndMetadataMap.put(String.valueOf(targetKey), chunkedResult);
+
+ if (shouldProcessChunk(chunkObject)) {
+ List<String> chunkedResult = chunkLeafType(chunkObject, runtimeParameters);
+ sourceAndMetadataMap.put(String.valueOf(targetKey), chunkedResult);
+ }
}
}
}
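The new `shouldProcessChunk` check above decides whether a missing leaf field is chunked at all. Below is a self-contained sketch of the two outcomes, using made-up `body`/`body_chunk` names and mirroring only the predicate and the comment in this diff; it is an illustration, not the processor's code.

```java
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;

// Standalone illustration of the skip-vs-empty-list behavior described in the hunks above.
public class IgnoreMissingBehaviorSketch {

    public static void main(String[] args) {
        System.out.println(process(true));   // {}              -> output field never written
        System.out.println(process(false));  // {body_chunk=[]} -> empty list written
    }

    static Map<String, Object> process(boolean ignoreMissing) {
        Map<String, Object> source = new HashMap<>(); // document without the "body" field
        Object chunkObject = source.get("body");      // null, because the field is missing

        // Same predicate as shouldProcessChunk in TextChunkingProcessor
        if (!ignoreMissing || Objects.nonNull(chunkObject)) {
            // ignore_missing=false: a null leaf is chunked into an empty list (per the comment in the diff)
            List<String> chunkedResult = new ArrayList<>();
            source.put("body_chunk", chunkedResult);
        }
        // ignore_missing=true: nothing is written, so "body_chunk" stays absent
        return source;
    }
}
```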
15 changes: 14 additions & 1 deletion TextChunkingProcessorFactory.java
@@ -14,7 +14,10 @@
import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.TYPE;
import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.FIELD_MAP_FIELD;
import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.ALGORITHM_FIELD;
+ import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.IGNORE_MISSING;
+ import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.DEFAULT_IGNORE_MISSING;
import static org.opensearch.ingest.ConfigurationUtils.readMap;
+ import static org.opensearch.ingest.ConfigurationUtils.readBooleanProperty;

/**
* Factory for chunking ingest processor for ingestion pipeline.
@@ -45,6 +48,16 @@ public TextChunkingProcessor create(
) throws Exception {
Map<String, Object> fieldMap = readMap(TYPE, processorTag, config, FIELD_MAP_FIELD);
Map<String, Object> algorithmMap = readMap(TYPE, processorTag, config, ALGORITHM_FIELD);
- return new TextChunkingProcessor(processorTag, description, fieldMap, algorithmMap, environment, clusterService, analysisRegistry);
+ boolean ignoreMissing = readBooleanProperty(TYPE, processorTag, config, IGNORE_MISSING, DEFAULT_IGNORE_MISSING);
+ return new TextChunkingProcessor(
+ processorTag,
+ description,
+ fieldMap,
+ algorithmMap,
+ ignoreMissing,
+ environment,
+ clusterService,
+ analysisRegistry
+ );
}
}
27 changes: 27 additions & 0 deletions TextChunkingProcessorTests.java
@@ -42,6 +42,7 @@
import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.TYPE;
import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.FIELD_MAP_FIELD;
import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.ALGORITHM_FIELD;
+ import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.IGNORE_MISSING;
import static org.opensearch.neuralsearch.processor.chunker.Chunker.MAX_CHUNK_LIMIT_FIELD;

public class TextChunkingProcessorTests extends OpenSearchTestCase {
@@ -181,6 +182,20 @@ private TextChunkingProcessor createDelimiterInstance() {
return textChunkingProcessorFactory.create(registry, PROCESSOR_TAG, DESCRIPTION, config);
}

+ @SneakyThrows
+ private TextChunkingProcessor createIgnoreMissingInstance() {
+ Map<String, Object> config = new HashMap<>();
+ Map<String, Object> fieldMap = new HashMap<>();
+ Map<String, Object> algorithmMap = new HashMap<>();
+ algorithmMap.put(DelimiterChunker.ALGORITHM_NAME, createDelimiterParameters());
+ fieldMap.put(INPUT_FIELD, OUTPUT_FIELD);
+ config.put(FIELD_MAP_FIELD, fieldMap);
+ config.put(ALGORITHM_FIELD, algorithmMap);
+ config.put(IGNORE_MISSING, true);
+ Map<String, Processor.Factory> registry = new HashMap<>();
+ return textChunkingProcessorFactory.create(registry, PROCESSOR_TAG, DESCRIPTION, config);
+ }
+
public void testCreate_whenAlgorithmFieldMissing_thenFail() {
Map<String, Object> config = new HashMap<>();
Map<String, Object> fieldMap = new HashMap<>();
Expand Down Expand Up @@ -945,4 +960,16 @@ public void testExecute_withDelimiter_andSourceDataString_thenSucceed() {
expectedPassages.add(" The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch.");
assertEquals(expectedPassages, passages);
}

+ @SneakyThrows
+ public void testExecute_withIgnoreMissing_thenSucceed() {
+ Map<String, Object> sourceAndMetadata = new HashMap<>();
+ sourceAndMetadata.put("text_field", "");
+ sourceAndMetadata.put(IndexFieldMapper.NAME, INDEX_NAME);
+ IngestDocument ingestDocument = new IngestDocument(sourceAndMetadata, new HashMap<>());
+
+ TextChunkingProcessor processor = createIgnoreMissingInstance();
+ IngestDocument document = processor.execute(ingestDocument);
+ assertFalse(document.getSourceAndMetadata().containsKey(OUTPUT_FIELD));
+ }
}
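A complementary test is not part of this change, but the default path could be pinned down along the same lines: with `ignore_missing` left at its default of false and the mapped input field absent, the comment added in `TextChunkingProcessor` indicates that an empty list is written to the output field. The outline below assumes the existing helpers and constants of this test class (`createDelimiterInstance()`, `OUTPUT_FIELD`, `INDEX_NAME`) and is a sketch to verify, not verified code.

```java
@SneakyThrows
public void testExecute_whenIgnoreMissingDisabled_andFieldAbsent_thenEmptyListWritten() {
    // Document that does not contain the mapped input field at all
    Map<String, Object> sourceAndMetadata = new HashMap<>();
    sourceAndMetadata.put(IndexFieldMapper.NAME, INDEX_NAME);
    IngestDocument ingestDocument = new IngestDocument(sourceAndMetadata, new HashMap<>());

    // ignore_missing is not set, so DEFAULT_IGNORE_MISSING (false) applies
    TextChunkingProcessor processor = createDelimiterInstance();
    IngestDocument document = processor.execute(ingestDocument);

    // Per the shouldProcessChunk comment, a null field is chunked into an empty list
    assertEquals(List.of(), document.getSourceAndMetadata().get(OUTPUT_FIELD));
}
```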
