Skip to content

Commit

Permalink
Onboard neural sparse search (#141) (#142)
Browse files Browse the repository at this point in the history
Signed-off-by: Tyler Ohlsen <ohltyler@amazon.com>
(cherry picked from commit 4d5f50c)

Co-authored-by: Tyler Ohlsen <ohltyler@amazon.com>
  • Loading branch information
opensearch-trigger-bot[bot] and ohltyler authored Apr 22, 2024
1 parent fc29edf commit c3e4586
Show file tree
Hide file tree
Showing 23 changed files with 576 additions and 119 deletions.
35 changes: 34 additions & 1 deletion common/constants.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import {
MODEL_ALGORITHM,
PRETRAINED_MODEL_FORMAT,
PretrainedSentenceTransformer,
PretrainedSparseEncodingModel,
WORKFLOW_STATE,
} from './interfaces';

Expand Down Expand Up @@ -61,11 +62,15 @@ export const CREATE_INGEST_PIPELINE_STEP_TYPE = 'create_ingest_pipeline';
export const CREATE_INDEX_STEP_TYPE = 'create_index';
export const REGISTER_LOCAL_PRETRAINED_MODEL_STEP_TYPE =
'register_local_pretrained_model';
/**
 * Workflow step type string used for registering a local sparse encoding model.
 * NOTE(review): presumably must match the step name expected by the backend
 * workflow engine — confirm before changing the value.
 */
export const REGISTER_LOCAL_SPARSE_ENCODING_MODEL_STEP_TYPE =
'register_local_sparse_encoding_model';

/**
* ML PLUGIN PRETRAINED MODELS
* (based off of https://opensearch.org/docs/latest/ml-commons-plugin/pretrained-models/#sentence-transformers)
* (based off of https://opensearch.org/docs/latest/ml-commons-plugin/pretrained-models)
*/

// ---- SENTENCE TRANSFORMERS ----
export const ROBERTA_SENTENCE_TRANSFORMER = {
name: 'huggingface/sentence-transformers/all-distilroberta-v1',
shortenedName: 'all-distilroberta-v1',
Expand Down Expand Up @@ -96,6 +101,34 @@ export const BERT_SENTENCE_TRANSFORMER = {
vectorDimensions: 768,
} as PretrainedSentenceTransformer;

// ---- SPARSE ENCODERS ----
/**
 * Pretrained model entry for `opensearch-neural-sparse-encoding-v1`:
 * TorchScript format, sparse-encoding algorithm, version 1.0.1.
 * NOTE(review): the `as` assertion does not report missing required
 * properties of PretrainedSparseEncodingModel; keep the literal in sync
 * with that type by hand.
 */
export const NEURAL_SPARSE_TRANSFORMER = {
name: 'amazon/neural-sparse/opensearch-neural-sparse-encoding-v1',
shortenedName: 'opensearch-neural-sparse-encoding-v1',
description: 'A general neural sparse encoding model',
format: PRETRAINED_MODEL_FORMAT.TORCH_SCRIPT,
algorithm: MODEL_ALGORITHM.SPARSE_ENCODING,
version: '1.0.1',
} as PretrainedSparseEncodingModel;

/**
 * Pretrained model entry for `opensearch-neural-sparse-encoding-doc-v1`:
 * TorchScript format, sparse-encoding algorithm, version 1.0.1.
 *
 * The description previously duplicated the one on NEURAL_SPARSE_TRANSFORMER
 * word for word, so the two entries could not be told apart by description.
 * It now states that this is the document-side (doc-only mode) encoder.
 */
export const NEURAL_SPARSE_DOC_TRANSFORMER = {
  name: 'amazon/neural-sparse/opensearch-neural-sparse-encoding-doc-v1',
  shortenedName: 'opensearch-neural-sparse-encoding-doc-v1',
  description: 'A neural sparse encoding model for documents (doc-only mode)',
  format: PRETRAINED_MODEL_FORMAT.TORCH_SCRIPT,
  algorithm: MODEL_ALGORITHM.SPARSE_ENCODING,
  version: '1.0.1',
} as PretrainedSparseEncodingModel;

/**
 * Pretrained model entry for `opensearch-neural-sparse-tokenizer-v1`:
 * TorchScript format, version 1.0.1.
 * NOTE(review): this entry reuses MODEL_ALGORITHM.SPARSE_ENCODING; the ML
 * plugin appears to treat tokenizer models as a separate function type
 * (sparse tokenize) — confirm the algorithm value is intended.
 */
export const NEURAL_SPARSE_TOKENIZER_TRANSFORMER = {
name: 'amazon/neural-sparse/opensearch-neural-sparse-tokenizer-v1',
shortenedName: 'opensearch-neural-sparse-tokenizer-v1',
description: 'A neural sparse tokenizer model',
format: PRETRAINED_MODEL_FORMAT.TORCH_SCRIPT,
algorithm: MODEL_ALGORITHM.SPARSE_ENCODING,
version: '1.0.1',
} as PretrainedSparseEncodingModel;

/**
* MISCELLANEOUS
*/
Expand Down
26 changes: 25 additions & 1 deletion common/interfaces.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

import { Node, Edge } from 'reactflow';
import { IComponentData } from '../public/component_types';
import { COMPONENT_CLASS } from '../public/utils';

export type Index = {
name: string;
Expand All @@ -16,7 +17,11 @@ export type Index = {
*/

export type ReactFlowComponent = Node<IComponentData>;
export type ReactFlowEdge = Edge<{}> & {};
/**
 * A reactflow Edge extended with a string `key` and the lists of component
 * classes recorded for its source and target.
 * NOTE(review): presumably the class lists describe the connected
 * components' handles — confirm where edges are created.
 */
export type ReactFlowEdge = Edge<{}> & {
key: string;
sourceClasses: COMPONENT_CLASS[];
targetClasses: COMPONENT_CLASS[];
};

type ReactFlowViewport = {
x: number;
Expand Down Expand Up @@ -49,6 +54,22 @@ export type TextEmbeddingProcessor = IngestProcessor & {
};
};

/**
 * An ingest processor carrying a `sparse_encoding` configuration: the ID of
 * the model to use and a field map.
 * NOTE(review): `field_map` is typed `{}`, which accepts any non-nullish
 * value; presumably it maps input field names to output field names —
 * confirm and consider a narrower type.
 */
export type SparseEncodingProcessor = IngestProcessor & {
sparse_encoding: {
model_id: string;
field_map: {};
};
};

/**
 * Index configuration made up of `settings` and `mappings`.
 * NOTE(review): `settings` is typed `{}`, so its contents are not
 * type-checked beyond being non-nullish.
 */
export type IndexConfiguration = {
settings: {};
mappings: IndexMappings;
};

/**
 * Index mappings, holding a single `properties` object.
 * NOTE(review): `properties` is typed `{}`; presumably keyed by field
 * name — confirm against the code that builds the mappings.
 */
export type IndexMappings = {
properties: {};
};

export type TemplateNode = {
id: string;
type: string;
Expand Down Expand Up @@ -135,6 +156,7 @@ export type Workflow = WorkflowTemplate & {

/**
 * Use-case identifiers. Each member's string value is identical to its
 * member name.
 */
export enum USE_CASE {
SEMANTIC_SEARCH = 'SEMANTIC_SEARCH',
NEURAL_SPARSE_SEARCH = 'NEURAL_SPARSE_SEARCH',
}

/**
Expand Down Expand Up @@ -196,6 +218,8 @@ export type PretrainedSentenceTransformer = PretrainedModel & {
vectorDimensions: number;
};

/**
 * A pretrained sparse encoding model. Currently adds no fields beyond
 * PretrainedModel (the intersection with `{}` is a no-op placeholder).
 */
export type PretrainedSparseEncodingModel = PretrainedModel & {};

export type ModelConfig = {
modelType?: string;
embeddingDimension?: number;
Expand Down
1 change: 1 addition & 0 deletions public/component_types/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@
export * from './interfaces';
export * from './transformer';
export * from './indexer';
export * from './other';
6 changes: 3 additions & 3 deletions public/component_types/indexer/indexer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,9 @@ export class Indexer extends BaseComponent {
this.baseClasses = [this.type];
this.inputs = [
{
id: 'transformer',
label: 'Transformer',
baseClass: COMPONENT_CLASS.TRANSFORMER,
id: 'document',
label: 'Document',
baseClass: COMPONENT_CLASS.DOCUMENT,
acceptMultiple: false,
},
];
Expand Down
2 changes: 1 addition & 1 deletion public/component_types/indexer/knn_indexer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ export class KnnIndexer extends Indexer {
constructor() {
super();
this.type = COMPONENT_CLASS.KNN_INDEXER;
this.label = 'K-NN Indexer';
this.label = 'K-NN Index';
this.description = 'A specialized indexer for K-NN indices';
this.baseClasses = [...this.baseClasses, this.type];
this.createFields = [
Expand Down
30 changes: 30 additions & 0 deletions public/component_types/other/document.tsx
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
/*
* Copyright OpenSearch Contributors
* SPDX-License-Identifier: Apache-2.0
*/

import { COMPONENT_CATEGORY, COMPONENT_CLASS } from '../../utils';
import { BaseComponent } from '../base_component';

/**
* A basic Document placeholder UI component.
* It takes no inputs, is marked as not creatable (`allowsCreation = false`),
* and exposes a single output labeled 'Document' carrying its base classes.
*/
export class Document extends BaseComponent {
constructor() {
super();
this.type = COMPONENT_CLASS.DOCUMENT;
this.label = 'Document';
this.description = 'A document to be ingested';
this.categories = [COMPONENT_CATEGORY.INGEST];
this.allowsCreation = false;
// The only base class is the component's own type.
this.baseClasses = [this.type];
this.inputs = [];
// The single output reuses the label and the base-class array set above.
this.outputs = [
{
label: this.label,
baseClasses: this.baseClasses,
},
];
}
}
6 changes: 6 additions & 0 deletions public/component_types/other/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
/*
* Copyright OpenSearch Contributors
* SPDX-License-Identifier: Apache-2.0
*/

export * from './document';
1 change: 1 addition & 0 deletions public/component_types/transformer/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@

export * from './ml_transformer';
export * from './text_embedding_transformer';
export * from './sparse_encoder_transformer';
64 changes: 64 additions & 0 deletions public/component_types/transformer/sparse_encoder_transformer.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
/*
* Copyright OpenSearch Contributors
* SPDX-License-Identifier: Apache-2.0
*/

import { COMPONENT_CATEGORY, COMPONENT_CLASS } from '../../../common';
import { MLTransformer } from '.';

/**
* A specialized sparse encoder ML transformer UI component.
* It accepts a single Document input, exposes three creation fields
* (model, input field, vector field), and outputs a 'Transformed Document'
* of class DOCUMENT.
*/
export class SparseEncoderTransformer extends MLTransformer {
constructor() {
super();
this.type = COMPONENT_CLASS.SPARSE_ENCODER_TRANSFORMER;
this.label = 'Sparse Encoder';
this.description =
'A specialized ML transformer to perform sparse encoding';
this.categories = [COMPONENT_CATEGORY.INGEST];
// Keep whatever base classes the parent constructor set, then append this type.
this.baseClasses = [...this.baseClasses, this.type];
// Exactly one Document may be connected as input (acceptMultiple is false).
this.inputs = [
{
id: 'document',
label: 'Document',
baseClass: COMPONENT_CLASS.DOCUMENT,
acceptMultiple: false,
},
];
// Fields presented when creating this component: the model plus the
// source (input) and destination (vector) document field names.
this.createFields = [
{
label: 'Sparse Encoding Model',
id: 'model',
type: 'model',
helpText:
'A sparse encoding model to be used for generating sparse vectors.',
helpLink:
'https://opensearch.org/docs/latest/ml-commons-plugin/integrating-ml-models/#choosing-a-model',
},
{
label: 'Input Field',
id: 'inputField',
type: 'string',
helpText:
'The name of the document field from which to obtain text for generating sparse embeddings.',
helpLink:
'https://opensearch.org/docs/latest/ingest-pipelines/processors/sparse-encoding/#configuration-parameters',
},
{
label: 'Vector Field',
id: 'vectorField',
type: 'string',
// Template literal is used here because the text contains an apostrophe.
helpText: `The name of the document's vector field in which to store the generated sparse embeddings.`,
helpLink:
'https://opensearch.org/docs/latest/ingest-pipelines/processors/sparse-encoding/#configuration-parameters',
},
];
// The output is advertised as a DOCUMENT rather than as this transformer's
// own classes.
this.outputs = [
{
label: 'Transformed Document',
baseClasses: [COMPONENT_CLASS.DOCUMENT],
},
];
}
}
23 changes: 15 additions & 8 deletions public/component_types/transformer/text_embedding_transformer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
* SPDX-License-Identifier: Apache-2.0
*/

import { COMPONENT_CLASS } from '../../../common';
import { COMPONENT_CATEGORY, COMPONENT_CLASS } from '../../../common';
import { MLTransformer } from '.';

/**
Expand All @@ -13,10 +13,18 @@ export class TextEmbeddingTransformer extends MLTransformer {
constructor() {
super();
this.type = COMPONENT_CLASS.TEXT_EMBEDDING_TRANSFORMER;
this.label = 'Text Embedding Transformer';
this.label = 'Text Embedder';
this.description = 'A specialized ML transformer for embedding text';
this.categories = [COMPONENT_CATEGORY.INGEST];
this.baseClasses = [...this.baseClasses, this.type];
this.inputs = [];
this.inputs = [
{
id: 'document',
label: 'Document',
baseClass: COMPONENT_CLASS.DOCUMENT,
acceptMultiple: false,
},
];
this.createFields = [
{
label: 'Text Embedding Model',
Expand All @@ -31,24 +39,23 @@ export class TextEmbeddingTransformer extends MLTransformer {
id: 'inputField',
type: 'string',
helpText:
'The name of the field from which to obtain text for generating text embeddings.',
'The name of the document field from which to obtain text for generating text embeddings.',
helpLink:
'https://opensearch.org/docs/latest/ingest-pipelines/processors/text-embedding/',
},
{
label: 'Vector Field',
id: 'vectorField',
type: 'string',
helpText:
' The name of the vector field in which to store the generated text embeddings.',
helpText: `The name of the document's vector field in which to store the generated text embeddings.`,
helpLink:
'https://opensearch.org/docs/latest/ingest-pipelines/processors/text-embedding/',
},
];
this.outputs = [
{
label: this.label,
baseClasses: this.baseClasses,
label: 'Transformed Document',
baseClasses: [COMPONENT_CLASS.DOCUMENT],
},
];
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
*/

import React, { useState } from 'react';
import { EuiSpacer, EuiText, EuiTitle } from '@elastic/eui';
import { EuiHorizontalRule, EuiSpacer, EuiText, EuiTitle } from '@elastic/eui';
import { InputFieldList } from './input_field_list';
import { NODE_CATEGORY, ReactFlowComponent } from '../../../../common';
import { NewOrExistingTabs } from '../workspace/workspace_components/new_or_existing_tabs';
Expand Down Expand Up @@ -58,11 +58,12 @@ export function ComponentInputs(props: ComponentInputsProps) {
<EuiText color="subdued">
{props.selectedComponent.data.description}
</EuiText>
<NewOrExistingTabs
{/* TODO: Add tabs back once it is finalized how much flexibility we want */}
{/* <NewOrExistingTabs
selectedTabId={selectedTabId}
setSelectedTabId={setSelectedTabId}
/>
<EuiSpacer size="s" />
/> */}
<EuiHorizontalRule size="full" />

<InputFieldList
componentId={props.selectedComponent.id}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@ import {
ModelFormValue,
MODEL_CATEGORY,
MPNET_SENTENCE_TRANSFORMER,
NEURAL_SPARSE_TRANSFORMER,
NEURAL_SPARSE_DOC_TRANSFORMER,
NEURAL_SPARSE_TOKENIZER_TRANSFORMER,
} from '../../../../../common';
import { AppState } from '../../../../store';

Expand Down Expand Up @@ -113,6 +116,24 @@ export function ModelField(props: ModelFieldProps) {
category: MODEL_CATEGORY.PRETRAINED,
algorithm: BERT_SENTENCE_TRANSFORMER.algorithm,
},
{
id: NEURAL_SPARSE_TRANSFORMER.name,
name: NEURAL_SPARSE_TRANSFORMER.shortenedName,
category: MODEL_CATEGORY.PRETRAINED,
algorithm: NEURAL_SPARSE_TRANSFORMER.algorithm,
},
{
id: NEURAL_SPARSE_DOC_TRANSFORMER.name,
name: NEURAL_SPARSE_DOC_TRANSFORMER.shortenedName,
category: MODEL_CATEGORY.PRETRAINED,
algorithm: NEURAL_SPARSE_DOC_TRANSFORMER.algorithm,
},
{
id: NEURAL_SPARSE_TOKENIZER_TRANSFORMER.name,
name: NEURAL_SPARSE_TOKENIZER_TRANSFORMER.shortenedName,
category: MODEL_CATEGORY.PRETRAINED,
algorithm: NEURAL_SPARSE_TOKENIZER_TRANSFORMER.algorithm,
},
];
setPretrainedModels(modelItems);
}, []);
Expand All @@ -121,6 +142,8 @@ export function ModelField(props: ModelFieldProps) {
// e.g., only show deployed models when 'deployed' button is selected
useEffect(() => {
if (selectedRadioId !== undefined) {
// TODO: add fine-grained filtering so only relevant pretrained and existing models
// are visible based on the use case
if (selectedRadioId === MODEL_CATEGORY.DEPLOYED) {
setSelectableModels(deployedModels);
} else {
Expand Down
Loading

0 comments on commit c3e4586

Please sign in to comment.