ml6team · PhilippeMoussalli · Jan 19, 2024 · Jan 18, 2024 · Jan 18, 2024 · Jan 19, 2024
diff --git a/components/retrieve_from_weaviate/README.md b/components/retrieve_from_weaviate/README.md
@@ -2,16 +2,140 @@
 
 <a id="retrieve_from_weaviate#description"></a>
 ## Description
-Component that retrieves chunks from a weaviate vectorDB
+Component that retrieves chunks from a Weaviate vector database.
+The component can retrieve chunks based on a text search or based on a vector search.
+Reranking is only supported for text search.
+More info: [Cohere Ranking](https://github.com/weaviate/recipes/blob/main/ranking/cohere-ranking/cohere-ranking.ipynb) and
+[Weaviate Search Rerank](https://weaviate.io/developers/weaviate/search/rerank).
+
+### Running with text as input
+
+```python
+import pyarrow as pa
+from fondant.pipeline import Pipeline
+
+pipeline = Pipeline(name="my_pipeline", base_path="path/to/pipeline")
+
+dataset = pipeline.read(
+ "load_from_csv",
+ arguments={
+ "dataset_uri": "path/to/dataset.csv",
+ },
+ produces={
+ "text": pa.string(),
+ }
+)
+
+dataset = dataset.apply(
+ "index_weaviate",
+ arguments={
+ "weaviate_url": "http://localhost:8080",
+ "class_name": "my_class",
+ "vectorizer": "text2vec-openai",
+ "additional_headers": {
+ "X-OpenAI-Api-Key": "YOUR-OPENAI-API-KEY"
+ }
+ },
+ consumes={
+ "text": "text"
+ }
+)
+
+dataset = dataset.apply(
+ "retrieve_from_weaviate",
+ arguments={
+ "weaviate_url": "http://localhost:8080",
+ "class_name": "my_class",
+ "top_k": 3,
+ "additional_headers": {
+ "X-OpenAI-Api-Key": "YOUR-OPENAI-API-KEY"
+ }
+ },
+ consumes={
+ "text": "text"
+ }
+)
+```
+
+### Running with embedding as input
+
+```python
+import pyarrow as pa
+from fondant.pipeline import Pipeline
+
+pipeline = Pipeline(name="my_pipeline", base_path="path/to/pipeline")
+
+dataset = pipeline.read(
+ "load_from_csv",
+ arguments={
+ "dataset_uri": "path/to/dataset.csv",
+ },
+ produces={
+ "text": pa.string(),
+ }
+)
+
+dataset = dataset.apply(
+ "embed_text",
+ arguments={...},
+ consumes={
+ "text": "text",
+ },
+)
+
+dataset = dataset.apply(
+ "index_weaviate",
+ arguments={
+ "weaviate_url": "http://localhost:8080",
+ "class_name": "my_class",
+ },
+ consumes={
+ "embedding": "embedding"
+ }
+)
+
+dataset = pipeline.read(
+ "load_from_csv",
+ arguments={
+ "dataset_uri": "path/to/prompt_dataset.csv",
+ },
+ produces={
+ "prompts": pa.string(),
+ }
+)
+
+dataset = dataset.apply(
+ "embed_text",
+ arguments={...},
+ consumes={
+ "text": "prompts",
+ },
+)
+
+dataset = dataset.apply(
+ "retrieve_from_weaviate",
+ arguments={
+ "weaviate_url": "http://localhost:8080",
+ "class_name": "my_class",
+ "top_k": 3,
+ },
+ consumes={
+ "embedding": "embedding"
+ }
+)
+```
+
 
 <a id="retrieve_from_weaviate#inputs_outputs"></a>
 ## Inputs / outputs 
 
 <a id="retrieve_from_weaviate#consumes"></a>
 ### Consumes 
-**This component consumes:**
 
-- embedding: list<item: float>
+**This component can consume additional fields**
+- <field_name>: <dataset_field_name>
+
+This defines a mapping to update the fields consumed by the operation as defined in the component spec.
+The keys are the names of the fields to be received by the component, while the values are
+the names of the fields to map from the input dataset.
+
+See the usage example below on how to define a field name for additional fields.
 
 
 
@@ -38,6 +162,7 @@ The component takes the following arguments to alter its behavior:
 | additional_headers | dict | Additional headers to pass to the weaviate client. | / |
 | hybrid_query | str | The hybrid query to be used for retrieval. Optional parameter. | / |
 | hybrid_alpha | float | Argument to change how much each search affects the results. An alpha of 1 is a pure vector search. An alpha of 0 is a pure keyword search. | / |
+| rerank | bool | Whether to rerank the results based on the hybrid query. Defaults to False. See the following resources for more information on reranking: https://github.com/weaviate/recipes/blob/main/ranking/cohere-ranking/cohere-ranking.ipynb and https://weaviate.io/developers/weaviate/search/rerank | / |
 
 <a id="retrieve_from_weaviate#usage"></a>
 ## Usage 
@@ -63,7 +188,12 @@ dataset = dataset.apply(
  # "additional_headers": {},
  # "hybrid_query": ,
  # "hybrid_alpha": 0.0,
+ # "rerank": False,
  },
+ consumes={
+ <field_name>: <dataset_field_name>,
+ ..., # Add fields
+ },
 )
 ```
 

diff --git a/components/retrieve_from_weaviate/fondant_component.yaml b/components/retrieve_from_weaviate/fondant_component.yaml
@@ -1,14 +1,131 @@
 name: retrieve_from_weaviate
-description: Component that retrieves chunks from a weaviate vectorDB
+description: |
+ Component that retrieves chunks from a Weaviate vector database.
+ The component can retrieve chunks based on a text search or based on a vector search.
+ Reranking is only supported for text search.
+ More info: [Cohere Ranking](https://github.com/weaviate/recipes/blob/main/ranking/cohere-ranking/cohere-ranking.ipynb) and
+ [Weaviate Search Rerank](https://weaviate.io/developers/weaviate/search/rerank).
+
+ ### Running with text as input
+
+ ```python
+ import pyarrow as pa
+ from fondant.pipeline import Pipeline
+
+ pipeline = Pipeline(name="my_pipeline", base_path="path/to/pipeline")
+
+ dataset = pipeline.read(
+ "load_from_csv",
+ arguments={
+ "dataset_uri": "path/to/dataset.csv",
+ },
+ produces={
+ "text": pa.string(),
+ }
+ )
+
+ dataset = dataset.apply(
+ "index_weaviate",
+ arguments={
+ "weaviate_url": "http://localhost:8080",
+ "class_name": "my_class",
+ "vectorizer": "text2vec-openai",
+ "additional_headers": {
+ "X-OpenAI-Api-Key": "YOUR-OPENAI-API-KEY"
+ }
+ },
+ consumes={
+ "text": "text"
+ }
+ )
+
+ dataset = dataset.apply(
+ "retrieve_from_weaviate",
+ arguments={
+ "weaviate_url": "http://localhost:8080",
+ "class_name": "my_class",
+ "top_k": 3,
+ "additional_headers": {
+ "X-OpenAI-Api-Key": "YOUR-OPENAI-API-KEY"
+ }
+ },
+ consumes={
+ "text": "text"
+ }
+ )
+ ```
+
+ ### Running with embedding as input
+
+ ```python
+ import pyarrow as pa
+ from fondant.pipeline import Pipeline
+
+ pipeline = Pipeline(name="my_pipeline", base_path="path/to/pipeline")
+
+ dataset = pipeline.read(
+ "load_from_csv",
+ arguments={
+ "dataset_uri": "path/to/dataset.csv",
+ },
+ produces={
+ "text": pa.string(),
+ }
+ )
+
+ dataset = dataset.apply(
+ "embed_text",
+ arguments={...},
+ consumes={
+ "text": "text",
+ },
+ )
+
+ dataset = dataset.apply(
+ "index_weaviate",
+ arguments={
+ "weaviate_url": "http://localhost:8080",
+ "class_name": "my_class",
+ },
+ consumes={
+ "embedding": "embedding"
+ }
+ )
+
+ dataset = pipeline.read(
+ "load_from_csv",
+ arguments={
+ "dataset_uri": "path/to/prompt_dataset.csv",
+ },
+ produces={
+ "prompts": pa.string(),
+ }
+ )
+
+ dataset = dataset.apply(
+ "embed_text",
+ arguments={...},
+ consumes={
+ "text": "prompts",
+ },
+ )
+
+ dataset = dataset.apply(
+ "retrieve_from_weaviate",
+ arguments={
+ "weaviate_url": "http://localhost:8080",
+ "class_name": "my_class",
+ "top_k": 3,
+ },
+ consumes={
+ "embedding": "embedding"
+ }
+ )
+ ```
+
 image: fndnt/retrieve_from_weaviate:dev
 tags:
  - Data retrieval
 
 consumes:
- embedding:
- type: array
- items:
- type: float32
+ additionalProperties: true
 
 produces:
  retrieved_chunks:
@@ -47,4 +164,12 @@ args:
  description: |
  Argument to change how much each search affects the results. An alpha of 1 is a pure vector search. An alpha of 0 is a pure keyword search.
  type: float
- default: None
+ default: None
+ rerank:
+ description: |
+ Whether to rerank the results based on the hybrid query. Defaults to False.
+ See the following resources for more information on reranking:
+ https://github.com/weaviate/recipes/blob/main/ranking/cohere-ranking/cohere-ranking.ipynb
+ https://weaviate.io/developers/weaviate/search/rerank
+ type: bool
+ default: False