Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(datasets): Add limited langchain support for Anthropic, Cohere, and OpenAI models #434

Merged
merged 23 commits into from
Jun 3, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
8b6c34c
Add openai datasets.
ianwhale Nov 16, 2023
8a3bdfb
Add anthropic and cohere
ianwhale Nov 16, 2023
a3de75d
Add python API examples to docstrings.
ianwhale Nov 27, 2023
39dd5ae
Clean up python example.
ianwhale Apr 30, 2024
b38786f
Merge branch 'main' into feat/langchain-dataset
merelcht May 21, 2024
72cf548
Remove setup.py and move langchain reqs to pyproject.toml
merelcht May 21, 2024
de2596b
Move langchain datasets to experimental
merelcht May 21, 2024
b67c43f
Try to get anthropic dataset running. Looks like API URL is not necessary?
merelcht May 21, 2024
0fab1f6
Update cohere package and imports
merelcht May 21, 2024
5e059f3
Merge branch 'main' into feat/langchain-dataset
merelcht May 21, 2024
6d9ba95
Update openai dependency + allow for url in anthropic
merelcht May 21, 2024
4d60267
Merge branch 'feat/langchain-dataset' of https://github.com/ianwhale/…
merelcht May 21, 2024
05d8573
Improve Cohere dataset
merelcht May 29, 2024
82de16a
Make credentials consistent + fix openai examples
merelcht May 29, 2024
f865aa6
Turn cohere dataset into chatcohere dataset
merelcht May 29, 2024
2d805d9
Clean up cohere dataset
merelcht May 30, 2024
56ce7ff
Merge branch 'main' into feat/langchain-dataset
merelcht May 30, 2024
4d726b1
Update release notes + init
merelcht May 30, 2024
89c49a1
Apply suggestions from code review
merelcht Jun 3, 2024
0d88147
Add version pins for langchain dependencies
merelcht Jun 3, 2024
8b2d578
Update kedro-datasets/kedro_datasets_experimental/langchain/_anthropi…
merelcht Jun 3, 2024
acec1d5
Try loosen pin on langchain-cohere
merelcht Jun 3, 2024
dac5066
Only pin dependencies of dataset def in pyproject.toml
merelcht Jun 3, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions kedro-datasets/kedro_datasets/langchain/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
"""Provides interface to langchain model API objects."""
from typing import Any

import lazy_loader as lazy

# https://github.com/pylint-dev/pylint/issues/4300#issuecomment-1043601901
# Annotation-only declarations: no value is bound here. They give static
# analysers a name to resolve for the attributes that are only served through
# the module-level ``__getattr__`` installed below.
ChatOpenAIDataset: Any
OpenAIEmbeddingsDataset: Any
ChatAnthropicDataset: Any
CohereDataset: Any

# ``submod_attrs`` maps each submodule of this package to the dataset classes
# it exposes; ``lazy.attach`` returns the ``__getattr__``, ``__dir__`` and
# ``__all__`` objects that are bound as this module's hooks.
__getattr__, __dir__, __all__ = lazy.attach(
    __name__,
    submod_attrs={
        "openai": ["ChatOpenAIDataset", "OpenAIEmbeddingsDataset"],
        "anthropic": ["ChatAnthropicDataset"],
        "cohere": ["CohereDataset"],
    },
)
55 changes: 55 additions & 0 deletions kedro-datasets/kedro_datasets/langchain/anthropic.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
"""Defines an interface to common Anthropic models."""

from typing import Any, Dict, NoReturn

from kedro.io import AbstractDataSet, DatasetError
from langchain.chat_models import ChatAnthropic


class ChatAnthropicDataset(AbstractDataSet[None, ChatAnthropic]):
    """``ChatAnthropicDataset`` loads a ChatAnthropic `langchain <https://python.langchain.com/>`_ model.

    The dataset is read only: loading builds a new ``ChatAnthropic`` object
    from the stored credentials and keyword arguments, saving always fails.

    Example usage for the :doc:`YAML API <kedro:data/data_catalog_yaml_examples>`:

    catalog.yml:

    .. code-block:: yaml

        claude_instant_1:
          type: langchain.anthropic.ChatAnthropicDataset
          kwargs:
            model: "claude-instant-1"
            temperature: 0.0
          credentials: anthropic


    credentials.yml:

    .. code-block:: yaml

        anthropic:
          anthropic_api_url: <anthropic-api-base>
          anthropic_api_key: <anthropic-api-key>
    """

    # NOTE(review): the sibling cohere/openai modules derive from
    # ``AbstractDataset`` while this class uses the ``AbstractDataSet``
    # spelling -- confirm which name the targeted kedro version exports and
    # align the import accordingly.

    def __init__(self, credentials: Dict[str, str], kwargs: Dict[str, Any] = None):
        """Constructor.

        Args:
            credentials: must contain `anthropic_api_url` and `anthropic_api_key`.
            kwargs: keyword arguments passed to the ChatAnthropic constructor.
                ``None`` (the default) is treated as "no extra arguments".

        Raises:
            KeyError: if either required key is missing from ``credentials``.
        """
        self.anthropic_api_url = credentials["anthropic_api_url"]
        self.anthropic_api_key = credentials["anthropic_api_key"]
        self.kwargs = kwargs or {}

    def _describe(self) -> dict[str, Any]:
        """Return a shallow copy of the model keyword arguments.

        Only ``kwargs`` are returned; the credential values are not included.
        """
        return {**self.kwargs}

    def _save(self, data: None) -> NoReturn:
        """Always raise ``DatasetError``: this dataset cannot be saved to."""
        raise DatasetError(f"{self.__class__.__name__} is a read only data set type")

    def _load(self) -> ChatAnthropic:
        """Build a ``ChatAnthropic`` object from the credentials and ``kwargs``."""
        return ChatAnthropic(
            anthropic_api_url=self.anthropic_api_url,
            anthropic_api_key=self.anthropic_api_key,
            **self.kwargs,
        )
1 change: 1 addition & 0 deletions kedro-datasets/kedro_datasets/langchain/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"""Defines an interface to common OpenAI models."""
63 changes: 63 additions & 0 deletions kedro-datasets/kedro_datasets/langchain/cohere.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
"""
Cohere dataset definition.
"""

from typing import Any, Dict, NoReturn

from cohere import AsyncClient, Client
from kedro.io import AbstractDataset, DatasetError
from langchain.llms import Cohere


class CohereDataset(AbstractDataset[None, Cohere]):
    """``CohereDataset`` loads a Cohere `langchain <https://python.langchain.com/>`_ model.

    The dataset is read only: loading returns a ``Cohere`` object whose
    clients are built from the stored credentials, saving always fails.

    Example usage for the :doc:`YAML API <kedro:data/data_catalog_yaml_examples>`:

    catalog.yml:

    .. code-block:: yaml

        command:
          type: langchain.cohere.CohereDataset
          kwargs:
            model: "command"
            temperature: 0.0
          credentials: cohere


    credentials.yml:

    .. code-block:: yaml

        cohere:
          cohere_api_url: <cohere-api-base>
          cohere_api_key: <cohere-api-key>
    """

    def __init__(self, credentials: Dict[str, str], kwargs: Dict[str, Any] = None):
        """Store the Cohere credentials and the model keyword arguments.

        Args:
            credentials: must contain `cohere_api_url` and `cohere_api_key`.
            kwargs: keyword arguments passed to the underlying constructor;
                a falsy value is replaced by an empty dict.
        """
        self.kwargs = kwargs if kwargs else {}
        self.cohere_api_url = credentials["cohere_api_url"]
        self.cohere_api_key = credentials["cohere_api_key"]

    def _describe(self) -> dict[str, Any]:
        """Return a shallow copy of the model keyword arguments."""
        return dict(self.kwargs)

    def _save(self, data: None) -> NoReturn:
        """Always raise ``DatasetError``: this dataset cannot be saved to."""
        message = f"{self.__class__.__name__} is a read only data set type"
        raise DatasetError(message)

    def _load(self) -> Cohere:
        """Create the ``Cohere`` model and attach clients built from the credentials."""
        # The model is created with a placeholder key; both of its clients are
        # replaced below with ones that use the stored key and URL.
        model = Cohere(cohere_api_key="_", **self.kwargs)

        endpoint = {
            "api_key": self.cohere_api_key,
            "api_url": self.cohere_api_url,
        }
        model.client = Client(client_name=model.user_agent, **endpoint)
        model.async_client = AsyncClient(client_name=model.user_agent, **endpoint)

        return model
101 changes: 101 additions & 0 deletions kedro-datasets/kedro_datasets/langchain/openai.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
"""Defines an interface to common OpenAI models."""

from abc import abstractmethod
from typing import Any, Dict, NoReturn, TypeVar, Generic

from kedro.io import AbstractDataset, DatasetError
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings


# Type variable standing for the langchain OpenAI class a dataset loads; it
# parameterises ``OpenAIDataset`` and is the return type of its ``_load``.
OPENAI_TYPE = TypeVar("OPENAI_TYPE")


class OpenAIDataset(AbstractDataset[None, OPENAI_TYPE], Generic[OPENAI_TYPE]):
    """OpenAI dataset used to access credentials at runtime.

    Subclasses supply the class to instantiate through ``constructor``;
    ``_load`` calls it with the stored credentials and keyword arguments.
    The dataset is read only: saving always fails.
    """

    @property
    @abstractmethod
    def constructor(self) -> type[OPENAI_TYPE]:
        """Return the OpenAI class to construct in the _load method."""

    def __init__(self, credentials: Dict[str, str], kwargs: Dict[str, Any] = None):
        """Constructor.

        Args:
            credentials: must contain `openai_api_base` and `openai_api_key`.
            kwargs: keyword arguments passed to the underlying constructor.
                ``None`` (the default) is treated as "no extra arguments".

        Raises:
            KeyError: if either required key is missing from ``credentials``.
        """
        self.openai_api_base = credentials["openai_api_base"]
        self.openai_api_key = credentials["openai_api_key"]
        self.kwargs = kwargs or {}

    def _describe(self) -> dict[str, Any]:
        """Return a shallow copy of the model keyword arguments.

        Only ``kwargs`` are returned; the credential values are not included.
        """
        return {**self.kwargs}

    def _save(self, data: None) -> NoReturn:
        """Always raise ``DatasetError``: this dataset cannot be saved to."""
        raise DatasetError(f"{self.__class__.__name__} is a read only data set type")

    def _load(self) -> OPENAI_TYPE:
        """Instantiate ``constructor`` with the credentials and ``kwargs``."""
        return self.constructor(
            openai_api_base=self.openai_api_base,
            openai_api_key=self.openai_api_key,
            **self.kwargs,
        )


class OpenAIEmbeddingsDataset(OpenAIDataset[OpenAIEmbeddings]):
    """``OpenAIEmbeddingsDataset`` loads a OpenAIEmbeddings `langchain <https://python.langchain.com/>`_ model.

    Example usage for the :doc:`YAML API <kedro:data/data_catalog_yaml_examples>`:

    catalog.yml:

    .. code-block:: yaml

        text_embedding_ada_002:
          type: langchain.openai.OpenAIEmbeddingsDataset
          kwargs:
            model: "text-embedding-ada-002"
          credentials: openai


    credentials.yml:

    .. code-block:: yaml

        openai:
          openai_api_base: <openai-api-base>
          openai_api_key: <openai-api-key>
    """

    @property
    def constructor(self) -> type[OpenAIEmbeddings]:
        """Return the ``OpenAIEmbeddings`` class, which the base ``_load`` instantiates."""
        return OpenAIEmbeddings


class ChatOpenAIDataset(OpenAIDataset[ChatOpenAI]):
    """``ChatOpenAIDataset`` loads a ChatOpenAI `langchain <https://python.langchain.com/>`_ model.

    Example usage for the :doc:`YAML API <kedro:data/data_catalog_yaml_examples>`:

    catalog.yml:

    .. code-block:: yaml

        gpt_3_5_turbo:
          type: langchain.openai.ChatOpenAIDataset
          kwargs:
            model: "gpt-3.5-turbo"
            temperature: 0.0
          credentials: openai


    credentials.yml:

    .. code-block:: yaml

        openai:
          openai_api_base: <openai-api-base>
          openai_api_key: <openai-api-key>
    """

    @property
    def constructor(self) -> type[ChatOpenAI]:
        """Return the ``ChatOpenAI`` class, which the base ``_load`` instantiates."""
        return ChatOpenAI
8 changes: 8 additions & 0 deletions kedro-datasets/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,12 @@ def _collect_requirements(requires):
"huggingface.HFDataset": ["datasets", "huggingface_hub"],
"huggingface.HFTransformerPipelineDataset": ["transformers"],
}
# Requirements for each langchain dataset, keyed by "<group>.<DatasetClass>";
# collected into the "langchain" extra via ``_collect_requirements``.
langchain_require = {
    "langchain.ChatOpenAIDataset": ["langchain[openai]"],
    "langchain.OpenAIEmbeddingsDataset": ["langchain[openai]"],
    "langchain.ChatAnthropicDataset": ["langchain[anthropic]"],
    "langchain.CohereDataset": ["langchain[cohere]"],
}
matplotlib_require = {"matplotlib.MatplotlibWriter": ["matplotlib>=3.0.3, <4.0"]}
networkx_require = {"networkx.NetworkXDataSet": ["networkx~=2.4"]}
pandas_require = {
Expand Down Expand Up @@ -107,6 +113,7 @@ def _collect_requirements(requires):
"geopandas": _collect_requirements(geopandas_require),
"holoviews": _collect_requirements(holoviews_require),
"huggingface": _collect_requirements(huggingface_require),
"langchain": _collect_requirements(langchain_require),
"matplotlib": _collect_requirements(matplotlib_require),
"networkx": _collect_requirements(networkx_require),
"pandas": _collect_requirements(pandas_require),
Expand Down Expand Up @@ -233,6 +240,7 @@ def _collect_requirements(requires):
"datasets",
"huggingface_hub",
"transformers",
"langchain[openai,anthropic,cohere]",
]

setup(
Expand Down
Loading