peft(models): improve implementation (#60)
If you have a local Dolly-V2 version, please run `openllm prune`
aarnphm authored Jun 24, 2023
1 parent 3d9cc93 commit 98328be
Showing 21 changed files with 383 additions and 192 deletions.
8 changes: 8 additions & 0 deletions changelog.d/60.fix.md
@@ -0,0 +1,8 @@
+Moved the implementation of dolly-v2 and falcon serialization to save `PreTrainedModel` instead of the pipeline.
+
+Saving dolly-v2 now saves the actual model instead of the pipeline abstraction. If you have a Dolly-V2
+model available locally, please run `openllm prune` to make the new implementation available.
+
+Dolly-v2 and falcon now implement some memory optimizations to help with loading on lower-resource systems.
+
+Removed configuration field: `use_pipeline`
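For context, the gist of the serialization change is sketched below. This is an illustrative sketch only, not OpenLLM's actual code: the local save path is hypothetical, and `low_cpu_mem_usage` stands in for the kind of loading-time memory optimization the changelog mentions.

import transformers

model_id = "databricks/dolly-v2-7b"

# Before this change: the pipeline abstraction was what got serialized.
# pipe = transformers.pipeline("text-generation", model=model_id)

# After: the underlying PreTrainedModel and tokenizer are saved directly.
tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)
model = transformers.AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True)
model.save_pretrained("./dolly-v2-local")      # hypothetical path
tokenizer.save_pretrained("./dolly-v2-local")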
14 changes: 10 additions & 4 deletions examples/bentoml-demo/service.py
@@ -12,9 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+from __future__ import annotations
+
 import bentoml
 import openllm
-from bentoml.io import Text


 model = "dolly-v2"
@@ -25,7 +26,12 @@
 svc = bentoml.Service(name="llm-service", runners=[llm_runner])


-@svc.api(input=Text(), output=Text())
+@svc.on_startup
+def download(_: bentoml.Context):
+    llm_runner.llm.ensure_model_id_exists()
+
+
+@svc.api(input=bentoml.io.Text(), output=bentoml.io.Text())
 async def prompt(input_text: str) -> str:
-    answer = await llm_runner.generate(input_text)
-    return answer
+    answer = await llm_runner.generate.async_run(input_text)
+    return answer[0]["generated_text"]
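Note that the endpoint now unwraps the runner output itself: `generate.async_run` returns pipeline-style output, hence `answer[0]["generated_text"]`. Below is a minimal client sketch, assuming a server running locally on BentoML's default port; the URL and the use of `bentoml.client` here are assumptions of this example, not part of the diff.

# Hypothetical client call against the service above.
from bentoml.client import Client

client = Client.from_url("http://localhost:3000")
print(client.prompt("What does OpenLLM do?"))  # invokes the @svc.api endpoint named "prompt"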
7 changes: 2 additions & 5 deletions examples/langchain-chains-demo/bentofile.yaml
@@ -14,9 +14,6 @@

 service: "service:svc"
 include:
-- "*.py"
+  - "*.py"
 python:
-  packages:
-    - openllm
-    - langchain
-    - pydantic
+  requirements_txt: ./requirements.txt
13 changes: 0 additions & 13 deletions examples/langchain-chains-demo/download_model.py

This file was deleted.

4 changes: 4 additions & 0 deletions examples/langchain-chains-demo/requirements.txt
@@ -0,0 +1,4 @@
+openllm
+langchain>=0.0.212
+pydantic
+BeautifulSoup4
32 changes: 23 additions & 9 deletions examples/langchain-chains-demo/service.py
@@ -12,8 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from typing import Any
-from typing import Dict
+from __future__ import annotations
+
+import subprocess
+import sys
+import typing as t

 from langchain.chains import LLMChain
 from langchain.llms import OpenLLM
@@ -28,15 +31,20 @@
 class Query(BaseModel):
     industry: str
     product_name: str
-    keywords: list[str]
-    llm_config: Dict[str, Any]
+    keywords: t.List[str]
+    llm_config: t.Dict[str, t.Any]


-llm = OpenLLM(
-    model_name="dolly-v2",
-    model_id="databricks/dolly-v2-7b",
-    embedded=False,
-)
+def gen_llm(model_name: str, model_id: str | None = None) -> OpenLLM:
+    args = [sys.executable, "-m", "openllm", "download", model_name]
+    if model_id:
+        args += ["--model-id", model_id]
+    subprocess.check_output(args)
+    return OpenLLM(model_name=model_name, model_id=model_id, embedded=False)
+
+
+llm = gen_llm("dolly-v2", model_id="databricks/dolly-v2-7b")

 prompt = PromptTemplate(
     input_variables=["industry", "product_name", "keywords"],
     template="""
@@ -57,6 +65,12 @@ class Query(BaseModel):

 svc = bentoml.Service("fb-ads-copy", runners=[llm.runner])


+@svc.on_startup
+def download(_: bentoml.Context):
+    llm.runner.llm.ensure_model_id_exists()
+
+
 SAMPLE_INPUT = Query(
     industry="SAAS",
     product_name="BentoML",
13 changes: 0 additions & 13 deletions examples/langchain-tools-demo/download_model.py

This file was deleted.

3 changes: 1 addition & 2 deletions examples/langchain-tools-demo/service.py
@@ -28,12 +28,11 @@
-    model_id="databricks/dolly-v2-7b",
-    embedded=False,
-)
+llm = OpenLLM(model_name="dolly-v2", embedded=False)
 tools = load_tools(["serpapi"], llm=llm)
 agent = initialize_agent(tools, llm, agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION)
 svc = bentoml.Service("langchain-openllm", runners=[llm.runner])


-@svc.api(input=Text.from_sample(SAMPLE_INPUT), output=Text())
+@svc.api(input=Text.from_sample(sample=SAMPLE_INPUT), output=Text())
 def chat(input_text: str):
     return agent.run(input_text)
36 changes: 17 additions & 19 deletions src/openllm/_configuration.py
@@ -72,6 +72,7 @@ class GenerationConfig:
 from .exceptions import ForbiddenAttributeError
 from .utils import ENV_VARS_TRUE_VALUES
 from .utils import LazyType
+from .utils import ReprMixin
 from .utils import bentoml_cattr
 from .utils import codegen
 from .utils import dantic
@@ -110,7 +111,6 @@ class GenerationConfig:
     import peft
     from attr import _CountingAttr  # type: ignore
     from attr import _make_init  # type: ignore
-    from attr import _make_repr  # type: ignore
     from attr import _transform_attrs  # type: ignore
     from attr._compat import set_closure_cell

@@ -136,7 +136,6 @@ class GenerationConfig:
     from attr._compat import set_closure_cell
     from attr._make import _CountingAttr
     from attr._make import _make_init
-    from attr._make import _make_repr
     from attr._make import _transform_attrs

 transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers")
@@ -652,7 +651,6 @@ class ModelSettings(t.TypedDict, total=False):
     requirements: t.Optional[ListStr]

     # llm implementation specifics
-    use_pipeline: bool
     bettertransformer: bool
     model_type: t.Literal["causal_lm", "seq2seq_lm"]
     runtime: t.Literal["transformers", "cpp"]
@@ -712,7 +710,6 @@ def default(cls) -> _ModelSettingsAttr:
         name_type="dasherize",
         requires_gpu=False,
         url="",
-        use_pipeline=False,
         model_type="causal_lm",
         trust_remote_code=False,
         requirements=None,
@@ -988,13 +985,6 @@ def __attrs_init__(self, **attrs: t.Any):
     """The default PyPI requirements needed to run this given LLM. By default, we will depend on
     bentoml, torch, transformers."""

-    __openllm_use_pipeline__: bool = Field(False)
-    """Whether this LLM will use HuggingFace Pipeline API. By default, this is set to False.
-    The reason for this to be here is because we want to access this object before loading
-    the _bentomodel. This is because we will actually download the model weights when accessing
-    _bentomodel.
-    """
-
     __openllm_bettertransformer__: bool = Field(False)
     """Whether to use BetterTransformer for this given LLM. This depends per model
     architecture. By default, we will use BetterTransformer for T5 and StableLM models,
@@ -1214,7 +1204,9 @@ def build_class(self) -> type[LLMConfig]:
         for base_cls in self._cls.__mro__[1:-1]:
             if base_cls.__dict__.get("__weakref__", None) is not None:
                 weakref_inherited = True
-            existing_slots.update({name: getattr(base_cls, name, codegen._sentinel) for name in getattr(base_cls, "__slots__", [])})
+            existing_slots.update(
+                {name: getattr(base_cls, name, codegen._sentinel) for name in getattr(base_cls, "__slots__", [])}
+            )

         base_names = set(self._base_names)
         names = self._attr_names
@@ -1295,8 +1287,11 @@ def add_attrs_init(self) -> t.Self:
         )
         return self

-    def add_repr(self, ns: str | None):
-        self._cls_dict["__repr__"] = codegen.add_method_dunders(self._cls, _make_repr(self._attrs, ns, self._cls))
+    def add_repr(self):
+        for key, fn in ReprMixin.__dict__.items():
+            if key not in ("__module__", "__doc__", "__repr_keys__"):
+                self._cls_dict[key] = codegen.add_method_dunders(self._cls, fn)
+        self._cls_dict["__repr_keys__"] = property(lambda _: {i.name for i in self._attrs})
         return self

def __init_subclass__(cls: type[LLMConfig]):
@@ -1384,7 +1379,7 @@ def __init_subclass__(cls: type[LLMConfig]):
             type=GenerationConfig,
         )

-        cls = cls._ConfigBuilder(cls, model_name, these).add_attrs_init().add_repr(None).build_class()
+        cls = cls._ConfigBuilder(cls, model_name, these).add_attrs_init().add_repr().build_class()
         # auto assignment attributes generated from __config__ after create the new slot class.
         _make_assignment_script(cls, bentoml_cattr.structure(cls, _ModelSettingsAttr))(cls)

@@ -1426,14 +1421,19 @@ def __init__(
         if generation_config is None:
             generation_config = {k: v for k, v in attrs.items() if k in _generation_cl_dict}
         else:
-            generation_config = config_merger.merge(generation_config, {k: v for k, v in attrs.items() if k in _generation_cl_dict})
+            generation_config = config_merger.merge(
+                generation_config, {k: v for k, v in attrs.items() if k in _generation_cl_dict}
+            )

         for k in _cached_keys:
             if k in generation_config or attrs.get(k) is None:
                 del attrs[k]
         _cached_keys = tuple(k for k in _cached_keys if k in attrs)

-        self.__openllm_extras__ = config_merger.merge( first_not_none(__openllm_extras__, default={}), {k: v for k, v in attrs.items() if k not in self.__openllm_accepted_keys__})
+        self.__openllm_extras__ = config_merger.merge(
+            first_not_none(__openllm_extras__, default={}),
+            {k: v for k, v in attrs.items() if k not in self.__openllm_accepted_keys__},
+        )

         for k in _cached_keys:
             if k in self.__openllm_extras__:
@@ -1464,8 +1464,6 @@ def __getitem__(self, item: t.Literal["service_name"] = ...) -> str: ...
     @overload
     def __getitem__(self, item: t.Literal["requirements"] = ...) -> t.Optional[ListStr]: ...
     @overload
-    def __getitem__(self, item: t.Literal["use_pipeline"] = ...) -> bool: ...
-    @overload
     def __getitem__(self, item: t.Literal["bettertransformer"] = ...) -> bool: ...
     @overload
     def __getitem__(self, item: t.Literal["model_type"] = ...) -> t.Literal['causal_lm', 'seq2seq_lm']: ...
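The `add_repr` change above swaps attrs' private `_make_repr` for methods copied off `openllm.utils.ReprMixin`, with `__repr_keys__` supplied as a property over the class's attrs. A minimal sketch of that mixin pattern as implied by the diff (the real `ReprMixin` may differ):

class ReprMixin:
    @property
    def __repr_keys__(self) -> set[str]:
        # Overridden per class; the builder above supplies {i.name for i in self._attrs}.
        raise NotImplementedError

    def __repr__(self) -> str:
        body = ", ".join(f"{k}={getattr(self, k)!r}" for k in self.__repr_keys__)
        return f"{type(self).__name__}({body})"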