Gb/tests #1

Merged · 5 commits · Sep 13, 2023
47 changes: 47 additions & 0 deletions .github/workflows/pytest.yml
@@ -0,0 +1,47 @@
name: pytests

on: pull_request

jobs:
  build:
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        os: [ubuntu-latest, macos-latest, windows-latest]
        python-version: ['3.10']
        include:
          - os: ubuntu-latest
            python-version: 3.9
          - os: ubuntu-latest
            python-version: 3.8

    steps:
      - uses: actions/checkout@v2
        with:
          ref: ${{ github.event.pull_request.head.ref }}
          fetch-depth: 1
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          python -m pip install pytest
          python -m pip install pytest-mock
          python -m pip install pytest-cov
          python -m pip install .
      - name: Run pytest and Generate coverage report
        run: |
          python -m pytest -v --disable-warnings --cov=./ --cov-report=xml:coverage.xml
      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v1
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
          file: ./coverage.xml
          flags: unittests
          env_vars: OS,PYTHON
          name: codecov-umbrella
          fail_ci_if_error: false
          verbose: true
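
The CI test step can be reproduced locally before pushing. A minimal sketch in Python, assuming the same dependencies as the "Install dependencies" step above (pytest, pytest-mock, pytest-cov) are installed:

import sys

import pytest

# Mirror the CI invocation: verbose output, warnings suppressed,
# coverage written to coverage.xml for the Codecov upload step.
sys.exit(pytest.main(["-v", "--disable-warnings",
                      "--cov=./", "--cov-report=xml:coverage.xml"]))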
4 changes: 4 additions & 0 deletions elm/__init__.py
@@ -3,6 +3,7 @@
Energy Language Model
"""

import os
from elm.base import ApiBase
from elm.chunk import Chunker
from elm.embed import ChunkAndEmbed
@@ -13,3 +14,6 @@

__author__ = """Grant Buster"""
__email__ = "Grant.Buster@nrel.gov"

ELM_DIR = os.path.dirname(os.path.realpath(__file__))
TEST_DATA_DIR = os.path.join(os.path.dirname(ELM_DIR), 'tests', 'data')
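
The two new constants give the test suite a stable way to locate shared fixtures relative to the package. A minimal sketch of the intended use, assuming a source checkout where tests/ sits beside the elm/ package (TEST_DATA_DIR will not resolve in an installed wheel):

import os

from elm import TEST_DATA_DIR

# GPT-4.pdf is the fixture added under tests/data in this PR.
pdf_path = os.path.join(TEST_DATA_DIR, 'GPT-4.pdf')
assert os.path.exists(pdf_path)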
68 changes: 48 additions & 20 deletions elm/embed.py
@@ -20,29 +20,23 @@ class ChunkAndEmbed(ApiBase):
    DEFAULT_MODEL = 'text-embedding-ada-002'
    """Default model to do embeddings."""

    def __init__(self, text, tag=None, model=None, tokens_per_chunk=500,
                 overlap=1):
    def __init__(self, text, model=None, **chunk_kwargs):
        """
        Parameters
        ----------
        text : str
            Single continuous piece of text to chunk up by paragraph and embed
            or filepath to .txt file containing one piece of text.
        tag : None | str
            Optional reference tag to include at the beginning of each text
            chunk
        model : None | str
            Optional specification of OpenAI model to use. Default is
            cls.DEFAULT_MODEL
        tokens_per_chunk : float
            Nominal token count per text chunk
        overlap : int
            Number of paragraphs to overlap between chunks
        chunk_kwargs : dict | None
            kwargs for initialization of :class:`elm.chunk.Chunker`
        """

        super().__init__(model)

        self.text = text
        self.tag = tag

        if os.path.isfile(text):
            logger.info('Loading text file: {}'.format(text))
@@ -52,9 +46,7 @@ def __init__(self, text, tag=None, model=None, tokens_per_chunk=500,
        assert isinstance(self.text, str)
        self.text = self.clean_tables(self.text)

        self.text_chunks = Chunker(self.text, tag=tag,
                                   tokens_per_chunk=tokens_per_chunk,
                                   overlap=overlap)
        self.text_chunks = Chunker(self.text, **chunk_kwargs)

    @staticmethod
    def clean_tables(text):
@@ -81,8 +73,50 @@ def clean_tables(text):

        return '\n'.join(lines)

    def run(self, rate_limit=175e3):
        """Run text embedding in serial

        Parameters
        ----------
        rate_limit : float
            OpenAI API rate limit (tokens / minute). Note that the
            embedding limit is 350k as of 4/2023, but we're using a large
            factor of safety (~1/2) because we can only count the tokens on the
            input side and assume the output is about the same count.

        Returns
        -------
        embedding : list
            List of 1D arrays representing the embeddings for all text chunks
        """

        logger.info('Embedding {} text chunks...'
                    .format(len(self.text_chunks)))

        embeddings = []
        for i, chunk in enumerate(self.text_chunks):
            req = {"input": chunk, "model": self.model}

            if 'azure' in str(openai.api_type).lower():
                req['engine'] = self.model

            out = self.call_api(self.EMBEDDING_URL, self.HEADERS, req)

            try:
                out = out['data'][0]['embedding']
                embeddings.append(out)
            except Exception:
                msg = ('Could not get embeddings for chunk {}, '
                       'received API response: {}'.format(i + 1, out))
                logger.error(msg)
                embeddings.append(None)

        logger.info('Finished all embeddings.')

        return embeddings

    async def run_async(self, rate_limit=175e3):
        """Run text embedding
        """Run text embedding on chunks asynchronously

        NOTE: you need to call this using the await command in ipython or
        jupyter, e.g.: `out = await ChunkAndEmbed.run_async()`
@@ -101,12 +135,6 @@ async def run_async(self, rate_limit=175e3):
            List of 1D arrays representing the embeddings for all text chunks
        """

        if not isinstance(self.text_chunks, Chunker):
            msg = ('You must init a Chunker obj with the text before '
                   'running async embeddings!')
            logger.error(msg)
            raise RuntimeError(msg)

        logger.info('Embedding {} text chunks...'
                    .format(len(self.text_chunks)))

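With this change, all chunking options pass straight through to elm.chunk.Chunker, and the new run() method is a serial counterpart to run_async(). A usage sketch, assuming OpenAI credentials are configured as ApiBase expects (the input text here is hypothetical):

from elm.embed import ChunkAndEmbed

# Chunker options (tag, tokens_per_chunk, overlap, ...) now travel via
# **chunk_kwargs instead of being explicit __init__ arguments.
ce = ChunkAndEmbed("Long report text...", tokens_per_chunk=500, overlap=1)

# New serial path; run_async() remains for asyncio contexts.
embeddings = ce.run()
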
16 changes: 4 additions & 12 deletions elm/summary.py
@@ -24,8 +24,7 @@ class Summary(ApiBase):
    """Prefix to the engineered prompt. That `n_words` is an initialization
    argument for the Summary class."""

    def __init__(self, text, model=None, tokens_per_chunk=500, overlap=1,
                 split_on='\n\n', n_words=500):
    def __init__(self, text, model=None, n_words=500, **chunk_kwargs):
        """
        Parameters
        ----------
@@ -34,18 +33,13 @@ def __init__(self, text, model=None, tokens_per_chunk=500, overlap=1,
            document with empty lines between paragraphs.
        model : str
            GPT model name, default is the DEFAULT_MODEL global var
        tokens_per_chunk : float
            Nominal token count per text chunk. Overlap paragraphs will exceed
            this.
        overlap : int
            Number of paragraphs to overlap between chunks
        split_on : str
            Sub string to split text into paragraphs.
        n_words : int
            Desired length of the output text. Note that this is never perfect
            but helps guide the LLM to an approximate desired output length.
            400-600 words seems to work quite well with GPT-4. This gets
            formatted into the MODEL_INSTRUCTION attribute.
        chunk_kwargs : dict | None
            kwargs for initialization of :class:`elm.chunk.Chunker`
        """

        super().__init__(model)
@@ -60,9 +54,7 @@

        assert isinstance(self.text, str)

        self.text_chunks = Chunker(self.text,
                                   tokens_per_chunk=tokens_per_chunk,
                                   overlap=overlap, split_on=split_on)
        self.text_chunks = Chunker(self.text, **chunk_kwargs)

    def combine(self, text_summary):
        """Combine separate chunk summaries into one more comprehensive
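Summary gets the same signature cleanup: only model and n_words stay explicit, and chunking options forward to Chunker. A sketch under the same assumptions as the embedding example above:

from elm.summary import Summary

# split_on, tokens_per_chunk, and overlap now travel via **chunk_kwargs.
summ = Summary("Long report text...", n_words=500,
               tokens_per_chunk=500, overlap=1, split_on='\n\n')
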
15 changes: 15 additions & 0 deletions elm/tree.py
@@ -57,6 +57,7 @@ def __init__(self, graph):
            transition.
        """
        self._g = graph
        self._history = []
        assert isinstance(self.graph, nx.DiGraph)
        assert 'api' in self.graph.graph

@@ -93,6 +94,16 @@ def all_messages_txt(self):
        messages = '\n\n'.join(messages)
        return messages

    @property
    def history(self):
        """Get a record of the nodes traversed in the tree

        Returns
        -------
        list
        """
        return self._history

    @property
    def graph(self):
        """Get the networkx graph object
@@ -122,6 +133,7 @@ def call_node(self, node0):
        txt_fmt = {k: v for k, v in self.graph.graph.items() if k != 'api'}
        prompt = prompt.format(**txt_fmt)

        self._history.append(node0)
        out = self.api.chat(prompt)

        successors = list(self.graph.successors(node0))
@@ -168,6 +180,9 @@ def run(self, node0='init'):
        out : str
            Final response from LLM at the leaf node.
        """

        self._history = []

        while True:
            try:
                out = self.call_node(node0)
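The new _history list is reset at the top of run() and appended to on every call_node(), so the traversal can be inspected after a run. A sketch; the DecisionTree name is assumed from elm.tree, and the graph setup is stubbed rather than functional:

import networkx as nx

from elm.tree import DecisionTree  # class name assumed from elm.tree

# A real graph needs a working 'api' attribute and node prompts per the
# elm.tree docstrings; both are stubbed here for illustration only.
graph = nx.DiGraph(api=None)
tree = DecisionTree(graph)

out = tree.run(node0='init')  # each visited node is appended to history
print(tree.history)           # new in this PR, e.g. ['init', ...]
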
6 changes: 1 addition & 5 deletions setup.py
@@ -4,10 +4,6 @@
import os
from codecs import open
from setuptools import setup, find_packages
from setuptools.command.develop import develop
from subprocess import check_call
import shlex
from warnings import warn

here = os.path.abspath(os.path.dirname(__file__))

@@ -24,7 +20,7 @@
    install_requires = f.readlines()


test_requires = ["pytest>=5.2", ]
test_requires = ["pytest>=5.2", "pytest-mock"]
description = "Energy Language Model"

setup(
Binary file added tests/data/GPT-4.pdf