diff --git a/.github/workflows/pytest_ords.yml b/.github/workflows/pytest_ords.yml
index cc097142..2675d074 100644
--- a/.github/workflows/pytest_ords.yml
+++ b/.github/workflows/pytest_ords.yml
@@ -12,17 +12,18 @@ jobs:
         python-version: [3.11]
     steps:
-    - uses: actions/checkout@v2
+    - uses: actions/checkout@v4
       with:
         ref: ${{ github.event.pull_request.head.ref }}
        fetch-depth: 1
     - name: Set up Python ${{ matrix.python-version }}
-      uses: conda-incubator/setup-miniconda@v2
+      uses: conda-incubator/setup-miniconda@v3
       with:
         auto-update-conda: true
+        channels: conda-forge,defaults
         python-version: ${{ matrix.python-version }}
         miniconda-version: "latest"
-    - name: Install dependencies'
+    - name: Install dependencies
       shell: bash -l {0}
       run: |
         conda install -c conda-forge poppler
diff --git a/elm/ords/README.md b/elm/ords/README.md
index 46678e3b..ec860c7e 100644
--- a/elm/ords/README.md
+++ b/elm/ords/README.md
@@ -1,7 +1,10 @@
 # Welcome to Energy Language Model - OrdinanceGPT
 
-The ordinance web scraping and data extraction portion of this codebase required a few extra dependencies that do not come out-of-the-box with the base ELM software.
-To set up ELM for ordinances, first create a conda environment. Then, _before installing ELM_, run the poppler installation:
+The ordinance web scraping and data extraction portion of this codebase
+requires a few extra dependencies that do not come out-of-the-box with the base
+ELM software. To set up ELM for ordinances, first create a conda environment.
+We have had some issues using Python 3.9 and recommend using Python 3.11. Then,
+_before installing ELM_, run the poppler installation:
 
     $ conda install -c conda-forge poppler
 
@@ -9,7 +12,7 @@ Then, install `pdftotext`:
 
     $ pip install pdftotext
 
-(OPTIONAL) If you want to have access to Optical Character Recognition (OCR) for PDF parsing, you should also install pytesseract during this step:
+(OPTIONAL) If you want to have access to Optical Character Recognition (OCR) for PDF parsing, you should also install pytesseract during this step. Note that there may be additional OS-specific installation steps to get tesseract working properly (see the [pytesseract install instructions](https://pypi.org/project/pytesseract/)).
 
     $ pip install pytesseract pdf2image
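For reference, the full setup sequence described in the README changes above looks like the following. This is a sketch: the environment name `elm` and the pinned Python version are illustrative, while the individual conda/pip commands come straight from the README:

    $ conda create -n elm python=3.11
    $ conda activate elm
    $ conda install -c conda-forge poppler
    $ pip install pdftotext
    $ pip install pytesseract pdf2image  # optional, enables OCR fallback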
diff --git a/elm/ords/extraction/apply.py b/elm/ords/extraction/apply.py
index 875f4847..0ca9f641 100644
--- a/elm/ords/extraction/apply.py
+++ b/elm/ords/extraction/apply.py
@@ -277,7 +277,7 @@ async def extract_ordinance_values(doc, **kwargs):
         A document known to contain ordinance text. This means it must
         contain an ``"cleaned_ordinance_text"`` key in the metadata. You
         can run
-        :func:`~elm.ords.extraction.apply.extract_ordinance_text`
+        :func:`~elm.ords.extraction.apply.extract_ordinance_text_with_llm`
         to have this attribute populated automatically for documents
         that are found to contain ordinance data. Note that if the
         document's metadata does not contain the
@@ -297,8 +297,8 @@ async def extract_ordinance_values(doc, **kwargs):
     if not doc.metadata.get("cleaned_ordinance_text"):
         msg = (
             "Input document has no 'cleaned_ordinance_text' key or string "
-            "does not contain info. Please run `extract_ordinance_text` "
-            "prior to calling this method."
+            "does not contain info. Please run "
+            "`extract_ordinance_text_with_llm` prior to calling this method."
         )
         logger.warning(msg)
         warn(msg, UserWarning)
diff --git a/elm/ords/extraction/ordinance.py b/elm/ords/extraction/ordinance.py
index a77e8320..da5452df 100644
--- a/elm/ords/extraction/ordinance.py
+++ b/elm/ords/extraction/ordinance.py
@@ -173,6 +173,8 @@ async def parse(self, min_chunks_to_process=3):
                 logger.debug("Text at ind %d is not legal text", ind)
                 continue
 
+            logger.debug("Text at ind %d is legal text", ind)
+
             contains_ord_info = await self.parse_from_ind(
                 ind, self.CONTAINS_ORD_PROMPT, key="contains_ord_info"
             )
@@ -182,6 +184,8 @@ async def parse(self, min_chunks_to_process=3):
                 )
                 continue
 
+            logger.debug("Text at ind %d does contain ordinance info", ind)
+
             is_utility_scale = await self.parse_from_ind(
                 ind, self.IS_UTILITY_SCALE_PROMPT, key="x"
             )
@@ -191,6 +195,8 @@ async def parse(self, min_chunks_to_process=3):
                 )
                 continue
 
+            logger.debug("Text at ind %d is for utility-scale WECS", ind)
+
             self._ordinance_chunks.append({"text": text, "ind": ind})
             logger.debug("Added text at ind %d to ordinances", ind)
             # mask, since we got a good result
diff --git a/elm/ords/process.py b/elm/ords/process.py
index acdc3689..e3750981 100644
--- a/elm/ords/process.py
+++ b/elm/ords/process.py
@@ -269,7 +269,7 @@ async def _process_with_logs(
 ):
     """Process counties with logging enabled."""
     counties = _load_counties_to_process(county_fp)
-    azure_api_key, azure_version, azure_endpoint = _validate_api_params(
+    azure_api_key, azure_version, azure_endpoint = validate_api_params(
         azure_api_key, azure_version, azure_endpoint
     )
 
@@ -318,7 +318,7 @@ async def _process_with_logs(
         )
         trackers.append(usage_tracker)
         task = asyncio.create_task(
-            download_docs_for_county_with_logging(
+            process_county_with_logging(
                 log_listener,
                 log_dir,
                 location,
@@ -379,7 +379,8 @@ def _load_counties_to_process(county_fp):
     return load_counties_from_fp(county_fp)
 
 
-def _validate_api_params(azure_api_key, azure_version, azure_endpoint):
+def validate_api_params(azure_api_key=None, azure_version=None,
+                        azure_endpoint=None):
     """Validate OpenAI API parameters."""
     azure_api_key = azure_api_key or os.environ.get("AZURE_OPENAI_API_KEY")
     azure_version = azure_version or os.environ.get("AZURE_OPENAI_VERSION")
@@ -404,7 +405,7 @@ def _configure_file_loader_kwargs(file_loader_kwargs):
     return file_loader_kwargs
 
 
-async def download_docs_for_county_with_logging(
+async def process_county_with_logging(
     listener,
     log_dir,
     county,
@@ -461,7 +462,7 @@ async def download_docs_for_county_with_logging(
         listener, log_dir, location=county.full_name, level=level
     ):
         task = asyncio.create_task(
-            download_doc_for_county(
+            process_county(
                 county,
                 text_splitter,
                 num_urls=num_urls,
@@ -485,7 +486,7 @@ async def download_docs_for_county_with_logging(
     return doc
 
 
-async def download_doc_for_county(
+async def process_county(
     county,
     text_splitter,
     num_urls=5,
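Because `validate_api_params` is now public and all of its arguments are optional, callers can rely on environment variables alone. A minimal sketch (the placeholder values are illustrative, and the endpoint variable name is inferred from the `AZURE_OPENAI_*` family shown in the diff and README):

    import os

    from elm.ords.process import validate_api_params

    # normally exported in your shell; set here only for illustration
    os.environ["AZURE_OPENAI_API_KEY"] = "<your-key>"
    os.environ["AZURE_OPENAI_VERSION"] = "<api-version>"
    os.environ["AZURE_OPENAI_ENDPOINT"] = "<endpoint-url>"

    # each argument falls back to the corresponding AZURE_OPENAI_* variable
    azure_api_key, azure_version, azure_endpoint = validate_api_params()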
diff --git a/elm/ords/services/provider.py b/elm/ords/services/provider.py
index df82fd1c..50990443 100644
--- a/elm/ords/services/provider.py
+++ b/elm/ords/services/provider.py
@@ -150,3 +150,37 @@ async def __aexit__(self, exc_type, exc, tb):
         for service in self.services:
             logger.debug("Tearing down Service: %s", service.name)
             tear_down_service_queue(service.name)
+
+    @classmethod
+    def run(cls, services, coroutine):
+        """Run an async function that relies on services.
+
+        You can treat this function like the ``asyncio.run`` function
+        with an extra parameter::
+
+            openai_service = OpenAIService(...)
+            RunningAsyncServices.run(
+                [openai_service], my_async_func(*args, **kwargs)
+            )
+
+
+        Parameters
+        ----------
+        services : iterable of :class:`elm.ords.services.base.Service`
+            An iterable (e.g., a list) of Services that are needed to run
+            the asynchronous function.
+        coroutine : coroutine
+            A coroutine that should be run with the services.
+
+        Returns
+        -------
+        Any
+            Returns the output of the coroutine.
+        """
+        return asyncio.run(cls._run_coroutine(services, coroutine))
+
+    @classmethod
+    async def _run_coroutine(cls, services, coroutine):
+        """Run a coroutine under services."""
+        async with cls(services):
+            return await coroutine
diff --git a/elm/ords/validation/content.py b/elm/ords/validation/content.py
index 4355579c..fd34e34d 100644
--- a/elm/ords/validation/content.py
+++ b/elm/ords/validation/content.py
@@ -125,7 +125,6 @@ async def parse_from_ind(self, ind, prompt, key):
             logger.debug("Mem at ind %d is %s", step, mem)
             check = mem.get(key)
             if check is None:
-                # logger.debug("text=%s", text)
                 content = await self.slc.call(
                     sys_msg=prompt.format(key=key),
                     content=text,
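The new `RunningAsyncServices.run` classmethod is shorthand for entering the async context manager yourself. A sketch of the equivalent long form, with `my_async_func` and the empty `services` list standing in as hypothetical inputs (mirroring the docstring example above):

    import asyncio

    from elm.ords.services.provider import RunningAsyncServices

    async def my_async_func():
        return 42  # hypothetical coroutine that relies on the services

    services = []  # e.g., [OpenAIService(...)]

    async def _main():
        # what RunningAsyncServices.run(services, my_async_func()) expands to
        async with RunningAsyncServices(services):
            return await my_async_func()

    result = asyncio.run(_main())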
diff --git a/elm/pdf.py b/elm/pdf.py
index 0f056304..aeb2e2cf 100644
--- a/elm/pdf.py
+++ b/elm/pdf.py
@@ -336,15 +336,19 @@ def clean_headers(self, char_thresh=0.6, page_thresh=0.8, split_on='\n',
         self.full = combine_pages(self.pages)
         return self.full
 
-    def convert_to_txt(self, txt_fp, separator=' '):
-        """Function to convert contents of pdf document to txt file.
+    def convert_to_txt(self, txt_fp=None, separator=' ',
+                       clean_header_kwargs=None):
+        """Function to convert contents of pdf document to txt file using
+        poppler.
 
         Parameters
         ----------
-        txt_fp: str
-            Directory for output txt file.
-        separator : str
+        txt_fp : str, optional
+            Optional path for output txt file.
+        separator : str, optional
             Heuristic split string to look for spaces between columns
+        clean_header_kwargs : dict, optional
+            Optional kwargs to override the default ``clean_headers`` kwargs.
 
         Returns
         -------
@@ -354,11 +358,13 @@
         text = self.clean_poppler(layout=True)
         if is_multi_col(text, separator=separator):
             text = self.clean_poppler(layout=False)
-        text = self.clean_headers(char_thresh=0.6, page_thresh=0.8,
-                                  split_on='\n',
-                                  iheaders=[0, 1, 3, -3, -2, -1])
-        with open(txt_fp, 'w') as f:
-            f.write(text)
-        logger.info(f'Saved: {txt_fp}')
+
+        clean_header_kwargs = clean_header_kwargs or {}
+        text = self.clean_headers(**clean_header_kwargs)
+
+        if txt_fp is not None:
+            with open(txt_fp, 'w') as f:
+                f.write(text)
+            logger.info(f'Saved: {txt_fp}')
 
         return text
diff --git a/elm/version.py b/elm/version.py
index 7ceff579..03bfa088 100644
--- a/elm/version.py
+++ b/elm/version.py
@@ -2,4 +2,4 @@
 ELM version number
 """
 
-__version__ = "0.0.4"
+__version__ = "0.0.5"
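A usage sketch for the reworked `convert_to_txt` above, assuming the enclosing class is `elm.pdf.PDFtoTXT` and reusing the kwargs that used to be hard-coded (the input/output filenames are illustrative):

    from elm.pdf import PDFtoTXT

    pdf = PDFtoTXT('ordinance.pdf')

    # without txt_fp, the cleaned text is returned but nothing is written
    text = pdf.convert_to_txt(
        clean_header_kwargs={'char_thresh': 0.6, 'page_thresh': 0.8,
                             'split_on': '\n',
                             'iheaders': [0, 1, 3, -3, -2, -1]})

    # with txt_fp, the text is also saved to disk (and the path is logged)
    pdf.convert_to_txt(txt_fp='ordinance.txt')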
diff --git a/elm/web/document.py b/elm/web/document.py
index 67ba17cd..10de52d9 100644
--- a/elm/web/document.py
+++ b/elm/web/document.py
@@ -3,6 +3,7 @@
 from abc import ABC, abstractmethod
 from copy import deepcopy
 from functools import cached_property
+import logging
 
 from elm.utilities.parse import (
     combine_pages,
@@ -10,12 +11,17 @@
     html_to_text,
     remove_blank_pages,
     format_html_tables,
+    read_pdf,
+    read_pdf_ocr,
     replace_common_pdf_conversion_chars,
     replace_multi_dot_lines,
     remove_empty_lines_or_page_footers,
 )
 
 
+logger = logging.getLogger(__name__)
+
+
 class BaseDocument(ABC):
     """Base ELM web document representation."""
 
@@ -173,6 +179,41 @@ def _raw_pages(self):
         raw_pages += [page for page in self.pages[self._last_page_index:]]
         return raw_pages
 
+    @classmethod
+    def from_file(cls, fp, **init_kwargs):
+        """Initialize a PDFDocument object from a .pdf file on disk. This
+        method will try pdftotext (a poppler utility) first and then
+        fall back to OCR with pytesseract.
+
+        Parameters
+        ----------
+        fp : str
+            Filepath to .pdf file on disk.
+        init_kwargs : dict
+            Optional kwargs for PDFDocument initialization.
+
+        Returns
+        -------
+        out : PDFDocument
+            Initialized PDFDocument class from input fp.
+        """
+
+        with open(fp, 'rb') as f:
+            pages = read_pdf(f.read())
+
+        if all(len(page) < 10 for page in pages):
+            # fall back to OCR with pytesseract if every page has fewer than
+            # 10 chars; scanned documents typically yield only garbled ascii
+            with open(fp, 'rb') as f:
+                pages = read_pdf_ocr(f.read())
+
+        if not any(pages):
+            msg = f'Could not get text from pdf: {fp}'
+            logger.error(msg)
+            raise RuntimeError(msg)
+
+        return cls(pages, **init_kwargs)
+
 
 class HTMLDocument(BaseDocument):
     """ELM web HTML document"""
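A usage sketch for the new `from_file` classmethod (the filename matches the example script added below):

    from elm.web.document import PDFDocument

    # tries pdftotext first; scanned PDFs automatically fall back to OCR
    # (which requires the optional pytesseract/pdf2image installs)
    doc = PDFDocument.from_file('Palo Alto Iowa.pdf')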
diff --git a/examples/ordinance_gpt/README.rst b/examples/ordinance_gpt/README.rst
index fc9b0ad3..78e95575 100644
--- a/examples/ordinance_gpt/README.rst
+++ b/examples/ordinance_gpt/README.rst
@@ -2,8 +2,7 @@
 Ordinance GPT
 *************
 
-This example folder contains supporting documents, results, and code for the
-Ordinance GPT experiment.
+This example folder contains supporting documents, results, and code for the Ordinance GPT experiment.
 
 Prerequisites
 =============
@@ -11,9 +10,23 @@ We recommend installing the pytesseract module to allow PDF retrieval for scanned documents.
 See the `ordinance-specific installation instructions `_
 for more details.
 
-Setup
-=====
+Running from Python
+===================
+This instruction set presents a simplified example to extract ordinance data from an ordinance document on disk. This corresponds to the ordinance data extraction from PDF results in `Buster et al., 2024 `_.
+
+To run this, first download one or more ordinance documents from `the Box folder <https://app.box.com/s/a8oi8jotb9vnu55rzdul7e291jnn7hmq>`_.
+
+After downloading the ordinance document(s), set the relevant path for the ``fp_pdf`` variable, and then run the script:
+
+.. code-block:: bash
+
+    $ python parse_pdf.py
+
+Running from the Command Line Utility
+=====================================
+This instruction set describes an experimental process that uses LLMs to search the internet for relevant ordinance documents, download them, and then extract the relevant ordinance data.
+
 There are a few key things you need to set up in order to run ordinance retrieval
 and extraction. First, you must specify which counties you want to process. You
 can do this by setting up a CSV file with a ``County`` and a ``State`` column.
 Each row in the CSV file then represents a single county to process.
@@ -22,8 +36,8 @@ file for reference.
 
 Once you have set up the county CSV, you can fill out the `template JSON config
 `_.
-See the documentation for the `"process_counties_with_openai" function `_
+See the documentation for the `"process_counties_with_openai" function `_
 for an explanation of all the allowed inputs to the configuration file.
 Some notable inputs here are the ``azure*`` keys, which should be configured to
 match your Azure OpenAI API deployment (unless it's defined in your environment
 with the ``AZURE_OPENAI_API_KEY``, ``AZURE_OPENAI_VERSION``,
@@ -33,7 +47,7 @@ the ``llm_service_rate_limit`` to match your deployment's API tokens-per-minute
 paths to all files/directories unless you are executing the program from your working folder.
 
 Execution
-=========
+---------
 Once you are happy with the configuration parameters, you can kick off the processing using
 
 .. code-block:: bash
@@ -45,20 +59,20 @@
 asynchronously, so the logs will not print in order).
 
 .. WARNING:: Running all of the 85 counties given in the sample county CSV file
              can cost $700-$1000 in API calls. We recommend running a smaller
              subset for example purposes.
 
+Debugging
+---------
+Not sure why things aren't working? No error messages? Make sure you run the CLI call with a ``-v`` flag for "verbose" logging (e.g., ``$ elm ords -c config.json -v``).
+
+Errors on import statements? Trouble importing ``pdftotext`` with cryptic error messages like ``symbol not found in flat namespace``? Follow the `ordinance-specific install instructions `_ *exactly*.
+
 Source Ordinance Documents
 ==========================
 The ordinance documents downloaded using (an older version of) this example
 code can be downloaded `here `_.
 
-Debugging
-=========
-Not sure why things aren't working? No error messages? Make sure you run the CLI call with a ``-v`` flag for "verbose" logging (e.g., ``$ elm ords -c config.json -v``)
-
-Errors on import statements? Trouble importing ``pdftotext`` with cryptic error messages like ``symbol not found in flat namespace``? Follow the `ordinance-specific install instructions `_ *exactly*.
-
 Extension to Other Technologies
 ===============================
 Extending this functionality to other technologies is possible but requires deeper understanding of the underlying processes.
@@ -69,5 +83,5 @@
 as well as how they are applied in `parse.py `_
 to get a better sense of how to adjust the web-scraping portion of the code
 to your technology. When you have set up the validation and parsing for your
-technology, put it all together by adjusting the `"process_counties_with_openai" function `_
+technology, put it all together by adjusting the `"process_counties_with_openai" function `_
 to call your new routines.
diff --git a/examples/ordinance_gpt/config.json b/examples/ordinance_gpt/config.json
index 37767ec3..1aef11e2 100644
--- a/examples/ordinance_gpt/config.json
+++ b/examples/ordinance_gpt/config.json
@@ -22,4 +22,4 @@
     },
     "pytesseract_exe_fp": "",
     "log_level": "INFO"
-}
\ No newline at end of file
+}
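For the county CSV described in the README above, the expected shape is one county per row under ``County`` and ``State`` headers; a minimal sketch (the row reuses the same county as the PDF example below):

    County,State
    Palo Alto,Iowa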
diff --git a/examples/ordinance_gpt/parse_pdf.py b/examples/ordinance_gpt/parse_pdf.py
new file mode 100644
index 00000000..83600ded
--- /dev/null
+++ b/examples/ordinance_gpt/parse_pdf.py
@@ -0,0 +1,73 @@
+"""Example on parsing an existing PDF file on-disk for ordinances."""
+from functools import partial
+
+import openai
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+
+from rex import init_logger
+from elm.base import ApiBase
+from elm.web.document import PDFDocument
+from elm.ords.llm import LLMCaller
+from elm.ords.services.openai import OpenAIService
+from elm.ords.utilities import RTS_SEPARATORS
+from elm.ords.process import validate_api_params
+from elm.ords.extraction.ordinance import OrdinanceExtractor
+from elm.ords.extraction.apply import extract_ordinance_values
+from elm.ords.services.provider import RunningAsyncServices as ARun
+from elm.ords.extraction.apply import (check_for_ordinance_info,
+                                       extract_ordinance_text_with_llm)
+
+
+if __name__ == '__main__':
+    init_logger('elm', log_level='INFO')
+
+    # download this from https://app.box.com/s/a8oi8jotb9vnu55rzdul7e291jnn7hmq
+    fp_pdf = 'Palo Alto Iowa.pdf'
+
+    fp_txt_all = fp_pdf.replace('.pdf', '_all.txt')
+    fp_txt_clean = fp_pdf.replace('.pdf', '_clean.txt')
+    fp_ords = fp_pdf.replace('.pdf', '_ords.csv')
+
+    doc = PDFDocument.from_file(fp_pdf)
+
+    text_splitter = RecursiveCharacterTextSplitter(
+        RTS_SEPARATORS,
+        chunk_size=3000,
+        chunk_overlap=300,
+        length_function=partial(ApiBase.count_tokens, model='gpt-4'),
+    )
+
+    # set up LLM and ordinance service/utility classes
+    azure_api_key, azure_version, azure_endpoint = validate_api_params()
+    client = openai.AsyncAzureOpenAI(api_key=azure_api_key,
+                                     api_version=azure_version,
+                                     azure_endpoint=azure_endpoint)
+    llm_service = OpenAIService(client, rate_limit=1e9)
+    services = [llm_service]
+    kwargs = dict(llm_service=llm_service, model='gpt-4', temperature=0)
+    extractor = OrdinanceExtractor(LLMCaller(**kwargs))
+
+    """The following three function calls present three (equivalent) ways to
+    call ELM async ordinance functions. The three functions 1) check ordinance
+    documents for relevant ordinance info, 2) extract the relevant text, and
+    3) run the decision tree to get structured ordinance data from the
+    unstructured legal text."""
+
+    # 1) call the async func using a partial function (`run_async`)
+    run_async = partial(ARun.run, services)
+    doc = run_async(check_for_ordinance_info(doc, text_splitter, **kwargs))
+
+    # 2) build the coroutine first, then use it to call the async func
+    #    (extract_ordinance_text_with_llm is an async function)
+    extrct = extract_ordinance_text_with_llm(doc, text_splitter, extractor)
+    doc = ARun.run(services, extrct)
+
+    # 3) build the coroutine and use it to call the async func in one go
+    doc = ARun.run(services, extract_ordinance_values(doc, **kwargs))
+
+    # save outputs
+    doc.metadata['ordinance_values'].to_csv(fp_ords)
+    with open(fp_txt_all, 'w') as f:
+        f.write(doc.metadata["ordinance_text"])
+    with open(fp_txt_clean, 'w') as f:
+        f.write(doc.metadata["cleaned_ordinance_text"])
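Since the three calls in the script above each spin the services up and tear them down again, an equivalent pattern is to wrap all three steps in a single coroutine and run it under one service context; a sketch reusing the names defined in the script:

    async def _full_pipeline(doc):
        # 1) tag ordinance info, 2) extract the cleaned legal text,
        # 3) run the decision tree for structured values
        doc = await check_for_ordinance_info(doc, text_splitter, **kwargs)
        doc = await extract_ordinance_text_with_llm(doc, text_splitter,
                                                    extractor)
        return await extract_ordinance_values(doc, **kwargs)

    doc = ARun.run(services, _full_pipeline(doc))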