feat: mvp
sysid committed Mar 17, 2024
1 parent 4e6e245 commit 7598208
Showing 13 changed files with 3,755 additions and 45 deletions.
11 changes: 11 additions & 0 deletions .github/dependabot.yml
@@ -0,0 +1,11 @@
version: 2
updates:
  - package-ecosystem: "pip"
    directory: "/"
    schedule:
      interval: "monthly"

  - package-ecosystem: "github-actions"
    directory: "/"
    schedule:
      interval: "monthly"
88 changes: 88 additions & 0 deletions .github/workflows/test.yml
@@ -0,0 +1,88 @@
name: Tests

on:
  workflow_dispatch:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main

jobs:
  test:
    runs-on: ubuntu-latest

    strategy:
      matrix:
        python: [ '3.10', '3.11', '3.12' ]

    steps:
      - uses: actions/checkout@v3
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python }}

      - name: Setup PDM
        uses: pdm-project/setup-pdm@v4
        with:
          python-version: ${{ matrix.python }}
          cache: true

      - name: Install dependencies
        run: |
          sudo apt update && sudo apt install --yes libgl1-mesa-dev
          pdm lock
          pdm install --dev
      - name: Test with pytest
        run: |
          pdm run make test-cicd
      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v4.0.2
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
          slug: sysid/prepembd

  lint:
    runs-on: ubuntu-latest

    strategy:
      matrix:
        python: [ '3.10', '3.11', '3.12' ]

    steps:
      - uses: actions/checkout@v3
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python }}

      - name: Setup PDM
        uses: pdm-project/setup-pdm@v4
        with:
          python-version: ${{ matrix.python }}
          cache: true

      - name: Install dependencies
        run: |
          pdm lock
          pdm install --dev
      - name: mypy
        run: |
          pdm run mypy --python-version=${{ matrix.python }} src/
      - name: lint
        run: |
          pdm run make lint
      - name: format
        run: |
          pdm run make format-check
      - name: isort
        run: |
          pdm run isort . --check --diff
50 changes: 28 additions & 22 deletions Makefile
@@ -7,7 +7,7 @@ MAKE = make
VERSION = $(shell cat VERSION)

app_root := $(if $(PROJ_DIR),$(PROJ_DIR),$(CURDIR))
pkg_src = $(app_root)/prepembd
pkg_src = $(app_root)/src/prepembd
tests_src = $(app_root)/tests

.PHONY: all
@@ -20,9 +20,10 @@ all: clean build upload ## Build and upload
# Building, Deploying \
BUILDING: ## ############################################################
.PHONY: build
build: clean format isort ## format and build
build: clean format sort-imports ## format and build
@echo "building"
python -m build
#python -m build
pdm build

.PHONY: publish
publish: ## publish
@@ -78,39 +79,44 @@ create-release: ## create a release on GitHub via the gh cli
################################################################################
# Testing \
TESTING: ## ############################################################
.PHONY: test
test: ## run tests
.PHONY: test-unit
test-unit: ## run unit tests
python -m pytest -ra --junitxml=report.xml --cov-config=pyproject.toml --cov-report=xml --cov-report term --cov=$(pkg_src) tests/

.PHONY: tox
tox: ## Run tox
tox
.PHONY: test
test: init test-unit ## run all tests


.PHONY: test-cicd
test-cicd: test-unit ## run cicd tests

################################################################################
# Code Quality \
QUALITY: ## ############################################################
.PHONY: style
style: isort format ## perform code style format (black, isort)

.PHONY: format
format: ## perform black formatting
black $(pkg_src) tests
format: ## perform ruff formatting
@ruff format $(pkg_src) $(tests_src)

.PHONY: isort
isort: ## apply import sort ordering
isort . --profile black
.PHONY: format-check
format-check: ## check formatting with ruff
@ruff format --check $(pkg_src) $(tests_src)

.PHONY: lint
lint: flake8 mypy ## lint code with all static code checks
.PHONY: sort-imports
sort-imports: ## apply import sort ordering
isort $(pkg_src) $(tests_src) --profile black

.PHONY: style
style: sort-imports format ## perform code style format (ruff, isort)

.PHONY: flake8
flake8: ## check style with flake8
@flake8 $(pkg_src)
.PHONY: lint
lint: ## check style with ruff
@ruff $(pkg_src) $(tests_src)

.PHONY: mypy
mypy: ## check type hint annotations
# keep config in pyproject.toml for integration with PyCharm
mypy --config-file pyproject.toml $(pkg_src)
#@mypy --config-file pyproject.toml $(pkg_src)
@mypy --config-file pyproject.toml --install-types --non-interactive $(pkg_src)

################################################################################
# Clean \
31 changes: 31 additions & 0 deletions README.md
@@ -1 +1,32 @@
# prepembd

[![PyPi](https://img.shields.io/pypi/v/prepembd)](https://pypi.org/project/prepembd)
[![Tests CI](https://img.shields.io/github/actions/workflow/status/sysid/prepembd/test.yml?branch=main)](https://github.com/sysid/prepembd/actions/workflows/test.yml)
[![Codecov](https://codecov.io/gh/sysid/prepembd/branch/main/graph/badge.svg?token=8IL9MN4FK5)](https://codecov.io/gh/sysid/prepembd)


## Installation

Install **prepembd**:

```shell
python3 -m pip install prepembd --upgrade
```

### Requirements

- [Python](https://www.python.org/) >= 3.10

## Why

I've been using markdown for a long time to take notes in every possible scenario. I even manage my Anki cards with markdown ([inka2](https://github.com/sysid/inka2)), so finding relevant information again is paramount.
With the advent of semantic search via embeddings, search has become much more powerful. However, to create
embeddings from markdown, the files have to be prepared in order to reduce noise and produce the correct chunk sizes.

This Python script automates that process and creates a JSON representation of all markdown files, which can then be fed into an embedding model. It is basically a thin wrapper around LangChain, combined with some bespoke filters to eliminate noise.


## Usage
```bash
prepembd tokenize <directory> | tee -a output.ndjson
```
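
Each emitted line is a standalone JSON object with an `id` (the markdown file's relative path plus a chunk index) and the chunk's `content`, matching what `src/prepembd/bin/tokenize.py` prints. A minimal sketch of consuming the NDJSON downstream — the `embed()` call and the example `id` path are hypothetical placeholders:

```python
import json

# Iterate over the NDJSON produced by `prepembd tokenize <directory>`.
# Each line looks roughly like: {"id": "notes/foo.md:0", "content": "..."}
with open("output.ndjson", encoding="utf-8") as fh:
    for line in fh:
        record = json.loads(line)
        chunk_id, text = record["id"], record["content"]
        # vector = embed(text)  # hypothetical call to your embedding model
        print(chunk_id, len(text))
```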
13 changes: 12 additions & 1 deletion pdm.lock

Some generated files are not rendered by default.

3 changes: 2 additions & 1 deletion pyproject.toml
@@ -11,6 +11,7 @@ dependencies = [
"unstructured>=0.11.8",
"nltk>=3.8.1",
"typer>=0.9.0",
"markdown>=3.6",
]
requires-python = ">=3.10"
readme = "README.md"
@@ -34,7 +35,7 @@ dev = [
]

[project.scripts]
tokenize = "prepembd.bin.tokenize:app"
prepembd = "prepembd.bin.tokenize:app"

[tool.bumpversion]
current_version = "0.1.1"
1 change: 1 addition & 0 deletions report.xml
@@ -0,0 +1 @@
<?xml version="1.0" encoding="utf-8"?><testsuites><testsuite name="pytest" errors="0" failures="0" skipped="0" tests="10" time="4.518" timestamp="2024-03-17T16:24:12.587130" hostname="LMUCM1032419"><testcase classname="tests.lib.test_helper" name="test_remove_excessive_dots" time="0.002" /><testcase classname="tests.lib.test_helper" name="test_strip_quote_prefixes[This is a test.\nThis is another line.-This is a test.\nThis is another line.]" time="0.001" /><testcase classname="tests.lib.test_helper" name="test_strip_quote_prefixes[&gt; This is a quoted line.\nThis is not quoted.-This is a quoted line.\nThis is not quoted.]" time="0.000" /><testcase classname="tests.lib.test_helper" name="test_strip_quote_prefixes[&gt; Quote1.\n&gt; Quote2.-Quote1.\nQuote2.]" time="0.000" /><testcase classname="tests.lib.test_helper" name="test_strip_quote_prefixes[-]" time="0.000" /><testcase classname="tests.lib.test_helper" name="test_strip_quote_prefixes[Single line with &gt; quote-Single line with &gt; quote]" time="0.000" /><testcase classname="tests.lib.test_helper" name="test_strip_quote_prefixes[&gt;No space after quote mark-&gt;No space after quote mark]" time="0.000" /><testcase classname="tests.lib.test_path" name="test_scan_directory" time="0.020" /><testcase classname="tests.test_tokenize.TestTokenize" name="test_tokenize_tiktoken" time="0.246" /><testcase classname="tests.test_tokenize.TestTokenize" name="test_tokenize_help" time="0.016" /></testsuite></testsuites>
42 changes: 31 additions & 11 deletions src/prepembd/bin/tokenize.py
@@ -2,7 +2,12 @@
import logging
from pathlib import Path

import tiktoken
import typer
from langchain_text_splitters import MarkdownTextSplitter

from prepembd.lib.helper import remove_excessive_dots, strip_quote_prefixes
from prepembd.lib.path import scan_directory

"""
jina-embeddings-v2-small-en is an English, monolingual embedding model supporting 8192 sequence length. It is based on a Bert architecture
@@ -42,15 +47,29 @@ def process_md_file(
"""
if (directory / md_file).is_symlink(): # skip duplicated files
return
html_content = parse_markdown_to_html(directory / md_file)
text_without_code = remove_code_blocks(html_content)
chunks = split_into_chunks_with_tiktoken(
text_without_code, max_chunk_size=max_chunk_size
)

for i, chunk in enumerate(chunks):
md_content = (directory / md_file).read_text(encoding="utf-8", errors="ignore")
md_content = strip_quote_prefixes(md_content)
md_content = remove_excessive_dots(md_content)

markdown_splitter = MarkdownTextSplitter.from_tiktoken_encoder(
"cl100k_base", chunk_size=max_chunk_size, chunk_overlap=200
)
encoding = tiktoken.get_encoding("cl100k_base")
docs = markdown_splitter.create_documents([md_content])

for doc in docs:
word_tokens = encoding.encode(doc.page_content)
word_token_length = len(word_tokens)
assert word_token_length < max_chunk_size

for i, doc in enumerate(docs):
assert (
len(encoding.encode(doc.page_content)) < max_chunk_size
), f"Chunk {i} too large: {len(encoding.encode(doc.page_content))} tokens."
id_ = f"{prefix}{str(md_file)}:{i}"
json_output = json.dumps({"id": id_, "content": chunk}, ensure_ascii=True)
# json_output = json.dumps({"id": id_, "content": chunk.page_content}, ensure_ascii=True)
json_output = json.dumps({"id": id_, "content": doc.page_content})
print(json_output)


@@ -60,7 +79,10 @@ def main(
verbose: bool = typer.Option(False, "-v", "--verbose", help="verbosity"),
version: bool = typer.Option(False, "-V", "--version", help="show version"),
):
log_fmt = r"%(asctime)-15s %(levelname)-7s %(message)s"
log_fmt = (
r"%(asctime)-15s %(levelname)s %(name)s %(funcName)s:%(lineno)d %(message)s"
)
# log_fmt = r"%(asctime)-15s %(levelname)-7s %(message)s"
if verbose:
logging.basicConfig(
format=log_fmt, level=logging.DEBUG, datefmt="%m-%d %H:%M:%S"
@@ -69,9 +91,7 @@
logging.basicConfig(
format=log_fmt, level=logging.INFO, datefmt="%m-%d %H:%M:%S"
)
logging.getLogger("botocore").setLevel(logging.INFO)
logging.getLogger("boto3").setLevel(logging.INFO)
logging.getLogger("urllib3").setLevel(logging.INFO)
logging.getLogger("unstructured").setLevel(logging.WARN)

if ctx.invoked_subcommand is None and version:
ctx.invoke(print_version)
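
For reference, the new chunking path in `process_md_file` boils down to the following self-contained sketch: the markdown is split on structure via LangChain's `MarkdownTextSplitter.from_tiktoken_encoder` with the `cl100k_base` encoding and a 200-token overlap, and each chunk is re-counted with tiktoken to assert it stays under the budget. The file path and the `MAX_CHUNK_SIZE` value below are illustrative assumptions; the CLI takes the actual limit from its `max_chunk_size` option.

```python
from pathlib import Path

import tiktoken
from langchain_text_splitters import MarkdownTextSplitter

MAX_CHUNK_SIZE = 2048  # assumption for illustration; the CLI passes its own max_chunk_size

# Read one markdown file (path is illustrative).
md_content = Path("notes/example.md").read_text(encoding="utf-8", errors="ignore")

# Split on markdown structure, with chunk sizes measured in cl100k_base tokens.
splitter = MarkdownTextSplitter.from_tiktoken_encoder(
    "cl100k_base", chunk_size=MAX_CHUNK_SIZE, chunk_overlap=200
)
docs = splitter.create_documents([md_content])

# Re-count tokens per chunk to make sure none exceeds the budget before emitting it.
encoding = tiktoken.get_encoding("cl100k_base")
for i, doc in enumerate(docs):
    n_tokens = len(encoding.encode(doc.page_content))
    assert n_tokens < MAX_CHUNK_SIZE, f"Chunk {i} too large: {n_tokens} tokens"
```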