feat: mvp

sysid · Mar 17, 2024 · 976f229 · 976f229
1 parent 4e6e245
commit 976f229
Show file tree

Hide file tree

Showing 13 changed files with 3,752 additions and 43 deletions.
diff --git a/.github/dependabot.yml b/.github/dependabot.yml
@@ -0,0 +1,11 @@
+version: 2  # Dependabot configuration file format version
+updates:
+  - package-ecosystem: "pip"  # update entry for Python dependencies
+    directory: "/"  # manifests are looked up from the repository root
+    schedule:
+      interval: "monthly"
+
+  - package-ecosystem: "github-actions"  # update entry for actions referenced by the workflows
+    directory: "/"
+    schedule:
+      interval: "monthly"
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -0,0 +1,88 @@
+name: Tests
+
+on:
+  workflow_dispatch:
+  push:
+    branches:
+      - main
+  pull_request:
+    branches:
+      - main
+
+jobs:
+  test:  # runs the test suite once per Python version in the matrix
+    runs-on: ubuntu-latest
+
+    strategy:
+      matrix:
+        python: [ '3.10', '3.11', '3.12' ]
+
+    steps:
+      - uses: actions/checkout@v4  # v3 runs on the deprecated Node 16 runtime
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python }}
+
+      - name: Setup PDM
+        uses: pdm-project/setup-pdm@v4
+        with:
+          python-version: ${{ matrix.python }}
+          cache: true
+
+      - name: Install dependencies
+        run: |
+          sudo apt-get update && sudo apt-get install --yes libgl1-mesa-dev
+          pdm lock
+          pdm install --dev
+
+      - name: Test with pytest
+        run: |
+          pdm run make test-cicd
+
+      - name: Upload coverage to Codecov
+        uses: codecov/codecov-action@v4.0.2
+        with:
+          token: ${{ secrets.CODECOV_TOKEN }}  # read from the repository secrets
+          slug: sysid/prepembd
+
+  lint:  # static checks (mypy, ruff lint, ruff format, isort) per Python version in the matrix
+    runs-on: ubuntu-latest
+
+    strategy:
+      matrix:
+        python: [ '3.10', '3.11', '3.12' ]
+
+    steps:
+      - uses: actions/checkout@v4  # v3 runs on the deprecated Node 16 runtime
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python }}
+
+      - name: Setup PDM
+        uses: pdm-project/setup-pdm@v4
+        with:
+          python-version: ${{ matrix.python }}
+          cache: true
+
+      - name: Install dependencies
+        run: |
+          pdm lock
+          pdm install --dev
+
+      - name: mypy
+        run: |
+          pdm run mypy --python-version=${{ matrix.python }} src/
+
+      - name: lint
+        run: |
+          pdm run make lint
+
+      - name: format
+        run: |
+          pdm run make format-check
+
+      - name: isort  # same profile as the Makefile sort-imports target, so CI accepts what `make style` produces
+        run: |
+          pdm run isort . --profile black --check --diff
diff --git a/Makefile b/Makefile
@@ -7,7 +7,7 @@ MAKE          = make
 VERSION       = $(shell cat VERSION)
 
 app_root := $(if $(PROJ_DIR),$(PROJ_DIR),$(CURDIR))
-pkg_src =  $(app_root)/prepembd
+pkg_src =  $(app_root)/src/prepembd
 tests_src = $(app_root)/tests
 
 .PHONY: all
@@ -78,39 +78,44 @@ create-release:  ## create a release on GitHub via the gh cli
 ################################################################################
 # Testing \
 TESTING:  ## ############################################################
-.PHONY: test
-test:  ## run tests
+.PHONY: test-unit
+test-unit:  ## run unit tests
 	python -m pytest -ra --junitxml=report.xml --cov-config=pyproject.toml --cov-report=xml --cov-report term --cov=$(pkg_src) tests/
 
-.PHONY: tox
-tox:   ## Run tox
-	tox
+.PHONY: test
+test: init  test-unit  ## run all tests
+
+
+.PHONY: test-cicd
+test-cicd: test-unit  ## run cicd tests
 
 ################################################################################
 # Code Quality \
 QUALITY:  ## ############################################################
-.PHONY: style
-style: isort format  ## perform code style format (black, isort)
 
 .PHONY: format
-format:  ## perform black formatting
-	black $(pkg_src) tests
+format:  ## perform ruff formatting
+	@ruff format $(pkg_src) $(tests_src)
 
-.PHONY: isort
-isort:  ## apply import sort ordering
-	isort . --profile black
+.PHONY: format-check
+format-check:  ## check ruff formatting without modifying files
+	@ruff format --check $(pkg_src) $(tests_src)
 
-.PHONY: lint
-lint: flake8 mypy ## lint code with all static code checks
+.PHONY: sort-imports
+sort-imports:  ## apply import sort ordering
+	isort $(pkg_src) $(tests_src) --profile black
+
+.PHONY: style
+style: sort-imports format  ## perform code style format (isort, ruff)
 
-.PHONY: flake8
-flake8:  ## check style with flake8
-	@flake8 $(pkg_src)
+.PHONY: lint
+lint:  ## check style with ruff
+	@ruff check $(pkg_src) $(tests_src)
 
 .PHONY: mypy
 mypy:  ## check type hint annotations
-	# keep config in pyproject.toml for integration with PyCharm
-	mypy --config-file pyproject.toml $(pkg_src)
+	@# --install-types --non-interactive: install missing stub packages without prompting
+	@mypy --config-file pyproject.toml --install-types --non-interactive $(pkg_src)
 
 ################################################################################
 # Clean \

diff --git a/README.md b/README.md
@@ -1 +1,32 @@
 # prepembd
+
+[![PyPI](https://img.shields.io/pypi/v/prepembd)](https://pypi.org/project/prepembd)
+[![Tests CI](https://img.shields.io/github/actions/workflow/status/sysid/prepembd/test.yml?branch=main)](https://github.com/sysid/prepembd/actions/workflows/test.yml)
+[![Codecov](https://codecov.io/gh/sysid/prepembd/branch/main/graph/badge.svg?token=8IL9MN4FK5)](https://codecov.io/gh/sysid/prepembd)
+
+
+## Installation
+
+Install **prepembd**:
+
+```shell
+python3 -m pip install prepembd --upgrade
+```
+
+### Requirements
+
+- [Python](https://www.python.org/) >= 3.10
+
+## Why
+
+I have been using Markdown for a long time to take notes in every possible scenario. I even manage my Anki cards with Markdown ([inka2](https://github.com/sysid/inka2)), so finding relevant information again is paramount.
+With the advent of semantic search via embeddings, search has become much more powerful. However, to create
+embeddings from Markdown, the files have to be prepared in order to reduce noise and produce the correct chunk sizes.
+
+This Python script automates the process and creates a JSON representation of all the Markdown files, which can then be fed into an embedding model. It is basically just a thin wrapper around LangChain combined with some bespoke filters to eliminate noise.
+
+
+## Usage
+```bash
+prepembd tokenize <directory> | tee -a output.ndjson
+```
diff --git a/pdm.lock b/pdm.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -11,6 +11,7 @@ dependencies = [
     "unstructured>=0.11.8",
     "nltk>=3.8.1",
     "typer>=0.9.0",
+    "markdown>=3.6",
 ]
 requires-python = ">=3.10"
 readme = "README.md"
@@ -34,7 +35,7 @@ dev = [
 ]
 
 [project.scripts]
-tokenize = "prepembd.bin.tokenize:app"
+prepembd = "prepembd.bin.tokenize:app"
 
 [tool.bumpversion]
 current_version = "0.1.1"

diff --git a/report.xml b/report.xml
@@ -0,0 +1 @@
+<?xml version="1.0" encoding="utf-8"?><testsuites><testsuite name="pytest" errors="0" failures="0" skipped="0" tests="10" time="4.518" timestamp="2024-03-17T16:24:12.587130" hostname="LMUCM1032419"><testcase classname="tests.lib.test_helper" name="test_remove_excessive_dots" time="0.002" /><testcase classname="tests.lib.test_helper" name="test_strip_quote_prefixes[This is a test.\nThis is another line.-This is a test.\nThis is another line.]" time="0.001" /><testcase classname="tests.lib.test_helper" name="test_strip_quote_prefixes[&gt; This is a quoted line.\nThis is not quoted.-This is a quoted line.\nThis is not quoted.]" time="0.000" /><testcase classname="tests.lib.test_helper" name="test_strip_quote_prefixes[&gt; Quote1.\n&gt; Quote2.-Quote1.\nQuote2.]" time="0.000" /><testcase classname="tests.lib.test_helper" name="test_strip_quote_prefixes[-]" time="0.000" /><testcase classname="tests.lib.test_helper" name="test_strip_quote_prefixes[Single line with &gt; quote-Single line with &gt; quote]" time="0.000" /><testcase classname="tests.lib.test_helper" name="test_strip_quote_prefixes[&gt;No space after quote mark-&gt;No space after quote mark]" time="0.000" /><testcase classname="tests.lib.test_path" name="test_scan_directory" time="0.020" /><testcase classname="tests.test_tokenize.TestTokenize" name="test_tokenize_tiktoken" time="0.246" /><testcase classname="tests.test_tokenize.TestTokenize" name="test_tokenize_help" time="0.016" /></testsuite></testsuites>
diff --git a/src/prepembd/bin/tokenize.py b/src/prepembd/bin/tokenize.py
@@ -2,7 +2,12 @@
 import logging
 from pathlib import Path
 
+import tiktoken
 import typer
+from langchain_text_splitters import MarkdownTextSplitter
+
+from prepembd.lib.helper import remove_excessive_dots, strip_quote_prefixes
+from prepembd.lib.path import scan_directory
 
 """
 jina-embeddings-v2-small-en is an English, monolingual embedding model supporting 8192 sequence length. It is based on a Bert architecture
@@ -42,15 +47,29 @@ def process_md_file(
     """
     if (directory / md_file).is_symlink():  # skip duplicated files
         return
-    html_content = parse_markdown_to_html(directory / md_file)
-    text_without_code = remove_code_blocks(html_content)
-    chunks = split_into_chunks_with_tiktoken(
-        text_without_code, max_chunk_size=max_chunk_size
-    )
 
-    for i, chunk in enumerate(chunks):
+    md_content = (directory / md_file).read_text(encoding="utf-8", errors="ignore")
+    md_content = strip_quote_prefixes(md_content)
+    md_content = remove_excessive_dots(md_content)
+
+    markdown_splitter = MarkdownTextSplitter.from_tiktoken_encoder(
+        "cl100k_base", chunk_size=max_chunk_size, chunk_overlap=200
+    )
+    encoding = tiktoken.get_encoding("cl100k_base")
+    docs = markdown_splitter.create_documents([md_content])
+
+    for doc in docs:
+        word_tokens = encoding.encode(doc.page_content)
+        word_token_length = len(word_tokens)
+        assert word_token_length < max_chunk_size
+
+    for i, doc in enumerate(docs):
+        assert (
+            len(encoding.encode(doc.page_content)) < max_chunk_size
+        ), f"Chunk {i} too large: {len(encoding.encode(doc.page_content))} tokens."
         id_ = f"{prefix}{str(md_file)}:{i}"
-        json_output = json.dumps({"id": id_, "content": chunk}, ensure_ascii=True)
+        # json_output = json.dumps({"id": id_, "content": chunk.page_content}, ensure_ascii=True)
+        json_output = json.dumps({"id": id_, "content": doc.page_content})
         print(json_output)
 
 
@@ -60,7 +79,10 @@ def main(
     verbose: bool = typer.Option(False, "-v", "--verbose", help="verbosity"),
     version: bool = typer.Option(False, "-V", "--version", help="show version"),
 ):
-    log_fmt = r"%(asctime)-15s %(levelname)-7s %(message)s"
+    log_fmt = (
+        r"%(asctime)-15s %(levelname)s %(name)s %(funcName)s:%(lineno)d %(message)s"
+    )
+    # log_fmt = r"%(asctime)-15s %(levelname)-7s %(message)s"
     if verbose:
         logging.basicConfig(
             format=log_fmt, level=logging.DEBUG, datefmt="%m-%d %H:%M:%S"
@@ -69,9 +91,7 @@ def main(
         logging.basicConfig(
             format=log_fmt, level=logging.INFO, datefmt="%m-%d %H:%M:%S"
         )
-    logging.getLogger("botocore").setLevel(logging.INFO)
-    logging.getLogger("boto3").setLevel(logging.INFO)
-    logging.getLogger("urllib3").setLevel(logging.INFO)
+    logging.getLogger("unstructured").setLevel(logging.WARN)
 
     if ctx.invoked_subcommand is None and version:
         ctx.invoke(print_version)
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		<?xml version="1.0" encoding="utf-8"?><testsuites><testsuite name="pytest" errors="0" failures="0" skipped="0" tests="10" time="4.518" timestamp="2024-03-17T16:24:12.587130" hostname="LMUCM1032419"><testcase classname="tests.lib.test_helper" name="test_remove_excessive_dots" time="0.002" /><testcase classname="tests.lib.test_helper" name="test_strip_quote_prefixes[This is a test.\nThis is another line.-This is a test.\nThis is another line.]" time="0.001" /><testcase classname="tests.lib.test_helper" name="test_strip_quote_prefixes[> This is a quoted line.\nThis is not quoted.-This is a quoted line.\nThis is not quoted.]" time="0.000" /><testcase classname="tests.lib.test_helper" name="test_strip_quote_prefixes[> Quote1.\n> Quote2.-Quote1.\nQuote2.]" time="0.000" /><testcase classname="tests.lib.test_helper" name="test_strip_quote_prefixes[-]" time="0.000" /><testcase classname="tests.lib.test_helper" name="test_strip_quote_prefixes[Single line with > quote-Single line with > quote]" time="0.000" /><testcase classname="tests.lib.test_helper" name="test_strip_quote_prefixes[>No space after quote mark->No space after quote mark]" time="0.000" /><testcase classname="tests.lib.test_path" name="test_scan_directory" time="0.020" /><testcase classname="tests.test_tokenize.TestTokenize" name="test_tokenize_tiktoken" time="0.246" /><testcase classname="tests.test_tokenize.TestTokenize" name="test_tokenize_help" time="0.016" /></testsuite></testsuites>