feat: mvp
sysid committed Mar 17, 2024
1 parent 4e6e245 commit 7598208
Showing 13 changed files with 3,755 additions and 45 deletions.
11 changes: 11 additions & 0 deletions .github/dependabot.yml
@@ -0,0 +1,11 @@
version: 2
updates:
  - package-ecosystem: "pip"
    directory: "/"
    schedule:
      interval: "monthly"

  - package-ecosystem: "github-actions"
    directory: "/"
    schedule:
      interval: "monthly"
88 changes: 88 additions & 0 deletions .github/workflows/test.yml
@@ -0,0 +1,88 @@
name: Tests

on:
  workflow_dispatch:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main

jobs:
  test:
    runs-on: ubuntu-latest

    strategy:
      matrix:
        python: [ '3.10', '3.11', '3.12' ]

    steps:
      - uses: actions/checkout@v3
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python }}

      - name: Setup PDM
        uses: pdm-project/setup-pdm@v4
        with:
          python-version: ${{ matrix.python }}
          cache: true

      - name: Install dependencies
        run: |
          sudo apt update && sudo apt install --yes libgl1-mesa-dev
          pdm lock
          pdm install --dev
      - name: Test with pytest
        run: |
          pdm run make test-cicd
      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v4.0.2
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
          slug: sysid/prepembd

  lint:
    runs-on: ubuntu-latest

    strategy:
      matrix:
        python: [ '3.10', '3.11', '3.12' ]

    steps:
      - uses: actions/checkout@v3
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python }}

      - name: Setup PDM
        uses: pdm-project/setup-pdm@v4
        with:
          python-version: ${{ matrix.python }}
          cache: true

      - name: Install dependencies
        run: |
          pdm lock
          pdm install --dev
      - name: mypy
        run: |
          pdm run mypy --python-version=${{ matrix.python }} src/
      - name: lint
        run: |
          pdm run make lint
      - name: format
        run: |
          pdm run make format-check
      - name: isort
        run: |
          pdm run isort . --check --diff
50 changes: 28 additions & 22 deletions Makefile
@@ -7,7 +7,7 @@ MAKE = make
VERSION = $(shell cat VERSION)

app_root := $(if $(PROJ_DIR),$(PROJ_DIR),$(CURDIR))
pkg_src = $(app_root)/prepembd
pkg_src = $(app_root)/src/prepembd
tests_src = $(app_root)/tests

.PHONY: all
@@ -20,9 +20,10 @@ all: clean build upload ## Build and upload
# Building, Deploying \
BUILDING: ## ############################################################
.PHONY: build
build: clean format isort ## format and build
build: clean format sort-imports ## format and build
@echo "building"
python -m build
#python -m build
pdm build

.PHONY: publish
publish: ## publish
@@ -78,39 +79,44 @@ create-release: ## create a release on GitHub via the gh cli
################################################################################
# Testing \
TESTING: ## ############################################################
.PHONY: test
test: ## run tests
.PHONY: test-unit
test-unit: ## run unit tests
python -m pytest -ra --junitxml=report.xml --cov-config=pyproject.toml --cov-report=xml --cov-report term --cov=$(pkg_src) tests/

.PHONY: tox
tox: ## Run tox
tox
.PHONY: test
test: init test-unit ## run all tests


.PHONY: test-cicd
test-cicd: test-unit ## run cicd tests

################################################################################
# Code Quality \
QUALITY: ## ############################################################
.PHONY: style
style: isort format ## perform code style format (black, isort)

.PHONY: format
format: ## perform black formatting
black $(pkg_src) tests
format: ## perform ruff formatting
@ruff format $(pkg_src) $(tests_src)

.PHONY: isort
isort: ## apply import sort ordering
isort . --profile black
.PHONY: format-check
format-check: ## check formatting with ruff
@ruff format --check $(pkg_src) $(tests_src)

.PHONY: lint
lint: flake8 mypy ## lint code with all static code checks
.PHONY: sort-imports
sort-imports: ## apply import sort ordering
isort $(pkg_src) $(tests_src) --profile black

.PHONY: style
style: sort-imports format ## perform code style format (ruff, isort)

.PHONY: flake8
flake8: ## check style with flake8
@flake8 $(pkg_src)
.PHONY: lint
lint: ## check style with ruff
@ruff $(pkg_src) $(tests_src)

.PHONY: mypy
mypy: ## check type hint annotations
# keep config in pyproject.toml for integration with PyCharm
mypy --config-file pyproject.toml $(pkg_src)
#@mypy --config-file pyproject.toml $(pkg_src)
@mypy --config-file pyproject.toml --install-types --non-interactive $(pkg_src)

################################################################################
# Clean \
31 changes: 31 additions & 0 deletions README.md
@@ -1 +1,32 @@
# prepembd

[![PyPi](https://img.shields.io/pypi/v/prepembd)](https://pypi.org/project/prepembd)
[![Tests CI](https://img.shields.io/github/actions/workflow/status/sysid/prepembd/test.yml?branch=main)](https://github.com/sysid/prepembd/actions/workflows/test.yml)
[![Codecov](https://codecov.io/gh/sysid/prepembd/branch/main/graph/badge.svg?token=8IL9MN4FK5)](https://codecov.io/gh/sysid/prepembd)


## Installation

Install **prepembd**:

```shell
python3 -m pip install prepembd --upgrade
```

### Requirements

- [Python](https://www.python.org/) >= 3.10

## Why

I've been using markdown for a long time to take notes in every possible scenario. I even manage my Anki cards with markdown ([inka2](https://github.com/sysid/inka2)), so finding relevant information again is paramount.
With the advent of semantic search via embeddings, search has become much more powerful. However, to create
embeddings from markdown, the files have to be prepared in order to reduce noise and produce the correct chunk sizes.

This Python script automates that process and creates a JSON representation of all markdown files, which can then be fed into an embedding model. It is basically a thin wrapper around LangChain, combined with some bespoke filters to eliminate noise.


## Usage
```bash
prepembd tokenize <directory> | tee -a output.ndjson
```
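
Each emitted line is a standalone JSON object with an `id` (the markdown file's relative path plus a chunk index) and the chunk's `content`, matching what `src/prepembd/bin/tokenize.py` prints. A minimal sketch of consuming the NDJSON downstream — the `embed()` call and the example `id` path are hypothetical placeholders:

```python
import json

# Iterate over the NDJSON produced by `prepembd tokenize <directory>`.
# Each line looks roughly like: {"id": "notes/foo.md:0", "content": "..."}
with open("output.ndjson", encoding="utf-8") as fh:
    for line in fh:
        record = json.loads(line)
        chunk_id, text = record["id"], record["content"]
        # vector = embed(text)  # hypothetical call to your embedding model
        print(chunk_id, len(text))
```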
13 changes: 12 additions & 1 deletion pdm.lock

Some generated files are not rendered by default.

3 changes: 2 additions & 1 deletion pyproject.toml
@@ -11,6 +11,7 @@ dependencies = [
"unstructured>=0.11.8",
"nltk>=3.8.1",
"typer>=0.9.0",
"markdown>=3.6",
]
requires-python = ">=3.10"
readme = "README.md"
@@ -34,7 +35,7 @@ dev = [
]

[project.scripts]
tokenize = "prepembd.bin.tokenize:app"
prepembd = "prepembd.bin.tokenize:app"

[tool.bumpversion]
current_version = "0.1.1"
1 change: 1 addition & 0 deletions report.xml
@@ -0,0 +1 @@
<?xml version="1.0" encoding="utf-8"?><testsuites><testsuite name="pytest" errors="0" failures="0" skipped="0" tests="10" time="4.518" timestamp="2024-03-17T16:24:12.587130" hostname="LMUCM1032419"><testcase classname="tests.lib.test_helper" name="test_remove_excessive_dots" time="0.002" /><testcase classname="tests.lib.test_helper" name="test_strip_quote_prefixes[This is a test.\nThis is another line.-This is a test.\nThis is another line.]" time="0.001" /><testcase classname="tests.lib.test_helper" name="test_strip_quote_prefixes[&gt; This is a quoted line.\nThis is not quoted.-This is a quoted line.\nThis is not quoted.]" time="0.000" /><testcase classname="tests.lib.test_helper" name="test_strip_quote_prefixes[&gt; Quote1.\n&gt; Quote2.-Quote1.\nQuote2.]" time="0.000" /><testcase classname="tests.lib.test_helper" name="test_strip_quote_prefixes[-]" time="0.000" /><testcase classname="tests.lib.test_helper" name="test_strip_quote_prefixes[Single line with &gt; quote-Single line with &gt; quote]" time="0.000" /><testcase classname="tests.lib.test_helper" name="test_strip_quote_prefixes[&gt;No space after quote mark-&gt;No space after quote mark]" time="0.000" /><testcase classname="tests.lib.test_path" name="test_scan_directory" time="0.020" /><testcase classname="tests.test_tokenize.TestTokenize" name="test_tokenize_tiktoken" time="0.246" /><testcase classname="tests.test_tokenize.TestTokenize" name="test_tokenize_help" time="0.016" /></testsuite></testsuites>
42 changes: 31 additions & 11 deletions src/prepembd/bin/tokenize.py
@@ -2,7 +2,12 @@
import logging
from pathlib import Path

import tiktoken
import typer
from langchain_text_splitters import MarkdownTextSplitter

from prepembd.lib.helper import remove_excessive_dots, strip_quote_prefixes
from prepembd.lib.path import scan_directory

"""
jina-embeddings-v2-small-en is an English, monolingual embedding model supporting 8192 sequence length. It is based on a Bert architecture
@@ -42,15 +47,29 @@ def process_md_file(
"""
if (directory / md_file).is_symlink(): # skip duplicated files
return
html_content = parse_markdown_to_html(directory / md_file)
text_without_code = remove_code_blocks(html_content)
chunks = split_into_chunks_with_tiktoken(
text_without_code, max_chunk_size=max_chunk_size
)

for i, chunk in enumerate(chunks):
md_content = (directory / md_file).read_text(encoding="utf-8", errors="ignore")
md_content = strip_quote_prefixes(md_content)
md_content = remove_excessive_dots(md_content)

markdown_splitter = MarkdownTextSplitter.from_tiktoken_encoder(
"cl100k_base", chunk_size=max_chunk_size, chunk_overlap=200
)
encoding = tiktoken.get_encoding("cl100k_base")
docs = markdown_splitter.create_documents([md_content])

for doc in docs:
word_tokens = encoding.encode(doc.page_content)
word_token_length = len(word_tokens)
assert word_token_length < max_chunk_size

for i, doc in enumerate(docs):
assert (
len(encoding.encode(doc.page_content)) < max_chunk_size
), f"Chunk {i} too large: {len(encoding.encode(doc.page_content))} tokens."
id_ = f"{prefix}{str(md_file)}:{i}"
json_output = json.dumps({"id": id_, "content": chunk}, ensure_ascii=True)
# json_output = json.dumps({"id": id_, "content": chunk.page_content}, ensure_ascii=True)
json_output = json.dumps({"id": id_, "content": doc.page_content})
print(json_output)


@@ -60,7 +79,10 @@ def main(
verbose: bool = typer.Option(False, "-v", "--verbose", help="verbosity"),
version: bool = typer.Option(False, "-V", "--version", help="show version"),
):
log_fmt = r"%(asctime)-15s %(levelname)-7s %(message)s"
log_fmt = (
r"%(asctime)-15s %(levelname)s %(name)s %(funcName)s:%(lineno)d %(message)s"
)
# log_fmt = r"%(asctime)-15s %(levelname)-7s %(message)s"
if verbose:
logging.basicConfig(
format=log_fmt, level=logging.DEBUG, datefmt="%m-%d %H:%M:%S"
@@ -69,9 +91,7 @@
logging.basicConfig(
format=log_fmt, level=logging.INFO, datefmt="%m-%d %H:%M:%S"
)
logging.getLogger("botocore").setLevel(logging.INFO)
logging.getLogger("boto3").setLevel(logging.INFO)
logging.getLogger("urllib3").setLevel(logging.INFO)
logging.getLogger("unstructured").setLevel(logging.WARN)

if ctx.invoked_subcommand is None and version:
ctx.invoke(print_version)
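
For reference, the new chunking path in `process_md_file` boils down to the following self-contained sketch: the markdown is split on structure via LangChain's `MarkdownTextSplitter.from_tiktoken_encoder` with the `cl100k_base` encoding and a 200-token overlap, and each chunk is re-counted with tiktoken to assert it stays under the budget. The file path and the `MAX_CHUNK_SIZE` value below are illustrative assumptions; the CLI takes the actual limit from its `max_chunk_size` option.

```python
from pathlib import Path

import tiktoken
from langchain_text_splitters import MarkdownTextSplitter

MAX_CHUNK_SIZE = 2048  # assumption for illustration; the CLI passes its own max_chunk_size

# Read one markdown file (path is illustrative).
md_content = Path("notes/example.md").read_text(encoding="utf-8", errors="ignore")

# Split on markdown structure, with chunk sizes measured in cl100k_base tokens.
splitter = MarkdownTextSplitter.from_tiktoken_encoder(
    "cl100k_base", chunk_size=MAX_CHUNK_SIZE, chunk_overlap=200
)
docs = splitter.create_documents([md_content])

# Re-count tokens per chunk to make sure none exceeds the budget before emitting it.
encoding = tiktoken.get_encoding("cl100k_base")
for i, doc in enumerate(docs):
    n_tokens = len(encoding.encode(doc.page_content))
    assert n_tokens < MAX_CHUNK_SIZE, f"Chunk {i} too large: {n_tokens} tokens"
```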