Skip to content

Commit

Permalink
feat: mvp
Browse files Browse the repository at this point in the history
  • Loading branch information
sysid committed Mar 17, 2024
1 parent 4e6e245 commit 4ac517a
Show file tree
Hide file tree
Showing 14 changed files with 3,798 additions and 51 deletions.
11 changes: 11 additions & 0 deletions .github/dependabot.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
version: 2
updates:
- package-ecosystem: "pip"
directory: "/"
schedule:
interval: "monthly"

- package-ecosystem: "github-actions"
directory: "/"
schedule:
interval: "monthly"
91 changes: 91 additions & 0 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
name: Tests

on:
workflow_dispatch:
push:
branches:
- main
pull_request:
branches:
- main

jobs:
test:
runs-on: ubuntu-latest

strategy:
matrix:
python: [ '3.10', '3.11', '3.12' ]

steps:
- uses: actions/checkout@v3
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python }}

- name: Setup PDM
uses: pdm-project/setup-pdm@v4
with:
python-version: ${{ matrix.python }}
cache: true

- name: Install dependencies
run: |
sudo apt update && sudo apt install --yes libgl1-mesa-dev
pdm lock
pdm install --dev
- name: Set PYTHONPATH
run: echo "PYTHONPATH=${{ github.workspace }}/src" >> $GITHUB_ENV

- name: Test with pytest
run: |
pdm run make test-cicd
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v4.0.2
with:
token: ${{ secrets.CODECOV_TOKEN }}
slug: sysid/prepembd

lint:
runs-on: ubuntu-latest

strategy:
matrix:
python: [ '3.10', '3.11', '3.12' ]

steps:
- uses: actions/checkout@v3
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python }}

- name: Setup PDM
uses: pdm-project/setup-pdm@v4
with:
python-version: ${{ matrix.python }}
cache: true

- name: Install dependencies
run: |
pdm lock
pdm install --dev
# - name: mypy
# run: |
# pdm run mypy --python-version=${{ matrix.python }} src/

- name: lint
run: |
pdm run make lint
- name: format
run: |
pdm run make format-check
- name: isort
run: |
pdm run isort . --check --diff
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -160,3 +160,4 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
report.xml
5 changes: 5 additions & 0 deletions .idea/runConfigurations/rsenv.sh

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

50 changes: 28 additions & 22 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ MAKE = make
VERSION = $(shell cat VERSION)

app_root := $(if $(PROJ_DIR),$(PROJ_DIR),$(CURDIR))
pkg_src = $(app_root)/prepembd
pkg_src = $(app_root)/src/prepembd
tests_src = $(app_root)/tests

.PHONY: all
Expand All @@ -20,9 +20,10 @@ all: clean build upload ## Build and upload
# Building, Deploying \
BUILDING: ## ############################################################
.PHONY: build
build: clean format isort ## format and build
build: clean format sort-imports ## format and build
@echo "building"
python -m build
#python -m build
pdm build

.PHONY: publish
publish: ## publish
Expand Down Expand Up @@ -78,39 +79,44 @@ create-release: ## create a release on GitHub via the gh cli
################################################################################
# Testing \
TESTING: ## ############################################################
.PHONY: test-unit
test-unit: ## run unit tests
pdm run python -m pytest -ra --junitxml=report.xml --cov-config=pyproject.toml --cov-report=xml --cov-report term --cov=$(pkg_src) tests/

.PHONY: test
test: ## run tests
python -m pytest -ra --junitxml=report.xml --cov-config=pyproject.toml --cov-report=xml --cov-report term --cov=$(pkg_src) tests/
test: test-unit ## run all tests


.PHONY: tox
tox: ## Run tox
tox
.PHONY: test-cicd
test-cicd: test-unit ## run cicd tests

################################################################################
# Code Quality \
QUALITY: ## ############################################################
.PHONY: style
style: isort format ## perform code style format (black, isort)

.PHONY: format
format: ## perform black formatting
black $(pkg_src) tests
format: ## perform ruff formatting
@ruff format $(pkg_src) $(tests_src)

.PHONY: isort
isort: ## apply import sort ordering
isort . --profile black
.PHONY: format-check
format-check: ## check ruff formatting without modifying files
@ruff format --check $(pkg_src) $(tests_src)

.PHONY: lint
lint: flake8 mypy ## lint code with all static code checks
.PHONY: sort-imports
sort-imports: ## apply import sort ordering
isort $(pkg_src) $(tests_src) --profile black

.PHONY: flake8
flake8: ## check style with flake8
@flake8 $(pkg_src)
.PHONY: style
style: sort-imports format ## perform code style format (ruff format, isort)

.PHONY: lint
lint: ## check style with ruff
@ruff $(pkg_src) $(tests_src)

.PHONY: mypy
mypy: ## check type hint annotations
# keep config in pyproject.toml for integration with PyCharm
mypy --config-file pyproject.toml $(pkg_src)
#@mypy --config-file pyproject.toml $(pkg_src)
@mypy --config-file pyproject.toml --install-types --non-interactive $(pkg_src)

################################################################################
# Clean \
Expand Down
33 changes: 33 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1 +1,34 @@
# prepembd

[![PyPi](https://img.shields.io/pypi/v/prepembd)](https://pypi.org/project/prepembd)
[![Tests CI](https://img.shields.io/github/actions/workflow/status/sysid/prepembd/test.yml?branch=main)](https://github.com/sysid/prepembd/actions/workflows/test.yml)
[![Codecov](https://codecov.io/gh/sysid/prepembd/branch/main/graph/badge.svg?token=8IL9MN4FK5)](https://codecov.io/gh/sysid/prepembd)


## Installation

Install **prepembd**:

```shell
python3 -m pip install prepembd --upgrade
```

### Requirements

- [Python](https://www.python.org/) >= 3.10

## Why

I've been using Markdown for a long time to take notes in every possible scenario. I even manage my Anki cards with Markdown ([inka2](https://github.com/sysid/inka2)), so finding relevant information again is paramount.
With the advent of semantic search via embeddings, search has become so much more powerful. However, to create the
embeddings from Markdown, the files have to be prepared in order to reduce noise and create the correct chunk sizes.

This Python script automates the process and creates a JSON representation of all the Markdown files, which can then be fed into an embedding model. It is basically just a thin wrapper around LangChain combined with some bespoke filters to eliminate noise.


## Usage
```bash
prepembd tokenize <directory> | tee -a output.ndjson
```

This script integrates particularly well with [bkmr](https://github.com/sysid/bkmr).
39 changes: 38 additions & 1 deletion pdm.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

16 changes: 9 additions & 7 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ dependencies = [
"unstructured>=0.11.8",
"nltk>=3.8.1",
"typer>=0.9.0",
"markdown>=3.6",
]
requires-python = ">=3.10"
readme = "README.md"
Expand All @@ -31,10 +32,17 @@ dev = [
"bump-my-version>=0.19.0",
"mypy>=1.9.0",
"isort>=5.13.2",
"ruff>=0.3.3",
]

[tool.pdm.build]
excludes = ["./**/.git"]
package-dir = "src"
includes = ["src/prepembd"]
source-includes = ["tests", "CHANGELOG.md", "LICENSE", "README.md", "tox.ini"]

[project.scripts]
tokenize = "prepembd.bin.tokenize:app"
prepembd = "prepembd.bin.tokenize:app"

[tool.bumpversion]
current_version = "0.1.1"
Expand Down Expand Up @@ -76,12 +84,6 @@ filename = "pyproject.toml"
[[tool.bumpversion.files]]
filename = "src/prepembd/__init__.py"

[tool.pdm.build]
excludes = ["./**/.git"]
package-dir = "src"
includes = ["src/prepembd"]
source-includes = ["tests", "CHANGELOG.md", "LICENSE", "README.md", "tox.ini"]

[tool.pytest.ini_options]
markers = [
"e2e: marks tests as e2e tests, not to be run in CICD"
Expand Down
Loading

0 comments on commit 4ac517a

Please sign in to comment.