This repository has been archived by the owner on Sep 12, 2024. It is now read-only.

add markdown reader and example code #1

Merged
merged 29 commits on Oct 2, 2023
Changes from 17 commits

29 commits
b15b9f0
Initial commit
SeeknnDestroy Sep 21, 2023
0eb5d54
Removed test
SeeknnDestroy Sep 21, 2023
5ee0d27
Update name to playground
SeeknnDestroy Sep 22, 2023
5b0558f
Update readme
SeeknnDestroy Sep 22, 2023
8abcb27
Initial commit
SeeknnDestroy Sep 28, 2023
fc8ef14
Minor fix
SeeknnDestroy Sep 28, 2023
9e89cd7
Improve modularity
SeeknnDestroy Sep 28, 2023
de8cfe2
switch to gitpython
SeeknnDestroy Sep 28, 2023
a55aa3f
Minor update
SeeknnDestroy Sep 28, 2023
b1fd66c
Add configurables
SeeknnDestroy Sep 28, 2023
e2ebe6e
Add logging
SeeknnDestroy Sep 28, 2023
6f9db5e
Add llama_utils
SeeknnDestroy Sep 29, 2023
91e2bbc
Add update_index_for_changed_files
SeeknnDestroy Sep 29, 2023
786d888
Delete file_hashes
SeeknnDestroy Sep 29, 2023
a1bdfcc
Update naming conventions
SeeknnDestroy Sep 29, 2023
226d8ea
Add comments
SeeknnDestroy Sep 29, 2023
a67aa5d
delete data/
SeeknnDestroy Sep 29, 2023
3412596
Major update
SeeknnDestroy Sep 30, 2023
bb259c1
Markdownreader class inherited
SeeknnDestroy Sep 30, 2023
008b04c
update gitignore
SeeknnDestroy Sep 30, 2023
1859283
delete extra info from util function
SeeknnDestroy Sep 30, 2023
aee0aaa
Another major update
SeeknnDestroy Sep 30, 2023
b7a7943
Minor fix
SeeknnDestroy Sep 30, 2023
85da070
change playground to fastapi app
SeeknnDestroy Oct 2, 2023
02b4ebd
Initial fastapi app implementation
SeeknnDestroy Oct 2, 2023
ec24f69
add fastapi docs
SeeknnDestroy Oct 2, 2023
28a8692
Modify endpoint name
SeeknnDestroy Oct 2, 2023
b83af21
Update route name
SeeknnDestroy Oct 2, 2023
3f25e46
update similartiy_top_k to 10
SeeknnDestroy Oct 2, 2023
7 changes: 7 additions & 0 deletions .gitignore
@@ -158,3 +158,10 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

# Ignore the cloned Ultralytics repository
/ultralytics/
# For testing
testing.py
# Hash storage file
file_hashes.txt
18 changes: 17 additions & 1 deletion README.md
@@ -1 +1,17 @@
# doc-qa-ai

## Setup

### Install dependencies

```bash
pip install -r requirements.txt
```

### Sample .env file

```bash
OPENAI_API_KEY=<your-openai-api-key>
GIT_REPO_URL="https://github.com/ultralytics/ultralytics.git"
GIT_REPO_PATH="./ultralytics"
```
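The app is expected to read these values at startup. A minimal sketch of that step, assuming `python-dotenv` is among the installed requirements (the `load_settings` helper is illustrative, not part of this PR):

```python
import os
from pathlib import Path

from dotenv import load_dotenv  # provided by python-dotenv


def load_settings() -> dict:
    """Load the sample .env values into a settings dict."""
    load_dotenv()  # copies the .env entries into os.environ
    return {
        "openai_api_key": os.environ["OPENAI_API_KEY"],
        "git_repo_url": os.environ["GIT_REPO_URL"],
        "git_repo_path": Path(os.environ["GIT_REPO_PATH"]),
    }
```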
21 changes: 21 additions & 0 deletions git_utils.py
@@ -0,0 +1,21 @@
import logging
from git import Repo
from pathlib import Path

logger = logging.getLogger(__name__)

def clone_or_pull_repository(git_url: str, local_path: Path) -> None:
"""
Clone a Git repository or pull latest changes if it already exists.

Parameters:
git_url (str): The URL of the Git repository.
local_path (Path): The local path where the repository will be cloned or updated.
"""
if local_path.exists():
logger.info(f"Updating existing repository at {local_path}")
repo = Repo(str(local_path))
repo.remotes.origin.pull()
else:
logger.info(f"Cloning repository from {git_url} to {local_path}")
Repo.clone_from(git_url, str(local_path))
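A usage sketch for `clone_or_pull_repository`, wired to the variable names from the sample `.env` above (the fallback values are for illustration only):

```python
import os
from pathlib import Path

from git_utils import clone_or_pull_repository

# Same values as the sample .env; fallbacks are illustrative.
git_url = os.environ.get("GIT_REPO_URL", "https://github.com/ultralytics/ultralytics.git")
local_path = Path(os.environ.get("GIT_REPO_PATH", "./ultralytics"))

# Clones on the first run, pulls on every run after that.
clone_or_pull_repository(git_url, local_path)
```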
75 changes: 75 additions & 0 deletions hash_utils.py
@@ -0,0 +1,75 @@
import logging
import hashlib
from pathlib import Path
from typing import List, Dict

logger = logging.getLogger(__name__)

def get_md5(file_path: Path) -> str:
"""
Compute the MD5 hash of a file.

Parameters:
file_path (Path): The path to the file.

Returns:
str: The MD5 hash of the file.
"""
hasher = hashlib.md5()
with open(file_path, "rb") as f:
for chunk in iter(lambda: f.read(4096), b""):
hasher.update(chunk)
return hasher.hexdigest()

def load_last_hashes(hash_file: Path) -> Dict[str, str]:
"""
Load the last known hashes from a file.

Parameters:
hash_file (Path): The path to the hash file.

Returns:
Dict[str, str]: A dictionary mapping file paths to their MD5 hashes.
"""
if hash_file.exists():
with open(hash_file, 'r') as f:
return {line.split()[0]: line.split()[1] for line in f.readlines()}
return {}

def save_current_hashes(current_hashes: Dict[str, str], hash_file: Path) -> None:
"""
Save the current hashes to a file.

Parameters:
current_hashes (Dict[str, str]): A dictionary mapping file paths to their MD5 hashes.
hash_file (Path): The path to the hash file.
"""
with open(hash_file, 'w') as f:
for file, hash in current_hashes.items():
f.write(f"{file} {hash}\n")

def check_for_changes(markdown_files: List[Path], hash_file: Path = Path("file_hashes.txt")) -> List[Path]:
"""
Check for file changes based on their MD5 hashes.

Parameters:
markdown_files (List[Path]): List of markdown files to check.
hash_file (Path): The path to the hash file.

Returns:
List[Path]: List of changed files.
"""
last_hashes = load_last_hashes(hash_file)
current_hashes = {}
changed_files = []

for file in markdown_files:
current_hash = get_md5(file)
current_hashes[str(file)] = current_hash
if str(file) not in last_hashes or last_hashes[str(file)] != current_hash:
changed_files.append(file)

logger.info(f"Found {len(changed_files)} changed files.")

save_current_hashes(current_hashes, hash_file)
    return changed_files

Collaborator comment: Are we writing this to a local file just for testing right now? At some point we need to update it so it still works once we deploy somewhere like Modal.
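A minimal sketch of the change-detection flow these helpers support. Note that `check_for_changes` rewrites `file_hashes.txt` as a side effect, so a second run reports only files modified in between (paths are illustrative):

```python
from pathlib import Path

from hash_utils import check_for_changes

# Watch the markdown docs of the cloned repository (path as in the sample .env).
markdown_files = list(Path("./ultralytics/docs").glob("**/*.md"))

# First run: no hash file exists yet, so every file counts as changed.
changed = check_for_changes(markdown_files)
print(f"{len(changed)} markdown files changed since the last run")
```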
45 changes: 45 additions & 0 deletions llama_utils.py
@@ -0,0 +1,45 @@
from pathlib import Path
from typing import List

from llama_index.vector_stores.types import ExactMatchFilter, MetadataFilters
from llama_index import VectorStoreIndex, Document


def delete_docs_from_changed_files(index: VectorStoreIndex, changed_files: List[Path]):
"""
Delete all documents in the index that belong to the list of changed files.

Parameters:
        index (VectorStoreIndex): The index object.
changed_files (List[Path]): List of changed markdown files.
"""
for file in changed_files:
filters = MetadataFilters(filters=[ExactMatchFilter(key="original_file_path", value=str(file))])
retriever = index.as_retriever(filters=filters)

# Retrieve the documents to delete
docs_to_delete = retriever.retrieve("")

for doc in docs_to_delete:
index.delete_ref_doc(doc.node.id_, delete_from_docstore=True)


def update_index_for_changed_files(index: VectorStoreIndex, changed_files: List[Path], markdown_reader) -> None:
"""
Delete old documents and insert new ones for the changed files.

Parameters:
index (VectorStoreIndex): The current Llama VectorStoreIndex.
changed_files (List[Path]): List of changed markdown files.
markdown_reader: An instance of MarkdownReader class.
"""
# Delete old header-docs for changed files
delete_docs_from_changed_files(index, changed_files)

# Process the updated markdown files and insert new header-docs
for file in changed_files:
extra_info = {"original_file_path": str(file)}
new_documents = markdown_reader.load_data(file, extra_info=extra_info)

for doc in new_documents:
index.insert(Document(text=doc.text, metadata=doc.metadata))
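A sketch of how these helpers might be driven after a pull, assuming a `VectorStoreIndex` already exists and `OPENAI_API_KEY` is set for embeddings (the placeholder document and file path are illustrative):

```python
from pathlib import Path

from llama_index import Document, VectorStoreIndex

from llama_utils import update_index_for_changed_files
from markdown_reader import MarkdownReader

# In the real flow the index is built from the header-docs produced by
# MultiMarkdownReader; a placeholder document keeps this sketch runnable.
index = VectorStoreIndex.from_documents([Document(text="placeholder")])

changed_files = [Path("./ultralytics/docs/README.md")]  # illustrative

# Drop the stale header-docs for each changed file, re-parse, and insert.
update_index_for_changed_files(index, changed_files, MarkdownReader())
```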
37 changes: 37 additions & 0 deletions markdown_processing.py
@@ -0,0 +1,37 @@
import logging
from multi_markdown_reader import MultiMarkdownReader
from pathlib import Path
from typing import List

logger = logging.getLogger(__name__)

def process_and_get_header_docs(folder_path: Path) -> list:
"""
Process markdown files to extract "header-documents."

Parameters:
folder_path (Path): Path to the folder containing markdown files.

Returns:
list: List of processed "header-documents."
"""
markdown_reader = MultiMarkdownReader()
documents = markdown_reader.load_data_from_folder(folder_path)
return documents


def get_markdown_files(repo_path: Path, docs_folder: Path = Path("docs")) -> List[Path]:
"""
Get all markdown files in the docs folder of a Git repository.

Parameters:
repo_path (Path): The path to the Git repository.
docs_folder (Path): The path to the docs folder within the repository. Defaults to "docs".

Returns:
List[Path]: List of Paths to all markdown files.
"""
docs_path = repo_path / docs_folder
markdown_files = list(docs_path.glob('**/*.md'))
logger.info(f"Found {len(markdown_files)} markdown files.")
return markdown_files
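A usage sketch combining the two helpers; the repository path matches `GIT_REPO_PATH` in the sample `.env`, and `MultiMarkdownReader.load_data_from_folder` is assumed to behave as its call site above suggests:

```python
from pathlib import Path

from markdown_processing import get_markdown_files, process_and_get_header_docs

repo_path = Path("./ultralytics")  # GIT_REPO_PATH from the sample .env

# All .md files under ./ultralytics/docs, searched recursively.
markdown_files = get_markdown_files(repo_path)

# One header-document per markdown section, ready for indexing.
documents = process_and_get_header_docs(repo_path / "docs")
print(f"{len(markdown_files)} files -> {len(documents)} header-docs")
```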
126 changes: 126 additions & 0 deletions markdown_reader.py
@@ -0,0 +1,126 @@
"""Markdown Reader.

A parser for md files.

"""
import re
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, cast

from llama_index.readers.base import BaseReader
from llama_index.schema import Document


class MarkdownReader(BaseReader):
"""Markdown parser.

Extract text from markdown files.
    Returns one Document per header, containing the text under that header.

"""

def __init__(
self,
*args: Any,
remove_hyperlinks: bool = True,
remove_images: bool = True,
**kwargs: Any,
) -> None:
"""Init params."""
super().__init__(*args, **kwargs)
self._remove_hyperlinks = remove_hyperlinks
self._remove_images = remove_images

def markdown_to_tups(self, markdown_text: str) -> List[Tuple[Optional[str], str]]:
"""Convert a markdown file to a dictionary.

The keys are the headers and the values are the text under each header.

"""
markdown_tups: List[Tuple[Optional[str], str]] = []
lines = markdown_text.split("\n")

current_header = None
current_text = ""

for line in lines:
header_match = re.match(r"^#+\s", line)
if header_match:
                # Flush the completed section; skip it when it has no body text.
                if current_header is not None and current_text != "":
                    markdown_tups.append((current_header, current_text))

current_header = line
current_text = ""
else:
current_text += line + "\n"
markdown_tups.append((current_header, current_text))

if current_header is not None:
# pass linting, assert keys are defined
markdown_tups = [
(re.sub(r"#", "", cast(str, key)).strip(), re.sub(r"<.*?>", "", value))
for key, value in markdown_tups
]
else:
markdown_tups = [
(key, re.sub("<.*?>", "", value)) for key, value in markdown_tups
]

return markdown_tups

def remove_images(self, content: str) -> str:
"""Get a dictionary of a markdown file from its path."""
pattern = r"!{1}\[\[(.*)\]\]"
content = re.sub(pattern, "", content)
return content

def remove_hyperlinks(self, content: str) -> str:
"""Get a dictionary of a markdown file from its path."""
pattern = r"\[(.*?)\]\((.*?)\)"
content = re.sub(pattern, r"\1", content)
return content

def parse_tups(
self, filepath: Path, content: Optional[str] = None, errors: str = "ignore"
) -> List[Tuple[Optional[str], str]]:
"""Parse file into tuples.
If content is provided, use that instead of reading from file."""
if content is None:
with open(filepath, "r", encoding="utf-8") as f:
content = f.read()
if self._remove_hyperlinks:
content = self.remove_hyperlinks(content)
if self._remove_images:
content = self.remove_images(content)
markdown_tups = self.markdown_to_tups(content)
return markdown_tups

def load_data(
self,
file: Path,
extra_info: Optional[Dict] = None,
content: Optional[str] = None,
) -> List[Document]:
"""Parse file into string.
If content is provided, use that instead of reading from file."""
tups = self.parse_tups(file, content=content)
results = []

# Add file path to metadata for tracking
metadata = {'original_file_path': str(file)}

# Merge with any additional metadata
if extra_info:
metadata.update(extra_info)

results = [
Document(
text=f"\n\n{header}\n{value}" if header else value,
metadata=metadata,
)
for header, value in tups
]

return results
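A small self-contained check of the reader's behavior; `load_data` accepts an in-memory `content` string, so no file needs to exist on disk (the sample markdown is illustrative):

```python
from pathlib import Path

from markdown_reader import MarkdownReader

sample = """# Title
Intro with a [link](https://example.com).

## Section
Section body text.
"""

reader = MarkdownReader()  # hyperlinks and images are stripped by default
docs = reader.load_data(Path("sample.md"), content=sample)

for doc in docs:
    # Each Document is one header plus the text under it,
    # tagged with the originating file path.
    print(doc.metadata["original_file_path"], "->", repr(doc.text))
```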