This repository has been archived by the owner on Sep 12, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 95
add markdown reader and example code #1
Merged
Merged
Changes from 17 commits
Commits
Show all changes
29 commits
Select commit
Hold shift + click to select a range
b15b9f0
Initial commit
SeeknnDestroy 0eb5d54
Removed test
SeeknnDestroy 5ee0d27
Update name to playground
SeeknnDestroy 5b0558f
Update readme
SeeknnDestroy 8abcb27
Initial commit
SeeknnDestroy fc8ef14
Minor fix
SeeknnDestroy 9e89cd7
Improve modularity
SeeknnDestroy de8cfe2
switch to gitpython
SeeknnDestroy a55aa3f
Minor update
SeeknnDestroy b1fd66c
Add configurables
SeeknnDestroy e2ebe6e
Add logging
SeeknnDestroy 6f9db5e
Add llama_utils
SeeknnDestroy 91e2bbc
Add update_index_for_changed_files
SeeknnDestroy 786d888
Delete file_hashes
SeeknnDestroy a1bdfcc
Update naming conventions
SeeknnDestroy 226d8ea
Add comments
SeeknnDestroy a67aa5d
delete data/
SeeknnDestroy 3412596
Major update
SeeknnDestroy bb259c1
Markdownreader class inherited
SeeknnDestroy 008b04c
update gitignore
SeeknnDestroy 1859283
delete extra info from util function
SeeknnDestroy aee0aaa
Another major update
SeeknnDestroy b7a7943
Minor fix
SeeknnDestroy 85da070
change playground to fastapi app
SeeknnDestroy 02b4ebd
Initial fastapi app implementation
SeeknnDestroy ec24f69
add fastapi docs
SeeknnDestroy 28a8692
Modify endpoint name
SeeknnDestroy b83af21
Update route name
SeeknnDestroy 3f25e46
update similarity_top_k to 10
SeeknnDestroy File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,17 @@ | ||
# doc-qa-ai | ||
# doc-qa-ai | ||
|
||
## Setup | ||
|
||
### Install dependencies | ||
|
||
```bash | ||
pip install -r requirements.txt | ||
``` | ||
|
||
### Sample .env file | ||
|
||
```bash | ||
OPENAI_API_KEY=<your-openai-api-key> | ||
GIT_REPO_URL="https://github.com/ultralytics/ultralytics.git" | ||
GIT_REPO_PATH="./ultralytics" | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
import logging | ||
from git import Repo | ||
from pathlib import Path | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
def clone_or_pull_repository(git_url: str, local_path: Path) -> None:
    """
    Ensure a local checkout of the repository exists at ``local_path``.

    If the path is already present it is assumed to be a clone of the
    repository and the latest changes are pulled from ``origin``; otherwise
    a fresh clone is created.

    Parameters:
        git_url (str): The URL of the Git repository.
        local_path (Path): The local path where the repository will be cloned or updated.
    """
    if not local_path.exists():
        logger.info(f"Cloning repository from {git_url} to {local_path}")
        Repo.clone_from(git_url, str(local_path))
        return

    # NOTE(review): assumes an existing directory is a valid git clone;
    # Repo() raises if it is not — confirm callers guarantee this.
    logger.info(f"Updating existing repository at {local_path}")
    Repo(str(local_path)).remotes.origin.pull()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
import logging | ||
import hashlib | ||
from pathlib import Path | ||
from typing import List, Dict | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
def get_md5(file_path: Path) -> str: | ||
""" | ||
Compute the MD5 hash of a file. | ||
|
||
Parameters: | ||
file_path (Path): The path to the file. | ||
|
||
Returns: | ||
str: The MD5 hash of the file. | ||
""" | ||
hasher = hashlib.md5() | ||
with open(file_path, "rb") as f: | ||
for chunk in iter(lambda: f.read(4096), b""): | ||
hasher.update(chunk) | ||
return hasher.hexdigest() | ||
|
||
def load_last_hashes(hash_file: Path) -> Dict[str, str]:
    """
    Load the last known hashes from a file.

    Each line of the hash file is ``<file path> <md5 digest>``. Blank or
    malformed lines are skipped instead of crashing, and the split is done
    from the right so file paths containing spaces survive a round-trip
    (the digest itself never contains whitespace).

    Parameters:
        hash_file (Path): The path to the hash file.

    Returns:
        Dict[str, str]: A dictionary mapping file paths to their MD5 hashes.
        Empty if the hash file does not exist.
    """
    if not hash_file.exists():
        return {}
    hashes: Dict[str, str] = {}
    with open(hash_file, "r") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue  # tolerate blank lines (original raised IndexError)
            # BUG FIX: line.split()[0]/[1] truncated any path with spaces;
            # rpartition splits at the last space only.
            path, _, digest = line.rpartition(" ")
            if path:  # skip lines with no separator at all
                hashes[path] = digest
    return hashes
|
||
def save_current_hashes(current_hashes: Dict[str, str], hash_file: Path) -> None:
    """
    Save the current hashes to a file, one ``<file path> <digest>`` per line.

    Parameters:
        current_hashes (Dict[str, str]): A dictionary mapping file paths to their MD5 hashes.
        hash_file (Path): The path to the hash file.
    """
    lines = [f"{path} {digest}\n" for path, digest in current_hashes.items()]
    with open(hash_file, "w") as out:
        out.writelines(lines)
|
||
def check_for_changes(markdown_files: List[Path], hash_file: Path = Path("file_hashes.txt")) -> List[Path]:
    """
    Check for file changes based on their MD5 hashes.

    A file counts as changed when it is absent from the stored hash file or
    its current digest differs from the stored one. The hash file is
    rewritten with the current digests as a side effect.

    Parameters:
        markdown_files (List[Path]): List of markdown files to check.
        hash_file (Path): The path to the hash file.

    Returns:
        List[Path]: List of changed files.
    """
    previous = load_last_hashes(hash_file)
    current: Dict[str, str] = {}
    changed: List[Path] = []

    for path in markdown_files:
        digest = get_md5(path)
        key = str(path)
        current[key] = digest
        # .get() returns None for unseen files, which never equals a digest.
        if previous.get(key) != digest:
            changed.append(path)

    logger.info(f"Found {len(changed)} changed files.")

    save_current_hashes(current, hash_file)
    return changed
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
from pathlib import Path | ||
from typing import List | ||
|
||
from llama_index.vector_stores.types import ExactMatchFilter, MetadataFilters | ||
from llama_index import VectorStoreIndex, Document | ||
|
||
|
||
def delete_docs_from_changed_files(index: VectorStoreIndex, changed_files: List[Path]):
    """
    Delete all documents in the index that belong to the list of changed files.

    Parameters:
        index (VectorStoreIndex): The index object.
        changed_files (List[Path]): List of changed markdown files.
    """
    for file in changed_files:
        # Match only nodes whose metadata records this file as their origin.
        filters = MetadataFilters(filters=[ExactMatchFilter(key="original_file_path", value=str(file))])
        retriever = index.as_retriever(filters=filters)

        # Retrieve the documents to delete
        # NOTE(review): relies on an empty query returning the filtered nodes;
        # confirm this yields ALL nodes for the file, not just the top-k.
        docs_to_delete = retriever.retrieve("")

        for doc in docs_to_delete:
            # NOTE(review): node.id_ is a node id — verify it matches the
            # ref-doc id that delete_ref_doc expects.
            index.delete_ref_doc(doc.node.id_, delete_from_docstore=True)
|
||
|
||
def update_index_for_changed_files(index: VectorStoreIndex, changed_files: List[Path], markdown_reader) -> None:
    """
    Delete old documents and insert new ones for the changed files.

    Parameters:
        index (VectorStoreIndex): The current Llama VectorStoreIndex.
        changed_files (List[Path]): List of changed markdown files.
        markdown_reader: An instance of MarkdownReader class.
    """
    # Drop every header-doc that originated from one of the changed files.
    delete_docs_from_changed_files(index, changed_files)

    # Re-read each changed file and insert its fresh header-docs, tagging
    # each with its origin path so it can be found (and deleted) later.
    for path in changed_files:
        fresh_docs = markdown_reader.load_data(path, extra_info={"original_file_path": str(path)})
        for doc in fresh_docs:
            index.insert(Document(text=doc.text, metadata=doc.metadata))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
import logging | ||
from multi_markdown_reader import MultiMarkdownReader | ||
from pathlib import Path | ||
from typing import List | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
def process_and_get_header_docs(folder_path: Path) -> list:
    """
    Process markdown files to extract "header-documents."

    Parameters:
        folder_path (Path): Path to the folder containing markdown files.

    Returns:
        list: List of processed "header-documents."
    """
    reader = MultiMarkdownReader()
    return reader.load_data_from_folder(folder_path)
|
||
|
||
def get_markdown_files(repo_path: Path, docs_folder: Path = Path("docs")) -> List[Path]:
    """
    Get all markdown files in the docs folder of a Git repository.

    Parameters:
        repo_path (Path): The path to the Git repository.
        docs_folder (Path): The path to the docs folder within the repository. Defaults to "docs".

    Returns:
        List[Path]: List of Paths to all markdown files.
    """
    search_root = repo_path / docs_folder
    # rglob("*.md") is equivalent to glob("**/*.md"): a recursive search.
    markdown_files = list(search_root.rglob("*.md"))
    logger.info(f"Found {len(markdown_files)} markdown files.")
    return markdown_files
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,126 @@ | ||
"""Markdown Reader. | ||
|
||
A parser for md files. | ||
|
||
""" | ||
import re | ||
from pathlib import Path | ||
from typing import Any, Dict, List, Optional, Tuple, cast | ||
|
||
from llama_index.readers.base import BaseReader | ||
from llama_index.schema import Document | ||
|
||
|
||
class MarkdownReader(BaseReader):
    """Markdown parser.

    Extract text from markdown files.
    Produces one (header, text) section per header, which load_data turns
    into one Document per section.

    """

    def __init__(
        self,
        *args: Any,
        remove_hyperlinks: bool = True,
        remove_images: bool = True,
        **kwargs: Any,
    ) -> None:
        """Init params.

        Parameters:
            remove_hyperlinks (bool): Replace ``[text](url)`` links with just the text.
            remove_images (bool): Strip ``![[...]]`` image embeds.
        """
        super().__init__(*args, **kwargs)
        self._remove_hyperlinks = remove_hyperlinks
        self._remove_images = remove_images

    def markdown_to_tups(self, markdown_text: str) -> List[Tuple[Optional[str], str]]:
        """Convert markdown text into (header, body) tuples.

        Headers are returned with their ``#`` marks stripped; text before the
        first header gets a ``None`` header. Sections whose body is empty are
        dropped (except the final section, appended unconditionally as in the
        original). HTML tags are removed from body text.
        """
        markdown_tups: List[Tuple[Optional[str], str]] = []
        current_header: Optional[str] = None
        current_text = ""

        for line in markdown_text.split("\n"):
            if re.match(r"^#+\s", line):
                # Flush the finished section unless its body is empty.
                # BUG FIX: the original `continue`d on empty sections, which
                # also skipped `current_header = line`, so the new header was
                # lost and its body credited to the previous header. (The
                # original condition `current_text == "" or None` was also a
                # no-op `or` — `or None` is always falsy.)
                if current_header is not None and current_text != "":
                    markdown_tups.append((current_header, current_text))
                current_header = line
                current_text = ""
            else:
                current_text += line + "\n"
        markdown_tups.append((current_header, current_text))

        # Normalize every tuple: strip '#' marks from real headers, leave a
        # None header (pre-header text) alone, and drop HTML tags from bodies.
        # BUG FIX: the original cast EVERY key to str whenever at least one
        # header was seen, so a leading (None, text) tuple made
        # re.sub(..., None) raise TypeError.
        markdown_tups = [
            (
                key if key is None else re.sub(r"#", "", key).strip(),
                re.sub(r"<.*?>", "", value),
            )
            for key, value in markdown_tups
        ]

        return markdown_tups

    def remove_images(self, content: str) -> str:
        """Remove obsidian-style image embeds (``![[...]]``) from *content*."""
        pattern = r"!{1}\[\[(.*)\]\]"
        return re.sub(pattern, "", content)

    def remove_hyperlinks(self, content: str) -> str:
        """Replace markdown links ``[text](url)`` with just ``text``."""
        pattern = r"\[(.*?)\]\((.*?)\)"
        return re.sub(pattern, r"\1", content)

    def parse_tups(
        self, filepath: Path, content: Optional[str] = None, errors: str = "ignore"
    ) -> List[Tuple[Optional[str], str]]:
        """Parse file into (header, body) tuples.

        If content is provided, use that instead of reading from file.

        NOTE(review): `errors` is accepted but never passed to open();
        kept only for interface compatibility.
        """
        if content is None:
            with open(filepath, "r", encoding="utf-8") as f:
                content = f.read()
        if self._remove_hyperlinks:
            content = self.remove_hyperlinks(content)
        if self._remove_images:
            content = self.remove_images(content)
        return self.markdown_to_tups(content)

    def load_data(
        self,
        file: Path,
        extra_info: Optional[Dict] = None,
        content: Optional[str] = None,
    ) -> List[Document]:
        """Parse a markdown file into one Document per header section.

        If content is provided, use that instead of reading from file.

        Parameters:
            file (Path): Path to the markdown file; recorded in each
                Document's metadata under "original_file_path".
            extra_info (Optional[Dict]): Extra metadata merged into each
                Document (may override the file-path entry).
            content (Optional[str]): Pre-loaded markdown text, if any.

        Returns:
            List[Document]: One Document per (header, body) tuple.
        """
        tups = self.parse_tups(file, content=content)

        # Add file path to metadata for tracking
        metadata = {"original_file_path": str(file)}

        # Merge with any additional metadata
        if extra_info:
            metadata.update(extra_info)

        # BUG FIX: copy the metadata per Document; the original shared ONE
        # dict across all Documents, so mutating it through any Document
        # silently changed every other Document's metadata.
        return [
            Document(
                text=f"\n\n{header}\n{value}" if header else value,
                metadata=dict(metadata),
            )
            for header, value in tups
        ]
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Are we writing this to local disk just for testing purposes right now? At some point we need to update it to work once we deploy to somewhere like Modal.