This repository has been archived by the owner on Sep 12, 2024. It is now read-only.

add markdown reader and example code #1

Merged
merged 29 commits on Oct 2, 2023
Changes from 17 commits

29 commits
b15b9f0
Initial commit
SeeknnDestroy Sep 21, 2023
0eb5d54
Removed test
SeeknnDestroy Sep 21, 2023
5ee0d27
Update name to playground
SeeknnDestroy Sep 22, 2023
5b0558f
Update readme
SeeknnDestroy Sep 22, 2023
8abcb27
Initial commit
SeeknnDestroy Sep 28, 2023
fc8ef14
Minor fix
SeeknnDestroy Sep 28, 2023
9e89cd7
Improve modularity
SeeknnDestroy Sep 28, 2023
de8cfe2
switch to gitpython
SeeknnDestroy Sep 28, 2023
a55aa3f
Minor update
SeeknnDestroy Sep 28, 2023
b1fd66c
Add configurables
SeeknnDestroy Sep 28, 2023
e2ebe6e
Add logging
SeeknnDestroy Sep 28, 2023
6f9db5e
Add llama_utils
SeeknnDestroy Sep 29, 2023
91e2bbc
Add update_index_for_changed_files
SeeknnDestroy Sep 29, 2023
786d888
Delete file_hashes
SeeknnDestroy Sep 29, 2023
a1bdfcc
Update naming conventions
SeeknnDestroy Sep 29, 2023
226d8ea
Add comments
SeeknnDestroy Sep 29, 2023
a67aa5d
delete data/
SeeknnDestroy Sep 29, 2023
3412596
Major update
SeeknnDestroy Sep 30, 2023
bb259c1
Markdownreader class inherited
SeeknnDestroy Sep 30, 2023
008b04c
update gitignore
SeeknnDestroy Sep 30, 2023
1859283
delete extra info from util function
SeeknnDestroy Sep 30, 2023
aee0aaa
Another major update
SeeknnDestroy Sep 30, 2023
b7a7943
Minor fix
SeeknnDestroy Sep 30, 2023
85da070
change playground to fastapi app
SeeknnDestroy Oct 2, 2023
02b4ebd
Initial fastapi app implementation
SeeknnDestroy Oct 2, 2023
ec24f69
add fastapi docs
SeeknnDestroy Oct 2, 2023
28a8692
Modify endpoint name
SeeknnDestroy Oct 2, 2023
b83af21
Update route name
SeeknnDestroy Oct 2, 2023
3f25e46
update similartiy_top_k to 10
SeeknnDestroy Oct 2, 2023
7 changes: 7 additions & 0 deletions .gitignore
@@ -158,3 +158,10 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

# Ignore the cloned Ultralytics repository
/ultralytics/
# For testing
testing.py
# Hash storage file
file_hashes.txt
18 changes: 17 additions & 1 deletion README.md
@@ -1 +1,17 @@
# doc-qa-ai

## Setup

### Install dependencies

```bash
pip install -r requirements.txt
```

### Sample .env file

```bash
OPENAI_API_KEY=<your-openai-api-key>
GIT_REPO_URL="https://github.com/ultralytics/ultralytics.git"
GIT_REPO_PATH="./ultralytics"
```
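The app is expected to read these values at startup. A minimal sketch of that step, assuming `python-dotenv` is among the installed requirements (the `load_settings` helper is illustrative, not part of this PR):

```python
import os
from pathlib import Path

from dotenv import load_dotenv  # provided by python-dotenv


def load_settings() -> dict:
    """Load the sample .env values into a settings dict."""
    load_dotenv()  # copies the .env entries into os.environ
    return {
        "openai_api_key": os.environ["OPENAI_API_KEY"],
        "git_repo_url": os.environ["GIT_REPO_URL"],
        "git_repo_path": Path(os.environ["GIT_REPO_PATH"]),
    }
```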
21 changes: 21 additions & 0 deletions git_utils.py
@@ -0,0 +1,21 @@
import logging
from git import Repo
from pathlib import Path

logger = logging.getLogger(__name__)

def clone_or_pull_repository(git_url: str, local_path: Path) -> None:
"""
Clone a Git repository or pull latest changes if it already exists.

Parameters:
git_url (str): The URL of the Git repository.
local_path (Path): The local path where the repository will be cloned or updated.
"""
if local_path.exists():
logger.info(f"Updating existing repository at {local_path}")
repo = Repo(str(local_path))
repo.remotes.origin.pull()
else:
logger.info(f"Cloning repository from {git_url} to {local_path}")
Repo.clone_from(git_url, str(local_path))
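A usage sketch for `clone_or_pull_repository`, wired to the variable names from the sample `.env` above (the fallback values are for illustration only):

```python
import os
from pathlib import Path

from git_utils import clone_or_pull_repository

# Same values as the sample .env; fallbacks are illustrative.
git_url = os.environ.get("GIT_REPO_URL", "https://github.com/ultralytics/ultralytics.git")
local_path = Path(os.environ.get("GIT_REPO_PATH", "./ultralytics"))

# Clones on the first run, pulls on every run after that.
clone_or_pull_repository(git_url, local_path)
```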
75 changes: 75 additions & 0 deletions hash_utils.py
@@ -0,0 +1,75 @@
import logging
import hashlib
from pathlib import Path
from typing import List, Dict

logger = logging.getLogger(__name__)

def get_md5(file_path: Path) -> str:
"""
Compute the MD5 hash of a file.

Parameters:
file_path (Path): The path to the file.

Returns:
str: The MD5 hash of the file.
"""
hasher = hashlib.md5()
with open(file_path, "rb") as f:
for chunk in iter(lambda: f.read(4096), b""):
hasher.update(chunk)
return hasher.hexdigest()

def load_last_hashes(hash_file: Path) -> Dict[str, str]:
"""
Load the last known hashes from a file.

Parameters:
hash_file (Path): The path to the hash file.

Returns:
Dict[str, str]: A dictionary mapping file paths to their MD5 hashes.
"""
if hash_file.exists():
with open(hash_file, 'r') as f:
return {line.split()[0]: line.split()[1] for line in f.readlines()}
return {}

def save_current_hashes(current_hashes: Dict[str, str], hash_file: Path) -> None:
"""
Save the current hashes to a file.

Parameters:
current_hashes (Dict[str, str]): A dictionary mapping file paths to their MD5 hashes.
hash_file (Path): The path to the hash file.
"""
with open(hash_file, 'w') as f:
for file, hash in current_hashes.items():
f.write(f"{file} {hash}\n")

def check_for_changes(markdown_files: List[Path], hash_file: Path = Path("file_hashes.txt")) -> List[Path]:
"""
Check for file changes based on their MD5 hashes.

Parameters:
markdown_files (List[Path]): List of markdown files to check.
hash_file (Path): The path to the hash file.

Returns:
List[Path]: List of changed files.
"""
last_hashes = load_last_hashes(hash_file)
current_hashes = {}
changed_files = []

for file in markdown_files:
current_hash = get_md5(file)
current_hashes[str(file)] = current_hash
if str(file) not in last_hashes or last_hashes[str(file)] != current_hash:
changed_files.append(file)

logger.info(f"Found {len(changed_files)} changed files.")

save_current_hashes(current_hashes, hash_file)
    return changed_files

Collaborator comment: Are we writing this to a local file just for testing right now? At some point we need to update it so it still works once we deploy somewhere like Modal.
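A minimal sketch of the change-detection flow these helpers support. Note that `check_for_changes` rewrites `file_hashes.txt` as a side effect, so a second run reports only files modified in between (paths are illustrative):

```python
from pathlib import Path

from hash_utils import check_for_changes

# Watch the markdown docs of the cloned repository (path as in the sample .env).
markdown_files = list(Path("./ultralytics/docs").glob("**/*.md"))

# First run: no hash file exists yet, so every file counts as changed.
changed = check_for_changes(markdown_files)
print(f"{len(changed)} markdown files changed since the last run")
```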
45 changes: 45 additions & 0 deletions llama_utils.py
@@ -0,0 +1,45 @@
from pathlib import Path
from typing import List

from llama_index.vector_stores.types import ExactMatchFilter, MetadataFilters
from llama_index import VectorStoreIndex, Document


def delete_docs_from_changed_files(index: VectorStoreIndex, changed_files: List[Path]):
"""
Delete all documents in the index that belong to the list of changed files.

Parameters:
        index (VectorStoreIndex): The index object.
changed_files (List[Path]): List of changed markdown files.
"""
for file in changed_files:
filters = MetadataFilters(filters=[ExactMatchFilter(key="original_file_path", value=str(file))])
retriever = index.as_retriever(filters=filters)

# Retrieve the documents to delete
docs_to_delete = retriever.retrieve("")

for doc in docs_to_delete:
index.delete_ref_doc(doc.node.id_, delete_from_docstore=True)


def update_index_for_changed_files(index: VectorStoreIndex, changed_files: List[Path], markdown_reader) -> None:
"""
Delete old documents and insert new ones for the changed files.

Parameters:
index (VectorStoreIndex): The current Llama VectorStoreIndex.
changed_files (List[Path]): List of changed markdown files.
markdown_reader: An instance of MarkdownReader class.
"""
# Delete old header-docs for changed files
delete_docs_from_changed_files(index, changed_files)

# Process the updated markdown files and insert new header-docs
for file in changed_files:
extra_info = {"original_file_path": str(file)}
new_documents = markdown_reader.load_data(file, extra_info=extra_info)

for doc in new_documents:
index.insert(Document(text=doc.text, metadata=doc.metadata))
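A sketch of how these helpers might be driven after a pull, assuming a `VectorStoreIndex` already exists and `OPENAI_API_KEY` is set for embeddings (the placeholder document and file path are illustrative):

```python
from pathlib import Path

from llama_index import Document, VectorStoreIndex

from llama_utils import update_index_for_changed_files
from markdown_reader import MarkdownReader

# In the real flow the index is built from the header-docs produced by
# MultiMarkdownReader; a placeholder document keeps this sketch runnable.
index = VectorStoreIndex.from_documents([Document(text="placeholder")])

changed_files = [Path("./ultralytics/docs/README.md")]  # illustrative

# Drop the stale header-docs for each changed file, re-parse, and insert.
update_index_for_changed_files(index, changed_files, MarkdownReader())
```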
37 changes: 37 additions & 0 deletions markdown_processing.py
@@ -0,0 +1,37 @@
import logging
from multi_markdown_reader import MultiMarkdownReader
from pathlib import Path
from typing import List

logger = logging.getLogger(__name__)

def process_and_get_header_docs(folder_path: Path) -> list:
"""
Process markdown files to extract "header-documents."

Parameters:
folder_path (Path): Path to the folder containing markdown files.

Returns:
list: List of processed "header-documents."
"""
markdown_reader = MultiMarkdownReader()
documents = markdown_reader.load_data_from_folder(folder_path)
return documents


def get_markdown_files(repo_path: Path, docs_folder: Path = Path("docs")) -> List[Path]:
"""
Get all markdown files in the docs folder of a Git repository.

Parameters:
repo_path (Path): The path to the Git repository.
docs_folder (Path): The path to the docs folder within the repository. Defaults to "docs".

Returns:
List[Path]: List of Paths to all markdown files.
"""
docs_path = repo_path / docs_folder
markdown_files = list(docs_path.glob('**/*.md'))
logger.info(f"Found {len(markdown_files)} markdown files.")
return markdown_files
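A usage sketch combining the two helpers; the repository path matches `GIT_REPO_PATH` in the sample `.env`, and `MultiMarkdownReader.load_data_from_folder` is assumed to behave as its call site above suggests:

```python
from pathlib import Path

from markdown_processing import get_markdown_files, process_and_get_header_docs

repo_path = Path("./ultralytics")  # GIT_REPO_PATH from the sample .env

# All .md files under ./ultralytics/docs, searched recursively.
markdown_files = get_markdown_files(repo_path)

# One header-document per markdown section, ready for indexing.
documents = process_and_get_header_docs(repo_path / "docs")
print(f"{len(markdown_files)} files -> {len(documents)} header-docs")
```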
126 changes: 126 additions & 0 deletions markdown_reader.py
@@ -0,0 +1,126 @@
"""Markdown Reader.

A parser for md files.

"""
import re
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, cast

from llama_index.readers.base import BaseReader
from llama_index.schema import Document


class MarkdownReader(BaseReader):
"""Markdown parser.

Extract text from markdown files.
    Returns one Document per header, containing the text under that header.

"""

def __init__(
self,
*args: Any,
remove_hyperlinks: bool = True,
remove_images: bool = True,
**kwargs: Any,
) -> None:
"""Init params."""
super().__init__(*args, **kwargs)
self._remove_hyperlinks = remove_hyperlinks
self._remove_images = remove_images

def markdown_to_tups(self, markdown_text: str) -> List[Tuple[Optional[str], str]]:
"""Convert a markdown file to a dictionary.

The keys are the headers and the values are the text under each header.

"""
markdown_tups: List[Tuple[Optional[str], str]] = []
lines = markdown_text.split("\n")

current_header = None
current_text = ""

for line in lines:
header_match = re.match(r"^#+\s", line)
if header_match:
                # Flush the completed section; skip it when it has no body text.
                if current_header is not None and current_text != "":
                    markdown_tups.append((current_header, current_text))

current_header = line
current_text = ""
else:
current_text += line + "\n"
markdown_tups.append((current_header, current_text))

if current_header is not None:
# pass linting, assert keys are defined
markdown_tups = [
(re.sub(r"#", "", cast(str, key)).strip(), re.sub(r"<.*?>", "", value))
for key, value in markdown_tups
]
else:
markdown_tups = [
(key, re.sub("<.*?>", "", value)) for key, value in markdown_tups
]

return markdown_tups

def remove_images(self, content: str) -> str:
"""Get a dictionary of a markdown file from its path."""
pattern = r"!{1}\[\[(.*)\]\]"
content = re.sub(pattern, "", content)
return content

def remove_hyperlinks(self, content: str) -> str:
"""Get a dictionary of a markdown file from its path."""
pattern = r"\[(.*?)\]\((.*?)\)"
content = re.sub(pattern, r"\1", content)
return content

def parse_tups(
self, filepath: Path, content: Optional[str] = None, errors: str = "ignore"
) -> List[Tuple[Optional[str], str]]:
"""Parse file into tuples.
If content is provided, use that instead of reading from file."""
if content is None:
with open(filepath, "r", encoding="utf-8") as f:
content = f.read()
if self._remove_hyperlinks:
content = self.remove_hyperlinks(content)
if self._remove_images:
content = self.remove_images(content)
markdown_tups = self.markdown_to_tups(content)
return markdown_tups

def load_data(
self,
file: Path,
extra_info: Optional[Dict] = None,
content: Optional[str] = None,
) -> List[Document]:
"""Parse file into string.
If content is provided, use that instead of reading from file."""
tups = self.parse_tups(file, content=content)
results = []

# Add file path to metadata for tracking
metadata = {'original_file_path': str(file)}

# Merge with any additional metadata
if extra_info:
metadata.update(extra_info)

results = [
Document(
text=f"\n\n{header}\n{value}" if header else value,
metadata=metadata,
)
for header, value in tups
]

return results
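A small self-contained check of the reader's behavior; `load_data` accepts an in-memory `content` string, so no file needs to exist on disk (the sample markdown is illustrative):

```python
from pathlib import Path

from markdown_reader import MarkdownReader

sample = """# Title
Intro with a [link](https://example.com).

## Section
Section body text.
"""

reader = MarkdownReader()  # hyperlinks and images are stripped by default
docs = reader.load_data(Path("sample.md"), content=sample)

for doc in docs:
    # Each Document is one header plus the text under it,
    # tagged with the originating file path.
    print(doc.metadata["original_file_path"], "->", repr(doc.text))
```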