Skip to content

Commit

Permalink
Do not wrap filepath in Path to fix indexing markdown files on Windows (
Browse files Browse the repository at this point in the history
#993)

### Issue
- Paths with `/` are converted to `\\` on Windows by the `Path` class.
- The `markdown_to_entries` module was trying to normalize file paths with `Path` for some reason.
  This would store the file paths in the DB `Entry` differently from the file-to-entries map if Khoj ran on Windows.
  That would result in a `KeyError` when trying to look up the entry file path from `file_to_text_map` in the `text_to_entries:update_embeddings()` function.

### Fix
- Removing the unnecessary OS-dependent `Path` normalization in `markdown_to_entries` should keep the file path storage consistent across the `file_to_text_map` var, `FileObjectAdaptor`, and `Entry` DB tables on Windows for Markdown files as well.

This issue affects users hosting the Khoj server on Windows who attempt to index Markdown files.

Resolves #984
  • Loading branch information
debanjum authored Dec 2, 2024
2 parents 9e0a2c7 + 47c926b commit db29894
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 9 deletions.
12 changes: 6 additions & 6 deletions src/khoj/processor/content/markdown/markdown_to_entries.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from pathlib import Path
from typing import Dict, List, Tuple

import urllib3
import urllib3.util

from khoj.database.models import Entry as DbEntry
from khoj.database.models import KhojUser
Expand Down Expand Up @@ -51,11 +51,11 @@ def process(self, files: dict[str, str], user: KhojUser, regenerate: bool = Fals
return num_new_embeddings, num_deleted_embeddings

@staticmethod
def extract_markdown_entries(markdown_files, max_tokens=256) -> Tuple[Dict, List[Entry]]:
def extract_markdown_entries(markdown_files: Dict[str, str], max_tokens=256) -> Tuple[Dict[str, str], List[Entry]]:
"Extract entries by heading from specified Markdown files"
entries: List[str] = []
entry_to_file_map: List[Tuple[str, str]] = []
file_to_text_map = dict()
file_to_text_map: Dict[str, str] = dict()
for markdown_file in markdown_files:
try:
markdown_content = markdown_files[markdown_file]
Expand Down Expand Up @@ -128,7 +128,7 @@ def process_single_markdown_file(
return entries, entry_to_file_map

@staticmethod
def convert_markdown_entries_to_maps(parsed_entries: List[str], entry_to_file_map) -> List[Entry]:
def convert_markdown_entries_to_maps(parsed_entries: List[str], entry_to_file_map: Dict[str, str]) -> List[Entry]:
"Convert each Markdown entries into a dictionary"
entries: List[Entry] = []
for parsed_entry in parsed_entries:
Expand All @@ -139,7 +139,7 @@ def convert_markdown_entries_to_maps(parsed_entries: List[str], entry_to_file_ma
# Escape the URL to avoid issues with special characters
entry_filename = urllib3.util.parse_url(raw_filename).url
else:
entry_filename = str(Path(raw_filename))
entry_filename = raw_filename

heading = parsed_entry.splitlines()[0] if re.search(r"^#+\s", parsed_entry) else ""
# Append base filename to compiled entry for context to model
Expand All @@ -151,7 +151,7 @@ def convert_markdown_entries_to_maps(parsed_entries: List[str], entry_to_file_ma
compiled=compiled_entry,
raw=parsed_entry,
heading=f"{prefix}{heading}",
file=f"{entry_filename}",
file=entry_filename,
)
)

Expand Down
6 changes: 3 additions & 3 deletions src/khoj/processor/content/org_mode/org_to_entries.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,7 @@ def convert_org_nodes_to_entries(
compiled += f"\n {parsed_entry.body}"

# Add the sub-entry contents to the entry
entry_compiled += f"{compiled}"
entry_compiled += compiled
entry_raw += f"{parsed_entry}"
if not entry_heading:
entry_heading = heading
Expand All @@ -218,8 +218,8 @@ def convert_org_nodes_to_entries(
Entry(
compiled=entry_compiled,
raw=entry_raw,
heading=f"{entry_heading}",
file=f"{entry_to_file_map[parsed_entry]}",
heading=entry_heading,
file=entry_to_file_map[parsed_entry],
)
)

Expand Down

0 comments on commit db29894

Please sign in to comment.