Wayne/router fillin (#40)
* latest updates

* remove bugs

* added pdf downloading

* formatting

* edited docstring

* metadata changes for rst and md scrapers

* latest_changes

* changes for berkeley.edu website

* rst changes

* md

* edit utils

* error handling

* first version

* gitignore

* delete

* working

* content tag matching changes

* fix

---------

Co-authored-by: terrianne-zhang <terriannezhang@berkeley.edu>
LaiWeiQuan and terrianne-zhang authored Jun 30, 2024
1 parent 0799b17 commit f32d1bb
Showing 135 changed files with 2,098 additions and 164 deletions.
13 changes: 13 additions & 0 deletions rag/file_conversion_router/classes/chunk.py
@@ -0,0 +1,13 @@
from abc import ABC

class Chunk(ABC):
    def __init__(self, titles, content, chunk_url):
        # header path of the chunk, e.g. "Intro (h1) > Setup (h2) (0)"
        self.titles = titles
        # text content of the chunk
        self.content = content
        # URL(s) pointing back to the chunk's location on the page
        self.chunk_url = chunk_url




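For reference, a minimal usage sketch of the new Chunk class; the attribute values below are hypothetical and mirror how Page.tree_segments_to_chunks constructs chunks:

from rag.file_conversion_router.classes.chunk import Chunk

chunk = Chunk(
    titles="Getting Started (h1) > Installation (h2) (0)",  # header path
    content="Run the installer, then verify the setup.",    # chunk text
    chunk_url=["https://example.edu/docs#installation"],    # anchor URL(s), hypothetical
)
print(chunk.titles, chunk.chunk_url)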
258 changes: 258 additions & 0 deletions rag/file_conversion_router/classes/page.py
@@ -0,0 +1,258 @@
import string
from rag.file_conversion_router.classes.chunk import Chunk
import tiktoken
import pickle
class Page:
    def __init__(self, pagename: str, content: dict, filetype: str, page_url: str = ""):
        """
        Initialize a Page instance.

        Args:
            pagename (str): Name of the page.
            content (dict): Dictionary of page content attributes.
            filetype (str): Type of the file (e.g., 'md', 'pdf').
            page_url (str): URL of the page. Default is an empty string.
        """
self.pagename = pagename
self.content = content
self.filetype = filetype
self.page_url = page_url
self.segments = []
self.tree_segments = []
self.chunks = []

def recursive_separate(self, response: str, token_limit: int = 400) -> list:
"""
Recursively separate a response into chunks based on token limit.
Args:
response (str): The text response to be separated.
token_limit (int): Maximum number of tokens per chunk.
Returns:
list: List of separated text chunks.
"""
        # Look up the encoding once instead of re-creating it on every call
        encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")

        def token_size(sentence: str) -> int:
            return len(encoding.encode(sentence))

def rfind_punctuation(s: str, start: int, end: int) -> int:
for i in range(end - 1, start - 1, -1):
if s[i] in string.punctuation:
return i
return -1

msg_list = []
tokens = token_size(response)
if tokens > token_limit:
start = 0
while start < len(response):
end = start
while end < len(response) and token_size(response[start:end]) < token_limit:
end += 1

if end < len(response):
split_pos = response.rfind('\n\n', start, end)
if split_pos == -1:
split_pos = response.rfind('\n', start, end)
if split_pos == -1:
split_pos = rfind_punctuation(response, start, end)
if split_pos == -1:
split_pos = response.rfind(' ', start, end)
if split_pos == -1 or split_pos <= start:
split_pos = end - 1

msg_list.append(response[start:split_pos].strip())
start = split_pos + 1
else:
msg_list.append(response[start:end].strip())
break
else:
msg_list.append(response)

return msg_list
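    # Example (hypothetical input): recursive_separate(long_text, 400) returns a
    # list of sub-strings, each under ~400 gpt-3.5-turbo tokens, splitting
    # preferentially at paragraph breaks, then newlines, punctuation, and spaces.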

def extract_headers_and_content(self, md_content):
def count_consecutive_hashes(s):
count = 0
for char in s:
if char == "#":
count += 1
else:
break
return count

headers_content = [] # Stores tuples of ((header, level), content)
curheader = None # Current header, initially None
current_content = "" # Accumulates content for the current header
in_code_block = False # Indicates if inside a code block
md_content = md_content.split('\n')
for line in md_content:
stripped_line = line.strip()
if "```" in stripped_line:
in_code_block = not in_code_block # Toggle state

if in_code_block:
if curheader:
current_content += f"{line}\n" # Add to content within code blocks
else:
if line.startswith('#'):
if curheader:
headers_content.append((curheader, current_content)) # Store previous header and its content
header = line
header_level = count_consecutive_hashes(header)
header = header.strip('#').strip()
curheader = (header, header_level) # Start new header context
current_content = "" # Reset content for new header
else:
if curheader: # Only accumulate content if within a header
current_content += f"{line}\n"

# Append the last header and its content, if there was any header encountered
if curheader:
headers_content.append((curheader, current_content))

return headers_content
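    # Example (hypothetical input): for "# A\ntext\n## B\nmore" this returns
    # [(("A", 1), "text\n"), (("B", 2), "more\n")].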
    def page_seperate_to_segments(self) -> None:
        self.segments = self.extract_headers_and_content(self.content['text'])

def print_header_tree(self):
result = ""
for (title, level), _ in self.segments:
indent = '--' * (level - 1)
header_tag = f"(h{level})"
result += f"{indent}{title} {header_tag}\n"
return result

def tree_print(self):
top_header = []
counter = 1

for (header, level), content in self.segments:
page_toc = ""
page_path = ""
segment = ""
if len(top_header) < level:
for i in range(len(top_header), level - 1):
top_header.append(("", [], i + 1))
top_header.append((header, content, level))
else:
# Table of Contents
page_toc += "(Table of Contents)\n"
page_toc += f"{self.print_header_tree()}\n"

# Page Path
page_path += "(Page path)\n"
first = True
for h, c, l in top_header:
if first:
page_path += f"(h{l}) {h}"
first = not first
else:
page_path += " > "
page_path += f"(h{l}) {h}"
# Segment Print
segment += f"(Segment {counter})\n"
header_list = [header[0] for header in top_header]
for h, c, l in top_header:
hash_symbols = '#' * l
segment += f"{hash_symbols}{h} (h{l})\n"
segment += f"{c}\n"
# Store the information in tree_segments
self.tree_segments.append({'Page_table': page_toc, 'Page_path': header_list, 'Segment_print': segment})
top_header = top_header[:(level - 1)]
top_header.append((header, content, level))
counter += 1

        # Handle the last segment (only reached when the page has headers)
        if self.segments:
page_toc = ""
page_path = ""
segment = ""
# Table of Contents
page_toc += "(Table of Contents)\n"
page_toc += f"{self.print_header_tree()}\n"

# Page Path
page_path += "(Page path)\n"
first = True
for h, c, l in top_header:
if first:
page_path += f"(h{l}) {h}"
first = not first
else:
page_path += " > "
page_path += f"(h{l}) {h}"
# Segment Print
segment += f"(Segment {counter})\n"
header_list = [header[0] for header in top_header]
for h, c, l in top_header:
hash_symbols = '#' * l
segment += f"{hash_symbols}{h} (h{l})\n"
segment += f"{c}\n"
# Store the information in tree_segments
self.tree_segments.append({'Page_table': page_toc, 'Page_path': header_list, 'Segment_print': segment})
top_header = top_header[:(level - 1)]
top_header.append((header, content, level))

def tree_segments_to_chunks(self):
def generate_hyperlink_header(header_text):
"""
This function takes a header string, converts all characters to lowercase,
and replaces all spaces with dashes to create a hyperlink-friendly header.
Parameters:
header_text (str): The header string to be converted.
Returns:
str: The converted hyperlink-friendly header string.
"""
# Convert the string to lowercase
lower_text = header_text.lower()

# Replace spaces with dashes
hyperlink_header = lower_text.replace(' ', '-')

return hyperlink_header
        # Separate each tree segment into token-limited chunks with recursive_separate
        for segment in self.tree_segments:
            content_chunks = self.recursive_separate(segment['Segment_print'], 400)
            for count, content_chunk in enumerate(content_chunks):
                headers = segment['Page_path']
                urls = [f"{self.page_url}#{generate_hyperlink_header(header)}" for header in headers]
                page_path = ' > '.join(f"{h} (h{level + 1})" for level, h in enumerate(headers)) + f" ({count})"
                self.chunks.append(Chunk(page_path, content_chunk, urls))
        return self.chunks

def to_file(self, output_path: str) -> None:
"""
Write the page content to a file.
Args:
output_path (str): The path where the file will be written.
"""
with open(output_path, "w") as f:
f.write(str(self))

    def to_chunk(self) -> None:
        """
        Convert the page content into a list of Chunk objects, stored in self.chunks.
        """
self.page_seperate_to_segments()
self.tree_print()
self.chunks = self.tree_segments_to_chunks()

def chunks_to_pkl(self, output_path: str) -> None:
"""
Write the page content chunks to a pkl file.
Args:
output_path (str): The path where the pkl file will be written.
"""
with open(output_path, "wb") as f:
pickle.dump(self.chunks, f)

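A minimal end-to-end sketch of how the Page pipeline composes; the markdown text, page name, and URL below are hypothetical, and tiktoken must be installed for recursive_separate to run:

from rag.file_conversion_router.classes.page import Page

md_text = "# Intro\nWelcome.\n## Setup\nInstall the package.\n"
page = Page(
    pagename="example",
    content={"text": md_text},
    filetype="md",
    page_url="https://example.edu/docs",
)
page.to_chunk()                    # segments -> tree segments -> chunks
page.chunks_to_pkl("example.pkl")  # pickle the resulting Chunk list
print(page.chunks[0].titles)       # e.g. "Intro (h1) > Setup (h2) (0)"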
20 changes: 20 additions & 0 deletions rag/file_conversion_router/classes/vidpage.py
@@ -0,0 +1,20 @@
from rag.file_conversion_router.classes.chunk import Chunk
from rag.file_conversion_router.classes.page import Page

class VidPage(Page):
    def tree_segments_to_chunks(self):
        # Separate each tree segment into token-limited chunks; every chunk
        # links back to the segment's timestamp in the video URL
        for n, segment in enumerate(self.tree_segments):
            content_chunks = self.recursive_separate(segment['Segment_print'], 400)
            for count, content_chunk in enumerate(content_chunks):
                headers = segment['Page_path']
                timestamp_url = f"{self.page_url}&t={int(self.content['timestamp'][n])}"
                urls = [timestamp_url for _ in headers]
                page_path = ' > '.join(f"{h} (h{level + 1})" for level, h in enumerate(headers)) + f" ({count})"
                self.chunks.append(Chunk(page_path, content_chunk, urls))
        return self.chunks



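A hypothetical sketch of VidPage in use; note the content dict must carry a 'timestamp' list parallel to the tree segments, and the video URL here is made up:

from rag.file_conversion_router.classes.vidpage import VidPage

vid = VidPage(
    pagename="lecture1",
    content={"text": "# Lecture 1\nIntro to the course.\n", "timestamp": [0]},
    filetype="mp4",
    page_url="https://youtube.com/watch?v=abc123",
)
vid.to_chunk()
print(vid.chunks[0].chunk_url)  # ['https://youtube.com/watch?v=abc123&t=0']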
41 changes: 22 additions & 19 deletions rag/file_conversion_router/conversion/base_converter.py
@@ -1,4 +1,4 @@
"""Base class for all file type converters.
"""Base classes for all file type converters.
"""

from abc import ABC, abstractmethod
@@ -11,21 +11,22 @@
from rag.file_conversion_router.utils.logger import conversion_logger, logger
from rag.file_conversion_router.utils.markdown_parser import MarkdownParser
from rag.file_conversion_router.utils.utils import calculate_hash, ensure_path

from rag.file_conversion_router.classes.page import Page
from rag.file_conversion_router.classes.vidpage import VidPage

class BaseConverter(ABC):
    """Base class for all file type converters.

    This base class defines the interface for all type converters, with a standardized workflow that outputs 3 files:
    - Markdown
    - Tree txt
    - Pickle

    All child classes need to implement the abstract methods:
    - _to_markdown
    - _to_page

    As long as a child class can convert a file to Markdown,
    the base class will handle the rest of the conversion process.
    """

def __init__(self):
@@ -92,12 +93,9 @@ def _convert_to_markdown(self, input_path: Path, output_path: Path) -> None:
self._to_markdown(input_path, output_path)

@conversion_logger
def _convert_md_to_tree_txt_and_pkl(self, input_path: Path, output_folder: Path) -> None:
"""Convert the input Markdown file to a tree txt file and a pkl file.
Files will be saved in the same folder as the Markdown filepath set up in the MarkdownParser initialization.
"""
self._md_parser.concat_print()
def _convert_to_page(self, input_path: Path, output_path: Path) -> Page:
page = self._to_page(input_path, output_path)
return page

def _setup_output_paths(self, input_path: Union[str, Path], output_folder: Union[str, Path]) -> None:
"""Set up the output paths for the Markdown, tree txt, and pkl files."""
@@ -112,7 +110,7 @@ def _setup_output_paths(self, input_path: Union[str, Path], output_folder: Union

def _convert_and_cache(self, input_path: Path, output_folder: Path, file_hash: str) -> List[Path]:
self._setup_output_paths(input_path, output_folder)
        # This method embeds the abstract method `_to_markdown`, which needs to be implemented by child classes.
_, conversion_time = self._perform_conversion(input_path, output_folder)
paths = [self._md_path, self._tree_txt_path, self._pkl_path]
ConversionCache.set_cached_paths_and_time(file_hash, paths, conversion_time)
@@ -132,18 +130,23 @@ def _use_cached_files(self, cached_paths: List[Path], output_folder: Path) -> No
@conversion_logger
def _perform_conversion(self, input_path: Path, output_folder: Path) -> None:
"""Perform the file conversion process."""
self._convert_to_markdown(input_path, self._md_path)
self._md_parser = MarkdownParser(self._md_path)
self._convert_md_to_tree_txt_and_pkl(self._md_path, output_folder)
page = self._convert_to_page(input_path, output_folder)[0]
page.to_chunk()
pkl_file = output_folder.with_suffix(".pkl")
page.chunks_to_pkl(str(pkl_file))

@abstractmethod
def _to_page(self, input_path: Path, output_path: Path) -> Page:
"""Convert the input file to Expected Page format. To be implemented by subclasses."""
raise NotImplementedError("This method should be overridden by subclasses.")

@abstractmethod
def _to_markdown(self, input_path: Path, output_path: Path) -> None:
"""Convert the input file to Expected Markdown format. To be implemented by subclasses."""
raise NotImplementedError("This method should be overridden by subclasses.")


class ConversionCache:
"""A class to handle caching of conversion results."""
"""A classes to handle caching of conversion results."""
_cache: Dict[str, List[Path]] = {}
_futures_cache: Dict[str, Future] = {}
# Store the time taken for each file conversion

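To illustrate the new interface, a hedged sketch of a minimal converter subclass; the class name and pass-through logic are hypothetical, and only the two abstract methods shown in the diff are assumed:

from pathlib import Path

from rag.file_conversion_router.classes.page import Page
from rag.file_conversion_router.conversion.base_converter import BaseConverter

class PlainTextConverter(BaseConverter):
    # Hypothetical converter that treats its input file as Markdown as-is
    def _to_markdown(self, input_path: Path, output_path: Path) -> None:
        output_path.write_text(input_path.read_text())

    def _to_page(self, input_path: Path, output_path: Path) -> Page:
        self._to_markdown(input_path, output_path)
        return Page(
            pagename=input_path.stem,
            content={"text": output_path.read_text()},
            filetype="md",
        )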