diff --git a/comps/cores/proto/docarray.py b/comps/cores/proto/docarray.py index d98ea6959..040d4035a 100644 --- a/comps/cores/proto/docarray.py +++ b/comps/cores/proto/docarray.py @@ -22,6 +22,8 @@ class DocPath(BaseDoc): path: str chunk_size: int = 1500 chunk_overlap: int = 100 + process_table: bool = False + table_strategy: str = "fast" class EmbedDoc768(BaseDoc): diff --git a/comps/dataprep/milvus/README.md b/comps/dataprep/milvus/README.md index dbc3d256c..ddf740f89 100644 --- a/comps/dataprep/milvus/README.md +++ b/comps/dataprep/milvus/README.md @@ -6,6 +6,9 @@ ```bash pip install -r requirements.txt +apt-get install tesseract-ocr -y +apt-get install libtesseract-dev -y +apt-get install poppler-utils -y ``` ## Start Milvus Server @@ -60,3 +63,11 @@ You can specify chunk_size and chunk_overlap by the following commands. ```bash curl -X POST -H "Content-Type: application/json" -d '{"path":"/home/user/doc/your_document_name","chunk_size":1500,"chunk_overlap":100}' http://localhost:6010/v1/dataprep ``` + +We support table extraction from pdf documents. You can specify process_table and table_strategy by the following commands. "table_strategy" refers to the strategies to understand tables for table retrieval. As the setting progresses from "fast" to "hq" to "llm," the focus shifts towards deeper table understanding at the expense of processing speed. The default strategy is "fast". + +Note: If you specify "table_strategy=llm", you should first start the TGI Service (please refer to 1.2.1, 1.3.1 in https://github.com/opea-project/GenAIComps/tree/main/comps/llms/README.md), and then `export TGI_LLM_ENDPOINT="http://${your_ip}:8008"`. 
+ +```bash +curl -X POST -H "Content-Type: application/json" -d '{"path":"/home/user/doc/your_document_name","process_table":true,"table_strategy":"hq"}' http://localhost:6010/v1/dataprep +``` diff --git a/comps/dataprep/milvus/prepare_doc_milvus.py b/comps/dataprep/milvus/prepare_doc_milvus.py index 4d58b59fe..1b19a2f36 100644 --- a/comps/dataprep/milvus/prepare_doc_milvus.py +++ b/comps/dataprep/milvus/prepare_doc_milvus.py @@ -13,7 +13,7 @@ from comps.cores.mega.micro_service import opea_microservices, register_microservice from comps.cores.proto.docarray import DocPath from comps.cores.telemetry.opea_telemetry import opea_telemetry -from comps.dataprep.utils import document_loader +from comps.dataprep.utils import document_loader, get_tables_result # current_script_path = os.path.dirname(os.path.abspath(__file__)) # parent_dir = os.path.dirname(current_script_path) @@ -49,7 +49,9 @@ def ingest_documents(doc_path: DocPath): content = document_loader(path) chunks = text_splitter.split_text(content) - + if doc_path.process_table and path.endswith(".pdf"): + table_chunks = get_tables_result(path, doc_path.table_strategy) + chunks = chunks + table_chunks print("Done preprocessing. 
Created ", len(chunks), " chunks of the original pdf") # Create vectorstore if EMBEDDING_ENDPOINT: diff --git a/comps/dataprep/milvus/requirements.txt b/comps/dataprep/milvus/requirements.txt index 09536fd35..fbfe59080 100644 --- a/comps/dataprep/milvus/requirements.txt +++ b/comps/dataprep/milvus/requirements.txt @@ -21,3 +21,4 @@ pymupdf==1.24.5 python-docx==0.8.11 sentence_transformers shortuuid +unstructured[all-docs]==0.11.5 diff --git a/comps/dataprep/qdrant/README.md b/comps/dataprep/qdrant/README.md index cd34061e5..bfa5c8f46 100644 --- a/comps/dataprep/qdrant/README.md +++ b/comps/dataprep/qdrant/README.md @@ -6,6 +6,9 @@ ```bash pip install -r requirements.txt +apt-get install tesseract-ocr -y +apt-get install libtesseract-dev -y +apt-get install poppler-utils -y ``` ## Start Qdrant Server @@ -49,7 +52,6 @@ docker run -d --name="dataprep-qdrant-server" -p 6000:6000 --ipc=host -e http_pr ## Setup Environment Variables ```bash -export no_proxy=${your_no_proxy} export http_proxy=${your_http_proxy} export https_proxy=${your_http_proxy} export QDRANT=${host_ip} @@ -77,3 +79,11 @@ You can specify chunk_size and chunk_overlap by the following commands. ```bash curl -X POST -H "Content-Type: application/json" -d '{"path":"/path/to/document","chunk_size":1500,"chunk_overlap":100}' http://localhost:6000/v1/dataprep ``` + +We support table extraction from pdf documents. You can specify process_table and table_strategy by the following commands. "table_strategy" refers to the strategies to understand tables for table retrieval. As the setting progresses from "fast" to "hq" to "llm," the focus shifts towards deeper table understanding at the expense of processing speed. The default strategy is "fast". + +Note: If you specify "table_strategy=llm", you should first start the TGI Service (please refer to 1.2.1, 1.3.1 in https://github.com/opea-project/GenAIComps/tree/main/comps/llms/README.md), and then `export TGI_LLM_ENDPOINT="http://${your_ip}:8008"`. 
+ +```bash +curl -X POST -H "Content-Type: application/json" -d '{"path":"/path/to/document","process_table":true,"table_strategy":"hq"}' http://localhost:6000/v1/dataprep +``` diff --git a/comps/dataprep/qdrant/prepare_doc_qdrant.py b/comps/dataprep/qdrant/prepare_doc_qdrant.py index 6074cbe48..1dc554eff 100644 --- a/comps/dataprep/qdrant/prepare_doc_qdrant.py +++ b/comps/dataprep/qdrant/prepare_doc_qdrant.py @@ -10,7 +10,7 @@ from langchain_text_splitters import HTMLHeaderTextSplitter from comps import DocPath, opea_microservices, opea_telemetry, register_microservice -from comps.dataprep.utils import document_loader +from comps.dataprep.utils import document_loader, get_tables_result tei_embedding_endpoint = os.getenv("TEI_ENDPOINT") @@ -43,7 +43,9 @@ def ingest_documents(doc_path: DocPath): content = document_loader(path) chunks = text_splitter.split_text(content) - + if doc_path.process_table and path.endswith(".pdf"): + table_chunks = get_tables_result(path, doc_path.table_strategy) + chunks = chunks + table_chunks print("Done preprocessing. 
Created ", len(chunks), " chunks of the original pdf") # Create vectorstore if tei_embedding_endpoint: diff --git a/comps/dataprep/qdrant/requirements.txt b/comps/dataprep/qdrant/requirements.txt index 194995f5f..bcba8a156 100644 --- a/comps/dataprep/qdrant/requirements.txt +++ b/comps/dataprep/qdrant/requirements.txt @@ -18,3 +18,4 @@ python-docx qdrant-client sentence_transformers shortuuid +unstructured[all-docs]==0.11.5 diff --git a/comps/dataprep/redis/README.md b/comps/dataprep/redis/README.md index 31135219a..8c58fc85e 100644 --- a/comps/dataprep/redis/README.md +++ b/comps/dataprep/redis/README.md @@ -13,6 +13,9 @@ We organized these two folders in the same way, so you can use either framework ```bash apt update apt install default-jre +apt-get install tesseract-ocr -y +apt-get install libtesseract-dev -y +apt-get install poppler-utils -y # for langchain cd langchain # for llama_index @@ -147,12 +150,25 @@ You can specify chunk_size and chunk_overlap by the following commands. ```bash curl -X POST \ -H "Content-Type: multipart/form-data" \ - -F "files=@/home/sdp/yuxiang/opea_intent/GenAIComps4/comps/table_extraction/LLAMA2_page6.pdf" \ + -F "files=@./file1.txt" \ -F "chunk_size=1500" \ -F "chunk_overlap=100" \ http://localhost:6007/v1/dataprep ``` +We support table extraction from pdf documents. You can specify process_table and table_strategy by the following commands. "table_strategy" refers to the strategies to understand tables for table retrieval. As the setting progresses from "fast" to "hq" to "llm," the focus shifts towards deeper table understanding at the expense of processing speed. The default strategy is "fast". + +Note: If you specify "table_strategy=llm", you should first start the TGI Service (please refer to 1.2.1, 1.3.1 in https://github.com/opea-project/GenAIComps/tree/main/comps/llms/README.md), and then `export TGI_LLM_ENDPOINT="http://${your_ip}:8008"`. 
+ +```bash +curl -X POST \ + -H "Content-Type: multipart/form-data" \ + -F "files=@./your_file.pdf" \ + -F "process_table=true" \ + -F "table_strategy=hq" \ + http://localhost:6007/v1/dataprep +``` + - Multiple file upload ```bash diff --git a/comps/dataprep/redis/langchain/prepare_doc_redis.py b/comps/dataprep/redis/langchain/prepare_doc_redis.py index cdcbdd93a..36577f8d8 100644 --- a/comps/dataprep/redis/langchain/prepare_doc_redis.py +++ b/comps/dataprep/redis/langchain/prepare_doc_redis.py @@ -17,7 +17,7 @@ from pyspark import SparkConf, SparkContext from comps import DocPath, opea_microservices, register_microservice -from comps.dataprep.utils import document_loader, parse_html +from comps.dataprep.utils import document_loader, get_tables_result, parse_html tei_embedding_endpoint = os.getenv("TEI_ENDPOINT") @@ -53,6 +53,9 @@ def ingest_data_to_redis(doc_path: DocPath): content = document_loader(path) chunks = text_splitter.split_text(content) + if doc_path.process_table and path.endswith(".pdf"): + table_chunks = get_tables_result(path, doc_path.table_strategy) + chunks = chunks + table_chunks print("Done preprocessing. 
Created ", len(chunks), " chunks of the original pdf") # Create vectorstore @@ -117,6 +120,8 @@ async def ingest_documents( link_list: Optional[str] = Form(None), chunk_size: int = Form(1500), chunk_overlap: int = Form(100), + process_table: bool = Form(False), + table_strategy: str = Form("fast"), ): print(f"files:{files}") print(f"link_list:{link_list}") @@ -133,6 +138,15 @@ async def ingest_documents( for file in files: save_path = upload_folder + file.filename await save_file_to_local_disk(save_path, file) + ingest_data_to_redis( + DocPath( + path=save_path, + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + process_table=process_table, + table_strategy=table_strategy, + ) + ) uploaded_files.append(save_path) print(f"Successfully saved file {save_path}") diff --git a/comps/dataprep/redis/langchain/requirements.txt b/comps/dataprep/redis/langchain/requirements.txt index d974603b6..3c542bed6 100644 --- a/comps/dataprep/redis/langchain/requirements.txt +++ b/comps/dataprep/redis/langchain/requirements.txt @@ -21,4 +21,4 @@ python-docx redis sentence_transformers shortuuid -unstructured +unstructured[all-docs]==0.11.5 diff --git a/comps/dataprep/utils.py b/comps/dataprep/utils.py index b20d13e18..76f5e578e 100644 --- a/comps/dataprep/utils.py +++ b/comps/dataprep/utils.py @@ -22,6 +22,7 @@ import yaml from bs4 import BeautifulSoup from docx import Document as DDocument +from langchain import LLMChain, PromptTemplate from langchain_community.document_loaders import ( UnstructuredHTMLLoader, UnstructuredImageLoader, @@ -29,6 +30,7 @@ UnstructuredPowerPointLoader, UnstructuredXMLLoader, ) +from langchain_community.llms import HuggingFaceEndpoint from PIL import Image @@ -457,3 +459,104 @@ def parse_html(input): print("The given link/str {} cannot be parsed.".format(link)) return chucks + + +def get_tables_result(pdf_path, table_strategy): + """Extract tables information from pdf file.""" + if table_strategy == "fast": + return None + + from 
unstructured.documents.elements import FigureCaption + from unstructured.partition.pdf import partition_pdf + + tables_result = [] + raw_pdf_elements = partition_pdf( + filename=pdf_path, + infer_table_structure=True, + ) + tables = [el for el in raw_pdf_elements if el.category == "Table"] + for table in tables: + table_coords = table.metadata.coordinates.points + content = table.metadata.text_as_html + table_page_number = table.metadata.page_number + min_distance = float("inf") + table_summary = None + if table_strategy == "hq": + for element in raw_pdf_elements: + if isinstance(element, FigureCaption) or element.text.startswith("Tab"): + caption_page_number = element.metadata.page_number + caption_coords = element.metadata.coordinates.points + related, y_distance = get_relation( + table_coords, caption_coords, table_page_number, caption_page_number + ) + if related: + if y_distance < min_distance: + min_distance = y_distance + table_summary = element.text + if table_summary is None: + parent_id = table.metadata.parent_id + for element in raw_pdf_elements: + if element.id == parent_id: + table_summary = element.text + break + elif table_strategy == "llm": + table_summary = llm_generate(content) + table_summary = table_summary.lstrip("\n ") + elif table_strategy is None: + table_summary = None + if table_summary is None: + text = f"[Table: {content}]" + else: + text = f"|Table: [Summary: {table_summary}], [Content: {content}]|" + tables_result.append(text) + return tables_result + + +def llm_generate(content): + llm_endpoint = os.getenv("TGI_LLM_ENDPOINT", "http://localhost:8080") + llm = HuggingFaceEndpoint( + endpoint_url=llm_endpoint, + max_new_tokens=1000, + top_k=40, + top_p=0.9, + temperature=0.8, + streaming=False, + num_beams=2, + num_return_sequences=2, + use_cache=True, + timeout=600, + ) + + table_summary_template = """ + Task: Your task is to give a concise summary of the table. 
\ + The summary should cover the overall table structure and all detailed information of the table. \ + The table will be given in html format. Summarize the table below. + --- + ### Table: + {table_content} + --- + ### Generated Summary: + """ + + prompt = PromptTemplate(template=table_summary_template, input_variables=["table_content"]) + + llm_chain = LLMChain(prompt=prompt, llm=llm) + + response = llm_chain.invoke(content) + response = response["text"] + print("response", response) + return response + + +def get_relation(table_coords, caption_coords, table_page_number, caption_page_number, threshold=100): + """Get the relation of a pair of table and caption.""" + same_page = table_page_number == caption_page_number + x_overlap = (min(table_coords[2][0], caption_coords[2][0]) - max(table_coords[0][0], caption_coords[0][0])) > 0 + if table_coords[0][1] - caption_coords[1][1] >= 0: + y_distance = table_coords[0][1] - caption_coords[1][1] + elif caption_coords[0][1] - table_coords[1][1] >= 0: + y_distance = caption_coords[0][1] - table_coords[1][1] + else: + y_distance = 0 + y_close = y_distance < threshold + return same_page and x_overlap and y_close, y_distance