Skip to content

Commit

Permalink
community: Correctly handle multi-element rich text (langchain-ai#25762)
Browse files Browse the repository at this point in the history
**Description:**

- Add _concatenate_rich_text method to combine all elements in rich text
arrays
- Update load_page method to use _concatenate_rich_text for rich text
properties
- Ensure all text content is captured, including inline code and
formatted text
- Add unit tests to verify correct handling of multi-element rich text

This fix prevents truncation of content after backticks or other
formatting elements.

 **Issue:**

When using the Notion DB Loader, the text of `rich_text` and `title`
properties is truncated after the first element, because the loader only
reads the first element of the array.

**Dependencies:** None.

---------

Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com>
Co-authored-by: Erick Friis <erick@langchain.dev>
  • Loading branch information
3 people authored Dec 16, 2024
1 parent b2102b8 commit 8f5e72d
Show file tree
Hide file tree
Showing 2 changed files with 146 additions and 8 deletions.
16 changes: 8 additions & 8 deletions libs/community/langchain_community/document_loaders/notiondb.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,19 +107,15 @@ def load_page(self, page_summary: Dict[str, Any]) -> Document:
# load properties as metadata
metadata: Dict[str, Any] = {}

value: Any

for prop_name, prop_data in page_summary["properties"].items():
prop_type = prop_data["type"]

if prop_type == "rich_text":
value = (
prop_data["rich_text"][0]["plain_text"]
if prop_data["rich_text"]
else None
)
value = self._concatenate_rich_text(prop_data["rich_text"])
elif prop_type == "title":
value = (
prop_data["title"][0]["plain_text"] if prop_data["title"] else None
)
value = self._concatenate_rich_text(prop_data["title"])
elif prop_type == "multi_select":
value = (
[item["name"] for item in prop_data["multi_select"]]
Expand Down Expand Up @@ -228,3 +224,7 @@ def _request(
)
res.raise_for_status()
return res.json()

def _concatenate_rich_text(self, rich_text_array: List[Dict[str, Any]]) -> str:
    """Concatenate all text content from a rich_text array.

    Args:
        rich_text_array: Rich text elements; only the ``"plain_text"``
            entry of each element is read. An empty or ``None`` array is
            tolerated.

    Returns:
        The ``"plain_text"`` values joined in their original order with no
        separator. Returns ``""`` when the array is empty or ``None``.
        Elements whose ``"plain_text"`` is missing or ``None`` contribute
        nothing instead of raising.
    """
    if not rich_text_array:
        return ""
    # ``or ""`` covers both a missing key and an explicit ``None`` value,
    # either of which would otherwise break ``str.join``.
    return "".join(item.get("plain_text") or "" for item in rich_text_array)
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
from unittest.mock import Mock, patch

from langchain_core.documents import Document

from langchain_community.document_loaders import NotionDBLoader


class TestNotionDBLoader:
    """Unit tests for how ``NotionDBLoader`` handles rich-text properties."""

    def setup_method(self) -> None:
        """Build a loader with placeholder credentials before every test."""
        self.loader = NotionDBLoader(
            database_id="fake_db_id",
            integration_token="fake_token",
        )

    def test_concatenate_rich_text(self) -> None:
        """Every ``plain_text`` fragment is joined, in order."""
        fragments = [
            {"plain_text": "Hello "},
            {"plain_text": "world"},
            {"plain_text": "!"},
        ]

        joined = self.loader._concatenate_rich_text(fragments)

        assert joined == "Hello world!"

    @patch("langchain_community.document_loaders.notiondb.NotionDBLoader._request")
    @patch("langchain_community.document_loaders.notiondb.NotionDBLoader._load_blocks")
    def test_load_page_with_rich_text(
        self, mock_load_blocks: Mock, mock_request: Mock
    ) -> None:
        """A multi-element ``rich_text`` property ends up whole in metadata."""
        # mock_request is not configured here; presumably it is patched only
        # so that no real HTTP call can be made.
        mock_load_blocks.return_value = "Mocked block content"

        title_property = {"type": "title", "title": [{"plain_text": "Test Title"}]}
        description_property = {
            "type": "rich_text",
            "rich_text": [
                {"plain_text": "This is "},
                {"plain_text": "a test"},
                {"plain_text": " description"},
            ],
        }
        summary = {
            "id": "page_id",
            "properties": {
                "Title": title_property,
                "Description": description_property,
            },
        }

        document = self.loader.load_page(summary)

        # Expected metadata keys are the lower-cased property names plus "id".
        assert document == Document(
            page_content="Mocked block content",
            metadata={
                "title": "Test Title",
                "description": "This is a test description",
                "id": "page_id",
            },
        )

    @patch("langchain_community.document_loaders.notiondb.NotionDBLoader._request")
    @patch("langchain_community.document_loaders.notiondb.NotionDBLoader._load_blocks")
    def test_load_page_with_code_in_rich_text(
        self, mock_load_blocks: Mock, mock_request: Mock
    ) -> None:
        """Text after an inline-code element is not dropped."""
        mock_load_blocks.return_value = "Mocked block content"

        answer_property = {
            "type": "rich_text",
            "rich_text": [
                {"plain_text": "Use "},
                {"plain_text": "print('Hello')"},
                {"plain_text": " to display text"},
            ],
        }
        summary = {"id": "page_id", "properties": {"Answer": answer_property}}

        document = self.loader.load_page(summary)

        assert document == Document(
            page_content="Mocked block content",
            metadata={"answer": "Use print('Hello') to display text", "id": "page_id"},
        )

    @patch("langchain_community.document_loaders.notiondb.NotionDBLoader._request")
    @patch("langchain_community.document_loaders.notiondb.NotionDBLoader._load_blocks")
    def test_load(self, mock_load_blocks: Mock, mock_request: Mock) -> None:
        """``load`` yields one document per page in the mocked query result."""
        mock_load_blocks.return_value = "Mocked block content"

        first_page = {
            "id": "page_id_1",
            "properties": {
                "Title": {
                    "type": "title",
                    "title": [{"plain_text": "Test Title 1"}],
                }
            },
        }
        second_page = {
            "id": "page_id_2",
            "properties": {
                "Title": {
                    "type": "title",
                    "title": [{"plain_text": "Test Title 2"}],
                }
            },
        }
        # A single response with has_more=False: one page of results.
        mock_request.return_value = {
            "results": [first_page, second_page],
            "has_more": False,
        }

        documents = self.loader.load()

        assert documents == [
            Document(
                page_content="Mocked block content",
                metadata={"title": "Test Title 1", "id": "page_id_1"},
            ),
            Document(
                page_content="Mocked block content",
                metadata={"title": "Test Title 2", "id": "page_id_2"},
            ),
        ]

0 comments on commit 8f5e72d

Please sign in to comment.