forked from langchain-ai/langchain
-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
community: Correctly handle multi-element rich text (langchain-ai#25762)
**Description:** - Add a `_concatenate_rich_text` method to combine all elements in rich text arrays - Update the `load_page` method to use `_concatenate_rich_text` for rich text properties - Ensure all text content is captured, including inline code and formatted text - Add unit tests to verify correct handling of multi-element rich text This fix prevents truncation of content after backticks or other formatting elements. **Issue:** When using the Notion DB Loader, the text of `rich_text` and `title` properties was truncated after the first element, because the loader read only the first element of each array. **Dependencies:** No new dependencies are required for this change. --------- Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com> Co-authored-by: Erick Friis <erick@langchain.dev>
- Loading branch information
1 parent
b2102b8
commit 8f5e72d
Showing
2 changed files
with
146 additions
and
8 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
138 changes: 138 additions & 0 deletions
138
libs/community/tests/unit_tests/document_loaders/test_notiondb_loader.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,138 @@ | ||
from unittest.mock import Mock, patch | ||
|
||
from langchain_core.documents import Document | ||
|
||
from langchain_community.document_loaders import NotionDBLoader | ||
|
||
|
||
class TestNotionDBLoader:
    """Unit tests covering multi-element rich text in ``NotionDBLoader``."""

    def setup_method(self) -> None:
        """Create a loader with placeholder credentials for each test."""
        self.loader = NotionDBLoader(
            integration_token="fake_token",
            database_id="fake_db_id",
        )

    def test_concatenate_rich_text(self) -> None:
        """Every ``plain_text`` fragment is joined, in order, into one string."""
        fragments = [{"plain_text": piece} for piece in ("Hello ", "world", "!")]

        joined = self.loader._concatenate_rich_text(fragments)

        assert joined == "Hello world!"

    @patch("langchain_community.document_loaders.notiondb.NotionDBLoader._request")
    @patch("langchain_community.document_loaders.notiondb.NotionDBLoader._load_blocks")
    def test_load_page_with_rich_text(
        self, mock_load_blocks: Mock, mock_request: Mock
    ) -> None:
        """A three-element ``rich_text`` property ends up whole in the metadata.

        ``_load_blocks`` and ``_request`` are patched; the page content is the
        canned value returned by the ``_load_blocks`` mock.
        """
        mock_load_blocks.return_value = "Mocked block content"

        title_property = {
            "type": "title",
            "title": [{"plain_text": "Test Title"}],
        }
        description_property = {
            "type": "rich_text",
            "rich_text": [
                {"plain_text": "This is "},
                {"plain_text": "a test"},
                {"plain_text": " description"},
            ],
        }
        page = {
            "id": "page_id",
            "properties": {
                "Title": title_property,
                "Description": description_property,
            },
        }
        wanted_metadata = {
            "title": "Test Title",
            "description": "This is a test description",
            "id": "page_id",
        }
        wanted = Document(
            page_content="Mocked block content",
            metadata=wanted_metadata,
        )

        loaded = self.loader.load_page(page)

        assert loaded == wanted

    @patch("langchain_community.document_loaders.notiondb.NotionDBLoader._request")
    @patch("langchain_community.document_loaders.notiondb.NotionDBLoader._load_blocks")
    def test_load_page_with_code_in_rich_text(
        self, mock_load_blocks: Mock, mock_request: Mock
    ) -> None:
        """A code-like fragment in the middle of ``rich_text`` is not dropped."""
        mock_load_blocks.return_value = "Mocked block content"

        answer_fragments = [
            {"plain_text": "Use "},
            {"plain_text": "print('Hello')"},
            {"plain_text": " to display text"},
        ]
        page = {
            "id": "page_id",
            "properties": {
                "Answer": {"type": "rich_text", "rich_text": answer_fragments},
            },
        }
        wanted = Document(
            page_content="Mocked block content",
            metadata={
                "answer": "Use print('Hello') to display text",
                "id": "page_id",
            },
        )

        loaded = self.loader.load_page(page)

        assert loaded == wanted

    @patch("langchain_community.document_loaders.notiondb.NotionDBLoader._request")
    @patch("langchain_community.document_loaders.notiondb.NotionDBLoader._load_blocks")
    def test_load(self, mock_load_blocks: Mock, mock_request: Mock) -> None:
        """``load`` yields one document per page in the mocked query result."""
        mock_load_blocks.return_value = "Mocked block content"

        # (page id, title text) for each page the mocked ``_request`` returns.
        pages = [
            ("page_id_1", "Test Title 1"),
            ("page_id_2", "Test Title 2"),
        ]
        mock_request.return_value = {
            "results": [
                {
                    "id": page_id,
                    "properties": {
                        "Title": {
                            "type": "title",
                            "title": [{"plain_text": title_text}],
                        }
                    },
                }
                for page_id, title_text in pages
            ],
            "has_more": False,
        }
        wanted = [
            Document(
                page_content="Mocked block content",
                metadata={"title": title_text, "id": page_id},
            )
            for page_id, title_text in pages
        ]

        loaded = self.loader.load()

        assert loaded == wanted