Skip to content

Commit 2c98599

Browse files
authored
text-splitters: fix stale header metadata in ExperimentalMarkdownSyntaxTextSplitter (#31622)
**Description:** Previously, when transitioning from a deeper Markdown header (e.g., `###`) to a shallower one (e.g., `##`), the `ExperimentalMarkdownSyntaxTextSplitter` retained the deeper header in the metadata. This commit updates the `_resolve_header_stack` method to remove headers at the same or deeper levels before appending the current header. As a result, each chunk now reflects only the active header context, which fixes unexpected metadata leakage across sections in nested Markdown documents.

Additionally, test cases have been updated to:
- Validate correct header resolution and metadata assignment.
- Cover edge cases with nested headers and horizontal rules.

**Issue:** Fixes [#31596](#31596)
**Dependencies:** None
**Twitter handle:** [_RaghuKapur](https://twitter.com/_RaghuKapur)
**LinkedIn:** [https://www.linkedin.com/in/raghukapur/](https://www.linkedin.com/in/raghukapur/)
1 parent 9d4d258 commit 2c98599

File tree

2 files changed

+62
-11
lines changed

2 files changed

+62
-11
lines changed

libs/text-splitters/langchain_text_splitters/markdown.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -376,10 +376,10 @@ def split_text(self, text: str) -> List[Document]:
376376

377377
def _resolve_header_stack(self, header_depth: int, header_text: str) -> None:
378378
for i, (depth, _) in enumerate(self.current_header_stack):
379-
if depth == header_depth:
380-
self.current_header_stack[i] = (header_depth, header_text)
381-
self.current_header_stack = self.current_header_stack[: i + 1]
382-
return
379+
if depth >= header_depth:
380+
# Truncate everything from this level onward
381+
self.current_header_stack = self.current_header_stack[:i]
382+
break
383383
self.current_header_stack.append((header_depth, header_text))
384384

385385
def _resolve_code_chunk(self, current_line: str, raw_lines: List[str]) -> str:

libs/text-splitters/tests/unit_tests/test_text_splitters.py

Lines changed: 58 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1464,6 +1464,10 @@ def test_md_header_text_splitter_with_invisible_characters(characters: str) -> N
14641464
"Content for header 1\n"
14651465
"## Header 2\n"
14661466
"Content for header 2\n"
1467+
"### Header 3\n"
1468+
"Content for header 3\n"
1469+
"## Header 2 Again\n"
1470+
"This should be tagged with Header 1 and Header 2 Again\n"
14671471
"```python\n"
14681472
"def func_definition():\n"
14691473
" print('Keep the whitespace consistent')\n"
@@ -1491,6 +1495,18 @@ def test_experimental_markdown_syntax_text_splitter() -> None:
14911495
page_content="Content for header 2\n",
14921496
metadata={"Header 1": "My Header 1", "Header 2": "Header 2"},
14931497
),
1498+
Document(
1499+
page_content="Content for header 3\n",
1500+
metadata={
1501+
"Header 1": "My Header 1",
1502+
"Header 2": "Header 2",
1503+
"Header 3": "Header 3",
1504+
},
1505+
),
1506+
Document(
1507+
page_content="This should be tagged with Header 1 and Header 2 Again\n",
1508+
metadata={"Header 1": "My Header 1", "Header 2": "Header 2 Again"},
1509+
),
14941510
Document(
14951511
page_content=(
14961512
"```python\ndef func_definition():\n "
@@ -1499,7 +1515,7 @@ def test_experimental_markdown_syntax_text_splitter() -> None:
14991515
metadata={
15001516
"Code": "python",
15011517
"Header 1": "My Header 1",
1502-
"Header 2": "Header 2",
1518+
"Header 2": "Header 2 Again",
15031519
},
15041520
),
15051521
Document(
@@ -1530,7 +1546,15 @@ def test_experimental_markdown_syntax_text_splitter_header_configuration() -> No
15301546

15311547
expected_output = [
15321548
Document(
1533-
page_content="Content for header 1\n## Header 2\nContent for header 2\n",
1549+
page_content=(
1550+
"Content for header 1\n"
1551+
"## Header 2\n"
1552+
"Content for header 2\n"
1553+
"### Header 3\n"
1554+
"Content for header 3\n"
1555+
"## Header 2 Again\n"
1556+
"This should be tagged with Header 1 and Header 2 Again\n"
1557+
),
15341558
metadata={"Encabezamiento 1": "My Header 1"},
15351559
),
15361560
Document(
@@ -1571,6 +1595,21 @@ def test_experimental_markdown_syntax_text_splitter_with_headers() -> None:
15711595
page_content="## Header 2\nContent for header 2\n",
15721596
metadata={"Header 1": "My Header 1", "Header 2": "Header 2"},
15731597
),
1598+
Document(
1599+
page_content="### Header 3\nContent for header 3\n",
1600+
metadata={
1601+
"Header 1": "My Header 1",
1602+
"Header 2": "Header 2",
1603+
"Header 3": "Header 3",
1604+
},
1605+
),
1606+
Document(
1607+
page_content=(
1608+
"## Header 2 Again\n"
1609+
"This should be tagged with Header 1 and Header 2 Again\n"
1610+
),
1611+
metadata={"Header 1": "My Header 1", "Header 2": "Header 2 Again"},
1612+
),
15741613
Document(
15751614
page_content=(
15761615
"```python\ndef func_definition():\n "
@@ -1579,7 +1618,7 @@ def test_experimental_markdown_syntax_text_splitter_with_headers() -> None:
15791618
metadata={
15801619
"Code": "python",
15811620
"Header 1": "My Header 1",
1582-
"Header 2": "Header 2",
1621+
"Header 2": "Header 2 Again",
15831622
},
15841623
),
15851624
Document(
@@ -1614,36 +1653,48 @@ def test_experimental_markdown_syntax_text_splitter_split_lines() -> None:
16141653
page_content="Content for header 2",
16151654
metadata={"Header 1": "My Header 1", "Header 2": "Header 2"},
16161655
),
1656+
Document(
1657+
page_content="Content for header 3",
1658+
metadata={
1659+
"Header 1": "My Header 1",
1660+
"Header 2": "Header 2",
1661+
"Header 3": "Header 3",
1662+
},
1663+
),
1664+
Document(
1665+
page_content="This should be tagged with Header 1 and Header 2 Again",
1666+
metadata={"Header 1": "My Header 1", "Header 2": "Header 2 Again"},
1667+
),
16171668
Document(
16181669
page_content="```python",
16191670
metadata={
16201671
"Code": "python",
16211672
"Header 1": "My Header 1",
1622-
"Header 2": "Header 2",
1673+
"Header 2": "Header 2 Again",
16231674
},
16241675
),
16251676
Document(
16261677
page_content="def func_definition():",
16271678
metadata={
16281679
"Code": "python",
16291680
"Header 1": "My Header 1",
1630-
"Header 2": "Header 2",
1681+
"Header 2": "Header 2 Again",
16311682
},
16321683
),
16331684
Document(
16341685
page_content=" print('Keep the whitespace consistent')",
16351686
metadata={
16361687
"Code": "python",
16371688
"Header 1": "My Header 1",
1638-
"Header 2": "Header 2",
1689+
"Header 2": "Header 2 Again",
16391690
},
16401691
),
16411692
Document(
16421693
page_content="```",
16431694
metadata={
16441695
"Code": "python",
16451696
"Header 1": "My Header 1",
1646-
"Header 2": "Header 2",
1697+
"Header 2": "Header 2 Again",
16471698
},
16481699
),
16491700
Document(

0 commit comments

Comments
 (0)