Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

text-splitters: Add JSFrameworkTextSplitter for Handling JavaScript Framework Code #28972

Draft
wants to merge 6 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions libs/text-splitters/langchain_text_splitters/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
HTMLSemanticPreservingSplitter,
)
from langchain_text_splitters.json import RecursiveJsonSplitter
from langchain_text_splitters.jsx import JSFrameworkTextSplitter
from langchain_text_splitters.konlpy import KonlpyTextSplitter
from langchain_text_splitters.latex import LatexTextSplitter
from langchain_text_splitters.markdown import (
Expand All @@ -60,6 +61,7 @@
"RecursiveCharacterTextSplitter",
"RecursiveJsonSplitter",
"LatexTextSplitter",
"JSFrameworkTextSplitter",
"PythonCodeTextSplitter",
"KonlpyTextSplitter",
"SpacyTextSplitter",
Expand Down
124 changes: 124 additions & 0 deletions libs/text-splitters/langchain_text_splitters/jsx.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
import re
from typing import Any, List, Optional

from langchain_text_splitters import RecursiveCharacterTextSplitter


class JSFrameworkTextSplitter(RecursiveCharacterTextSplitter):
    """Text splitter that handles React (JSX), Vue, and Svelte code.

    This splitter extends RecursiveCharacterTextSplitter to handle
    React (JSX), Vue, and Svelte code by:
    1. Detecting and extracting custom component tags from the text
    2. Using those tags as additional separators along with standard JS syntax

    The splitter combines:
    - Custom component tags as separators (e.g. <Component, <div)
    - JavaScript syntax elements (function, const, if, etc)
    - Standard text splitting on newlines

    This allows chunks to break at natural boundaries in
    React, Vue, and Svelte component code.
    """

    # Matches opening and self-closing tags such as ``<div>`` or ``<Foo bar />``.
    # The first character after ``<`` may not be ``/`` (closing tag), whitespace
    # (comparison operators such as ``a < b``) or ``>`` (the JSX fragment ``<>``,
    # which would otherwise start a match that swallows the next real tag).
    _OPENING_TAG_PATTERN = re.compile(r"<[^/\s>][^>]*>")

    # JavaScript keyword boundaries, tried before any component tag.
    _JS_SEPARATORS = (
        "\nexport ",
        " export ",
        "\nfunction ",
        "\nasync function ",
        " async function ",
        "\nconst ",
        "\nlet ",
        "\nvar ",
        "\nclass ",
        " class ",
        "\nif ",
        " if ",
        "\nfor ",
        " for ",
        "\nwhile ",
        " while ",
        "\nswitch ",
        " switch ",
        "\ncase ",
        " case ",
        "\ndefault ",
        " default ",
    )

    # Lowest-priority separators, tried after keywords and component tags.
    _FALLBACK_SEPARATORS = ("<>", "\n\n", "&&\n", "||\n")

    def __init__(
        self,
        separators: Optional[List[str]] = None,
        chunk_size: int = 2000,
        chunk_overlap: int = 0,
        **kwargs: Any,
    ) -> None:
        """Initialize the JS Framework text splitter.

        Args:
            separators: Optional list of custom separator strings to use.
                They take priority over the built-in JS and tag separators.
            chunk_size: Maximum size of chunks to return
            chunk_overlap: Overlap in characters between chunks
            **kwargs: Additional arguments to pass to parent class
        """
        super().__init__(chunk_size=chunk_size, chunk_overlap=chunk_overlap, **kwargs)
        # Copy so later changes to the caller's list cannot affect this instance.
        self._separators = list(separators) if separators else []

    def split_text(self, text: str) -> List[str]:
        """Split text into chunks.

        This method splits the text into chunks by:
        - Extracting unique opening component tags using regex
        - Creating separators list with extracted tags and JS separators
        - Splitting the text using the separators by calling the parent class method

        Chunk overlap is left entirely to the parent class; no extra overlap is
        added here.

        Args:
            text: String containing code to split

        Returns:
            List of text chunks split on component and JS boundaries
        """
        user_separators = self._separators
        # The parent implementation reads ``self._separators``, so install the
        # per-document list only for the duration of this call. Restoring it
        # afterwards keeps repeated calls independent: the list no longer grows
        # on every call, and tags found in one document do not leak into the next.
        self._separators = [
            *user_separators,
            *self._JS_SEPARATORS,
            *self._component_separators(text),
            *self._FALLBACK_SEPARATORS,
        ]
        try:
            return super().split_text(text)
        finally:
            self._separators = user_separators

    def _component_separators(self, text: str) -> List[str]:
        """Return one ``<tag`` separator per distinct opening tag found in *text*."""
        tag_names = (
            # Keep only the tag name: drop attributes, then the angle brackets.
            match.split(None, 1)[0].strip("<>\n")
            for match in self._OPENING_TAG_PATTERN.findall(text)
        )
        # dict.fromkeys removes duplicates while preserving first-appearance
        # order. Going through a set made the separator order, and therefore the
        # resulting chunks, vary between interpreter runs.
        # Empty names are skipped so that a bare "<" never becomes a separator.
        return [f"<{name}" for name in dict.fromkeys(tag_names) if name]
139 changes: 139 additions & 0 deletions libs/text-splitters/tests/unit_tests/test_text_splitters.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
HTMLSemanticPreservingSplitter,
)
from langchain_text_splitters.json import RecursiveJsonSplitter
from langchain_text_splitters.jsx import JSFrameworkTextSplitter
from langchain_text_splitters.markdown import (
ExperimentalMarkdownSyntaxTextSplitter,
MarkdownHeaderTextSplitter,
Expand Down Expand Up @@ -413,6 +414,144 @@ def test_python_text_splitter() -> None:
assert splits == expected_splits


# React fixture for test_jsx_text_splitter: two imports, a function component
# using React.useState, a JSX return block (div, h1, button, OtherComponent),
# and a default export.
FAKE_JSX_TEXT = """
import React from 'react';
import OtherComponent from './OtherComponent';

function MyComponent() {
const [count, setCount] = React.useState(0);

const handleClick = () => {
setCount(count + 1);
};

return (
<div>
<h1>Counter: {count}</h1>
<button onClick={handleClick}>
Increment
</button>
<OtherComponent />
</div>
);
}

export default MyComponent;
"""


def test_jsx_text_splitter() -> None:
    """Check that a React component is chunked at JS and JSX tag boundaries."""
    expected_splits = [
        "\nimport React from 'react';\n"
        "import OtherComponent from './OtherComponent';\n",
        "\nfunction MyComponent() {\n const [count, setCount] = React.useState(0);",
        "\n\n const handleClick = () => {\n setCount(count + 1);\n };",
        "return (",
        "<div>",
        "<h1>Counter: {count}</h1>\n ",
        "<button onClick={handleClick}>\n Increment\n </button>\n ",
        "<OtherComponent />\n </div>\n );\n}\n",
        "export default MyComponent;",
    ]

    jsx_splitter = JSFrameworkTextSplitter(chunk_size=30, chunk_overlap=0)
    actual_splits = jsx_splitter.split_text(FAKE_JSX_TEXT)

    # Whitespace at the edges of a chunk is not significant for this check.
    stripped_actual = [chunk.strip() for chunk in actual_splits]
    stripped_expected = [chunk.strip() for chunk in expected_splits]
    assert stripped_actual == stripped_expected


# Vue single-file-component fixture for test_vue_text_splitter: a <template>
# section, a <script> section with a default-exported options object, and a
# <style> section.
FAKE_VUE_TEXT = """
<template>
<div>
<h1>{{ title }}</h1>
<button @click="increment">
Count is: {{ count }}
</button>
</div>
</template>

<script>
export default {
data() {
return {
title: 'Counter App',
count: 0
}
},
methods: {
increment() {
this.count++
}
}
}
</script>

<style>
button {
color: blue;
}
</style>
"""


def test_vue_text_splitter() -> None:
    """Check that a Vue single-file component is chunked at tag and JS boundaries."""
    expected_splits = [
        "<template>",
        "<div>",
        "<h1>{{ title }}</h1>",
        '<button @click="increment">\n Count is: {{ count }}\n'
        " </button>\n </div>\n</template>",
        "<script>",
        "export",
        " default {\n data() {\n return {\n title: 'Counter App',\n "
        "count: 0\n }\n },\n methods: {\n increment() {\n "
        "this.count++\n }\n }\n}\n</script>",
        "<style>\nbutton {\n color: blue;\n}\n</style>",
    ]

    vue_splitter = JSFrameworkTextSplitter(chunk_size=30, chunk_overlap=0)
    actual_splits = vue_splitter.split_text(FAKE_VUE_TEXT)

    # Whitespace at the edges of a chunk is not significant for this check.
    stripped_actual = [chunk.strip() for chunk in actual_splits]
    stripped_expected = [chunk.strip() for chunk in expected_splits]
    assert stripped_actual == stripped_expected


# Svelte component fixture for test_svelte_text_splitter: a <script> section
# with a counter and an increment function, a <main> markup section, and a
# <style> section.
FAKE_SVELTE_TEXT = """
<script>
let count = 0

function increment() {
count += 1
}
</script>

<main>
<h1>Counter App</h1>
<button on:click={increment}>
Count is: {count}
</button>
</main>

<style>
button {
color: blue;
}
</style>
"""


def test_svelte_text_splitter() -> None:
    """Check that a Svelte component is chunked at tag and JS boundaries."""
    expected_splits = [
        "<script>\n let count = 0",
        "\n\n function increment() {\n count += 1\n }\n</script>",
        "<main>",
        "<h1>Counter App</h1>",
        "<button on:click={increment}>\n Count is: {count}\n </button>\n</main>",
        "<style>\n button {\n color: blue;\n }\n</style>",
    ]

    svelte_splitter = JSFrameworkTextSplitter(chunk_size=30, chunk_overlap=0)
    actual_splits = svelte_splitter.split_text(FAKE_SVELTE_TEXT)

    # Whitespace at the edges of a chunk is not significant for this check.
    stripped_actual = [chunk.strip() for chunk in actual_splits]
    stripped_expected = [chunk.strip() for chunk in expected_splits]
    assert stripped_actual == stripped_expected


CHUNK_SIZE = 16


Expand Down
Loading