Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fixes: add extraction limit to Extractors #1673

Merged
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 36 additions & 26 deletions src/ragas/testset/transforms/extractors/llm_based.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,11 @@
from ragas.testset.transforms.base import LLMBasedExtractor


# define prompts
class TextWithExtractionLimit(BaseModel):
text: str
max_num: int = 10


class SummaryExtractorPrompt(PydanticPrompt[StringIO, StringIO]):
instruction: str = "Summarize the given text in less than 10 sentences."
input_model: t.Type[StringIO] = StringIO
Expand All @@ -29,14 +33,15 @@ class Keyphrases(BaseModel):
keyphrases: t.List[str]


class KeyphrasesExtractorPrompt(PydanticPrompt[StringIO, Keyphrases]):
instruction: str = "Extract top 5 keyphrases from the given text."
input_model: t.Type[StringIO] = StringIO
class KeyphrasesExtractorPrompt(PydanticPrompt[TextWithExtractionLimit, Keyphrases]):
instruction: str = "Extract top max_num keyphrases from the given text."
input_model: t.Type[TextWithExtractionLimit] = TextWithExtractionLimit
output_model: t.Type[Keyphrases] = Keyphrases
examples: t.List[t.Tuple[StringIO, Keyphrases]] = [
examples: t.List[t.Tuple[TextWithExtractionLimit, Keyphrases]] = [
(
StringIO(
text="Artificial intelligence\n\nArtificial intelligence is transforming various industries by automating tasks that previously required human intelligence. From healthcare to finance, AI is being used to analyze vast amounts of data quickly and accurately. This technology is also driving innovations in areas like self-driving cars and personalized recommendations."
TextWithExtractionLimit(
text="Artificial intelligence\n\nArtificial intelligence is transforming various industries by automating tasks that previously required human intelligence. From healthcare to finance, AI is being used to analyze vast amounts of data quickly and accurately. This technology is also driving innovations in areas like self-driving cars and personalized recommendations.",
max_num=5,
),
Keyphrases(
keyphrases=[
Expand Down Expand Up @@ -66,17 +71,20 @@ class TitleExtractorPrompt(PydanticPrompt[StringIO, StringIO]):


class Headlines(BaseModel):
headlines: t.List[str]
headlines: t.List[str]


class HeadlinesExtractorPrompt(PydanticPrompt[StringIO, Headlines]):
instruction: str = "Extract only level 2 and level 3 headings from the given text."
class HeadlinesExtractorPrompt(PydanticPrompt[TextWithExtractionLimit, Headlines]):
instruction: str = (
"Extract the most important max_num headlines from the given text that can be used to split the text into independent sections."
"Focus on Level 2 and Level 3 headings."
)

input_model: t.Type[StringIO] = StringIO
input_model: t.Type[TextWithExtractionLimit] = TextWithExtractionLimit
output_model: t.Type[Headlines] = Headlines
examples: t.List[t.Tuple[StringIO, Headlines]] = [
examples: t.List[t.Tuple[TextWithExtractionLimit, Headlines]] = [
(
StringIO(
TextWithExtractionLimit(
text="""\
Introduction
Overview of the topic...
Expand All @@ -98,30 +106,24 @@ class HeadlinesExtractorPrompt(PydanticPrompt[StringIO, Headlines]):

Conclusion
Final remarks and summary.
"""
""",
max_num=6,
),
Headlines(
headlines=[
"Introduction",
"Main Concepts",
"Detailed Analysis",
"Subsection: Specialized Techniques",
"Future Directions",
"Subsection: Next Steps in Research",
]
],)
),
),
]


class NEROutput(BaseModel):
entities: t.List[str]


class TextWithExtractionLimit(BaseModel):
text: str
max_num: int = 10


class NERPrompt(PydanticPrompt[TextWithExtractionLimit, NEROutput]):
instruction: str = (
"Extract the named entities from the given text, limiting the output to the top entities. "
Expand Down Expand Up @@ -190,12 +192,15 @@ class KeyphrasesExtractor(LLMBasedExtractor):

property_name: str = "keyphrases"
prompt: KeyphrasesExtractorPrompt = KeyphrasesExtractorPrompt()
max_num: int = 5

async def extract(self, node: Node) -> t.Tuple[str, t.Any]:
node_text = node.get_property("page_content")
if node_text is None:
return self.property_name, None
result = await self.prompt.generate(self.llm, data=StringIO(text=node_text))
result = await self.prompt.generate(
self.llm, data=TextWithExtractionLimit(text=node_text, max_num=self.max_num)
)
return self.property_name, result.keyphrases


Expand Down Expand Up @@ -238,12 +243,15 @@ class HeadlinesExtractor(LLMBasedExtractor):

property_name: str = "headlines"
prompt: HeadlinesExtractorPrompt = HeadlinesExtractorPrompt()
max_num: int = 5

async def extract(self, node: Node) -> t.Tuple[str, t.Any]:
node_text = node.get_property("page_content")
if node_text is None:
return self.property_name, None
result = await self.prompt.generate(self.llm, data=StringIO(text=node_text))
result = await self.prompt.generate(
self.llm, data=TextWithExtractionLimit(text=node_text, max_num=self.max_num)
)
if result is None:
return self.property_name, None
return self.property_name, result.headlines
Expand Down Expand Up @@ -282,7 +290,9 @@ class TopicDescription(BaseModel):


class TopicDescriptionPrompt(PydanticPrompt[StringIO, TopicDescription]):
instruction: str = "Provide a concise description of the main topic(s) discussed in the following text."
instruction: str = (
"Provide a concise description of the main topic(s) discussed in the following text."
)
input_model: t.Type[StringIO] = StringIO
output_model: t.Type[TopicDescription] = TopicDescription
examples: t.List[t.Tuple[StringIO, TopicDescription]] = [
Expand Down
Loading