Skip to content

Commit

Permalink
fixes: add extraction limit to Extractors (#1673)
Browse files (browse the repository at this point in the history)
  • Loading branch information
shahules786 authored Nov 14, 2024
1 parent 22de85b commit 33fbdfc
Showing 1 changed file with 36 additions and 26 deletions.
62 changes: 36 additions & 26 deletions src/ragas/testset/transforms/extractors/llm_based.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -8,7 +8,11 @@
from ragas.testset.transforms.base import LLMBasedExtractor


# define prompts
class TextWithExtractionLimit(BaseModel):
text: str
max_num: int = 10


class SummaryExtractorPrompt(PydanticPrompt[StringIO, StringIO]):
instruction: str = "Summarize the given text in less than 10 sentences."
input_model: t.Type[StringIO] = StringIO
Expand All @@ -29,14 +33,15 @@ class Keyphrases(BaseModel):
keyphrases: t.List[str]


class KeyphrasesExtractorPrompt(PydanticPrompt[StringIO, Keyphrases]):
instruction: str = "Extract top 5 keyphrases from the given text."
input_model: t.Type[StringIO] = StringIO
class KeyphrasesExtractorPrompt(PydanticPrompt[TextWithExtractionLimit, Keyphrases]):
instruction: str = "Extract top max_num keyphrases from the given text."
input_model: t.Type[TextWithExtractionLimit] = TextWithExtractionLimit
output_model: t.Type[Keyphrases] = Keyphrases
examples: t.List[t.Tuple[StringIO, Keyphrases]] = [
examples: t.List[t.Tuple[TextWithExtractionLimit, Keyphrases]] = [
(
StringIO(
text="Artificial intelligence\n\nArtificial intelligence is transforming various industries by automating tasks that previously required human intelligence. From healthcare to finance, AI is being used to analyze vast amounts of data quickly and accurately. This technology is also driving innovations in areas like self-driving cars and personalized recommendations."
TextWithExtractionLimit(
text="Artificial intelligence\n\nArtificial intelligence is transforming various industries by automating tasks that previously required human intelligence. From healthcare to finance, AI is being used to analyze vast amounts of data quickly and accurately. This technology is also driving innovations in areas like self-driving cars and personalized recommendations.",
max_num=5,
),
Keyphrases(
keyphrases=[
Expand Down Expand Up @@ -66,17 +71,20 @@ class TitleExtractorPrompt(PydanticPrompt[StringIO, StringIO]):


class Headlines(BaseModel):
headlines: t.List[str]
headlines: t.List[str]


class HeadlinesExtractorPrompt(PydanticPrompt[StringIO, Headlines]):
instruction: str = "Extract only level 2 and level 3 headings from the given text."
class HeadlinesExtractorPrompt(PydanticPrompt[TextWithExtractionLimit, Headlines]):
# Prompt instruction for headline extraction. Adjacent string literals are
# concatenated with no separator, so the first fragment must end with a space;
# otherwise the prompt reads "...independent sections.Focus on Level 2...".
instruction: str = (
    "Extract the most important max_num headlines from the given text that can be used to split the text into independent sections. "
    "Focus on Level 2 and Level 3 headings."
)

input_model: t.Type[StringIO] = StringIO
input_model: t.Type[TextWithExtractionLimit] = TextWithExtractionLimit
output_model: t.Type[Headlines] = Headlines
examples: t.List[t.Tuple[StringIO, Headlines]] = [
examples: t.List[t.Tuple[TextWithExtractionLimit, Headlines]] = [
(
StringIO(
TextWithExtractionLimit(
text="""\
Introduction
Overview of the topic...
Expand All @@ -98,30 +106,24 @@ class HeadlinesExtractorPrompt(PydanticPrompt[StringIO, Headlines]):
Conclusion
Final remarks and summary.
"""
""",
max_num=6,
),
Headlines(
headlines=[
"Introduction",
"Main Concepts",
"Detailed Analysis",
"Subsection: Specialized Techniques",
"Future Directions",
"Subsection: Next Steps in Research",
]
],)
),
),
]


class NEROutput(BaseModel):
entities: t.List[str]


class TextWithExtractionLimit(BaseModel):
text: str
max_num: int = 10


class NERPrompt(PydanticPrompt[TextWithExtractionLimit, NEROutput]):
instruction: str = (
"Extract the named entities from the given text, limiting the output to the top entities. "
Expand Down Expand Up @@ -190,12 +192,15 @@ class KeyphrasesExtractor(LLMBasedExtractor):

property_name: str = "keyphrases"
prompt: KeyphrasesExtractorPrompt = KeyphrasesExtractorPrompt()
max_num: int = 5

async def extract(self, node: Node) -> t.Tuple[str, t.Any]:
node_text = node.get_property("page_content")
if node_text is None:
return self.property_name, None
result = await self.prompt.generate(self.llm, data=StringIO(text=node_text))
result = await self.prompt.generate(
self.llm, data=TextWithExtractionLimit(text=node_text, max_num=self.max_num)
)
return self.property_name, result.keyphrases


Expand Down Expand Up @@ -238,12 +243,15 @@ class HeadlinesExtractor(LLMBasedExtractor):

property_name: str = "headlines"
prompt: HeadlinesExtractorPrompt = HeadlinesExtractorPrompt()
max_num: int = 5

async def extract(self, node: Node) -> t.Tuple[str, t.Any]:
node_text = node.get_property("page_content")
if node_text is None:
return self.property_name, None
result = await self.prompt.generate(self.llm, data=StringIO(text=node_text))
result = await self.prompt.generate(
self.llm, data=TextWithExtractionLimit(text=node_text, max_num=self.max_num)
)
if result is None:
return self.property_name, None
return self.property_name, result.headlines
Expand Down Expand Up @@ -282,7 +290,9 @@ class TopicDescription(BaseModel):


class TopicDescriptionPrompt(PydanticPrompt[StringIO, TopicDescription]):
instruction: str = "Provide a concise description of the main topic(s) discussed in the following text."
instruction: str = (
"Provide a concise description of the main topic(s) discussed in the following text."
)
input_model: t.Type[StringIO] = StringIO
output_model: t.Type[TopicDescription] = TopicDescription
examples: t.List[t.Tuple[StringIO, TopicDescription]] = [
Expand Down

0 comments on commit 33fbdfc

Please sign in to comment.