diff --git a/src/ragas/testset/transforms/extractors/llm_based.py b/src/ragas/testset/transforms/extractors/llm_based.py
index dca9c66f4..c7480e926 100644
--- a/src/ragas/testset/transforms/extractors/llm_based.py
+++ b/src/ragas/testset/transforms/extractors/llm_based.py
@@ -8,7 +8,11 @@ from ragas.testset.transforms.base import LLMBasedExtractor
 
 
-# define prompts
+class TextWithExtractionLimit(BaseModel):
+    text: str
+    max_num: int = 10
+
+
 class SummaryExtractorPrompt(PydanticPrompt[StringIO, StringIO]):
     instruction: str = "Summarize the given text in less than 10 sentences."
     input_model: t.Type[StringIO] = StringIO
@@ -29,14 +33,15 @@ class Keyphrases(BaseModel):
     keyphrases: t.List[str]
 
 
-class KeyphrasesExtractorPrompt(PydanticPrompt[StringIO, Keyphrases]):
-    instruction: str = "Extract top 5 keyphrases from the given text."
-    input_model: t.Type[StringIO] = StringIO
+class KeyphrasesExtractorPrompt(PydanticPrompt[TextWithExtractionLimit, Keyphrases]):
+    instruction: str = "Extract top max_num keyphrases from the given text."
+    input_model: t.Type[TextWithExtractionLimit] = TextWithExtractionLimit
     output_model: t.Type[Keyphrases] = Keyphrases
-    examples: t.List[t.Tuple[StringIO, Keyphrases]] = [
+    examples: t.List[t.Tuple[TextWithExtractionLimit, Keyphrases]] = [
         (
-            StringIO(
-                text="Artificial intelligence\n\nArtificial intelligence is transforming various industries by automating tasks that previously required human intelligence. From healthcare to finance, AI is being used to analyze vast amounts of data quickly and accurately. This technology is also driving innovations in areas like self-driving cars and personalized recommendations."
+            TextWithExtractionLimit(
+                text="Artificial intelligence\n\nArtificial intelligence is transforming various industries by automating tasks that previously required human intelligence. From healthcare to finance, AI is being used to analyze vast amounts of data quickly and accurately. This technology is also driving innovations in areas like self-driving cars and personalized recommendations.",
+                max_num=5,
             ),
             Keyphrases(
                 keyphrases=[
@@ -66,17 +71,20 @@ class TitleExtractorPrompt(PydanticPrompt[StringIO, StringIO]):
 
 
 class Headlines(BaseModel):
-    headlines: t.List[str]
+    headlines: t.List[str]
 
 
-class HeadlinesExtractorPrompt(PydanticPrompt[StringIO, Headlines]):
-    instruction: str = "Extract only level 2 and level 3 headings from the given text."
+class HeadlinesExtractorPrompt(PydanticPrompt[TextWithExtractionLimit, Headlines]):
+    instruction: str = (
+        "Extract the most important max_num headlines from the given text that can be used to split the text into independent sections."
+        "Focus on Level 2 and Level 3 headings."
+    )
-    input_model: t.Type[StringIO] = StringIO
+    input_model: t.Type[TextWithExtractionLimit] = TextWithExtractionLimit
     output_model: t.Type[Headlines] = Headlines
-    examples: t.List[t.Tuple[StringIO, Headlines]] = [
+    examples: t.List[t.Tuple[TextWithExtractionLimit, Headlines]] = [
         (
-            StringIO(
+            TextWithExtractionLimit(
                 text="""\
 Introduction
 Overview of the topic...
 
@@ -98,18 +106,17 @@ class HeadlinesExtractorPrompt(PydanticPrompt[StringIO, Headlines]):
 
 Conclusion
 Final remarks and summary.
-                """
+                """,
+                max_num=6,
             ),
             Headlines(
                 headlines=[
+                    "Introduction",
                     "Main Concepts",
                     "Detailed Analysis",
-                    "Subsection: Specialized Techniques",
                     "Future Directions",
-                    "Subsection: Next Steps in Research",
-                ]
+                ],)
             ),
-        ),
     ]
 
 
@@ -117,11 +124,6 @@ class NEROutput(BaseModel):
     entities: t.List[str]
 
 
-class TextWithExtractionLimit(BaseModel):
-    text: str
-    max_num: int = 10
-
-
 class NERPrompt(PydanticPrompt[TextWithExtractionLimit, NEROutput]):
     instruction: str = (
         "Extract the named entities from the given text, limiting the output to the top entities. "
@@ -190,12 +192,15 @@ class KeyphrasesExtractor(LLMBasedExtractor):
 
     property_name: str = "keyphrases"
     prompt: KeyphrasesExtractorPrompt = KeyphrasesExtractorPrompt()
+    max_num: int = 5
 
     async def extract(self, node: Node) -> t.Tuple[str, t.Any]:
         node_text = node.get_property("page_content")
         if node_text is None:
             return self.property_name, None
-        result = await self.prompt.generate(self.llm, data=StringIO(text=node_text))
+        result = await self.prompt.generate(
+            self.llm, data=TextWithExtractionLimit(text=node_text, max_num=self.max_num)
+        )
         return self.property_name, result.keyphrases
 
 
@@ -238,12 +243,15 @@ class HeadlinesExtractor(LLMBasedExtractor):
 
     property_name: str = "headlines"
     prompt: HeadlinesExtractorPrompt = HeadlinesExtractorPrompt()
+    max_num: int = 5
 
     async def extract(self, node: Node) -> t.Tuple[str, t.Any]:
         node_text = node.get_property("page_content")
        if node_text is None:
             return self.property_name, None
-        result = await self.prompt.generate(self.llm, data=StringIO(text=node_text))
+        result = await self.prompt.generate(
+            self.llm, data=TextWithExtractionLimit(text=node_text, max_num=self.max_num)
+        )
         if result is None:
             return self.property_name, None
         return self.property_name, result.headlines
@@ -282,7 +290,9 @@ class TopicDescription(BaseModel):
 
 
 class TopicDescriptionPrompt(PydanticPrompt[StringIO, TopicDescription]):
-    instruction: str = "Provide a concise description of the main topic(s) discussed in the following text."
+    instruction: str = (
+        "Provide a concise description of the main topic(s) discussed in the following text."
+    )
     input_model: t.Type[StringIO] = StringIO
     output_model: t.Type[TopicDescription] = TopicDescription
     examples: t.List[t.Tuple[StringIO, TopicDescription]] = [