From 4d9cbe70cee1e774f8a95531e4e2dfbf9be893c7 Mon Sep 17 00:00:00 2001 From: ncoop57 Date: Tue, 15 Oct 2024 07:59:23 -0500 Subject: [PATCH] Add ability to change temp parameter and clean up examples --- fastdata/core.py | 13 +- nbs/00_core.ipynb | 317 +++++++++++++++++++++++++++++++++------------- 2 files changed, 234 insertions(+), 96 deletions(-) diff --git a/fastdata/core.py b/fastdata/core.py index fb0f671..442f4f8 100644 --- a/fastdata/core.py +++ b/fastdata/core.py @@ -6,13 +6,13 @@ __all__ = ['FastData'] # %% ../nbs/00_core.ipynb 3 -import concurrent.futures - from claudette import * from fastcore.utils import * from ratelimit import limits, sleep_and_retry from tqdm import tqdm +import concurrent.futures + # %% ../nbs/00_core.ipynb 4 class FastData: def __init__(self, @@ -26,10 +26,10 @@ def set_rate_limit(self, calls: int, period: int): """Set a new rate limit.""" @sleep_and_retry @limits(calls=calls, period=period) - def rate_limited_call(prompt: str, schema, sp: str): + def rate_limited_call(prompt: str, schema, temp: float, sp: str): return self.cli.structured( prompt, - temp=1, + temp=temp, tools=schema, )[0] @@ -39,18 +39,19 @@ def generate(self, prompt_template: str, inputs: list[dict], schema, + temp: float = 1., sp: str = "You are a helpful assistant.", max_workers: int = 64) -> list[dict]: def process_input(input_data): try: prompt = prompt_template.format(**input_data) - response = self._rate_limited_call( + return self._rate_limited_call( prompt=prompt, schema=schema, + temp=temp, sp=sp ) - return response except Exception as e: print(f"Error processing input: {e}") return None diff --git a/nbs/00_core.ipynb b/nbs/00_core.ipynb index 5953f5b..5e1c168 100644 --- a/nbs/00_core.ipynb +++ b/nbs/00_core.ipynb @@ -25,7 +25,7 @@ "outputs": [], "source": [ "#| hide\n", - "from nbdev.showdoc import *" + "from IPython.display import Markdown" ] }, { @@ -35,12 +35,12 @@ "outputs": [], "source": [ "#| export\n", - "import concurrent.futures\n", - "\n", "from claudette import *\n", "from fastcore.utils import *\n", "from ratelimit import limits, sleep_and_retry\n", - "from tqdm import tqdm" + "from tqdm import tqdm\n", + "\n", + "import concurrent.futures" ] }, { @@ -62,10 +62,10 @@ " \"\"\"Set a new rate limit.\"\"\"\n", " @sleep_and_retry\n", " @limits(calls=calls, period=period)\n", - " def rate_limited_call(prompt: str, schema, sp: str):\n", + " def rate_limited_call(prompt: str, schema, temp: float, sp: str):\n", " return self.cli.structured(\n", " prompt,\n", - " temp=1,\n", + " temp=temp,\n", " tools=schema,\n", " )[0]\n", " \n", @@ -75,18 +75,19 @@ " prompt_template: str, \n", " inputs: list[dict], \n", " schema,\n", + " temp: float = 1.,\n", " sp: str = \"You are a helpful assistant.\",\n", " max_workers: int = 64) -> list[dict]:\n", " \n", " def process_input(input_data):\n", " try:\n", " prompt = prompt_template.format(**input_data)\n", - " response = self._rate_limited_call(\n", + " return self._rate_limited_call(\n", " prompt=prompt,\n", " schema=schema,\n", + " temp=temp,\n", " sp=sp\n", " )\n", - " return response\n", " except Exception as e:\n", " print(f\"Error processing input: {e}\")\n", " return None\n", @@ -101,6 +102,17 @@ " return results" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#| hide\n", + "def to_md(ss): return '\\n'.join(f'- {s}' for s in ss) \n", + "def show(ss): return Markdown(to_md(ss))" + ] + }, { "cell_type": "code", "execution_count": null, @@ -109,7 +121,7 @@ { "data": { "text/plain": [ - "__main__.Translation(english='Hello, how are you today?', german='Hallo, wie geht es Ihnen heute?')" + "Hello, how are you today? ➡ *Hola, ¿cómo estás hoy?*" ] }, "execution_count": null, @@ -119,12 +131,11 @@ ], "source": [ "class Translation():\n", - " \"\"\"Translation from an English phrase to a German phrase\"\"\"\n", - " def __init__(self, english: str, german: str): store_attr()\n", - " \n", - " __repr__ = basic_repr([\"english\", \"german\"])\n", + " \"Translation from an English phrase to a Spanish phrase\"\n", + " def __init__(self, english: str, spanish: str): store_attr()\n", + " def __repr__(self): return f\"{self.english} ➡ *{self.spanish}*\"\n", "\n", - "Translation(\"Hello, how are you today?\", \"Hallo, wie geht es Ihnen heute?\")" + "Translation(\"Hello, how are you today?\", \"Hola, ¿cómo estás hoy?\")" ] }, { @@ -133,67 +144,95 @@ "metadata": {}, "outputs": [ { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████████████████████████████████████████████████████████████████| 3/3 [00:02<00:00, 1.48it/s]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Translations:\n", - "[__main__.Translation(english='Postpartum complications can be life-threatening, but many are preventable with proper education and care. As a maternal health advocate, I work to raise awareness and support new mothers.', german='Geburtskomplikationen können lebensbedrohlich sein, aber viele sind durch richtige Aufklärung und Versorgung vermeidbar. Als Verfechterin der Gesundheit von Müttern arbeite ich daran, das Bewusstsein zu schärfen und junge Mütter zu unterstützen.'), __main__.Translation(english=\"As a legal advisor, it's crucial to ensure that project documentation is complete and accurate to avoid potential legal issues down the line. Incomplete or inaccurate documentation can lead to misunderstandings, disputes, and even legal liabilities.\", german='Als Rechtsberater ist es von entscheidender Bedeutung, dass die Projektdokumentation vollständig und korrekt ist, um mögliche rechtliche Probleme in der Zukunft zu vermeiden. Unvollständige oder ungenaue Dokumentation kann zu Missverständnissen, Streitigkeiten und sogar rechtlichen Haftungen führen.'), __main__.Translation(english='El Salvador has been a stable democracy since the end of its civil war in the 1990s, but it continues to face challenges with gang violence and corruption. The current president, Nayib Bukele, has implemented controversial policies to combat these issues, which have drawn both praise and criticism from domestic and international observers.', german='El Salvador ist seit dem Ende seines Bürgerkriegs in den 1990er Jahren eine stabile Demokratie, sieht sich aber weiterhin Herausforderungen durch Bandenkriminalität und Korruption gegenüber. Der derzeitige Präsident, Nayib Bukele, hat kontroverse Maßnahmen ergriffen, um diese Probleme anzugehen, die sowohl Lob als auch Kritik von inländischen und internationalen Beobachtern erfahren haben.')]\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n" - ] + "data": { + "text/markdown": [ + "- Hello, my name is Nathan. I am a research scientist at an AI startup. ➡ *Hola, me llamo Nathan. Soy ciencia investigador en un startup de IA.*\n", + "- How much wood could a woodchuck chuck if a woodchuck could chuck wood? ➡ *¿Cuánta madera podría arrojar una marmota si una marmota pudiera arrojar madera?*\n", + "- Thomas Cranmer (2 July 1489 - 21 March 1556) was a leader of the English Reformation and Archbishop of Canterbury during the reigns of Henry VIII, Edward VI and, for a short time, Mary I. He helped build the case for the annulment of Henry's marriage to Catherine of Aragon, which was one of the causes of the separation of the English Church from union with the Holy See. ➡ *Thomas Cranmer (2 de julio de 1489 - 21 de marzo de 1556) fue un líder de la Reforma inglesa y arzobispo de Canterbury durante los reinados de Henry VIII, Edward VI y, por un corto tiempo, María I. Ayudó a construir el caso para la anulación de El matrimonio de Henry con Catalina de Aragón, que fue una de las causas de la separación de la Iglesia inglesa de la unión con la Santa Sede.*" + ], + "text/plain": [ + "" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "from datasets import load_dataset\n", - "\n", "examples = [\n", " Translation(\n", " english=\"Hello, my name is Nathan. I am a research scientist at an AI startup.\",\n", - " german=\"Hallo mein Name ist Nathan. Ich bin wissenschaftlicher Mitarbeiter bei einem KI-Startup.\"),\n", + " spanish=\"Hola, me llamo Nathan. Soy ciencia investigador en un startup de IA.\"),\n", " Translation(\n", " english=\"How much wood could a woodchuck chuck if a woodchuck could chuck wood?\",\n", - " german=\"Wie viel Holz könnte ein Waldmurmeltier einspannen, wenn ein Waldmurmeltier Holz einspannen könnte?\"),\n", + " spanish=\"¿Cuánta madera podría arrojar una marmota si una marmota pudiera arrojar madera?\"),\n", " Translation(\n", " english=\"Thomas Cranmer (2 July 1489 - 21 March 1556) was a leader of the English Reformation and Archbishop of Canterbury during the reigns of Henry VIII, Edward VI and, for a short time, Mary I. He helped build the case for the annulment of Henry's marriage to Catherine of Aragon, which was one of the causes of the separation of the English Church from union with the Holy See.\",\n", - " german=\"Thomas Cranmer (2. Juli 1489 - 21. März 1556) war ein Anführer der englischen Reformation und Erzbischof von Canterbury während der Herrschaft von Heinrich VIII., Eduard VI. und für kurze Zeit auch Maria I. Er half bei der Ausarbeitung der Klage für die Aufhebung von Heinrichs Heirat mit Katharina von Aragon, die eine der Ursachen für die Trennung der englischen Kirche von der Union mit dem Heiligen Stuhl war.\"\n", + " spanish=\"Thomas Cranmer (2 de julio de 1489 - 21 de marzo de 1556) fue un líder de la Reforma inglesa y arzobispo de Canterbury durante los reinados de Henry VIII, Edward VI y, por un corto tiempo, María I. Ayudó a construir el caso para la anulación de El matrimonio de Henry con Catalina de Aragón, que fue una de las causas de la separación de la Iglesia inglesa de la unión con la Santa Sede.\"\n", " ),\n", "]\n", - "examples = \"\\n- \".join([f\"{e.english} -> {e.german}\" for e in examples])\n", - "\n", + "show(examples)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from datasets import load_dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/markdown": [ + "- A Political Analyst specialized in El Salvador's political landscape.\n", + "- A legal advisor who understands the legal implications of incomplete or inaccurate project documentation\n", + "- A maternal health advocate focused on raising awareness about postpartum complications." + ], + "text/plain": [ + "" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ "# Load personas\n", "personas = load_dataset(\"proj-persona/PersonaHub\", \"persona\", split='train').select(range(3))['persona']\n", - "\n", - "sp = \"You will help generate synthetic data of English and German phrases.\"\n", + "show(personas)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sp = \"You will help generate synthetic data of English and Spanish phrases.\"\n", "prompt_template = \"\"\"\\\n", - "Here are some examples:\n", + "\n", "{examples}\n", + "\n", "\n", - "Create an English and German translation pair that is similar to the examples and would be appropriate for the following persona: {persona}\n", - "\"\"\"\n", - "\n", - "# Generate translations\n", - "fast_data = FastData(model=\"claude-3-haiku-20240307\")\n", - "translations = fast_data.generate(\n", - " prompt_template=prompt_template,\n", - " inputs=[{\"persona\": persona, \"examples\": examples} for persona in personas],\n", - " schema=Translation,\n", - " sp=sp\n", - ")\n", - "\n", - "print(\"Translations:\")\n", - "print(translations)" + "Create an English and Spanish translation pair that is similar to the examples and would be appropriate for the following persona:\n", + "{persona}\n", + "\"\"\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's see what the prompt looks like in action:" ] }, { @@ -201,43 +240,96 @@ "execution_count": null, "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████████████████████████████████████████████████████████████████| 3/3 [00:03<00:00, 1.24s/it]" - ] - }, { "name": "stdout", "output_type": "stream", "text": [ - "Critiques:\n", - "[__main__.TranslationCritique(critique='The translation accurately conveys the meaning of the source text, maintaining the legal context and key terminology. It demonstrates a strong grasp of both languages, effectively translating complex concepts like \"legal liabilities\" (rechtlichen Haftungen). The German version reads naturally and maintains the formal tone appropriate for legal contexts. It captures nuances such as \"down the line\" (in der Zukunft) idiomatically. The translation is highly accurate, professionally suitable, and could be the work of an experienced translator. While it\\'s excellent, it doesn\\'t quite reach the level of absolute mastery that would warrant a perfect score.', score=4), __main__.TranslationCritique(critique='This translation is of high quality, demonstrating professional-level work. It accurately conveys the meaning of the source text, maintaining the appropriate register and tone. The translator has made excellent word choices, such as \"Verfechterin\" for \"advocate,\" which captures the nuance of dedication to the cause. The phrasing is natural in German and maintains the clarity of the original message. There are no significant errors or awkward constructions. The only minor point that could be improved is the translation of \"new mothers\" as \"junge Mütter\" (young mothers), which slightly alters the original meaning. Overall, it\\'s a very strong translation suitable for professional use.', score=4), __main__.TranslationCritique(critique='This translation deserves 5 points for its outstanding quality. It accurately conveys the original meaning, maintains the appropriate tone, and reads naturally in German. The translator has skillfully handled complex concepts like \"stable democracy\" and \"gang violence,\" preserving both content and nuance. The phrasing is idiomatic and flows well, demonstrating mastery of both languages. Key terms are consistently and appropriately translated, and the sentence structure effectively adapts to German syntax while maintaining the original\\'s intent. This translation is of professional caliber and could be used in high-level contexts without hesitation.', score=5)]\n" + "\n", + "- Hello, my name is Nathan. I am a research scientist at an AI startup. ➡ *Hola, me llamo Nathan. Soy ciencia investigador en un startup de IA.*\n", + "- How much wood could a woodchuck chuck if a woodchuck could chuck wood? ➡ *¿Cuánta madera podría arrojar una marmota si una marmota pudiera arrojar madera?*\n", + "- Thomas Cranmer (2 July 1489 - 21 March 1556) was a leader of the English Reformation and Archbishop of Canterbury during the reigns of Henry VIII, Edward VI and, for a short time, Mary I. He helped build the case for the annulment of Henry's marriage to Catherine of Aragon, which was one of the causes of the separation of the English Church from union with the Holy See. ➡ *Thomas Cranmer (2 de julio de 1489 - 21 de marzo de 1556) fue un líder de la Reforma inglesa y arzobispo de Canterbury durante los reinados de Henry VIII, Edward VI y, por un corto tiempo, María I. Ayudó a construir el caso para la anulación de El matrimonio de Henry con Catalina de Aragón, que fue una de las causas de la separación de la Iglesia inglesa de la unión con la Santa Sede.*\n", + "\n", + "\n", + "Create an English and Spanish translation pair that is similar to the examples and would be appropriate for the following persona:\n", + "A Political Analyst specialized in El Salvador's political landscape.\n", + "\n" ] - }, + } + ], + "source": [ + "examples_md = to_md(examples)\n", + "prompt = prompt_template.format(examples=examples_md, persona=personas[0])\n", + "print(prompt)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "\n" + "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:01<00:00, 1.59it/s]\n" ] } ], + "source": [ + "# Generate translations\n", + "fast_data = FastData(model=\"claude-3-haiku-20240307\")\n", + "translations = fast_data.generate(\n", + " prompt_template=prompt_template,\n", + " inputs=[{\"persona\": persona, \"examples\": examples} for persona in personas],\n", + " schema=Translation,\n", + " sp=sp\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/markdown": [ + "- Postpartum complications can be life-threatening. New mothers need support and resources to stay healthy after giving birth. ➡ *Las complicaciones posparto pueden ser mortales. Las nuevas madres necesitan apoyo y recursos para mantener su salud después del parto.*\n", + "- Incomplete or inaccurate project documentation can have serious legal consequences. It's important to ensure all project details are properly recorded and communicated. ➡ *La documentación de proyectos incompleta o inexacta puede tener graves consecuencias legales. Es importante asegurarse de que todos los detalles del proyecto se registren y se comuniquen adecuadamente.*\n", + "- El Salvador's former president Nayib Bukele has consolidated power and cracked down on opposition, drawing criticism from international observers. However, his populist policies remain popular among many Salvadorans frustrated with high crime rates and economic stagnation. ➡ *El expresidente de El Salvador, Nayib Bukele, ha consolidado el poder y reprimido a la oposición, lo que ha generado críticas de observadores internacionales. Sin embargo, sus políticas populistas siguen siendo populares entre muchos salvadoreños frustrados por las altas tasas de delincuencia y el estancamiento económico.*" + ], + "text/plain": [ + "" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "show(translations)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "class TranslationCritique():\n", - " \"\"\"\n", - " A critique of the translation.\n", - " \"\"\"\n", - " def __init__(\n", - " self,\n", - " critique: str, # A critique of the translation.\n", - " score: int # A score of the translation from 1 to 5. \n", - " ): store_attr()\n", - " \n", - " __repr__ = basic_repr(['critique', 'score'])\n", - "\n", - "sp = \"You will help critique synthetic data of English and German phrases.\"\n", + " \"A critique of the translation.\"\n", + " def __init__(self, critique: str, score: int): store_attr()\n", + " def __repr__(self): return f\"\\t- **Critique:** {self.critique}\\n\\t- **Score:** {self.score}\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sp = \"You will help critique synthetic data of English and Spanish phrases.\"\n", "critique_template = \"\"\"\\\n", "Below is an extract of a translation. Evaluate its quality as a senior translator would, considering its suitability for professional use. Use the additive 5-point scoring system described below. Points are accumulated based on the satisfaction of each criterion:\n", "\n", @@ -247,25 +339,70 @@ "- Grant a fourth point if the translation is highly accurate and reads naturally in the target language, exhibiting a consistent and appropriate style. It could be similar to the work of an experienced translator, offering faithful rendering of content and tone, with minimal errors, and effectively handling complex concepts or cultural references. The result is coherent, well-expressed, and valuable for its intended purpose.\n", "- Bestow a fifth point if the translation is outstanding, demonstrating mastery of both source and target languages. It captures subtle nuances, maintains the author's voice and intent, and reads as if it were originally written in the target language. The translator has made excellent choices in dealing with challenging elements like wordplay, idiomatic expressions, or culture-specific content.\n", "\n", - "The translation extract:\n", - "{translation}\n", + "{translation}\n", "\n", "After examining the translation:\n", "\n", "- Briefly justify your total score, up to 100 words.\n", "- Conclude with the score of the translation.\n", - "\"\"\"\n", - "\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:03<00:00, 1.33s/it]\n" + ] + } + ], + "source": [ "fast_data = FastData(model=\"claude-3-5-sonnet-20240620\")\n", "critiques = fast_data.generate(\n", " prompt_template=critique_template,\n", - " inputs=[{\"translation\": f\"{t.english} -> {t.german}\"} for t in translations],\n", + " inputs=[{\"translation\": f\"{t.english} -> {t.spanish}\"} for t in translations],\n", " schema=TranslationCritique,\n", " sp=sp\n", - ")\n", - "\n", - "print(\"Critiques:\")\n", - "print(critiques)" + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/markdown": [ + "- Postpartum complications can be life-threatening. New mothers need support and resources to stay healthy after giving birth. ➡ *Las complicaciones posparto pueden ser mortales. Las nuevas madres necesitan apoyo y recursos para mantener su salud después del parto.*\n", + "\n", + "\t- **Critique:** This translation demonstrates a high level of accuracy and fluency. It effectively conveys the original message, maintaining the tone and emphasis on the importance of proper documentation. The translator has chosen appropriate terminology (e.g., \"consecuencias legales\" for \"legal consequences\") and has accurately rendered the passive voice construction. The translation reads naturally in Spanish and captures the nuances of the original text. While it's a strong translation, there's a slight opportunity for improvement in terms of conciseness, which prevents it from reaching the highest level of excellence.\n", + "\t- **Score:** 4\n", + "- Incomplete or inaccurate project documentation can have serious legal consequences. It's important to ensure all project details are properly recorded and communicated. ➡ *La documentación de proyectos incompleta o inexacta puede tener graves consecuencias legales. Es importante asegurarse de que todos los detalles del proyecto se registren y se comuniquen adecuadamente.*\n", + "\n", + "\t- **Critique:** The translation accurately conveys the main ideas of the source text, maintaining the structure and key information. It correctly translates complex terms like \"consolidated power\" and \"cracked down on opposition.\" The translator effectively renders \"populist policies\" and \"economic stagnation.\" The text reads naturally in Spanish and maintains the original tone. There are no significant errors or awkward phrasings. The translation demonstrates a high level of competence, capturing nuances and providing a faithful rendering of the content. It's suitable for professional use, though it doesn't reach the level of exceptional mastery that would warrant a perfect score.\n", + "\t- **Score:** 4\n", + "- El Salvador's former president Nayib Bukele has consolidated power and cracked down on opposition, drawing criticism from international observers. However, his populist policies remain popular among many Salvadorans frustrated with high crime rates and economic stagnation. ➡ *El expresidente de El Salvador, Nayib Bukele, ha consolidado el poder y reprimido a la oposición, lo que ha generado críticas de observadores internacionales. Sin embargo, sus políticas populistas siguen siendo populares entre muchos salvadoreños frustrados por las altas tasas de delincuencia y el estancamiento económico.*\n", + "\n", + "\t- **Critique:** The translation accurately conveys the main message and maintains the tone of the original text. It demonstrates a good understanding of both languages and uses appropriate medical terminology (e.g., \"posparto\" for \"postpartum\"). The Spanish version reads naturally and maintains the concise style of the source. It effectively captures the urgency of the situation with \"pueden ser mortales\" for \"life-threatening\". The translator made a good choice in translating \"stay healthy\" to \"mantener su salud\", which sounds more natural in Spanish. Overall, it's a high-quality translation suitable for professional use, with no significant errors or awkward phrasing.\n", + "\t- **Score:** 5" + ], + "text/plain": [ + "" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "show(f'{t}\\n\\n{c}' for t, c in zip(translations, critiques))" ] }, {