diff --git a/00_core.ipynb b/00_core.ipynb deleted file mode 100644 index 6671b71..0000000 --- a/00_core.ipynb +++ /dev/null @@ -1,61 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# core\n", - "\n", - "> Fill in a module description here" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#| default_exp core" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#| hide\n", - "from nbdev.showdoc import *" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#| export\n", - "def foo(): pass" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#| hide\n", - "import nbdev; nbdev.nbdev_export()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "python3", - "language": "python", - "name": "python3" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/00_xml.ipynb b/00_xml.ipynb new file mode 100644 index 0000000..ed1c7a8 --- /dev/null +++ b/00_xml.ipynb @@ -0,0 +1,781 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "efe78920", + "metadata": {}, + "outputs": [], + "source": [ + "#|default_exp xml" + ] + }, + { + "cell_type": "markdown", + "id": "3d773712-12fe-440e-891f-36f59666dfde", + "metadata": {}, + "source": [ + "# xml source" + ] + }, + { + "cell_type": "markdown", + "id": "ff6f6471-8061-4fdd-85a1-25fdc27c5cf3", + "metadata": {}, + "source": [ + "## Setup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "033c76fd", + "metadata": {}, + "outputs": [], + "source": [ + "#| export\n", + "import hashlib,xml.etree.ElementTree as ET\n", + "from collections import namedtuple\n", + "\n", + "from fastcore.utils import *\n", + "from fastcore.meta import delegates\n", + "from IPython import display" + ] + }, 
+ { + "cell_type": "markdown", + "id": "65b52012", + "metadata": {}, + "source": [ + "## XML helpers" + ] + }, + { + "cell_type": "markdown", + "id": "aa15af54", + "metadata": {}, + "source": [ + "Many language models (e.g. Claude) work well with XML inputs, but XML can be a bit clunky to work with manually. Therefore, we create a couple of more streamlined approaches for XML generation. You don't need to use these if you don't find them useful -- you can always just use plain strings for XML directly." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "26f66da9", + "metadata": {}, + "outputs": [], + "source": [ + "#| exports\n", + "def xt(tag:str, # XML tag name\n", + " c:Optional[list]=None, # Children\n", + " **kw):\n", + " \"Helper to create appropriate data structure for `to_xml`.\"\n", + " kw = {k.lstrip('_'):str(v) for k,v in kw.items()}\n", + " return tag,c,kw" + ] + }, + { + "cell_type": "markdown", + "id": "1f063c86", + "metadata": {}, + "source": [ + "An XML node contains a tag, optional children, and optional attributes. `xt` creates a tuple of these three things, which we will use to generate XML shortly. Attributes are passed as kwargs; since these might conflict with reserved words in Python, you can optionally add a `_` prefix and it'll be stripped off." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "180f1934", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "('x-custom', ['hi'], {'class': 'bar'})" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "xt('x-custom', ['hi'], _class='bar')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "59329ea3", + "metadata": {}, + "outputs": [], + "source": [ + "#| exports\n", + "g = globals()\n", + "tags = 'div img h1 h2 h3 h4 h5 p hr span html'.split()\n", + "for o in tags: g[o] = partial(xt, o)" + ] + }, + { + "cell_type": "markdown", + "id": "9a503937", + "metadata": {}, + "source": [ + "If you have to use a lot of tags of the same type, it's convenient to use `partial` to create specialised functions for them. Here, we're creating functions for some common HTML tags. Here's an example of using them:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b6122acf", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "('html',\n", + " [('p', 'This is a paragraph', {}),\n", + " ('hr', None, {}),\n", + " ('img', None, {'src': 'http://example.prg'}),\n", + " ('div',\n", + " [('h1', 'This is a header', {}),\n", + " ('h2', 'This is a sub-header', {'style': 'k:v'})],\n", + " {'class': 'foo'})],\n", + " {})" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "a = html([\n", + " p('This is a paragraph'),\n", + " hr(),\n", + " img(src='http://example.prg'),\n", + " div([\n", + " h1('This is a header'),\n", + " h2('This is a sub-header', style='k:v'),\n", + " ], _class='foo')\n", + "])\n", + "a" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "15807ed7", + "metadata": {}, + "outputs": [], + "source": [ + "#| exports\n", + "def hl_md(s, lang='xml'):\n", + " \"Syntax highlight `s` using `lang`.\"\n", + " if display: return 
display.Markdown(f'```{lang}\\n{s}\\n```')\n", + " print(s)" + ] + }, + { + "cell_type": "markdown", + "id": "79155289", + "metadata": {}, + "source": [ + "When we display XML in a notebook, it's nice to highlight it, so we create a function to simplify that:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eb4907fe", + "metadata": {}, + "outputs": [ + { + "data": { + "text/markdown": [ + "```xml\n", + "a child\n", + "```" + ], + "text/plain": [ + "" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "hl_md('a child')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "20467373", + "metadata": {}, + "outputs": [], + "source": [ + "#| exports\n", + "def to_xml(node:tuple, # XML structure in `xt` format\n", + " hl=False # Syntax highlight response?\n", + " ):\n", + " \"Convert `node` to an XML string.\"\n", + " def mk_el(tag, cs, attrs):\n", + " el = ET.Element(tag, attrib=attrs)\n", + " if isinstance(cs, list): el.extend([mk_el(*o) for o in cs])\n", + " elif cs is not None: el.text = str(cs)\n", + " return el\n", + "\n", + " root = mk_el(*node)\n", + " ET.indent(root, space=' ' if hl else '')\n", + " res = ET.tostring(root, encoding='unicode')\n", + " return hl_md(res) if hl else res" + ] + }, + { + "cell_type": "markdown", + "id": "7a7fe4c6", + "metadata": {}, + "source": [ + "Now we can convert that HTML data structure we created into XML:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "80a0cde7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/markdown": [ + "```xml\n", + "\n", + "

This is a paragraph

\n", + "
\n", + " \n", + "
\n", + "

This is a header

\n", + "

This is a sub-header

\n", + "
\n", + "\n", + "```" + ], + "text/plain": [ + "" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "to_xml(a, hl=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2795f9fc", + "metadata": {}, + "outputs": [], + "source": [ + "#| exports\n", + "def json_to_xml(d:dict, # JSON dictionary to convert\n", + " rnm:str # Root name\n", + " )->str:\n", + " \"Convert `d` to XML.\"\n", + " root = ET.Element(rnm)\n", + " def build_xml(data, parent):\n", + " if isinstance(data, dict):\n", + " for key, value in data.items(): build_xml(value, ET.SubElement(parent, key))\n", + " elif isinstance(data, list):\n", + " for item in data: build_xml(item, ET.SubElement(parent, 'item'))\n", + " else: parent.text = str(data)\n", + " build_xml(d, root)\n", + " ET.indent(root)\n", + " return ET.tostring(root, encoding='unicode')" + ] + }, + { + "cell_type": "markdown", + "id": "140a35a2", + "metadata": {}, + "source": [ + "JSON doesn't map as nicely to XML as the data structure used in the previous section, but for simple XML trees it can be convenient -- for example:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "005a5be4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/markdown": [ + "```xml\n", + "\n", + " Howard\n", + " \n", + " Jeremy\n", + " Peter\n", + " \n", + "
\n", + " Queensland\n", + " Australia\n", + "
\n", + "
\n", + "```" + ], + "text/plain": [ + "" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "a = dict(surname='Howard', firstnames=['Jeremy','Peter'],\n", + " address=dict(state='Queensland',country='Australia'))\n", + "hl_md(json_to_xml(a, 'person'))" + ] + }, + { + "cell_type": "markdown", + "id": "7788c48c", + "metadata": {}, + "source": [ + "## Including documents" + ] + }, + { + "cell_type": "markdown", + "id": "479be4c9", + "metadata": {}, + "source": [ + "According [to Anthropic](https://docs.anthropic.com/claude/docs/long-context-window-tips), \"*it's essential to structure your prompts in a way that clearly separates the input data from the instructions*\". They recommend using the following format:\n", + "\n", + "```xml\n", + "Here are some documents for you to reference for your task:\n", + " \n", + "<documents>\n", + "<document index=\"1\">\n", + "<source>\n", + "(URL, file name, hash, etc)\n", + "</source>\n", + "<document_content>\n", + "(the text content)\n", + "</document_content>\n", + "</document>\n", + "</documents>\n", + "```\n", + "\n", + "We will create some small helper functions to make it easier to generate context in this format. Although it's based on Anthropic's recommendation, it's likely to work well with other models too." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a01dc320", + "metadata": {}, + "outputs": [], + "source": [ + "#| exports\n", + "doctype = namedtuple('doctype', ['source', 'content'])" + ] + }, + { + "cell_type": "markdown", + "id": "6620a123", + "metadata": {}, + "source": [ + "We'll use `doctype` to store our source/content pairs." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ce853491", + "metadata": {}, + "outputs": [], + "source": [ + "#| exports\n", + "def _add_nls(s):\n", + " \"Add newlines to start and end of `s` if missing\"\n", + " if s[ 0]!='\\n': s = '\\n'+s\n", + " if s[-1]!='\\n': s = s+'\\n'\n", + " return s" + ] + }, + { + "cell_type": "markdown", + "id": "026d3b06", + "metadata": {}, + "source": [ + "Since Anthropic's example shows newlines before and after each tag, we'll do the same." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "932e8858", + "metadata": {}, + "outputs": [], + "source": [ + "#| exports\n", + "def mk_doctype(content:str, # The document content\n", + " source:Optional[str]=None # URL, filename, etc; defaults to `md5(content)` if not provided\n", + " ) -> namedtuple:\n", + " \"Create a `doctype` named tuple\"\n", + " if source is None: source = hashlib.md5(content.encode()).hexdigest()[:8]\n", + " return doctype(_add_nls(str(source).strip()), _add_nls(content.strip()))" + ] + }, + { + "cell_type": "markdown", + "id": "8800921b", + "metadata": {}, + "source": [ + "This is a convenience wrapper to ensure that a `doctype` has the needed information in the right format." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "14f9e185", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "doctype(source='\\nb8898fab\\n', content='\\nThis is a sample\\n')" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "doc = 'This is a sample'\n", + "mk_doctype(doc)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3b8e6f87", + "metadata": {}, + "outputs": [], + "source": [ + "#| exports\n", + "def mk_doc(index:int, # The document index\n", + " content:str, # The document content\n", + " source:Optional[str]=None # URL, filename, etc; defaults to `md5(content)` if not provided\n", + " ) -> tuple:\n", + " \"Create an `xt` format tuple for a single doc in Anthropic's recommended format\"\n", + " dt = mk_doctype(content, source)\n", + " content = xt('document_content', dt.content)\n", + " source = xt('source', dt.source)\n", + " return xt('document', [source, content], index=index)" + ] + }, + { + "cell_type": "markdown", + "id": "a8b6ac26", + "metadata": {}, + "source": [ + "We can now generate XML for one document in the suggested format:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e7ed5a9a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "b8898fab\n", + "\n", + "\n", + "This is a sample\n", + "\n", + "\n" + ] + } + ], + "source": [ + "print(to_xml(mk_doc(1, doc)))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ba5ebfab", + "metadata": {}, + "outputs": [], + "source": [ + "#| exports\n", + "def docs_xml(docs:list[str], # The content of each document\n", + " sources:Optional[list]=None, # URLs, filenames, etc; each one defaults to `md5(content)` if not provided\n", + " prefix:bool=True # Include Anthropic's suggested prose intro?\n", + " )->str:\n", + " \"Create an XML string containing `docs` in Anthropic's 
recommended format\"\n", + " pre = 'Here are some documents for you to reference for your task:\\n\\n' if prefix else ''\n", + " if sources is None: sources = [None]*len(docs)\n", + " docs = [mk_doc(i+1, *o) for i,o in enumerate(zip(docs,sources))]\n", + " return pre + to_xml(xt('documents', docs))" + ] + }, + { + "cell_type": "markdown", + "id": "85004124", + "metadata": {}, + "source": [ + "Putting it all together, we have our final XML format:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1dac60f6", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Here are some documents for you to reference for your task:\n", + "\n", + "\n", + "\n", + "\n", + "b8898fab\n", + "\n", + "\n", + "This is a sample\n", + "\n", + "\n", + "\n", + "\n", + "doc.txt\n", + "\n", + "\n", + "And another one\n", + "\n", + "\n", + "\n" + ] + } + ], + "source": [ + "docs = [doc, 'And another one']\n", + "sources = [None, 'doc.txt']\n", + "print(docs_xml(docs, sources))" + ] + }, + { + "cell_type": "markdown", + "id": "2a8a7a9a", + "metadata": {}, + "source": [ + "## Context creation" + ] + }, + { + "cell_type": "markdown", + "id": "cd06b2dc", + "metadata": {}, + "source": [ + "Now that we can generate Anthropic's XML format, let's make it easy for a few common cases." + ] + }, + { + "cell_type": "markdown", + "id": "65317fc6", + "metadata": {}, + "source": [ + "### File list to context" + ] + }, + { + "cell_type": "markdown", + "id": "3778e8ed", + "metadata": {}, + "source": [ + "For generating XML context from files, we'll just read them as text and use the file names as `source`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0a168636", + "metadata": {}, + "outputs": [], + "source": [ + "#| exports\n", + "def files2ctx(\n", + " fnames:list[Union[str,Path]], # List of file names to add to context\n", + " prefix:bool=True # Include Anthropic's suggested prose intro?\n", + ")->str: # XML for LM context\n", + " fnames = [Path(o) for o in fnames]\n", + " contents = [o.read_text() for o in fnames]\n", + " return docs_xml(contents, fnames, prefix=prefix)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1bf73d36", + "metadata": {}, + "outputs": [ + { + "data": { + "text/markdown": [ + "```xml\n", + "Here are some documents for you to reference for your task:\n", + "\n", + "\n", + "\n", + "\n", + "samples/sample_core.py\n", + "\n", + "\n", + "import inspect\n", + "empty = inspect.Parameter.empty\n", + "models = 'claude-3-opus-20240229','claude-3-sonnet-20240229','claude-3-haiku-20240307'\n", + "\n", + "\n", + "\n", + "\n", + "samples/sample_styles.css\n", + "\n", + "\n", + ".cell { margin-bottom: 1rem; }\n", + ".cell > .sourceCode { margin-bottom: 0; }\n", + ".cell-output > pre { margin-bottom: 0; }\n", + "\n", + "\n", + "\n", + "```" + ], + "text/plain": [ + "" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fnames = ['samples/sample_core.py', 'samples/sample_styles.css']\n", + "hl_md(files2ctx(fnames))" + ] + }, + { + "cell_type": "markdown", + "id": "191ddb2b", + "metadata": {}, + "source": [ + "### Folder to context" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a0452a21", + "metadata": {}, + "outputs": [], + "source": [ + "#| exports\n", + "@delegates(globtastic)\n", + "def folder2ctx(\n", + " folder:Union[str,Path], # Folder name containing files to add to context\n", + " prefix:bool=True, # Include Anthropic's suggested prose intro?\n", + " **kwargs # Passed to `globtastic`\n", + ")->str: # XML for 
Claude context\n", + " fnames = globtastic(folder, **kwargs)\n", + " return files2ctx(fnames, prefix=prefix)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "efd52392", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\n", + "samples/sample_core.py\n", + "\n", + "\n", + "import inspect\n", + "empty = inspect.Parameter.empty\n", + "models = 'claude-3-opus-20240229','claude-3-sonnet-20240229','claude-3-haiku-20240307'\n", + "\n", + "\n", + "\n" + ] + } + ], + "source": [ + "print(folder2ctx('samples', prefix=False, file_glob='*.py'))" + ] + }, + { + "cell_type": "markdown", + "id": "94ec4289", + "metadata": {}, + "source": [ + "## Export -" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1e9ee5c1", + "metadata": {}, + "outputs": [], + "source": [ + "#|hide\n", + "#|eval: false\n", + "from nbdev.doclinks import nbdev_export\n", + "nbdev_export()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "207f9715", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/README.md b/README.md index 22b6b89..7d23009 100644 --- a/README.md +++ b/README.md @@ -3,8 +3,7 @@ -This file will become your README and also the index of your -documentation. +This is a work in progress… ## Install @@ -14,10 +13,5 @@ pip install lmtools ## How to use -Fill me in please! Don’t forget code examples: - -``` python -1+1 -``` - - 2 +See the `xml source` section for a walkthru of XML and document context +generation functionality. 
diff --git a/index.ipynb b/index.ipynb index 5cd1219..444e365 100644 --- a/index.ipynb +++ b/index.ipynb @@ -7,7 +7,7 @@ "outputs": [], "source": [ "#| hide\n", - "from lmtools.core import *" + "from lmtools import *" ] }, { @@ -23,7 +23,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "This file will become your README and also the index of your documentation." + "This is a work in progress..." ] }, { @@ -53,27 +53,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Fill me in please! Don't forget code examples:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "2" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "1+1" + "See the `xml source` section for a walkthru of XML and document context generation functionality." ] }, { @@ -86,7 +66,7 @@ ], "metadata": { "kernelspec": { - "display_name": "python3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" } diff --git a/lmtools/__init__.py b/lmtools/__init__.py index f102a9c..0fe2fae 100644 --- a/lmtools/__init__.py +++ b/lmtools/__init__.py @@ -1 +1,3 @@ __version__ = "0.0.1" +from .xml import * + diff --git a/lmtools/_modidx.py b/lmtools/_modidx.py index c64c2ee..f853f38 100644 --- a/lmtools/_modidx.py +++ b/lmtools/_modidx.py @@ -5,4 +5,13 @@ 'doc_host': 'https://jph00.github.io', 'git_url': 'https://github.com/jph00/lmtools', 'lib_path': 'lmtools'}, - 'syms': {'lmtools.core': {'lmtools.core.foo': ('core.html#foo', 'lmtools/core.py')}}} + 'syms': { 'lmtools.xml': { 'lmtools.xml._add_nls': ('xml.html#_add_nls', 'lmtools/xml.py'), + 'lmtools.xml.docs_xml': ('xml.html#docs_xml', 'lmtools/xml.py'), + 'lmtools.xml.files2ctx': ('xml.html#files2ctx', 'lmtools/xml.py'), + 'lmtools.xml.folder2ctx': ('xml.html#folder2ctx', 'lmtools/xml.py'), + 'lmtools.xml.hl_md': ('xml.html#hl_md', 'lmtools/xml.py'), + 
'lmtools.xml.json_to_xml': ('xml.html#json_to_xml', 'lmtools/xml.py'), + 'lmtools.xml.mk_doc': ('xml.html#mk_doc', 'lmtools/xml.py'), + 'lmtools.xml.mk_doctype': ('xml.html#mk_doctype', 'lmtools/xml.py'), + 'lmtools.xml.to_xml': ('xml.html#to_xml', 'lmtools/xml.py'), + 'lmtools.xml.xt': ('xml.html#xt', 'lmtools/xml.py')}}} diff --git a/lmtools/core.py b/lmtools/core.py deleted file mode 100644 index a909a06..0000000 --- a/lmtools/core.py +++ /dev/null @@ -1,7 +0,0 @@ -# AUTOGENERATED! DO NOT EDIT! File to edit: ../00_core.ipynb. - -# %% auto 0 -__all__ = ['foo'] - -# %% ../00_core.ipynb 3 -def foo(): pass diff --git a/lmtools/xml.py b/lmtools/xml.py new file mode 100644 index 0000000..159d5cf --- /dev/null +++ b/lmtools/xml.py @@ -0,0 +1,123 @@ +# AUTOGENERATED! DO NOT EDIT! File to edit: ../00_xml.ipynb. + +# %% auto 0 +__all__ = ['g', 'tags', 'doctype', 'xt', 'hl_md', 'to_xml', 'json_to_xml', 'mk_doctype', 'mk_doc', 'docs_xml', 'files2ctx', + 'folder2ctx'] + +# %% ../00_xml.ipynb 3 +import hashlib,xml.etree.ElementTree as ET +from collections import namedtuple + +from fastcore.utils import * +from fastcore.meta import delegates +from IPython import display + +# %% ../00_xml.ipynb 6 +def xt(tag:str, # XML tag name + c:Optional[list]=None, # Children + **kw): + "Helper to create appropriate data structure for `to_xml`." + kw = {k.lstrip('_'):str(v) for k,v in kw.items()} + return tag,c,kw + +# %% ../00_xml.ipynb 9 +g = globals() +tags = 'div img h1 h2 h3 h4 h5 p hr span html'.split() +for o in tags: g[o] = partial(xt, o) + +# %% ../00_xml.ipynb 12 +def hl_md(s, lang='xml'): + "Syntax highlight `s` using `lang`." + if display: return display.Markdown(f'```{lang}\n{s}\n```') + print(s) + +# %% ../00_xml.ipynb 15 +def to_xml(node:tuple, # XML structure in `xt` format + hl=False # Syntax highlight response? + ): + "Convert `node` to an XML string." 
+ def mk_el(tag, cs, attrs): + el = ET.Element(tag, attrib=attrs) + if isinstance(cs, list): el.extend([mk_el(*o) for o in cs]) + elif cs is not None: el.text = str(cs) + return el + + root = mk_el(*node) + ET.indent(root, space=' ' if hl else '') + res = ET.tostring(root, encoding='unicode') + return hl_md(res) if hl else res + +# %% ../00_xml.ipynb 18 +def json_to_xml(d:dict, # JSON dictionary to convert + rnm:str # Root name + )->str: + "Convert `d` to XML." + root = ET.Element(rnm) + def build_xml(data, parent): + if isinstance(data, dict): + for key, value in data.items(): build_xml(value, ET.SubElement(parent, key)) + elif isinstance(data, list): + for item in data: build_xml(item, ET.SubElement(parent, 'item')) + else: parent.text = str(data) + build_xml(d, root) + ET.indent(root) + return ET.tostring(root, encoding='unicode') + +# %% ../00_xml.ipynb 23 +doctype = namedtuple('doctype', ['source', 'content']) + +# %% ../00_xml.ipynb 25 +def _add_nls(s): + "Add newlines to start and end of `s` if missing" + if s[ 0]!='\n': s = '\n'+s + if s[-1]!='\n': s = s+'\n' + return s + +# %% ../00_xml.ipynb 27 +def mk_doctype(content:str, # The document content + source:Optional[str]=None # URL, filename, etc; defaults to `md5(content)` if not provided + ) -> namedtuple: + "Create a `doctype` named tuple" + if source is None: source = hashlib.md5(content.encode()).hexdigest()[:8] + return doctype(_add_nls(str(source).strip()), _add_nls(content.strip())) + +# %% ../00_xml.ipynb 30 +def mk_doc(index:int, # The document index + content:str, # The document content + source:Optional[str]=None # URL, filename, etc; defaults to `md5(content)` if not provided + ) -> tuple: + "Create an `xt` format tuple for a single doc in Anthropic's recommended format" + dt = mk_doctype(content, source) + content = xt('document_content', dt.content) + source = xt('source', dt.source) + return xt('document', [source, content], index=index) + +# %% ../00_xml.ipynb 33 +def 
docs_xml(docs:list[str], # The content of each document + sources:Optional[list]=None, # URLs, filenames, etc; each one defaults to `md5(content)` if not provided + prefix:bool=True # Include Anthropic's suggested prose intro? + )->str: + "Create an XML string containing `docs` in Anthropic's recommended format" + pre = 'Here are some documents for you to reference for your task:\n\n' if prefix else '' + if sources is None: sources = [None]*len(docs) + docs = [mk_doc(i+1, *o) for i,o in enumerate(zip(docs,sources))] + return pre + to_xml(xt('documents', docs)) + +# %% ../00_xml.ipynb 40 +def files2ctx( + fnames:list[Union[str,Path]], # List of file names to add to context + prefix:bool=True # Include Anthropic's suggested prose intro? +)->str: # XML for LM context + fnames = [Path(o) for o in fnames] + contents = [o.read_text() for o in fnames] + return docs_xml(contents, fnames, prefix=prefix) + +# %% ../00_xml.ipynb 43 +@delegates(globtastic) +def folder2ctx( + folder:Union[str,Path], # Folder name containing files to add to context + prefix:bool=True, # Include Anthropic's suggested prose intro? + **kwargs # Passed to `globtastic` +)->str: # XML for Claude context + fnames = globtastic(folder, **kwargs) + return files2ctx(fnames, prefix=prefix) diff --git a/samples/sample_core.py b/samples/sample_core.py new file mode 100644 index 0000000..7f54af1 --- /dev/null +++ b/samples/sample_core.py @@ -0,0 +1,3 @@ +import inspect +empty = inspect.Parameter.empty +models = 'claude-3-opus-20240229','claude-3-sonnet-20240229','claude-3-haiku-20240307' diff --git a/samples/sample_styles.css b/samples/sample_styles.css new file mode 100644 index 0000000..5a494d0 --- /dev/null +++ b/samples/sample_styles.css @@ -0,0 +1,4 @@ +.cell { margin-bottom: 1rem; } +.cell > .sourceCode { margin-bottom: 0; } +.cell-output > pre { margin-bottom: 0; } +