From 210e3aeb476c31186a9dcecadf4a60aae501eae5 Mon Sep 17 00:00:00 2001 From: Isaac Flath Date: Wed, 25 Sep 2024 20:27:41 -0400 Subject: [PATCH 1/4] Update 03_download.ipynb --- 03_download.ipynb | 60 ++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 49 insertions(+), 11 deletions(-) diff --git a/03_download.ipynb b/03_download.ipynb index 6692891..ddc7a29 100644 --- a/03_download.ipynb +++ b/03_download.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "id": "92c3dff2", "metadata": {}, "outputs": [], @@ -22,7 +22,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "id": "e58d8c43", "metadata": {}, "outputs": [], @@ -40,7 +40,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "id": "30199708", "metadata": {}, "outputs": [], @@ -51,7 +51,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "id": "95c4cab1", "metadata": {}, "outputs": [], @@ -66,7 +66,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "id": "0f3d5c69", "metadata": {}, "outputs": [], @@ -80,7 +80,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "id": "478d5508", "metadata": {}, "outputs": [], @@ -92,7 +92,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "id": "d8d61937", "metadata": {}, "outputs": [], @@ -109,7 +109,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, "id": "5e897053", "metadata": {}, "outputs": [], @@ -124,20 +124,33 @@ " page = get(url).text\n", " if sel:\n", " soup = BeautifulSoup(page, 'html.parser')\n", - " page = str(soup.find(sel))\n", + " page = ''.join(str(el) for el in soup.select(sel))\n", " md = html2md(page)\n", " return clean_md(md, rm_comments, rm_details=rm_details)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 36, + "id": "7406a52d", + "metadata": 
{}, + "outputs": [], + "source": [ + "assert len(read_html('https://www.answer.ai/', sel='div')) > 100\n", + "assert len(read_html('https://www.answer.ai/', sel='.listing-description')) > 100\n", + "assert len(read_html('https://www.answer.ai/', sel='div.listing-description')) > 100" + ] + }, + { + "cell_type": "code", + "execution_count": 40, "id": "8f25e767", "metadata": {}, "outputs": [], "source": [ "htmlurl = 'https://hypermedia.systems/hypermedia-a-reintroduction/'\n", "hmd = read_html(htmlurl)\n", + "assert len(hmd) > 100\n", "# Markdown(hmd)" ] }, @@ -380,9 +393,34 @@ ], "metadata": { "kernelspec": { - "display_name": "python3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.4" + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": true, + "sideBar": true, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": {}, + "toc_section_display": true, + "toc_window_display": false } }, "nbformat": 4, From c4116c1fd0ca4fa0d8f00646950dbafb435203dd Mon Sep 17 00:00:00 2001 From: Isaac Flath Date: Wed, 25 Sep 2024 20:30:35 -0400 Subject: [PATCH 2/4] Clean and export --- 03_download.ipynb | 47 +++++++++++---------------------------------- toolslm/download.py | 10 +++++----- 2 files changed, 16 insertions(+), 41 deletions(-) diff --git a/03_download.ipynb b/03_download.ipynb index ddc7a29..d56b317 100644 --- a/03_download.ipynb +++ b/03_download.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "id": "92c3dff2", "metadata": {}, "outputs": [], @@ -22,7 +22,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 
null, "id": "e58d8c43", "metadata": {}, "outputs": [], @@ -40,7 +40,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "id": "30199708", "metadata": {}, "outputs": [], @@ -51,7 +51,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "id": "95c4cab1", "metadata": {}, "outputs": [], @@ -66,7 +66,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "id": "0f3d5c69", "metadata": {}, "outputs": [], @@ -80,7 +80,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": null, "id": "478d5508", "metadata": {}, "outputs": [], @@ -92,7 +92,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": null, "id": "d8d61937", "metadata": {}, "outputs": [], @@ -109,7 +109,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": null, "id": "5e897053", "metadata": {}, "outputs": [], @@ -131,7 +131,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": null, "id": "7406a52d", "metadata": {}, "outputs": [], @@ -143,7 +143,7 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": null, "id": "8f25e767", "metadata": {}, "outputs": [], @@ -393,34 +393,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "python3", "language": "python", "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.4" - }, - "toc": { - "base_numbering": 1, - "nav_menu": {}, - "number_sections": true, - "sideBar": true, - "skip_h1_title": false, - "title_cell": "Table of Contents", - "title_sidebar": "Contents", - "toc_cell": false, - "toc_position": {}, - "toc_section_display": true, - "toc_window_display": false } }, "nbformat": 4, diff --git a/toolslm/download.py 
b/toolslm/download.py index afdcef2..a34f345 100644 --- a/toolslm/download.py +++ b/toolslm/download.py @@ -13,20 +13,20 @@ from bs4 import BeautifulSoup from urllib.parse import urlparse, urljoin -# %% ../03_download.ipynb 5 +# %% ../03_download.ipynb 4 def clean_md(text, rm_comments=True, rm_details=True): "Remove comments and `
<details>` sections from `text`" if rm_comments: text = re.sub(r'\n?<!--.*?-->\n?', '', text, flags=re.DOTALL) if rm_details: text = re.sub(r'\n?<details>
.*?</details>
\n?', '', text, flags=re.DOTALL) return text -# %% ../03_download.ipynb 6 +# %% ../03_download.ipynb 5 @delegates(get) def read_md(url, rm_comments=True, rm_details=True, **kwargs): "Read text from `url` and clean with `clean_docs`" return clean_md(get(url, **kwargs).text, rm_comments=rm_comments, rm_details=rm_details) -# %% ../03_download.ipynb 8 +# %% ../03_download.ipynb 7 def html2md(s:str): "Convert `s` from HTML to markdown" o = HTML2Text(bodywidth=5000) @@ -35,7 +35,7 @@ def html2md(s:str): o.ignore_images = True return o.handle(s) -# %% ../03_download.ipynb 9 +# %% ../03_download.ipynb 8 def read_html(url, # URL to read sel=None, # Read only outerHTML of CSS selector `sel` rm_comments=True, # Removes HTML comments @@ -45,7 +45,7 @@ def read_html(url, # URL to read page = get(url).text if sel: soup = BeautifulSoup(page, 'html.parser') - page = str(soup.find(sel)) + page = ''.join(str(el) for el in soup.select(sel)) md = html2md(page) return clean_md(md, rm_comments, rm_details=rm_details) From f31351b32702e2542afa42deb02db5a227354651 Mon Sep 17 00:00:00 2001 From: Isaac Flath Date: Thu, 26 Sep 2024 20:03:59 -0400 Subject: [PATCH 3/4] Add multi and wrap_tag options --- 03_download.ipynb | 147 +++++++++++++++++++++++++++++++++++++++++--- toolslm/download.py | 27 +++++--- 2 files changed, 157 insertions(+), 17 deletions(-) diff --git a/03_download.ipynb b/03_download.ipynb index d56b317..321fed4 100644 --- a/03_download.ipynb +++ b/03_download.ipynb @@ -107,6 +107,113 @@ " return o.handle(s)" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "cb831e24", + "metadata": {}, + "outputs": [], + "source": [ + "from fasthtml.common import FT, to_xml\n", + "from fasthtml.components import Document" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bf0549a1", + "metadata": {}, + "outputs": [], + "source": [ + "from fastcore.all import FT, to_xml" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f39661ce", + 
"metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ifnone(wrap_tag, noop)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1ec582c2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'dafads'" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wrap_tag = Document\n", + "wrap_tag = None\n", + "to_xml(ifnone(wrap_tag, noop)('dafads'))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8a61c516", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'daf'" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "f\"{to_xml(Document(\"daf\"))}\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a2e37783", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "('abc',)" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tuplify('abc')" + ] + }, { "cell_type": "code", "execution_count": null, @@ -118,15 +225,41 @@ "def read_html(url, # URL to read\n", " sel=None, # Read only outerHTML of CSS selector `sel`\n", " rm_comments=True, # Removes HTML comments\n", - " rm_details=True # Removes `
<details>` tags\n", + " rm_details=True, # Removes `<details>
` tags\n", + " multi=False, # Get all matches to `sel` or first one \n", + " wrap_tag=None, #If multi, each selection wrapped with content\n", " ): # Cleaned markdown\n", " \"Get `url`, optionally selecting CSS selector `sel`, and convert to clean markdown\"\n", " page = get(url).text\n", " if sel:\n", " soup = BeautifulSoup(page, 'html.parser')\n", - " page = ''.join(str(el) for el in soup.select(sel))\n", - " md = html2md(page)\n", - " return clean_md(md, rm_comments, rm_details=rm_details)" + " if multi:\n", + " page = [str(el) for el in soup.select(sel)]\n", + " if not wrap_tag: page = \"\\n\".join(page)\n", + " else: page = str(soup.select_one(sel))\n", + " mds = map(lambda x: clean_md(html2md(x), rm_comments, rm_details=rm_details), tuplify(page))\n", + " if wrap_tag: return '\\n'.join([f\"\\n<{wrap_tag}>\\n{o}\\n\" for o in mds])\n", + " else: return'\\n'.join(mds)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1d07c687", + "metadata": {}, + "outputs": [], + "source": [ + "# test single class selector\n", + "listings = read_html('https://www.answer.ai/', sel='.listing-description')\n", + "assert len(listings) < 500\n", + "\n", + "# Test multi class selector\n", + "listings = read_html('https://www.answer.ai/', sel='.listing-description', multi=True)\n", + "assert len(listings) > 1000 # returns more than single so selecting multi\n", + "\n", + "# Test multi_wrap_tag\n", + "listings = read_html('https://www.answer.ai/', sel='.listing-description', multi=True, multi_wrap_tag='document')\n", + "assert '' in listings and '' in listings " ] }, { @@ -136,9 +269,9 @@ "metadata": {}, "outputs": [], "source": [ - "assert len(read_html('https://www.answer.ai/', sel='div')) > 100\n", - "assert len(read_html('https://www.answer.ai/', sel='.listing-description')) > 100\n", - "assert len(read_html('https://www.answer.ai/', sel='div.listing-description')) > 100" + "# test tag css selectors\n", + "assert len(read_html('https://www.answer.ai/', 
sel='div.listing-description', multi=True)) > 1000\n", + "assert len(read_html('https://www.answer.ai/', sel='div', multi=True)) > 1000" ] }, { diff --git a/toolslm/download.py b/toolslm/download.py index a34f345..227bcca 100644 --- a/toolslm/download.py +++ b/toolslm/download.py @@ -35,21 +35,28 @@ def html2md(s:str): o.ignore_images = True return o.handle(s) -# %% ../03_download.ipynb 8 +# %% ../03_download.ipynb 14 def read_html(url, # URL to read sel=None, # Read only outerHTML of CSS selector `sel` rm_comments=True, # Removes HTML comments - rm_details=True # Removes `
<details>` tags + rm_details=True, # Removes `<details>
` tags + multi=False, # Get all matches to `sel` or first one + wrap_tag=None, #If multi, each selection wrapped with content ): # Cleaned markdown "Get `url`, optionally selecting CSS selector `sel`, and convert to clean markdown" page = get(url).text if sel: soup = BeautifulSoup(page, 'html.parser') - page = ''.join(str(el) for el in soup.select(sel)) - md = html2md(page) - return clean_md(md, rm_comments, rm_details=rm_details) + if multi: + page = [str(el) for el in soup.select(sel)] + if not wrap_tag: page = "\n".join(page) + else: page = str(soup.select_one(sel)) + mds = map(lambda x: clean_md(html2md(x), rm_comments, rm_details=rm_details), tuplify(page)) + if wrap_tag: return '\n'.join([f"\n<{wrap_tag}>\n{o}\n" for o in mds]) + else: return'\n'.join(mds) -# %% ../03_download.ipynb 11 + +# %% ../03_download.ipynb 18 def get_llmstxt(url, optional=False, n_workers=None): "Get llms.txt file from and expand it with `llms_txt.create_ctx()`" if not url.endswith('llms.txt'): return None @@ -57,7 +64,7 @@ def get_llmstxt(url, optional=False, n_workers=None): if resp.status_code!=200: return None return create_ctx(resp.text, optional=optional, n_workers=n_workers) -# %% ../03_download.ipynb 13 +# %% ../03_download.ipynb 20 def split_url(url): "Split `url` into base, path, and file name, normalising name to '/' if empty" parsed = urlparse(url.strip('/')) @@ -67,13 +74,13 @@ def split_url(url): if not path and not fname: path='/' return base,path,fname -# %% ../03_download.ipynb 15 +# %% ../03_download.ipynb 22 def _tryget(url): "Return response from `url` if `status_code!=404`, otherwise `None`" res = get(url) return None if res.status_code==404 else url -# %% ../03_download.ipynb 16 +# %% ../03_download.ipynb 23 def find_docs(url): "If available, return LLM-friendly llms.txt context or markdown file location from `url`" base,path,fname = split_url(url) @@ -93,7 +100,7 @@ def find_docs(url): if parsed_url.path == '/' or not parsed_url.path: return None return 
find_docs(urljoin(url, '..')) -# %% ../03_download.ipynb 22 +# %% ../03_download.ipynb 29 def read_docs(url, optional=False, n_workers=None, rm_comments=True, rm_details=True): "If available, return LLM-friendly llms.txt context or markdown file response for `url`" url = find_docs(url) From ba55084475867915f2d2e3f1735d226f8dbd25b9 Mon Sep 17 00:00:00 2001 From: Isaac Flath Date: Thu, 26 Sep 2024 20:14:25 -0400 Subject: [PATCH 4/4] Clean --- 03_download.ipynb | 109 +------------------------------------------- toolslm/download.py | 12 ++--- 2 files changed, 7 insertions(+), 114 deletions(-) diff --git a/03_download.ipynb b/03_download.ipynb index 321fed4..e56bfcc 100644 --- a/03_download.ipynb +++ b/03_download.ipynb @@ -107,113 +107,6 @@ " return o.handle(s)" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "cb831e24", - "metadata": {}, - "outputs": [], - "source": [ - "from fasthtml.common import FT, to_xml\n", - "from fasthtml.components import Document" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bf0549a1", - "metadata": {}, - "outputs": [], - "source": [ - "from fastcore.all import FT, to_xml" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f39661ce", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ifnone(wrap_tag, noop)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1ec582c2", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'dafads'" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "wrap_tag = Document\n", - "wrap_tag = None\n", - "to_xml(ifnone(wrap_tag, noop)('dafads'))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8a61c516", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'daf'" - ] - }, - 
"execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "f\"{to_xml(Document(\"daf\"))}\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a2e37783", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "('abc',)" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tuplify('abc')" - ] - }, { "cell_type": "code", "execution_count": null, @@ -258,7 +151,7 @@ "assert len(listings) > 1000 # returns more than single so selecting multi\n", "\n", "# Test multi_wrap_tag\n", - "listings = read_html('https://www.answer.ai/', sel='.listing-description', multi=True, multi_wrap_tag='document')\n", + "listings = read_html('https://www.answer.ai/', sel='.listing-description', multi=True, wrap_tag='document')\n", "assert '' in listings and '' in listings " ] }, diff --git a/toolslm/download.py b/toolslm/download.py index 227bcca..e2bd690 100644 --- a/toolslm/download.py +++ b/toolslm/download.py @@ -35,7 +35,7 @@ def html2md(s:str): o.ignore_images = True return o.handle(s) -# %% ../03_download.ipynb 14 +# %% ../03_download.ipynb 8 def read_html(url, # URL to read sel=None, # Read only outerHTML of CSS selector `sel` rm_comments=True, # Removes HTML comments @@ -56,7 +56,7 @@ def read_html(url, # URL to read else: return'\n'.join(mds) -# %% ../03_download.ipynb 18 +# %% ../03_download.ipynb 12 def get_llmstxt(url, optional=False, n_workers=None): "Get llms.txt file from and expand it with `llms_txt.create_ctx()`" if not url.endswith('llms.txt'): return None @@ -64,7 +64,7 @@ def get_llmstxt(url, optional=False, n_workers=None): if resp.status_code!=200: return None return create_ctx(resp.text, optional=optional, n_workers=n_workers) -# %% ../03_download.ipynb 20 +# %% ../03_download.ipynb 14 def split_url(url): "Split `url` into base, path, and file name, normalising name to '/' if empty" parsed = urlparse(url.strip('/')) @@ -74,13 
+74,13 @@ def split_url(url): if not path and not fname: path='/' return base,path,fname -# %% ../03_download.ipynb 22 +# %% ../03_download.ipynb 16 def _tryget(url): "Return response from `url` if `status_code!=404`, otherwise `None`" res = get(url) return None if res.status_code==404 else url -# %% ../03_download.ipynb 23 +# %% ../03_download.ipynb 17 def find_docs(url): "If available, return LLM-friendly llms.txt context or markdown file location from `url`" base,path,fname = split_url(url) @@ -100,7 +100,7 @@ def find_docs(url): if parsed_url.path == '/' or not parsed_url.path: return None return find_docs(urljoin(url, '..')) -# %% ../03_download.ipynb 29 +# %% ../03_download.ipynb 23 def read_docs(url, optional=False, n_workers=None, rm_comments=True, rm_details=True): "If available, return LLM-friendly llms.txt context or markdown file response for `url`" url = find_docs(url)