diff --git a/03_download.ipynb b/03_download.ipynb
index 6692891..e56bfcc 100644
--- a/03_download.ipynb
+++ b/03_download.ipynb
@@ -118,15 +118,53 @@
     "def read_html(url, # URL to read\n",
     "              sel=None, # Read only outerHTML of CSS selector `sel`\n",
     "              rm_comments=True, # Removes HTML comments\n",
-    "              rm_details=True # Removes `<details>` tags\n",
+    "              rm_details=True, # Removes `<details>` tags\n",
+    "              multi=False, # Get all matches to `sel` or first one\n",
+    "              wrap_tag=None, #If multi, each selection wrapped with content\n",
     "             ): # Cleaned markdown\n",
     "    \"Get `url`, optionally selecting CSS selector `sel`, and convert to clean markdown\"\n",
     "    page = get(url).text\n",
     "    if sel:\n",
     "        soup = BeautifulSoup(page, 'html.parser')\n",
-    "        page = str(soup.find(sel))\n",
-    "    md = html2md(page)\n",
-    "    return clean_md(md, rm_comments, rm_details=rm_details)"
+    "        if multi:\n",
+    "            page = [str(el) for el in soup.select(sel)]\n",
+    "            if not wrap_tag: page = \"\\n\".join(page)\n",
+    "        else: page = str(soup.select_one(sel))\n",
+    "    mds = map(lambda x: clean_md(html2md(x), rm_comments, rm_details=rm_details), tuplify(page))\n",
+    "    if wrap_tag: return '\\n'.join([f\"\\n<{wrap_tag}>\\n{o}\\n</{wrap_tag}>\" for o in mds])\n",
+    "    else: return '\\n'.join(mds)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1d07c687",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# test single class selector\n",
+    "listings = read_html('https://www.answer.ai/', sel='.listing-description')\n",
+    "assert len(listings) < 500\n",
+    "\n",
+    "# Test multi class selector\n",
+    "listings = read_html('https://www.answer.ai/', sel='.listing-description', multi=True)\n",
+    "assert len(listings) > 1000 # returns more than single so selecting multi\n",
+    "\n",
+    "# Test multi_wrap_tag\n",
+    "listings = read_html('https://www.answer.ai/', sel='.listing-description', multi=True, wrap_tag='document')\n",
+    "assert '<document>' in listings and '</document>' in listings"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7406a52d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# test tag css selectors\n",
+    "assert len(read_html('https://www.answer.ai/', sel='div.listing-description', multi=True)) > 1000\n",
+    "assert len(read_html('https://www.answer.ai/', sel='div', multi=True)) > 1000"
    ]
   },
   {
@@ -138,6 +176,7 @@
    "source": [
     "htmlurl = 'https://hypermedia.systems/hypermedia-a-reintroduction/'\n",
     "hmd = read_html(htmlurl)\n",
+    "assert len(hmd) > 100\n",
     "# Markdown(hmd)"
    ]
   },
diff --git a/toolslm/download.py b/toolslm/download.py
index afdcef2..e2bd690 100644
--- a/toolslm/download.py
+++ b/toolslm/download.py
@@ -13,20 +13,20 @@
 from bs4 import BeautifulSoup
 from urllib.parse import urlparse, urljoin
 
-# %% ../03_download.ipynb 5
+# %% ../03_download.ipynb 4
 def clean_md(text, rm_comments=True, rm_details=True):
     "Remove comments and `<details>` sections from `text`"
     if rm_comments: text = re.sub(r'\n?<!--.*?-->\n?', '', text, flags=re.DOTALL)
     if rm_details: text = re.sub(r'\n?<details>.*?</details>\n?', '', text, flags=re.DOTALL)
     return text
 
-# %% ../03_download.ipynb 6
+# %% ../03_download.ipynb 5
 @delegates(get)
 def read_md(url, rm_comments=True, rm_details=True, **kwargs):
     "Read text from `url` and clean with `clean_docs`"
     return clean_md(get(url, **kwargs).text, rm_comments=rm_comments, rm_details=rm_details)
 
-# %% ../03_download.ipynb 8
+# %% ../03_download.ipynb 7
 def html2md(s:str):
     "Convert `s` from HTML to markdown"
     o = HTML2Text(bodywidth=5000)
@@ -35,21 +35,28 @@
     o.ignore_images = True
     return o.handle(s)
 
-# %% ../03_download.ipynb 9
+# %% ../03_download.ipynb 8
 def read_html(url, # URL to read
               sel=None, # Read only outerHTML of CSS selector `sel`
               rm_comments=True, # Removes HTML comments
-              rm_details=True # Removes `<details>` tags
+              rm_details=True, # Removes `<details>` tags
+              multi=False, # Get all matches to `sel` or first one
+              wrap_tag=None, #If multi, each selection wrapped with content
              ): # Cleaned markdown
     "Get `url`, optionally selecting CSS selector `sel`, and convert to clean markdown"
     page = get(url).text
     if sel:
         soup = BeautifulSoup(page, 'html.parser')
-        page = str(soup.find(sel))
-    md = html2md(page)
-    return clean_md(md, rm_comments, rm_details=rm_details)
+        if multi:
+            page = [str(el) for el in soup.select(sel)]
+            if not wrap_tag: page = "\n".join(page)
+        else: page = str(soup.select_one(sel))
+    mds = map(lambda x: clean_md(html2md(x), rm_comments, rm_details=rm_details), tuplify(page))
+    if wrap_tag: return '\n'.join([f"\n<{wrap_tag}>\n{o}\n</{wrap_tag}>" for o in mds])
+    else: return '\n'.join(mds)
 
-# %% ../03_download.ipynb 11
+
+# %% ../03_download.ipynb 12
 def get_llmstxt(url, optional=False, n_workers=None):
     "Get llms.txt file from and expand it with `llms_txt.create_ctx()`"
     if not url.endswith('llms.txt'): return None
@@ -57,7 +64,7 @@
     resp = get(url)
     if resp.status_code!=200: return None
     return create_ctx(resp.text, optional=optional, n_workers=n_workers)
-# %% ../03_download.ipynb 13
+# %% ../03_download.ipynb 14
 def split_url(url):
     "Split `url` into base, path, and file name, normalising name to '/' if empty"
     parsed = urlparse(url.strip('/'))
@@ -67,13 +74,13 @@
     if not path and not fname: path='/'
     return base,path,fname
 
-# %% ../03_download.ipynb 15
+# %% ../03_download.ipynb 16
 def _tryget(url):
     "Return response from `url` if `status_code!=404`, otherwise `None`"
     res = get(url)
     return None if res.status_code==404 else url
 
-# %% ../03_download.ipynb 16
+# %% ../03_download.ipynb 17
 def find_docs(url):
     "If available, return LLM-friendly llms.txt context or markdown file location from `url`"
     base,path,fname = split_url(url)
@@ -93,7 +100,7 @@
     if parsed_url.path == '/' or not parsed_url.path: return None
     return find_docs(urljoin(url, '..'))
 
-# %% ../03_download.ipynb 22
+# %% ../03_download.ipynb 23
 def read_docs(url, optional=False, n_workers=None, rm_comments=True, rm_details=True):
     "If available, return LLM-friendly llms.txt context or markdown file response for `url`"
     url = find_docs(url)
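A minimal usage sketch of the `multi` and `wrap_tag` options introduced in this diff, assuming the patched `read_html` is importable from `toolslm.download`; the URL and selector mirror the notebook's own test cells:

```python
from toolslm.download import read_html

# Default behaviour: markdown for the first element matching the selector
first = read_html('https://www.answer.ai/', sel='.listing-description')

# multi=True: convert every matching element and join the results with newlines
all_md = read_html('https://www.answer.ai/', sel='.listing-description', multi=True)
assert len(all_md) > len(first)

# wrap_tag='document': wrap each converted match in <document>...</document>
# so downstream prompts can tell the individual selections apart
wrapped = read_html('https://www.answer.ai/', sel='.listing-description',
                    multi=True, wrap_tag='document')
assert '<document>' in wrapped and '</document>' in wrapped
```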