feat: Improved project bootstrapping #538

Open · wants to merge 3 commits into master · showing changes from 1 commit
110 changes: 87 additions & 23 deletions src/crawlee/_cli.py
@@ -1,7 +1,7 @@
# ruff: noqa: TRY301, FBT002, UP007
from __future__ import annotations

-import os
+import json
from pathlib import Path
from typing import Annotated, Optional, cast

@@ -16,6 +16,11 @@

cli = typer.Typer(no_args_is_help=True)

cookiecutter_json = json.load((Path(__file__).parent.parent.parent / 'templates' / 'crawler' / 'cookiecutter.json').open())
crawler_choices = cookiecutter_json['crawler_type']
package_manager_choices = cookiecutter_json['package_manager']
default_start_url = cookiecutter_json['start_url']


@cli.callback(invoke_without_command=True)
def callback(
@@ -64,25 +69,42 @@ def _prompt_for_project_name(initial_project_name: str | None) -> str:
    return project_name


-def _prompt_for_template() -> str:
-    """Prompt the user to select a template from a list."""
-    # Fetch available templates
-    response = httpx.get(
-        TEMPLATE_LIST_URL,
-        timeout=httpx.Timeout(10),
-        headers=[('Authorization', f'Bearer {os.environ["GH_TOKEN"]}')] if 'GH_TOKEN' in os.environ else [],
+def _prompt_text(message: str, default: str) -> str:
+    return cast(
+        str,
+        ConsoleRender().render(
+            inquirer.Text(
+                name='text',
+                message=message,
+                default=default,
+                validate=lambda _, value: bool(value.strip()),
+            ),
+        ),
    )
-    response.raise_for_status()
-    template_choices = [item['name'] for item in response.json() if item['type'] == 'dir']
-
-    # Prompt for template choice
+
+
+def _prompt_choice(message: str, choices: list[str]) -> str:
+    """Prompt the user to pick one from a list of choices."""
    return cast(
        str,
        ConsoleRender().render(
            inquirer.List(
-                name='template',
-                message='Please select the template for your new Crawlee project',
-                choices=[(choice[0].upper() + choice[1:], choice) for choice in template_choices],
+                name='choice',
+                message=message,
+                choices=[(choice[0].upper() + choice[1:], choice) for choice in choices],
            ),
        ),
    )
+
+
+def _prompt_bool(message: str, *, default: bool) -> bool:
+    return cast(
+        bool,
+        ConsoleRender().render(
+            inquirer.Confirm(
+                name='confirm',
+                message=message,
+                default=default,
+            ),
+        ),
+    )
@@ -92,26 +114,63 @@ def _prompt_for_template() -> str:
def create(
    project_name: Optional[str] = typer.Argument(
        default=None,
+        show_default=False,
        help='The name of the project and the directory that will be created to contain it. '
        'If none is given, you will be prompted.',
    ),
+    crawler_type: Optional[str] = typer.Option(
+        None,
+        '--crawler-type',
+        '--template',
+        show_default=False,
+        help='The library that will be used for crawling in your crawler. If none is given, you will be prompted.',
+    ),
+    package_manager: Optional[str] = typer.Option(
+        default=None,
+        show_default=False,
+        help='Package manager to be used in the new project. If none is given, you will be prompted.',
+    ),
+    start_url: Optional[str] = typer.Option(
+        default=None,
+        show_default=False,
+        help='The URL where crawling should start. If none is given, you will be prompted.',
+    ),
-    template: Optional[str] = typer.Option(
+    enable_apify_integration: Optional[bool] = typer.Option(
        default=None,
-        help='The template to be used to create the project. If none is given, you will be prompted.',
+        show_default=False,
+        help='Should Apify integration be set up for you? If not given, you will be prompted.',
    ),
) -> None:
    """Bootstrap a new Crawlee project."""
    try:
        # Prompt for project name if not provided.
        project_name = _prompt_for_project_name(project_name)

-        # Prompt for template choice if not provided.
-        if template is None:
-            template = _prompt_for_template()
-
-        if project_name and template:
+        # Prompt for crawler_type if not provided.
+        if crawler_type is None:
+            crawler_type = _prompt_choice('Please select the Crawler type', crawler_choices)
+
+        # Prompt for package manager if not provided.
+        if package_manager is None:
+            package_manager = _prompt_choice('Please select the package manager', package_manager_choices)
+
+        # Prompt for start URL
+        if start_url is None:
+            start_url = _prompt_text('Please specify the start URL', default=default_start_url)
+
+        # Ask about Apify integration if not explicitly configured
+        if enable_apify_integration is None:
+            enable_apify_integration = _prompt_bool('Should Apify integration be set up for you?', default=False)
+
+        if all(
+            [
+                project_name,
+                crawler_type,
+                package_manager,
+                start_url,
+                enable_apify_integration is not None,
+            ]
+        ):
            # Start the bootstrap process.
            with Progress(
                SpinnerColumn(),
@@ -121,9 +180,14 @@ def create(
                progress.add_task(description='Bootstrapping...', total=None)
                cookiecutter(
                    template='gh:apify/crawlee-python',
-                    directory=f'templates/{template}',
+                    directory='templates/crawler',
                    no_input=True,
-                    extra_context={'project_name': project_name},
+                    extra_context={
+                        'project_name': project_name,
+                        'package_manager': package_manager,
+                        'crawler_type': crawler_type,
+                        'enable_apify_integration': enable_apify_integration,
+                    },
                )

            typer.echo(f'Your project "{project_name}" was created.')
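For reviewers who want to try the reworked command non-interactively, here is a minimal sketch using typer's built-in test runner. It assumes the CLI app is importable as crawlee._cli.cli (the module this diff edits) and that typer derives the usual --enable-apify-integration/--no-enable-apify-integration pair from the Optional[bool] parameter; when every option is supplied, each `if ... is None` guard is skipped and no prompt fires.

from typer.testing import CliRunner

from crawlee._cli import cli  # assumption: module path taken from the diff above

runner = CliRunner()
result = runner.invoke(
    cli,
    [
        'create',
        'my-crawler',  # positional project_name, so no name prompt
        '--crawler-type', 'playwright',
        '--package-manager', 'poetry',
        '--start-url', 'https://crawlee.dev',
        '--no-enable-apify-integration',  # typer's generated negative flag (assumed spelling)
    ],
)
print(result.output)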
12 changes: 12 additions & 0 deletions templates/crawler/cookiecutter.json
@@ -0,0 +1,12 @@
{
    "project_name": "crawlee-python-beautifulsoup-project",
    "__package_name": "{{ cookiecutter.project_name|lower|replace('-', '_') }}",
    "crawler_type": ["beautifulsoup", "parsel", "playwright"],
    "package_manager": ["poetry", "pip"],
    "enable_apify_integration": false,
    "start_url": "https://crawlee.dev",
    "_jinja2_env_vars": {
        "line_statement_prefix": "# %"
    },
    "_extensions": ["jinja2.ext.do"]
}

Review comment on the "project_name" default:

Collaborator: Is it correct?

Author: Good point. I'll probably just change it to my-crawler or something.
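The _jinja2_env_vars entry is what keeps the template files looking like valid Python: with line_statement_prefix set to "# %", Jinja2 treats any line starting with "# %" as a template statement rather than literal output. A self-contained sketch of the mechanism (illustrative only, not part of the PR):

from jinja2 import Environment

# Same setting as in cookiecutter.json above: lines starting with `# %`
# become Jinja2 statements, so a Python linter sees them as ordinary comments.
env = Environment(line_statement_prefix='# %')

template = env.from_string(
    '# % if enable_apify_integration\n'
    'from apify import Actor\n'
    '# % endif\n'
)

print(template.render(enable_apify_integration=True))   # prints the import line
print(template.render(enable_apify_integration=False))  # prints nothing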
32 changes: 32 additions & 0 deletions templates/crawler/templates/main.py
@@ -0,0 +1,32 @@
# % if cookiecutter.enable_apify_integration
from apify import Actor
# % endif
# % block import required
# % endblock

from .routes import router


async def main() -> None:
    """The crawler entry point."""
    # % filter truncate(0, end='')
    # % block instantiation required
    # % endblock
    # % endfilter

    # % if cookiecutter.enable_apify_integration
    async with Actor:
        # % filter indent(width=8, first=False)
        {{ self.instantiation() }}
        # % endfilter
    # % else
    # % filter indent(width=4, first=False)
    {{ self.instantiation() }}
    # % endfilter
    # % endif

    await crawler.run(
        [
            '{{ cookiecutter.start_url }}',
        ]
    )
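To make the block mechanics concrete: the truncate(0, end='') filter forces the child's instantiation block to be evaluated once while discarding its output, so {{ self.instantiation() }} can re-render it at the correct indentation in whichever branch applies. With crawler_type=beautifulsoup and Apify integration disabled, the composed templates should render to roughly the following (a sketch, not captured generator output):

from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler

from .routes import router


async def main() -> None:
    """The crawler entry point."""
    crawler = BeautifulSoupCrawler(
        request_handler=router,
        max_requests_per_crawl=50,
    )

    await crawler.run(
        [
            'https://crawlee.dev',
        ]
    )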
12 changes: 12 additions & 0 deletions templates/crawler/templates/main_beautifulsoup.py
@@ -0,0 +1,12 @@
# % extends 'main.py'

# % block import
from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler
# % endblock

# % block instantiation
crawler = BeautifulSoupCrawler(
    request_handler=router,
    max_requests_per_crawl=50,
)
# % endblock
12 changes: 12 additions & 0 deletions templates/crawler/templates/main_parsel.py
@@ -0,0 +1,12 @@
# % extends 'main.py'

# % block import
from crawlee.parsel_crawler import ParselCrawler
# % endblock

# % block instantiation
crawler = ParselCrawler(
    request_handler=router,
    max_requests_per_crawl=50,
)
# % endblock
13 changes: 13 additions & 0 deletions templates/crawler/templates/main_playwright.py
@@ -0,0 +1,13 @@
# % extends 'main.py'

# % block import
from crawlee.playwright_crawler import PlaywrightCrawler
# % endblock

# % block instantiation
crawler = PlaywrightCrawler(
    request_handler=router,
    headless=True,
    max_requests_per_crawl=50,
)
# % endblock
19 changes: 19 additions & 0 deletions templates/crawler/templates/routes_beautifulsoup.py
@@ -0,0 +1,19 @@
from crawlee.beautifulsoup_crawler import BeautifulSoupCrawlingContext
from crawlee.router import Router

router = Router[BeautifulSoupCrawlingContext]()


@router.default_handler
async def default_handler(context: BeautifulSoupCrawlingContext) -> None:
"""Default request handler."""
context.log.info(f'Processing {context.request.url} ...')
title = context.soup.find('title')
await context.push_data(
{
'url': context.request.loaded_url,
'title': title.text if title else None,
}
)

await context.enqueue_links()
19 changes: 19 additions & 0 deletions templates/crawler/templates/routes_parsel.py
@@ -0,0 +1,19 @@
from crawlee.parsel_crawler import ParselCrawlingContext
from crawlee.router import Router

router = Router[ParselCrawlingContext]()


@router.default_handler
async def default_handler(context: ParselCrawlingContext) -> None:
"""Default request handler."""
context.log.info(f'Processing {context.request.url} ...')
title = context.selector.xpath('//title/text()').get()
await context.push_data(
{
'url': context.request.loaded_url,
'title': title,
}
)

await context.enqueue_links()
19 changes: 19 additions & 0 deletions templates/crawler/templates/routes_playwright.py
@@ -0,0 +1,19 @@
from crawlee.playwright_crawler import PlaywrightCrawlingContext
from crawlee.router import Router

router = Router[PlaywrightCrawlingContext]()


@router.default_handler
async def default_handler(context: PlaywrightCrawlingContext) -> None:
"""Default request handler."""
context.log.info(f'Processing {context.request.url} ...')
title = await context.page.query_selector('title')
await context.push_data(
{
'url': context.request.loaded_url,
'title': await title.inner_text() if title else None,
}
)

await context.enqueue_links()
62 changes: 62 additions & 0 deletions templates/crawler/{{cookiecutter.project_name}}/Dockerfile
@@ -0,0 +1,62 @@
# First, specify the base Docker image.
# You can see the Docker images from Apify at https://hub.docker.com/r/apify/.
# You can also use any other image from Docker Hub.
# % if cookiecutter.crawler_type == 'playwright'
FROM apify/actor-python-playwright:3.12
# % else
FROM apify/actor-python:3.12
# % endif

RUN apt install -yq git && rm -rf /var/lib/apt/lists/*

# % if cookiecutter.package_manager == 'poetry'
RUN pip install -U pip setuptools \
    && pip install poetry \
    && poetry self add poetry-plugin-export

# Second, copy just poetry.lock and pyproject.toml into the Actor image,
# since those should be the only files that affect the dependency install in the next step,
# in order to speed up the build
COPY pyproject.toml ./
COPY poetry.lock ./

# Install the dependencies
RUN echo "Python version:" \
&& python --version \
&& echo "Installing dependencies:" \
# Export packages from poetry.lock
&& poetry export -f requirements.txt --without-hashes | \
# Replace playwright version so that it matches whatever is pre-installed in the image
sed "s/^playwright==.*/playwright==$(playwright --version | cut -d ' ' -f 2)/" | \
# Install everything using pip (ignore dependency checks - the lockfile is correct, period)
pip install -r /dev/stdin --no-dependencies \
&& echo "All installed Python packages:" \
&& pip freeze
# % elif cookiecutter.package_manager == 'pip'
RUN pip install -U pip setuptools

# Second, copy just pyproject.toml into the Actor image,
# since it should be the only file that affects the dependency install in the next step,
# in order to speed up the build
COPY pyproject.toml ./

# Install the dependencies
RUN echo "Python version:" \
&& python --version \
&& echo "Installing dependencies:" \
# Install everything using pip, set playwright version so that it matches whatever is pre-installed in the image
pip install . playwright==$(playwright --version | cut -d ' ' -f 2) \
&& echo "All installed Python packages:" \
&& pip freeze
# % endif

# Next, copy the remaining files and directories with the source code.
# Since we do this after installing the dependencies, quick build will be really fast
# for most source file changes.
COPY . ./

# Use compileall to ensure the runnability of the Actor Python code.
RUN python -m compileall -q .

# Specify how to launch the source code of your Actor.
CMD ["python", "-m", "{{ cookiecutter.__package_name }}"]