diff --git a/Dockerfile b/Dockerfile index d51bb0843f5c..d62b85d4980a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -77,6 +77,11 @@ RUN --mount=type=cache,target=/root/.cache/pip/http \ -r /tmp/cvat/requirements/${DJANGO_CONFIGURATION}.txt \ -w /tmp/wheelhouse +FROM golang:1.20.5 AS build-smokescreen + +RUN git clone --depth=1 -b v0.0.4 https://github.com/stripe/smokescreen.git +RUN cd smokescreen && go build -o /tmp/smokescreen + FROM ${BASE_IMAGE} ARG http_proxy @@ -126,6 +131,9 @@ RUN apt-get update && \ rm -rf /var/lib/apt/lists/* && \ echo 'application/wasm wasm' >> /etc/mime.types +# Install smokescreen +COPY --from=build-smokescreen /tmp/smokescreen /usr/local/bin/smokescreen + # Add a non-root user ENV USER=${USER} ENV HOME /home/${USER} diff --git a/cvat/apps/engine/task.py b/cvat/apps/engine/task.py index 689619b93fdb..f3956108167c 100644 --- a/cvat/apps/engine/task.py +++ b/cvat/apps/engine/task.py @@ -15,10 +15,9 @@ from distutils.dir_util import copy_tree from urllib import parse as urlparse from urllib import request as urlrequest -import ipaddress -import dns.resolver import django_rq import pytz +import requests from django.conf import settings from django.db import transaction @@ -30,7 +29,7 @@ from cvat.apps.engine.media_extractors import (MEDIA_TYPES, ImageListReader, Mpeg4ChunkWriter, Mpeg4CompressedChunkWriter, ValidateDimension, ZipChunkWriter, ZipCompressedChunkWriter, get_mime, sort) from cvat.apps.engine.utils import av_scan_paths, get_rq_job_meta -from cvat.utils.http import make_requests_session +from cvat.utils.http import make_requests_session, PROXIES_FOR_UNTRUSTED_URLS from utils.dataset_manifest import ImageManifestManager, VideoManifestManager, is_manifest from utils.dataset_manifest.core import VideoManifestValidator, is_dataset_manifest from utils.dataset_manifest.utils import detect_related_images @@ -354,11 +353,7 @@ def _validate_manifest( return None -def _validate_url(url): - def _validate_ip_address(ip_address): - if not ip_address.is_global: - raise ValidationError('Non public IP address \'{}\' is provided!'.format(ip_address)) - +def _validate_scheme(url): ALLOWED_SCHEMES = ['http', 'https'] parsed_url = urlparse.urlparse(url) @@ -366,33 +361,6 @@ def _validate_ip_address(ip_address): if parsed_url.scheme not in ALLOWED_SCHEMES: raise ValueError('Unsupported URL sheme: {}. Only http and https are supported'.format(parsed_url.scheme)) - try: - ip_address = ipaddress.ip_address(parsed_url.hostname) - _validate_ip_address(ip_address) - except ValueError as _: - ip_v4_records = None - ip_v6_records = None - try: - ip_v4_records = dns.resolver.query(parsed_url.hostname, 'A') - for record in ip_v4_records: - _validate_ip_address(ipaddress.ip_address(record.to_text())) - except ValidationError: - raise - except Exception as e: - slogger.glob.info('Cannot get A record for domain \'{}\': {}'.format(parsed_url.hostname, e)) - - try: - ip_v6_records = dns.resolver.query(parsed_url.hostname, 'AAAA') - for record in ip_v6_records: - _validate_ip_address(ipaddress.ip_address(record.to_text())) - except ValidationError: - raise - except Exception as e: - slogger.glob.info('Cannot get AAAA record for domain \'{}\': {}'.format(parsed_url.hostname, e)) - - if not ip_v4_records and not ip_v6_records: - raise ValidationError('Cannot resolve IP address for domain \'{}\''.format(parsed_url.hostname)) - def _download_data(urls, upload_dir): job = rq.get_current_job() local_files = {} @@ -402,18 +370,27 @@ def _download_data(urls, upload_dir): name = os.path.basename(urlrequest.url2pathname(urlparse.urlparse(url).path)) if name in local_files: raise Exception("filename collision: {}".format(name)) - _validate_url(url) + _validate_scheme(url) slogger.glob.info("Downloading: {}".format(url)) job.meta['status'] = '{} is being downloaded..'.format(url) job.save_meta() - response = session.get(url, stream=True) + response = session.get(url, stream=True, proxies=PROXIES_FOR_UNTRUSTED_URLS) if response.status_code == 200: response.raw.decode_content = True with open(os.path.join(upload_dir, name), 'wb') as output_file: shutil.copyfileobj(response.raw, output_file) else: - raise Exception("Failed to download " + url) + error_message = f"Failed to download {response.url}" + if url != response.url: + error_message += f" (redirected from {url})" + + if response.status_code == 407: + error_message += "; likely attempt to access internal host" + elif response.status_code: + error_message += f"; HTTP error {response.status_code}" + + raise Exception(error_message) local_files[name] = True diff --git a/cvat/apps/webhooks/signals.py b/cvat/apps/webhooks/signals.py index cee8521c4d71..9e381dd22b89 100644 --- a/cvat/apps/webhooks/signals.py +++ b/cvat/apps/webhooks/signals.py @@ -22,7 +22,7 @@ get_instance_diff, organization_id, project_id) from cvat.apps.organizations.models import Invitation, Membership, Organization -from cvat.utils.http import make_requests_session +from cvat.utils.http import make_requests_session, PROXIES_FOR_UNTRUSTED_URLS from .event_type import EventTypeChoice, event_name from .models import Webhook, WebhookDelivery, WebhookTypeChoice @@ -55,6 +55,7 @@ def send_webhook(webhook, payload, redelivery=False): headers=headers, timeout=WEBHOOK_TIMEOUT, stream=True, + proxies=PROXIES_FOR_UNTRUSTED_URLS, ) status_code = response.status_code response_body = response.raw.read( diff --git a/cvat/settings/base.py b/cvat/settings/base.py index 25f6bfebe601..2944867036ac 100644 --- a/cvat/settings/base.py +++ b/cvat/settings/base.py @@ -683,3 +683,5 @@ class CVAT_QUEUES(Enum): IMPORT_CACHE_FAILED_TTL = timedelta(days=90) IMPORT_CACHE_SUCCESS_TTL = timedelta(hours=1) IMPORT_CACHE_CLEAN_DELAY = timedelta(hours=2) + +USE_SMOKESCREEN = True diff --git a/cvat/settings/development.py b/cvat/settings/development.py index dacdc87f5412..fc09a520db8d 100644 --- a/cvat/settings/development.py +++ b/cvat/settings/development.py @@ -65,3 +65,5 @@ DATABASES['default']['HOST'] = os.getenv('CVAT_POSTGRES_HOST', 'localhost') QUALITY_CHECK_JOB_DELAY = 5 + +USE_SMOKESCREEN = False diff --git a/cvat/utils/http.py b/cvat/utils/http.py index c713871e1192..0cf2e59bdfab 100644 --- a/cvat/utils/http.py +++ b/cvat/utils/http.py @@ -2,6 +2,8 @@ # # SPDX-License-Identifier: MIT +from django.conf import settings + import requests import requests.utils @@ -9,6 +11,18 @@ _CVAT_USER_AGENT = f"CVAT/{__version__} {requests.utils.default_user_agent()}" +# Note that setting Session.proxies doesn't work correctly if an upstream proxy +# is specified via environment variables: . +# Therefore, for robust operation, these need to be passed with every request via the +# proxies= parameter. +PROXIES_FOR_UNTRUSTED_URLS = None + +if settings.USE_SMOKESCREEN: + PROXIES_FOR_UNTRUSTED_URLS = { + 'http': 'http://smokescreen:4750', + 'https': 'http://smokescreen:4750', + } + def make_requests_session() -> requests.Session: session = requests.Session() session.headers['User-Agent'] = _CVAT_USER_AGENT diff --git a/docker-compose.yml b/docker-compose.yml index 61d47119fba0..df9b3462f4f8 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -78,13 +78,16 @@ services: DJANGO_LOG_SERVER_PORT: 80 no_proxy: clickhouse,grafana,vector,nuclio,opa,${no_proxy:-} NUMPROCS: 1 + SMOKESCREEN_OPTS: ${SMOKESCREEN_OPTS:-} command: -c supervisord/utils.conf volumes: - cvat_data:/home/django/data - cvat_keys:/home/django/keys - cvat_logs:/home/django/logs networks: - - cvat + cvat: + aliases: + - smokescreen cvat_worker_import: container_name: cvat_worker_import diff --git a/supervisord/utils.conf b/supervisord/utils.conf index be2f7f6f623f..94376a378e02 100644 --- a/supervisord/utils.conf +++ b/supervisord/utils.conf @@ -49,4 +49,7 @@ command=%(ENV_HOME)s/wait-for-it.sh %(ENV_CVAT_REDIS_HOST)s:6379 -t 0 -- bash -i " environment=SSH_AUTH_SOCK="/tmp/ssh-agent.sock",VECTOR_EVENT_HANDLER="SynchronousLogstashHandler" numprocs=%(ENV_NUMPROCS)s -process_name=rqworker_cleaning_%(process_num)s \ No newline at end of file +process_name=rqworker_cleaning_%(process_num)s + +[program:smokescreen] +command=smokescreen %(ENV_SMOKESCREEN_OPTS)s diff --git a/tests/docker-compose.test_servers.yml b/tests/docker-compose.test_servers.yml index 5d8fced7fb9e..7ef5999f73ec 100644 --- a/tests/docker-compose.test_servers.yml +++ b/tests/docker-compose.test_servers.yml @@ -1,4 +1,19 @@ services: + cvat_utils: + depends_on: + - webhook_receiver + entrypoint: + - /bin/bash + - -euc + # We want to exclude webhooks_receiver from SSRF protection, + # so that the server can send the webhooks there. + # --allow-address doesn't allow hostnames, so we have to resolve + # the IP address ourselves. + - | + webhooks_ip_addr="$$(getent hosts webhooks | head -1 | awk '{ print $$1 }')" + export SMOKESCREEN_OPTS=--allow-address="$$webhooks_ip_addr" + exec /usr/bin/supervisord -c supervisord/utils.conf + webhook_receiver: image: python:3.9-slim restart: always