Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/crawlee/_utils/sitemap.py
Original file line number Diff line number Diff line change
Expand Up @@ -335,7 +335,7 @@ async def _fetch_and_process_sitemap(
# Check if the first chunk is a valid gzip header
if first_chunk and raw_chunk.startswith(b'\x1f\x8b'):
decompressor = zlib.decompressobj(zlib.MAX_WBITS | 16)
first_chunk = False
first_chunk = False

chunk = decompressor.decompress(raw_chunk) if decompressor else raw_chunk
text_chunk = decoder.decode(chunk)
Expand Down
6 changes: 0 additions & 6 deletions tests/unit/_utils/test_sitemap.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,7 @@
import base64
import gzip
import os
from datetime import datetime

import pytest
from yarl import URL

from crawlee._utils.sitemap import Sitemap, SitemapUrl, parse_sitemap
Expand Down Expand Up @@ -106,10 +104,6 @@ async def test_gzipped_sitemap_with_invalid_data(server_url: URL, http_client: H
assert sitemap.urls == []


@pytest.mark.skipif(
os.name == 'nt',
reason='This test is flaky on Windows, see https://github.com/apify/crawlee-python/issues/1460.',
)
async def test_gz_sitemap_with_non_gzipped(server_url: URL, http_client: HttpClient) -> None:
"""Test loading a sitemap with gzip type and .xml.gz url, but without gzipped data."""
sitemap_url = (server_url / 'sitemap.xml.gz').with_query(
Expand Down
2 changes: 1 addition & 1 deletion tests/unit/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,9 +195,9 @@ def redirect_server_url(redirect_http_server: TestServer) -> URL:

@pytest.fixture(
params=[
pytest.param('curl', id='curl'),
pytest.param('httpx', id='httpx'),
pytest.param('impit', id='impit'),
pytest.param('curl', id='curl'),
]
)
async def http_client(request: pytest.FixtureRequest) -> HttpClient:
Expand Down
16 changes: 8 additions & 8 deletions tests/unit/otel/test_crawler_instrumentor.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,20 @@
import io
import json
import os
import re
from unittest import mock

import pytest
from opentelemetry.sdk.resources import Resource
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import ConsoleSpanExporter, SimpleSpanProcessor
from opentelemetry.trace import set_tracer_provider
from yarl import URL

from crawlee import ConcurrencySettings
from crawlee.crawlers import ParselCrawler
from crawlee.otel.crawler_instrumentor import CrawlerInstrumentor
from crawlee.storages import Dataset


@pytest.mark.skipif(
os.name == 'nt',
reason='This test is flaky on Windows, see https://github.com/apify/crawlee-python/issues/1469.',
)
async def test_crawler_instrumentor_capability(server_url: URL) -> None:
"""Test OpenTelemetry instrumentation capability of the crawler.

Expand All @@ -40,15 +35,20 @@ async def test_crawler_instrumentor_capability(server_url: URL) -> None:
provider.add_span_processor(SimpleSpanProcessor(exporter))
set_tracer_provider(provider)
# Instrument the crawler with OpenTelemetry
CrawlerInstrumentor(instrument_classes=[Dataset]).instrument()
instrumentor = CrawlerInstrumentor(instrument_classes=[Dataset])
instrumentor.instrument()

# Generate first telemetry data from `Dataset` public methods.
    # `Dataset` is in the `instrument_classes` argument, and thus its public methods are instrumented.
dataset = await Dataset.open(name='test-dataset')
await dataset.drop()

# Other traces will be from crawler run.
crawler = ParselCrawler(max_requests_per_crawl=1, request_handler=mock.AsyncMock())
crawler = ParselCrawler(
max_requests_per_crawl=1,
request_handler=mock.AsyncMock(),
concurrency_settings=ConcurrencySettings(desired_concurrency=1, max_concurrency=1),
)

# Run crawler and generate more telemetry data.
await crawler.run([str(server_url)])
Expand Down
Loading