diff --git a/src/crawlee/_utils/sitemap.py b/src/crawlee/_utils/sitemap.py
index c6ee552b74..0d839cd1ed 100644
--- a/src/crawlee/_utils/sitemap.py
+++ b/src/crawlee/_utils/sitemap.py
@@ -335,7 +335,7 @@ async def _fetch_and_process_sitemap(
         # Check if the first chunk is a valid gzip header
         if first_chunk and raw_chunk.startswith(b'\x1f\x8b'):
             decompressor = zlib.decompressobj(zlib.MAX_WBITS | 16)
-            first_chunk = False
+        first_chunk = False

         chunk = decompressor.decompress(raw_chunk) if decompressor else raw_chunk
         text_chunk = decoder.decode(chunk)
diff --git a/tests/unit/_utils/test_sitemap.py b/tests/unit/_utils/test_sitemap.py
index bd3e46c2b1..807090eaa4 100644
--- a/tests/unit/_utils/test_sitemap.py
+++ b/tests/unit/_utils/test_sitemap.py
@@ -1,9 +1,7 @@
 import base64
 import gzip
-import os
 from datetime import datetime

-import pytest
 from yarl import URL

 from crawlee._utils.sitemap import Sitemap, SitemapUrl, parse_sitemap
@@ -106,10 +104,6 @@ async def test_gzipped_sitemap_with_invalid_data(server_url: URL, http_client: H
     assert sitemap.urls == []


-@pytest.mark.skipif(
-    os.name == 'nt',
-    reason='This test is flaky on Windows, see https://github.com/apify/crawlee-python/issues/1460.',
-)
 async def test_gz_sitemap_with_non_gzipped(server_url: URL, http_client: HttpClient) -> None:
     """Test loading a sitemap with gzip type and .xml.gz url, but without gzipped data."""
     sitemap_url = (server_url / 'sitemap.xml.gz').with_query(
diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py
index dec1994ad1..695b925256 100644
--- a/tests/unit/conftest.py
+++ b/tests/unit/conftest.py
@@ -195,9 +195,9 @@ def redirect_server_url(redirect_http_server: TestServer) -> URL:

 @pytest.fixture(
     params=[
-        pytest.param('curl', id='curl'),
         pytest.param('httpx', id='httpx'),
         pytest.param('impit', id='impit'),
+        pytest.param('curl', id='curl'),
     ]
 )
 async def http_client(request: pytest.FixtureRequest) -> HttpClient:
diff --git a/tests/unit/otel/test_crawler_instrumentor.py b/tests/unit/otel/test_crawler_instrumentor.py
index e11dde2eb4..8f46ab1ede 100644
--- a/tests/unit/otel/test_crawler_instrumentor.py
+++ b/tests/unit/otel/test_crawler_instrumentor.py
@@ -1,25 +1,20 @@
 import io
 import json
-import os
 import re
 from unittest import mock

-import pytest
 from opentelemetry.sdk.resources import Resource
 from opentelemetry.sdk.trace import TracerProvider
 from opentelemetry.sdk.trace.export import ConsoleSpanExporter, SimpleSpanProcessor
 from opentelemetry.trace import set_tracer_provider
 from yarl import URL

+from crawlee import ConcurrencySettings
 from crawlee.crawlers import ParselCrawler
 from crawlee.otel.crawler_instrumentor import CrawlerInstrumentor
 from crawlee.storages import Dataset


-@pytest.mark.skipif(
-    os.name == 'nt',
-    reason='This test is flaky on Windows, see https://github.com/apify/crawlee-python/issues/1469.',
-)
 async def test_crawler_instrumentor_capability(server_url: URL) -> None:
     """Test OpenTelemetry instrumentation capability of the crawler.

@@ -40,7 +35,8 @@ async def test_crawler_instrumentor_capability(server_url: URL) -> None:
     provider.add_span_processor(SimpleSpanProcessor(exporter))
     set_tracer_provider(provider)
     # Instrument the crawler with OpenTelemetry
-    CrawlerInstrumentor(instrument_classes=[Dataset]).instrument()
+    instrumentor = CrawlerInstrumentor(instrument_classes=[Dataset])
+    instrumentor.instrument()

     # Generate first telemetry data from `Dataset` public methods.
     # `Dataset` is in `instrument_classes` argument, and thus it's public methods are instrumented.
@@ -48,7 +44,11 @@ async def test_crawler_instrumentor_capability(server_url: URL) -> None:
     await dataset.drop()

     # Other traces will be from crawler run.
-    crawler = ParselCrawler(max_requests_per_crawl=1, request_handler=mock.AsyncMock())
+    crawler = ParselCrawler(
+        max_requests_per_crawl=1,
+        request_handler=mock.AsyncMock(),
+        concurrency_settings=ConcurrencySettings(desired_concurrency=1, max_concurrency=1),
+    )

     # Run crawler and generate more telemetry data.
     await crawler.run([str(server_url)])
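
For reference, the `sitemap.py` hunk is easier to follow outside diff context. The sketch below is a minimal, self-contained reconstruction of the streaming decode loop, not crawlee's actual implementation (`decode_stream` and its signature are invented for illustration). The point of the fix is that `first_chunk` is now cleared outside the `if`, so the gzip probe runs exactly once even when the first chunk is not gzipped, and a later chunk that happens to begin with the magic bytes `b'\x1f\x8b'` can no longer be misread as a gzip header mid-stream.

```python
import codecs
import gzip
import zlib


def decode_stream(chunks: list[bytes], encoding: str = 'utf-8') -> str:
    """Incrementally decode a byte stream, gunzipping it if it starts with a gzip header."""
    decoder = codecs.getincrementaldecoder(encoding)(errors='replace')
    decompressor = None
    first_chunk = True
    parts: list[str] = []

    for raw_chunk in chunks:
        # Probe for the gzip magic bytes on the first chunk only.
        if first_chunk and raw_chunk.startswith(b'\x1f\x8b'):
            # MAX_WBITS | 16 tells zlib to expect a gzip wrapper.
            decompressor = zlib.decompressobj(zlib.MAX_WBITS | 16)
        # The fix: clear the flag unconditionally, so a non-gzipped stream is
        # never probed again on subsequent chunks.
        first_chunk = False

        chunk = decompressor.decompress(raw_chunk) if decompressor else raw_chunk
        parts.append(decoder.decode(chunk))

    parts.append(decoder.decode(b'', final=True))
    return ''.join(parts)


# Both plain and gzipped inputs decode to the same text, even when split
# into chunks at arbitrary byte boundaries.
data = b'<urlset>hello</urlset>'
assert decode_stream([data[:5], data[5:]]) == '<urlset>hello</urlset>'
gz = gzip.compress(data)
assert decode_stream([gz[:5], gz[5:]]) == '<urlset>hello</urlset>'
```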