diff --git a/cms/djangoapps/contentstore/tasks.py b/cms/djangoapps/contentstore/tasks.py
index 3b5b1fb9195f..e3dbfe06151f 100644
--- a/cms/djangoapps/contentstore/tasks.py
+++ b/cms/djangoapps/contentstore/tasks.py
@@ -12,6 +12,7 @@
 from datetime import datetime, timezone
 from importlib.metadata import entry_points
 from tempfile import NamedTemporaryFile, mkdtemp
+from urllib.parse import urlparse
 
 import aiohttp
 import olxcleaner
@@ -56,16 +57,16 @@
 from cms.djangoapps.contentstore.utils import (
     IMPORTABLE_FILE_TYPES,
     contains_previous_course_reference,
-    get_previous_run_course_key,
+    create_course_info_usage_key,
     create_or_update_xblock_upstream_link,
     delete_course,
+    get_previous_run_course_key,
     initialize_permissions,
     reverse_usage_url,
     translation_language
 )
 from cms.djangoapps.contentstore.xblock_storage_handlers.view_handlers import get_block_info
 from cms.djangoapps.models.settings.course_metadata import CourseMetadata
-from cms.djangoapps.contentstore.utils import create_course_info_usage_key
 from common.djangoapps.course_action_state.models import CourseRerunState
 from common.djangoapps.static_replace import replace_static_urls
 from common.djangoapps.student.auth import has_course_author_access
@@ -116,6 +117,18 @@
     "Connection": "keep-alive",
 }
 
+# DOI-specific headers
+DOI_HEADERS = {
+    "User-Agent": DEFAULT_HEADERS["User-Agent"],
+    "Accept": "application/vnd.citationstyles.csl+json",
+    "Connection": "keep-alive",
+}
+
+# Domain-specific header mapping
+DOMAIN_HEADERS = {
+    "doi.org": DOI_HEADERS,
+}
+
 
 class LinkState:
     """
@@ -1434,7 +1447,7 @@ async def _validate_urls_access_in_batches(url_list, course_key, batch_size=100)
 
 async def _validate_batch(batch, course_key):
     """Validate a batch of URLs"""
-    async with aiohttp.ClientSession(headers=DEFAULT_HEADERS) as session:
+    async with aiohttp.ClientSession() as session:
         tasks = [_validate_url_access(session, url_data, course_key) for url_data in batch]
         batch_results = await asyncio.gather(*tasks)
         return batch_results
@@ -1462,8 +1475,17 @@ async def _validate_url_access(session, url_data, course_key):
     url = url.strip()  # Trim leading/trailing whitespace
     result = {'block_id': block_id, 'url': url}
     standardized_url = _convert_to_standard_url(url, course_key)
+
+    try:
+        parsed = urlparse(url)
+        domain = parsed.hostname  # lower-cased host without port/userinfo; None when the URL has no netloc
+        headers = DOMAIN_HEADERS.get(domain, DEFAULT_HEADERS)
+    except Exception as e:  # lint-amnesty, pylint: disable=broad-except
+        LOGGER.debug(f'[Link Check] Error parsing URL {url}: {str(e)}')
+        headers = DEFAULT_HEADERS
+
     try:
-        async with session.get(standardized_url, timeout=5) as response:
+        async with session.get(standardized_url, headers=headers, timeout=5) as response:
             result.update({'status': response.status})
     except Exception as e:  # lint-amnesty, pylint: disable=broad-except
         result.update({'status': None})