Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 26 additions & 4 deletions cms/djangoapps/contentstore/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from datetime import datetime, timezone
from importlib.metadata import entry_points
from tempfile import NamedTemporaryFile, mkdtemp
from urllib.parse import urlparse

import aiohttp
import olxcleaner
Expand Down Expand Up @@ -56,16 +57,16 @@
from cms.djangoapps.contentstore.utils import (
IMPORTABLE_FILE_TYPES,
contains_previous_course_reference,
get_previous_run_course_key,
create_course_info_usage_key,
create_or_update_xblock_upstream_link,
delete_course,
get_previous_run_course_key,
initialize_permissions,
reverse_usage_url,
translation_language
)
from cms.djangoapps.contentstore.xblock_storage_handlers.view_handlers import get_block_info
from cms.djangoapps.models.settings.course_metadata import CourseMetadata
from cms.djangoapps.contentstore.utils import create_course_info_usage_key
from common.djangoapps.course_action_state.models import CourseRerunState
from common.djangoapps.static_replace import replace_static_urls
from common.djangoapps.student.auth import has_course_author_access
Expand Down Expand Up @@ -116,6 +117,18 @@
"Connection": "keep-alive",
}

# DOI-specific headers.
# The Accept header asks for CSL JSON citation metadata
# ("application/vnd.citationstyles.csl+json") instead of the default HTML
# response — presumably to get a stable machine-readable answer from
# doi.org's content negotiation when link-checking; verify against the
# doi.org / Crossref content-negotiation docs.
DOI_HEADERS = {
"User-Agent": DEFAULT_HEADERS["User-Agent"],
"Accept": "application/vnd.citationstyles.csl+json",
"Connection": "keep-alive",
}

# Domain-specific header mapping.
# Keys are lowercased hostnames (URL netloc); values are the header dict to
# send when requesting URLs on that domain. Any domain not listed here falls
# back to DEFAULT_HEADERS at the call site.
DOMAIN_HEADERS = {
"doi.org": DOI_HEADERS,
}


class LinkState:
"""
Expand Down Expand Up @@ -1434,7 +1447,7 @@ async def _validate_urls_access_in_batches(url_list, course_key, batch_size=100)

async def _validate_batch(batch, course_key):
    """
    Validate a batch of URLs concurrently.

    Opens a single aiohttp session for the batch and fans out one
    ``_validate_url_access`` task per URL entry.

    Arguments:
        batch: iterable of url_data entries understood by ``_validate_url_access``.
        course_key: course identifier passed through to each validation task.

    Returns:
        list: one result dict per entry in ``batch``, in the same order.
    """
    # No session-level default headers here: per-request headers are chosen
    # in _validate_url_access (DOMAIN_HEADERS lookup with DEFAULT_HEADERS
    # fallback), so setting them on the session would be redundant.
    async with aiohttp.ClientSession() as session:
        tasks = [_validate_url_access(session, url_data, course_key) for url_data in batch]
        batch_results = await asyncio.gather(*tasks)
        return batch_results
Expand Down Expand Up @@ -1462,8 +1475,17 @@ async def _validate_url_access(session, url_data, course_key):
url = url.strip() # Trim leading/trailing whitespace
result = {'block_id': block_id, 'url': url}
standardized_url = _convert_to_standard_url(url, course_key)

try:
parsed = urlparse(url)
domain = parsed.netloc.lower()
headers = DOMAIN_HEADERS.get(domain, DEFAULT_HEADERS)
except Exception as e: # lint-amnesty, pylint: disable=broad-except
LOGGER.debug(f'[Link Check] Error parsing URL {url}: {str(e)}')
headers = DEFAULT_HEADERS

try:
async with session.get(standardized_url, timeout=5) as response:
async with session.get(standardized_url, headers=headers, timeout=5) as response:
result.update({'status': response.status})
except Exception as e: # lint-amnesty, pylint: disable=broad-except
result.update({'status': None})
Expand Down
Loading