Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
2992d0b
bot detection logic
tkotthakota-adobe Dec 23, 2025
0212b84
test fix
tkotthakota-adobe Dec 23, 2025
b4a71a3
fix
tkotthakota-adobe Dec 23, 2025
e05927b
fix tests
tkotthakota-adobe Dec 23, 2025
d1a3b28
fix intermittent test failures
tkotthakota-adobe Dec 23, 2025
d27a645
merge main
tkotthakota-adobe Jan 5, 2026
b1e15ab
tests
tkotthakota-adobe Jan 5, 2026
f5736e6
Merge branch 'main' of github.com:adobe/spacecat-task-processor into …
tkotthakota-adobe Jan 6, 2026
161a5f0
merge main
tkotthakota-adobe Jan 6, 2026
7e25c7e
update shared lib
tkotthakota-adobe Jan 7, 2026
7c11cd9
read bot protection from scrape metadata
tkotthakota-adobe Jan 7, 2026
c05acb5
test
tkotthakota-adobe Jan 7, 2026
842343d
test
tkotthakota-adobe Jan 7, 2026
6615a8d
improve tests
tkotthakota-adobe Jan 7, 2026
8e04b1a
Merge branch 'main' of github.com:adobe/spacecat-task-processor into …
tkotthakota-adobe Jan 7, 2026
6621caa
merge main + update lib
tkotthakota-adobe Jan 7, 2026
0a46462
read content scraper logs when bot protection detected
tkotthakota-adobe Jan 8, 2026
3a80e36
read bot protection flag from scrape results
tkotthakota-adobe Jan 9, 2026
f378b77
test coverage
tkotthakota-adobe Jan 9, 2026
d7f7680
empty scrape.json to check bot protection
tkotthakota-adobe Jan 9, 2026
b3f3681
merge main
tkotthakota-adobe Jan 9, 2026
c5123d6
updated lib
tkotthakota-adobe Jan 9, 2026
c4f2d82
simplify logic
tkotthakota-adobe Jan 10, 2026
8040961
simplify logic to just read logs for bot protection
tkotthakota-adobe Jan 10, 2026
7054885
merge main
tkotthakota-adobe Jan 10, 2026
4d02b7f
adjust logs
tkotthakota-adobe Jan 11, 2026
262dfb1
refactor to simplify logic
tkotthakota-adobe Jan 12, 2026
8f80a78
increase tests
tkotthakota-adobe Jan 12, 2026
441b26f
debug logs
tkotthakota-adobe Jan 13, 2026
59330a0
use jobId to check content scraper logs
tkotthakota-adobe Jan 13, 2026
5e20ebf
add scraper as dependency in opportunity map
tkotthakota-adobe Jan 13, 2026
9a26142
log fix
tkotthakota-adobe Jan 14, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5,661 changes: 3,356 additions & 2,305 deletions package-lock.json

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,8 @@
"@adobe/spacecat-shared-rum-api-client": "2.40.4",
"@adobe/spacecat-shared-scrape-client": "2.3.6",
"@adobe/spacecat-shared-slack-client": "1.5.32",
"@adobe/spacecat-shared-utils": "1.87.0",
"@adobe/spacecat-shared-utils": "https://gist.github.com/tkotthakota-adobe/655342ca8fe806db4e508761465becb2/raw/72a22c19f8a596650493990d8443238466f0d224/adobe-spacecat-shared-utils-1.87.0.tgz",
"@aws-sdk/client-s3": "3.966.0",
"@aws-sdk/client-cloudwatch-logs": "3.966.0",
"@aws-sdk/client-lambda": "3.966.0",
"@aws-sdk/client-sqs": "3.966.0",
Expand Down
4 changes: 2 additions & 2 deletions src/tasks/demo-url-processor/handler.js
Original file line number Diff line number Diff line change
Expand Up @@ -91,13 +91,13 @@ export async function runDemoUrlProcessor(message, context) {
}

const demoUrl = `${experienceUrl}?organizationId=${organizationId}#/@${imsTenantId}/sites-optimizer/sites/${siteId}/home`;
const slackMessage = `:white_check_mark: Onboarding setup completed successfully for the site ${siteUrl}!\nAccess your environment here: ${demoUrl}`;
const slackMessage = `:white_check_mark: Onboarding setup completed for the site ${siteUrl}!\nAccess your environment here: ${demoUrl}`;

if (slackContext) {
await say(env, log, slackContext, slackMessage);
}

log.info(`Onboarding setup completed successfully for the site ${siteUrl}! Access your environment here: ${demoUrl}`);
log.info(`Onboarding setup completed for the site ${siteUrl}! Access your environment here: ${demoUrl}`);

return ok({ message: 'Demo URL processor completed' });
}
Expand Down
164 changes: 65 additions & 99 deletions src/tasks/opportunity-status-processor/handler.js
Original file line number Diff line number Diff line change
Expand Up @@ -11,17 +11,20 @@
*/

import { ok } from '@adobe/spacecat-shared-http-utils';
import { CloudWatchLogsClient, FilterLogEventsCommand } from '@aws-sdk/client-cloudwatch-logs';
import RUMAPIClient from '@adobe/spacecat-shared-rum-api-client';
import GoogleClient from '@adobe/spacecat-shared-google-client';
import { ScrapeClient } from '@adobe/spacecat-shared-scrape-client';
import { resolveCanonicalUrl } from '@adobe/spacecat-shared-utils';
import {
checkAndAlertBotProtection,
checkAuditExecution,
getAuditFailureReason,
} from '../../utils/cloudwatch-utils.js';
import { say } from '../../utils/slack-utils.js';
import { getOpportunitiesForAudit } from './audit-opportunity-map.js';
import { OPPORTUNITY_DEPENDENCY_MAP } from './opportunity-dependency-map.js';

const TASK_TYPE = 'opportunity-status-processor';
const AUDIT_WORKER_LOG_GROUP = '/aws/lambda/spacecat-services--audit-worker';

/**
* Checks if RUM is available for a domain by attempting to get a domainkey
Expand Down Expand Up @@ -127,9 +130,10 @@ function getOpportunityTitle(opportunityType) {
*
* @param {string} baseUrl - The base URL to check
* @param {object} context - The context object with env and log
* @param {number} [onboardStartTime] - Optional onboard start timestamp to filter jobs
* @returns {Promise<{available: boolean, results: Array}>} Scraping availability and URL results
*/
async function isScrapingAvailable(baseUrl, context) {
async function isScrapingAvailable(baseUrl, context, onboardStartTime) {
const { log } = context;

try {
Expand All @@ -144,15 +148,27 @@ async function isScrapingAvailable(baseUrl, context) {
// Create scrape client
const scrapeClient = ScrapeClient.createFrom(context);

// Get all scrape jobs for this baseUrl with 'default' processing type
const jobs = await scrapeClient.getScrapeJobsByBaseURL(baseUrl, 'default');
// Get all scrape jobs for this baseUrl (all processing types)
const jobs = await scrapeClient.getScrapeJobsByBaseURL(baseUrl);

if (!jobs || jobs.length === 0) {
return { available: false, results: [] };
}

// Sort jobs by date (latest first) - assuming jobs have a timestamp field
const sortedJobs = jobs.sort((a, b) => {
// Filter jobs created after onboardStartTime
const filteredJobs = jobs.filter((job) => {
const jobTimestamp = new Date(job.startedAt || job.createdAt || 0).getTime();
return jobTimestamp >= onboardStartTime;
});
log.info(`Filtered ${filteredJobs.length} jobs created after onboardStartTime from ${jobs.length} total jobs`);

if (filteredJobs.length === 0) {
log.info(`No scrape jobs found for ${baseUrl} after onboardStartTime ${new Date(onboardStartTime).toISOString()}`);
return { available: false, results: [] };
}

// Sort jobs by date (latest first)
const sortedJobs = filteredJobs.sort((a, b) => {
const dateA = new Date(b.startedAt || b.createdAt || 0);
const dateB = new Date(a.startedAt || a.createdAt || 0);
return dateA - dateB;
Expand All @@ -177,7 +193,6 @@ async function isScrapingAvailable(baseUrl, context) {
log.info(`Scraping check: No jobs with URL results found for ${baseUrl}`);
return { available: false, results: [] };
}

// Count successful and failed scrapes
const completedCount = urlResults.filter((result) => result.status === 'COMPLETE').length;
const failedCount = urlResults.filter((result) => result.status === 'FAILED').length;
Expand All @@ -203,94 +218,11 @@ async function isScrapingAvailable(baseUrl, context) {
}

/**
 * Searches CloudWatch logs for audit execution
 * @param {string} auditType - The audit type to search for
 * @param {string} siteId - The site ID
 * @param {number} onboardStartTime - The onboarding start timestamp
 * @param {object} context - The context object
 * @returns {Promise<boolean>} Whether the audit was executed
 */
async function checkAuditExecution(auditType, siteId, onboardStartTime, context) {
  const { log, env } = context;
  const logGroupName = AUDIT_WORKER_LOG_GROUP;

  try {
    const cloudWatchClient = new CloudWatchLogsClient({ region: env.AWS_REGION || 'us-east-1' });
    const filterPattern = `"Received ${auditType} audit request for: ${siteId}"`;

    // Add small buffer before onboardStartTime to account for clock skew and processing delays
    // The audit log should be after onboardStartTime, but we add a small buffer for safety
    const bufferMs = 60 * 1000; // 1 minute
    const searchStartTime = onboardStartTime ? onboardStartTime - bufferMs : undefined;

    const command = new FilterLogEventsCommand({
      logGroupName,
      filterPattern,
      startTime: searchStartTime,
      endTime: Date.now(),
    });

    const response = await cloudWatchClient.send(command);
    // Coerce to a real boolean: the previous `events && events.length > 0`
    // expression yielded `undefined` (not `false`) when `events` was absent,
    // violating the documented Promise<boolean> contract.
    return (response.events?.length ?? 0) > 0;
  } catch (error) {
    log.error(`Error checking audit execution for ${auditType}:`, error);
    return false;
  }
}

/**
 * Searches CloudWatch logs for audit failure reason
 * @param {string} auditType - The audit type to search for
 * @param {string} siteId - The site ID
 * @param {number} onboardStartTime - The onboarding start timestamp
 * @param {object} context - The context object
 * @returns {Promise<string|null>} The failure reason or null if not found
 */
async function getAuditFailureReason(auditType, siteId, onboardStartTime, context) {
  const { log, env } = context;
  const logGroupName = AUDIT_WORKER_LOG_GROUP;

  try {
    const cloudWatchClient = new CloudWatchLogsClient({ region: env.AWS_REGION || 'us-east-1' });
    const filterPattern = `"${auditType} audit for ${siteId} failed"`;

    // Add small buffer before onboardStartTime to account for clock skew and processing delays
    const bufferMs = 30 * 1000; // 30 seconds
    const searchStartTime = onboardStartTime ? onboardStartTime - bufferMs : undefined;

    const command = new FilterLogEventsCommand({
      logGroupName,
      filterPattern,
      startTime: searchStartTime,
      endTime: Date.now(),
    });

    const response = await cloudWatchClient.send(command);

    if (response.events?.length) {
      // Extract reason from the (most recent matching) log message.
      const { message } = response.events[0];
      // Capture everything after "Reason:" up to a stack-trace " at " marker or end of message.
      const reasonMatch = message.match(/Reason:\s*([^]+?)(?:\s+at\s|$)/);
      if (reasonMatch && reasonMatch[1]) {
        return reasonMatch[1].trim();
      }
      // Fallback: return entire message if "Reason:" pattern not found
      return message.trim();
    }

    return null;
  /* c8 ignore start */
  // Defensive error handling: Difficult to test as requires CloudWatch API to throw errors.
  // Would need complex AWS SDK mocking infrastructure for marginal coverage gain.
  } catch (error) {
    log.error(`Error checking audit failure for ${auditType}:`, error);
    return null;
  }
  /* c8 ignore stop */
}

/**
* Analyzes missing opportunities and determines the root cause
* @param {Array<string>} missingOpportunities - Array of missing opportunity types
Expand Down Expand Up @@ -448,10 +380,10 @@ export async function runOpportunityStatusProcessor(message, context) {
auditTypes.forEach((auditType) => {
const opportunitiesForAudit = getOpportunitiesForAudit(auditType);
if (opportunitiesForAudit.length === 0) {
// This audit type doesn't map to any known opportunities
hasUnknownAuditTypes = true;
} else {
expectedOpportunityTypes = [...expectedOpportunityTypes, ...opportunitiesForAudit];
}
expectedOpportunityTypes = [...expectedOpportunityTypes, ...opportunitiesForAudit];
});
// Remove duplicates
expectedOpportunityTypes = [...new Set(expectedOpportunityTypes)];
Expand Down Expand Up @@ -485,9 +417,43 @@ export async function runOpportunityStatusProcessor(message, context) {
}

if (needsScraping) {
const scrapingCheck = await isScrapingAvailable(siteUrl, context);
/* c8 ignore start */
log.info(`[BOT-CHECK-TP] Scraping dependency detected, checking scraping availability for ${siteUrl}`);
log.info(`[BOT-CHECK-TP] onboardStartTime: ${new Date(onboardStartTime).toISOString()} (${onboardStartTime})`);
/* c8 ignore stop */

// First, get scraping availability and jobId
const scrapingCheck = await isScrapingAvailable(siteUrl, context, onboardStartTime);
scrapingAvailable = scrapingCheck.available;

// Always check for bot protection, use jobId for precision if available
// If no jobId, fallback to searching all [BOT-BLOCKED] events in time window
/* c8 ignore start */
if (scrapingCheck.jobId) {
log.info(`[BOT-CHECK-TP] Found scrape job ${scrapingCheck.jobId}, checking for bot protection`);
} else {
log.info('[BOT-CHECK-TP] No scrape job found yet, checking for bot protection using time-based filter');
}
/* c8 ignore stop */

const botProtectionStats = await checkAndAlertBotProtection({
jobId: scrapingCheck.jobId || null,
siteUrl,
searchStartTime: onboardStartTime,
slackContext,
context,
});

// Abort processing if bot protection detected
if (botProtectionStats && botProtectionStats.totalCount > 0) {
log.warn(`[BOT-BLOCKED] Bot protection blocking scrapes for ${siteUrl}`);
return ok({
message: `Bot protection detected for ${siteUrl}`,
botProtectionDetected: true,
blockedUrlCount: botProtectionStats.totalCount,
});
}

// Send Slack notification with scraping statistics if available
if (scrapingCheck.stats && slackContext) {
const { completed, failed, total } = scrapingCheck.stats;
Expand Down Expand Up @@ -666,7 +632,7 @@ export async function runOpportunityStatusProcessor(message, context) {
if (failedOpportunities.length > 0) {
for (const failed of failedOpportunities) {
// Use info icon for successful audits with zero suggestions
const emoji = failed.reason.includes('opportunity found with zero suggestions') ? ' :information_source:' : ' :x:';
const emoji = failed.reason.includes('found no suggestions') ? ' :information_source:' : ' :x:';
auditErrors.push(`*${failed.title}*: ${failed.reason}${emoji}`);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,11 @@ export const OPPORTUNITY_DEPENDENCY_MAP = {
cwv: ['RUM'],
'high-organic-low-ctr': ['RUM'],
'broken-internal-links': ['RUM', 'AHREFSImport'],
'meta-tags': ['AHREFSImport'],
'broken-backlinks': ['AHREFSImport'],
'alt-text': ['AHREFSImport'],
'form-accessibility': ['RUM'],
'forms-opportunities': ['RUM'],
'meta-tags': ['AHREFSImport', 'scraping'], // meta-tags audit uses scraping
'broken-backlinks': ['AHREFSImport', 'scraping'], // broken-backlinks audit uses scraping
'alt-text': ['AHREFSImport', 'scraping'], // alt-text audit uses scraping
'form-accessibility': ['RUM', 'scraping'], // forms audit uses scraping
'forms-opportunities': ['RUM', 'scraping'], // forms audit uses scraping
};

/**
Expand Down
Loading