Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
2992d0b
bot detection logic
tkotthakota-adobe Dec 23, 2025
0212b84
test fix
tkotthakota-adobe Dec 23, 2025
b4a71a3
fix
tkotthakota-adobe Dec 23, 2025
e05927b
fix tests
tkotthakota-adobe Dec 23, 2025
d1a3b28
fix intermittent test failures
tkotthakota-adobe Dec 23, 2025
d27a645
merge main
tkotthakota-adobe Jan 5, 2026
b1e15ab
tests
tkotthakota-adobe Jan 5, 2026
f5736e6
Merge branch 'main' of github.com:adobe/spacecat-task-processor into …
tkotthakota-adobe Jan 6, 2026
161a5f0
merge main
tkotthakota-adobe Jan 6, 2026
7e25c7e
update shared lib
tkotthakota-adobe Jan 7, 2026
7c11cd9
read bot protection from scrape metadata
tkotthakota-adobe Jan 7, 2026
c05acb5
test
tkotthakota-adobe Jan 7, 2026
842343d
test
tkotthakota-adobe Jan 7, 2026
6615a8d
improve tests
tkotthakota-adobe Jan 7, 2026
8e04b1a
Merge branch 'main' of github.com:adobe/spacecat-task-processor into …
tkotthakota-adobe Jan 7, 2026
6621caa
merge main + update lib
tkotthakota-adobe Jan 7, 2026
0a46462
read content scraper logs when bot protection detected
tkotthakota-adobe Jan 8, 2026
3a80e36
read bot protection flag from scrape results
tkotthakota-adobe Jan 9, 2026
f378b77
test coverage
tkotthakota-adobe Jan 9, 2026
d7f7680
empty scrape.json to check bot protection
tkotthakota-adobe Jan 9, 2026
b3f3681
merge main
tkotthakota-adobe Jan 9, 2026
c5123d6
updated lib
tkotthakota-adobe Jan 9, 2026
c4f2d82
simplify logic
tkotthakota-adobe Jan 10, 2026
8040961
simplify logic to just read logs for bot protection
tkotthakota-adobe Jan 10, 2026
7054885
merge main
tkotthakota-adobe Jan 10, 2026
4d02b7f
adjust logs
tkotthakota-adobe Jan 11, 2026
262dfb1
refactor to simplify logic
tkotthakota-adobe Jan 12, 2026
8f80a78
increase tests
tkotthakota-adobe Jan 12, 2026
441b26f
debug logs
tkotthakota-adobe Jan 13, 2026
59330a0
use jobId to check content scraper logs
tkotthakota-adobe Jan 13, 2026
5e20ebf
add scraper as dependency in opportunity map
tkotthakota-adobe Jan 13, 2026
9a26142
log fix
tkotthakota-adobe Jan 14, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5,661 changes: 3,356 additions & 2,305 deletions package-lock.json

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,8 @@
"@adobe/spacecat-shared-rum-api-client": "2.40.4",
"@adobe/spacecat-shared-scrape-client": "2.3.6",
"@adobe/spacecat-shared-slack-client": "1.5.32",
"@adobe/spacecat-shared-utils": "1.87.0",
"@adobe/spacecat-shared-utils": "https://gist.github.com/tkotthakota-adobe/655342ca8fe806db4e508761465becb2/raw/72a22c19f8a596650493990d8443238466f0d224/adobe-spacecat-shared-utils-1.87.0.tgz",
"@aws-sdk/client-s3": "3.966.0",
"@aws-sdk/client-cloudwatch-logs": "3.966.0",
"@aws-sdk/client-lambda": "3.966.0",
"@aws-sdk/client-sqs": "3.966.0",
Expand Down
4 changes: 2 additions & 2 deletions src/tasks/demo-url-processor/handler.js
Original file line number Diff line number Diff line change
Expand Up @@ -91,13 +91,13 @@ export async function runDemoUrlProcessor(message, context) {
}

const demoUrl = `${experienceUrl}?organizationId=${organizationId}#/@${imsTenantId}/sites-optimizer/sites/${siteId}/home`;
const slackMessage = `:white_check_mark: Onboarding setup completed successfully for the site ${siteUrl}!\nAccess your environment here: ${demoUrl}`;
const slackMessage = `:white_check_mark: Onboarding setup completed for the site ${siteUrl}!\nAccess your environment here: ${demoUrl}`;

if (slackContext) {
await say(env, log, slackContext, slackMessage);
}

log.info(`Onboarding setup completed successfully for the site ${siteUrl}! Access your environment here: ${demoUrl}`);
log.info(`Onboarding setup completed for the site ${siteUrl}! Access your environment here: ${demoUrl}`);

return ok({ message: 'Demo URL processor completed' });
}
Expand Down
164 changes: 65 additions & 99 deletions src/tasks/opportunity-status-processor/handler.js
Original file line number Diff line number Diff line change
Expand Up @@ -11,17 +11,20 @@
*/

import { ok } from '@adobe/spacecat-shared-http-utils';
import { CloudWatchLogsClient, FilterLogEventsCommand } from '@aws-sdk/client-cloudwatch-logs';
import RUMAPIClient from '@adobe/spacecat-shared-rum-api-client';
import GoogleClient from '@adobe/spacecat-shared-google-client';
import { ScrapeClient } from '@adobe/spacecat-shared-scrape-client';
import { resolveCanonicalUrl } from '@adobe/spacecat-shared-utils';
import {
checkAndAlertBotProtection,
checkAuditExecution,
getAuditFailureReason,
} from '../../utils/cloudwatch-utils.js';
import { say } from '../../utils/slack-utils.js';
import { getOpportunitiesForAudit } from './audit-opportunity-map.js';
import { OPPORTUNITY_DEPENDENCY_MAP } from './opportunity-dependency-map.js';

const TASK_TYPE = 'opportunity-status-processor';
const AUDIT_WORKER_LOG_GROUP = '/aws/lambda/spacecat-services--audit-worker';

/**
* Checks if RUM is available for a domain by attempting to get a domainkey
Expand Down Expand Up @@ -127,9 +130,10 @@ function getOpportunityTitle(opportunityType) {
*
* @param {string} baseUrl - The base URL to check
* @param {object} context - The context object with env and log
* @param {number} [onboardStartTime] - Optional onboard start timestamp to filter jobs
* @returns {Promise<{available: boolean, results: Array}>} Scraping availability and URL results
*/
async function isScrapingAvailable(baseUrl, context) {
async function isScrapingAvailable(baseUrl, context, onboardStartTime) {
const { log } = context;

try {
Expand All @@ -144,15 +148,27 @@ async function isScrapingAvailable(baseUrl, context) {
// Create scrape client
const scrapeClient = ScrapeClient.createFrom(context);

// Get all scrape jobs for this baseUrl with 'default' processing type
const jobs = await scrapeClient.getScrapeJobsByBaseURL(baseUrl, 'default');
// Get all scrape jobs for this baseUrl (all processing types)
const jobs = await scrapeClient.getScrapeJobsByBaseURL(baseUrl);

if (!jobs || jobs.length === 0) {
return { available: false, results: [] };
}

// Sort jobs by date (latest first) - assuming jobs have a timestamp field
const sortedJobs = jobs.sort((a, b) => {
// Filter jobs created after onboardStartTime
const filteredJobs = jobs.filter((job) => {
const jobTimestamp = new Date(job.startedAt || job.createdAt || 0).getTime();
return jobTimestamp >= onboardStartTime;
});
log.info(`Filtered ${filteredJobs.length} jobs created after onboardStartTime from ${jobs.length} total jobs`);

if (filteredJobs.length === 0) {
log.info(`No scrape jobs found for ${baseUrl} after onboardStartTime ${new Date(onboardStartTime).toISOString()}`);
return { available: false, results: [] };
}

// Sort jobs by date (latest first)
const sortedJobs = filteredJobs.sort((a, b) => {
const dateA = new Date(b.startedAt || b.createdAt || 0);
const dateB = new Date(a.startedAt || a.createdAt || 0);
return dateA - dateB;
Expand All @@ -177,7 +193,6 @@ async function isScrapingAvailable(baseUrl, context) {
log.info(`Scraping check: No jobs with URL results found for ${baseUrl}`);
return { available: false, results: [] };
}

// Count successful and failed scrapes
const completedCount = urlResults.filter((result) => result.status === 'COMPLETE').length;
const failedCount = urlResults.filter((result) => result.status === 'FAILED').length;
Expand All @@ -203,94 +218,11 @@ async function isScrapingAvailable(baseUrl, context) {
}

/**
 * Searches CloudWatch logs for audit execution
 * @param {string} auditType - The audit type to search for
 * @param {string} siteId - The site ID
 * @param {number} onboardStartTime - The onboarding start timestamp
 * @param {object} context - The context object
 * @returns {Promise<boolean>} Whether the audit was executed
 */
async function checkAuditExecution(auditType, siteId, onboardStartTime, context) {
  const { log, env } = context;
  const logGroupName = AUDIT_WORKER_LOG_GROUP;

  try {
    const cloudWatchClient = new CloudWatchLogsClient({ region: env.AWS_REGION || 'us-east-1' });
    const filterPattern = `"Received ${auditType} audit request for: ${siteId}"`;

    // Add small buffer before onboardStartTime to account for clock skew and processing delays
    // The audit log should be after onboardStartTime, but we add a small buffer for safety
    const bufferMs = 60 * 1000; // 1 minute
    const searchStartTime = onboardStartTime ? onboardStartTime - bufferMs : undefined;

    const command = new FilterLogEventsCommand({
      logGroupName,
      filterPattern,
      startTime: searchStartTime,
      endTime: Date.now(),
    });

    const response = await cloudWatchClient.send(command);
    // Coerce to a real boolean: the previous `events && events.length > 0`
    // expression yielded `undefined` (not `false`) when `events` was absent,
    // violating the documented Promise<boolean> contract.
    return (response.events?.length ?? 0) > 0;
  } catch (error) {
    log.error(`Error checking audit execution for ${auditType}:`, error);
    return false;
  }
}

/**
 * Searches CloudWatch logs for audit failure reason
 * @param {string} auditType - The audit type to search for
 * @param {string} siteId - The site ID
 * @param {number} onboardStartTime - The onboarding start timestamp
 * @param {object} context - The context object
 * @returns {Promise<string|null>} The failure reason or null if not found
 */
async function getAuditFailureReason(auditType, siteId, onboardStartTime, context) {
  const { log, env } = context;
  const logGroupName = AUDIT_WORKER_LOG_GROUP;

  try {
    const cloudWatchClient = new CloudWatchLogsClient({ region: env.AWS_REGION || 'us-east-1' });
    const filterPattern = `"${auditType} audit for ${siteId} failed"`;

    // Add small buffer before onboardStartTime to account for clock skew and processing delays
    const bufferMs = 30 * 1000; // 30 seconds
    const searchStartTime = onboardStartTime ? onboardStartTime - bufferMs : undefined;

    const command = new FilterLogEventsCommand({
      logGroupName,
      filterPattern,
      startTime: searchStartTime,
      endTime: Date.now(),
    });

    const response = await cloudWatchClient.send(command);

    if (response.events?.length) {
      // Extract reason from the (most recent matching) log message.
      const { message } = response.events[0];
      // Capture everything after "Reason:" up to a stack-trace " at " marker or end of message.
      const reasonMatch = message.match(/Reason:\s*([^]+?)(?:\s+at\s|$)/);
      if (reasonMatch && reasonMatch[1]) {
        return reasonMatch[1].trim();
      }
      // Fallback: return entire message if "Reason:" pattern not found
      return message.trim();
    }

    return null;
  /* c8 ignore start */
  // Defensive error handling: Difficult to test as requires CloudWatch API to throw errors.
  // Would need complex AWS SDK mocking infrastructure for marginal coverage gain.
  } catch (error) {
    log.error(`Error checking audit failure for ${auditType}:`, error);
    return null;
  }
  /* c8 ignore stop */
}

/**
* Analyzes missing opportunities and determines the root cause
* @param {Array<string>} missingOpportunities - Array of missing opportunity types
Expand Down Expand Up @@ -448,10 +380,10 @@ export async function runOpportunityStatusProcessor(message, context) {
auditTypes.forEach((auditType) => {
const opportunitiesForAudit = getOpportunitiesForAudit(auditType);
if (opportunitiesForAudit.length === 0) {
// This audit type doesn't map to any known opportunities
hasUnknownAuditTypes = true;
} else {
expectedOpportunityTypes = [...expectedOpportunityTypes, ...opportunitiesForAudit];
}
expectedOpportunityTypes = [...expectedOpportunityTypes, ...opportunitiesForAudit];
});
// Remove duplicates
expectedOpportunityTypes = [...new Set(expectedOpportunityTypes)];
Expand Down Expand Up @@ -485,9 +417,43 @@ export async function runOpportunityStatusProcessor(message, context) {
}

if (needsScraping) {
const scrapingCheck = await isScrapingAvailable(siteUrl, context);
/* c8 ignore start */
log.info(`[BOT-CHECK-TP] Scraping dependency detected, checking scraping availability for ${siteUrl}`);
log.info(`[BOT-CHECK-TP] onboardStartTime: ${new Date(onboardStartTime).toISOString()} (${onboardStartTime})`);
/* c8 ignore stop */

// First, get scraping availability and jobId
const scrapingCheck = await isScrapingAvailable(siteUrl, context, onboardStartTime);
scrapingAvailable = scrapingCheck.available;

// Always check for bot protection, use jobId for precision if available
// If no jobId, fallback to searching all [BOT-BLOCKED] events in time window
/* c8 ignore start */
if (scrapingCheck.jobId) {
log.info(`[BOT-CHECK-TP] Found scrape job ${scrapingCheck.jobId}, checking for bot protection`);
} else {
log.info('[BOT-CHECK-TP] No scrape job found yet, checking for bot protection using time-based filter');
}
/* c8 ignore stop */

const botProtectionStats = await checkAndAlertBotProtection({
jobId: scrapingCheck.jobId || null,
siteUrl,
searchStartTime: onboardStartTime,
slackContext,
context,
});

// Abort processing if bot protection detected
if (botProtectionStats && botProtectionStats.totalCount > 0) {
log.warn(`[BOT-BLOCKED] Bot protection blocking scrapes for ${siteUrl}`);
return ok({
message: `Bot protection detected for ${siteUrl}`,
botProtectionDetected: true,
blockedUrlCount: botProtectionStats.totalCount,
});
}

// Send Slack notification with scraping statistics if available
if (scrapingCheck.stats && slackContext) {
const { completed, failed, total } = scrapingCheck.stats;
Expand Down Expand Up @@ -666,7 +632,7 @@ export async function runOpportunityStatusProcessor(message, context) {
if (failedOpportunities.length > 0) {
for (const failed of failedOpportunities) {
// Use info icon for successful audits with zero suggestions
const emoji = failed.reason.includes('opportunity found with zero suggestions') ? ' :information_source:' : ' :x:';
const emoji = failed.reason.includes('found no suggestions') ? ' :information_source:' : ' :x:';
auditErrors.push(`*${failed.title}*: ${failed.reason}${emoji}`);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,11 @@ export const OPPORTUNITY_DEPENDENCY_MAP = {
cwv: ['RUM'],
'high-organic-low-ctr': ['RUM'],
'broken-internal-links': ['RUM', 'AHREFSImport'],
'meta-tags': ['AHREFSImport'],
'broken-backlinks': ['AHREFSImport'],
'alt-text': ['AHREFSImport'],
'form-accessibility': ['RUM'],
'forms-opportunities': ['RUM'],
'meta-tags': ['AHREFSImport', 'scraping'], // meta-tags audit uses scraping
'broken-backlinks': ['AHREFSImport', 'scraping'], // broken-backlinks audit uses scraping
'alt-text': ['AHREFSImport', 'scraping'], // alt-text audit uses scraping
'form-accessibility': ['RUM', 'scraping'], // forms audit uses scraping
'forms-opportunities': ['RUM', 'scraping'], // forms audit uses scraping
};

/**
Expand Down
Loading