diff --git a/src/commands/run.js b/src/commands/run.js
index 0837c03f..e06714ad 100644
--- a/src/commands/run.js
+++ b/src/commands/run.js
@@ -16,7 +16,7 @@ const { replaceSecretsValue } = require('../lib/secrets');
 const {
     getLocalUserInfo, purgeDefaultQueue, purgeDefaultKeyValueStore, purgeDefaultDataset, getLocalConfigOrThrow, getNpmCmd, checkIfStorageIsEmpty,
-    detectLocalActorLanguage, isPythonVersionSupported, getPythonCommand, isNodeVersionSupported,
+    detectLocalActorLanguage, isPythonVersionSupported, getPythonCommand, isNodeVersionSupported, getLocalStorageDir,
 } = require('../lib/utils');
 
 class RunCommand extends ApifyCommand {
@@ -29,9 +29,12 @@ class RunCommand extends ApifyCommand {
         const packageJsonPath = path.join(cwd, 'package.json');
         const mainPyPath = path.join(cwd, 'src/__main__.py');
 
+        const projectType = ProjectAnalyzer.getProjectType(cwd);
+        const actualStoragePath = getLocalStorageDir();
+
         const packageJsonExists = fs.existsSync(packageJsonPath);
         const mainPyExists = fs.existsSync(mainPyPath);
-        const isScrapyProject = ProjectAnalyzer.getProjectType(cwd) === PROJECT_TYPES.SCRAPY;
+        const isScrapyProject = projectType === PROJECT_TYPES.SCRAPY;
 
         if (!packageJsonExists && !mainPyExists && !isScrapyProject) {
             throw new Error(
@@ -40,25 +43,43 @@ class RunCommand extends ApifyCommand {
             );
         }
 
-        if (fs.existsSync(LEGACY_LOCAL_STORAGE_DIR) && !fs.existsSync(DEFAULT_LOCAL_STORAGE_DIR)) {
-            fs.renameSync(LEGACY_LOCAL_STORAGE_DIR, DEFAULT_LOCAL_STORAGE_DIR);
-            warning("The legacy 'apify_storage' directory was renamed to 'storage' to align it with Apify SDK v3."
+        if (fs.existsSync(LEGACY_LOCAL_STORAGE_DIR) && !fs.existsSync(actualStoragePath)) {
+            fs.renameSync(LEGACY_LOCAL_STORAGE_DIR, actualStoragePath);
+            warning(`The legacy 'apify_storage' directory was renamed to '${actualStoragePath}' to align it with Apify SDK v3.`
                 + ' Contents were left intact.');
         }
 
+        let CRAWLEE_PURGE_ON_START = '0';
+
         // Purge stores
         if (flags.purge) {
-            await Promise.all([purgeDefaultQueue(), purgeDefaultKeyValueStore(), purgeDefaultDataset()]);
-            info('All default local stores were purged.');
+            switch (projectType) {
+                case PROJECT_TYPES.CRAWLEE: {
+                    CRAWLEE_PURGE_ON_START = '1';
+                    break;
+                }
+                case PROJECT_TYPES.PRE_CRAWLEE_APIFY_SDK: {
+                    await Promise.all([purgeDefaultQueue(), purgeDefaultKeyValueStore(), purgeDefaultDataset()]);
+                    info('All default local stores were purged.');
+                    break;
+                }
+                default: {
+                    // TODO: Python SDK too
+                }
+            }
         }
+
+        // TODO: deprecate these flags
         if (flags.purgeQueue) {
             await purgeDefaultQueue();
             info('Default local request queue was purged.');
         }
+
         if (flags.purgeDataset) {
             await purgeDefaultDataset();
             info('Default local dataset was purged.');
         }
+
         if (flags.purgeKeyValueStore) {
             await purgeDefaultKeyValueStore();
             info('Default local key-value store was purged.');
@@ -74,7 +95,9 @@ class RunCommand extends ApifyCommand {
 
         // Attach env vars from local config files
         const localEnvVars = {
-            [APIFY_ENV_VARS.LOCAL_STORAGE_DIR]: DEFAULT_LOCAL_STORAGE_DIR,
+            [APIFY_ENV_VARS.LOCAL_STORAGE_DIR]: actualStoragePath,
+            CRAWLEE_STORAGE_DIR: actualStoragePath,
+            CRAWLEE_PURGE_ON_START,
         };
         if (proxy && proxy.password) localEnvVars[APIFY_ENV_VARS.PROXY_PASSWORD] = proxy.password;
         if (userId) localEnvVars[APIFY_ENV_VARS.USER_ID] = userId;
diff --git a/src/lib/consts.js b/src/lib/consts.js
index 9b8b8ea4..4facf9d7 100644
--- a/src/lib/consts.js
+++ b/src/lib/consts.js
@@ -25,6 +25,8 @@ exports.LANGUAGE = {
 
 exports.PROJECT_TYPES = {
     SCRAPY: 'scrapy',
+    CRAWLEE: 'crawlee',
+    PRE_CRAWLEE_APIFY_SDK: 'apify',
     UNKNOWN: 'unknown',
 };
diff --git a/src/lib/project_analyzer.js b/src/lib/project_analyzer.js
index 0825bf98..828f77b7 100644
--- a/src/lib/project_analyzer.js
+++ b/src/lib/project_analyzer.js
@@ -1,4 +1,6 @@
 const { PROJECT_TYPES } = require('./consts');
+const { CrawleeAnalyzer } = require('./projects/CrawleeAnalyzer');
+const { OldApifySDKAnalyzer } = require('./projects/OldApifySDKAnalyzer');
 const { ScrapyProjectAnalyzer } = require('./scrapy-wrapper/ScrapyProjectAnalyzer');
 
 const analyzers = [
@@ -6,6 +8,14 @@ const analyzers = [
     {
         type: PROJECT_TYPES.SCRAPY,
         analyzer: ScrapyProjectAnalyzer,
     },
+    {
+        type: PROJECT_TYPES.CRAWLEE,
+        analyzer: CrawleeAnalyzer,
+    },
+    {
+        type: PROJECT_TYPES.PRE_CRAWLEE_APIFY_SDK,
+        analyzer: OldApifySDKAnalyzer,
+    },
 ];
 
 class ProjectAnalyzer {
@@ -17,6 +27,7 @@ class ProjectAnalyzer {
             return a.analyzer.isApplicable(pathname);
         });
+
         return analyzer?.type || PROJECT_TYPES.UNKNOWN;
     }
 }
diff --git a/src/lib/projects/CrawleeAnalyzer.js b/src/lib/projects/CrawleeAnalyzer.js
new file mode 100644
index 00000000..5f6e3ad9
--- /dev/null
+++ b/src/lib/projects/CrawleeAnalyzer.js
@@ -0,0 +1,37 @@
+const { existsSync, readFileSync } = require('fs');
+const { join } = require('path');
+
+const CRAWLEE_PACKAGES = [
+    'crawlee',
+    '@crawlee/core',
+    '@crawlee/puppeteer',
+    '@crawlee/playwright',
+    '@crawlee/cheerio',
+    '@crawlee/jsdom',
+    '@crawlee/linkedom',
+    '@crawlee/http',
+    '@crawlee/browser',
+    '@crawlee/basic',
+];
+
+class CrawleeAnalyzer {
+    static isApplicable(pathname) {
+        const hasPackageJson = existsSync(join(pathname, 'package.json'));
+
+        if (!hasPackageJson) {
+            return false;
+        }
+
+        const packageJson = readFileSync(join(pathname, 'package.json'), 'utf8');
+
+        try {
+            const packageJsonParsed = JSON.parse(packageJson);
+
+            return CRAWLEE_PACKAGES.some((pkg) => packageJsonParsed?.dependencies?.[pkg] !== undefined);
+        } catch (err) {
+            return false;
+        }
+    }
+}
+
+exports.CrawleeAnalyzer = CrawleeAnalyzer;
diff --git a/src/lib/projects/OldApifySDKAnalyzer.js b/src/lib/projects/OldApifySDKAnalyzer.js
new file mode 100644
index 00000000..26f9815c
--- /dev/null
+++ b/src/lib/projects/OldApifySDKAnalyzer.js
@@ -0,0 +1,61 @@
+const { existsSync, readFileSync } = require('fs');
+const { join } = require('path');
+
+const { lt } = require('semver');
+
+const VERSION_WHEN_APIFY_MOVED_TO_CRAWLEE = '3.0.0';
+const CRAWLEE_PACKAGES = [
+    'crawlee',
+    '@crawlee/core',
+    '@crawlee/puppeteer',
+    '@crawlee/playwright',
+    '@crawlee/cheerio',
+    '@crawlee/jsdom',
+    '@crawlee/linkedom',
+    '@crawlee/http',
+    '@crawlee/browser',
+    '@crawlee/basic',
+];
+
+class OldApifySDKAnalyzer {
+    static isApplicable(pathname) {
+        const hasPackageJson = existsSync(join(pathname, 'package.json'));
+
+        if (!hasPackageJson) {
+            return false;
+        }
+
+        const packageJson = readFileSync(join(pathname, 'package.json'), 'utf8');
+
+        try {
+            const packageJsonParsed = JSON.parse(packageJson);
+
+            // If they have crawlee as a dependency, likely to use crawlee
+            if (CRAWLEE_PACKAGES.some((pkg) => packageJsonParsed?.dependencies?.[pkg] !== undefined)) {
+                return false;
+            }
+
+            const apifyVersion = packageJsonParsed?.dependencies?.apify;
+            if (!apifyVersion) {
+                return false;
+            }
+
+            // We cannot infer
+            if (apifyVersion === '*') {
+                return false;
+            }
+
+            let actualVersion = apifyVersion;
+
+            if (apifyVersion.startsWith('~') || apifyVersion.startsWith('^')) {
+                actualVersion = apifyVersion.slice(1);
+            }
+
+            return lt(actualVersion, VERSION_WHEN_APIFY_MOVED_TO_CRAWLEE);
+        } catch (err) {
+            return false;
+        }
+    }
+}
+
+exports.OldApifySDKAnalyzer = OldApifySDKAnalyzer;
diff --git a/src/lib/utils.js b/src/lib/utils.js
index 0832f5bb..bd5a3ee3 100644
--- a/src/lib/utils.js
+++ b/src/lib/utils.js
@@ -80,7 +80,7 @@ const MIGRATED_APIFY_JSON_PROPERTIES = ['name', 'version', 'buildTag'];
 const getLocalStorageDir = () => {
     const envVar = APIFY_ENV_VARS.LOCAL_STORAGE_DIR;
-    return process.env[envVar] || DEFAULT_LOCAL_STORAGE_DIR;
+    return process.env[envVar] || process.env.CRAWLEE_STORAGE_DIR || DEFAULT_LOCAL_STORAGE_DIR;
 };
 
 const getLocalKeyValueStorePath = (storeId) => {
     const envVar = ACTOR_ENV_VARS.DEFAULT_KEY_VALUE_STORE_ID;
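
For context, a minimal usage sketch of the project-type detection this patch introduces (not part of the diff). The require paths and export names below are assumptions inferred from the identifiers visible above; treat it as an illustration rather than the CLI's actual wiring.

// Hypothetical usage sketch; module paths and export shapes are assumed.
const { ProjectAnalyzer } = require('./src/lib/project_analyzer');
const { PROJECT_TYPES } = require('./src/lib/consts');

// A package.json depending on 'crawlee' or any '@crawlee/*' package is classified
// as CRAWLEE; one depending on 'apify' pinned below 3.0.0 (with no Crawlee package)
// as PRE_CRAWLEE_APIFY_SDK; otherwise the result is SCRAPY or UNKNOWN.
const projectType = ProjectAnalyzer.getProjectType(process.cwd());

if (projectType === PROJECT_TYPES.CRAWLEE) {
    // Purging is delegated to the SDK via CRAWLEE_PURGE_ON_START=1 in run.js above.
    console.log('Detected a Crawlee project.');
} else if (projectType === PROJECT_TYPES.PRE_CRAWLEE_APIFY_SDK) {
    // For pre-Crawlee Apify SDK projects, the CLI purges the default stores itself.
    console.log('Detected a pre-Crawlee Apify SDK project.');
} else {
    console.log(`Project type: ${projectType}`);
}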