Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

chore: support new crawlee purge logic #475

Merged
merged 5 commits into from
Feb 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 31 additions & 8 deletions src/commands/run.js
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ const { replaceSecretsValue } = require('../lib/secrets');
const {
getLocalUserInfo, purgeDefaultQueue, purgeDefaultKeyValueStore,
purgeDefaultDataset, getLocalConfigOrThrow, getNpmCmd, checkIfStorageIsEmpty,
detectLocalActorLanguage, isPythonVersionSupported, getPythonCommand, isNodeVersionSupported,
detectLocalActorLanguage, isPythonVersionSupported, getPythonCommand, isNodeVersionSupported, getLocalStorageDir,
} = require('../lib/utils');

class RunCommand extends ApifyCommand {
Expand All @@ -29,9 +29,12 @@ class RunCommand extends ApifyCommand {
const packageJsonPath = path.join(cwd, 'package.json');
const mainPyPath = path.join(cwd, 'src/__main__.py');

const projectType = ProjectAnalyzer.getProjectType(cwd);
const actualStoragePath = getLocalStorageDir();

const packageJsonExists = fs.existsSync(packageJsonPath);
const mainPyExists = fs.existsSync(mainPyPath);
const isScrapyProject = ProjectAnalyzer.getProjectType(cwd) === PROJECT_TYPES.SCRAPY;
const isScrapyProject = projectType === PROJECT_TYPES.SCRAPY;

if (!packageJsonExists && !mainPyExists && !isScrapyProject) {
throw new Error(
janbuchar marked this conversation as resolved.
Show resolved Hide resolved
Expand All @@ -40,25 +43,43 @@ class RunCommand extends ApifyCommand {
);
}

if (fs.existsSync(LEGACY_LOCAL_STORAGE_DIR) && !fs.existsSync(DEFAULT_LOCAL_STORAGE_DIR)) {
fs.renameSync(LEGACY_LOCAL_STORAGE_DIR, DEFAULT_LOCAL_STORAGE_DIR);
warning("The legacy 'apify_storage' directory was renamed to 'storage' to align it with Apify SDK v3."
if (fs.existsSync(LEGACY_LOCAL_STORAGE_DIR) && !fs.existsSync(actualStoragePath)) {
fs.renameSync(LEGACY_LOCAL_STORAGE_DIR, actualStoragePath);
warning(`The legacy 'apify_storage' directory was renamed to '${actualStoragePath}' to align it with Apify SDK v3.`
+ ' Contents were left intact.');
}

let CRAWLEE_PURGE_ON_START = '0';

// Purge stores
if (flags.purge) {
await Promise.all([purgeDefaultQueue(), purgeDefaultKeyValueStore(), purgeDefaultDataset()]);
info('All default local stores were purged.');
switch (projectType) {
case PROJECT_TYPES.CRAWLEE: {
CRAWLEE_PURGE_ON_START = '1';
break;
}
case PROJECT_TYPES.PRE_CRAWLEE_APIFY_SDK: {
await Promise.all([purgeDefaultQueue(), purgeDefaultKeyValueStore(), purgeDefaultDataset()]);
info('All default local stores were purged.');
break;
}
default: {
// TODO: Python SDK too
}
}
}

// TODO: deprecate these flags
if (flags.purgeQueue) {
await purgeDefaultQueue();
info('Default local request queue was purged.');
}

if (flags.purgeDataset) {
await purgeDefaultDataset();
info('Default local dataset was purged.');
}

if (flags.purgeKeyValueStore) {
await purgeDefaultKeyValueStore();
info('Default local key-value store was purged.');
Expand All @@ -74,7 +95,9 @@ class RunCommand extends ApifyCommand {

// Attach env vars from local config files
const localEnvVars = {
[APIFY_ENV_VARS.LOCAL_STORAGE_DIR]: DEFAULT_LOCAL_STORAGE_DIR,
[APIFY_ENV_VARS.LOCAL_STORAGE_DIR]: actualStoragePath,
CRAWLEE_STORAGE_DIR: actualStoragePath,
CRAWLEE_PURGE_ON_START,
};
if (proxy && proxy.password) localEnvVars[APIFY_ENV_VARS.PROXY_PASSWORD] = proxy.password;
if (userId) localEnvVars[APIFY_ENV_VARS.USER_ID] = userId;
Expand Down
2 changes: 2 additions & 0 deletions src/lib/consts.js
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ exports.LANGUAGE = {

// Identifiers for the kinds of local project that the project analyzers can report.
exports.PROJECT_TYPES = {
// Scrapy project (reported via ScrapyProjectAnalyzer).
SCRAPY: 'scrapy',
// Project that lists `crawlee` or one of the `@crawlee/*` packages in its dependencies.
CRAWLEE: 'crawlee',
// Project that depends on an `apify` SDK version older than 3.0.0 and has no Crawlee packages.
PRE_CRAWLEE_APIFY_SDK: 'apify',
// Fallback used when no analyzer recognizes the project.
UNKNOWN: 'unknown',
};

Expand Down
11 changes: 11 additions & 0 deletions src/lib/project_analyzer.js
Original file line number Diff line number Diff line change
@@ -1,11 +1,21 @@
const { PROJECT_TYPES } = require('./consts');
const { CrawleeAnalyzer } = require('./projects/CrawleeAnalyzer');
const { OldApifySDKAnalyzer } = require('./projects/OldApifySDKAnalyzer');
const { ScrapyProjectAnalyzer } = require('./scrapy-wrapper/ScrapyProjectAnalyzer');

// Registry pairing each project type with the analyzer class that detects it.
// Every analyzer exposes a static `isApplicable(pathname)` predicate.
// NOTE(review): the lookup that consumes this list is only partly visible here; it presumably
// picks the first entry whose analyzer is applicable, in which case the order below matters — confirm.
const analyzers = [
{
type: PROJECT_TYPES.SCRAPY,
analyzer: ScrapyProjectAnalyzer,
},
{
type: PROJECT_TYPES.CRAWLEE,
analyzer: CrawleeAnalyzer,
},
{
type: PROJECT_TYPES.PRE_CRAWLEE_APIFY_SDK,
analyzer: OldApifySDKAnalyzer,
},
];

class ProjectAnalyzer {
Expand All @@ -17,6 +27,7 @@ class ProjectAnalyzer {

return a.analyzer.isApplicable(pathname);
});

return analyzer?.type || PROJECT_TYPES.UNKNOWN;
}
}
Expand Down
37 changes: 37 additions & 0 deletions src/lib/projects/CrawleeAnalyzer.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
const { existsSync, readFileSync } = require('fs');
const { join } = require('path');

// Packages whose presence in `dependencies` marks a project as Crawlee-based.
const CRAWLEE_PACKAGES = [
    'crawlee',
    '@crawlee/core',
    '@crawlee/puppeteer',
    '@crawlee/playwright',
    '@crawlee/cheerio',
    '@crawlee/jsdom',
    '@crawlee/linkedom',
    '@crawlee/http',
    '@crawlee/browser',
    '@crawlee/basic',
];

/**
 * Detects projects that use Crawlee by inspecting the `dependencies` of their `package.json`.
 */
class CrawleeAnalyzer {
    /**
     * @param {string} pathname Directory that is expected to contain a `package.json`.
     * @returns {boolean} `true` when `dependencies` lists at least one Crawlee package;
     *   `false` when `package.json` is missing, unreadable, not valid JSON, or lists none.
     */
    static isApplicable(pathname) {
        const packageJsonPath = join(pathname, 'package.json');

        if (!existsSync(packageJsonPath)) {
            return false;
        }

        try {
            // The read lives inside the `try` so that an unreadable package.json
            // (e.g. a directory or a permission error) is handled like an unparsable one
            // instead of crashing the caller.
            const packageJsonParsed = JSON.parse(readFileSync(packageJsonPath, 'utf8'));

            return CRAWLEE_PACKAGES.some((pkg) => packageJsonParsed?.dependencies?.[pkg] !== undefined);
        } catch (err) {
            return false;
        }
    }
}

exports.CrawleeAnalyzer = CrawleeAnalyzer;
61 changes: 61 additions & 0 deletions src/lib/projects/OldApifySDKAnalyzer.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
const { existsSync, readFileSync } = require('fs');
const { join } = require('path');

const { lt } = require('semver');

// First `apify` SDK major version that is built on top of Crawlee.
const VERSION_WHEN_APIFY_MOVED_TO_CRAWLEE = '3.0.0';
// Packages whose presence in `dependencies` marks a project as Crawlee-based.
const CRAWLEE_PACKAGES = [
    'crawlee',
    '@crawlee/core',
    '@crawlee/puppeteer',
    '@crawlee/playwright',
    '@crawlee/cheerio',
    '@crawlee/jsdom',
    '@crawlee/linkedom',
    '@crawlee/http',
    '@crawlee/browser',
    '@crawlee/basic',
];

/**
 * Detects projects that use a pre-Crawlee Apify SDK, i.e. an `apify` dependency
 * older than 3.0.0 with no Crawlee package listed next to it.
 */
class OldApifySDKAnalyzer {
    /**
     * @param {string} pathname Directory that is expected to contain a `package.json`.
     * @returns {boolean} `true` only when `dependencies.apify` resolves to a version below 3.0.0
     *   and no Crawlee package is listed; `false` in every other case, including a missing,
     *   unreadable or unparsable `package.json` and version specifiers that cannot be interpreted.
     */
    static isApplicable(pathname) {
        const packageJsonPath = join(pathname, 'package.json');

        if (!existsSync(packageJsonPath)) {
            return false;
        }

        try {
            // The read lives inside the `try` so that an unreadable package.json
            // is handled like an unparsable one instead of crashing the caller.
            const packageJsonParsed = JSON.parse(readFileSync(packageJsonPath, 'utf8'));

            // If they have crawlee as a dependency, likely to use crawlee
            if (CRAWLEE_PACKAGES.some((pkg) => packageJsonParsed?.dependencies?.[pkg] !== undefined)) {
                return false;
            }

            const apifyVersion = packageJsonParsed?.dependencies?.apify;
            if (!apifyVersion || typeof apifyVersion !== 'string') {
                return false;
            }

            // '*' basically means "use the latest version", which is the Crawlee-based SDK v3,
            // so such a project is not treated as an old SDK project.
            if (apifyVersion === '*') {
                return false;
            }

            let actualVersion = apifyVersion;

            if (apifyVersion.startsWith('~') || apifyVersion.startsWith('^')) {
                actualVersion = apifyVersion.slice(1);
            }

            // `lt` only accepts full `major.minor.patch` versions. Pad partial ones ("2", "2.3")
            // with zeros so that specifiers such as "^2" are compared by their lowest matching
            // version instead of throwing and being reported as "not an old SDK".
            if (/^\d+(\.\d+)?$/.test(actualVersion)) {
                const parts = actualVersion.split('.');
                while (parts.length < 3) {
                    parts.push('0');
                }
                actualVersion = parts.join('.');
            }

            return lt(actualVersion, VERSION_WHEN_APIFY_MOVED_TO_CRAWLEE);
        } catch (err) {
            // Anything that cannot be read, parsed or compared is not recognized as an old SDK project.
            return false;
        }
    }
}

exports.OldApifySDKAnalyzer = OldApifySDKAnalyzer;
2 changes: 1 addition & 1 deletion src/lib/utils.js
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ const MIGRATED_APIFY_JSON_PROPERTIES = ['name', 'version', 'buildTag'];
/**
 * Resolves the directory used for local storage.
 *
 * Precedence: the environment variable named by `APIFY_ENV_VARS.LOCAL_STORAGE_DIR`,
 * then `CRAWLEE_STORAGE_DIR`, then `DEFAULT_LOCAL_STORAGE_DIR`.
 *
 * @returns {string} The first non-empty candidate in the order above.
 */
const getLocalStorageDir = () => {
    const envVar = APIFY_ENV_VARS.LOCAL_STORAGE_DIR;

    // `||` rather than `??` on purpose: an env var set to an empty string falls through to the next candidate.
    return process.env[envVar] || process.env.CRAWLEE_STORAGE_DIR || DEFAULT_LOCAL_STORAGE_DIR;
};
const getLocalKeyValueStorePath = (storeId) => {
const envVar = ACTOR_ENV_VARS.DEFAULT_KEY_VALUE_STORE_ID;
Expand Down
Loading