Skip to content

Commit

Permalink
Merge pull request #2685 from cloudflare/dominik/EW-8677
Browse files Browse the repository at this point in the history
Implements Python import statement parser to avoid ast module.
  • Loading branch information
dom96 authored Sep 18, 2024
2 parents 0b073a7 + caf6c4c commit 31b17c8
Show file tree
Hide file tree
Showing 9 changed files with 349 additions and 77 deletions.
57 changes: 0 additions & 57 deletions src/pyodide/internal/process_script_imports.py

This file was deleted.

2 changes: 0 additions & 2 deletions src/pyodide/internal/process_script_imports.py.d.ts

This file was deleted.

36 changes: 20 additions & 16 deletions src/pyodide/internal/snapshot.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,13 @@ import {
getSitePackagesPath,
} from 'pyodide-internal:setupPackages';
import { default as TarReader } from 'pyodide-internal:packages_tar_reader';
import processScriptImports from 'pyodide-internal:process_script_imports.py';
import {
SHOULD_SNAPSHOT_TO_DISK,
IS_CREATING_BASELINE_SNAPSHOT,
MEMORY_SNAPSHOT_READER,
} from 'pyodide-internal:metadata';
import { reportError, simpleRunPython } from 'pyodide-internal:util';
import { default as MetadataReader } from 'pyodide-internal:runtime-generated/metadata';

let LOADED_BASELINE_SNAPSHOT: number;

Expand Down Expand Up @@ -261,24 +261,28 @@ function memorySnapshotDoImports(Module: Module): Array<string> {
return [];
}

// Process the Python modules in the user worker looking for imports of packages which are not
// vendored. Vendored packages are skipped because they may contain sensitive information which
// we do not want to include in the package snapshot.
//
// See process_script_imports.py.
const processScriptImportsString = new TextDecoder().decode(
new Uint8Array(processScriptImports)
// The `importedModules` list will contain all modules that have been imported, including local
// modules, the usual `js` and other stdlib modules. We want to filter out local imports, so we
// grab them and put them into a set for fast filtering.
const localModulePaths: Set<string> = new Set<string>(
MetadataReader.getNames()
);
simpleRunPython(Module, processScriptImportsString);
const importedModules: Array<string> = ArtifactBundler.constructor
// @ts-ignore parsePythonScriptImports is a static method.
.parsePythonScriptImports(MetadataReader.getWorkerFiles('py'))
.filter((module: string) => {
const moduleFilename = module.replace('.', '/') + '.py';
return !localModulePaths.has(moduleFilename) && module != 'js';
});

const importedModules: Array<string> = JSON.parse(
simpleRunPython(
Module,
'import sys, json; print(json.dumps(CF_LOADED_MODULES), file=sys.stderr)'
)
);
const deduplicatedModules = [...new Set(importedModules)];

// Import the modules list so they are included in the snapshot.
if (deduplicatedModules.length > 0) {
simpleRunPython(Module, 'import ' + deduplicatedModules.join(','));
}

return importedModules;
return deduplicatedModules;
}

function checkLoadedSoFiles(dsoJSON: DylinkInfo): void {
Expand Down
1 change: 1 addition & 0 deletions src/pyodide/types/FS.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ interface FS {
mode: number
): FSNode<Info>;
isFile: (mode: number) => boolean;
readdir: (path: string) => Array<string>;
genericErrors: Error[];
}

Expand Down
1 change: 1 addition & 0 deletions src/pyodide/types/runtime-generated/metadata.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ declare namespace MetadataReader {
const getMainModule: () => string;
const hasMemorySnapshot: () => boolean;
const getNames: () => string[];
const getWorkerFiles: (ext: string) => string[];
const getSizes: () => number[];
const readMemorySnapshot: (
offset: number,
Expand Down
8 changes: 8 additions & 0 deletions src/workerd/api/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,14 @@ wd_cc_library(
],
)

kj_test(
src = "pyodide/pyodide-test.c++",
deps = [
":pyodide",
"//src/workerd/tests:test-fixture",
],
)

wd_cc_library(
name = "data-url",
srcs = ["data-url.c++"],
Expand Down
118 changes: 118 additions & 0 deletions src/workerd/api/pyodide/pyodide-test.c++
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
// Copyright (c) 2017-2022 Cloudflare, Inc.
// Licensed under the Apache 2.0 license found in the LICENSE file or at:
// https://opensource.org/licenses/Apache-2.0

#include "pyodide.h"
#include <kj/test.h>

namespace workerd::api {
namespace {

KJ_TEST("basic `import` tests") {
auto files = kj::heapArrayBuilder<kj::String>(2);
files.add(kj::str("import a\nimport z"));
files.add(kj::str("import b"));
auto result = pyodide::ArtifactBundler::parsePythonScriptImports(files.finish());
KJ_REQUIRE(result.size() == 3);
KJ_REQUIRE(result[0] == "a");
KJ_REQUIRE(result[1] == "z");
KJ_REQUIRE(result[2] == "b");
}

KJ_TEST("supports whitespace") {
auto files = kj::heapArrayBuilder<kj::String>(1);
files.add(kj::str("import a\nimport \n\tz"));
auto result = pyodide::ArtifactBundler::parsePythonScriptImports(files.finish());
KJ_REQUIRE(result.size() == 2);
KJ_REQUIRE(result[0] == "a");
KJ_REQUIRE(result[1] == "z");
}

KJ_TEST("supports windows newlines") {
auto files = kj::heapArrayBuilder<kj::String>(1);
files.add(kj::str("import a\r\nimport \r\n\tz"));
auto result = pyodide::ArtifactBundler::parsePythonScriptImports(files.finish());
KJ_REQUIRE(result.size() == 2);
KJ_REQUIRE(result[0] == "a");
KJ_REQUIRE(result[1] == "z");
}

KJ_TEST("basic `from` test") {
auto files = kj::heapArrayBuilder<kj::String>(1);
files.add(kj::str("from x import a,b\nfrom z import y"));
auto result = pyodide::ArtifactBundler::parsePythonScriptImports(files.finish());
KJ_REQUIRE(result.size() == 2);
KJ_REQUIRE(result[0] == "x");
KJ_REQUIRE(result[1] == "z");
}

KJ_TEST("ignores indented blocks") {
auto files = kj::heapArrayBuilder<kj::String>(1);
files.add(kj::str("import a\nif True:\n import x\nimport y"));
auto result = pyodide::ArtifactBundler::parsePythonScriptImports(files.finish());
KJ_REQUIRE(result.size() == 2);
KJ_REQUIRE(result[0] == "a");
KJ_REQUIRE(result[1] == "y");
}

KJ_TEST("supports nested imports") {
auto files = kj::heapArrayBuilder<kj::String>(1);
files.add(kj::str("import a.b\nimport z.x.y.i"));
auto result = pyodide::ArtifactBundler::parsePythonScriptImports(files.finish());
KJ_REQUIRE(result.size() == 2);
KJ_REQUIRE(result[0] == "a.b");
KJ_REQUIRE(result[1] == "z.x.y.i");
}

KJ_TEST("nested `from` test") {
auto files = kj::heapArrayBuilder<kj::String>(1);
files.add(kj::str("from x.y.z import a,b\nfrom z import y"));
auto result = pyodide::ArtifactBundler::parsePythonScriptImports(files.finish());
KJ_REQUIRE(result.size() == 2);
KJ_REQUIRE(result[0] == "x.y.z");
KJ_REQUIRE(result[1] == "z");
}

KJ_TEST("ignores trailing period") {
auto files = kj::heapArrayBuilder<kj::String>(1);
files.add(kj::str("import a.b.\nimport z.x.y.i."));
auto result = pyodide::ArtifactBundler::parsePythonScriptImports(files.finish());
KJ_REQUIRE(result.size() == 0);
}

KJ_TEST("ignores relative import") {
// This is where we diverge from the old AST-based approach. It would have returned `y` in the
// input below.
auto files = kj::heapArrayBuilder<kj::String>(1);
files.add(kj::str("import .a.b\nimport ..z.x\nfrom .y import x"));
auto result = pyodide::ArtifactBundler::parsePythonScriptImports(files.finish());
KJ_REQUIRE(result.size() == 0);
}

KJ_TEST("supports commas") {
auto files = kj::heapArrayBuilder<kj::String>(1);
files.add(kj::str("import a,b"));
auto result = pyodide::ArtifactBundler::parsePythonScriptImports(files.finish());
KJ_REQUIRE(result.size() == 2);
KJ_REQUIRE(result[0] == "a");
KJ_REQUIRE(result[1] == "b");
}

KJ_TEST("supports backslash") {
auto files = kj::heapArrayBuilder<kj::String>(4);
files.add(kj::str("import a\\\n,b"));
files.add(kj::str("import\\\n q,w"));
files.add(kj::str("from \\\nx import y"));
files.add(kj::str("from \\\n c import y"));
auto result = pyodide::ArtifactBundler::parsePythonScriptImports(files.finish());
KJ_REQUIRE(result.size() == 6);
KJ_REQUIRE(result[0] == "a");
KJ_REQUIRE(result[1] == "b");
KJ_REQUIRE(result[2] == "q");
KJ_REQUIRE(result[3] == "w");
KJ_REQUIRE(result[4] == "x");
KJ_REQUIRE(result[5] == "c");
}

} // namespace
} // namespace workerd::api
Loading

0 comments on commit 31b17c8

Please sign in to comment.