From caf6c4c86a7efdc0c4fadd0e1a4a2252e10662b9 Mon Sep 17 00:00:00 2001 From: Dominik Picheta Date: Tue, 10 Sep 2024 18:01:07 +0000 Subject: [PATCH] Implements Python import statement parser to avoid ast module. --- .../internal/process_script_imports.py | 57 ------ .../internal/process_script_imports.py.d.ts | 2 - src/pyodide/internal/snapshot.ts | 36 ++-- src/pyodide/types/FS.d.ts | 1 + .../types/runtime-generated/metadata.d.ts | 1 + src/workerd/api/BUILD.bazel | 8 + src/workerd/api/pyodide/pyodide-test.c++ | 118 ++++++++++++ src/workerd/api/pyodide/pyodide.c++ | 173 ++++++++++++++++++ src/workerd/api/pyodide/pyodide.h | 30 ++- 9 files changed, 349 insertions(+), 77 deletions(-) delete mode 100644 src/pyodide/internal/process_script_imports.py delete mode 100644 src/pyodide/internal/process_script_imports.py.d.ts create mode 100644 src/workerd/api/pyodide/pyodide-test.c++ diff --git a/src/pyodide/internal/process_script_imports.py b/src/pyodide/internal/process_script_imports.py deleted file mode 100644 index b184ea97c89..00000000000 --- a/src/pyodide/internal/process_script_imports.py +++ /dev/null @@ -1,57 +0,0 @@ -""" -This script is used to prepare a worker prior to a _package_ memory snapshot -being taken. All it does is walk through the imports in each of the worker's -modules and attempts to import them. Local imports are not possible because -the worker file path is explicitly removed from the module search path. -""" - -CF_LOADED_MODULES = [] - - -def _do_it(): - import ast - import sys - from pathlib import Path - - def find_imports(source: str) -> list[str]: - try: - mod = ast.parse(source) - except SyntaxError: - return [] - imports = set() - for node in ast.walk(mod): - if isinstance(node, ast.Import): - for name in node.names: - imports.add(name.name) - elif isinstance(node, ast.ImportFrom): - module_name = node.module - if module_name is None: - continue - imports.add(module_name) - return list(sorted(imports)) - - def process_script(script): - for mod in find_imports(script): - try: - __import__(mod) - CF_LOADED_MODULES.append(mod) - except ImportError: - pass - - def process_scripts(): - # Currently this script assumes that it is generating a _package_ - # snapshot- one that only includes non-vendored packages. Because of - # this we do not wish to import local modules, the easiest way to ensure - # they cannot be imported is to remove `/session/metadata` from the sys - # path. - worker_files_path = "/session/metadata" - sys.path.remove(worker_files_path) - for script in Path(worker_files_path).glob("**/*.py"): - process_script(script.read_text()) - sys.path.append(worker_files_path) - - process_scripts() - - -_do_it() -del _do_it diff --git a/src/pyodide/internal/process_script_imports.py.d.ts b/src/pyodide/internal/process_script_imports.py.d.ts deleted file mode 100644 index b47d10dc1c3..00000000000 --- a/src/pyodide/internal/process_script_imports.py.d.ts +++ /dev/null @@ -1,2 +0,0 @@ -declare const buf: ArrayBuffer; -export default buf; diff --git a/src/pyodide/internal/snapshot.ts b/src/pyodide/internal/snapshot.ts index 68b5b12381b..84d126c253f 100644 --- a/src/pyodide/internal/snapshot.ts +++ b/src/pyodide/internal/snapshot.ts @@ -6,13 +6,13 @@ import { getSitePackagesPath, } from 'pyodide-internal:setupPackages'; import { default as TarReader } from 'pyodide-internal:packages_tar_reader'; -import processScriptImports from 'pyodide-internal:process_script_imports.py'; import { SHOULD_SNAPSHOT_TO_DISK, IS_CREATING_BASELINE_SNAPSHOT, MEMORY_SNAPSHOT_READER, } from 'pyodide-internal:metadata'; import { reportError, simpleRunPython } from 'pyodide-internal:util'; +import { default as MetadataReader } from 'pyodide-internal:runtime-generated/metadata'; let LOADED_BASELINE_SNAPSHOT: number; @@ -261,24 +261,28 @@ function memorySnapshotDoImports(Module: Module): Array { return []; } - // Process the Python modules in the user worker looking for imports of packages which are not - // vendored. Vendored packages are skipped because they may contain sensitive information which - // we do not want to include in the package snapshot. - // - // See process_script_imports.py. - const processScriptImportsString = new TextDecoder().decode( - new Uint8Array(processScriptImports) + // The `importedModules` list will contain all modules that have been imported, including local + // modules, the usual `js` and other stdlib modules. We want to filter out local imports, so we + // grab them and put them into a set for fast filtering. + const localModulePaths: Set = new Set( + MetadataReader.getNames() ); - simpleRunPython(Module, processScriptImportsString); + const importedModules: Array = ArtifactBundler.constructor + // @ts-ignore parsePythonScriptImports is a static method. + .parsePythonScriptImports(MetadataReader.getWorkerFiles('py')) + .filter((module: string) => { + const moduleFilename = module.replace('.', '/') + '.py'; + return !localModulePaths.has(moduleFilename) && module != 'js'; + }); - const importedModules: Array = JSON.parse( - simpleRunPython( - Module, - 'import sys, json; print(json.dumps(CF_LOADED_MODULES), file=sys.stderr)' - ) - ); + const deduplicatedModules = [...new Set(importedModules)]; + + // Import the modules list so they are included in the snapshot. + if (deduplicatedModules.length > 0) { + simpleRunPython(Module, 'import ' + deduplicatedModules.join(',')); + } - return importedModules; + return deduplicatedModules; } function checkLoadedSoFiles(dsoJSON: DylinkInfo): void { diff --git a/src/pyodide/types/FS.d.ts b/src/pyodide/types/FS.d.ts index 937b65fd51e..30e6b005b3b 100644 --- a/src/pyodide/types/FS.d.ts +++ b/src/pyodide/types/FS.d.ts @@ -29,6 +29,7 @@ interface FS { mode: number ): FSNode; isFile: (mode: number) => boolean; + readdir: (path: string) => Array; genericErrors: Error[]; } diff --git a/src/pyodide/types/runtime-generated/metadata.d.ts b/src/pyodide/types/runtime-generated/metadata.d.ts index 1e87c006a0c..d7bb592780e 100644 --- a/src/pyodide/types/runtime-generated/metadata.d.ts +++ b/src/pyodide/types/runtime-generated/metadata.d.ts @@ -7,6 +7,7 @@ declare namespace MetadataReader { const getMainModule: () => string; const hasMemorySnapshot: () => boolean; const getNames: () => string[]; + const getWorkerFiles: (ext: string) => string[]; const getSizes: () => number[]; const readMemorySnapshot: ( offset: number, diff --git a/src/workerd/api/BUILD.bazel b/src/workerd/api/BUILD.bazel index c2afde71046..3c03394d4d4 100644 --- a/src/workerd/api/BUILD.bazel +++ b/src/workerd/api/BUILD.bazel @@ -112,6 +112,14 @@ wd_cc_library( ], ) +kj_test( + src = "pyodide/pyodide-test.c++", + deps = [ + ":pyodide", + "//src/workerd/tests:test-fixture", + ], +) + wd_cc_library( name = "data-url", srcs = ["data-url.c++"], diff --git a/src/workerd/api/pyodide/pyodide-test.c++ b/src/workerd/api/pyodide/pyodide-test.c++ new file mode 100644 index 00000000000..9f5ee37612b --- /dev/null +++ b/src/workerd/api/pyodide/pyodide-test.c++ @@ -0,0 +1,118 @@ +// Copyright (c) 2017-2022 Cloudflare, Inc. +// Licensed under the Apache 2.0 license found in the LICENSE file or at: +// https://opensource.org/licenses/Apache-2.0 + +#include "pyodide.h" +#include + +namespace workerd::api { +namespace { + +KJ_TEST("basic `import` tests") { + auto files = kj::heapArrayBuilder(2); + files.add(kj::str("import a\nimport z")); + files.add(kj::str("import b")); + auto result = pyodide::ArtifactBundler::parsePythonScriptImports(files.finish()); + KJ_REQUIRE(result.size() == 3); + KJ_REQUIRE(result[0] == "a"); + KJ_REQUIRE(result[1] == "z"); + KJ_REQUIRE(result[2] == "b"); +} + +KJ_TEST("supports whitespace") { + auto files = kj::heapArrayBuilder(1); + files.add(kj::str("import a\nimport \n\tz")); + auto result = pyodide::ArtifactBundler::parsePythonScriptImports(files.finish()); + KJ_REQUIRE(result.size() == 2); + KJ_REQUIRE(result[0] == "a"); + KJ_REQUIRE(result[1] == "z"); +} + +KJ_TEST("supports windows newlines") { + auto files = kj::heapArrayBuilder(1); + files.add(kj::str("import a\r\nimport \r\n\tz")); + auto result = pyodide::ArtifactBundler::parsePythonScriptImports(files.finish()); + KJ_REQUIRE(result.size() == 2); + KJ_REQUIRE(result[0] == "a"); + KJ_REQUIRE(result[1] == "z"); +} + +KJ_TEST("basic `from` test") { + auto files = kj::heapArrayBuilder(1); + files.add(kj::str("from x import a,b\nfrom z import y")); + auto result = pyodide::ArtifactBundler::parsePythonScriptImports(files.finish()); + KJ_REQUIRE(result.size() == 2); + KJ_REQUIRE(result[0] == "x"); + KJ_REQUIRE(result[1] == "z"); +} + +KJ_TEST("ignores indented blocks") { + auto files = kj::heapArrayBuilder(1); + files.add(kj::str("import a\nif True:\n import x\nimport y")); + auto result = pyodide::ArtifactBundler::parsePythonScriptImports(files.finish()); + KJ_REQUIRE(result.size() == 2); + KJ_REQUIRE(result[0] == "a"); + KJ_REQUIRE(result[1] == "y"); +} + +KJ_TEST("supports nested imports") { + auto files = kj::heapArrayBuilder(1); + files.add(kj::str("import a.b\nimport z.x.y.i")); + auto result = pyodide::ArtifactBundler::parsePythonScriptImports(files.finish()); + KJ_REQUIRE(result.size() == 2); + KJ_REQUIRE(result[0] == "a.b"); + KJ_REQUIRE(result[1] == "z.x.y.i"); +} + +KJ_TEST("nested `from` test") { + auto files = kj::heapArrayBuilder(1); + files.add(kj::str("from x.y.z import a,b\nfrom z import y")); + auto result = pyodide::ArtifactBundler::parsePythonScriptImports(files.finish()); + KJ_REQUIRE(result.size() == 2); + KJ_REQUIRE(result[0] == "x.y.z"); + KJ_REQUIRE(result[1] == "z"); +} + +KJ_TEST("ignores trailing period") { + auto files = kj::heapArrayBuilder(1); + files.add(kj::str("import a.b.\nimport z.x.y.i.")); + auto result = pyodide::ArtifactBundler::parsePythonScriptImports(files.finish()); + KJ_REQUIRE(result.size() == 0); +} + +KJ_TEST("ignores relative import") { + // This is where we diverge from the old AST-based approach. It would have returned `y` in the + // input below. + auto files = kj::heapArrayBuilder(1); + files.add(kj::str("import .a.b\nimport ..z.x\nfrom .y import x")); + auto result = pyodide::ArtifactBundler::parsePythonScriptImports(files.finish()); + KJ_REQUIRE(result.size() == 0); +} + +KJ_TEST("supports commas") { + auto files = kj::heapArrayBuilder(1); + files.add(kj::str("import a,b")); + auto result = pyodide::ArtifactBundler::parsePythonScriptImports(files.finish()); + KJ_REQUIRE(result.size() == 2); + KJ_REQUIRE(result[0] == "a"); + KJ_REQUIRE(result[1] == "b"); +} + +KJ_TEST("supports backslash") { + auto files = kj::heapArrayBuilder(4); + files.add(kj::str("import a\\\n,b")); + files.add(kj::str("import\\\n q,w")); + files.add(kj::str("from \\\nx import y")); + files.add(kj::str("from \\\n c import y")); + auto result = pyodide::ArtifactBundler::parsePythonScriptImports(files.finish()); + KJ_REQUIRE(result.size() == 6); + KJ_REQUIRE(result[0] == "a"); + KJ_REQUIRE(result[1] == "b"); + KJ_REQUIRE(result[2] == "q"); + KJ_REQUIRE(result[3] == "w"); + KJ_REQUIRE(result[4] == "x"); + KJ_REQUIRE(result[5] == "c"); +} + +} // namespace +} // namespace workerd::api diff --git a/src/workerd/api/pyodide/pyodide.c++ b/src/workerd/api/pyodide/pyodide.c++ index 764ffaed223..83da90fbc75 100644 --- a/src/workerd/api/pyodide/pyodide.c++ +++ b/src/workerd/api/pyodide/pyodide.c++ @@ -1,3 +1,6 @@ +// Copyright (c) 2017-2022 Cloudflare, Inc. +// Licensed under the Apache 2.0 license found in the LICENSE file or at: +// https://opensource.org/licenses/Apache-2.0 #include "pyodide.h" #include #include @@ -62,6 +65,17 @@ kj::Array> PyodideMetadataReader::getNames(jsg::Lock& return builder.finish(); } +kj::Array> PyodideMetadataReader::getWorkerFiles( + jsg::Lock& js, kj::String ext) { + auto builder = kj::heapArrayBuilder>(this->names.size()); + for (auto i: kj::zeroTo(builder.capacity())) { + if (this->names[i].endsWith(ext)) { + builder.add(js, js.str(this->contents[i])); + } + } + return builder.finish(); +} + kj::Array> PyodideMetadataReader::getRequirements(jsg::Lock& js) { auto builder = kj::heapArrayBuilder>(this->requirements.size()); for (auto i: kj::zeroTo(builder.capacity())) { @@ -100,6 +114,165 @@ int ArtifactBundler::readMemorySnapshot(int offset, kj::Array buf) { return readToTarget(KJ_REQUIRE_NONNULL(existingSnapshot), offset, buf); } +kj::Array ArtifactBundler::parsePythonScriptImports(kj::Array files) { + auto result = kj::Vector(); + + for (auto& file: files) { + // Returns the number of characters skipped. When `oneOf` is not found, skips to the end of + // the string. + auto skipUntil = [](kj::StringPtr str, std::initializer_list oneOf, int start) -> int { + int result = 0; + while (start + result < str.size()) { + char c = str[start + result]; + for (char expected: oneOf) { + if (c == expected) { + return result; + } + } + + result++; + } + + return result; + }; + + // Skips while current character is in `oneOf`. Returns the number of characters skipped. + auto skipWhile = [](kj::StringPtr str, std::initializer_list oneOf, int start) -> int { + int result = 0; + while (start + result < str.size()) { + char c = str[start + result]; + bool found = false; + for (char expected: oneOf) { + if (c == expected) { + result++; + found = true; + break; + } + } + + if (!found) { + break; + } + } + + return result; + }; + + // Skips one of the characters (specified in `oneOf`) at the current position. Otherwise + // throws. Returns the number of characters skipped. + auto skipChar = [](kj::StringPtr str, std::initializer_list oneOf, int start) -> int { + for (char expected: oneOf) { + if (str[start] == expected) { + return 1; + } + } + + KJ_FAIL_REQUIRE("Expected ", oneOf, "but received", str[start]); + }; + + auto parseKeyword = [](kj::StringPtr str, kj::StringPtr ident, int start) -> bool { + int i = 0; + for (; i < ident.size() && start + i < str.size(); i++) { + if (str[start + i] != ident[i]) { + return false; + } + } + + return i == ident.size(); + }; + + // Returns the size of the import identifier or 0 if no identifier exists at `start`. + auto parseIdent = [](kj::StringPtr str, int start) -> int { + // https://docs.python.org/3/reference/lexical_analysis.html#identifiers + // + // We also accept `.` because import idents can contain it. + // TODO: We don't currently support unicode, but if we see packages that utilise it we will + // implement that support. + if (std::isdigit(str[start])) { + return 0; + } + int i = 0; + for (; start + i < str.size(); i++) { + char c = str[start + i]; + bool validIdentChar = std::isalpha(c) || std::isdigit(c) || c == '_' || c == '.'; + if (!validIdentChar) { + return i; + } + } + + return i; + }; + + int i = 0; + while (i < file.size()) { + auto keywordToParse = file[i] == 'i' ? "import"_kj : "from"_kj; + switch (file[i]) { + case 'i': + case 'f': + if (!parseKeyword(file, keywordToParse, i)) { + i += skipUntil(file, {'\n', '\r'}, i); + continue; + } + i += keywordToParse.size(); // skip "import" or "from" + + while (i < file.size()) { + // Python expects a `\` to be paired with a newline, but we don't have to be as strict + // here because we rely on the fact that the script has gone through validation already. + i += skipWhile( + file, {'\r', '\n', ' ', '\t', '\\'}, i); // skip whitespace and backslash. + + if (file[i] == '.') { + // ignore relative imports + break; + } + + int identLen = parseIdent(file, i); + KJ_REQUIRE(identLen > 0); + + kj::String ident = kj::heapString(file.slice(i, i + identLen)); + if (ident[identLen - 1] != '.') { // trailing period means the import is invalid + result.add(kj::mv(ident)); + } + + i += identLen; + + // If "import" statement then look for comma. + if (keywordToParse == "import") { + i += skipWhile( + file, {'\r', '\n', ' ', '\t', '\\'}, i); // skip whitespace and backslash. + // Check if next char is a comma. + if (file[i] == ',') { + i += 1; // Skip comma. + // Allow while loop to continue + } else { + // No more idents, so break out of loop. + break; + } + } else { + // The "from" statement doesn't support commas. + break; + } + } + break; + default: + // Skip to the next line. + i += skipUntil(file, {'\n', '\r'}, i); + if (file[i] != '\0') { + i += skipChar(file, {'\n', '\r'}, i); // skip newline. + } + } + } + } + + // XXX: jsg doesn't support kj::Vector return types, so this seems to be the only way to do this. + auto builder = kj::heapArrayBuilder(result.size()); + for (auto i = 0; i < result.size(); i++) { + builder.add(kj::mv(result[i])); + } + + return builder.finish(); +} + jsg::Ref makePyodideMetadataReader( Worker::Reader conf, const PythonConfig& pythonConfig) { auto modules = conf.getModules(); diff --git a/src/workerd/api/pyodide/pyodide.h b/src/workerd/api/pyodide/pyodide.h index fe9abad3d32..a963f058030 100644 --- a/src/workerd/api/pyodide/pyodide.h +++ b/src/workerd/api/pyodide/pyodide.h @@ -1,3 +1,6 @@ +// Copyright (c) 2017-2022 Cloudflare, Inc. +// Licensed under the Apache 2.0 license found in the LICENSE file or at: +// https://opensource.org/licenses/Apache-2.0 #pragma once #include "workerd/util/wait-list.h" @@ -62,8 +65,11 @@ class PackagesTarReader: public jsg::Object { } }; -// A function to read a segment of the tar file into a buffer -// Set up this way to avoid copying files that aren't accessed. +// A class wrapping the information stored in a WorkerBundle, in particular the Python source files +// and metadata about the worker. +// +// This is done this way to avoid copying files as much as possible. We set up a Metadata File +// System which reads the contents as they are needed. class PyodideMetadataReader: public jsg::Object { private: kj::String mainModule; @@ -127,6 +133,10 @@ class PyodideMetadataReader: public jsg::Object { kj::Array> getNames(jsg::Lock& js); + // Returns files inside the WorkerBundle that end with the specified file extension. + // Usually called to get all the Python source files with a `py` extension. + kj::Array> getWorkerFiles(jsg::Lock& js, kj::String ext); + kj::Array> getRequirements(jsg::Lock& js); kj::Array getSizes(jsg::Lock& js); @@ -166,6 +176,7 @@ class PyodideMetadataReader: public jsg::Object { JSG_METHOD(getMainModule); JSG_METHOD(getRequirements); JSG_METHOD(getNames); + JSG_METHOD(getWorkerFiles); JSG_METHOD(getSizes); JSG_METHOD(read); JSG_METHOD(hasMemorySnapshot); @@ -279,6 +290,20 @@ class ArtifactBundler: public jsg::Object { return kj::none; } + // Takes in a list of Python files (their contents). Parses these files to find the import + // statements, then returns a list of modules imported via those statements. + // + // For example: + // import a, b, c + // from z import x + // import t.y.u + // from . import k + // + // -> ["a", "b", "c", "z", "t.y.u"] + // + // Package relative imports are ignored. + static kj::Array parsePythonScriptImports(kj::Array files); + JSG_RESOURCE_TYPE(ArtifactBundler) { JSG_METHOD(hasMemorySnapshot); JSG_METHOD(getMemorySnapshotSize); @@ -288,6 +313,7 @@ class ArtifactBundler: public jsg::Object { JSG_METHOD(storeMemorySnapshot); JSG_METHOD(isEnabled); JSG_METHOD(getPackage); + JSG_STATIC_METHOD(parsePythonScriptImports); } private: