From caf6c4c86a7efdc0c4fadd0e1a4a2252e10662b9 Mon Sep 17 00:00:00 2001
From: Dominik Picheta <dominik@cloudflare.com>
Date: Tue, 10 Sep 2024 18:01:07 +0000
Subject: [PATCH] Implements Python import statement parser to avoid ast
 module.

---
 .../internal/process_script_imports.py        |  57 ------
 .../internal/process_script_imports.py.d.ts   |   2 -
 src/pyodide/internal/snapshot.ts              |  36 ++--
 src/pyodide/types/FS.d.ts                     |   1 +
 .../types/runtime-generated/metadata.d.ts     |   1 +
 src/workerd/api/BUILD.bazel                   |   8 +
 src/workerd/api/pyodide/pyodide-test.c++      | 118 ++++++++++++
 src/workerd/api/pyodide/pyodide.c++           | 173 ++++++++++++++++++
 src/workerd/api/pyodide/pyodide.h             |  30 ++-
 9 files changed, 349 insertions(+), 77 deletions(-)
 delete mode 100644 src/pyodide/internal/process_script_imports.py
 delete mode 100644 src/pyodide/internal/process_script_imports.py.d.ts
 create mode 100644 src/workerd/api/pyodide/pyodide-test.c++

diff --git a/src/pyodide/internal/process_script_imports.py b/src/pyodide/internal/process_script_imports.py
deleted file mode 100644
index b184ea97c89..00000000000
--- a/src/pyodide/internal/process_script_imports.py
+++ /dev/null
@@ -1,57 +0,0 @@
-"""
-This script is used to prepare a worker prior to a _package_ memory snapshot
-being taken. All it does is walk through the imports in each of the worker's
-modules and attempts to import them. Local imports are not possible because
-the worker file path is explicitly removed from the module search path.
-"""
-
-CF_LOADED_MODULES = []
-
-
-def _do_it():
-    import ast
-    import sys
-    from pathlib import Path
-
-    def find_imports(source: str) -> list[str]:
-        try:
-            mod = ast.parse(source)
-        except SyntaxError:
-            return []
-        imports = set()
-        for node in ast.walk(mod):
-            if isinstance(node, ast.Import):
-                for name in node.names:
-                    imports.add(name.name)
-            elif isinstance(node, ast.ImportFrom):
-                module_name = node.module
-                if module_name is None:
-                    continue
-                imports.add(module_name)
-        return list(sorted(imports))
-
-    def process_script(script):
-        for mod in find_imports(script):
-            try:
-                __import__(mod)
-                CF_LOADED_MODULES.append(mod)
-            except ImportError:
-                pass
-
-    def process_scripts():
-        # Currently this script assumes that it is generating a _package_
-        # snapshot- one that only includes non-vendored packages. Because of
-        # this we do not wish to import local modules, the easiest way to ensure
-        # they cannot be imported is to remove `/session/metadata` from the sys
-        # path.
-        worker_files_path = "/session/metadata"
-        sys.path.remove(worker_files_path)
-        for script in Path(worker_files_path).glob("**/*.py"):
-            process_script(script.read_text())
-        sys.path.append(worker_files_path)
-
-    process_scripts()
-
-
-_do_it()
-del _do_it
diff --git a/src/pyodide/internal/process_script_imports.py.d.ts b/src/pyodide/internal/process_script_imports.py.d.ts
deleted file mode 100644
index b47d10dc1c3..00000000000
--- a/src/pyodide/internal/process_script_imports.py.d.ts
+++ /dev/null
@@ -1,2 +0,0 @@
-declare const buf: ArrayBuffer;
-export default buf;
diff --git a/src/pyodide/internal/snapshot.ts b/src/pyodide/internal/snapshot.ts
index 68b5b12381b..84d126c253f 100644
--- a/src/pyodide/internal/snapshot.ts
+++ b/src/pyodide/internal/snapshot.ts
@@ -6,13 +6,13 @@ import {
   getSitePackagesPath,
 } from 'pyodide-internal:setupPackages';
 import { default as TarReader } from 'pyodide-internal:packages_tar_reader';
-import processScriptImports from 'pyodide-internal:process_script_imports.py';
 import {
   SHOULD_SNAPSHOT_TO_DISK,
   IS_CREATING_BASELINE_SNAPSHOT,
   MEMORY_SNAPSHOT_READER,
 } from 'pyodide-internal:metadata';
 import { reportError, simpleRunPython } from 'pyodide-internal:util';
+import { default as MetadataReader } from 'pyodide-internal:runtime-generated/metadata';
 
 let LOADED_BASELINE_SNAPSHOT: number;
 
@@ -261,24 +261,28 @@ function memorySnapshotDoImports(Module: Module): Array<string> {
     return [];
   }
 
-  // Process the Python modules in the user worker looking for imports of packages which are not
-  // vendored. Vendored packages are skipped because they may contain sensitive information which
-  // we do not want to include in the package snapshot.
-  //
-  // See process_script_imports.py.
-  const processScriptImportsString = new TextDecoder().decode(
-    new Uint8Array(processScriptImports)
+  // The `importedModules` list will contain all modules that have been imported, including local
+  // modules, the usual `js` and other stdlib modules. We want to filter out local imports, so we
+  // grab them and put them into a set for fast filtering.
+  const localModulePaths: Set<string> = new Set<string>(
+    MetadataReader.getNames()
   );
-  simpleRunPython(Module, processScriptImportsString);
+  const importedModules: Array<string> = ArtifactBundler.constructor
+    // @ts-ignore parsePythonScriptImports is a static method.
+    .parsePythonScriptImports(MetadataReader.getWorkerFiles('py'))
+    .filter((module: string) => {
+      const moduleFilename = module.replace('.', '/') + '.py';
+      return !localModulePaths.has(moduleFilename) && module != 'js';
+    });
 
-  const importedModules: Array<string> = JSON.parse(
-    simpleRunPython(
-      Module,
-      'import sys, json; print(json.dumps(CF_LOADED_MODULES), file=sys.stderr)'
-    )
-  );
+  const deduplicatedModules = [...new Set(importedModules)];
+
+  // Import the modules list so they are included in the snapshot.
+  if (deduplicatedModules.length > 0) {
+    simpleRunPython(Module, 'import ' + deduplicatedModules.join(','));
+  }
 
-  return importedModules;
+  return deduplicatedModules;
 }
 
 function checkLoadedSoFiles(dsoJSON: DylinkInfo): void {
diff --git a/src/pyodide/types/FS.d.ts b/src/pyodide/types/FS.d.ts
index 937b65fd51e..30e6b005b3b 100644
--- a/src/pyodide/types/FS.d.ts
+++ b/src/pyodide/types/FS.d.ts
@@ -29,6 +29,7 @@ interface FS {
     mode: number
   ): FSNode<Info>;
   isFile: (mode: number) => boolean;
+  readdir: (path: string) => Array<string>;
   genericErrors: Error[];
 }
 
diff --git a/src/pyodide/types/runtime-generated/metadata.d.ts b/src/pyodide/types/runtime-generated/metadata.d.ts
index 1e87c006a0c..d7bb592780e 100644
--- a/src/pyodide/types/runtime-generated/metadata.d.ts
+++ b/src/pyodide/types/runtime-generated/metadata.d.ts
@@ -7,6 +7,7 @@ declare namespace MetadataReader {
   const getMainModule: () => string;
   const hasMemorySnapshot: () => boolean;
   const getNames: () => string[];
+  const getWorkerFiles: (ext: string) => string[];
   const getSizes: () => number[];
   const readMemorySnapshot: (
     offset: number,
diff --git a/src/workerd/api/BUILD.bazel b/src/workerd/api/BUILD.bazel
index c2afde71046..3c03394d4d4 100644
--- a/src/workerd/api/BUILD.bazel
+++ b/src/workerd/api/BUILD.bazel
@@ -112,6 +112,14 @@ wd_cc_library(
     ],
 )
 
+kj_test(
+    src = "pyodide/pyodide-test.c++",
+    deps = [
+        ":pyodide",
+        "//src/workerd/tests:test-fixture",
+    ],
+)
+
 wd_cc_library(
     name = "data-url",
     srcs = ["data-url.c++"],
diff --git a/src/workerd/api/pyodide/pyodide-test.c++ b/src/workerd/api/pyodide/pyodide-test.c++
new file mode 100644
index 00000000000..9f5ee37612b
--- /dev/null
+++ b/src/workerd/api/pyodide/pyodide-test.c++
@@ -0,0 +1,118 @@
+// Copyright (c) 2017-2022 Cloudflare, Inc.
+// Licensed under the Apache 2.0 license found in the LICENSE file or at:
+//     https://opensource.org/licenses/Apache-2.0
+
+#include "pyodide.h"
+#include <kj/test.h>
+
+namespace workerd::api {
+namespace {
+
+KJ_TEST("basic `import` tests") {
+  auto files = kj::heapArrayBuilder<kj::String>(2);
+  files.add(kj::str("import a\nimport z"));
+  files.add(kj::str("import b"));
+  auto result = pyodide::ArtifactBundler::parsePythonScriptImports(files.finish());
+  KJ_REQUIRE(result.size() == 3);
+  KJ_REQUIRE(result[0] == "a");
+  KJ_REQUIRE(result[1] == "z");
+  KJ_REQUIRE(result[2] == "b");
+}
+
+KJ_TEST("supports whitespace") {
+  auto files = kj::heapArrayBuilder<kj::String>(1);
+  files.add(kj::str("import      a\nimport    \n\tz"));
+  auto result = pyodide::ArtifactBundler::parsePythonScriptImports(files.finish());
+  KJ_REQUIRE(result.size() == 2);
+  KJ_REQUIRE(result[0] == "a");
+  KJ_REQUIRE(result[1] == "z");
+}
+
+KJ_TEST("supports windows newlines") {
+  auto files = kj::heapArrayBuilder<kj::String>(1);
+  files.add(kj::str("import      a\r\nimport    \r\n\tz"));
+  auto result = pyodide::ArtifactBundler::parsePythonScriptImports(files.finish());
+  KJ_REQUIRE(result.size() == 2);
+  KJ_REQUIRE(result[0] == "a");
+  KJ_REQUIRE(result[1] == "z");
+}
+
+KJ_TEST("basic `from` test") {
+  auto files = kj::heapArrayBuilder<kj::String>(1);
+  files.add(kj::str("from x import a,b\nfrom z import y"));
+  auto result = pyodide::ArtifactBundler::parsePythonScriptImports(files.finish());
+  KJ_REQUIRE(result.size() == 2);
+  KJ_REQUIRE(result[0] == "x");
+  KJ_REQUIRE(result[1] == "z");
+}
+
+KJ_TEST("ignores indented blocks") {
+  auto files = kj::heapArrayBuilder<kj::String>(1);
+  files.add(kj::str("import a\nif True:\n  import x\nimport y"));
+  auto result = pyodide::ArtifactBundler::parsePythonScriptImports(files.finish());
+  KJ_REQUIRE(result.size() == 2);
+  KJ_REQUIRE(result[0] == "a");
+  KJ_REQUIRE(result[1] == "y");
+}
+
+KJ_TEST("supports nested imports") {
+  auto files = kj::heapArrayBuilder<kj::String>(1);
+  files.add(kj::str("import a.b\nimport z.x.y.i"));
+  auto result = pyodide::ArtifactBundler::parsePythonScriptImports(files.finish());
+  KJ_REQUIRE(result.size() == 2);
+  KJ_REQUIRE(result[0] == "a.b");
+  KJ_REQUIRE(result[1] == "z.x.y.i");
+}
+
+KJ_TEST("nested `from` test") {
+  auto files = kj::heapArrayBuilder<kj::String>(1);
+  files.add(kj::str("from x.y.z import a,b\nfrom z import y"));
+  auto result = pyodide::ArtifactBundler::parsePythonScriptImports(files.finish());
+  KJ_REQUIRE(result.size() == 2);
+  KJ_REQUIRE(result[0] == "x.y.z");
+  KJ_REQUIRE(result[1] == "z");
+}
+
+KJ_TEST("ignores trailing period") {
+  auto files = kj::heapArrayBuilder<kj::String>(1);
+  files.add(kj::str("import a.b.\nimport z.x.y.i."));
+  auto result = pyodide::ArtifactBundler::parsePythonScriptImports(files.finish());
+  KJ_REQUIRE(result.size() == 0);
+}
+
+KJ_TEST("ignores relative import") {
+  // This is where we diverge from the old AST-based approach. It would have returned `y` in the
+  // input below.
+  auto files = kj::heapArrayBuilder<kj::String>(1);
+  files.add(kj::str("import .a.b\nimport ..z.x\nfrom .y import x"));
+  auto result = pyodide::ArtifactBundler::parsePythonScriptImports(files.finish());
+  KJ_REQUIRE(result.size() == 0);
+}
+
+KJ_TEST("supports commas") {
+  auto files = kj::heapArrayBuilder<kj::String>(1);
+  files.add(kj::str("import a,b"));
+  auto result = pyodide::ArtifactBundler::parsePythonScriptImports(files.finish());
+  KJ_REQUIRE(result.size() == 2);
+  KJ_REQUIRE(result[0] == "a");
+  KJ_REQUIRE(result[1] == "b");
+}
+
+KJ_TEST("supports backslash") {
+  auto files = kj::heapArrayBuilder<kj::String>(4);
+  files.add(kj::str("import a\\\n,b"));
+  files.add(kj::str("import\\\n q,w"));
+  files.add(kj::str("from \\\nx import y"));
+  files.add(kj::str("from \\\n   c import y"));
+  auto result = pyodide::ArtifactBundler::parsePythonScriptImports(files.finish());
+  KJ_REQUIRE(result.size() == 6);
+  KJ_REQUIRE(result[0] == "a");
+  KJ_REQUIRE(result[1] == "b");
+  KJ_REQUIRE(result[2] == "q");
+  KJ_REQUIRE(result[3] == "w");
+  KJ_REQUIRE(result[4] == "x");
+  KJ_REQUIRE(result[5] == "c");
+}
+
+}  // namespace
+}  // namespace workerd::api
diff --git a/src/workerd/api/pyodide/pyodide.c++ b/src/workerd/api/pyodide/pyodide.c++
index 764ffaed223..83da90fbc75 100644
--- a/src/workerd/api/pyodide/pyodide.c++
+++ b/src/workerd/api/pyodide/pyodide.c++
@@ -1,3 +1,6 @@
+// Copyright (c) 2017-2022 Cloudflare, Inc.
+// Licensed under the Apache 2.0 license found in the LICENSE file or at:
+//     https://opensource.org/licenses/Apache-2.0
 #include "pyodide.h"
 #include <kj/string.h>
 #include <workerd/util/string-buffer.h>
@@ -62,6 +65,17 @@ kj::Array<jsg::JsRef<jsg::JsString>> PyodideMetadataReader::getNames(jsg::Lock&
   return builder.finish();
 }
 
+kj::Array<jsg::JsRef<jsg::JsString>> PyodideMetadataReader::getWorkerFiles(
+    jsg::Lock& js, kj::String ext) {
+  auto builder = kj::heapArrayBuilder<jsg::JsRef<jsg::JsString>>(this->names.size());
+  for (auto i: kj::zeroTo(builder.capacity())) {
+    if (this->names[i].endsWith(ext)) {
+      builder.add(js, js.str(this->contents[i]));
+    }
+  }
+  return builder.finish();
+}
+
 kj::Array<jsg::JsRef<jsg::JsString>> PyodideMetadataReader::getRequirements(jsg::Lock& js) {
   auto builder = kj::heapArrayBuilder<jsg::JsRef<jsg::JsString>>(this->requirements.size());
   for (auto i: kj::zeroTo(builder.capacity())) {
@@ -100,6 +114,165 @@ int ArtifactBundler::readMemorySnapshot(int offset, kj::Array<kj::byte> buf) {
   return readToTarget(KJ_REQUIRE_NONNULL(existingSnapshot), offset, buf);
 }
 
+kj::Array<kj::String> ArtifactBundler::parsePythonScriptImports(kj::Array<kj::String> files) {
+  auto result = kj::Vector<kj::String>();
+
+  for (auto& file: files) {
+    // Returns the number of characters skipped. When `oneOf` is not found, skips to the end of
+    // the string.
+    auto skipUntil = [](kj::StringPtr str, std::initializer_list<char> oneOf, int start) -> int {
+      int result = 0;
+      while (start + result < str.size()) {
+        char c = str[start + result];
+        for (char expected: oneOf) {
+          if (c == expected) {
+            return result;
+          }
+        }
+
+        result++;
+      }
+
+      return result;
+    };
+
+    // Skips while current character is in `oneOf`. Returns the number of characters skipped.
+    auto skipWhile = [](kj::StringPtr str, std::initializer_list<char> oneOf, int start) -> int {
+      int result = 0;
+      while (start + result < str.size()) {
+        char c = str[start + result];
+        bool found = false;
+        for (char expected: oneOf) {
+          if (c == expected) {
+            result++;
+            found = true;
+            break;
+          }
+        }
+
+        if (!found) {
+          break;
+        }
+      }
+
+      return result;
+    };
+
+    // Skips one of the characters (specified in `oneOf`) at the current position. Otherwise
+    // throws. Returns the number of characters skipped.
+    auto skipChar = [](kj::StringPtr str, std::initializer_list<char> oneOf, int start) -> int {
+      for (char expected: oneOf) {
+        if (str[start] == expected) {
+          return 1;
+        }
+      }
+
+      KJ_FAIL_REQUIRE("Expected ", oneOf, "but received", str[start]);
+    };
+
+    auto parseKeyword = [](kj::StringPtr str, kj::StringPtr ident, int start) -> bool {
+      int i = 0;
+      for (; i < ident.size() && start + i < str.size(); i++) {
+        if (str[start + i] != ident[i]) {
+          return false;
+        }
+      }
+
+      return i == ident.size();
+    };
+
+    // Returns the size of the import identifier or 0 if no identifier exists at `start`.
+    auto parseIdent = [](kj::StringPtr str, int start) -> int {
+      // https://docs.python.org/3/reference/lexical_analysis.html#identifiers
+      //
+      // We also accept `.` because import idents can contain it.
+      // TODO: We don't currently support unicode, but if we see packages that utilise it we will
+      // implement that support.
+      if (std::isdigit(str[start])) {
+        return 0;
+      }
+      int i = 0;
+      for (; start + i < str.size(); i++) {
+        char c = str[start + i];
+        bool validIdentChar = std::isalpha(c) || std::isdigit(c) || c == '_' || c == '.';
+        if (!validIdentChar) {
+          return i;
+        }
+      }
+
+      return i;
+    };
+
+    int i = 0;
+    while (i < file.size()) {
+      auto keywordToParse = file[i] == 'i' ? "import"_kj : "from"_kj;
+      switch (file[i]) {
+        case 'i':
+        case 'f':
+          if (!parseKeyword(file, keywordToParse, i)) {
+            i += skipUntil(file, {'\n', '\r'}, i);
+            continue;
+          }
+          i += keywordToParse.size();  // skip "import" or "from"
+
+          while (i < file.size()) {
+            // Python expects a `\` to be paired with a newline, but we don't have to be as strict
+            // here because we rely on the fact that the script has gone through validation already.
+            i += skipWhile(
+                file, {'\r', '\n', ' ', '\t', '\\'}, i);  // skip whitespace and backslash.
+
+            if (file[i] == '.') {
+              // ignore relative imports
+              break;
+            }
+
+            int identLen = parseIdent(file, i);
+            KJ_REQUIRE(identLen > 0);
+
+            kj::String ident = kj::heapString(file.slice(i, i + identLen));
+            if (ident[identLen - 1] != '.') {  // trailing period means the import is invalid
+              result.add(kj::mv(ident));
+            }
+
+            i += identLen;
+
+            // If "import" statement then look for comma.
+            if (keywordToParse == "import") {
+              i += skipWhile(
+                  file, {'\r', '\n', ' ', '\t', '\\'}, i);  // skip whitespace and backslash.
+              // Check if next char is a comma.
+              if (file[i] == ',') {
+                i += 1;  // Skip comma.
+                // Allow while loop to continue
+              } else {
+                // No more idents, so break out of loop.
+                break;
+              }
+            } else {
+              // The "from" statement doesn't support commas.
+              break;
+            }
+          }
+          break;
+        default:
+          // Skip to the next line.
+          i += skipUntil(file, {'\n', '\r'}, i);
+          if (file[i] != '\0') {
+            i += skipChar(file, {'\n', '\r'}, i);  // skip newline.
+          }
+      }
+    }
+  }
+
+  // XXX: jsg doesn't support kj::Vector return types, so this seems to be the only way to do this.
+  auto builder = kj::heapArrayBuilder<kj::String>(result.size());
+  for (auto i = 0; i < result.size(); i++) {
+    builder.add(kj::mv(result[i]));
+  }
+
+  return builder.finish();
+}
+
 jsg::Ref<PyodideMetadataReader> makePyodideMetadataReader(
     Worker::Reader conf, const PythonConfig& pythonConfig) {
   auto modules = conf.getModules();
diff --git a/src/workerd/api/pyodide/pyodide.h b/src/workerd/api/pyodide/pyodide.h
index fe9abad3d32..a963f058030 100644
--- a/src/workerd/api/pyodide/pyodide.h
+++ b/src/workerd/api/pyodide/pyodide.h
@@ -1,3 +1,6 @@
+// Copyright (c) 2017-2022 Cloudflare, Inc.
+// Licensed under the Apache 2.0 license found in the LICENSE file or at:
+//     https://opensource.org/licenses/Apache-2.0
 #pragma once
 
 #include "workerd/util/wait-list.h"
@@ -62,8 +65,11 @@ class PackagesTarReader: public jsg::Object {
   }
 };
 
-// A function to read a segment of the tar file into a buffer
-// Set up this way to avoid copying files that aren't accessed.
+// A class wrapping the information stored in a WorkerBundle, in particular the Python source files
+// and metadata about the worker.
+//
+// This is done this way to avoid copying files as much as possible. We set up a Metadata File
+// System which reads the contents as they are needed.
 class PyodideMetadataReader: public jsg::Object {
 private:
   kj::String mainModule;
@@ -127,6 +133,10 @@ class PyodideMetadataReader: public jsg::Object {
 
   kj::Array<jsg::JsRef<jsg::JsString>> getNames(jsg::Lock& js);
 
+  // Returns files inside the WorkerBundle that end with the specified file extension.
+  // Usually called to get all the Python source files with a `py` extension.
+  kj::Array<jsg::JsRef<jsg::JsString>> getWorkerFiles(jsg::Lock& js, kj::String ext);
+
   kj::Array<jsg::JsRef<jsg::JsString>> getRequirements(jsg::Lock& js);
 
   kj::Array<int> getSizes(jsg::Lock& js);
@@ -166,6 +176,7 @@ class PyodideMetadataReader: public jsg::Object {
     JSG_METHOD(getMainModule);
     JSG_METHOD(getRequirements);
     JSG_METHOD(getNames);
+    JSG_METHOD(getWorkerFiles);
     JSG_METHOD(getSizes);
     JSG_METHOD(read);
     JSG_METHOD(hasMemorySnapshot);
@@ -279,6 +290,20 @@ class ArtifactBundler: public jsg::Object {
     return kj::none;
   }
 
+  // Takes in a list of Python files (their contents). Parses these files to find the import
+  // statements, then returns a list of modules imported via those statements.
+  //
+  // For example:
+  // import a, b, c
+  // from z import x
+  // import t.y.u
+  // from . import k
+  //
+  // -> ["a", "b", "c", "z", "t.y.u"]
+  //
+  // Package relative imports are ignored.
+  static kj::Array<kj::String> parsePythonScriptImports(kj::Array<kj::String> files);
+
   JSG_RESOURCE_TYPE(ArtifactBundler) {
     JSG_METHOD(hasMemorySnapshot);
     JSG_METHOD(getMemorySnapshotSize);
@@ -288,6 +313,7 @@ class ArtifactBundler: public jsg::Object {
     JSG_METHOD(storeMemorySnapshot);
     JSG_METHOD(isEnabled);
     JSG_METHOD(getPackage);
+    JSG_STATIC_METHOD(parsePythonScriptImports);
   }
 
 private: