scrape_role_files: add support for nested role directories

Changing the way how roles are scraped. Previously directories present in default roles directory were treated as roles (dir name is role name). Now scraper enters each directory looking for one of 8 specified subdirs. When at least one such subdir is found, its parent is treated as role, else subdir is added to search list (this is equivalent of traversing repository) Adjusting tests to utilize nested role directories
bmwcarit · Jan 10, 2024 · 8f86c5f · 8f86c5f
1 parent bb31afe
commit 8f86c5f
Show file tree

Hide file tree

Showing 5 changed files with 162 additions and 36 deletions.
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -118,6 +118,14 @@ def repo_data():
             },
             "changelog_file": None,
         },
+        "foobar/baz": {
+            "last_changed": "2018-09-17 15:15:15",
+            "readme_file": {
+                "path": "roles/foobar/baz/README.rst",
+                "content": raw_file("repo_files/roles/foobar/baz/README.rst"),
+            },
+            "changelog_file": None,
+        },
     }
 
     return repo, tenants, job_files, role_files

diff --git a/tests/scraper/test_integration.py b/tests/scraper/test_integration.py
@@ -73,6 +73,7 @@ class MockContents:
     DIR = "dir"
 
     def __init__(self, path, type):
+        self.name = path.split("/")[-1]
         self.path = path
         self.type = type
 
@@ -101,9 +102,10 @@ class MockGitHubRepository(GitHubRepository):
             "zuul.d/jobs.yaml": MOCKED_JOB_CONTENT,
             "roles": {"docker-run": MockContents("roles/docker-run", MockContents.DIR)},
             "roles/docker-run": {
+                "tasks": MockContents("roles/docker-run/tasks", MockContents.DIR),
                 "README.rst": MockContents(
                     "roles/docker-run/README.rst", MockContents.FILE
-                )
+                ),
             },
             "roles/docker-run/README.rst": MOCKED_ROLE_DESCRIPTION,
             "roles/ignored": MockContents("not a valid role", MockContents.FILE),
@@ -117,25 +119,40 @@ class MockGitHubRepository(GitHubRepository):
                 "bar": MockContents("roles/bar", MockContents.DIR),
                 "foobar": MockContents("roles/foobar", MockContents.DIR),
                 "empty-dir": MockContents("roles/empty-dir", MockContents.DIR),
+                "foobaz": MockContents("roles/foobaz", MockContents.DIR),
             },
             "roles/foo": {
-                "README.md": MockContents("roles/foo/README.md", MockContents.FILE)
+                "defaults": MockContents("roles/foo/defaults", MockContents.DIR),
+                "README.md": MockContents("roles/foo/README.md", MockContents.FILE),
             },
             "roles/bar": {
-                "README.txt": MockContents("roles/bar/README.txt", MockContents.FILE)
+                "library": MockContents("roles/bar/library", MockContents.DIR),
+                "README.txt": MockContents("roles/bar/README.txt", MockContents.FILE),
             },
             "roles/foobar": {
-                "README": MockContents("roles/foobar/README", MockContents.FILE)
+                "handlers": MockContents("roles/foobar/handlers", MockContents.DIR),
+                "README": MockContents("roles/foobar/README", MockContents.FILE),
             },
             "roles/empty-dir": {
+                "meta": MockContents("roles/empty-dir/meta", MockContents.DIR),
                 "README.whatever": MockContents(
                     "roles/empty-dir/README.whatever", MockContents.FILE
-                )
+                ),
+            },
+            "roles/foobaz": {
+                "baz": MockContents("roles/foobaz/baz", MockContents.DIR),
+            },
+            "roles/foobaz/baz": {
+                "vars": MockContents("roles/foobaz/baz/vars", MockContents.DIR),
+                "README.rst": MockContents(
+                    "roles/foobaz/baz/README.rst", MockContents.FILE
+                ),
             },
             "roles/foo/README.md": "# Just some Markdown",
             "roles/bar/README.txt": "Just some simple text\nwith a line break",
             "roles/foobar/README": "Simple text in a file without extension",
-            "roles/empty-dir/REAMDE.whatever": "This file won't be checked out",
+            "roles/empty-dir/README.whatever": "This file won't be checked out",
+            "roles/foobaz/baz/README.rst": "Simple readme file from nested role",
         },
         "orga1/repo3": {
             REPO_ROOT: {
@@ -215,19 +232,24 @@ def test_scrape():
         "orga1/repo2": (
             {},
             {
-                "foo": {
+                "bar": {
                     "last_changed": "2018-09-17 15:15:15",
                     "readme_file": {
-                        "path": "roles/foo/README.md",
-                        "content": "# Just some Markdown",
+                        "path": "roles/bar/README.txt",
+                        "content": "Just some simple text\nwith a line break",
                     },
                     "changelog_file": None,
                 },
-                "bar": {
+                "empty-dir": {
+                    "last_changed": "2018-09-17 15:15:15",
+                    "readme_file": None,
+                    "changelog_file": None,
+                },
+                "foo": {
                     "last_changed": "2018-09-17 15:15:15",
                     "readme_file": {
-                        "path": "roles/bar/README.txt",
-                        "content": "Just some simple text\nwith a line break",
+                        "path": "roles/foo/README.md",
+                        "content": "# Just some Markdown",
                     },
                     "changelog_file": None,
                 },
@@ -239,9 +261,12 @@ def test_scrape():
                     },
                     "changelog_file": None,
                 },
-                "empty-dir": {
+                "foobaz/baz": {
                     "last_changed": "2018-09-17 15:15:15",
-                    "readme_file": None,
+                    "readme_file": {
+                        "path": "roles/foobaz/baz/README.rst",
+                        "content": "Simple readme file from nested role",
+                    },
                     "changelog_file": None,
                 },
             },

diff --git a/tests/scraper/test_repo_parser.py b/tests/scraper/test_repo_parser.py
@@ -47,6 +47,7 @@ def test_parse(repo_data):
     job_5 = jobs[4]
     role_1 = [r for r in roles if r["role_name"] == "foo"][0]
     role_2 = [r for r in roles if r["role_name"] == "bar"][0]
+    role_3 = [r for r in roles if r["role_name"] == "foobar/baz"][0]
 
     expected_job_1 = {
         "job_name": "my-cool-new-job",
@@ -215,6 +216,39 @@ def test_parse(repo_data):
         "last_updated": "2018-09-17 15:15:15",
     }
 
+    expected_role_3 = {
+        "role_name": "foobar/baz",
+        "repo": "my/project",
+        "tenants": ["foo", "bar"],
+        "description": (
+            "Yet another simple description\n\n"
+            "**Role variables**\n\n"
+            ".. zuul:rolevar:: my_mandatory_variable\n\n"
+            "   This variable is mandatory.\n"
+        ),
+        "description_html": (
+            "<p>Yet another simple description</p>\n"
+            "<p><strong>Role variables</strong></p>\n"
+            '<dl class="zuul rolevar">\n'
+            '<dt class="sig sig-object zuul" id="rolevar-my_mandatory_variable">\n'
+            '<span class="sig-name descname">'
+            '<span class="pre">my_mandatory_variable</span>'
+            "</span>"
+            '<a class="headerlink" href="#rolevar-my_mandatory_variable" '
+            'title="Permalink to this definition">'
+            "¶</a><br /></dt>\n"
+            "<dd><p>This variable is mandatory.</p>\n"
+            "</dd></dl>\n"
+            "\n"
+        ),
+        "url": "https://github/my/project/tree/master/roles/foobar/baz",
+        "private": False,
+        "platforms": [],
+        "reusable": False,
+        "scrape_time": scrape_time,
+        "last_updated": "2018-09-17 15:15:15",
+    }
+
     # NOTE (fschmidt): Without the skip_empty flag, empty (= None) keys will
     # be stripped from the resulting dict.
     assert job_1.to_dict(skip_empty=False) == expected_job_1
@@ -224,6 +258,7 @@ def test_parse(repo_data):
     assert job_5.to_dict(skip_empty=False) == expected_job_5
     assert role_1.to_dict(skip_empty=False) == expected_role_1
     assert role_2.to_dict(skip_empty=False) == expected_role_2
+    assert role_3.to_dict(skip_empty=False) == expected_role_3
 
 
 def test_parse_reusable_repo(repo_data):

diff --git a/tests/testdata/repo_files/roles/foobar/baz/README.rst b/tests/testdata/repo_files/roles/foobar/baz/README.rst
@@ -0,0 +1,7 @@
+Yet another simple description
+
+**Role variables**
+
+.. zuul:rolevar:: my_mandatory_variable
+
+   This variable is mandatory.
diff --git a/zubbi/scraper/scraper.py b/zubbi/scraper/scraper.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import logging
+import os
 
 from zubbi.scraper.exceptions import CheckoutError
 
@@ -29,6 +30,19 @@
 
 ROLES_DIRECTORY = "roles"
 
+# Role needs to contains at least one of those directories to be considered as role
+# https://docs.ansible.com/ansible/latest/playbook_guide/playbooks_reuse_roles.html#role-directory-structure
+ROLE_MANDATORY_DIRS = [
+    "tasks",
+    "handlers",
+    "library",
+    "defaults",
+    "vars",
+    "files",
+    "templates",
+    "meta",
+]
+
 REPO_ROOT = "/"
 
 
@@ -146,39 +160,76 @@ def get_file_info(self, path):
 
     def scrape_role_files(self):
         role_files = {}
+        # Roles might be grouped in some parent directory, so we need to
+        # recursively check nested directories till we find role directory.
         # We are only interested in the role name (=> directory name)
         # and the README and CHANGELOG files. Thus, we don't need to
         # iterate recursively over the roles directory as those files
         # should be on the top-level per role.
+
+        # Listing repo files is based on the example in pyGithub doc:
+        # https://pygithub.readthedocs.io/en/v2.1.1/examples/Repository.html#get-all-of-the-contents-of-the-repository-recursively
         try:
-            roles = self.repo.directory_contents(ROLES_DIRECTORY)
-            for role_name, remote_file in roles.items():
+            # Ansible requires the role to be defined in a certain directory structure.
+            # Thus, we are only interested in directories. Files can be omitted
+            dir_contents = [
+                item
+                for item in self.repo.directory_contents(ROLES_DIRECTORY).values()
+                if item.type == "dir"
+            ]
+            while dir_contents:
                 try:
-                    if remote_file.type != "dir":
-                        # Ansible requires the role to be defined in a
-                        # certain directory structure. Thus, we can
-                        # ignore it in case it's not a directory.
-                        continue
-                    last_changed = self.repo.last_changed(remote_file.path)
-                    existing_files = self.repo.directory_contents(remote_file.path)
-                    # Skip empty directories or files
-                    if not existing_files:
-                        continue
-                    readme_file = self.find_matching_file(README_FILES, existing_files)
-                    changelog_file = self.find_matching_file(
-                        CHANGELOG_FILES, existing_files
-                    )
-                    role_files[role_name] = {
-                        "last_changed": last_changed,
-                        "readme_file": readme_file,
-                        "changelog_file": changelog_file,
-                    }
+                    dir = dir_contents.pop(0)
+
+                    # When directory is one of role mandatory directories we assume that
+                    # its parent is the role directory (then information is extracted
+                    # and all child directories are removed from search list).
+                    # Otherwise, we append the subdirs of this dir to search list (this
+                    # implements the recursive search)
+                    if dir.name not in ROLE_MANDATORY_DIRS:
+                        dir_contents.extend(
+                            [
+                                item
+                                for item in self.repo.directory_contents(
+                                    dir.path
+                                ).values()
+                                if item.type == "dir"
+                            ]
+                        )
+                    else:
+                        # get parent path
+                        role_path = os.path.split(dir.path)[0]
+                        # remove all role subdirs from the search list
+                        dir_contents = [
+                            item
+                            for item in dir_contents
+                            if not item.path.startswith(f"{role_path}/")
+                        ]
+
+                        # we want to get the git change history
+                        # and search for README/CHANGELOG files
+                        last_changed = self.repo.last_changed(role_path)
+                        existing_files = self.repo.directory_contents(role_path)
+                        readme_file = self.find_matching_file(
+                            README_FILES, existing_files
+                        )
+                        changelog_file = self.find_matching_file(
+                            CHANGELOG_FILES, existing_files
+                        )
+                        # we treat role directory path (without ROLES_DIRECTORY dir name
+                        # and trailing slash prefix) as role name
+                        role_files[role_path.replace(f"{ROLES_DIRECTORY}/", "", 1)] = {
+                            "last_changed": last_changed,
+                            "readme_file": readme_file,
+                            "changelog_file": changelog_file,
+                        }
                 except CheckoutError as e:
                     LOGGER.exception(e)
         except CheckoutError as e:
             LOGGER.debug(e)
 
-        return role_files
+        # sort keys (role names) alphabetically
+        return {key: value for key, value in sorted(role_files.items())}
 
     def find_matching_file(self, file_filter, existing_files):
         for filename, file_content in existing_files.items():