diff --git a/.github/workflows/pypi-release.yml b/.github/workflows/pypi-release.yml index 3a4fe27..22315ff 100644 --- a/.github/workflows/pypi-release.yml +++ b/.github/workflows/pypi-release.yml @@ -1,27 +1,83 @@ -name: Release library as a PyPI wheel and sdist on GH release creation +name: Create library release archives, create a GH release and publish PyPI wheel and sdist on tag in main branch + + +# This is executed automatically on a tag in the main branch + +# Summary of the steps: +# - build wheels and sdist +# - upload wheels and sdist to PyPI +# - create gh-release and upload wheels and dists there +# TODO: smoke test wheels and sdist +# TODO: add changelog to release text body + +# WARNING: this is designed only for packages building as pure Python wheels on: - release: - types: [created] + workflow_dispatch: + push: + tags: + - "v*.*.*" jobs: - build-and-publish-to-pypi: + build-pypi-distribs: name: Build and publish library to PyPI runs-on: ubuntu-20.04 + + steps: + - uses: actions/checkout@master + - name: Set up Python + uses: actions/setup-python@v1 + with: + python-version: 3.9 + + - name: Install pypa/build + run: python -m pip install build --user + + - name: Build a binary wheel and a source tarball + run: python -m build --sdist --wheel --outdir dist/ + + - name: Upload built archives + uses: actions/upload-artifact@v3 + with: + name: pypi_archives + path: dist/* + + + create-gh-release: + name: Create GH release + needs: + - build-pypi-distribs + runs-on: ubuntu-20.04 + + steps: + - name: Download built archives + uses: actions/download-artifact@v3 + with: + name: pypi_archives + path: dist + + - name: Create GH release + uses: softprops/action-gh-release@v1 + with: + draft: true + files: dist/* + + + create-pypi-release: + name: Create PyPI release + needs: + - create-gh-release + runs-on: ubuntu-20.04 + steps: - - uses: actions/checkout@master - - name: Set up Python - uses: actions/setup-python@v1 - with: - python-version: 3.9 - - name: Install pypa/build - run: python -m pip install build --user - - name: Build a binary wheel and a source tarball - run: python -m build --sdist --wheel --outdir dist/ - . - - name: Publish distribution to PyPI - if: startsWith(github.ref, 'refs/tags') - uses: pypa/gh-action-pypi-publish@master - with: - password: ${{ secrets.PYPI_API_TOKEN }} - + - name: Download built archives + uses: actions/download-artifact@v3 + with: + name: pypi_archives + path: dist + + - name: Publish to PyPI + if: startsWith(github.ref, 'refs/tags') + uses: pypa/gh-action-pypi-publish@master + with: + password: ${{ secrets.PYPI_API_TOKEN }} diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 972d786..0000000 --- a/.travis.yml +++ /dev/null @@ -1,22 +0,0 @@ -# This is a skeleton Travis CI config file that provides a starting point for adding CI -# to a Python project. Since we primarily develop in python3, this skeleton config file -# will be specific to that language. -# -# See https://config.travis-ci.com/ for a full list of configuration options. - -os: linux - -dist: xenial - -language: python -python: - - "3.6" - - "3.7" - - "3.8" - - "3.9" - -# Scripts to run at install stage -install: ./configure --dev - -# Scripts to run at script stage -script: venv/bin/pytest -vvs -n 2 diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 7fd45fc..b65d657 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -4,7 +4,85 @@ Release notes Version (next) ------------------------------ +TBD. 
+
+Version 31.0.0 - (2022-05-16)
+------------------------------
+
+This is a major version with API-breaking changes in the resource module.
+
+- The Resource no longer has a rid (resource id) or a pid (parent id). Instead
+  we now use internally a simpler mapping of {path: Resource} objects. As a
+  result, iterating on a Codebase is faster but requires more memory.
+
+- The Codebase and VirtualCodebase accept a new "paths" argument that is a
+  list of paths. When provided, the Codebase will only contain Resources with
+  these paths and no other resources. This is handy to create a Codebase with
+  only a subset of paths of interest. When we create a Codebase or
+  VirtualCodebase with paths, we also always create any intermediate
+  directories. So if you ask for a path of "root/dir/file", we create three
+  resources: "root", "root/dir" and "root/dir/file". We accumulate codebase
+  errors if the paths do not exist in the Codebase or VirtualCodebase. The
+  paths must start with the root path segment and must be POSIX paths.
+
+- When you create a VirtualCodebase with multiple scans, we now prefix each
+  scan path with a codebase-1/, codebase-2/, etc. directory in addition to the
+  "virtual_root" shared root directory. Otherwise, file data was overwritten
+  and inconsistent when the "files" of each scan shared leading path segments.
+  So if you provide two JSON inputs that each contain the path
+  "root/dir/file", the VirtualCodebase will contain these paths:
+
+  - "virtual_root/codebase-1/root/dir/file"
+  - "virtual_root/codebase-2/root/dir/file"
+
+  It is otherwise practically impossible to correctly and reliably merge file
+  data from multiple codebases, so adding this prefix ensures that we are
+  doing the right thing.
+
+- The Resource.path now never contains a leading or trailing slash. We also
+  normalize the path everywhere. In particular this behaviour is visible when
+  you create a Codebase with a "full_root" argument. Previously, the paths of
+  a "full_root" Codebase were prefixed with a slash "/".
+
+- When you create a VirtualCodebase with more than one Resource, we now
+  recreate the directory tree for any intermediate directory used in a path
+  that is otherwise missing from the files path list.
+  In particular this behaviour changed when you create a VirtualCodebase from
+  a previous Codebase created with a "full_root" argument. Previously, the
+  missing paths of a "full_root" Codebase were kept unchanged.
+  Note that the VirtualCodebase has always ignored the "full_root" argument.
+
+- The Codebase and VirtualCodebase are now iterable. Iterating on a codebase
+  is the same as a top-down walk.
+
+- The "Codebase.original_location" attribute has been removed.
+  No known users of commoncode used this.
+
+- The Codebase and VirtualCodebase no longer have "full_root" and "strip_root"
+  constructor arguments and attributes. These can still be passed but they
+  will be ignored.
+
+  - Resource.path is now always the plain path where the first segment
+    is the last segment of the root location, e.g. the root filename.
+
+  - The Resource now has new "full_root_path" and "strip_root_path"
+    properties that return the corresponding paths.
+
+  - The Resource.to_dict and the new Codebase.to_list both have new
+    "full_root" and "strip_root" arguments.
+
+  - The Resource.get_path() method accepts "full_root" and "strip_root"
+    arguments.
+
+- The Resource.create_child() method has been removed.
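+
+For illustration, here is a minimal sketch of the updated API; the exact
+constructor calls and paths are hypothetical and only echo the notes above::
+
+    from commoncode.resource import Codebase, VirtualCodebase
+
+    # Limit the codebase to a subset of paths; the intermediate
+    # directories "root" and "root/dir" are created implicitly.
+    codebase = Codebase("/some/location/root", paths=["root/dir/file"])
+
+    # Iterating on a codebase is a top-down walk and paths have no
+    # leading or trailing slash.
+    for resource in codebase:
+        print(resource.path)
+        # full_root_path and strip_root_path are computed properties.
+        print(resource.full_root_path, resource.strip_root_path)
+
+    # With multiple scans, each scan is prefixed with a codebase-<n>/
+    # directory under the shared "virtual_root" directory, e.g. the paths
+    # become "virtual_root/codebase-1/root/dir/file" and so on.
+    virtual = VirtualCodebase(["scan1.json", "scan2.json"])
+
+Other changes:
+
+- Remove Python upper version limit.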
+- Merge latest skeleton +- fileutils.parent_directory() now accepts a "with_trail" argument. + The returned directory has a trailing path separator unless with_trail is False. + The default is True and the default behaviour is unchanged. Version 30.2.0 - (2022-05-02) @@ -39,7 +117,7 @@ Version 30.1.0 (2022-04-05) Version 30.0.0 (2021-09-24) ------------------------------ -- Switch back from clamver to semver. +- Switch back from calver to semver. - Adopt latest skeleton. The default virtualenv directory is now venv and no longer tmp - Fix issue with Click progressbar API #23 that prohibited to use all supported diff --git a/CODE_OF_CONDUCT.rst b/CODE_OF_CONDUCT.rst new file mode 100644 index 0000000..590ba19 --- /dev/null +++ b/CODE_OF_CONDUCT.rst @@ -0,0 +1,86 @@ +Contributor Covenant Code of Conduct +==================================== + +Our Pledge +---------- + +In the interest of fostering an open and welcoming environment, we as +contributors and maintainers pledge to making participation in our +project and our community a harassment-free experience for everyone, +regardless of age, body size, disability, ethnicity, gender identity and +expression, level of experience, education, socio-economic status, +nationality, personal appearance, race, religion, or sexual identity and +orientation. + +Our Standards +------------- + +Examples of behavior that contributes to creating a positive environment +include: + +- Using welcoming and inclusive language +- Being respectful of differing viewpoints and experiences +- Gracefully accepting constructive criticism +- Focusing on what is best for the community +- Showing empathy towards other community members + +Examples of unacceptable behavior by participants include: + +- The use of sexualized language or imagery and unwelcome sexual + attention or advances +- Trolling, insulting/derogatory comments, and personal or political + attacks +- Public or private harassment +- Publishing others’ private information, such as a physical or + electronic address, without explicit permission +- Other conduct which could reasonably be considered inappropriate in a + professional setting + +Our Responsibilities +-------------------- + +Project maintainers are responsible for clarifying the standards of +acceptable behavior and are expected to take appropriate and fair +corrective action in response to any instances of unacceptable behavior. + +Project maintainers have the right and responsibility to remove, edit, +or reject comments, commits, code, wiki edits, issues, and other +contributions that are not aligned to this Code of Conduct, or to ban +temporarily or permanently any contributor for other behaviors that they +deem inappropriate, threatening, offensive, or harmful. + +Scope +----- + +This Code of Conduct applies both within project spaces and in public +spaces when an individual is representing the project or its community. +Examples of representing a project or community include using an +official project e-mail address, posting via an official social media +account, or acting as an appointed representative at an online or +offline event. Representation of a project may be further defined and +clarified by project maintainers. + +Enforcement +----------- + +Instances of abusive, harassing, or otherwise unacceptable behavior may +be reported by contacting the project team at pombredanne@gmail.com +or on the Gitter chat channel at https://gitter.im/aboutcode-org/discuss . 
+All complaints will be reviewed and investigated and will result in a +response that is deemed necessary and appropriate to the circumstances. +The project team is obligated to maintain confidentiality with regard to +the reporter of an incident. Further details of specific enforcement +policies may be posted separately. + +Project maintainers who do not follow or enforce the Code of Conduct in +good faith may face temporary or permanent repercussions as determined +by other members of the project’s leadership. + +Attribution +----------- + +This Code of Conduct is adapted from the `Contributor Covenant`_ , +version 1.4, available at +https://www.contributor-covenant.org/version/1/4/code-of-conduct.html + +.. _Contributor Covenant: https://www.contributor-covenant.org diff --git a/MANIFEST.in b/MANIFEST.in index ef3721e..8424cbe 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -9,7 +9,7 @@ include *.rst include setup.* include configure* include requirements* -include .git* +include .giti* global-exclude *.py[co] __pycache__ *.*~ diff --git a/configure b/configure index d1b4fda..a52f539 100755 --- a/configure +++ b/configure @@ -54,11 +54,10 @@ CFG_BIN_DIR=$CFG_ROOT_DIR/$VIRTUALENV_DIR/bin ################################ # Thirdparty package locations and index handling -# Find packages from the local thirdparty directory or from thirdparty.aboutcode.org +# Find packages from the local thirdparty directory if [ -d "$CFG_ROOT_DIR/thirdparty" ]; then PIP_EXTRA_ARGS="--find-links $CFG_ROOT_DIR/thirdparty" fi -PIP_EXTRA_ARGS="$PIP_EXTRA_ARGS --find-links https://thirdparty.aboutcode.org/pypi/simple/links.html" ################################ diff --git a/configure.bat b/configure.bat index 487e78a..41547cc 100644 --- a/configure.bat +++ b/configure.bat @@ -52,11 +52,10 @@ set "CFG_BIN_DIR=%CFG_ROOT_DIR%\%VIRTUALENV_DIR%\Scripts" @rem ################################ @rem # Thirdparty package locations and index handling -@rem # Find packages from the local thirdparty directory or from thirdparty.aboutcode.org +@rem # Find packages from the local thirdparty directory if exist "%CFG_ROOT_DIR%\thirdparty" ( set PIP_EXTRA_ARGS=--find-links "%CFG_ROOT_DIR%\thirdparty" ) -set "PIP_EXTRA_ARGS=%PIP_EXTRA_ARGS% --find-links https://thirdparty.aboutcode.org/pypi/simple/links.html" @rem ################################ @@ -69,7 +68,6 @@ if not defined CFG_QUIET ( @rem ################################ @rem # Main command line entry point set "CFG_REQUIREMENTS=%REQUIREMENTS%" -set "NO_INDEX=--no-index" :again if not "%1" == "" ( diff --git a/docs/source/conf.py b/docs/source/conf.py index 62bca04..d5435e7 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -28,7 +28,7 @@ # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ -'sphinx.ext.intersphinx', + "sphinx.ext.intersphinx", ] # This points to aboutcode.readthedocs.io @@ -36,8 +36,8 @@ # Link was created at commit - https://github.com/nexB/aboutcode/commit/faea9fcf3248f8f198844fe34d43833224ac4a83 intersphinx_mapping = { - 'aboutcode': ('https://aboutcode.readthedocs.io/en/latest/', None), - 'scancode-workbench': ('https://scancode-workbench.readthedocs.io/en/develop/', None), + "aboutcode": ("https://aboutcode.readthedocs.io/en/latest/", None), + "scancode-workbench": ("https://scancode-workbench.readthedocs.io/en/develop/", None), } @@ -62,7 +62,7 @@ # so a file named "default.css" will overwrite the builtin "default.css". 
html_static_path = ["_static"] -master_doc = 'index' +master_doc = "index" html_context = { "display_github": True, @@ -72,9 +72,7 @@ "conf_py_path": "/docs/source/", # path in the checkout to the docs root } -html_css_files = [ - '_static/theme_overrides.css' - ] +html_css_files = ["_static/theme_overrides.css"] # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. diff --git a/etc/scripts/check_thirdparty.py b/etc/scripts/check_thirdparty.py index 0f04b34..b052f25 100644 --- a/etc/scripts/check_thirdparty.py +++ b/etc/scripts/check_thirdparty.py @@ -16,7 +16,7 @@ @click.command() @click.option( "-d", - "--dest_dir", + "--dest", type=click.Path(exists=True, readable=True, path_type=str, file_okay=False), required=True, help="Path to the thirdparty directory to check.", @@ -35,7 +35,7 @@ ) @click.help_option("-h", "--help") def check_thirdparty_dir( - dest_dir, + dest, wheels, sdists, ): @@ -45,7 +45,7 @@ def check_thirdparty_dir( # check for problems print(f"==> CHECK FOR PROBLEMS") utils_thirdparty.find_problems( - dest_dir=dest_dir, + dest_dir=dest, report_missing_sources=sdists, report_missing_wheels=wheels, ) diff --git a/etc/scripts/fetch_thirdparty.py b/etc/scripts/fetch_thirdparty.py index 22147b2..89d17de 100644 --- a/etc/scripts/fetch_thirdparty.py +++ b/etc/scripts/fetch_thirdparty.py @@ -18,7 +18,8 @@ import utils_thirdparty import utils_requirements -TRACE = True +TRACE = False +TRACE_DEEP = False @click.command() @@ -99,11 +100,16 @@ "index_urls", type=str, metavar="INDEX", - default=utils_thirdparty.PYPI_INDEXES, + default=utils_thirdparty.PYPI_INDEX_URLS, show_default=True, multiple=True, help="PyPI index URL(s) to use for wheels and sources, in order of preferences.", ) +@click.option( + "--use-cached-index", + is_flag=True, + help="Use on disk cached PyPI indexes list of packages and versions and do not refetch if present.", +) @click.help_option("-h", "--help") def fetch_thirdparty( requirements_files, @@ -115,9 +121,10 @@ def fetch_thirdparty( wheels, sdists, index_urls, + use_cached_index, ): """ - Download to --dest-dir THIRDPARTY_DIR the PyPI wheels, source distributions, + Download to --dest THIRDPARTY_DIR the PyPI wheels, source distributions, and their ABOUT metadata, license and notices files. Download the PyPI packages listed in the combination of: @@ -125,16 +132,23 @@ def fetch_thirdparty( - the pip name==version --specifier SPECIFIER(s) - any pre-existing wheels or sdsists found in --dest-dir THIRDPARTY_DIR. - Download wheels with the --wheels option for the ``--python-version`` PYVER(s) - and ``--operating_system`` OS(s) combinations defaulting to all supported combinations. + Download wheels with the --wheels option for the ``--python-version`` + PYVER(s) and ``--operating_system`` OS(s) combinations defaulting to all + supported combinations. Download sdists tarballs with the --sdists option. - Generate or Download .ABOUT, .LICENSE and .NOTICE files for all the wheels and sources fetched. + Generate or Download .ABOUT, .LICENSE and .NOTICE files for all the wheels + and sources fetched. - Download wheels and sdists the provided PyPI simple --index-url INDEX(s) URLs. + Download from the provided PyPI simple --index-url INDEX(s) URLs. 
""" + if not (wheels or sdists): + print("Error: one or both of --wheels and --sdists is required.") + sys.exit(1) + print(f"COLLECTING REQUIRED NAMES & VERSIONS FROM {dest_dir}") + existing_packages_by_nv = { (package.name, package.version): package for package in utils_thirdparty.get_local_packages(directory=dest_dir) @@ -150,134 +164,88 @@ def fetch_thirdparty( required_name_versions.update(nvs) for specifier in specifiers: - nv = utils_requirements.get_name_version( + nv = utils_requirements.get_required_name_version( requirement=specifier, with_unpinned=latest_version, ) required_name_versions.add(nv) + if latest_version: + names = set(name for name, _version in sorted(required_name_versions)) + required_name_versions = {(n, None) for n in names} + if not required_name_versions: print("Error: no requirements requested.") sys.exit(1) - if not os.listdir(dest_dir) and not (wheels or sdists): - print("Error: one or both of --wheels and --sdists is required.") - sys.exit(1) - - if latest_version: - latest_name_versions = set() - names = set(name for name, _version in sorted(required_name_versions)) - for name in sorted(names): - latests = utils_thirdparty.PypiPackage.sorted( - utils_thirdparty.get_package_versions( - name=name, version=None, index_urls=index_urls - ) - ) - if not latests: - print(f"No distribution found for: {name}") - continue - latest = latests[-1] - latest_name_versions.add((latest.name, latest.version)) - required_name_versions = latest_name_versions - - if TRACE: - print("required_name_versions:", required_name_versions) + if TRACE_DEEP: + print("required_name_versions:") + for n, v in required_name_versions: + print(f" {n} @ {v}") + # create the environments matrix we need for wheels + environments = None if wheels: - # create the environments matrix we need for wheels evts = itertools.product(python_versions, operating_systems) environments = [utils_thirdparty.Environment.from_pyver_and_os(pyv, os) for pyv, os in evts] - wheels_not_found = {} - sdists_not_found = {} - # iterate over requirements, one at a time + # Collect PyPI repos + repos = [] + for index_url in index_urls: + index_url = index_url.strip("/") + existing = utils_thirdparty.DEFAULT_PYPI_REPOS_BY_URL.get(index_url) + if existing: + existing.use_cached_index = use_cached_index + repos.append(existing) + else: + repo = utils_thirdparty.PypiSimpleRepository( + index_url=index_url, + use_cached_index=use_cached_index, + ) + repos.append(repo) + + wheels_fetched = [] + wheels_not_found = [] + + sdists_fetched = [] + sdists_not_found = [] + for name, version in sorted(required_name_versions): nv = name, version - existing_package = existing_packages_by_nv.get(nv) + print(f"Processing: {name} @ {version}") if wheels: for environment in environments: - if existing_package: - existing_wheels = list( - existing_package.get_supported_wheels(environment=environment) - ) - else: - existing_wheels = None - - if existing_wheels: - if TRACE: - print( - f"====> Wheels already available: {name}=={version} on: {environment}: {existing_package.wheels!r}" - ) - if all(w.is_pure() for w in existing_wheels): - break - else: - continue - if TRACE: - print(f"Fetching wheel for: {name}=={version} on: {environment}") - - try: - ( - fetched_wheel_filenames, - existing_wheel_filenames, - ) = utils_thirdparty.download_wheel( - name=name, - version=version, - environment=environment, - dest_dir=dest_dir, - index_urls=index_urls, - ) - if TRACE: - if existing_wheel_filenames: - print( - f" ====> Wheels already available: 
{name}=={version} on: {environment}" - ) - for whl in existing_wheel_filenames: - print(f" {whl}") - if fetched_wheel_filenames: - print(f" ====> Wheels fetched: {name}=={version} on: {environment}") - for whl in fetched_wheel_filenames: - print(f" {whl}") - - fwfns = fetched_wheel_filenames + existing_wheel_filenames - - if all(utils_thirdparty.Wheel.from_filename(f).is_pure() for f in fwfns): - break - - except utils_thirdparty.DistributionNotFound as e: - wheels_not_found[f"{name}=={version}"] = str(e) - - if sdists: - if existing_package and existing_package.sdist: - if TRACE: - print( - f" ====> Sdist already available: {name}=={version}: {existing_package.sdist!r}" - ) - continue - - if TRACE: - print(f" Fetching sdist for: {name}=={version}") - - try: - fetched = utils_thirdparty.download_sdist( + print(f" ==> Fetching wheel for envt: {environment}") + fwfns = utils_thirdparty.download_wheel( name=name, version=version, + environment=environment, dest_dir=dest_dir, - index_urls=index_urls, + repos=repos, ) + if fwfns: + wheels_fetched.extend(fwfns) + else: + wheels_not_found.append(f"{name}=={version} for: {environment}") + if TRACE: + print(f" NOT FOUND") + if sdists: + if TRACE: + print(f" ==> Fetching sdist: {name}=={version}") + fetched = utils_thirdparty.download_sdist( + name=name, + version=version, + dest_dir=dest_dir, + repos=repos, + ) + if fetched: + sdists_fetched.append(fetched) + else: + sdists_not_found.append(f"{name}=={version}") if TRACE: - if not fetched: - print( - f" ====> Sdist already available: {name}=={version} on: {environment}" - ) - else: - print( - f" ====> Sdist fetched: {fetched} for {name}=={version} on: {environment}" - ) - - except utils_thirdparty.DistributionNotFound as e: - sdists_not_found[f"{name}=={version}"] = str(e) + print(f" NOT FOUND") if wheels and wheels_not_found: print(f"==> MISSING WHEELS") @@ -290,7 +258,7 @@ def fetch_thirdparty( print(f" {sd}") print(f"==> FETCHING OR CREATING ABOUT AND LICENSE FILES") - utils_thirdparty.fetch_abouts_and_licenses(dest_dir=dest_dir) + utils_thirdparty.fetch_abouts_and_licenses(dest_dir=dest_dir, use_cached_index=use_cached_index) utils_thirdparty.clean_about_files(dest_dir=dest_dir) # check for problems diff --git a/etc/scripts/gen_pypi_simple.py b/etc/scripts/gen_pypi_simple.py index 8de2b96..03312ab 100644 --- a/etc/scripts/gen_pypi_simple.py +++ b/etc/scripts/gen_pypi_simple.py @@ -25,26 +25,26 @@ class InvalidDistributionFilename(Exception): def get_package_name_from_filename(filename): """ - Return the package name extracted from a package ``filename``. - Optionally ``normalize`` the name according to distribution name rules. + Return the normalized package name extracted from a package ``filename``. + Normalization is done according to distribution name rules. Raise an ``InvalidDistributionFilename`` if the ``filename`` is invalid:: >>> get_package_name_from_filename("foo-1.2.3_rc1.tar.gz") 'foo' - >>> get_package_name_from_filename("foo-bar-1.2-py27-none-any.whl") + >>> get_package_name_from_filename("foo_bar-1.2-py27-none-any.whl") 'foo-bar' >>> get_package_name_from_filename("Cython-0.17.2-cp26-none-linux_x86_64.whl") 'cython' >>> get_package_name_from_filename("python_ldap-2.4.19-cp27-none-macosx_10_10_x86_64.whl") 'python-ldap' - >>> get_package_name_from_filename("foo.whl") - Traceback (most recent call last): - ... - InvalidDistributionFilename: ... - >>> get_package_name_from_filename("foo.png") - Traceback (most recent call last): - ... - InvalidFilePackageName: ... + >>> try: + ... 
get_package_name_from_filename("foo.whl") + ... except InvalidDistributionFilename: + ... pass + >>> try: + ... get_package_name_from_filename("foo.png") + ... except InvalidDistributionFilename: + ... pass """ if not filename or not filename.endswith(dist_exts): raise InvalidDistributionFilename(filename) diff --git a/etc/scripts/requirements.txt b/etc/scripts/requirements.txt index 6591e49..ebb404b 100644 --- a/etc/scripts/requirements.txt +++ b/etc/scripts/requirements.txt @@ -1,12 +1,11 @@ aboutcode_toolkit -github-release-retry2 attrs commoncode click requests saneyaml -romp pip setuptools twine -wheel \ No newline at end of file +wheel +build \ No newline at end of file diff --git a/etc/scripts/utils_requirements.py b/etc/scripts/utils_requirements.py index fbc456d..7c99a33 100644 --- a/etc/scripts/utils_requirements.py +++ b/etc/scripts/utils_requirements.py @@ -41,23 +41,23 @@ def get_required_name_versions(requirement_lines, with_unpinned=False): if req_line.startswith("-") or (not with_unpinned and not "==" in req_line): print(f"Requirement line is not supported: ignored: {req_line}") continue - yield get_name_version(requirement=req_line, with_unpinned=with_unpinned) + yield get_required_name_version(requirement=req_line, with_unpinned=with_unpinned) -def get_name_version(requirement, with_unpinned=False): +def get_required_name_version(requirement, with_unpinned=False): """ Return a (name, version) tuple given a`requirement` specifier string. Requirement version must be pinned. If ``with_unpinned`` is True, unpinned requirements are accepted and only the name portion is returned. For example: - >>> assert get_name_version("foo==1.2.3") == ("foo", "1.2.3") - >>> assert get_name_version("fooA==1.2.3.DEV1") == ("fooa", "1.2.3.dev1") - >>> assert get_name_version("foo==1.2.3", with_unpinned=False) == ("foo", "1.2.3") - >>> assert get_name_version("foo", with_unpinned=True) == ("foo", "") - >>> assert get_name_version("foo>=1.2", with_unpinned=True) == ("foo", ""), get_name_version("foo>=1.2") + >>> assert get_required_name_version("foo==1.2.3") == ("foo", "1.2.3") + >>> assert get_required_name_version("fooA==1.2.3.DEV1") == ("fooa", "1.2.3.dev1") + >>> assert get_required_name_version("foo==1.2.3", with_unpinned=False) == ("foo", "1.2.3") + >>> assert get_required_name_version("foo", with_unpinned=True) == ("foo", "") + >>> assert get_required_name_version("foo>=1.2", with_unpinned=True) == ("foo", ""), get_required_name_version("foo>=1.2") >>> try: - ... assert not get_name_version("foo", with_unpinned=False) + ... assert not get_required_name_version("foo", with_unpinned=False) ... except Exception as e: ... assert "Requirement version must be pinned" in str(e) """ @@ -110,6 +110,8 @@ def get_installed_reqs(site_packages_dir): Return the installed pip requirements as text found in `site_packages_dir` as a text. """ + if not os.path.exists(site_packages_dir): + raise Exception(f"site_packages directory: {site_packages_dir!r} does not exists") # Also include these packages in the output with --all: wheel, distribute, # setuptools, pip args = ["pip", "freeze", "--exclude-editable", "--all", "--path", site_packages_dir] diff --git a/etc/scripts/utils_thirdparty.py b/etc/scripts/utils_thirdparty.py index 829cf8c..2d6f3e4 100644 --- a/etc/scripts/utils_thirdparty.py +++ b/etc/scripts/utils_thirdparty.py @@ -8,7 +8,6 @@ # See https://github.com/nexB/skeleton for support or download. # See https://aboutcode.org for more information about nexB OSS projects. 
# -from collections import defaultdict import email import itertools import os @@ -18,6 +17,8 @@ import tempfile import time import urllib +from collections import defaultdict +from urllib.parse import quote_plus import attr import license_expression @@ -29,10 +30,8 @@ from commoncode.text import python_safe_name from packaging import tags as packaging_tags from packaging import version as packaging_version -from urllib.parse import quote_plus import utils_pip_compatibility_tags -from utils_requirements import load_requirements """ Utilities to manage Python thirparty libraries source, binaries and metadata in @@ -111,7 +110,7 @@ """ -TRACE = True +TRACE = False TRACE_DEEP = False TRACE_ULTRA_DEEP = False @@ -168,6 +167,16 @@ def get_python_dot_version(version): "macosx_10_15_x86_64", "macosx_11_0_x86_64", "macosx_11_intel", + "macosx_11_0_x86_64", + "macosx_11_intel", + "macosx_10_9_universal2", + "macosx_10_10_universal2", + "macosx_10_11_universal2", + "macosx_10_12_universal2", + "macosx_10_13_universal2", + "macosx_10_14_universal2", + "macosx_10_15_universal2", + "macosx_11_0_universal2", # 'macosx_11_0_arm64', ], "windows": [ @@ -178,18 +187,19 @@ def get_python_dot_version(version): THIRDPARTY_DIR = "thirdparty" CACHE_THIRDPARTY_DIR = ".cache/thirdparty" -ABOUT_BASE_URL = "https://thirdparty.aboutcode.org/pypi" +################################################################################ +ABOUT_BASE_URL = "https://thirdparty.aboutcode.org/pypi" ABOUT_PYPI_SIMPLE_URL = f"{ABOUT_BASE_URL}/simple" ABOUT_LINKS_URL = f"{ABOUT_PYPI_SIMPLE_URL}/links.html" - PYPI_SIMPLE_URL = "https://pypi.org/simple" -PYPI_INDEXES = (PYPI_SIMPLE_URL, ABOUT_PYPI_SIMPLE_URL) +PYPI_INDEX_URLS = (PYPI_SIMPLE_URL, ABOUT_PYPI_SIMPLE_URL) + +################################################################################ EXTENSIONS_APP = (".pyz",) EXTENSIONS_SDIST = ( ".tar.gz", - ".tar.bz2", ".zip", ".tar.xz", ) @@ -216,119 +226,90 @@ class DistributionNotFound(Exception): pass -def download_wheel( - name, - version, - environment, - dest_dir=THIRDPARTY_DIR, - index_urls=PYPI_INDEXES, -): +def download_wheel(name, version, environment, dest_dir=THIRDPARTY_DIR, repos=tuple()): """ Download the wheels binary distribution(s) of package ``name`` and - ``version`` matching the ``environment`` Environment constraints from the - PyPI simple repository ``index_urls`` list of URLs into the ``dest_dir`` - directory. + ``version`` matching the ``environment`` Environment constraints into the + ``dest_dir`` directory. Return a list of fetched_wheel_filenames, possibly + empty. - Raise a DistributionNotFound if no wheel is not found. Otherwise, return a - tuple of lists of (fetched_wheel_filenames, existing_wheel_filenames) + Use the first PyPI simple repository from a list of ``repos`` that contains this wheel. 
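+
+    For example, a hypothetical call (the name, version and environment values
+    are assumed for illustration):
+
+        download_wheel(name="commoncode", version="31.0.0", dest_dir="thirdparty",
+            environment=Environment.from_pyver_and_os("39", "linux"))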
""" if TRACE_DEEP: - print(f" download_wheel: {name}=={version}: {environment}") + print(f" download_wheel: {name}=={version} for envt: {environment}") - fetched_wheel_filenames = [] - existing_wheel_filenames = [] - try: - for pypi_package in get_package_versions( - name=name, - version=version, - index_urls=index_urls, - ): - if not pypi_package.wheels: - continue - - supported_wheels = list(pypi_package.get_supported_wheels(environment=environment)) - if not supported_wheels: - continue + if not repos: + repos = DEFAULT_PYPI_REPOS - for wheel in supported_wheels: - if os.path.exists(os.path.join(dest_dir, wheel.filename)): - # do not refetch - existing_wheel_filenames.append(wheel.filename) - continue + fetched_wheel_filenames = [] - if TRACE: - print(f" Fetching wheel from index: {wheel.download_url}") - fetched_wheel_filename = wheel.download(dest_dir=dest_dir) - fetched_wheel_filenames.add(fetched_wheel_filename) + for repo in repos: + package = repo.get_package_version(name=name, version=version) + if not package: + if TRACE_DEEP: + print(f" download_wheel: No package in {repo.index_url} for {name}=={version}") + continue + supported_wheels = list(package.get_supported_wheels(environment=environment)) + if not supported_wheels: + if TRACE_DEEP: + print( + f" download_wheel: No supported wheel for {name}=={version}: {environment} " + ) + continue - except Exception as e: - raise DistributionNotFound(f"Failed to fetch wheel: {name}=={version}: {e}") from e + for wheel in supported_wheels: + if TRACE_DEEP: + print( + f" download_wheel: Getting wheel from index (or cache): {wheel.download_url}" + ) + fetched_wheel_filename = wheel.download(dest_dir=dest_dir) + fetched_wheel_filenames.append(fetched_wheel_filename) - if not fetched_wheel_filenames and not existing_wheel_filenames: - raise DistributionNotFound(f"Failed to fetch wheel: {name}=={version}: No wheel found") + if fetched_wheel_filenames: + # do not futher fetch from other repos if we find in first, typically PyPI + break - return fetched_wheel_filenames, existing_wheel_filenames + return fetched_wheel_filenames -def download_sdist( - name, - version, - dest_dir=THIRDPARTY_DIR, - index_urls=PYPI_INDEXES, -): +def download_sdist(name, version, dest_dir=THIRDPARTY_DIR, repos=tuple()): """ Download the sdist source distribution of package ``name`` and ``version`` - from the PyPI simple repository ``index_urls`` list of URLs into the - ``dest_dir`` directory. + into the ``dest_dir`` directory. Return a fetched filename or None. - Raise a DistributionNotFound if this was not found. Return the filename if - downloaded and False if not downloaded because it already exists. + Use the first PyPI simple repository from a list of ``repos`` that contains + this sdist. 
""" - if TRACE_DEEP: - print(f"download_sdist: {name}=={version}: ") + if TRACE: + print(f" download_sdist: {name}=={version}") - try: - for pypi_package in get_package_versions( - name=name, - version=version, - index_urls=index_urls, - ): - if not pypi_package.sdist: - continue + if not repos: + repos = DEFAULT_PYPI_REPOS - if os.path.exists(os.path.join(dest_dir, pypi_package.sdist.filename)): - # do not refetch - return False - if TRACE: - print(f" Fetching sources from index: {pypi_package.sdist.download_url}") - fetched = pypi_package.sdist.download(dest_dir=dest_dir) - if fetched: - return pypi_package.sdist.filename + fetched_sdist_filename = None - except Exception as e: - raise DistributionNotFound(f"Failed to fetch sdist: {name}=={version}: {e}") from e + for repo in repos: + package = repo.get_package_version(name=name, version=version) + + if not package: + if TRACE_DEEP: + print(f" download_sdist: No package in {repo.index_url} for {name}=={version}") + continue + sdist = package.sdist + if not sdist: + if TRACE_DEEP: + print(f" download_sdist: No sdist for {name}=={version}") + continue - raise DistributionNotFound(f"Failed to fetch sdist: {name}=={version}: No sources found") + if TRACE_DEEP: + print(f" download_sdist: Getting sdist from index (or cache): {sdist.download_url}") + fetched_sdist_filename = package.sdist.download(dest_dir=dest_dir) + if fetched_sdist_filename: + # do not futher fetch from other repos if we find in first, typically PyPI + break -def get_package_versions( - name, - version=None, - index_urls=PYPI_INDEXES, -): - """ - Yield PypiPackages with ``name`` and ``version`` from the PyPI simple - repository ``index_urls`` list of URLs. - If ``version`` is not provided, return the latest available versions. - """ - for index_url in index_urls: - try: - repo = get_pypi_repo(index_url) - package = repo.get_package(name, version) - if package: - yield package - except RemoteNotFetchedException as e: - print(f"Failed to fetch PyPI package {name} @ {version} info from {index_url}: {e}") + return fetched_sdist_filename ################################################################################ @@ -362,17 +343,6 @@ def normalize_name(name): """ return name and re.sub(r"[-_.]+", "-", name).lower() or name - @staticmethod - def standardize_name(name): - """ - Return a standardized package name, e.g. lowercased and using - not _ - """ - return name and re.sub(r"[-_]+", "-", name).lower() or name - - @property - def name_ver(self): - return f"{self.name}-{self.version}" - def sortable_name_version(self): """ Return a tuple of values to sort by name, then version. @@ -388,7 +358,7 @@ def sorted(cls, namevers): @attr.attributes class Distribution(NameVer): - # field names that can be updated from another dist of mapping + # field names that can be updated from another Distribution or mapping updatable_fields = [ "license_expression", "copyright", @@ -406,6 +376,13 @@ class Distribution(NameVer): metadata=dict(help="File name."), ) + path_or_url = attr.ib( + repr=False, + type=str, + default="", + metadata=dict(help="Path or URL"), + ) + sha256 = attr.ib( repr=False, type=str, @@ -530,36 +507,50 @@ def package_url(self): """ Return a Package URL string of self. 
""" - return str(packageurl.PackageURL(**self.purl_identifiers())) + return str( + packageurl.PackageURL( + type=self.type, + namespace=self.namespace, + name=self.name, + version=self.version, + subpath=self.subpath, + qualifiers=self.qualifiers, + ) + ) @property def download_url(self): return self.get_best_download_url() - def get_best_download_url( - self, - index_urls=tuple([PYPI_SIMPLE_URL, ABOUT_PYPI_SIMPLE_URL]), - ): + def get_best_download_url(self, repos=tuple()): """ - Return the best download URL for this distribution where best means that - PyPI is better and our selfhosted repo URLs are second. - If none is found, return a synthetic remote URL. + Return the best download URL for this distribution where best means this + is the first URL found for this distribution found in the list of + ``repos``. + + If none is found, return a synthetic PyPI remote URL. """ - for index_url in index_urls: - pypi_package = get_pypi_package( - name=self.normalized_name, - version=self.version, - index_url=index_url, - ) - if pypi_package: - if isinstance(pypi_package, tuple): - raise Exception("############", repr(pypi_package)) - try: - pypi_url = pypi_package.get_url_for_filename(self.filename) - except Exception as e: - raise Exception(repr(pypi_package)) from e - if pypi_url: - return pypi_url + + if not repos: + repos = DEFAULT_PYPI_REPOS + + for repo in repos: + package = repo.get_package_version(name=self.name, version=self.version) + if not package: + if TRACE: + print( + f" get_best_download_url: {self.name}=={self.version} " + f"not found in {repo.index_url}" + ) + continue + pypi_url = package.get_url_for_filename(self.filename) + if pypi_url: + return pypi_url + else: + if TRACE: + print( + f" get_best_download_url: {self.filename} not found in {repo.index_url}" + ) def download(self, dest_dir=THIRDPARTY_DIR): """ @@ -567,16 +558,17 @@ def download(self, dest_dir=THIRDPARTY_DIR): Return the fetched filename. """ assert self.filename - if TRACE: + if TRACE_DEEP: print( f"Fetching distribution of {self.name}=={self.version}:", self.filename, ) - fetch_and_save_path_or_url( - filename=self.filename, - dest_dir=dest_dir, + # FIXME: + fetch_and_save( path_or_url=self.path_or_url, + dest_dir=dest_dir, + filename=self.filename, as_text=False, ) return self.filename @@ -601,7 +593,7 @@ def notice_download_url(self): def from_path_or_url(cls, path_or_url): """ Return a distribution built from the data found in the filename of a - `path_or_url` string. Raise an exception if this is not a valid + ``path_or_url`` string. Raise an exception if this is not a valid filename. """ filename = os.path.basename(path_or_url.strip("/")) @@ -632,47 +624,6 @@ def from_filename(cls, filename): clazz = cls.get_dist_class(filename) return clazz.from_filename(filename) - def purl_identifiers(self, skinny=False): - """ - Return a mapping of non-empty identifier name/values for the purl - fields. If skinny is True, only inlucde type, namespace and name. - """ - identifiers = dict( - type=self.type, - namespace=self.namespace, - name=self.name, - ) - - if not skinny: - identifiers.update( - version=self.version, - subpath=self.subpath, - qualifiers=self.qualifiers, - ) - - return {k: v for k, v in sorted(identifiers.items()) if v} - - def identifiers(self, purl_as_fields=True): - """ - Return a mapping of non-empty identifier name/values. - Return each purl fields separately if purl_as_fields is True. - Otherwise return a package_url string for the purl. 
- """ - if purl_as_fields: - identifiers = self.purl_identifiers() - else: - identifiers = dict(package_url=self.package_url) - - identifiers.update( - download_url=self.download_url, - filename=self.filename, - md5=self.md5, - sha1=self.sha1, - package_url=self.package_url, - ) - - return {k: v for k, v in sorted(identifiers.items()) if v} - def has_key_metadata(self): """ Return True if this distribution has key metadata required for basic attribution. @@ -802,7 +753,7 @@ def load_remote_about_data(self): NOTICE file if any. Return True if the data was updated. """ try: - about_text = fetch_content_from_path_or_url_through_cache( + about_text = CACHE.get( path_or_url=self.about_download_url, as_text=True, ) @@ -816,7 +767,7 @@ def load_remote_about_data(self): notice_file = about_data.pop("notice_file", None) if notice_file: try: - notice_text = fetch_content_from_path_or_url_through_cache( + notice_text = CACHE.get( path_or_url=self.notice_download_url, as_text=True, ) @@ -867,12 +818,12 @@ def get_license_keys(self): return ["unknown"] return keys - def fetch_license_files(self, dest_dir=THIRDPARTY_DIR): + def fetch_license_files(self, dest_dir=THIRDPARTY_DIR, use_cached_index=False): """ Fetch license files if missing in `dest_dir`. Return True if license files were fetched. """ - urls = LinksRepository.from_url().links + urls = LinksRepository.from_url(use_cached_index=use_cached_index).links errors = [] extra_lic_names = [l.get("file") for l in self.extra_data.get("licenses", {})] extra_lic_names += [self.extra_data.get("license_file")] @@ -887,10 +838,10 @@ def fetch_license_files(self, dest_dir=THIRDPARTY_DIR): # try remotely first lic_url = get_license_link_for_filename(filename=filename, urls=urls) - fetch_and_save_path_or_url( - filename=filename, - dest_dir=dest_dir, + fetch_and_save( path_or_url=lic_url, + dest_dir=dest_dir, + filename=filename, as_text=True, ) if TRACE: @@ -900,10 +851,10 @@ def fetch_license_files(self, dest_dir=THIRDPARTY_DIR): try: # try licensedb second lic_url = f"{LICENSEDB_API_URL}/{filename}" - fetch_and_save_path_or_url( - filename=filename, - dest_dir=dest_dir, + fetch_and_save( path_or_url=lic_url, + dest_dir=dest_dir, + filename=filename, as_text=True, ) if TRACE: @@ -1062,6 +1013,84 @@ class InvalidDistributionFilename(Exception): pass +def get_sdist_name_ver_ext(filename): + """ + Return a (name, version, extension) if filename is a valid sdist name. Some legacy + binary builds have weird names. Return False otherwise. 
+ + In particular they do not use PEP440 compliant versions and/or mix tags, os + and arch names in tarball names and versions: + + >>> assert get_sdist_name_ver_ext("intbitset-1.3.tar.gz") + >>> assert not get_sdist_name_ver_ext("intbitset-1.3.linux-x86_64.tar.gz") + >>> assert get_sdist_name_ver_ext("intbitset-1.4a.tar.gz") + >>> assert get_sdist_name_ver_ext("intbitset-1.4a.zip") + >>> assert not get_sdist_name_ver_ext("intbitset-2.0.linux-x86_64.tar.gz") + >>> assert get_sdist_name_ver_ext("intbitset-2.0.tar.gz") + >>> assert not get_sdist_name_ver_ext("intbitset-2.1-1.src.rpm") + >>> assert not get_sdist_name_ver_ext("intbitset-2.1-1.x86_64.rpm") + >>> assert not get_sdist_name_ver_ext("intbitset-2.1.linux-x86_64.tar.gz") + >>> assert not get_sdist_name_ver_ext("cffi-1.2.0-1.tar.gz") + >>> assert not get_sdist_name_ver_ext("html5lib-1.0-reupload.tar.gz") + >>> assert not get_sdist_name_ver_ext("selenium-2.0-dev-9429.tar.gz") + >>> assert not get_sdist_name_ver_ext("testfixtures-1.8.0dev-r4464.tar.gz") + """ + name_ver = None + extension = None + + for ext in EXTENSIONS_SDIST: + if filename.endswith(ext): + name_ver, extension, _ = filename.rpartition(ext) + break + + if not extension or not name_ver: + return False + + name, _, version = name_ver.rpartition("-") + + if not name or not version: + return False + + # weird version + if any( + w in version + for w in ( + "x86_64", + "i386", + ) + ): + return False + + # all char versions + if version.isalpha(): + return False + + # non-pep 440 version + if "-" in version: + return False + + # single version + if version.isdigit() and len(version) == 1: + return False + + # r1 version + if len(version) == 2 and version[0] == "r" and version[1].isdigit(): + return False + + # dotless version (but calver is OK) + if "." not in version and len(version) < 3: + return False + + # version with dashes selenium-2.0-dev-9429.tar.gz + if name.endswith(("dev",)) and "." not in version: + return False + # version pre or post, old legacy + if version.startswith(("beta", "rc", "pre", "post", "final")): + return False + + return name, version, extension + + @attr.attributes class Sdist(Distribution): @@ -1078,21 +1107,11 @@ def from_filename(cls, filename): Return a Sdist object built from a filename. Raise an exception if this is not a valid sdist filename """ - name_ver = None - extension = None - - for ext in EXTENSIONS_SDIST: - if filename.endswith(ext): - name_ver, extension, _ = filename.rpartition(ext) - break - - if not extension or not name_ver: + name_ver_ext = get_sdist_name_ver_ext(filename) + if not name_ver_ext: raise InvalidDistributionFilename(filename) - name, _, version = name_ver.rpartition("-") - - if not name or not version: - raise InvalidDistributionFilename(filename) + name, version, extension = name_ver_ext return cls( type="pypi", @@ -1280,8 +1299,8 @@ def is_pure_wheel(filename): @attr.attributes class PypiPackage(NameVer): """ - A Python package with its "distributions", e.g. wheels and source - distribution , ABOUT files and licenses or notices. + A Python package contains one or more wheels and one source distribution + from a repository. 
""" sdist = attr.ib( @@ -1298,16 +1317,6 @@ class PypiPackage(NameVer): metadata=dict(help="List of Wheel for this package"), ) - @property - def specifier(self): - """ - A requirement specifier for this package - """ - if self.version: - return f"{self.name}=={self.version}" - else: - return self.name - def get_supported_wheels(self, environment, verbose=TRACE_ULTRA_DEEP): """ Yield all the Wheel of this package supported and compatible with the @@ -1389,17 +1398,20 @@ def packages_from_dir(cls, directory): Yield PypiPackages built from files found in at directory path. """ base = os.path.abspath(directory) + paths = [os.path.join(base, f) for f in os.listdir(base) if f.endswith(EXTENSIONS)] + if TRACE_ULTRA_DEEP: print("packages_from_dir: paths:", paths) - return cls.packages_from_many_paths_or_urls(paths) + return PypiPackage.packages_from_many_paths_or_urls(paths) @classmethod def packages_from_many_paths_or_urls(cls, paths_or_urls): """ Yield PypiPackages built from a list of paths or URLs. + These are sorted by name and then by version from oldest to newest. """ - dists = cls.dists_from_paths_or_urls(paths_or_urls) + dists = PypiPackage.dists_from_paths_or_urls(paths_or_urls) if TRACE_ULTRA_DEEP: print("packages_from_many_paths_or_urls: dists:", dists) @@ -1414,54 +1426,11 @@ def packages_from_many_paths_or_urls(cls, paths_or_urls): print("packages_from_many_paths_or_urls", package) yield package - @classmethod - def get_versions(cls, name, packages): - """ - Return a subset list of package versions from a list of `packages` that - match PypiPackage `name`. - The list is sorted by version from oldest to most recent. - """ - norm_name = NameVer.normalize_name(name) - versions = [p for p in packages if p.normalized_name == norm_name] - return cls.sorted(versions) - - @classmethod - def get_latest_version(cls, name, packages): - """ - Return the latest version of PypiPackage `name` from a list of `packages`. - """ - versions = cls.get_versions(name, packages) - if not versions: - return - return versions[-1] - - @classmethod - def get_name_version(cls, name, version, packages): - """ - Return the PypiPackage with `name` and `version` from a list of `packages` - or None if it is not found. - If `version` is None, return the latest version found. - """ - if TRACE_ULTRA_DEEP: - print("get_name_version:", name, version, packages) - if not version: - return cls.get_latest_version(name, packages) - - nvs = [p for p in cls.get_versions(name, packages) if p.version == version] - - if not nvs: - return name, version - - if len(nvs) == 1: - return nvs[0] - - raise Exception(f"More than one PypiPackage with {name}=={version}") - @classmethod def dists_from_paths_or_urls(cls, paths_or_urls): """ Return a list of Distribution given a list of - `paths_or_urls` to wheels or source distributions. + ``paths_or_urls`` to wheels or source distributions. Each Distribution receives two extra attributes: - the path_or_url it was created from @@ -1473,29 +1442,24 @@ def dists_from_paths_or_urls(cls, paths_or_urls): ... bitarray-0.8.1-cp36-cp36m-macosx_10_9_x86_64.macosx_10_10_x86_64.whl ... bitarray-0.8.1-cp36-cp36m-win_amd64.whl ... https://example.com/bar/bitarray-0.8.1.tar.gz - ... bitarray-0.8.1.tar.gz.ABOUT bit.LICENSE'''.split() - >>> result = list(PypiPackage.dists_from_paths_or_urls(paths_or_urls)) + ... bitarray-0.8.1.tar.gz.ABOUT + ... bit.LICENSE'''.split() + >>> results = list(PypiPackage.dists_from_paths_or_urls(paths_or_urls)) >>> for r in results: - ... r.filename = '' - ... 
r.path_or_url = '' - >>> expected = [ - ... Wheel(name='bitarray', version='0.8.1', build='', - ... python_versions=['cp36'], abis=['cp36m'], - ... platforms=['linux_x86_64']), - ... Wheel(name='bitarray', version='0.8.1', build='', - ... python_versions=['cp36'], abis=['cp36m'], - ... platforms=['macosx_10_9_x86_64', 'macosx_10_10_x86_64']), - ... Wheel(name='bitarray', version='0.8.1', build='', - ... python_versions=['cp36'], abis=['cp36m'], - ... platforms=['win_amd64']), - ... Sdist(name='bitarray', version='0.8.1'), - ... Sdist(name='bitarray', version='0.8.1') - ... ] - >>> assert expected == result + ... print(r.__class__.__name__, r.name, r.version) + ... if isinstance(r, Wheel): + ... print(" ", ", ".join(r.python_versions), ", ".join(r.platforms)) + Wheel bitarray 0.8.1 + cp36 linux_x86_64 + Wheel bitarray 0.8.1 + cp36 macosx_10_9_x86_64, macosx_10_10_x86_64 + Wheel bitarray 0.8.1 + cp36 win_amd64 + Sdist bitarray 0.8.1 """ dists = [] - if TRACE_DEEP: - print(" ###paths_or_urls:", paths_or_urls) + if TRACE_ULTRA_DEEP: + print(" ###paths_or_urls:", paths_or_urls) installable = [f for f in paths_or_urls if f.endswith(EXTENSIONS_INSTALLABLE)] for path_or_url in installable: try: @@ -1503,7 +1467,14 @@ def dists_from_paths_or_urls(cls, paths_or_urls): dists.append(dist) if TRACE_DEEP: print( - " ===> dists_from_paths_or_urls:", dist, "with URL:", dist.download_url + " ===> dists_from_paths_or_urls:", + dist, + "\n ", + "with URL:", + dist.download_url, + "\n ", + "from URL:", + path_or_url, ) except InvalidDistributionFilename: if TRACE_DEEP: @@ -1639,98 +1610,107 @@ class PypiSimpleRepository: metadata=dict(help="Base PyPI simple URL for this index."), ) - packages_by_normalized_name = attr.ib( + # we keep a nested mapping of PypiPackage that has this shape: + # {name: {version: PypiPackage, version: PypiPackage, etc} + # the inner versions mapping is sorted by version from oldest to newest + + packages = attr.ib( type=dict, - default=attr.Factory(lambda: defaultdict(list)), - metadata=dict(help="Mapping of {package name: [package objects]} available in this repo"), + default=attr.Factory(lambda: defaultdict(dict)), + metadata=dict( + help="Mapping of {name: {version: PypiPackage, version: PypiPackage, etc} available in this repo" + ), ) - packages_by_normalized_name_version = attr.ib( - type=dict, - default=attr.Factory(dict), - metadata=dict(help="Mapping of {(name, version): package object} available in this repo"), + fetched_package_normalized_names = attr.ib( + type=set, + default=attr.Factory(set), + metadata=dict(help="A set of already fetched package normalized names."), ) - def get_versions(self, name): + use_cached_index = attr.ib( + type=bool, + default=False, + metadata=dict( + help="If True, use any existing on-disk cached PyPI index files. Otherwise, fetch and cache." + ), + ) + + def _get_package_versions_map(self, name): """ - Return a list of all available PypiPackage version for this package name. - The list may be empty. + Return a mapping of all available PypiPackage version for this package name. + The mapping may be empty. 
+        It is ordered by version from oldest to newest.
         """
-        name = name and NameVer.normalize_name(name)
-        self._populate_links_and_packages(name)
-        return self.packages_by_normalized_name.get(name, [])
+        assert name
+        normalized_name = NameVer.normalize_name(name)
+        versions = self.packages[normalized_name]
+        if not versions and normalized_name not in self.fetched_package_normalized_names:
+            self.fetched_package_normalized_names.add(normalized_name)
+            try:
+                links = self.fetch_links(normalized_name=normalized_name)
+                # note that this is sorted, so the resulting mapping is sorted too
+                versions = {
+                    package.version: package
+                    for package in PypiPackage.packages_from_many_paths_or_urls(paths_or_urls=links)
+                }
+                self.packages[normalized_name] = versions
+            except RemoteNotFetchedException as e:
+                if TRACE:
+                    print(f"failed to fetch package name: {name} from: {self.index_url}:\n{e}")

-    def get_latest_version(self, name):
+        if not versions and TRACE:
+            print(f"WARNING: package {name} not found in repo: {self.index_url}")
+
+        return versions
+
+    def get_package_versions(self, name):
         """
-        Return the latest PypiPackage version for this package name or None.
+        Return a mapping of all available PypiPackage versions as {version:
+        package} for this package name. The mapping may be empty but not None.
+        It is sorted by version from oldest to newest.
         """
-        versions = self.get_versions(name)
-        return PypiPackage.get_latest_version(name, versions)
+        return dict(self._get_package_versions_map(name))

-    def get_package(self, name, version):
+    def get_package_version(self, name, version=None):
         """
         Return the PypiPackage with name and version or None.
+        Return the latest PypiPackage version if version is None.
         """
-        versions = self.get_versions(name)
-        if TRACE_DEEP:
-            print("PypiPackage.get_package:versions:", versions)
-        return PypiPackage.get_name_version(name, version, versions)
+        if not version:
+            versions = list(self._get_package_versions_map(name).values())
+            return versions[-1] if versions else None
+        else:
+            return self._get_package_versions_map(name).get(version)

-    def _fetch_links(self, name, _LINKS={}):
+    def fetch_links(self, normalized_name):
         """
         Return a list of download link URLs found in a PyPI simple index for
         package name using the `index_url` of this repository.
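+
+        For example, with a (hypothetical) normalized_name of "commoncode",
+        this collects the distribution download URLs found in the simple index
+        page at self.index_url + "/commoncode".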
""" - name = name and NameVer.normalize_name(name) - index_url = self.index_url - - name = name and NameVer.normalize_name(name) - index_url = index_url.strip("/") - index_url = f"{index_url}/{name}" - - if TRACE_DEEP: - print( - f" Finding links for {name!r} from PyPI index: {index_url} : cached?:", - index_url in _LINKS, - ) - - if index_url not in _LINKS: - text = fetch_content_from_path_or_url_through_cache(path_or_url=index_url, as_text=True) - links = collect_urls(text) - # TODO: keep sha256 - links = [l.partition("#sha256=") for l in links] - links = [url for url, _, _sha256 in links] - _LINKS[index_url] = [l for l in links if l.endswith(EXTENSIONS)] - - links = _LINKS[index_url] - if TRACE_DEEP: - print(f" Found links {links!r}") + package_url = f"{self.index_url}/{normalized_name}" + text = CACHE.get( + path_or_url=package_url, + as_text=True, + force=not self.use_cached_index, + ) + links = collect_urls(text) + # TODO: keep sha256 + links = [l.partition("#sha256=") for l in links] + links = [url for url, _, _sha256 in links] return links - def _populate_links_and_packages(self, name): - name = name and NameVer.normalize_name(name) - - if TRACE_DEEP: - print("PypiPackage._populate_links_and_packages:name:", name) - - links = self._fetch_links(name) - packages = list(PypiPackage.packages_from_many_paths_or_urls(paths_or_urls=links)) - if TRACE_DEEP: - print("PypiPackage._populate_links_and_packages:packages:", packages) - - self.packages_by_normalized_name[name] = packages - - for p in packages: - name = name and NameVer.normalize_name(p.name) - self.packages_by_normalized_name_version[(name, p.version)] = p +PYPI_PUBLIC_REPO = PypiSimpleRepository(index_url=PYPI_SIMPLE_URL) +PYPI_SELFHOSTED_REPO = PypiSimpleRepository(index_url=ABOUT_PYPI_SIMPLE_URL) +DEFAULT_PYPI_REPOS = PYPI_PUBLIC_REPO, PYPI_SELFHOSTED_REPO +DEFAULT_PYPI_REPOS_BY_URL = {r.index_url: r for r in DEFAULT_PYPI_REPOS} @attr.attributes class LinksRepository: """ - Represents a simple links repository such an HTTP directory listing or a - page with links. + Represents a simple links repository such an HTTP directory listing or an + HTML page with links. """ url = attr.ib( @@ -1745,14 +1725,25 @@ class LinksRepository: metadata=dict(help="List of links available in this repo"), ) + use_cached_index = attr.ib( + type=bool, + default=False, + metadata=dict( + help="If True, use any existing on-disk cached index files. Otherwise, fetch and cache." 
+ ), + ) + def __attrs_post_init__(self): if not self.links: self.links = self.find_links() - def find_links(self): + def find_links(self, _CACHE=[]): """ Return a list of link URLs found in the HTML page at `self.url` """ + if _CACHE: + return _CACHE + links_url = self.url if TRACE_DEEP: print(f"Finding links from: {links_url}") @@ -1764,9 +1755,10 @@ def find_links(self): if TRACE_DEEP: print(f"Base URL {base_url}") - text = fetch_content_from_path_or_url_through_cache( + text = CACHE.get( path_or_url=links_url, as_text=True, + force=not self.use_cached_index, ) links = [] @@ -1795,12 +1787,13 @@ def find_links(self): if TRACE: print(f"Found {len(links)} links at {links_url}") + _CACHE.extend(links) return links @classmethod - def from_url(cls, url=ABOUT_BASE_URL, _LINKS_REPO={}): + def from_url(cls, url=ABOUT_BASE_URL, _LINKS_REPO={}, use_cached_index=False): if url not in _LINKS_REPO: - _LINKS_REPO[url] = cls(url=url) + _LINKS_REPO[url] = cls(url=url, use_cached_index=use_cached_index) return _LINKS_REPO[url] @@ -1818,26 +1811,6 @@ def get_local_packages(directory=THIRDPARTY_DIR): return list(PypiPackage.packages_from_dir(directory=directory)) -def get_pypi_repo(index_url, _PYPI_REPO={}): - if index_url not in _PYPI_REPO: - _PYPI_REPO[index_url] = PypiSimpleRepository(index_url=index_url) - return _PYPI_REPO[index_url] - - -def get_pypi_package(name, version, index_url, verbose=TRACE_DEEP): - """ - Return a PypiPackage or None. - """ - try: - package = get_pypi_repo(index_url).get_package(name, version) - if verbose: - print(f" get_pypi_package: {name} @ {version} info from {index_url}: {package}") - return package - - except RemoteNotFetchedException as e: - print(f"Failed to fetch PyPI package {name} @ {version} info from {index_url}: {e}") - - ################################################################################ # # Basic file and URL-based operations using a persistent file-based Cache @@ -1857,34 +1830,40 @@ class Cache: def __attrs_post_init__(self): os.makedirs(self.directory, exist_ok=True) - def clear(self): - shutil.rmtree(self.directory) - - def get(self, path_or_url, as_text=True): + def get(self, path_or_url, as_text=True, force=False): """ - Get a file from a `path_or_url` through the cache. - `path_or_url` can be a path or a URL to a file. + Return the content fetched from a ``path_or_url`` through the cache. + Raise an Exception on errors. Treat the content as text if as_text is + True, otherwise as binary. `path_or_url` can be a path or a URL + to a file. """ cache_key = quote_plus(path_or_url.strip("/")) cached = os.path.join(self.directory, cache_key) - if not os.path.exists(cached): + if force or not os.path.exists(cached): + if TRACE_DEEP: + print(f" FILE CACHE MISS: {path_or_url}") content = get_file_content(path_or_url=path_or_url, as_text=as_text) wmode = "w" if as_text else "wb" with open(cached, wmode) as fo: fo.write(content) return content else: + if TRACE_DEEP: + print(f" FILE CACHE HIT: {path_or_url}") return get_local_file_content(path=cached, as_text=as_text) +CACHE = Cache() + + def get_file_content(path_or_url, as_text=True): """ Fetch and return the content at `path_or_url` from either a local path or a remote URL. Return the content as bytes if `as_text` is False.
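+
+    For example, a minimal sketch (works the same for a local path or an
+    https:// URL):
+
+        text = get_file_content("setup.cfg", as_text=True)
+        data = get_file_content("setup.cfg", as_text=False)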
""" if path_or_url.startswith("https://"): - if TRACE: + if TRACE_DEEP: print(f"Fetching: {path_or_url}") _headers, content = get_remote_file_content(url=path_or_url, as_text=as_text) return content @@ -1936,7 +1915,7 @@ def get_remote_file_content( # using a GET with stream=True ensure we get the the final header from # several redirects and that we can ignore content there. A HEAD request may # not get us this last header - print(f" DOWNLOADING {url}") + print(f" DOWNLOADING: {url}") with requests.get(url, allow_redirects=True, stream=True, headers=headers) as response: status = response.status_code if status != requests.codes.ok: # NOQA @@ -1960,35 +1939,19 @@ def get_remote_file_content( return response.headers, response.text if as_text else response.content -def fetch_content_from_path_or_url_through_cache( +def fetch_and_save( path_or_url, - as_text=True, - cache=Cache(), -): - """ - Return the content from fetching at path or URL. Raise an Exception on - errors. Treats the content as text if as_text is True otherwise as treat as - binary. Use the provided file cache. This is the main entry for using the - cache. - - Note: the `cache` argument is a global, though it does not really matter - since it does not hold any state which is only kept on disk. - """ - return cache.get(path_or_url=path_or_url, as_text=as_text) - - -def fetch_and_save_path_or_url( - filename, dest_dir, - path_or_url, + filename, as_text=True, ): """ - Return the content from fetching the `filename` file name at URL or path - and save to `dest_dir`. Raise an Exception on errors. Treats the content as - text if as_text is True otherwise as treat as binary. + Fetch content at ``path_or_url`` URL or path and save this to + ``dest_dir/filername``. Return the fetched content. Raise an Exception on + errors. Treats the content as text if as_text is True otherwise as treat as + binary. """ - content = fetch_content_from_path_or_url_through_cache( + content = CACHE.get( path_or_url=path_or_url, as_text=as_text, ) @@ -2000,45 +1963,9 @@ def fetch_and_save_path_or_url( ################################################################################ -# Requirements processing -################################################################################ - - -def get_required_remote_packages( - requirements_file="requirements.txt", - index_url=PYPI_SIMPLE_URL, -): - """ - Yield tuple of (name, version, PypiPackage) for packages listed in the - `requirements_file` requirements file and found in the PyPI index - ``index_url`` URL. - """ - required_name_versions = load_requirements(requirements_file=requirements_file) - return get_required_packages(required_name_versions=required_name_versions, index_url=index_url) - - -def get_required_packages( - required_name_versions, - index_url=PYPI_SIMPLE_URL, -): - """ - Yield tuple of (name, version) or a PypiPackage for package name/version - listed in the ``required_name_versions`` list and found in the PyPI index - ``index_url`` URL. 
- """ - if TRACE: - print("get_required_packages", index_url) - - repo = get_pypi_repo(index_url=index_url) - - for name, version in required_name_versions: - if TRACE: - print(" get_required_packages: name:", name, "version:", version) - yield repo.get_package(name, version) - - -################################################################################ +# # Functions to update or fetch ABOUT and license files +# ################################################################################ @@ -2059,7 +1986,7 @@ def clean_about_files( local_dist.save_about_and_notice_files(dest_dir) -def fetch_abouts_and_licenses(dest_dir=THIRDPARTY_DIR): +def fetch_abouts_and_licenses(dest_dir=THIRDPARTY_DIR, use_cached_index=False): """ Given a thirdparty dir, add missing ABOUT. LICENSE and NOTICE files using best efforts: @@ -2069,6 +1996,8 @@ def fetch_abouts_and_licenses(dest_dir=THIRDPARTY_DIR): - derive from existing distribution with same name and latest version that would have such ABOUT file - extract ABOUT file data from distributions PKGINFO or METADATA files + + Use available existing on-disk cached index if use_cached_index is True. """ def get_other_dists(_package, _dist): @@ -2078,7 +2007,6 @@ def get_other_dists(_package, _dist): """ return [d for d in _package.get_distributions() if d != _dist] - selfhosted_repo = get_pypi_repo(index_url=ABOUT_PYPI_SIMPLE_URL) local_packages = get_local_packages(directory=dest_dir) packages_by_name = defaultdict(list) for local_package in local_packages: @@ -2094,7 +2022,7 @@ def get_other_dists(_package, _dist): # if has key data we may look to improve later, but we can move on if local_dist.has_key_metadata(): local_dist.save_about_and_notice_files(dest_dir=dest_dir) - local_dist.fetch_license_files(dest_dir=dest_dir) + local_dist.fetch_license_files(dest_dir=dest_dir, use_cached_index=use_cached_index) continue # lets try to get from another dist of the same local package @@ -2106,7 +2034,7 @@ def get_other_dists(_package, _dist): # if has key data we may look to improve later, but we can move on if local_dist.has_key_metadata(): local_dist.save_about_and_notice_files(dest_dir=dest_dir) - local_dist.fetch_license_files(dest_dir=dest_dir) + local_dist.fetch_license_files(dest_dir=dest_dir, use_cached_index=use_cached_index) continue # try to get another version of the same package that is not our version @@ -2115,7 +2043,6 @@ def get_other_dists(_package, _dist): for p in packages_by_name[local_package.name] if p.version != local_package.version ] - other_local_version = other_local_packages and other_local_packages[-1] if other_local_version: latest_local_dists = list(other_local_version.get_distributions()) @@ -2133,7 +2060,9 @@ def get_other_dists(_package, _dist): # if has key data we may look to improve later, but we can move on if local_dist.has_key_metadata(): local_dist.save_about_and_notice_files(dest_dir=dest_dir) - local_dist.fetch_license_files(dest_dir=dest_dir) + local_dist.fetch_license_files( + dest_dir=dest_dir, use_cached_index=use_cached_index + ) continue # lets try to fetch remotely @@ -2142,14 +2071,16 @@ def get_other_dists(_package, _dist): # if has key data we may look to improve later, but we can move on if local_dist.has_key_metadata(): local_dist.save_about_and_notice_files(dest_dir=dest_dir) - local_dist.fetch_license_files(dest_dir=dest_dir) + local_dist.fetch_license_files(dest_dir=dest_dir, use_cached_index=use_cached_index) continue # try to get a latest version of the same package that is not our version + # 
and that is in our self hosted repo + lpv = local_package.version + lpn = local_package.name + other_remote_packages = [ - p - for p in selfhosted_repo.get_versions(local_package.name) - if p.version != local_package.version + p for v, p in PYPI_SELFHOSTED_REPO.get_package_versions(lpn).items() if v != lpv ] latest_version = other_remote_packages and other_remote_packages[-1] @@ -2169,7 +2100,9 @@ def get_other_dists(_package, _dist): # if has key data we may look to improve later, but we can move on if local_dist.has_key_metadata(): local_dist.save_about_and_notice_files(dest_dir=dest_dir) - local_dist.fetch_license_files(dest_dir=dest_dir) + local_dist.fetch_license_files( + dest_dir=dest_dir, use_cached_index=use_cached_index + ) continue # try to get data from pkginfo (no license though) @@ -2179,7 +2112,7 @@ def get_other_dists(_package, _dist): # if local_dist.has_key_metadata() or not local_dist.has_key_metadata(): local_dist.save_about_and_notice_files(dest_dir) - lic_errs = local_dist.fetch_license_files(dest_dir) + lic_errs = local_dist.fetch_license_files(dest_dir, use_cached_index=use_cached_index) if not local_dist.has_key_metadata(): print(f"Unable to add essential ABOUT data for: {local_dist}") @@ -2292,65 +2225,16 @@ def download_wheels_with_pip( return sorted(downloaded), error -def build_wheels_locally_if_pure_python( - requirements_specifier, - with_deps=False, - verbose=False, - dest_dir=THIRDPARTY_DIR, -): - """ - Given pip `requirements_specifier` string (such as package names or as - name==version), build the corresponding binary wheel(s) locally. - - If all these are "pure" Python wheels that run on all Python 3 versions and - operating systems, copy them back in `dest_dir` if they do not exists there - - Return a tuple of (True if all wheels are "pure", list of built wheel file names) - """ - deps = [] if with_deps else ["--no-deps"] - verbose = ["--verbose"] if verbose else [] - - wheel_dir = tempfile.mkdtemp(prefix="scancode-release-wheels-local-") - cli_args = ( - [ - "pip", - "wheel", - "--wheel-dir", - wheel_dir, - ] - + deps - + verbose - + [requirements_specifier] - ) - - print(f"Building local wheels for: {requirements_specifier}") - print(f"Using command:", " ".join(cli_args)) - call(cli_args) - - built = os.listdir(wheel_dir) - if not built: - return [] - - all_pure = all(is_pure_wheel(bwfn) for bwfn in built) - - if not all_pure: - print(f" Some wheels are not pure") - - print(f" Copying local wheels") - pure_built = [] - for bwfn in built: - owfn = os.path.join(dest_dir, bwfn) - if not os.path.exists(owfn): - nwfn = os.path.join(wheel_dir, bwfn) - fileutils.copyfile(nwfn, owfn) - pure_built.append(bwfn) - print(f" Built local wheel: {bwfn}") - return all_pure, pure_built +################################################################################ +# +# Functions to check for problems +# +################################################################################ def check_about(dest_dir=THIRDPARTY_DIR): try: - subprocess.check_output(f"about check {dest_dir}".split()) + subprocess.check_output(f"venv/bin/about check {dest_dir}".split()) except subprocess.CalledProcessError as cpe: print() print("Invalid ABOUT files:") diff --git a/requirements-dev.txt b/requirements-dev.txt index e69de29..5a2b695 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -0,0 +1,24 @@ +aboutcode-toolkit==7.0.2 +bleach==4.1.0 +build==0.7.0 +commonmark==0.9.1 +docutils==0.18.1 +et-xmlfile==1.1.0 +execnet==1.9.0 +iniconfig==1.1.1 +jeepney==0.7.1 
+keyring==23.4.1 +openpyxl==3.0.9 +pep517==0.12.0 +pkginfo==1.8.2 +py==1.11.0 +pytest==7.0.1 +pytest-forked==1.4.0 +pytest-xdist==2.5.0 +readme-renderer==34.0 +requests-toolbelt==0.9.1 +rfc3986==1.5.0 +rich==12.3.0 +secretstorage==3.3.2 +tomli==1.2.3 +twine==3.8.0 diff --git a/requirements.txt b/requirements.txt index 3011e68..1d23a05 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,16 +1,79 @@ -attrs==21.2.0 -beautifulsoup4==4.9.3 -certifi==2021.5.30 +attrs==21.4.0 +banal==1.0.6 +beautifulsoup4==4.11.1 +binaryornot==0.4.4 +boolean.py==4.0 +certifi==2021.10.8 +cffi==1.15.0 chardet==4.0.0 -click==8.0.1 -idna==2.10 -intbitset==2.4.1 -PyYAML==5.4.1 -requests==2.25.1 +charset-normalizer==2.0.12 +click==8.0.4 +colorama==0.4.4 +construct==2.10.68 +container-inspector==31.0.0 +cryptography==36.0.2 +debian-inspector==30.0.0 +dockerfile-parse==1.2.0 +dparse2==0.6.1 +extractcode==31.0.0 +extractcode-7z==16.5.210531 +extractcode-libarchive==3.5.1.210531 +fasteners==0.17.3 +fingerprints==1.0.3 +ftfy==6.0.3 +future==0.18.2 +gemfileparser==0.8.0 +html5lib==1.1 +idna==3.3 +importlib-metadata==4.8.3 +inflection==0.5.1 +intbitset==3.0.1 +isodate==0.6.1 +jaraco.functools==3.4.0 +javaproperties==0.8.1 +Jinja2==3.0.3 +jsonstreams==0.6.0 +license-expression==30.0.0 +lxml==4.8.0 +MarkupSafe==2.0.1 +more-itertools==8.13.0 +normality==2.3.3 +packagedcode-msitools==0.101.210706 +packageurl-python==0.9.9 +packaging==21.3 +parameter-expansion-patched==0.3.1 +patch==1.16 +pdfminer-six==20220506 +pefile==2021.9.3 +pip-requirements-parser==31.2.0 +pkginfo2==30.0.0 +pluggy==1.0.0 +plugincode==30.0.0 +ply==3.11 +publicsuffix2==2.20191221 +pyahocorasick==2.0.0b1 +pycparser==2.21 +pygmars==0.7.0 +Pygments==2.12.0 +pymaven-patch==0.3.0 +pyparsing==3.0.8 +pytz==2022.1 +PyYAML==6.0 +rdflib==5.0.0 +regipy==2.3.1 +requests==2.27.1 +rpm-inspector-rpm==4.16.1.3.210404 saneyaml==0.5.2 six==1.16.0 -soupsieve==2.2.1 +soupsieve==2.3.1 +spdx-tools==0.7.0a3 text-unidecode==1.3 -typing==3.6.6 -urllib3==1.26.5 - +toml==0.10.2 +typecode==30.0.0 +typecode-libmagic==5.39.210531 +urllib3==1.26.9 +urlpy==0.5 +wcwidth==0.2.5 +webencodings==0.5.1 +xmltodict==0.12.0 +zipp==3.6.0 diff --git a/setup.cfg b/setup.cfg index eeb7c79..02788ee 100644 --- a/setup.cfg +++ b/setup.cfg @@ -14,6 +14,7 @@ license = Apache-2.0 # description must be on ONE line https://github.com/pypa/setuptools/issues/1390 description = Set of common utilities, originally split from ScanCode long_description = file:README.rst +long_description_content_type = text/x-rst url = https://github.com/nexB/commoncode classifiers = Development Status :: 5 - Production/Stable @@ -46,8 +47,6 @@ install_requires = text_unidecode >= 1.0 typing >=3.6, < 3.7; python_version < "3.7" -setup_requires = setuptools_scm[toml] >= 4 - [options.packages.find] where = src diff --git a/src/commoncode/fileutils.py b/src/commoncode/fileutils.py index 525e915..584b7cc 100644 --- a/src/commoncode/fileutils.py +++ b/src/commoncode/fileutils.py @@ -206,14 +206,15 @@ def file_name(path, force_posix=False): return resource_name(path, force_posix) -def parent_directory(path, force_posix=False): +def parent_directory(path, force_posix=False, with_trail=True): """ Return the parent directory path of a file or directory `path`. + The returned directory has a trailing path separator unless with_trail is False. 
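+
+    For example, a sketch of the expected behavior:
+
+        >>> parent_directory('/a/b/c')
+        '/a/b/'
+        >>> parent_directory('/a/b/c', with_trail=False)
+        '/a/b'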
""" left, _right = split_parent_resource(path, force_posix) use_posix = force_posix or is_posixpath(path) sep = '/' if use_posix else '\\' - trail = sep if left != sep else '' + trail = sep if with_trail and left != sep else '' return left + trail diff --git a/src/commoncode/resource.py b/src/commoncode/resource.py index d7ef484..8fecdc1 100644 --- a/src/commoncode/resource.py +++ b/src/commoncode/resource.py @@ -6,51 +6,48 @@ # See https://aboutcode.org for more information about nexB OSS projects. # -import io import json import os -import posixpath -import traceback import sys -from pathlib import Path - +import traceback from collections import deque from functools import partial +from hashlib import md5 +from operator import itemgetter from os import walk as os_walk from os.path import abspath from os.path import exists from os.path import expanduser +from os.path import isfile from os.path import join from os.path import normpath +from posixpath import join as posixpath_join +from posixpath import normpath as posixpath_normpath +from posixpath import dirname as posixpath_parent import attr -from intbitset import intbitset try: from scancode_config import scancode_temp_dir as temp_dir except ImportError: # alway have something there. import tempfile + temp_dir = tempfile.mkdtemp(prefix='scancode-resource-cache') +from commoncode import ignore from commoncode.datautils import List from commoncode.datautils import Mapping from commoncode.datautils import String - from commoncode.filetype import is_file as filetype_is_file from commoncode.filetype import is_special - from commoncode.fileutils import as_posixpath from commoncode.fileutils import create_dir from commoncode.fileutils import delete -from commoncode.fileutils import file_base_name from commoncode.fileutils import file_name from commoncode.fileutils import parent_directory from commoncode.fileutils import splitext_name -from commoncode import ignore -from commoncode import paths - """ This module provides Codebase and Resource objects as an abstraction for files and directories used throughout ScanCode. ScanCode deals with a lot of these as @@ -76,13 +73,11 @@ def logger_debug(*args): import logging logger = logging.getLogger(__name__) - # logging.basicConfig(level=logging.DEBUG, stream=sys.stdout) logging.basicConfig(stream=sys.stdout) logger.setLevel(logging.DEBUG) def logger_debug(*args): - return logger.debug( - ' '.join(isinstance(a, str) and a or repr(a) for a in args)) + return logger.debug(' '.join(isinstance(a, str) and a or repr(a) for a in args)) class ResourceNotInCache(Exception): @@ -93,32 +88,45 @@ class UnknownResource(Exception): pass -def skip_ignored(_loc): - """Always ignore VCS and some special filetypes.""" +def skip_ignored(location): + """ + Return True if ``location`` should be skipped. + Always ignore VCS and some special filetypes. 
+ """ ignored = partial(ignore.is_ignored, ignores=ignore.ignores_VCS) if TRACE_DEEP: logger_debug() - logger_debug('Codebase.populate: walk: ignored loc:', _loc, - 'ignored:', ignored(_loc), - 'is_special:', is_special(_loc)) + logger_debug( + 'Codebase.populate: walk: ignored loc:', + location, + 'ignored:', + ignored(location), + 'is_special:', + is_special(location), + ) - return is_special(_loc) or ignored(_loc) + return is_special(location) or ignored(location) -def depth_walk(root_location, max_depth, error_handler=lambda:None): +def depth_walk( + root_location, + max_depth, + skip_ignored=skip_ignored, + error_handler=lambda: None, +): """ - Yield a (top, dirs, files) tuple at each step of walking the `root_location` directory - recursively up to `max_depth` path segments extending from the `root_location`. - The behaviour is similar of `os.walk`. + Yield a (top, dirs, files) tuple at each step of walking the ``root_location`` + directory recursively up to ``max_depth`` path segments extending from the + ``root_location``. The behaviour is similar of ``os.walk``. Arguments: - - root_location: Absolute, normalized path for the directory to be walked - - max_depth: positive integer for fixed depth limit. 0 for no limit. - - skip_ignored: Callback function that takes `top` as argument and returns a boolean - indicating whether to ignore files in that location. No ignoring - by default. - - error_handler: Error handler callback. No action taken by default. + + - root_location: Absolute, normalized path for the directory to be walked + - max_depth: positive integer for fixed depth limit. 0 for no limit. + - skip_ignored: Callback function that takes a location as argument and + returns a boolean indicating whether to ignore files in that location. + - error_handler: Error handler callback. No action taken by default. """ if max_depth < 0: @@ -139,22 +147,23 @@ def depth_walk(root_location, max_depth, error_handler=lambda:None): dirs[:] = [] files[:] = [] continue - yield (top, dirs, files) + yield top, dirs, files @attr.s(slots=True) class Header(object): """ - Represent a codebase header. Each tool that transforms the codebase - should create a Header and append it to the codebase log_entries list. + Represent a Codebase header. Each tool that transforms the codebase + should create a Header and append it to the Codebase.headers list. """ + tool_name = String(help='Name of the tool used such as scancode-toolkit.') tool_version = String(default='', help='Tool version used such as v1.2.3.') options = Mapping(help='Mapping of key/values describing the options used with this tool.') notice = String(default='', help='Notice text for this tool.') start_timestamp = String(help='Start timestamp for this header.') end_timestamp = String(help='End timestamp for this header.') - output_format_version = String(help='Version for the scancode output data format, such as v1.1 .') + output_format_version = String(help='Version for the output data format, such as v1.1 .') duration = String(help='Scan duration in seconds.') message = String(help='Message text.') errors = List(help='List of error messages.') @@ -170,26 +179,8 @@ def from_dict(cls, **kwargs): Return a Header object deserialized from a `kwargs` mapping of key/values. Unknown attributes are ignored. 
""" - known_attributes = set([ - 'tool_name', - 'tool_version', - 'options', - 'notice', - 'start_timestamp', - 'end_timestamp', - 'output_format_version', - 'duration', - 'message', - 'errors', - 'warnings', - 'extra_data', - ]) - - # pop unknowns - for kwarg in list(kwargs.keys()): - if kwarg not in known_attributes: - kwargs.pop(kwarg) - + known_attributes = set(attr.fields_dict(Header)) + kwargs = {k: v for k, v in kwargs.items() if k in known_attributes} return cls(**kwargs) @@ -203,115 +194,120 @@ def ignore_nothing(resource, codebase): return False -class Codebase(object): +class Codebase: """ - Represent a codebase being scanned. A Codebase is a tree of Resources. + Represent a codebase being scanned. A Codebase is a list of Resources. """ # we do not really need slots but this is a way to ensure we have tight # control on object attributes __slots__ = ( - 'original_location', - 'full_root', - 'strip_root', 'max_depth', 'location', 'has_single_resource', 'resource_attributes', 'resource_class', - 'resource_ids', 'root', 'is_file', - 'temp_dir', - - 'resources', + 'resources_by_path', + 'resources_count', + 'paths', 'max_in_memory', 'all_in_memory', 'all_on_disk', 'cache_dir', - 'headers', 'current_header', - 'codebase_attributes', 'attributes', - 'counters', 'timings', 'errors', ) - def __init__(self, location, - resource_attributes=None, - codebase_attributes=None, - full_root=False, strip_root=False, - temp_dir=temp_dir, - max_in_memory=10000, max_depth=0): - """ - Initialize a new codebase rooted at the `location` existing file or + # the value returned if the resource is cached + CACHED_RESOURCE = 1 + + def __init__( + self, + location, + resource_attributes=None, + codebase_attributes=None, + temp_dir=temp_dir, + max_in_memory=10000, + max_depth=0, + paths=tuple(), + *args, + **kwargs, + ): + """ + Initialize a new codebase rooted at the ``location`` existing file or directory. - `resource_attributes` is an ordered mapping of attr Resource attributes + Use an optional list of ``paths`` strings that are paths relative to the + root ``location`` such that joining the root ``location`` and such a + path is the ``location`` of this path. If these ``paths`` are provided, + the codebase will only contain these paths and no other path. + + ``resource_attributes`` is an ordered mapping of attr Resource attributes such as plugin-provided attributes: these will be added to a Resource sub-class crafted for this codebase. - `codebase_attributes` is an ordered mapping of attr Codebase attributes + ``codebase_attributes`` is an ordered mapping of attr Codebase attributes such as plugin-provided attributes: these will be added to a CodebaseAttributes sub-class crafted for this codebase. - `strip_root` and `full_root`: boolean flags: these control the values - of the path attribute of the codebase Resources. These are mutually - exclusive. - If `strip_root` is True, strip the first `path` segment of a Resource - unless the codebase contains a single root Resource. - If `full_root` is True the path is an an absolute path. - - `temp_dir` is the base temporary directory to use to cache resources on + ``temp_dir`` is the base temporary directory to use to cache resources on disk and other temporary files. - `max_in_memory` is the maximum number of Resource instances to keep in + ``max_in_memory`` is the maximum number of Resource instances to keep in memory. Beyond this number, Resource are saved on disk instead. -1 means no memory is used and 0 means unlimited memory is used. 
- `max_depth` is the maximum depth of subdirectories to descend below and + ``max_depth`` is the maximum depth of subdirectories to descend below and including `location`. """ - self.original_location = location - self.full_root = full_root - self.strip_root = strip_root self.max_depth = max_depth # Resource sub-class to use: Configured with attributes in _populate self.resource_class = Resource - self.resource_attributes = resource_attributes or dict() - self.codebase_attributes = codebase_attributes or dict() + self.resource_attributes = resource_attributes or {} + self.codebase_attributes = codebase_attributes or {} # setup location ######################################################################## location = os.fsdecode(location) - location = abspath(normpath(expanduser(location))) location = location.rstrip('/\\') - - # TODO: we should also accept to create "virtual" codebase without a - # backing filesystem location + # TODO: what if is_special(location)??? assert exists(location) - # FIXME: what if is_special(location)??? self.location = location + self.is_file = filetype_is_file(location) # True if this codebase root is a file or an empty directory. self.has_single_resource = bool(self.is_file or not os.listdir(location)) + ######################################################################## # Set up caching, summary, timing, and error info self._setup_essentials(temp_dir, max_in_memory) - # finally walk the location and populate - ######################################################################## + # finally populate + self.paths = self._prepare_clean_paths(paths) self._populate() + def _prepare_clean_paths(self, paths=tuple()): + """ + Return a new sorted list of cleaned ``paths``, possibly empty. + We convert to POSIX and ensure there is no slash at either end. + """ + paths = (clean_path(p) for p in (paths or []) if p) + # we sort by path segments (e.g. essentially a topo sort) + _sorter = lambda p: p.split('/') + return sorted(paths, key=_sorter) + def _setup_essentials(self, temp_dir=temp_dir, max_in_memory=10000): """ Set the remaining Codebase attributes @@ -329,27 +325,26 @@ def _setup_essentials(self, temp_dir=temp_dir, max_in_memory=10000): # root resource, never cached on disk self.root = None - # set index of existing resource ids ints, initially allocated with - # 10000 positions (this will grow as needed) - self.resource_ids = intbitset(10000) + # mapping of {path: Resource}. This is the key data structure of a Codebase. + # All resources MUST exist there. When cached to disk the value is CACHED_RESOURCE + self.resources_by_path = {} + self.resources_count = 0 # setup caching ######################################################################## # dir used for caching and other temp files self.temp_dir = temp_dir - # maximmum number of Resource objects kept in memory cached in this + # maximum number of Resource objects kept in memory cached in this # Codebase. When the number of in-memory Resources exceeds this number, # the next Resource instances are saved to disk instead and re-loaded # from disk when used/needed. self.max_in_memory = max_in_memory - - # map of {rid: resource} for resources that are kept in memory - self.resources = {} # use only memory self.all_in_memory = max_in_memory == 0 # use only disk self.all_on_disk = max_in_memory == -1 + # dir where the on-disk cache is stored self.cache_dir = None if not self.all_in_memory: @@ -375,89 +370,213 @@ def _setup_essentials(self, temp_dir=temp_dir, max_in_memory=10000): # unreadable file, etc).
self.errors = [] - def _get_next_rid(self): - """ - Return the next available resource id. - """ - return len(self.resource_ids) - - def _get_resource_cache_location(self, rid, create=False): + def _get_resource_cache_location(self, path, create_dirs=False): """ Return the location where to get/put a Resource in the cache given a - Resource `rid`. Create the directories if requested. + Resource `path`. Create the directories if requested. """ if not self.cache_dir: return - # Note this is hex - resid = '%08x' % rid + + if isinstance(path, Resource): + path = path.path + + path = clean_path(path) + + # for the cached file name, we use an md5 of the path to avoid things being too long + resid = str(md5(path.encode('utf-8')).hexdigest()) cache_sub_dir, cache_file_name = resid[-2:], resid + parent = join(self.cache_dir, cache_sub_dir) - if create and not exists(parent): + if create_dirs and not exists(parent): create_dir(parent) + return join(parent, cache_file_name) + def _collect_codebase_attributes(self, *args, **kwargs): + """ + Return a mapping of CodebaseAttributes fields to use with this Codebase. + """ + return self.codebase_attributes + + def _build_resource_class(self, *args, **kwargs): + """ + Return a Resource class to use with this Codebase. + """ + # Resource sub-class to use. Configured with plugin attributes if present + return attr.make_class( + name='ScannedResource', + attrs=self.resource_attributes or {}, + slots=True, + bases=(Resource,), + ) + # TODO: add populate progress manager!!! def _populate(self): """ Populate this codebase with Resource objects. + The actual subclass of Resource objects used in this codebase will be + created as a side effect. + Population is done by walking its `location` topdown, breadth-first, creating first file then directory Resources, both sorted in case-insensitive name order. Special files, links and VCS files are ignored. """ + # Collect headers + ########################################################## + self.headers = [] - # Codebase attributes to use. Configured with plugin attributes if present. - cbac = get_codebase_attributes_class(self.codebase_attributes) + # Collect codebase-level attributes and build a class, then load + ########################################################## + # Codebase attributes to use. Configured with plugin attributes if + # present. + self.codebase_attributes = self._collect_codebase_attributes() + cbac = _CodebaseAttributes.from_attributes(attributes=self.codebase_attributes) self.attributes = cbac() # Resource sub-class to use.
Configured with plugin attributes if present - self.resource_class = attr.make_class( - name='ScannedResource', - attrs=self.resource_attributes or {}, - slots=True, - # frozen=True, - bases=(Resource,) - ) + ########################################################## + self.resource_class = self._build_resource_class() - def err(_error): - """os.walk error handler""" - self.errors.append( - 'ERROR: cannot populate codebase: {}\n'.format(_error) - +traceback.format_exc()) - - def create_resources(_seq, _top, _parent, _is_file): - """Create Resources of parent from a seq of files or directories.""" - _seq.sort(key=lambda p: (p.lower(), p)) - for name in _seq: - location = join(_top, name) - if skip_ignored(location): - continue - res = self._create_resource(name, parent=_parent, is_file=_is_file) - if not _is_file: - # on the plain, bare FS, files cannot be parents - parent_by_loc[location] = res - if TRACE: logger_debug('Codebase.populate:', res) + ########################################################## + # walk and create resources proper + # Create root first + ########################################################## root = self._create_root_resource() - if TRACE: logger_debug('Codebase.populate: root:', root) + if TRACE: + logger_debug('Codebase.populate: root:', root) if self.has_single_resource: # there is nothing else to do for a single file or a single # childless directory return + if self.paths: + return self._create_resources_from_paths(root=root, paths=self.paths) + else: + return self._create_resources_from_root(root=root) + + def _create_resources_from_paths(self, root, paths): + # when paths are provided, we iterate over these paths. We report an + # error if a path is missing on disk. + + # !!!NOTE: WE DO NOT skip_ignored in this case!!!!! + + base_location = parent_directory(root.location) + + # track resources parents by path during construction to avoid + # recreating all ancestor directories + parents_by_path = {root.path: root} + + for path in paths: + res_loc = join(base_location, path) + if not exists(res_loc): + msg = f'ERROR: cannot populate codebase: path: {path!r} not found in {res_loc!r}' + self.errors.append(msg) + continue + + # create all parents.
The last parent is the one we want to use + parent = root + if TRACE: + logger_debug('Codebase._create_resources_from_paths: parent', parent) + for parent_path in get_ancestor_paths(path, include_self=False): + if TRACE: + logger_debug( + f' Codebase._create_resources_from_paths: parent_path: {parent_path!r}' + ) + if not parent_path: + continue + newpar = parents_by_path.get(parent_path) + if TRACE: + logger_debug(' Codebase._create_resources_from_paths: newpar', repr(newpar)) + + if not newpar: + newpar = self._get_or_create_resource( + name=file_name(parent_path), + parent=parent, + path=parent_path, + is_file=False, + ) + if not newpar: + raise Exception( + f'ERROR: Codebase._create_resources_from_paths: cannot create parent for: {parent_path!r}' + ) + parent = newpar + + parents_by_path[parent_path] = parent + + if TRACE: + logger_debug( + f' Codebase._create_resources_from_paths:', + f'created newpar: {newpar!r}', + ) + + res = self._get_or_create_resource( + name=file_name(path), + parent=parent, + path=path, + is_file=isfile(res_loc), + ) + if TRACE: + logger_debug('Codebase._create_resources_from_paths: resource', res) + + def _create_resources_from_root(self, root): + # without paths we walk the root location top-down + # track resources parents by location during construction. # NOTE: this cannot exhaust memory on a large codebase, because we do # not keep parents already walked and we walk topdown. - parent_by_loc = {root.location: root} + parents_by_loc = {root.location: root} + + def err(_error): + """os.walk error handler""" + self.errors.append( + f'ERROR: cannot populate codebase: {_error}\n{traceback.format_exc()}' + ) # Walk over the directory and build the resource tree - for (top, dirs, files) in depth_walk(root.location, self.max_depth, err): - parent = parent_by_loc.pop(top) - create_resources(files, top, parent, _is_file=True) - create_resources(dirs, top, parent, _is_file=False) + for (top, dirs, files) in depth_walk( + root_location=root.location, + max_depth=self.max_depth, + error_handler=err, + ): + parent = parents_by_loc.pop(top) + for created in self._create_resources( + parent=parent, + top=top, + dirs=dirs, + files=files, + ): + # on the plain, bare FS, files cannot be parents + if not created.is_file: + parents_by_loc[created.location] = created + + def _create_resources(self, parent, top, dirs, files, skip_ignored=skip_ignored): + """ + Create and yield ``files`` and ``dirs`` children Resources of a + ``parent`` Resource. These are sorted as: directories then files and by + lowercase name, then name. + """ + for names, is_file in [(dirs, False), (files, True)]: + names.sort(key=lambda p: (p.lower(), p)) + + for name in names: + location = join(top, name) + if skip_ignored(location): + continue + res = self._get_or_create_resource( + name=name, + parent=parent, + is_file=is_file, + ) + if TRACE: + logger_debug('Codebase.create_resources:', res) + yield res def _create_root_resource(self): """ @@ -471,87 +590,81 @@ def _create_root_resource(self): name = file_name(location) # do not strip root for codebase with a single Resource.
- if self.strip_root: - if self.has_single_resource: - path = name - else: - # NOTE: this may seem weird but the root path will be an empty - # string for a codebase root with strip_root=True if not - # single_resource - path = '' - else: - path = get_path( - root_location=location, - location=location, - full_root=self.full_root, - strip_root=self.strip_root, - ) + path = Resource.build_path(root_location=location, location=location) + if TRACE: - logger_debug(' Codebase._create_root_resource:', path) + logger_debug(f' Codebase._create_root_resource: {path} is_file: {self.is_file}') logger_debug() root = self.resource_class( name=name, location=location, + # never cached + cache_location=None, path=path, - rid=0, - pid=None, + is_root=True, is_file=self.is_file, ) - self.resource_ids.add(0) - self.resources[0] = root + self.resources_by_path[path] = root + self.resources_count += 1 self.root = root return root - def _create_resource(self, name, parent, is_file=False, path=None, resource_data=None): + def _get_or_create_resource( + self, + name, + parent, + is_file=False, + path=None, + ): """ - Create and return a new Resource in this codebase with `name` as a child - of the `parent` Resource. - `name` is always in native OS-preferred encoding (e.g. byte on Linux, - unicode elsewhere). + Return an existing codebase Resource with ``path`` if it exists, or + create and return a new Resource with ``name``, ``path`` and ``parent``. """ - if parent is None: - raise TypeError('Cannot create resource without parent.') + if not parent: + raise TypeError( + f'Cannot create resource without parent: name: {name!r}, path: {path!r}' + ) - rid = self._get_next_rid() + # If the codebase is virtual, we provide the path + if not path: + path = posixpath_join(parent.path, name) + path = clean_path(path) - if self._use_disk_cache_for_resource(rid): - cache_location = self._get_resource_cache_location(rid, create=True) + existing = self.get_resource(path) + if existing: + if TRACE: + logger_debug(' Codebase._get_or_create_resource: path already exists:', path) + return existing + + if self._use_disk_cache_for_resource(): + cache_location = self._get_resource_cache_location(path=path, create_dirs=True) else: cache_location = None - # If the codebase is virtual, then there is no location + # NOTE: If the codebase is virtual, then there is no location parent_location = parent.location if parent_location: location = join(parent_location, name) else: location = None - # If the codebase is virtual, we provide the path - if not path: - path = posixpath.join(parent.path, name) - if TRACE: - logger_debug(' Codebase._create_resource: parent.path:', parent.path, 'path:', path) + logger_debug( + f' Codebase._get_or_create_resource: with path: {path}\n' + f' name={name}, is_file={is_file}' + ) - resource_data = resource_data or {} - if resource_data: - resource_data = remove_properties_and_basics(resource_data) child = self.resource_class( name=name, location=location, path=path, cache_location=cache_location, - rid=rid, - pid=parent.rid, is_file=is_file, - **resource_data ) + self.resources_count += 1 - self.resource_ids.add(rid) - parent.children_rids.append(rid) - # TODO: fixme, this is not great to save also the parent :| + parent.children_names.append(name) self.save_resource(parent) self.save_resource(child) return child @@ -591,86 +704,88 @@ def get_headers(self): def exists(self, resource): """ - Return True if the Resource with `rid` exists in the codebase. + Return True if the Resource path exists in the codebase.
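+
+        For example (sketch):
+
+            codebase.exists(codebase.root)   # True for the root resource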
""" - return resource.rid in self.resource_ids + return resource and resource.path in self.resources_by_path - def _use_disk_cache_for_resource(self, rid): + def _use_disk_cache_for_resource(self): """ - Return True if Resource `rid` should be cached on-disk or False if it - should be cached in-memory. + Return True if Resource ``res`` should be cached on-disk or False if it + should be kept in-memory. """ - if TRACE: - msg = [' Codebase._use_disk_cache_for_resource:, rid:', rid, 'mode:'] - if rid == 0: - msg.append('root') - elif rid is None: - msg.append('from memory') - elif self.all_on_disk: - msg.append('all_on_disk') - elif self.all_in_memory: - msg.append('all_in_memory') - else: - msg.extend(['mixed:', 'self.max_in_memory:', self.max_in_memory]) - if rid and rid < self.max_in_memory: - msg.append('from memory') - else: - msg.append('from disk') - logger_debug(*msg) - if rid == 0: - return False - elif rid is None: - return False - elif self.all_on_disk: - return True + use_disk_cache = False + if self.all_on_disk: + use_disk_cache = True elif self.all_in_memory: - return False - # mixed case where some are in memory and some on disk - elif rid < self.max_in_memory: - return False + use_disk_cache = False else: - return True + # mixed case where some are in memory and some on disk + if self.resources_count < self.max_in_memory: + use_disk_cache = False + else: + use_disk_cache = True - def _exists_in_memory(self, rid): + if TRACE: + logger_debug( + f' Codebase._use_disk_cache_for_resource mode: {use_disk_cache} ' + f'on_disk: {self.all_on_disk} ' + f'in_mem: {self.all_in_memory} ' + f'max_in_mem: {self.max_in_memory}' + ) + return use_disk_cache + + def _exists_in_memory(self, path): """ - Return True if Resource `rid` exists in the codebase memory cache. + Return True if Resource `path` exists in the codebase memory cache. """ - return rid in self.resources + path = clean_path(path) + return isinstance(self.resources_by_path.get(path), Resource) - def _exists_on_disk(self, rid): + def _exists_on_disk(self, path): """ - Return True if Resource `rid` exists in the codebase disk cache. + Return True if Resource `path` exists in the codebase disk cache. """ - cache_location = self._get_resource_cache_location(rid) - if cache_location: - return exists(cache_location) + path = clean_path(path) + if not self._exists_in_memory(path): + cache_location = self._get_resource_cache_location(path, create_dirs=False) + if cache_location: + return exists(cache_location) - def get_resource(self, rid): + ########### FIXME: the PATH SHOULD NOT INCLUDE THE ROOT NAME + def get_resource(self, path): """ - Return the Resource with `rid` or None if it does not exists. + Return the Resource with `path` or None if it does not exists. + The ``path`` must be relative to the root (and including the root + name as its first segment). """ + assert isinstance(path, str), f'Invalid path: {path!r} is not a string.' 
+ path = clean_path(path) if TRACE: - msg = [' Codebase.get_resource:', 'rid:', rid] - if rid == 0: - msg.append('root') - elif not rid or rid not in self.resource_ids: + msg = [' Codebase.get_resource:', 'path:', path] + if not path or path not in self.resources_by_path: msg.append('not in resources!') - elif self._use_disk_cache_for_resource(rid): - msg.extend(['from disk', 'exists:', self._exists_on_disk(rid)]) else: - msg.extend(['from memory', 'exists:', self._exists_in_memory(rid)]) + msg.extend(['exists on disk:', self._exists_on_disk(path)]) + msg.extend(['exists in memo:', self._exists_in_memory(path)]) logger_debug(*msg) - if rid == 0: - res = attr.evolve(self.root) - elif self._use_disk_cache_for_resource(rid): - res = self._load_resource(rid) - elif not rid or rid not in self.resource_ids: - res = None - else: - res = self.resources.get(rid) + # we use Codebase.CACHED_RESOURCE as a marker for a resource that exists + # only in the on-disk cache and must be loaded from there; this + # differentiates it from None which means the resource is missing + res = self.resources_by_path.get(path) + if res is Codebase.CACHED_RESOURCE: + res = self._load_resource(path) + + elif isinstance(res, Resource): res = attr.evolve(res) + + elif res is None: + pass + else: + # this should never happen + raise Exception(f'get_resource: Internal error when getting {path!r}') + if TRACE: logger_debug(' Resource:', res) return res @@ -679,34 +794,24 @@ def save_resource(self, resource): """ Save the `resource` Resource to cache (in memory or disk). """ - if TRACE: - msg = [' Codebase.save_resource:', resource] - rid = resource.rid - if resource.is_root: - msg.append('root') - elif rid not in self.resource_ids: - msg.append('missing resource') - elif self._use_disk_cache_for_resource(rid): - msg.extend(['to disk:', 'exists:', self._exists_on_disk(rid)]) - else: - msg.extend(['to memory:', 'exists:', self._exists_in_memory(rid)]) - logger_debug(*msg) - if not resource: return - rid = resource.rid - if rid not in self.resource_ids: - raise UnknownResource('Not part of codebase: %(resource)r' % locals()) + path = clean_path(resource.path) + + if TRACE: + logger_debug(' Codebase.save_resource:', resource) if resource.is_root: - # this can possibly damage things badly self.root = resource + self.resources_by_path[path] = resource - if self._use_disk_cache_for_resource(rid): + elif resource.cache_location: self._dump_resource(resource) + self.resources_by_path[path] = Codebase.CACHED_RESOURCE + else: - self.resources[rid] = resource + self.resources_by_path[path] = resource def _dump_resource(self, resource): """ @@ -716,29 +821,31 @@ def _dump_resource(self, resource): if not cache_location: raise TypeError( - 'Resource cannot be dumped to disk and is used only' - f'in memory: {resource}' + 'Resource cannot be dumped to disk and is used only ' f'in memory: {resource}' ) # TODO: consider messagepack or protobuf for compact/faster processing? - with open(cache_location , 'w') as cached: + with open(cache_location, 'w') as cached: cached.write(json.dumps(resource.serialize(), check_circular=False)) # TODO: consider adding a small LRU cache in front of this for perf? - def _load_resource(self, rid): + def _load_resource(self, path): """ - Return a Resource with `rid` loaded from the disk cache. + Return a Resource with ``path`` loaded from the disk cache.
""" - cache_location = self._get_resource_cache_location(rid, create=False) + path = clean_path(path) + cache_location = self._get_resource_cache_location(path, create_dirs=False) if TRACE: logger_debug( - ' Codebase._load_resource: exists:', exists(cache_location), - 'cache_location:', cache_location) + ' Codebase._load_resource: exists:', + exists(cache_location), + 'cache_location:', + cache_location, + ) if not exists(cache_location): - raise ResourceNotInCache( - 'Failed to load Resource: %(rid)d from %(cache_location)r' % locals()) + raise ResourceNotInCache(f'Failed to load Resource: {path} from {cache_location!r}') # TODO: consider messagepack or protobuf for compact/faster processing try: @@ -747,91 +854,87 @@ def _load_resource(self, rid): # TODO: Consider using simplejson data = json.load(cached) return self.resource_class(**data) - except Exception: + except Exception as e: with open(cache_location, 'rb') as cached: cached_data = cached.read() msg = ( f'ERROR: failed to load resource from cached location: {cache_location} ' - 'with content:\n\n' - +repr(cached_data) - +'\n\n' - +traceback.format_exc()) - raise Exception(msg) + 'with content:\n\n' + repr(cached_data) + '\n\n' + traceback.format_exc() + ) + raise Exception(msg) from e def _remove_resource(self, resource): """ - Remove the `resource` Resource object from the resource tree. + Remove the ``resource`` Resource object from this codebase. Does not remove children. """ if resource.is_root: - raise TypeError( - 'Cannot remove the root resource from ' - 'codebase: ' + repr(resource)) - rid = resource.rid - # remove from index. - self.resource_ids.discard(rid) + raise TypeError(f'Cannot remove the root resource from codebase: {resource!r}') + # remove from in-memory cache. The disk cache is cleared on exit. - self.resources.pop(rid, None) + self.resources_by_path.pop(resource.path, None) if TRACE: logger_debug('Codebase._remove_resource:', resource) def remove_resource(self, resource): """ Remove the `resource` Resource object and all its children from the - resource tree. Return a set of removed Resource ids. + codebase. Return a set of removed Resource paths. """ if TRACE: logger_debug('Codebase.remove_resource') logger_debug(' resource', resource) if resource.is_root: - raise TypeError( - 'Cannot remove the root resource from codebase:' + repr(resource)) + raise TypeError(f'Cannot remove the root resource from codebase: {resource!r}') - removed_rids = set() + removed_paths = set() # remove all descendants bottom up to avoid out-of-order access to # removed resources for descendant in resource.walk(self, topdown=False): self._remove_resource(descendant) - removed_rids.add(descendant.rid) + removed_paths.add(descendant.location) # remove resource from parent parent = resource.parent(self) - if TRACE: logger_debug(' parent', parent) - parent.children_rids.remove(resource.rid) + if TRACE: + logger_debug(' parent', parent) + parent.children_names.remove(resource.name) parent.save(self) # remove resource proper self._remove_resource(resource) - removed_rids.add(resource.rid) + removed_paths.add(resource.location) - return removed_rids + return removed_paths def walk(self, topdown=True, skip_root=False, ignored=ignore_nothing): """ - Yield all resources for this Codebase walking its resource tree. - Walk the tree top-down, depth-first if `topdown` is True, otherwise walk + Yield all resources for this Codebase walking its resource tree. Walk + the tree top-down, depth-first if ``topdown`` is True, otherwise walk bottom-up. 
- Each level is sorted by children sort order (e.g. without-children, then - with-children and each group by case-insensitive name) + Each level is sorted with this sort order: resources without + children first, then resources with children, and each group sorted by + case-insensitive name. - If `skip_root` is True, the root resource is not returned unless this is - a codebase with a single resource. + If ``skip_root`` is True, the root resource is not returned unless this + is a codebase with a single resource. - `ignored` is a callable that accepts two arguments, `resource` and `codebase`, - and returns True if `resource` should be ignored. + ``ignored`` is a callable that accepts two arguments, ``resource`` and + ``codebase``, and returns True if ``resource`` should be ignored. """ root = self.root - if ignored(root, self): + if ignored(resource=root, codebase=self): return + # make a copy root = attr.evolve(root) # include root if no children (e.g. codebase with a single resource) - if skip_root and not root.has_children(): + if self.has_single_resource or (skip_root and not root.has_children()): skip_root = False root = attr.evolve(root) @@ -844,39 +947,27 @@ def walk(self, topdown=True, skip_root=False, ignored=ignore_nothing): if not topdown and not skip_root: yield root - def get_resource_from_path(self, path, absolute=False): - """ - Return a Resource that matches the path or or None. If `absolute` is - True, treat the path as an absolute location. Otherwise as relative to - the root (and including it). - """ - for res in self.walk(): - if absolute: - if path == res.location: - return res - else: - if path == res.path: - return res + def __iter__(self): + yield from self.walk() def walk_filtered(self, topdown=True, skip_root=False): """ Walk this Codebase as with walk() but does not return Resources with `is_filtered` flag set to True. """ - for resource in self.walk(topdown, skip_root): - if resource.is_filtered: - continue - yield resource + for resource in self.walk(topdown=topdown, skip_root=skip_root): + if not resource.is_filtered: + yield resource def compute_counts(self, skip_root=False, skip_filtered=False): """ - Compute and update the counts of every resource. - Return a tuple of top level counters (files_count, dirs_count, - size_count) for this codebase. + Compute, update and save the counts of every resource. + Return a tuple of top level counters for this codebase as: + (files_count, dirs_count, size_count). - The counts are computed differently based on these falsg: - - If `skip_root` is True, the root resource is not included in counts. - - If `skip_filtered` is True, resources with `is_filtered` set to True + The counts are computed differently based on these flags: + - If ``skip_root`` is True, the root resource is not included in counts. + - If ``skip_filtered`` is True, resources with ``is_filtered`` set to True are not included in counts. """ self.update_counts(skip_filtered=skip_filtered) @@ -900,9 +991,9 @@ def compute_counts(self, skip_root=False, skip_filtered=False): def update_counts(self, skip_filtered=False): """ Update files_count, dirs_count and size_count attributes of each - Resource in this codebase based on the current state. + Resource in this codebase based on the current Resource data. - If `skip_filtered` is True, resources with `is_filtered` set to True are + If ``skip_filtered`` is True, resources with ``is_filtered`` set to True are not included in counts.
""" # note: we walk bottom up to update things in the proper order @@ -910,11 +1001,9 @@ def update_counts(self, skip_filtered=False): for resource in self.walk(topdown=False): try: resource._compute_children_counts(self, skip_filtered) - except Exception: - path = resource.path - msg = ('ERROR: cannot compute children counts for: {path}:\n'.format(**locals()) - +traceback.format_exc()) - raise Exception(msg) + except Exception as e: + msg = f'ERROR: cannot compute children counts for: {resource.path}' + raise Exception(msg) from e def clear(self): """ @@ -924,39 +1013,75 @@ def clear(self): def lowest_common_parent(self): """ - Return a Resource that is the lowest common parent of all the files of - this codebase, skipping empty root directory segments. - Return None is this codebase contains a single resource. + Return a Resource that is the lowest common parent (aka. lowest common + ancestor) of all the files of this codebase, skipping root directory + segments that are "empty" e.g. with a single child. Return None is this + codebase contains a single resource. """ if self.has_single_resource: return self.root + for res in self.walk(topdown=True): if not res.is_file: kids = res.children(self) if len(kids) == 1 and not kids[0].is_file: - # this is an empty dir with a single dir child + # this is an empty dir with a single dir child, therefore # we shall continue the descent walk continue else: - # the dir starts to branch: we have our root + # the dir starts to branch: we have our lowest common parent + # root break else: # we are in a case that should never happen return self.root return res + def to_list( + self, + with_timing=False, + with_info=False, + skinny=False, + full_root=False, + strip_root=False, + ): + """ + Return a list of all Resources of this Codebase as mappings. + """ + if self.has_single_resource: + return [ + self.root.to_dict( + with_timing=with_timing, + with_info=with_info, + skinny=skinny, + # we never strip root for single res codebase + full_root=full_root, + strip_root=False, + ) + ] + + td = partial( + Resource.to_dict, + with_timing=with_timing, + with_info=with_info, + skinny=skinny, + full_root=full_root, + strip_root=strip_root, + ) + return [td(r) for r in self.walk(skip_root=strip_root)] + def to_decoded_posix_path(path): """ Return `path` as a Unicode POSIX path given a unicode or bytes path string. """ - return os.fsdecode(as_posixpath(path)) + return clean_path(os.fsdecode(as_posixpath(path))) @attr.attributes(slots=True) class Resource(object): """ - A resource represent a file or directory with essential "file information" + A resource represents a file or directory with essential "file information" and the scanned data details. A Resource is a tree that models the fileystem tree structure. @@ -964,11 +1089,13 @@ class Resource(object): In order to support lightweight and smaller objects that can be serialized and deserialized (such as pickled in multiprocessing) without pulling in a whole object tree, a Resource does not store its related objects directly: - the Codebase it belongs to, its parent Resource and its Resource children - objects are stored only as integer ids. Querying the Resource relationships - and walking the Resources tree requires to lookup the corresponding object - by id in the codebase object. + - the Codebase it belongs to is never stored. + - its parent Resource and its Resource children objects are queryable by path. 
+
+    Querying the Resource relationships and walking the Resources tree
+    typically requires looking up the corresponding object by path in the
+    Codebase object.
     """
+
     # the file or directory name in the OS preferred representation (either
     # bytes on Linux and Unicode elsewhere)
     name = attr.attrib(repr=False)
@@ -979,23 +1106,10 @@ class Resource(object):
     location = attr.attrib(repr=False)

     # the file or directory POSIX path decoded as unicode using the filesystem
-    # encoding. This is the path that will be reported in output and can be
-    # either one of these:
-    # - if the codebase was created with strip_root==True, this is a path
-    # relative to the root, stripped from its root segment unless the codebase
-    # contains a single file.
-    # - if the codebase was created with full_root==True, this is an absolute
-    # path
+    # encoding. This is the path that will be reported in output and is always
+    # relative to and starting with the root directory.
     path = attr.attrib(converter=to_decoded_posix_path)

-    # resource id as an integer
-    # the root of a Resource tree has a pid==0 by convention
-    rid = attr.ib()
-
-    # parent resource id of this resource as an integer
-    # the root of a Resource tree has a pid==None by convention
-    pid = attr.ib()
-
     # location of the file where this resource can be cached on disk in the OS
     # preferred representation (either bytes on Linux and Unicode elsewhere)
     cache_location = attr.attrib(default=None, repr=False)
@@ -1007,8 +1121,8 @@ class Resource(object):
     # returned list of resources
     is_filtered = attr.ib(default=False)

-    # a list of rids
-    children_rids = attr.ib(default=attr.Factory(list), repr=TRACE)
+    # a list of names
+    children_names = attr.ib(default=attr.Factory(list), repr=TRACE)

     # external data to serialize
     size = attr.ib(default=0, type=int, repr=TRACE)
@@ -1028,18 +1142,15 @@ class Resource(object):
     # mapping of timings for each scan as {scan_key: duration in seconds as a float}
     scan_timings = attr.ib(default=attr.Factory(dict), repr=False)

-    # stores a mapping of extra data for this Resource this data is
-    # never returned in a to_dict() and not meant to be saved in the
-    # final scan results. Instead it can be used to store extra data
-    # attributes that may be useful during a scan processing but are not
-    # usefuol afterwards. Be careful when using this not to override
-    # keys/valoues that may have been created by some other plugin or
-    # process
+    # stores a mapping of extra data for this Resource. This data is never
+    # returned in a to_dict() and is not meant to be saved in the final scan
+    # results. Instead it can be used to store extra data attributes that may
+    # be useful during scan processing but are not useful afterwards. Be
+    # careful not to override keys/values that may have been created by some
+    # other plugin or process
     extra_data = attr.ib(default=attr.Factory(dict), repr=False)

-    @property
-    def is_root(self):
-        return self.rid == 0
+    is_root = attr.ib(default=False, type=bool, repr=False)

     @property
     def type(self):
@@ -1052,12 +1163,72 @@ def type(self, value):
         else:
             self.is_file = False

-    def get_path(self, strip_root=False):
-        if strip_root:
-            return strip_first_path_segment(self.path)
+    @classmethod
+    def build_path(cls, root_location, location):
+        """
+        Return a POSIX path string (using "/" separators) of ``location``
+        relative to ``root_location``. Both locations are absolute native
+        locations. The returned path has no leading or trailing slashes.
+        The first segment of this path is always the last segment of the
+        ``root_location``.
+
+        For example:
+        >>> result = Resource.build_path(r'D:\\foo\\bar', r'D:\\foo\\bar\\baz')
+        >>> assert result == 'bar/baz', repr(result)
+        >>> result = Resource.build_path('/foo/bar/', '/foo/bar/baz')
+        >>> assert result == 'bar/baz', result
+        >>> result = Resource.build_path('/foo/bar/', '/foo/bar')
+        >>> assert result == 'bar', result
+        """
+        root_loc = clean_path(root_location)
+        loc = clean_path(location)
+        assert loc.startswith(root_loc)
+
+        # keep the root directory name by default
+        root_loc = posixpath_parent(root_loc).strip('/')
+        path = loc.replace(root_loc, '', 1).strip('/')
+        if TRACE:
+            logger_debug('build_path:', root_loc, loc, path)
+        return path
+
+    def get_path(self, full_root=False, strip_root=False):
+        """
+        Return a POSIX path string (using "/" separators) for this resource.
+        The returned path has no leading or trailing slashes.
+
+        - If ``full_root`` is True, return an absolute path.
+
+        - If ``strip_root`` is True, return a relative path without the first
+          root segment. Ignored if ``full_root`` is True.
+
+        - Otherwise return a relative path where the first segment is the
+          ``location`` last path segment.
+        """
+        if full_root:
+            return self.full_root_path
+        elif strip_root:
+            return self.strip_root_path
         else:
             return self.path

+    @property
+    def full_root_path(self):
+        """
+        Return a fully rooted POSIX path stripped of leading and trailing
+        slashes.
+        """
+        location = self.location
+        if location:
+            return clean_path(as_posixpath(self.location))
+        else:
+            return self.path
+
+    @property
+    def strip_root_path(self):
+        """
+        Return a path relative to the root, stripped from its first (root)
+        segment. This is an empty string when the path contains a single
+        segment.
+        """
+        return strip_first_path_segment(self.path)
+
     @property
     def is_dir(self):
         # note: we only store is_file
@@ -1084,27 +1255,28 @@ def extension(self, value):
         pass

     def extracted_to(self, codebase):
-        extract_path = '{}{}'.format(self.path, '-extract')
-        for s in self.siblings(codebase):
-            if not s.path == extract_path:
-                continue
-            return s
+        """
+        Return the Resource this archive Resource was extracted to, or None.
+        """
+        extract_path = f'{self.path}-extract'
+        return codebase.get_resource(extract_path)

     def extracted_from(self, codebase):
-        archive_path, _, _ = self.path.rpartition('-extract')
-        for a in self.ancestors(codebase):
-            for c in a.children(codebase):
-                if not c.path == archive_path:
-                    continue
-                return c
+        """
+        Return the archive Resource this Resource was extracted from, or None.
+        """
+        path = self.path
+        if '-extract' in path:
+            archive_path, _, _ = self.path.rpartition('-extract')
+            return codebase.get_resource(archive_path)

     @classmethod
-    def get(cls, codebase, rid):
+    def get(cls, codebase, path):
         """
-        Return the Resource with `rid` in `codebase` or None if it does not
+        Return the Resource with ``path`` in ``codebase`` or None if it does not
         exist.
         """
-        return codebase.get_resource(rid)
+        return codebase.get_resource(path)

     def save(self, codebase):
         """
@@ -1115,18 +1287,10 @@ def save(self, codebase):
     def remove(self, codebase):
         """
         Remove this resource and all its children from the codebase.
-        Return a set of removed Resource ids.
+        Return a set of removed Resource paths.
         """
         return codebase.remove_resource(self)

-    def create_child(self, codebase, name, is_file=False):
-        """
-        Create and return a new child Resource of this resource in `codebase`
-        with `name`.
-        `name` is always in native OS-preferred encoding (e.g. byte
-        on Linux, unicode elsewhere).
-        """
-        return codebase._create_resource(name, self, is_file)
-
     def _compute_children_counts(self, codebase, skip_filtered=False):
         """
         Compute counts and update self with these counts from direct children.
@@ -1181,9 +1345,15 @@ def walk(self, codebase, topdown=True, ignored=ignore_nothing):
             child = attr.evolve(child)
             if topdown:
                 yield child
-            for subchild in child.walk(codebase, topdown=topdown, ignored=ignored):
+
+            for subchild in child.walk(
+                codebase=codebase,
+                topdown=topdown,
+                ignored=ignored,
+            ):
                 if not ignored(subchild, codebase):
                     yield subchild
+
             if not topdown:
                 yield child

@@ -1191,30 +1361,51 @@ def has_children(self):
         """
         Return True if this Resource has children.
         """
-        return bool(self.children_rids)
+        return bool(self.children_names)

-    def children(self, codebase):
+    def children(self, codebase, names=()):
         """
-        Return a sorted sequence of direct children Resource objects for this Resource
-        or an empty sequence.
+        Return a sorted sequence of direct children Resource objects for this
+        Resource or an empty sequence.
+
         Sorting is by resources without children, then resources with children
         (e.g. directories or files with children), then case-insensitive name.
         """
+        children_names = self.children_names or []
+        if not children_names:
+            return []
+
+        if names:
+            kids = set(children_names)
+            children_names = [n for n in names if n in kids]
+            if not children_names:
+                return []
+
+        child_path = partial(posixpath_join, self.path)
+        get_child = codebase.get_resource
+        children = [get_child(path=child_path(name)) for name in children_names]
+
         _sorter = lambda r: (r.has_children(), r.name.lower(), r.name)
-        get_resource = codebase.get_resource
-        return sorted((get_resource(rid) for rid in self.children_rids), key=_sorter)
+        return sorted((c for c in children if c), key=_sorter)

     def has_parent(self):
         """
-        Return True is this Resource has children.
+        Return True if this Resource has a parent.
         """
         return not self.is_root

+    def parent_path(self):
+        """
+        Return the parent directory path for this Resource or None.
+        """
+        return self.has_parent() and parent_directory(self.path, with_trail=False)
+
     def parent(self, codebase):
         """
         Return the parent Resource object for this Resource or None.
         """
-        return codebase.get_resource(self.pid)
+        parent_path = self.parent_path()
+        return parent_path and codebase.get_resource(parent_path)

     def has_siblings(self, codebase):
         """
@@ -1233,7 +1424,7 @@ def siblings(self, codebase):

     def ancestors(self, codebase):
         """
-        Return a sequence of ancestor Resource objects from self to root
+        Return a sequence of ancestor Resource objects from root to self
         (includes self).
         """
         if self.is_root:
@@ -1241,14 +1432,17 @@ def ancestors(self, codebase):
         ancestors = deque()
         ancestors_appendleft = ancestors.appendleft
-        codebase_get_resource = codebase.get_resource
         current = self
+
         # walk up the parent tree up to the root
-        while not current.is_root:
+        while current and not current.is_root:
             ancestors_appendleft(current)
-            current = codebase_get_resource(current.pid)
+            current = current.parent(codebase)
+
         # append root too
-        ancestors_appendleft(current)
+        if current:
+            ancestors_appendleft(current)
+
         return list(ancestors)

     def descendants(self, codebase):
         """
         Return a sequence of descendant Resource objects (does NOT include
         self).
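+
+        A usage sketch (illustrative; assumes an existing ``codebase``)::
+
+            for descendant in resource.descendants(codebase):
+                print(descendant.path)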
""" - return list(self.walk(codebase, topdown=True)) + return list(self.walk(codebase=codebase, topdown=True)) def distance(self, codebase): """ @@ -1270,13 +1464,28 @@ def distance(self, codebase): return 0 return len(self.ancestors(codebase)) - 1 - def to_dict(self, with_timing=False, with_info=False, skinny=False): + def to_dict( + self, + with_timing=False, + with_info=False, + skinny=False, + full_root=False, + strip_root=False, + ): """ - Return a mapping of representing this Resource and its scans. + Return a mapping of representing this Resource and its data. + + The path is always a POSIX path stripped from leading and trailing + slashes and can be either one of these exclusive flags: + + - If ``full_root`` is True, this is a full path when available. + + - If ``strip_root`` is True, this is a path relative to the root, + stripped from its root segment unless the codebase contains a single + file with a single root segment. Ignored if ``full_root`` is True. """ - res = dict() - res['path'] = self.path - res['type'] = self.type + path = self.get_path(full_root=full_root, strip_root=strip_root) + res = dict(path=path, type=self.type) if skinny: return res @@ -1291,8 +1500,7 @@ def to_dict(self, with_timing=False, with_info=False, skinny=False): # this will catch every attribute that has been added dynamically, such # as scan-provided resource_attributes - other_data = attr.asdict( - self, filter=self_fields_filter, dict_factory=dict) + other_data = attr.asdict(self, filter=self_fields_filter, dict_factory=dict) # FIXME: make a deep copy of the data first!!!! # see https://github.com/nexB/scancode-toolkit/issues/1199 @@ -1314,71 +1522,56 @@ def to_dict(self, with_timing=False, with_info=False, skinny=False): def serialize(self): """ - Return a mapping of representing this Resource and its scans in a form - that is fully serializable and can be used to reconstruct a Resource. - All path-derived OS-native strings are decoded to Unicode for ulterior - JSON serialization. + Return a mapping of representing this Resource and its data in a form + that is fully serializable (to JSON, YAML, pickle, etc.) and can be used + to reconstruct a Resource. """ # we save all fields, not just the one in .to_dict() - saveable = attr.asdict(self, dict_factory=dict) - saveable['name'] = self.name + serializable = attr.asdict(self) + serializable['name'] = self.name if self.location: - saveable['location'] = self.location + serializable['location'] = self.location if self.cache_location: - saveable['cache_location'] = self.cache_location - return saveable + serializable['cache_location'] = self.cache_location + return serializable -def get_path(root_location, location, full_root=False, strip_root=False): +def clean_path(path): """ - Return a unicode srting POSIX path (using "/" separators) derived from - `root_location` of the codebase and the `location` of a resource. Both - locations are absolute native locations. - - - If `full_root` is True, return an absolute path. Otherwise return a - relative path where the first segment is the `root_location` last path - segment name. - - - If `strip_root` is True, return a relative path without the first root - segment. Ignored if `full_root` is True. + Return a cleaned and normalized POSIX ``path``. 
""" - - posix_loc = as_posixpath(location) - if full_root: - return posix_loc - - if not strip_root: - # keep the root directory name by default - root_loc = parent_directory(root_location) - else: - root_loc = root_location - - posix_root_loc = as_posixpath(root_loc).rstrip('/') + '/' - - return posix_loc.replace(posix_root_loc, '', 1) + path = path or '' + # convert to posix and ensure we have no slash at both ends + path = posixpath_normpath(path.replace('\\', '/').strip('/')) + if path == '.': + path = '' + return path def strip_first_path_segment(path): """ - Return a POSIX path stripped from its first path segment. + Return a POSIX ``path`` stripped from its first path segment unless there is + only one segment in which case we return this segment. The returned path has + no leading and trailing slashes. For example:: >>> strip_first_path_segment('') '' >>> strip_first_path_segment('foo') - 'foo' + '' >>> strip_first_path_segment('foo/bar/baz') 'bar/baz' >>> strip_first_path_segment('/foo/bar/baz/') 'bar/baz' >>> strip_first_path_segment('foo/') - 'foo/' + '' """ - segments = paths.split(path) - if not segments or len(segments) == 1: + path = clean_path(path) + if '/' in path: + _root, _, path = path.partition('/') return path - stripped = segments[1:] - return '/'.join(stripped) + else: + return '' def get_codebase_cache_dir(temp_dir): @@ -1400,14 +1593,18 @@ class _CodebaseAttributes(object): def to_dict(self): return attr.asdict(self, dict_factory=dict) - -def get_codebase_attributes_class(attributes): - return attr.make_class( - name='CodebaseAttributes', - attrs=attributes or {}, - slots=True, - bases=(_CodebaseAttributes,) - ) + @classmethod + def from_attributes(cls, attributes): + """ + Return a new sub class of _CodebaseAttributes built with the + ``attributes`` mapping of "attr" attributes. + """ + return attr.make_class( + name='CodebaseAttributes', + attrs=attributes or {}, + slots=True, + bases=(_CodebaseAttributes,), + ) def build_attributes_defs(mapping, ignored_keys=()): @@ -1415,7 +1612,7 @@ def build_attributes_defs(mapping, ignored_keys=()): Given a mapping, return an ordered mapping of attributes built from the mapping keys and values. """ - attributes = dict() + attributes = {} # We add the attributes that are not in standard_res_attributes already # FIXME: we should not have to infer the schema may be? @@ -1426,6 +1623,10 @@ def build_attributes_defs(mapping, ignored_keys=()): attributes[key] = attr.ib(default=attr.Factory(list), repr=False) elif isinstance(value, dict): attributes[key] = attr.ib(default=attr.Factory(dict), repr=False) + elif isinstance(value, bool): + attributes[key] = attr.ib(default=False, type=bool, repr=False) + elif isinstance(value, int): + attributes[key] = attr.ib(default=0, type=bool, repr=False) else: attributes[key] = attr.ib(default=None, repr=False) @@ -1440,13 +1641,17 @@ class VirtualCodebase(Codebase): 'has_single_resource', ) - def __init__(self, location, - resource_attributes=None, - codebase_attributes=None, - full_root=False, strip_root=False, - temp_dir=temp_dir, - max_in_memory=10000, - *args, **kwargs): + def __init__( + self, + location, + resource_attributes=None, + codebase_attributes=None, + temp_dir=temp_dir, + max_in_memory=10000, + paths=tuple(), + *args, + **kwargs, + ): """ Initialize a new virtual codebase from JSON scan file at `location`. See the Codebase parent class for other arguments. 
@@ -1454,15 +1659,18 @@ def __init__(self, location,
         `max_depth`, if passed, will be ignored as VirtualCodebase will be
         using the depth of the original scan.
         """
+        logger_debug(f'VirtualCodebase: new from: {location!r}')
+
         self._setup_essentials(temp_dir, max_in_memory)

-        self.codebase_attributes = codebase_attributes or dict()
-        self.resource_attributes = resource_attributes or dict()
+        self.codebase_attributes = codebase_attributes or {}
+        self.resource_attributes = resource_attributes or {}
         self.resource_class = None
         self.has_single_resource = False
         self.location = location

         scan_data = self._get_scan_data(location)
+        self.paths = self._prepare_clean_paths(paths)
         self._populate(scan_data)

     def _get_scan_data_helper(self, location):
@@ -1472,10 +1680,9 @@
         try:
             return json.loads(location)
         except:
-            # Load scan data at once TODO: since we load it all does it make sense
-            # to have support for caching at all?
+
             location = abspath(normpath(expanduser(location)))
-            with io.open(location, 'rb') as f:
+            with open(location) as f:
                 scan_data = json.load(f)
             return scan_data

@@ -1485,94 +1692,131 @@ def _get_scan_data(self, location):
         - a path string
         - a JSON string
         - a Python mapping
-        - or `location` is a List or a Tuple that contains multiple paths to scans
           that are to be joined together.
+        - a List or Tuple of paths to JSON scans to combine together. In this
+          case all paths are prefixed with codebase-1/, codebase-2/, etc.,
+          incremented for each location.
+        Loading also cleans each path into a normalized POSIX path.
         """
         if isinstance(location, dict):
             return location
-        if isinstance(location, (list, tuple,)):
+
+        if isinstance(location, (list, tuple)):
             combined_scan_data = dict(headers=[], files=[])
-            for loc in location:
+            for idx, loc in enumerate(location, 1):
                 scan_data = self._get_scan_data_helper(loc)
                 headers = scan_data.get('headers')
                 if headers:
                     combined_scan_data['headers'].extend(headers)
                 files = scan_data.get('files')
                 if files:
+                    for f in files:
+                        f['path'] = posixpath_join(f'codebase-{idx}', clean_path(f['path']))
                     combined_scan_data['files'].extend(files)
                 else:
-                    raise Exception('Input file does not have Resources to import: {}'.format(loc))
-            combined_scan_data['headers'] = sorted(combined_scan_data['headers'], key=lambda x: x['start_timestamp'])
+                    raise Exception(
                        f'Input file is missing a "files" (aka. resources) section to load: {loc}'
                    )
+
+            combined_scan_data['headers'] = sorted(
+                combined_scan_data['headers'],
+                key=lambda x: x['start_timestamp'],
+            )
             return combined_scan_data
+
         return self._get_scan_data_helper(location)

     def _create_empty_resource_data(self):
         """
         Return a dictionary of Resource fields and their default values.

-        The fields returned are that which are not part of the standard set of Resource attributes.
+        The fields returned are those that are not part of the standard set
+        of Resource attributes.
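+
+        For example (illustrative), a scan with a "licenses" list attribute
+        would yield {'licenses': []} in the returned mapping, since list and
+        dict Factory defaults are instantiated.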
""" # Get fields from the base Resource class and the ScannedResource class base_fields = attr.fields(Resource) resource_fields = attr.fields(self.resource_class) - # Create dict of {field: field_default_value} for the dynamically created fields - resource_data = dict() + # A dict of {field: field_default_value} for the dynamically created fields + resource_data = {} for field in resource_fields: if field in base_fields: # We only want the fields that are not part of the base set of fields continue value = field.default if isinstance(value, attr.Factory): - # For fields that have Factories as values, we set their values to be an - # instance of whatever type the factory makes + # For fields that have Factories as values, we set their values + # to be an instance of whatever type the factory makes value = value.factory() resource_data[field.name] = value return resource_data - def _get_or_create_parent(self, path, parent_by_path): - """ - Return a parent resource for a given `path` from `parent_by_path`. - - If a parent resource for a `path` does not exist in `parent_by_path`, it - is created recursively. - - Note: the root path and root Resource must already be in - `parent_by_path` or else this function does not work. - """ - parent_path = parent_directory(path).rstrip('/').rstrip('\\').lstrip('/') - existing_parent = parent_by_path.get(parent_path) - if existing_parent: - return existing_parent - parent_parent = self._get_or_create_parent(parent_path, parent_by_path) - parent_name = file_base_name(parent_path) - parent_is_file = False - parent_resource_data = self._create_empty_resource_data() - parent_resource = self._create_resource( - parent_name, - parent_parent, - parent_is_file, - parent_path, - parent_resource_data, + def _collect_codebase_attributes(self, scan_data, *args, **kwargs): + """ + Return a mapping of CodebaseAttributes fields to use with this Codebase + """ + # collect attributes from scan data + all_attributes = ( + build_attributes_defs( + mapping=scan_data, + ignored_keys=('headers', 'files'), + ) + or {} ) - parent_by_path[parent_path] = parent_resource - return parent_resource - def _set_new_root_directory(self, resources_data, new_root_directory_path): - for resource_data in resources_data: - resource_path = Path(resource_data['path']) - new_resource_path = Path(new_root_directory_path) - new_resource_path = new_resource_path.joinpath(resource_path) - resource_data['path'] = str(new_resource_path) + # We add in the attributes that we collected from the plugins. They come + # last for now. + for name, plugin_attribute in self.codebase_attributes.items(): + if name not in all_attributes: + all_attributes[name] = plugin_attribute + + return all_attributes + + def _build_resource_class(self, sample_resource_data, *args, **kwargs): + """ + Return a Resource class to use with this Codebase + """ + # Collect the existing attributes of the standard Resource class + standard_res_attributes = set(f.name for f in attr.fields(Resource)) + + # add these properties since they are fields but are serialized + properties = set(['type', 'base_name', 'extension']) + standard_res_attributes.update(properties) + + # We collect attributes that are not in standard_res_attributes already + # FIXME: we should not have to infer the schema may be? + all_res_attributes = build_attributes_defs( + mapping=sample_resource_data, + ignored_keys=standard_res_attributes, + ) + + # We add the attributes that we collected from the plugins. They come + # last for now. 
+        for name, plugin_attribute in self.resource_attributes.items():
+            if name not in all_res_attributes:
+                all_res_attributes[name] = plugin_attribute
+
+        # Create the Resource class with the desired attributes
+        return attr.make_class(
+            name='ScannedResource',
+            attrs=all_res_attributes or dict(),
+            slots=True,
+            bases=(Resource,),
+        )

     def _populate(self, scan_data):
         """
         Populate this codebase with Resource objects.

-        The actual class of Resource objects will be created as a side effect.
-        Population is done by loading JSON scan results and creating new
-        Resources for each result.
+        The actual subclass of Resource objects used in this codebase will be
+        created as a side effect.

-        This assumes that the input JSON scan results are in top-down order.
+        Population is done by loading JSON scan results and creating a new
+        Resource for each file mapping.
         """
         # Collect headers
         ##########################################################
@@ -1582,180 +1826,278 @@ def _populate(self, scan_data):
         # Collect codebase-level attributes and build a class, then load
         ##########################################################
-        standard_cb_attrs = set(['headers', 'files', ])
-        all_cb_attributes = build_attributes_defs(scan_data, standard_cb_attrs)
-        # We add in the attributes that we collected from the plugins. They come
-        # last for now.
-        for name, plugin_attribute in self.codebase_attributes.items():
-            if name not in all_cb_attributes:
-                all_cb_attributes[name] = plugin_attribute
-
-        cbac = get_codebase_attributes_class(all_cb_attributes or dict())
+        # Codebase attributes to use. Configured with scan_data and plugin
+        # attributes if present.
+        self.codebase_attributes = self._collect_codebase_attributes(scan_data)
+        cbac = _CodebaseAttributes.from_attributes(attributes=self.codebase_attributes)
         self.attributes = cbac()

         # now populate top level codebase attributes
+        ##########################################################
-        for attr_name in all_cb_attributes:
+        for attr_name in self.codebase_attributes:
             value = scan_data.get(attr_name)
-            if value:
-                setattr(self.attributes, attr_name, value)
+            setattr(self.attributes, attr_name, value)

-        # Build attributes attach to Resource
         ##########################################################
-        resources_data = scan_data['files']
-        if len(resources_data) == 1 :
+        files_data = scan_data.get('files')
+        if not files_data:
+            raise Exception('Input has no "files" top-level scan results.')
+
+        if len(files_data) == 1:
+            # we will shortcut to populate the codebase with a single root resource
             self.has_single_resource = True
-        if not resources_data:
-            raise Exception('Input has no file-level scan results.')
+            root_is_file = files_data[0].get('type') == 'file'
+        else:
+            root_is_file = False

-        # We iterate through all the Resource(s) so that we can build attributes each resource contains
+        # Create a virtual root if we are merging multiple input scans together
+        location = self.location
+        multiple_inputs = (
+            isinstance(location, (list, tuple)) and len(location) > 1
+        )

-        sample_resource_data = dict()
+        # Iterate through all Resources to collect any attribute in any resource
+        # as sample data.
+        # The paths were cleaned when the scan data was loaded.
+        # NOTE: We also:
+        # - add a new "path_segments" attribute with the path split in segments
+        # - populate a set of unique root names to check if all scanned
+        #   Resources share a common root or need a new virtual root

+        root_names = set()
+        root_names_add = root_names.add

-        for resource in resources_data:
-            sample_resource_data.update(resource)
+        sample_resource_data = {}
+        sample_resource_data_update = sample_resource_data.update

-        # Collect the existing attributes of the standard Resource class
-        standard_res_attributes = set(f.name for f in attr.fields(Resource))
-        # add these properties since they are fields but are serialized
-        properties = set(['type', 'base_name', 'extension'])
-        standard_res_attributes.update(properties)
+        for fdata in files_data:
+            sample_resource_data_update(fdata)
+            segments = fdata['path'].split('/')
+            root_names_add(segments[0])
+            fdata['path_segments'] = segments

-        # We collect attributes that are not in standard_res_attributes already
-        # FIXME: we should not have to infer the schema may be?
-        all_res_attributes = build_attributes_defs(sample_resource_data, standard_res_attributes)
-        # We add the attributes that we collected from the plugins. They come
-        # last for now.
-        for name, plugin_attribute in self.resource_attributes.items():
-            if name not in all_res_attributes:
-                all_res_attributes[name] = plugin_attribute
+        # Resource sub-class to use. Configured with all known scanned file
+        # attributes and plugin attributes if present
+        ##########################################################
+        self.resource_class = self._build_resource_class(sample_resource_data)

-        # Create the Resource class with the desired attributes
-        self.resource_class = attr.make_class(
-            name='ScannedResource',
-            attrs=all_res_attributes or dict(),
-            slots=True,
-            # frozen=True,
-            bases=(Resource,))

         # do we have file information attributes in this codebase data?
-        self.with_info = any(a in sample_resource_data for a in (
-            'name',
-            'base_name',
-            'extension',
-            'size',
-            'files_count',
-            'dirs_count',
-            'size_count',)
+        self.with_info = any(
+            a in sample_resource_data
+            for a in (
+                'name',
+                'base_name',
+                'extension',
+                'size',
+                'files_count',
+                'dirs_count',
+                'size_count',
+            )
         )

-        # Create Resources from scan info
+        # walk and create resources proper
+        # Create root resource first
         ##########################################################
-        # Create root resource without setting root data just yet. If we run into the root data
-        # while we iterate through `resources_data`, we fill in the data then.
-
-        # Create a virtual root if we are merging multiple scans together
-        multiple_input = isinstance(self.location, (list, tuple,)) and len(self.location) > 1
-        if multiple_input:
+        if not root_names:
+            raise Exception('Unable to find root for codebase.')
+
+        len_root_names = len(root_names)
+        if len_root_names == 1:
+            root_path = root_names.pop()
+            needs_new_virtual_root = False
+        elif len_root_names > 1 or multiple_inputs:
             root_path = 'virtual_root'
-        else:
-            sample_resource_path = sample_resource_data['path']
-            sample_resource_path = sample_resource_path.strip('/')
-            root_path = sample_resource_path.split('/')[0]
-
-            # Check to see if the Resources from the scan we received has a common root directory.
-            for resource_data in resources_data:
-                resource_path = resource_data.get('path')
-                resource_path = resource_path.strip('/')
-                resource_root_path = resource_path.split('/')[0]
-                # If not, set a common root directory for all Resources.
-                if resource_root_path != root_path:
-                    self._set_new_root_directory(
-                        resources_data=resources_data,
-                        new_root_directory_path='virtual_root'
-                    )
-                    root_path = 'virtual_root'
-                    break
+            needs_new_virtual_root = True
+
+        if needs_new_virtual_root:
+            for fdata in files_data:
+                rpath = fdata['path']
+                fdata['path'] = posixpath_join(root_path, rpath)
+                fdata['path_segments'].insert(0, root_path)

-        root_name = root_path
-        root_is_file = False
         root_data = self._create_empty_resource_data()
-        root_resource = self._create_root_resource(
-            name=root_name,
+
+        if self.has_single_resource:
+            # single resource with one or more segments
+            rdata = files_data[0]
+            root_path = rdata['path']
+            rdata = remove_properties_and_basics(rdata)
+            root_data.update(rdata)
+
+        # Create root resource
+        root = self._create_root_resource(
+            name=file_name(root_path),
             path=root_path,
             is_file=root_is_file,
-            root_data=root_data,
         )

-        # To help recreate the resource tree we keep a mapping by path of any
-        # parent resource
-        parent_by_path = {root_path: root_resource}
+        for name, value in root_data.items():
+            # skip known properties
+            if name not in KNOW_PROPS:
+                setattr(root, name, value)
+
+        if TRACE:
+            logger_debug('VirtualCodebase.populate: root:', root)
+
+        # TODO: report error if filtering the root with paths?
+        self.save_resource(root)
+
+        if self.has_single_resource:
+            if TRACE:
+                logger_debug('VirtualCodebase.populate: with single resource.')
+            return
+
+        all_paths = None
+        if self.paths:
+            # build a set of all paths and all their ancestors
+            all_paths = set()
+            for path in self.paths:
+                all_paths.update(get_ancestor_paths(path, include_self=True))

-        for resource_data in resources_data:
-            path = resource_data.get('path')
-            # Append virtual_root path to imported Resource path if we are merging multiple scans
-            if multiple_input:
-                path = posixpath.join(root_path, path)
+        # Create other Resources from scan info

-            name = resource_data.get('name', None)
+        # Note that we do not know the ordering there.
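+        # Parents must be created before their children when rebuilding the
+        # tree.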
+        # Therefore we sort in place by path segments
+        files_data.sort(key=itemgetter('path_segments'))
+
+        # We create directories that exist in the scan or create those that
+        # exist only in paths
+        duplicated_paths = set()
+        last_path = None
+        for fdata in files_data:
+            path = fdata.get('path')
+
+            # skip the ones we did not request
+            if all_paths and path not in all_paths:
+                continue
+
+            # these are no longer needed
+            path_segments = fdata.pop('path_segments')
+
+            if not last_path:
+                last_path = path
+            elif last_path == path:
+                duplicated_paths.add(path)
+            else:
+                last_path = path
+
+            name = fdata.get('name', None) or None
             if not name:
                 name = file_name(path)
-            is_file = resource_data.get('type', 'file') == 'file'
+            is_file = fdata.get('type', 'file') == 'file'

-            existing_parent = parent_by_path.get(path)
-            if existing_parent:
-                # We update the empty parent Resource we created in
-                # _get_or_create_parent() with the data from the scan
-                for k, v in resource_data.items():
-                    setattr(existing_parent, k, v)
-                self.save_resource(existing_parent)
-            else:
-                # Note: `root_path`: `root_resource` must be in `parent_by_path`
-                # in order for `_get_or_create_parent` to work
-                parent = self._get_or_create_parent(path, parent_by_path)
-                resource = self._create_resource(
-                    name=name,
-                    parent=parent,
-                    is_file=is_file,
-                    path=path,
-                    resource_data=resource_data,
-                )
+            parent = self._get_parent_directory(path_segments=path_segments)
+            resource = self._get_or_create_resource(
+                name=name,
+                path=path,
+                parent=parent,
+                is_file=is_file,
+            )
+            # set data
+            for name, value in fdata.items():
+                # skip known properties
+                if name not in KNOW_PROPS:
+                    setattr(resource, name, value)
+
+            self.save_resource(resource)
+
+        if duplicated_paths:
+            raise Exception(
+                'Illegal combination of VirtualCodebase multiple inputs: '
+                f'duplicated paths: {list(duplicated_paths)}',
+            )

-            # Files are not parents (for now), so we do not need to add this
-            # to the parent_by_path mapping
-            if not is_file:
-                parent_by_path[path] = resource
-            self.save_resource(resource)

+    def _get_parent_directory(self, path_segments):
+        """
+        Ensure that all directories in a sequence of path_segments exist
+        and return the last one.
+        """
+        # TODO: handle single resource codebases
+        resources_by_path = self.resources_by_path

-    def _create_root_resource(self, name, path, is_file, root_data):
+        # remove the first which is the root, already created
+        # and the last which is the current "child" segment
+        path_segments = path_segments[1:-1]
+
+        current = self.root
+        for segment in path_segments:
+            # build the path based on parent: resources_by_path is keyed by
+            # full path, so we look up the joined path, not the bare segment
+            path = posixpath_join(current.path, segment)
+            existing = resources_by_path.get(path)
+            if not existing:
+                existing = self._get_or_create_resource(
+                    name=segment,
+                    path=path,
+                    parent=current,
+                    is_file=False,
+                )
+            current = existing
+        return current
+
+    def _create_root_resource(self, name, path, is_file):
         """
         Create and return the root Resource of this codebase.
         """
         # we cannot recreate a root if it exists!!
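+        # (the root Resource is created first in _populate, before any other
+        # Resource)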
         if self.root:
             raise TypeError('Root resource already exists and cannot be recreated')
-        if root_data:
-            root_data = remove_properties_and_basics(root_data)
+
+        path = clean_path(path)
+
+        if TRACE:
+            logger_debug(f' VirtualCodebase._create_root_resource: {path!r} is_file: {is_file}')
+
         root = self.resource_class(
             name=name,
             location=None,
+            # never cached
+            cache_location=None,
             path=path,
-            rid=0,
-            pid=None,
+            is_root=True,
             is_file=is_file,
-            **root_data,
         )
-        self.resource_ids.add(0)
-        self.resources[0] = root
+        self.resources_by_path[path] = root
+        self.resources_count += 1
         self.root = root
         return root


+KNOW_PROPS = set(['type', 'base_name', 'extension', 'path', 'name', 'path_segments'])
+
+
 def remove_properties_and_basics(resource_data):
     """
     Given a mapping of resource_data attributes to use as "kwargs", return a
     new mapping with the known properties removed.
     """
-    return dict([(k, v) for k, v in resource_data.items()
-        if k not in ('type', 'base_name', 'extension', 'path', 'name')])
+    return {k: v for k, v in resource_data.items() if k not in KNOW_PROPS}
+
+
+def get_ancestor_paths(path, include_self=False):
+    """
+    Yield the ancestor subpaths of a POSIX ``path``, from the top segment
+    down, including ``path`` itself if ``include_self`` is True.
+
+    For example::
+    >>> path = 'foo/bar/baz'
+    >>> results = list(get_ancestor_paths(path))
+    >>> assert results == ['foo', 'foo/bar'], results
+    >>> results = list(get_ancestor_paths(path, include_self=True))
+    >>> assert results == ['foo', 'foo/bar', 'foo/bar/baz'], results
+    >>> results = list(get_ancestor_paths('foo', include_self=False))
+    >>> assert results == [], results
+    """
+    assert path
+    segments = path.split('/')
+    if not include_self:
+        segments = segments[:-1]
+    subpath = []
+    for segment in segments:
+        subpath.append(segment)
+        yield '/'.join(subpath)
diff --git a/src/commoncode/testcase.py b/src/commoncode/testcase.py
index 3debcfd..2fd9fd3 100644
--- a/src/commoncode/testcase.py
+++ b/src/commoncode/testcase.py
@@ -7,26 +7,28 @@
 #

 import filecmp
+import json
 import os
 import shutil
 import stat
 import sys
-
 from os import path
 from collections import defaultdict
 from itertools import chain
 from unittest import TestCase as TestCaseClass

+import saneyaml
+
 from commoncode import fileutils
 from commoncode import filetype
-from commoncode.system import on_posix
-from commoncode.system import on_windows
 from commoncode.archive import extract_tar
 from commoncode.archive import extract_tar_raw
 from commoncode.archive import extract_tar_uni
 from commoncode.archive import extract_zip
 from commoncode.archive import extract_zip_raw
 from commoncode.archive import tar_can_extract  # NOQA
+from commoncode.system import on_posix
+from commoncode.system import on_windows

 # a base test dir specific to a given test run
 # to ensure that multiple tests run can be launched in parallel
@@ -376,3 +378,27 @@ def get_test_file_pairs(test_dir):
         for test_file in test_files:
             yield test_file + '.yml', test_file
+
+
+def check_against_expected_json_file(results, expected_file, regen=False):
+    """
+    Check that the ``results`` data are the same as the data in the
+    ``expected_file`` expected JSON data file.
+
+    If ``regen`` is True the ``expected_file`` will be overwritten with the
+    ``results``. This is convenient for updating test expectations. But use
+    with caution.
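+
+    A usage sketch (illustrative; assumes ``codebase`` and ``expected_file``
+    exist)::
+
+        results = [r.to_dict() for r in codebase.walk()]
+        check_against_expected_json_file(results, expected_file, regen=False)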
+ """ + if regen: + with open(expected_file, 'w') as reg: + json.dump(results, reg, indent=2, separators=(',', ': ')) + expected = results + else: + with open(expected_file) as exp: + expected = json.load(exp) + + # NOTE we redump the JSON as a YAML string for easier display of + # the failures comparison/diff + if results != expected: + expected = saneyaml.dump(expected) + results = saneyaml.dump(results) + assert results == expected diff --git a/tests/data/resource/virtual_codebase/cache2.json b/tests/data/resource/virtual_codebase/cache2.json deleted file mode 100644 index e334ad3..0000000 --- a/tests/data/resource/virtual_codebase/cache2.json +++ /dev/null @@ -1 +0,0 @@ -{"files":[{"path":"cache2","type":"directory","name":"cache2","base_name":"cache2","extension":"","size":0,"date":null,"sha1":null,"md5":null,"mime_type":null,"file_type":null,"programming_language":null,"is_binary":false,"is_text":false,"is_archive":false,"is_media":false,"is_source":false,"is_script":false,"files_count":5,"dirs_count":2,"size_count":2228,"scan_errors":[]},{"path":"cache2/abc","type":"file","name":"abc","base_name":"abc","extension":"","size":0,"date":"2018-02-08","sha1":null,"md5":null,"mime_type":"inode/x-empty","file_type":"empty","programming_language":null,"is_binary":false,"is_text":true,"is_archive":false,"is_media":false,"is_source":false,"is_script":false,"files_count":0,"dirs_count":0,"size_count":0,"scan_errors":[]},{"path":"cache2/et131x.h","type":"file","name":"et131x.h","base_name":"et131x","extension":".h","size":2228,"date":"2018-02-08","sha1":"3903b654c47ea95203567230d72093ad1c5c4b90","md5":"5746ea8172cbab86d986a1c659bb4ce8","mime_type":"text/plain","file_type":"UTF-8 Unicode text","programming_language":"C","is_binary":false,"is_text":true,"is_archive":false,"is_media":false,"is_source":true,"is_script":false,"files_count":0,"dirs_count":0,"size_count":0,"scan_errors":[]},{"path":"cache2/dir","type":"directory","name":"dir","base_name":"dir","extension":"","size":0,"date":null,"sha1":null,"md5":null,"mime_type":null,"file_type":null,"programming_language":null,"is_binary":false,"is_text":false,"is_archive":false,"is_media":false,"is_source":false,"is_script":false,"files_count":2,"dirs_count":0,"size_count":0,"scan_errors":[]},{"path":"cache2/dir/that","type":"file","name":"that","base_name":"that","extension":"","size":0,"date":"2018-02-08","sha1":null,"md5":null,"mime_type":"inode/x-empty","file_type":"empty","programming_language":null,"is_binary":false,"is_text":true,"is_archive":false,"is_media":false,"is_source":false,"is_script":false,"files_count":0,"dirs_count":0,"size_count":0,"scan_errors":[]},{"path":"cache2/dir/this","type":"file","name":"this","base_name":"this","extension":"","size":0,"date":"2018-02-08","sha1":null,"md5":null,"mime_type":"inode/x-empty","file_type":"empty","programming_language":null,"is_binary":false,"is_text":true,"is_archive":false,"is_media":false,"is_source":false,"is_script":false,"files_count":0,"dirs_count":0,"size_count":0,"scan_errors":[]},{"path":"cache2/other dir","type":"directory","name":"other dir","base_name":"other dir","extension":"","size":0,"date":null,"sha1":null,"md5":null,"mime_type":null,"file_type":null,"programming_language":null,"is_binary":false,"is_text":false,"is_archive":false,"is_media":false,"is_source":false,"is_script":false,"files_count":1,"dirs_count":0,"size_count":0,"scan_errors":[]},{"path":"cache2/other 
dir/file","type":"file","name":"file","base_name":"file","extension":"","size":0,"date":"2018-02-08","sha1":null,"md5":null,"mime_type":"inode/x-empty","file_type":"empty","programming_language":null,"is_binary":false,"is_text":true,"is_archive":false,"is_media":false,"is_source":false,"is_script":false,"files_count":0,"dirs_count":0,"size_count":0,"scan_errors":[]}]} diff --git a/tests/data/resource/virtual_codebase/codebase-for-cache-tests.json b/tests/data/resource/virtual_codebase/codebase-for-cache-tests.json new file mode 100644 index 0000000..495d3ef --- /dev/null +++ b/tests/data/resource/virtual_codebase/codebase-for-cache-tests.json @@ -0,0 +1,196 @@ +{ + "files": [ + { + "path": "cache2", + "type": "directory", + "name": "cache2", + "base_name": "cache2", + "extension": "", + "size": 0, + "date": null, + "sha1": null, + "md5": null, + "mime_type": null, + "file_type": null, + "programming_language": null, + "is_binary": false, + "is_text": false, + "is_archive": false, + "is_media": false, + "is_source": false, + "is_script": false, + "files_count": 5, + "dirs_count": 2, + "size_count": 2228, + "scan_errors": [] + }, + { + "path": "cache2/abc", + "type": "file", + "name": "abc", + "base_name": "abc", + "extension": "", + "size": 0, + "date": "2018-02-08", + "sha1": null, + "md5": null, + "mime_type": "inode/x-empty", + "file_type": "empty", + "programming_language": null, + "is_binary": false, + "is_text": true, + "is_archive": false, + "is_media": false, + "is_source": false, + "is_script": false, + "files_count": 0, + "dirs_count": 0, + "size_count": 0, + "scan_errors": [] + }, + { + "path": "cache2/et131x.h", + "type": "file", + "name": "et131x.h", + "base_name": "et131x", + "extension": ".h", + "size": 2228, + "date": "2018-02-08", + "sha1": "3903b654c47ea95203567230d72093ad1c5c4b90", + "md5": "5746ea8172cbab86d986a1c659bb4ce8", + "mime_type": "text/plain", + "file_type": "UTF-8 Unicode text", + "programming_language": "C", + "is_binary": false, + "is_text": true, + "is_archive": false, + "is_media": false, + "is_source": true, + "is_script": false, + "files_count": 0, + "dirs_count": 0, + "size_count": 0, + "scan_errors": [] + }, + { + "path": "cache2/dir", + "type": "directory", + "name": "dir", + "base_name": "dir", + "extension": "", + "size": 0, + "date": null, + "sha1": null, + "md5": null, + "mime_type": null, + "file_type": null, + "programming_language": null, + "is_binary": false, + "is_text": false, + "is_archive": false, + "is_media": false, + "is_source": false, + "is_script": false, + "files_count": 2, + "dirs_count": 0, + "size_count": 0, + "scan_errors": [] + }, + { + "path": "cache2/dir/that", + "type": "file", + "name": "that", + "base_name": "that", + "extension": "", + "size": 0, + "date": "2018-02-08", + "sha1": null, + "md5": null, + "mime_type": "inode/x-empty", + "file_type": "empty", + "programming_language": null, + "is_binary": false, + "is_text": true, + "is_archive": false, + "is_media": false, + "is_source": false, + "is_script": false, + "files_count": 0, + "dirs_count": 0, + "size_count": 0, + "scan_errors": [] + }, + { + "path": "cache2/dir/this", + "type": "file", + "name": "this", + "base_name": "this", + "extension": "", + "size": 0, + "date": "2018-02-08", + "sha1": null, + "md5": null, + "mime_type": "inode/x-empty", + "file_type": "empty", + "programming_language": null, + "is_binary": false, + "is_text": true, + "is_archive": false, + "is_media": false, + "is_source": false, + "is_script": false, + "files_count": 0, + "dirs_count": 0, 
+ "size_count": 0, + "scan_errors": [] + }, + { + "path": "cache2/other dir", + "type": "directory", + "name": "other dir", + "base_name": "other dir", + "extension": "", + "size": 0, + "date": null, + "sha1": null, + "md5": null, + "mime_type": null, + "file_type": null, + "programming_language": null, + "is_binary": false, + "is_text": false, + "is_archive": false, + "is_media": false, + "is_source": false, + "is_script": false, + "files_count": 1, + "dirs_count": 0, + "size_count": 0, + "scan_errors": [] + }, + { + "path": "cache2/other dir/file", + "type": "file", + "name": "file", + "base_name": "file", + "extension": "", + "size": 0, + "date": "2018-02-08", + "sha1": null, + "md5": null, + "mime_type": "inode/x-empty", + "file_type": "empty", + "programming_language": null, + "is_binary": false, + "is_text": true, + "is_archive": false, + "is_media": false, + "is_source": false, + "is_script": false, + "files_count": 0, + "dirs_count": 0, + "size_count": 0, + "scan_errors": [] + } + ] +} diff --git a/tests/data/resource/virtual_codebase/combine-expected.json b/tests/data/resource/virtual_codebase/combine-expected.json new file mode 100644 index 0000000..d3b79dc --- /dev/null +++ b/tests/data/resource/virtual_codebase/combine-expected.json @@ -0,0 +1,44 @@ +[ + { + "path": "virtual_root", + "type": "directory", + "summary": [], + "scan_errors": [] + }, + { + "path": "virtual_root/codebase-1", + "type": "directory", + "summary": [], + "scan_errors": [] + }, + { + "path": "virtual_root/codebase-1/samples", + "type": "directory", + "summary": [], + "scan_errors": [] + }, + { + "path": "virtual_root/codebase-1/samples/NOTICE", + "type": "file", + "summary": [], + "scan_errors": [] + }, + { + "path": "virtual_root/codebase-2", + "type": "directory", + "summary": [], + "scan_errors": [] + }, + { + "path": "virtual_root/codebase-2/thirdparty", + "type": "directory", + "summary": [], + "scan_errors": [] + }, + { + "path": "virtual_root/codebase-2/thirdparty/example.zip", + "type": "file", + "summary": [], + "scan_errors": [] + } +] \ No newline at end of file diff --git a/tests/data/resource/virtual_codebase/combine-shared-directory-name-1.json b/tests/data/resource/virtual_codebase/combine-shared-directory-name-1.json new file mode 100644 index 0000000..88bc3f4 --- /dev/null +++ b/tests/data/resource/virtual_codebase/combine-shared-directory-name-1.json @@ -0,0 +1,45 @@ +{ + "headers": [ + { + "tool_name": "scancode-toolkit", + "tool_version": "31.0.0b4", + "options": { + "input": [ + "/home/jono/Desktop/test1/codebase" + ], + "--json-pp": "/home/jono/Desktop/test1-i.json" + }, + "notice": "Generated with ScanCode and provided on an \"AS IS\" BASIS, WITHOUT WARRANTIES\nOR CONDITIONS OF ANY KIND, either express or implied. No content created from\nScanCode should be considered or used as legal advice. Consult an Attorney\nfor any legal advice.\nScanCode is a free software code scanning tool from nexB Inc. 
and others.\nVisit https://github.com/nexB/scancode-toolkit/ for support and download.", + "start_timestamp": "2022-05-14T012902.519537", + "end_timestamp": "2022-05-14T012902.525235", + "output_format_version": "2.0.0", + "duration": 0.005705356597900391, + "message": null, + "errors": [], + "warnings": [], + "extra_data": { + "system_environment": { + "operating_system": "linux", + "cpu_architecture": "64", + "platform": "Linux-5.4.0-109-generic-x86_64-with-glibc2.27", + "platform_version": "#123~18.04.1-Ubuntu SMP Fri Apr 8 09:48:52 UTC 2022", + "python_version": "3.9.12 (main, Apr 16 2022, 19:31:36) \n[GCC 7.5.0]" + }, + "spdx_license_list_version": "3.16", + "files_count": 1 + } + } + ], + "files": [ + { + "path": "codebase", + "type": "directory", + "scan_errors": [] + }, + { + "path": "codebase/test1.c", + "type": "file", + "scan_errors": [] + } + ] +} \ No newline at end of file diff --git a/tests/data/resource/virtual_codebase/combine-shared-directory-name-2.json b/tests/data/resource/virtual_codebase/combine-shared-directory-name-2.json new file mode 100644 index 0000000..b6ea0b7 --- /dev/null +++ b/tests/data/resource/virtual_codebase/combine-shared-directory-name-2.json @@ -0,0 +1,45 @@ +{ + "headers": [ + { + "tool_name": "scancode-toolkit", + "tool_version": "31.0.0b4", + "options": { + "input": [ + "/home/jono/Desktop/test2/codebase" + ], + "--json-pp": "/home/jono/Desktop/test2-i.json" + }, + "notice": "Generated with ScanCode and provided on an \"AS IS\" BASIS, WITHOUT WARRANTIES\nOR CONDITIONS OF ANY KIND, either express or implied. No content created from\nScanCode should be considered or used as legal advice. Consult an Attorney\nfor any legal advice.\nScanCode is a free software code scanning tool from nexB Inc. and others.\nVisit https://github.com/nexB/scancode-toolkit/ for support and download.", + "start_timestamp": "2022-05-14T012909.811079", + "end_timestamp": "2022-05-14T012909.816782", + "output_format_version": "2.0.0", + "duration": 0.0057103633880615234, + "message": null, + "errors": [], + "warnings": [], + "extra_data": { + "system_environment": { + "operating_system": "linux", + "cpu_architecture": "64", + "platform": "Linux-5.4.0-109-generic-x86_64-with-glibc2.27", + "platform_version": "#123~18.04.1-Ubuntu SMP Fri Apr 8 09:48:52 UTC 2022", + "python_version": "3.9.12 (main, Apr 16 2022, 19:31:36) \n[GCC 7.5.0]" + }, + "spdx_license_list_version": "3.16", + "files_count": 1 + } + } + ], + "files": [ + { + "path": "codebase", + "type": "directory", + "scan_errors": [] + }, + { + "path": "codebase/test2.py", + "type": "file", + "scan_errors": [] + } + ] +} \ No newline at end of file diff --git a/tests/data/resource/virtual_codebase/combine-shared-directory-name-expected.json b/tests/data/resource/virtual_codebase/combine-shared-directory-name-expected.json new file mode 100644 index 0000000..dfefe04 --- /dev/null +++ b/tests/data/resource/virtual_codebase/combine-shared-directory-name-expected.json @@ -0,0 +1,37 @@ +[ + { + "path": "virtual_root", + "type": "directory", + "scan_errors": [] + }, + { + "path": "virtual_root/codebase-1", + "type": "directory", + "scan_errors": [] + }, + { + "path": "virtual_root/codebase-1/codebase", + "type": "directory", + "scan_errors": [] + }, + { + "path": "virtual_root/codebase-1/codebase/test1.c", + "type": "file", + "scan_errors": [] + }, + { + "path": "virtual_root/codebase-2", + "type": "directory", + "scan_errors": [] + }, + { + "path": "virtual_root/codebase-2/codebase", + "type": "directory", + "scan_errors": [] 
+ }, + { + "path": "virtual_root/codebase-2/codebase/test2.py", + "type": "file", + "scan_errors": [] + } +] \ No newline at end of file diff --git a/tests/data/resource/virtual_codebase/et131x.h.json b/tests/data/resource/virtual_codebase/et131x.h.json index c687df4..091186a 100644 --- a/tests/data/resource/virtual_codebase/et131x.h.json +++ b/tests/data/resource/virtual_codebase/et131x.h.json @@ -1 +1,28 @@ -{"files":[{"path":"et131x.h","type":"file","name":"et131x.h","base_name":"et131x","extension":".h","size":2228,"date":"2018-02-08","sha1":"3903b654c47ea95203567230d72093ad1c5c4b90","md5":"5746ea8172cbab86d986a1c659bb4ce8","mime_type":"text/plain","file_type":"UTF-8 Unicode text","programming_language":"C","is_binary":false,"is_text":true,"is_archive":false,"is_media":false,"is_source":true,"is_script":false,"files_count":0,"dirs_count":0,"size_count":0,"scan_errors":[]}]} +{ + "files": [ + { + "path": "et131x.h", + "type": "file", + "name": "et131x.h", + "base_name": "et131x", + "extension": ".h", + "size": 2228, + "date": "2018-02-08", + "sha1": "3903b654c47ea95203567230d72093ad1c5c4b90", + "md5": "5746ea8172cbab86d986a1c659bb4ce8", + "mime_type": "text/plain", + "file_type": "UTF-8 Unicode text", + "programming_language": "C", + "is_binary": false, + "is_text": true, + "is_archive": false, + "is_media": false, + "is_source": true, + "is_script": false, + "files_count": 0, + "dirs_count": 0, + "size_count": 0, + "scan_errors": [] + } + ] +} \ No newline at end of file diff --git a/tests/data/resource/virtual_codebase/full-root-info-many-expected.json b/tests/data/resource/virtual_codebase/full-root-info-many-expected.json new file mode 100644 index 0000000..e486aa0 --- /dev/null +++ b/tests/data/resource/virtual_codebase/full-root-info-many-expected.json @@ -0,0 +1,652 @@ +[ + { + "path": "home", + "type": "directory", + "name": "home", + "base_name": "home", + "extension": "", + "size": 0, + "date": null, + "sha1": null, + "md5": null, + "sha256": null, + "mime_type": null, + "file_type": null, + "programming_language": null, + "is_binary": false, + "is_text": false, + "is_archive": false, + "is_media": false, + "is_source": false, + "is_script": false, + "files_count": 0, + "dirs_count": 0, + "size_count": 0, + "scan_errors": [] + }, + { + "path": "home/foobar", + "type": "directory", + "name": "foobar", + "base_name": "foobar", + "extension": "", + "size": 0, + "date": null, + "sha1": null, + "md5": null, + "sha256": null, + "mime_type": null, + "file_type": null, + "programming_language": null, + "is_binary": false, + "is_text": false, + "is_archive": false, + "is_media": false, + "is_source": false, + "is_script": false, + "files_count": 0, + "dirs_count": 0, + "size_count": 0, + "scan_errors": [] + }, + { + "path": "home/foobar/scancode-toolkit", + "type": "directory", + "name": "scancode-toolkit", + "base_name": "scancode-toolkit", + "extension": "", + "size": 0, + "date": null, + "sha1": null, + "md5": null, + "sha256": null, + "mime_type": null, + "file_type": null, + "programming_language": null, + "is_binary": false, + "is_text": false, + "is_archive": false, + "is_media": false, + "is_source": false, + "is_script": false, + "files_count": 0, + "dirs_count": 0, + "size_count": 0, + "scan_errors": [] + }, + { + "path": "home/foobar/scancode-toolkit/samples", + "type": "directory", + "name": "samples", + "base_name": "samples", + "extension": "", + "size": 0, + "date": null, + "sha1": null, + "md5": null, + "sha256": null, + "mime_type": null, + "file_type": null, + 
"programming_language": null, + "is_binary": false, + "is_text": false, + "is_archive": false, + "is_media": false, + "is_source": false, + "is_script": false, + "files_count": 0, + "dirs_count": 0, + "size_count": 0, + "scan_errors": [] + }, + { + "path": "home/foobar/scancode-toolkit/samples/zlib", + "type": "directory", + "name": "zlib", + "base_name": "zlib", + "extension": "", + "size": 0, + "date": null, + "sha1": null, + "md5": null, + "sha256": null, + "mime_type": null, + "file_type": null, + "programming_language": null, + "is_binary": false, + "is_text": false, + "is_archive": false, + "is_media": false, + "is_source": false, + "is_script": false, + "files_count": 16, + "dirs_count": 5, + "size_count": 268762, + "scan_errors": [] + }, + { + "path": "home/foobar/scancode-toolkit/samples/zlib/adler32.c", + "type": "file", + "name": "adler32.c", + "base_name": "adler32", + "extension": ".c", + "size": 4968, + "date": "2022-03-11", + "sha1": "0cff4808476ce0b5f6f0ebbc69ee2ab2a0eebe43", + "md5": "ae3bbb54820e1d49fb90cbba222e973f", + "sha256": "341d49ae2703037d2d10c8486f1a1ca3b65e0f10cc9e5fead6bfbbc0b34564ba", + "mime_type": "text/x-c", + "file_type": "C source, ASCII text", + "programming_language": "C", + "is_binary": false, + "is_text": true, + "is_archive": false, + "is_media": false, + "is_source": true, + "is_script": false, + "files_count": 0, + "dirs_count": 0, + "size_count": 0, + "scan_errors": [] + }, + { + "path": "home/foobar/scancode-toolkit/samples/zlib/deflate.c", + "type": "file", + "name": "deflate.c", + "base_name": "deflate", + "extension": ".c", + "size": 71476, + "date": "2022-03-11", + "sha1": "7b4ace6d698c5dbbfb9a8f047f63228ca54d2e77", + "md5": "cd7826278ce9d9d9ed5abdefef50c3e2", + "sha256": "565e68ddfff5af8efd55f71e122b860ad11527a7d9de40a76af2b16afef24cc0", + "mime_type": "text/x-c", + "file_type": "C source, ASCII text", + "programming_language": "C", + "is_binary": false, + "is_text": true, + "is_archive": false, + "is_media": false, + "is_source": true, + "is_script": false, + "files_count": 0, + "dirs_count": 0, + "size_count": 0, + "scan_errors": [] + }, + { + "path": "home/foobar/scancode-toolkit/samples/zlib/deflate.h", + "type": "file", + "name": "deflate.h", + "base_name": "deflate", + "extension": ".h", + "size": 12774, + "date": "2022-03-11", + "sha1": "29ed3b8ca3927576e5889dea5880ca0052942c7d", + "md5": "7ceae74a13201f14c91623116af169c3", + "sha256": "80570c8052491bdc7583600da28a8f1cb32c27ab1cec107ec12c83255d426cf7", + "mime_type": "text/x-c", + "file_type": "C source, ASCII text", + "programming_language": "C", + "is_binary": false, + "is_text": true, + "is_archive": false, + "is_media": false, + "is_source": true, + "is_script": false, + "files_count": 0, + "dirs_count": 0, + "size_count": 0, + "scan_errors": [] + }, + { + "path": "home/foobar/scancode-toolkit/samples/zlib/zlib.h", + "type": "file", + "name": "zlib.h", + "base_name": "zlib", + "extension": ".h", + "size": 87883, + "date": "2022-03-11", + "sha1": "400d35465f179a4acacb5fe749e6ce20a0bbdb84", + "md5": "64d8a5180bd54ff5452886e4cbb21e14", + "sha256": "726b0569915917b967f87f3f08a1eec039101bf9dcc29d61c0b2b0b8f271b58d", + "mime_type": "text/x-c", + "file_type": "C source, ASCII text", + "programming_language": "C", + "is_binary": false, + "is_text": true, + "is_archive": false, + "is_media": false, + "is_source": true, + "is_script": false, + "files_count": 0, + "dirs_count": 0, + "size_count": 0, + "scan_errors": [] + }, + { + "path": "home/foobar/scancode-toolkit/samples/zlib/zutil.c", + 
"type": "file", + "name": "zutil.c", + "base_name": "zutil", + "extension": ".c", + "size": 7414, + "date": "2022-03-11", + "sha1": "e1af709bff21ae0d4331119a7fc4c19f82932043", + "md5": "fff257bc1656eb60fc585a7dc35f963d", + "sha256": "c5e9927d5a1a1dec514ccdcedfa1e0f01664c58bb33166b4997b50b8001f1d6c", + "mime_type": "text/x-c", + "file_type": "C source, ASCII text", + "programming_language": "C", + "is_binary": false, + "is_text": true, + "is_archive": false, + "is_media": false, + "is_source": true, + "is_script": false, + "files_count": 0, + "dirs_count": 0, + "size_count": 0, + "scan_errors": [] + }, + { + "path": "home/foobar/scancode-toolkit/samples/zlib/zutil.h", + "type": "file", + "name": "zutil.h", + "base_name": "zutil", + "extension": ".h", + "size": 6766, + "date": "2022-03-11", + "sha1": "b909d27ef9ce51639f76b7ea6b62721e7d1b6bf7", + "md5": "04fcfbb961591c9452c4d0fd1525ffdf", + "sha256": "91cce8e78e83bcdb8c6acb98d4f0686dbdc81ca97d4a36a60c0b48f7ef78f1af", + "mime_type": "text/x-c", + "file_type": "C source, ASCII text", + "programming_language": "C", + "is_binary": false, + "is_text": true, + "is_archive": false, + "is_media": false, + "is_source": true, + "is_script": false, + "files_count": 0, + "dirs_count": 0, + "size_count": 0, + "scan_errors": [] + }, + { + "path": "home/foobar/scancode-toolkit/samples/zlib/ada", + "type": "directory", + "name": "ada", + "base_name": "ada", + "extension": "", + "size": 0, + "date": null, + "sha1": null, + "md5": null, + "sha256": null, + "mime_type": null, + "file_type": null, + "programming_language": null, + "is_binary": false, + "is_text": false, + "is_archive": false, + "is_media": false, + "is_source": false, + "is_script": false, + "files_count": 1, + "dirs_count": 0, + "size_count": 13594, + "scan_errors": [] + }, + { + "path": "home/foobar/scancode-toolkit/samples/zlib/ada/zlib.ads", + "type": "file", + "name": "zlib.ads", + "base_name": "zlib", + "extension": ".ads", + "size": 13594, + "date": "2022-03-11", + "sha1": "0245a91806d804bf9f0907a3a001a141e9adb61b", + "md5": "71de2670f2e588b51c62e7f6a9046399", + "sha256": "02634bec0d5e4c69d8d2859124380074a57de8d8bd928398379bfacc514236d2", + "mime_type": "text/plain", + "file_type": "ASCII text", + "programming_language": null, + "is_binary": false, + "is_text": true, + "is_archive": false, + "is_media": false, + "is_source": false, + "is_script": false, + "files_count": 0, + "dirs_count": 0, + "size_count": 0, + "scan_errors": [] + }, + { + "path": "home/foobar/scancode-toolkit/samples/zlib/dotzlib", + "type": "directory", + "name": "dotzlib", + "base_name": "dotzlib", + "extension": "", + "size": 0, + "date": null, + "sha1": null, + "md5": null, + "sha256": null, + "mime_type": null, + "file_type": null, + "programming_language": null, + "is_binary": false, + "is_text": false, + "is_archive": false, + "is_media": false, + "is_source": false, + "is_script": false, + "files_count": 4, + "dirs_count": 0, + "size_count": 14257, + "scan_errors": [] + }, + { + "path": "home/foobar/scancode-toolkit/samples/zlib/dotzlib/AssemblyInfo.cs", + "type": "file", + "name": "AssemblyInfo.cs", + "base_name": "AssemblyInfo", + "extension": ".cs", + "size": 2500, + "date": "2022-03-11", + "sha1": "9f1db1177b2e9a014f72bb3cd80be17133e06d16", + "md5": "23d0d7c18846fc31655b6aa89b7c8038", + "sha256": "314afcfb339ea95f5431047b7ab24631b11c3532c7ce5dc2094ed0cf80a7c16d", + "mime_type": "text/plain", + "file_type": "ASCII text, with CRLF line terminators", + "programming_language": "C#", + "is_binary": false, + 
"is_text": true, + "is_archive": false, + "is_media": false, + "is_source": true, + "is_script": false, + "files_count": 0, + "dirs_count": 0, + "size_count": 0, + "scan_errors": [] + }, + { + "path": "home/foobar/scancode-toolkit/samples/zlib/dotzlib/ChecksumImpl.cs", + "type": "file", + "name": "ChecksumImpl.cs", + "base_name": "ChecksumImpl", + "extension": ".cs", + "size": 8040, + "date": "2022-03-11", + "sha1": "3807a0e24a57b92ea301559cab7307b8eab52c51", + "md5": "d01b3cb2e75da9b15f05b92b42f6bd33", + "sha256": "e7c047a2c3bcf88d3d002ee3d2d05af414acf53cb4451efacc0f2e95a474ea0f", + "mime_type": "text/x-c++", + "file_type": "C++ source, ISO-8859 text, with CRLF line terminators", + "programming_language": "C#", + "is_binary": false, + "is_text": true, + "is_archive": false, + "is_media": false, + "is_source": true, + "is_script": false, + "files_count": 0, + "dirs_count": 0, + "size_count": 0, + "scan_errors": [] + }, + { + "path": "home/foobar/scancode-toolkit/samples/zlib/dotzlib/LICENSE_1_0.txt", + "type": "file", + "name": "LICENSE_1_0.txt", + "base_name": "LICENSE_1_0", + "extension": ".txt", + "size": 1359, + "date": "2022-03-11", + "sha1": "892b34f7865d90a6f949f50d95e49625a10bc7f0", + "md5": "81543b22c36f10d20ac9712f8d80ef8d", + "sha256": "36266a8fd073568394cb81cdb2b124f7fdae2c64c1a7ed09db34b4d22efa2951", + "mime_type": "text/plain", + "file_type": "ASCII text, with CRLF line terminators", + "programming_language": null, + "is_binary": false, + "is_text": true, + "is_archive": false, + "is_media": false, + "is_source": false, + "is_script": false, + "files_count": 0, + "dirs_count": 0, + "size_count": 0, + "scan_errors": [] + }, + { + "path": "home/foobar/scancode-toolkit/samples/zlib/dotzlib/readme.txt", + "type": "file", + "name": "readme.txt", + "base_name": "readme", + "extension": ".txt", + "size": 2358, + "date": "2022-03-11", + "sha1": "b1229b826f0096808628474538cea8fec2922a9b", + "md5": "1f20f3168ee63d90de033edac2ce383c", + "sha256": "d04972a91b1563fb4b7acab4b9ff2b84e57368953cc0596d5f5ea17d97315fd0", + "mime_type": "text/plain", + "file_type": "ASCII text, with CRLF line terminators", + "programming_language": null, + "is_binary": false, + "is_text": true, + "is_archive": false, + "is_media": false, + "is_source": false, + "is_script": false, + "files_count": 0, + "dirs_count": 0, + "size_count": 0, + "scan_errors": [] + }, + { + "path": "home/foobar/scancode-toolkit/samples/zlib/gcc_gvmat64", + "type": "directory", + "name": "gcc_gvmat64", + "base_name": "gcc_gvmat64", + "extension": "", + "size": 0, + "date": null, + "sha1": null, + "md5": null, + "sha256": null, + "mime_type": null, + "file_type": null, + "programming_language": null, + "is_binary": false, + "is_text": false, + "is_archive": false, + "is_media": false, + "is_source": false, + "is_script": false, + "files_count": 1, + "dirs_count": 0, + "size_count": 16413, + "scan_errors": [] + }, + { + "path": "home/foobar/scancode-toolkit/samples/zlib/gcc_gvmat64/gvmat64.S", + "type": "file", + "name": "gvmat64.S", + "base_name": "gvmat64", + "extension": ".S", + "size": 16413, + "date": "2022-03-11", + "sha1": "742603cba1af98a1432cc02efb019b1a5760adf2", + "md5": "5e772d7302475e5473d0c4c57b9861e8", + "sha256": "22ff411b8b1d1b04aeaa8418b68245400267dc43c6f44104f6ccd37f0daee89f", + "mime_type": "text/x-c", + "file_type": "C source, ASCII text, with CRLF line terminators", + "programming_language": "GAS", + "is_binary": false, + "is_text": true, + "is_archive": false, + "is_media": false, + "is_source": true, + 
"is_script": false, + "files_count": 0, + "dirs_count": 0, + "size_count": 0, + "scan_errors": [] + }, + { + "path": "home/foobar/scancode-toolkit/samples/zlib/infback9", + "type": "directory", + "name": "infback9", + "base_name": "infback9", + "extension": "", + "size": 0, + "date": null, + "sha1": null, + "md5": null, + "sha256": null, + "mime_type": null, + "file_type": null, + "programming_language": null, + "is_binary": false, + "is_text": false, + "is_archive": false, + "is_media": false, + "is_source": false, + "is_script": false, + "files_count": 2, + "dirs_count": 0, + "size_count": 23223, + "scan_errors": [] + }, + { + "path": "home/foobar/scancode-toolkit/samples/zlib/infback9/infback9.c", + "type": "file", + "name": "infback9.c", + "base_name": "infback9", + "extension": ".c", + "size": 21629, + "date": "2022-03-11", + "sha1": "17fb362c03755b12f2dda5b12a68cf38162674bd", + "md5": "23ff5edec0817da303cb1294c1e4205c", + "sha256": "0a715c85a1ce3bb8b5a18d60941ffabc0186a886bcc66ba2ee0c4115a8e274e9", + "mime_type": "text/x-c", + "file_type": "C source, ASCII text", + "programming_language": "C", + "is_binary": false, + "is_text": true, + "is_archive": false, + "is_media": false, + "is_source": true, + "is_script": false, + "files_count": 0, + "dirs_count": 0, + "size_count": 0, + "scan_errors": [] + }, + { + "path": "home/foobar/scancode-toolkit/samples/zlib/infback9/infback9.h", + "type": "file", + "name": "infback9.h", + "base_name": "infback9", + "extension": ".h", + "size": 1594, + "date": "2022-03-11", + "sha1": "d0486a32b558dcaceded5f0746fad62e680a4734", + "md5": "52b1ed99960d3ed7ed60cd20295e64a8", + "sha256": "dda2302f28157fe43a6143f84802af1740393572c2766559593996fd7a5a3245", + "mime_type": "text/x-c", + "file_type": "C source, ASCII text", + "programming_language": "C", + "is_binary": false, + "is_text": true, + "is_archive": false, + "is_media": false, + "is_source": true, + "is_script": false, + "files_count": 0, + "dirs_count": 0, + "size_count": 0, + "scan_errors": [] + }, + { + "path": "home/foobar/scancode-toolkit/samples/zlib/iostream2", + "type": "directory", + "name": "iostream2", + "base_name": "iostream2", + "extension": "", + "size": 0, + "date": null, + "sha1": null, + "md5": null, + "sha256": null, + "mime_type": null, + "file_type": null, + "programming_language": null, + "is_binary": false, + "is_text": false, + "is_archive": false, + "is_media": false, + "is_source": false, + "is_script": false, + "files_count": 2, + "dirs_count": 0, + "size_count": 9994, + "scan_errors": [] + }, + { + "path": "home/foobar/scancode-toolkit/samples/zlib/iostream2/zstream.h", + "type": "file", + "name": "zstream.h", + "base_name": "zstream", + "extension": ".h", + "size": 9283, + "date": "2022-03-11", + "sha1": "fca4540d490fff36bb90fd801cf9cd8fc695bb17", + "md5": "a980b61c1e8be68d5cdb1236ba6b43e7", + "sha256": "d0343e0c57ff58008b6f29643d289c72713aa2d653fe3dcd2e939fc77e7e20b6", + "mime_type": "text/x-c++", + "file_type": "C++ source, ASCII text", + "programming_language": "C", + "is_binary": false, + "is_text": true, + "is_archive": false, + "is_media": false, + "is_source": true, + "is_script": false, + "files_count": 0, + "dirs_count": 0, + "size_count": 0, + "scan_errors": [] + }, + { + "path": "home/foobar/scancode-toolkit/samples/zlib/iostream2/zstream_test.cpp", + "type": "file", + "name": "zstream_test.cpp", + "base_name": "zstream_test", + "extension": ".cpp", + "size": 711, + "date": "2022-03-11", + "sha1": "e18a6d55cbbd8b832f8d795530553467e5c74fcf", + "md5": 
"d32476bde4e6d5f889092fdff6f8cdb0", + "sha256": "f789df183cc58b78751985466380c656308490a9036eb48a7ef79704c3d3f229", + "mime_type": "text/x-c", + "file_type": "C source, ASCII text", + "programming_language": "C++", + "is_binary": false, + "is_text": true, + "is_archive": false, + "is_media": false, + "is_source": true, + "is_script": false, + "files_count": 0, + "dirs_count": 0, + "size_count": 0, + "scan_errors": [] + } +] \ No newline at end of file diff --git a/tests/data/resource/virtual_codebase/full-root-info-many.json b/tests/data/resource/virtual_codebase/full-root-info-many.json new file mode 100644 index 0000000..bb86b72 --- /dev/null +++ b/tests/data/resource/virtual_codebase/full-root-info-many.json @@ -0,0 +1,570 @@ +{ + "headers": [ + { + "tool_name": "scancode-toolkit", + "tool_version": "31.0.0b4", + "options": { + "input": [ + "samples/zlib/" + ], + "--full-root": true, + "--info": true, + "--json-pp": "-" + }, + "errors": [], + "warnings": [] + } + ], + "files": [ + { + "path": "home/foobar/scancode-toolkit/samples/zlib", + "type": "directory", + "name": "zlib", + "base_name": "zlib", + "extension": "", + "size": 0, + "date": null, + "sha1": null, + "md5": null, + "sha256": null, + "mime_type": null, + "file_type": null, + "programming_language": null, + "is_binary": false, + "is_text": false, + "is_archive": false, + "is_media": false, + "is_source": false, + "is_script": false, + "files_count": 16, + "dirs_count": 5, + "size_count": 268762, + "scan_errors": [] + }, + { + "path": "home/foobar/scancode-toolkit/samples/zlib/adler32.c", + "type": "file", + "name": "adler32.c", + "base_name": "adler32", + "extension": ".c", + "size": 4968, + "date": "2022-03-11", + "sha1": "0cff4808476ce0b5f6f0ebbc69ee2ab2a0eebe43", + "md5": "ae3bbb54820e1d49fb90cbba222e973f", + "sha256": "341d49ae2703037d2d10c8486f1a1ca3b65e0f10cc9e5fead6bfbbc0b34564ba", + "mime_type": "text/x-c", + "file_type": "C source, ASCII text", + "programming_language": "C", + "is_binary": false, + "is_text": true, + "is_archive": false, + "is_media": false, + "is_source": true, + "is_script": false, + "files_count": 0, + "dirs_count": 0, + "size_count": 0, + "scan_errors": [] + }, + { + "path": "home/foobar/scancode-toolkit/samples/zlib/deflate.c", + "type": "file", + "name": "deflate.c", + "base_name": "deflate", + "extension": ".c", + "size": 71476, + "date": "2022-03-11", + "sha1": "7b4ace6d698c5dbbfb9a8f047f63228ca54d2e77", + "md5": "cd7826278ce9d9d9ed5abdefef50c3e2", + "sha256": "565e68ddfff5af8efd55f71e122b860ad11527a7d9de40a76af2b16afef24cc0", + "mime_type": "text/x-c", + "file_type": "C source, ASCII text", + "programming_language": "C", + "is_binary": false, + "is_text": true, + "is_archive": false, + "is_media": false, + "is_source": true, + "is_script": false, + "files_count": 0, + "dirs_count": 0, + "size_count": 0, + "scan_errors": [] + }, + { + "path": "home/foobar/scancode-toolkit/samples/zlib/deflate.h", + "type": "file", + "name": "deflate.h", + "base_name": "deflate", + "extension": ".h", + "size": 12774, + "date": "2022-03-11", + "sha1": "29ed3b8ca3927576e5889dea5880ca0052942c7d", + "md5": "7ceae74a13201f14c91623116af169c3", + "sha256": "80570c8052491bdc7583600da28a8f1cb32c27ab1cec107ec12c83255d426cf7", + "mime_type": "text/x-c", + "file_type": "C source, ASCII text", + "programming_language": "C", + "is_binary": false, + "is_text": true, + "is_archive": false, + "is_media": false, + "is_source": true, + "is_script": false, + "files_count": 0, + "dirs_count": 0, + "size_count": 0, + 
"scan_errors": [] + }, + { + "path": "home/foobar/scancode-toolkit/samples/zlib/zlib.h", + "type": "file", + "name": "zlib.h", + "base_name": "zlib", + "extension": ".h", + "size": 87883, + "date": "2022-03-11", + "sha1": "400d35465f179a4acacb5fe749e6ce20a0bbdb84", + "md5": "64d8a5180bd54ff5452886e4cbb21e14", + "sha256": "726b0569915917b967f87f3f08a1eec039101bf9dcc29d61c0b2b0b8f271b58d", + "mime_type": "text/x-c", + "file_type": "C source, ASCII text", + "programming_language": "C", + "is_binary": false, + "is_text": true, + "is_archive": false, + "is_media": false, + "is_source": true, + "is_script": false, + "files_count": 0, + "dirs_count": 0, + "size_count": 0, + "scan_errors": [] + }, + { + "path": "home/foobar/scancode-toolkit/samples/zlib/zutil.c", + "type": "file", + "name": "zutil.c", + "base_name": "zutil", + "extension": ".c", + "size": 7414, + "date": "2022-03-11", + "sha1": "e1af709bff21ae0d4331119a7fc4c19f82932043", + "md5": "fff257bc1656eb60fc585a7dc35f963d", + "sha256": "c5e9927d5a1a1dec514ccdcedfa1e0f01664c58bb33166b4997b50b8001f1d6c", + "mime_type": "text/x-c", + "file_type": "C source, ASCII text", + "programming_language": "C", + "is_binary": false, + "is_text": true, + "is_archive": false, + "is_media": false, + "is_source": true, + "is_script": false, + "files_count": 0, + "dirs_count": 0, + "size_count": 0, + "scan_errors": [] + }, + { + "path": "home/foobar/scancode-toolkit/samples/zlib/zutil.h", + "type": "file", + "name": "zutil.h", + "base_name": "zutil", + "extension": ".h", + "size": 6766, + "date": "2022-03-11", + "sha1": "b909d27ef9ce51639f76b7ea6b62721e7d1b6bf7", + "md5": "04fcfbb961591c9452c4d0fd1525ffdf", + "sha256": "91cce8e78e83bcdb8c6acb98d4f0686dbdc81ca97d4a36a60c0b48f7ef78f1af", + "mime_type": "text/x-c", + "file_type": "C source, ASCII text", + "programming_language": "C", + "is_binary": false, + "is_text": true, + "is_archive": false, + "is_media": false, + "is_source": true, + "is_script": false, + "files_count": 0, + "dirs_count": 0, + "size_count": 0, + "scan_errors": [] + }, + { + "path": "home/foobar/scancode-toolkit/samples/zlib/ada", + "type": "directory", + "name": "ada", + "base_name": "ada", + "extension": "", + "size": 0, + "date": null, + "sha1": null, + "md5": null, + "sha256": null, + "mime_type": null, + "file_type": null, + "programming_language": null, + "is_binary": false, + "is_text": false, + "is_archive": false, + "is_media": false, + "is_source": false, + "is_script": false, + "files_count": 1, + "dirs_count": 0, + "size_count": 13594, + "scan_errors": [] + }, + { + "path": "home/foobar/scancode-toolkit/samples/zlib/ada/zlib.ads", + "type": "file", + "name": "zlib.ads", + "base_name": "zlib", + "extension": ".ads", + "size": 13594, + "date": "2022-03-11", + "sha1": "0245a91806d804bf9f0907a3a001a141e9adb61b", + "md5": "71de2670f2e588b51c62e7f6a9046399", + "sha256": "02634bec0d5e4c69d8d2859124380074a57de8d8bd928398379bfacc514236d2", + "mime_type": "text/plain", + "file_type": "ASCII text", + "programming_language": null, + "is_binary": false, + "is_text": true, + "is_archive": false, + "is_media": false, + "is_source": false, + "is_script": false, + "files_count": 0, + "dirs_count": 0, + "size_count": 0, + "scan_errors": [] + }, + { + "path": "home/foobar/scancode-toolkit/samples/zlib/dotzlib", + "type": "directory", + "name": "dotzlib", + "base_name": "dotzlib", + "extension": "", + "size": 0, + "date": null, + "sha1": null, + "md5": null, + "sha256": null, + "mime_type": null, + "file_type": null, + "programming_language": 
null, + "is_binary": false, + "is_text": false, + "is_archive": false, + "is_media": false, + "is_source": false, + "is_script": false, + "files_count": 4, + "dirs_count": 0, + "size_count": 14257, + "scan_errors": [] + }, + { + "path": "home/foobar/scancode-toolkit/samples/zlib/dotzlib/AssemblyInfo.cs", + "type": "file", + "name": "AssemblyInfo.cs", + "base_name": "AssemblyInfo", + "extension": ".cs", + "size": 2500, + "date": "2022-03-11", + "sha1": "9f1db1177b2e9a014f72bb3cd80be17133e06d16", + "md5": "23d0d7c18846fc31655b6aa89b7c8038", + "sha256": "314afcfb339ea95f5431047b7ab24631b11c3532c7ce5dc2094ed0cf80a7c16d", + "mime_type": "text/plain", + "file_type": "ASCII text, with CRLF line terminators", + "programming_language": "C#", + "is_binary": false, + "is_text": true, + "is_archive": false, + "is_media": false, + "is_source": true, + "is_script": false, + "files_count": 0, + "dirs_count": 0, + "size_count": 0, + "scan_errors": [] + }, + { + "path": "home/foobar/scancode-toolkit/samples/zlib/dotzlib/ChecksumImpl.cs", + "type": "file", + "name": "ChecksumImpl.cs", + "base_name": "ChecksumImpl", + "extension": ".cs", + "size": 8040, + "date": "2022-03-11", + "sha1": "3807a0e24a57b92ea301559cab7307b8eab52c51", + "md5": "d01b3cb2e75da9b15f05b92b42f6bd33", + "sha256": "e7c047a2c3bcf88d3d002ee3d2d05af414acf53cb4451efacc0f2e95a474ea0f", + "mime_type": "text/x-c++", + "file_type": "C++ source, ISO-8859 text, with CRLF line terminators", + "programming_language": "C#", + "is_binary": false, + "is_text": true, + "is_archive": false, + "is_media": false, + "is_source": true, + "is_script": false, + "files_count": 0, + "dirs_count": 0, + "size_count": 0, + "scan_errors": [] + }, + { + "path": "home/foobar/scancode-toolkit/samples/zlib/dotzlib/LICENSE_1_0.txt", + "type": "file", + "name": "LICENSE_1_0.txt", + "base_name": "LICENSE_1_0", + "extension": ".txt", + "size": 1359, + "date": "2022-03-11", + "sha1": "892b34f7865d90a6f949f50d95e49625a10bc7f0", + "md5": "81543b22c36f10d20ac9712f8d80ef8d", + "sha256": "36266a8fd073568394cb81cdb2b124f7fdae2c64c1a7ed09db34b4d22efa2951", + "mime_type": "text/plain", + "file_type": "ASCII text, with CRLF line terminators", + "programming_language": null, + "is_binary": false, + "is_text": true, + "is_archive": false, + "is_media": false, + "is_source": false, + "is_script": false, + "files_count": 0, + "dirs_count": 0, + "size_count": 0, + "scan_errors": [] + }, + { + "path": "home/foobar/scancode-toolkit/samples/zlib/dotzlib/readme.txt", + "type": "file", + "name": "readme.txt", + "base_name": "readme", + "extension": ".txt", + "size": 2358, + "date": "2022-03-11", + "sha1": "b1229b826f0096808628474538cea8fec2922a9b", + "md5": "1f20f3168ee63d90de033edac2ce383c", + "sha256": "d04972a91b1563fb4b7acab4b9ff2b84e57368953cc0596d5f5ea17d97315fd0", + "mime_type": "text/plain", + "file_type": "ASCII text, with CRLF line terminators", + "programming_language": null, + "is_binary": false, + "is_text": true, + "is_archive": false, + "is_media": false, + "is_source": false, + "is_script": false, + "files_count": 0, + "dirs_count": 0, + "size_count": 0, + "scan_errors": [] + }, + { + "path": "home/foobar/scancode-toolkit/samples/zlib/gcc_gvmat64", + "type": "directory", + "name": "gcc_gvmat64", + "base_name": "gcc_gvmat64", + "extension": "", + "size": 0, + "date": null, + "sha1": null, + "md5": null, + "sha256": null, + "mime_type": null, + "file_type": null, + "programming_language": null, + "is_binary": false, + "is_text": false, + "is_archive": false, + "is_media": false, 
+ "is_source": false, + "is_script": false, + "files_count": 1, + "dirs_count": 0, + "size_count": 16413, + "scan_errors": [] + }, + { + "path": "home/foobar/scancode-toolkit/samples/zlib/gcc_gvmat64/gvmat64.S", + "type": "file", + "name": "gvmat64.S", + "base_name": "gvmat64", + "extension": ".S", + "size": 16413, + "date": "2022-03-11", + "sha1": "742603cba1af98a1432cc02efb019b1a5760adf2", + "md5": "5e772d7302475e5473d0c4c57b9861e8", + "sha256": "22ff411b8b1d1b04aeaa8418b68245400267dc43c6f44104f6ccd37f0daee89f", + "mime_type": "text/x-c", + "file_type": "C source, ASCII text, with CRLF line terminators", + "programming_language": "GAS", + "is_binary": false, + "is_text": true, + "is_archive": false, + "is_media": false, + "is_source": true, + "is_script": false, + "files_count": 0, + "dirs_count": 0, + "size_count": 0, + "scan_errors": [] + }, + { + "path": "home/foobar/scancode-toolkit/samples/zlib/infback9", + "type": "directory", + "name": "infback9", + "base_name": "infback9", + "extension": "", + "size": 0, + "date": null, + "sha1": null, + "md5": null, + "sha256": null, + "mime_type": null, + "file_type": null, + "programming_language": null, + "is_binary": false, + "is_text": false, + "is_archive": false, + "is_media": false, + "is_source": false, + "is_script": false, + "files_count": 2, + "dirs_count": 0, + "size_count": 23223, + "scan_errors": [] + }, + { + "path": "home/foobar/scancode-toolkit/samples/zlib/infback9/infback9.c", + "type": "file", + "name": "infback9.c", + "base_name": "infback9", + "extension": ".c", + "size": 21629, + "date": "2022-03-11", + "sha1": "17fb362c03755b12f2dda5b12a68cf38162674bd", + "md5": "23ff5edec0817da303cb1294c1e4205c", + "sha256": "0a715c85a1ce3bb8b5a18d60941ffabc0186a886bcc66ba2ee0c4115a8e274e9", + "mime_type": "text/x-c", + "file_type": "C source, ASCII text", + "programming_language": "C", + "is_binary": false, + "is_text": true, + "is_archive": false, + "is_media": false, + "is_source": true, + "is_script": false, + "files_count": 0, + "dirs_count": 0, + "size_count": 0, + "scan_errors": [] + }, + { + "path": "home/foobar/scancode-toolkit/samples/zlib/infback9/infback9.h", + "type": "file", + "name": "infback9.h", + "base_name": "infback9", + "extension": ".h", + "size": 1594, + "date": "2022-03-11", + "sha1": "d0486a32b558dcaceded5f0746fad62e680a4734", + "md5": "52b1ed99960d3ed7ed60cd20295e64a8", + "sha256": "dda2302f28157fe43a6143f84802af1740393572c2766559593996fd7a5a3245", + "mime_type": "text/x-c", + "file_type": "C source, ASCII text", + "programming_language": "C", + "is_binary": false, + "is_text": true, + "is_archive": false, + "is_media": false, + "is_source": true, + "is_script": false, + "files_count": 0, + "dirs_count": 0, + "size_count": 0, + "scan_errors": [] + }, + { + "path": "home/foobar/scancode-toolkit/samples/zlib/iostream2", + "type": "directory", + "name": "iostream2", + "base_name": "iostream2", + "extension": "", + "size": 0, + "date": null, + "sha1": null, + "md5": null, + "sha256": null, + "mime_type": null, + "file_type": null, + "programming_language": null, + "is_binary": false, + "is_text": false, + "is_archive": false, + "is_media": false, + "is_source": false, + "is_script": false, + "files_count": 2, + "dirs_count": 0, + "size_count": 9994, + "scan_errors": [] + }, + { + "path": "home/foobar/scancode-toolkit/samples/zlib/iostream2/zstream.h", + "type": "file", + "name": "zstream.h", + "base_name": "zstream", + "extension": ".h", + "size": 9283, + "date": "2022-03-11", + "sha1": 
"fca4540d490fff36bb90fd801cf9cd8fc695bb17", + "md5": "a980b61c1e8be68d5cdb1236ba6b43e7", + "sha256": "d0343e0c57ff58008b6f29643d289c72713aa2d653fe3dcd2e939fc77e7e20b6", + "mime_type": "text/x-c++", + "file_type": "C++ source, ASCII text", + "programming_language": "C", + "is_binary": false, + "is_text": true, + "is_archive": false, + "is_media": false, + "is_source": true, + "is_script": false, + "files_count": 0, + "dirs_count": 0, + "size_count": 0, + "scan_errors": [] + }, + { + "path": "home/foobar/scancode-toolkit/samples/zlib/iostream2/zstream_test.cpp", + "type": "file", + "name": "zstream_test.cpp", + "base_name": "zstream_test", + "extension": ".cpp", + "size": 711, + "date": "2022-03-11", + "sha1": "e18a6d55cbbd8b832f8d795530553467e5c74fcf", + "md5": "d32476bde4e6d5f889092fdff6f8cdb0", + "sha256": "f789df183cc58b78751985466380c656308490a9036eb48a7ef79704c3d3f229", + "mime_type": "text/x-c", + "file_type": "C source, ASCII text", + "programming_language": "C++", + "is_binary": false, + "is_text": true, + "is_archive": false, + "is_media": false, + "is_source": true, + "is_script": false, + "files_count": 0, + "dirs_count": 0, + "size_count": 0, + "scan_errors": [] + } + ] +} diff --git a/tests/data/resource/virtual_codebase/full-root-info-one-expected.json b/tests/data/resource/virtual_codebase/full-root-info-one-expected.json new file mode 100644 index 0000000..fde2694 --- /dev/null +++ b/tests/data/resource/virtual_codebase/full-root-info-one-expected.json @@ -0,0 +1,27 @@ +[ + { + "path": "home/foobar/scancode-toolkit/samples/README", + "type": "file", + "name": "README", + "base_name": "README", + "extension": "", + "size": 236, + "date": "2022-03-11", + "sha1": "2e07e32c52d607204fad196052d70e3d18fb8636", + "md5": "effc6856ef85a9250fb1a470792b3f38", + "sha256": "165da86bfdf296cd5a0a3e20c1d1ee86d70ecb8a1fa579d6f8cadad8eee85878", + "mime_type": "text/plain", + "file_type": "ASCII text", + "programming_language": null, + "is_binary": false, + "is_text": true, + "is_archive": false, + "is_media": false, + "is_source": false, + "is_script": false, + "files_count": 0, + "dirs_count": 0, + "size_count": 0, + "scan_errors": [] + } +] \ No newline at end of file diff --git a/tests/data/resource/virtual_codebase/full-root-info-one.json b/tests/data/resource/virtual_codebase/full-root-info-one.json new file mode 100644 index 0000000..b4cf85e --- /dev/null +++ b/tests/data/resource/virtual_codebase/full-root-info-one.json @@ -0,0 +1,43 @@ +{ + "headers": [ + { + "tool_name": "scancode-toolkit", + "tool_version": "31.0.0b4", + "options": { + "input": [ + "samples/README" + ], + "--full-root": true, + "--info": true, + "--json-pp": "-" + } + } + ], + "files": [ + { + "path": "home/foobar/scancode-toolkit//samples/README", + "type": "file", + "name": "README", + "base_name": "README", + "extension": "", + "size": 236, + "date": "2022-03-11", + "sha1": "2e07e32c52d607204fad196052d70e3d18fb8636", + "md5": "effc6856ef85a9250fb1a470792b3f38", + "sha256": "165da86bfdf296cd5a0a3e20c1d1ee86d70ecb8a1fa579d6f8cadad8eee85878", + "mime_type": "text/plain", + "file_type": "ASCII text", + "programming_language": null, + "is_binary": false, + "is_text": true, + "is_archive": false, + "is_media": false, + "is_source": false, + "is_script": false, + "files_count": 0, + "dirs_count": 0, + "size_count": 0, + "scan_errors": [] + } + ] +} diff --git a/tests/data/resource/virtual_codebase/license-scan.json b/tests/data/resource/virtual_codebase/license-scan.json new file mode 100644 index 0000000..8aa3470 --- 
/dev/null +++ b/tests/data/resource/virtual_codebase/license-scan.json @@ -0,0 +1,148 @@ +{ + "headers": [ + { + "tool_name": "scancode-toolkit", + "tool_version": "31.0.0b4", + "options": { + "input": [ + "tests/licensedcode/data/plugin_license/license_reference/scan/scan-ref/" + ], + "--json-pp": "license-scan.json", + "--license": true, + "--license-text": true, + "--license-text-diagnostics": true + }, + "notice": "Generated with ScanCode and provided on an \"AS IS\" BASIS, WITHOUT WARRANTIES\nOR CONDITIONS OF ANY KIND, either express or implied. No content created from\nScanCode should be considered or used as legal advice. Consult an Attorney\nfor any legal advice.\nScanCode is a free software code scanning tool from nexB Inc. and others.\nVisit https://github.com/nexB/scancode-toolkit/ for support and download.", + "start_timestamp": "2022-05-14T202411.569222", + "end_timestamp": "2022-05-14T202414.906550", + "output_format_version": "2.0.0", + "duration": 3.3373379707336426, + "message": null, + "errors": [], + "warnings": [], + "extra_data": { + "system_environment": { + "operating_system": "linux", + "cpu_architecture": "64", + "platform": "Linux-4.15.0-176-generic-x86_64-with-glibc2.17", + "platform_version": "#185~16.04.1-Ubuntu SMP Mon Apr 4 10:41:35 UTC 2022", + "python_version": "3.8.12 (default, Jan 29 2022, 10:00:28) \n[GCC 5.4.0 20160609]" + }, + "spdx_license_list_version": "3.16", + "files_count": 2 + } + } + ], + "files": [ + { + "path": "scan-ref", + "type": "directory", + "licenses": [], + "license_expressions": [], + "percentage_of_license_text": 0, + "scan_errors": [] + }, + { + "path": "scan-ref/LICENSE", + "type": "file", + "licenses": [ + { + "key": "mit", + "score": 100.0, + "name": "MIT License", + "short_name": "MIT License", + "category": "Permissive", + "is_exception": false, + "is_unknown": false, + "owner": "MIT", + "homepage_url": "http://opensource.org/licenses/mit-license.php", + "text_url": "http://opensource.org/licenses/mit-license.php", + "reference_url": "https://scancode-licensedb.aboutcode.org/mit", + "scancode_text_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/mit.LICENSE", + "scancode_data_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/mit.yml", + "spdx_license_key": "MIT", + "spdx_url": "https://spdx.org/licenses/MIT", + "start_line": 1, + "end_line": 1, + "matched_rule": { + "identifier": "mit_66.RULE", + "license_expression": "mit", + "licenses": [ + "mit" + ], + "referenced_filenames": [], + "is_license_text": false, + "is_license_notice": true, + "is_license_reference": false, + "is_license_tag": false, + "is_license_intro": false, + "has_unknown": false, + "matcher": "1-hash", + "rule_length": 10, + "matched_length": 10, + "match_coverage": 100.0, + "rule_relevance": 100 + }, + "matched_text": "that is licensed under [MIT](http://opensource.org/licenses/MIT)." 
+ } + ], + "license_expressions": [ + "mit" + ], + "percentage_of_license_text": 100.0, + "scan_errors": [] + }, + { + "path": "scan-ref/license-notice.txt", + "type": "file", + "licenses": [ + { + "key": "unknown-license-reference", + "score": 100.0, + "name": "Unknown License file reference", + "short_name": "Unknown License reference", + "category": "Unstated License", + "is_exception": false, + "is_unknown": true, + "owner": "Unspecified", + "homepage_url": null, + "text_url": "", + "reference_url": "https://scancode-licensedb.aboutcode.org/unknown-license-reference", + "scancode_text_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/unknown-license-reference.LICENSE", + "scancode_data_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/unknown-license-reference.yml", + "spdx_license_key": "LicenseRef-scancode-unknown-license-reference", + "spdx_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/unknown-license-reference.LICENSE", + "start_line": 34, + "end_line": 34, + "matched_rule": { + "identifier": "unknown-license-reference_25.RULE", + "license_expression": "unknown-license-reference", + "licenses": [ + "unknown-license-reference" + ], + "referenced_filenames": [ + "LICENSE" + ], + "is_license_text": false, + "is_license_notice": false, + "is_license_reference": false, + "is_license_tag": true, + "is_license_intro": false, + "has_unknown": true, + "matcher": "2-aho", + "rule_length": 5, + "matched_length": 5, + "match_coverage": 100.0, + "rule_relevance": 100 + }, + "matched_text": "license\": \"SEE LICENSE IN LICENSE." + } + ], + "license_expressions": [ + "unknown-license-reference" + ], + "percentage_of_license_text": 0.2, + "scan_errors": [] + } + ] +} \ No newline at end of file diff --git a/tests/data/resource/virtual_codebase/path_full_root.json b/tests/data/resource/virtual_codebase/path_full_root.json deleted file mode 100644 index dc768ec..0000000 --- a/tests/data/resource/virtual_codebase/path_full_root.json +++ /dev/null @@ -1,36 +0,0 @@ -{ - "scancode_notice": "Generated with ScanCode and provided on an \"AS IS\" BASIS, WITHOUT WARRANTIES\nOR CONDITIONS OF ANY KIND, either express or implied. No content created from\nScanCode should be considered or used as legal advice. Consult an Attorney\nfor any legal advice.\nScanCode is a free software code scanning tool from nexB Inc. 
and others.\nVisit https://github.com/nexB/scancode-toolkit/ for support and download.",
-  "scancode_version": "2.2.1.post12.6d07756e",
-  "scancode_options": {
-    "--info": true,
-    "--license-score": 0,
-    "--full-root": true,
-    "--format": "json-pp"
-  },
-  "files_count": 1,
-  "files": [
-    {
-      "path": "/Users/sesser/code/nexb/scancode-toolkit/samples/README",
-      "type": "file",
-      "name": "README",
-      "base_name": "README",
-      "extension": "",
-      "date": "2017-09-22",
-      "size": 236,
-      "sha1": "2e07e32c52d607204fad196052d70e3d18fb8636",
-      "md5": "effc6856ef85a9250fb1a470792b3f38",
-      "files_count": null,
-      "mime_type": "text/plain",
-      "file_type": "ASCII text",
-      "programming_language": null,
-      "is_binary": false,
-      "is_text": true,
-      "is_archive": false,
-      "is_media": false,
-      "is_source": false,
-      "is_script": false,
-      "scan_errors": []
-    }
-  ]
- }
- 
\ No newline at end of file
diff --git a/tests/data/resource/virtual_codebase/resource.json b/tests/data/resource/virtual_codebase/resource.json
index 4056903..9df67d6 100644
--- a/tests/data/resource/virtual_codebase/resource.json
+++ b/tests/data/resource/virtual_codebase/resource.json
@@ -1 +1,28 @@
-{"files":[{"path":"resource","type":"directory","name":"resource","base_name":"resource","extension":"","size":0,"date":null,"sha1":null,"md5":null,"mime_type":null,"file_type":null,"programming_language":null,"is_binary":false,"is_text":false,"is_archive":false,"is_media":false,"is_source":false,"is_script":false,"files_count":0,"dirs_count":0,"size_count":0,"scan_errors":[]}]}
+{
+  "files": [
+    {
+      "path": "resource",
+      "type": "directory",
+      "name": "resource",
+      "base_name": "resource",
+      "extension": "",
+      "size": 0,
+      "date": null,
+      "sha1": null,
+      "md5": null,
+      "mime_type": null,
+      "file_type": null,
+      "programming_language": null,
+      "is_binary": false,
+      "is_text": false,
+      "is_archive": false,
+      "is_media": false,
+      "is_source": false,
+      "is_script": false,
+      "files_count": 0,
+      "dirs_count": 0,
+      "size_count": 0,
+      "scan_errors": []
+    }
+  ]
+}
diff --git a/tests/data/resource/with_path/codebase-expected.json b/tests/data/resource/with_path/codebase-expected.json
new file mode 100644
index 0000000..ee8cbd4
--- /dev/null
+++ b/tests/data/resource/with_path/codebase-expected.json
@@ -0,0 +1,17 @@
+[
+  {
+    "path": "codebase",
+    "type": "directory",
+    "scan_errors": []
+  },
+  {
+    "path": "codebase/other dir",
+    "type": "directory",
+    "scan_errors": []
+  },
+  {
+    "path": "codebase/other dir/file",
+    "type": "file",
+    "scan_errors": []
+  }
+]
\ No newline at end of file
diff --git a/tests/data/resource/with_path/codebase/abc b/tests/data/resource/with_path/codebase/abc
new file mode 100644
index 0000000..e69de29
diff --git a/tests/data/resource/with_path/codebase/dir/that b/tests/data/resource/with_path/codebase/dir/that
new file mode 100644
index 0000000..e69de29
diff --git a/tests/data/resource/with_path/codebase/dir/this b/tests/data/resource/with_path/codebase/dir/this
new file mode 100644
index 0000000..e69de29
diff --git a/tests/data/resource/with_path/codebase/et131x.h b/tests/data/resource/with_path/codebase/et131x.h
new file mode 100644
index 0000000..4ffb839
--- /dev/null
+++ b/tests/data/resource/with_path/codebase/et131x.h
@@ -0,0 +1,47 @@
+/* Copyright © 2005 Agere Systems Inc.
+ * All rights reserved.
+ *   http://www.agere.com
+ *
+ * SOFTWARE LICENSE
+ *
+ * This software is provided subject to the following terms and conditions,
+ * which you should read carefully before using the software. Using this
+ * software indicates your acceptance of these terms and conditions. If you do
+ * not agree with these terms and conditions, do not use the software.
+ *
+ * Copyright © 2005 Agere Systems Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source or binary forms, with or without
+ * modifications, are permitted provided that the following conditions are met:
+ *
+ * . Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following Disclaimer as comments in the code as
+ *    well as in the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * . Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following Disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * . Neither the name of Agere Systems Inc. nor the names of the contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * Disclaimer
+ *
+ * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ * INCLUDING, BUT NOT LIMITED TO, INFRINGEMENT AND THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. ANY
+ * USE, MODIFICATION OR DISTRIBUTION OF THIS SOFTWARE IS SOLELY AT THE USERS OWN
+ * RISK. IN NO EVENT SHALL AGERE SYSTEMS INC. OR CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, INCLUDING, BUT NOT LIMITED TO, CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ *
+ */
+
diff --git a/tests/data/resource/with_path/codebase/other dir/file b/tests/data/resource/with_path/codebase/other dir/file
new file mode 100644
index 0000000..e69de29
diff --git a/tests/data/resource/with_path/virtual-codebase-expected.json b/tests/data/resource/with_path/virtual-codebase-expected.json
new file mode 100644
index 0000000..ee8cbd4
--- /dev/null
+++ b/tests/data/resource/with_path/virtual-codebase-expected.json
@@ -0,0 +1,17 @@
+[
+  {
+    "path": "codebase",
+    "type": "directory",
+    "scan_errors": []
+  },
+  {
+    "path": "codebase/other dir",
+    "type": "directory",
+    "scan_errors": []
+  },
+  {
+    "path": "codebase/other dir/file",
+    "type": "file",
+    "scan_errors": []
+  }
+]
\ No newline at end of file
diff --git a/tests/data/resource/with_path/virtual-codebase.json b/tests/data/resource/with_path/virtual-codebase.json
new file mode 100644
index 0000000..db44206
--- /dev/null
+++ b/tests/data/resource/with_path/virtual-codebase.json
@@ -0,0 +1,75 @@
+{
+  "headers": [
+    {
+      "tool_name": "scancode-toolkit",
+      "tool_version": "31.0.0b4",
+      "options": {
+        "input": [
+          "tests/data/resource/with_path/codebase"
+        ],
+        "--json-pp": "tests/data/resource/with_path/virtual-codebase.json"
+      },
+      "notice": "Generated with ScanCode and provided on an \"AS IS\" BASIS, WITHOUT WARRANTIES\nOR CONDITIONS OF ANY KIND, either express or implied. No content created from\nScanCode should be considered or used as legal advice. Consult an Attorney\nfor any legal advice.\nScanCode is a free software code scanning tool from nexB Inc. and others.\nVisit https://github.com/nexB/scancode-toolkit/ for support and download.",
+      "start_timestamp": "2022-05-16T125046.244384",
+      "end_timestamp": "2022-05-16T125046.286382",
+      "output_format_version": "2.0.0",
+      "duration": 0.0420069694519043,
+      "message": null,
+      "errors": [],
+      "warnings": [],
+      "extra_data": {
+        "system_environment": {
+          "operating_system": "linux",
+          "cpu_architecture": "64",
+          "platform": "Linux-4.15.0-177-generic-x86_64-with-glibc2.17",
+          "platform_version": "#186~16.04.1-Ubuntu SMP Wed Apr 20 09:41:17 UTC 2022",
+          "python_version": "3.8.12 (default, Jan 29 2022, 10:00:28) \n[GCC 5.4.0 20160609]"
+        },
+        "spdx_license_list_version": "3.16",
+        "files_count": 5
+      }
+    }
+  ],
+  "files": [
+    {
+      "path": "codebase",
+      "type": "directory",
+      "scan_errors": []
+    },
+    {
+      "path": "codebase/abc",
+      "type": "file",
+      "scan_errors": []
+    },
+    {
+      "path": "codebase/et131x.h",
+      "type": "file",
+      "scan_errors": []
+    },
+    {
+      "path": "codebase/dir",
+      "type": "directory",
+      "scan_errors": []
+    },
+    {
+      "path": "codebase/dir/that",
+      "type": "file",
+      "scan_errors": []
+    },
+    {
+      "path": "codebase/dir/this",
+      "type": "file",
+      "scan_errors": []
+    },
+    {
+      "path": "codebase/other dir",
+      "type": "directory",
+      "scan_errors": []
+    },
+    {
+      "path": "codebase/other dir/file",
+      "type": "file",
+      "scan_errors": []
+    }
+  ]
+}
\ No newline at end of file
diff --git a/tests/test_filetype.py b/tests/test_filetype.py
index d272dff..2ca4d1d 100644
--- a/tests/test_filetype.py
+++ b/tests/test_filetype.py
@@ -7,9 +7,9 @@
 #
 
 import os
-
 from os.path import join
 from os.path import exists
+from unittest import skipIf
 
 from commoncode import filetype
 from commoncode import fileutils
@@ -194,7 +194,7 @@ def test_get_file_count(self):
         assert result == count
 
 
-def SymlinkTest(FileBasedTesting):
+class SymlinkTest(FileBasedTesting):
     test_data_dir = os.path.join(os.path.dirname(__file__), 'data')
 
     @skipIf(on_windows, 'os.symlink does not work on Windows')
diff --git a/tests/test_paths.py b/tests/test_paths.py
index 10c42e0..7357182 100644
--- a/tests/test_paths.py
+++ b/tests/test_paths.py
@@ -70,7 +70,7 @@ def test_safe_path_posix_style_french_char(self):
 
     def test_safe_path_posix_style_chinese_char(self):
         test = paths.safe_path(b'/includes/webform.compon\xd2\xaants.inc/')
-        expected = 'includes/webform.componS_nts.inc'
+        expected = 'includes/webform.componNSnts.inc'
         assert test == expected
 
     def test_safe_path_windows_style_dots(self):
diff --git a/tests/test_resource.py b/tests/test_resource.py
index 03feb06..907436d 100644
--- a/tests/test_resource.py
+++ b/tests/test_resource.py
@@ -8,17 +8,17 @@
 
 import json
 import os
-
 from os.path import dirname
 from os.path import exists
 from os.path import join
 
 from commoncode.fileutils import parent_directory
-from commoncode.testcase import FileBasedTesting
 from commoncode.resource import Codebase
-from commoncode.resource import get_path
+from commoncode.resource import Resource
 from commoncode.resource import VirtualCodebase
 from commoncode.resource import depth_walk
+from commoncode.testcase import FileBasedTesting
+from commoncode.testcase import check_against_expected_json_file
 
 
 class TestCodebase(FileBasedTesting):
@@ -30,19 +30,20 @@ def test_walk_defaults(self):
         results = list(codebase.walk())
         expected = [
             ('codebase', False),
-          ('abc', True),
-          ('et131x.h', True),
-          ('dir', False),
-          ('that', True),
-          ('this', True),
-          ('other dir', False),
-          ('file', True),
+            ('abc', True),
+            ('et131x.h', True),
+            ('dir', False),
+            ('that', True),
+            ('this', True),
+            ('other dir', False),
+            ('file', True),
         ]
         assert [(r.name, r.is_file) for r in results] == expected
 
     def test_Codebase_do_not_ignore_by_default_older_sccs_and_rcs_dirs(self):
         # See https://github.com/nexB/scancode-toolkit/issues/1422
         from commoncode.fileutils import create_dir
+
         test_codebase = self.get_temp_dir()
         create_dir(join(test_codebase, 'sccs', 'a'))
         create_dir(join(test_codebase, 'rcs', 'b'))
@@ -57,13 +58,13 @@ def test_walk_topdown(self):
         results = list(codebase.walk(topdown=True))
         expected = [
             ('codebase', False),
-          ('abc', True),
-          ('et131x.h', True),
-          ('dir', False),
-          ('that', True),
-          ('this', True),
-          ('other dir', False),
-          ('file', True),
+            ('abc', True),
+            ('et131x.h', True),
+            ('dir', False),
+            ('that', True),
+            ('this', True),
+            ('other dir', False),
+            ('file', True),
         ]
         assert [(r.name, r.is_file) for r in results] == expected
@@ -72,13 +73,13 @@ def test_walk_bottomup(self):
         test_codebase = self.get_test_loc('resource/codebase')
         codebase = Codebase(test_codebase)
         results = list(codebase.walk(topdown=False))
         expected = [
-          ('abc', True),
-          ('et131x.h', True),
-          ('that', True),
-          ('this', True),
-          ('dir', False),
-          ('file', True),
-          ('other dir', False),
+            ('abc', True),
+            ('et131x.h', True),
+            ('that', True),
+            ('this', True),
+            ('dir', False),
+            ('file', True),
+            ('other dir', False),
             ('codebase', False),
         ]
         assert [(r.name, r.is_file) for r in results] == expected
@@ -91,10 +92,10 @@ def test_walk_skip_root_basic(self):
             ('abc', True),
             ('et131x.h', True),
             ('dir', False),
-          ('that', True),
-          ('this', True),
+            ('that', True),
+            ('this', True),
             ('other dir', False),
-          ('file', True),
+            ('file', True),
         ]
         assert [(r.name, r.is_file) for r in results] == expected
@@ -204,11 +205,11 @@ def test_walk_filtered_dirs(self):
         results = list(codebase.walk_filtered(topdown=True))
 
         expected = [
-          ('abc', True),
-          ('et131x.h', True),
-          ('that', True),
-          ('this', True),
-          ('file', True),
+            ('abc', True),
+            ('et131x.h', True),
+            ('that', True),
+            ('this', True),
+            ('file', True),
         ]
         assert [(r.name, r.is_file) for r in results] == expected
@@ -243,18 +244,14 @@ def test_walk_skip_root_single_file(self):
         test_codebase = self.get_test_loc('resource/codebase/et131x.h')
         codebase = Codebase(test_codebase)
         results = list(codebase.walk(skip_root=True))
-        expected = [
-            ('et131x.h', True)
-        ]
+        expected = [('et131x.h', True)]
         assert [(r.name, r.is_file) for r in results] == expected
 
     def test_walk_filtered_with_skip_root_and_single_file_not_filtered(self):
         test_codebase = self.get_test_loc('resource/codebase/et131x.h')
         codebase = Codebase(test_codebase)
         results = list(codebase.walk_filtered(skip_root=True))
-        expected = [
-            ('et131x.h', True)
-        ]
+        expected = [('et131x.h', True)]
         assert [(r.name, r.is_file) for r in results] == expected
 
     def test_walk_filtered__with_skip_root_and_filtered_single_file(self):
@@ -263,39 +260,38 @@ def test_walk_filtered__with_skip_root_and_filtered_single_file(self):
         codebase.root.is_filtered = True
         codebase.save_resource(codebase.root)
         results = list(codebase.walk_filtered(skip_root=True))
-        expected = [
-        ]
+        expected = []
         assert [(r.name, r.is_file) for r in results] == expected
 
     def test_walk_skip_root_single_file_with_children(self):
         test_codebase = self.get_test_loc('resource/codebase/et131x.h')
         codebase = Codebase(test_codebase, strip_root=True)
+        root = codebase.root
+
+        c1 = codebase._get_or_create_resource('child1', parent=root, is_file=True)
+        codebase._get_or_create_resource('child2', parent=c1, is_file=False)
 
-        c1 = codebase._create_resource('some child', parent=codebase.root, is_file=True)
-        _c2 = codebase._create_resource('some child2', parent=c1, is_file=False)
         results = list(codebase.walk(skip_root=True))
-        expected = [
-            (u'some child', True), (u'some child2', False)
-        ]
+        expected = [('et131x.h', True), ('child1', True), ('child2', False)]
         assert [(r.name, r.is_file) for r in results] == expected
 
     def test_walk_filtered_with_skip_root_and_single_file_with_children(self):
         test_codebase = self.get_test_loc('resource/codebase/et131x.h')
         codebase = Codebase(test_codebase, strip_root=True)
 
-        c1 = codebase._create_resource('some child', parent=codebase.root, is_file=True)
-        c2 = codebase._create_resource('some child2', parent=c1, is_file=False)
+        c1 = codebase._get_or_create_resource('some child', parent=codebase.root, is_file=True)
+        c2 = codebase._get_or_create_resource('some child2', parent=c1, is_file=False)
         c2.is_filtered = True
         codebase.save_resource(c2)
 
         results = list(codebase.walk_filtered(skip_root=True))
-        expected = [(u'some child', True)]
+        expected = [('et131x.h', True), ('some child', True)]
         assert [(r.name, r.is_file) for r in results] == expected
 
         c1.is_filtered = True
         codebase.save_resource(c1)
         results = list(codebase.walk_filtered(skip_root=True))
-        expected = []
+        expected = [('et131x.h', True)]
         assert [(r.name, r.is_file) for r in results] == expected
 
     def test_walk_skip_root_single_dir(self):
@@ -303,22 +299,24 @@ def test_walk_skip_root_single_dir(self):
         codebase = Codebase(test_codebase, strip_root=True)
 
         results = list(codebase.walk(skip_root=True))
-        expected = [
-            ('walk', False)
-        ]
+        expected = [('walk', False)]
         assert [(r.name, r.is_file) for r in results] == expected
 
     def test_walk_skipped_directories_should_not_be_yielded(self):
         # Resources that we continue past should not be added to the result list
         test_codebase = self.get_test_loc('resource/skip_directories_during_walk')
-        codebase = Codebase(test_codebase)
-        result = []
+        cdbs = Codebase(test_codebase)
 
-        def _ignored(_resource, _codebase):
-            return _resource.is_dir and _resource.name == 'skip-this-directory'
+        def _ignored(resource, codebase):
+            return resource.is_dir and resource.name == 'skip-this-directory'
 
-        for resource in codebase.walk(topdown=True, ignored=_ignored,):
-            result.append(resource.name)
+        result = [
+            res.name
+            for res in cdbs.walk(
+                topdown=True,
+                ignored=_ignored,
+            )
+        ]
 
         expected = ['skip_directories_during_walk', 'this-should-be-returned']
         assert result == expected
@@ -326,26 +324,37 @@ def _ignored(_resource, _codebase):
     def test__create_resource_can_add_child_to_file(self):
         test_codebase = self.get_test_loc('resource/codebase/et131x.h')
         codebase = Codebase(test_codebase)
-        codebase._create_resource('some child', codebase.root, is_file=True)
+        codebase._get_or_create_resource('some child', codebase.root, is_file=True)
         results = list(codebase.walk())
-        expected = [('et131x.h', True), (u'some child', True)]
+        expected = [('et131x.h', True), ('some child', True)]
         assert [(r.name, r.is_file) for r in results] == expected
 
     def test__create_resource_can_add_child_to_dir(self):
         test_codebase = self.get_temp_dir('resource')
         codebase = Codebase(test_codebase)
-        codebase._create_resource('some child', codebase.root, is_file=False)
+        codebase._get_or_create_resource('some child', codebase.root, is_file=False)
        results = list(codebase.walk())
-        expected = [('resource', False), (u'some child', False)]
+        expected = [('resource', False), ('some child', False)]
         assert [(r.name, r.is_file) for r in results] == expected
 
-    def test_get_resource(self):
+    def test_get_resource_for_single_resource_codebase(self):
+        test_codebase = self.get_temp_dir('resource')
+        codebase = Codebase(test_codebase)
+        assert not (codebase.root is codebase.get_resource('resource'))
+        assert codebase.get_resource('resource') == codebase.root
+
+    def test_get_resource_for_multiple_resource_codebase(self):
         test_codebase = self.get_temp_dir('resource')
+        for name in ('a', 'b', 'c'):
+            with open(os.path.join(test_codebase, name), 'w') as o:
+                o.write('\n')
+
         codebase = Codebase(test_codebase)
-        assert not (codebase.root is codebase.get_resource(0))
-        assert codebase.get_resource(0) == codebase.root
+        assert codebase.get_resource('resource/a').path == 'resource/a'
+        assert codebase.get_resource('/resource/c').path == 'resource/c'
+        assert codebase.get_resource('resource/dsasda/../b/').path == 'resource/b'
 
-    def test_get_path(self):
+    def test_Resource_build_path(self):
         test_dir = self.get_test_loc('resource/samples')
         locations = []
         for top, dirs, files in os.walk(test_dir):
@@ -354,75 +363,113 @@
             for x in files:
                 locations.append(os.path.join(top, x))
 
+        codebase = Codebase(location=test_dir)
+        resources_no_root = list(codebase.walk(skip_root=True))
+
         expected_default = [
-            u'samples/JGroups', u'samples/zlib', u'samples/arch',
-            u'samples/README', u'samples/screenshot.png',
-            u'samples/JGroups/src', u'samples/JGroups/licenses',
-            u'samples/JGroups/LICENSE', u'samples/JGroups/EULA',
-            u'samples/JGroups/src/GuardedBy.java',
-            u'samples/JGroups/src/ImmutableReference.java',
-            u'samples/JGroups/src/RouterStub.java',
-            u'samples/JGroups/src/S3_PING.java',
-            u'samples/JGroups/src/FixedMembershipToken.java',
-            u'samples/JGroups/src/RouterStubManager.java',
-            u'samples/JGroups/src/RATE_LIMITER.java',
-            u'samples/JGroups/licenses/cpl-1.0.txt',
-            u'samples/JGroups/licenses/bouncycastle.txt',
-            u'samples/JGroups/licenses/lgpl.txt',
-            u'samples/JGroups/licenses/apache-2.0.txt',
-            u'samples/JGroups/licenses/apache-1.1.txt', u'samples/zlib/dotzlib',
-            u'samples/zlib/iostream2', u'samples/zlib/infback9',
-            u'samples/zlib/gcc_gvmat64', u'samples/zlib/ada',
-            u'samples/zlib/deflate.h', u'samples/zlib/zutil.c',
-            u'samples/zlib/zlib.h', u'samples/zlib/deflate.c',
-            u'samples/zlib/zutil.h', u'samples/zlib/adler32.c',
-            u'samples/zlib/dotzlib/AssemblyInfo.cs',
-            u'samples/zlib/dotzlib/LICENSE_1_0.txt',
-            u'samples/zlib/dotzlib/readme.txt',
-            u'samples/zlib/dotzlib/ChecksumImpl.cs',
-            u'samples/zlib/iostream2/zstream_test.cpp',
-            u'samples/zlib/iostream2/zstream.h',
-            u'samples/zlib/infback9/infback9.c',
-            u'samples/zlib/infback9/infback9.h',
-            u'samples/zlib/gcc_gvmat64/gvmat64.S', u'samples/zlib/ada/zlib.ads',
-            u'samples/arch/zlib.tar.gz']
-
-        default = sorted(get_path(test_dir, loc) for loc in locations)
+            'samples/JGroups',
+            'samples/zlib',
+            'samples/arch',
+            'samples/README',
+            'samples/screenshot.png',
+            'samples/JGroups/src',
+            'samples/JGroups/licenses',
+            'samples/JGroups/LICENSE',
+            'samples/JGroups/EULA',
+            'samples/JGroups/src/GuardedBy.java',
+            'samples/JGroups/src/ImmutableReference.java',
+            'samples/JGroups/src/RouterStub.java',
+            'samples/JGroups/src/S3_PING.java',
+            'samples/JGroups/src/FixedMembershipToken.java',
+            'samples/JGroups/src/RouterStubManager.java',
+            'samples/JGroups/src/RATE_LIMITER.java',
+            'samples/JGroups/licenses/cpl-1.0.txt',
+            'samples/JGroups/licenses/bouncycastle.txt',
+            'samples/JGroups/licenses/lgpl.txt',
+            'samples/JGroups/licenses/apache-2.0.txt',
+            'samples/JGroups/licenses/apache-1.1.txt',
+            'samples/zlib/dotzlib',
+            'samples/zlib/iostream2',
+            'samples/zlib/infback9',
+            'samples/zlib/gcc_gvmat64',
+            'samples/zlib/ada',
+            'samples/zlib/deflate.h',
+            'samples/zlib/zutil.c',
+            'samples/zlib/zlib.h',
+            'samples/zlib/deflate.c',
+            'samples/zlib/zutil.h',
+            'samples/zlib/adler32.c',
+            'samples/zlib/dotzlib/AssemblyInfo.cs',
+            'samples/zlib/dotzlib/LICENSE_1_0.txt',
+            'samples/zlib/dotzlib/readme.txt',
+            'samples/zlib/dotzlib/ChecksumImpl.cs',
+            'samples/zlib/iostream2/zstream_test.cpp',
+            'samples/zlib/iostream2/zstream.h',
+            'samples/zlib/infback9/infback9.c',
+            'samples/zlib/infback9/infback9.h',
+            'samples/zlib/gcc_gvmat64/gvmat64.S',
+            'samples/zlib/ada/zlib.ads',
+            'samples/arch/zlib.tar.gz',
+        ]
+
+        default = sorted(
+            Resource.build_path(root_location=test_dir, location=loc) for loc in locations
+        )
         assert default == sorted(expected_default)
 
         expected_strip_root = [
-            u'JGroups', u'zlib', u'arch', u'README', u'screenshot.png',
-            u'JGroups/src', u'JGroups/licenses', u'JGroups/LICENSE',
-            u'JGroups/EULA', u'JGroups/src/GuardedBy.java',
-            u'JGroups/src/ImmutableReference.java',
-            u'JGroups/src/RouterStub.java', u'JGroups/src/S3_PING.java',
-            u'JGroups/src/FixedMembershipToken.java',
-            u'JGroups/src/RouterStubManager.java',
-            u'JGroups/src/RATE_LIMITER.java', u'JGroups/licenses/cpl-1.0.txt',
-            u'JGroups/licenses/bouncycastle.txt', u'JGroups/licenses/lgpl.txt',
-            u'JGroups/licenses/apache-2.0.txt',
-            u'JGroups/licenses/apache-1.1.txt', u'zlib/dotzlib',
-            u'zlib/iostream2', u'zlib/infback9', u'zlib/gcc_gvmat64',
-            u'zlib/ada', u'zlib/deflate.h', u'zlib/zutil.c', u'zlib/zlib.h',
-            u'zlib/deflate.c', u'zlib/zutil.h', u'zlib/adler32.c',
-            u'zlib/dotzlib/AssemblyInfo.cs', u'zlib/dotzlib/LICENSE_1_0.txt',
-            u'zlib/dotzlib/readme.txt', u'zlib/dotzlib/ChecksumImpl.cs',
-            u'zlib/iostream2/zstream_test.cpp', u'zlib/iostream2/zstream.h',
-            u'zlib/infback9/infback9.c', u'zlib/infback9/infback9.h',
-            u'zlib/gcc_gvmat64/gvmat64.S', u'zlib/ada/zlib.ads',
-            u'arch/zlib.tar.gz']
-
-        skipped = sorted(get_path(test_dir, loc, strip_root=True) for loc in locations)
-        assert skipped == sorted(expected_strip_root)
+            'JGroups',
+            'zlib',
+            'arch',
+            'README',
+            'screenshot.png',
+            'JGroups/src',
+            'JGroups/licenses',
+            'JGroups/LICENSE',
+            'JGroups/EULA',
+            'JGroups/src/GuardedBy.java',
+            'JGroups/src/ImmutableReference.java',
+            'JGroups/src/RouterStub.java',
+            'JGroups/src/S3_PING.java',
+            'JGroups/src/FixedMembershipToken.java',
+            'JGroups/src/RouterStubManager.java',
+            'JGroups/src/RATE_LIMITER.java',
+            'JGroups/licenses/cpl-1.0.txt',
+            'JGroups/licenses/bouncycastle.txt',
+            'JGroups/licenses/lgpl.txt',
+            'JGroups/licenses/apache-2.0.txt',
+            'JGroups/licenses/apache-1.1.txt',
+            'zlib/dotzlib',
+            'zlib/iostream2',
+            'zlib/infback9',
+            'zlib/gcc_gvmat64',
+            'zlib/ada',
+            'zlib/deflate.h',
+            'zlib/zutil.c',
+            'zlib/zlib.h',
+            'zlib/deflate.c',
+            'zlib/zutil.h',
+            'zlib/adler32.c',
+            'zlib/dotzlib/AssemblyInfo.cs',
+            'zlib/dotzlib/LICENSE_1_0.txt',
+            'zlib/dotzlib/readme.txt',
+            'zlib/dotzlib/ChecksumImpl.cs',
+            'zlib/iostream2/zstream_test.cpp',
+            'zlib/iostream2/zstream.h',
+            'zlib/infback9/infback9.c',
+            'zlib/infback9/infback9.h',
+            'zlib/gcc_gvmat64/gvmat64.S',
+            'zlib/ada/zlib.ads',
+            'arch/zlib.tar.gz',
+        ]
+        stripped = sorted(r.strip_root_path for r in resources_no_root)
+        assert stripped == sorted(expected_strip_root)
 
         expected_full_ends = sorted(expected_default)
-        full = sorted(get_path(test_dir, loc, full_root=True) for loc in locations)
+        full = sorted(r.full_root_path for r in resources_no_root)
         for full_loc, ending in zip(full, expected_full_ends):
             assert full_loc.endswith((ending))
 
-        full_skipped = sorted(get_path(test_dir, loc, full_root=True, strip_root=True) for loc in locations)
-        assert full_skipped == full
-
     def test_compute_counts_when_using_disk_cache(self):
         test_codebase = self.get_test_loc('resource/samples')
         codebase = Codebase(test_codebase, strip_root=True, max_in_memory=-1)
@@ -431,114 +478,25 @@ def test_compute_counts_when_using_disk_cache(self):
         assert 11 == dirs_count
         assert 0 == size_count
 
-    def test_low_max_in_memory_does_not_raise_exception_when_ignoring_files(self):
-
-        from commoncode.fileset import is_included
-
-        test_codebase = self.get_test_loc('resource/client')
-        codebase = Codebase(test_codebase, strip_root=True, max_in_memory=1)
-
-        # Ignore GIFs, code taken from scancode/plugin_ignore.py
-        ignores = {
-            '*.gif': 'User ignore: Supplied by --ignore'
-        }
-        remove_resource = codebase.remove_resource
-
-        for resource in codebase.walk(topdown=True):
-            if not is_included(resource.path, excludes=ignores):
-                for child in resource.children(codebase):
-                    remove_resource(child)
-                if not resource.is_root:
-                    remove_resource(resource)
-
-        # Walk through the codebase and save each Resource,
-        # UnknownResource exception should not be raised
-        save_resource = codebase.save_resource
-        for resource in codebase.walk(topdown=True):
-            save_resource(resource)
-
-    def test_lowest_common_parent_1(self):
-        test_codebase = self.get_test_loc('resource/lcp/test1')
-        codebase = Codebase(test_codebase)
-        lcp = codebase.lowest_common_parent()
-        assert lcp.path == 'test1'
-        assert lcp.name == 'test1'
-
-    def test_lowest_common_parent_strip(self):
-        test_codebase = self.get_test_loc('resource/lcp/test1')
-        codebase = Codebase(test_codebase, strip_root=True)
-        lcp = codebase.lowest_common_parent()
-        assert lcp.path == ''
-        assert lcp.name == 'test1'
-
-    def test_lowest_common_parent_full(self):
-        test_codebase = self.get_test_loc('resource/lcp/test1')
-        codebase = Codebase(test_codebase, full_root=True)
-        lcp = codebase.lowest_common_parent()
-        assert lcp.name == 'test1'
-
-    def test_lowest_common_parent_2(self):
-        test_codebase = self.get_test_loc('resource/lcp/test1/zlib')
-        codebase = Codebase(test_codebase)
-        lcp = codebase.lowest_common_parent()
-        assert lcp.path == 'zlib'
-        assert lcp.name == 'zlib'
-
-    def test_lowest_common_parent_3(self):
-        test_codebase = self.get_test_loc('resource/lcp/test1/simple')
-        codebase = Codebase(test_codebase)
-        lcp = codebase.lowest_common_parent()
-        assert lcp.path == 'simple'
-        assert lcp.name == 'simple'
-
-    def test_lowest_common_parent_deep(self):
-        test_codebase = self.get_test_loc('resource/lcp/test1/simple/org')
-        codebase = Codebase(test_codebase)
-        lcp = codebase.lowest_common_parent()
-        assert lcp.path == 'org/jvnet/glassfish/comms/sipagent'
-        assert lcp.name == 'sipagent'
-
-    def test_lowest_common_parent_solo_file(self):
-        test_codebase = self.get_test_loc('resource/lcp/test1/screenshot.png')
-        codebase = Codebase(test_codebase)
-        lcp = codebase.lowest_common_parent()
-        assert lcp.path == 'screenshot.png'
-        assert lcp.name == 'screenshot.png'
-
-    def test_lowest_common_parent_solo_file_strip(self):
-        test_codebase = self.get_test_loc('resource/lcp/test1/screenshot.png')
-        codebase = Codebase(test_codebase, strip_root=True)
-        lcp = codebase.lowest_common_parent()
-        assert lcp.path == 'screenshot.png'
-        assert lcp.name == 'screenshot.png'
-
-    def test_lowest_common_parent_solo_file_full(self):
-        test_codebase = self.get_test_loc('resource/lcp/test1/screenshot.png')
-        codebase = Codebase(test_codebase, full_root=True)
-        lcp = codebase.lowest_common_parent()
-        assert lcp.name == 'screenshot.png'
-
     def test_distance(self):
         test_dir = self.get_test_loc('resource/dist')
         codebase = Codebase(test_dir)
         assert codebase.root.distance(test_dir) == 0
 
-        res = codebase.get_resource(1)
+        res = codebase.get_resource('dist/JGroups')
         assert res.name == 'JGroups'
         assert res.distance(codebase) == 1
 
-        res = codebase.get_resource(10)
+        res = codebase.get_resource('dist/simple/META-INF/MANIFEST.MF')
         assert res.name == 'MANIFEST.MF'
+        assert res.full_root_path.endswith('resource/dist/simple/META-INF/MANIFEST.MF')
         assert res.distance(codebase) == 3
 
     def test_skip_files_and_subdirs_of_ignored_dirs(self):
         test_dir = self.get_test_loc('resource/ignore')
         codebase = Codebase(test_dir)
         # The `cvs` directory should not be visited
-        expected = [
-            'ignore',
-            'ignore/file1'
-        ]
+        expected = ['ignore', 'ignore/file1']
         result = [r.path for r in codebase.walk(topdown=True)]
         self.assertEqual(expected, result)
@@ -573,11 +531,16 @@ def test_depth_walk_with_depth_2(self):
         results = list(depth_walk(test_codebase, 2))
         result_dirs = [i for j in results for i in j[1]].sort()
         result_files = [i for j in results for i in j[2]].sort()
-        expected_files = ['level1_file1', 'level1_file2', 'level2_file2',
-                          'level2_file1', 'level2_file3', 'level2_file4',
-                          'level2_file5'].sort()
-        expected_dirs = ['level1_dir1', 'level1_dir2', 'level2_dir1',
-                         'level2_dir3'].sort()
+        expected_files = [
+            'level1_file1',
+            'level1_file2',
+            'level2_file2',
+            'level2_file1',
+            'level2_file3',
+            'level2_file4',
+            'level2_file5',
+        ].sort()
+        expected_dirs = ['level1_dir1', 'level1_dir2', 'level2_dir1', 'level2_dir3'].sort()
         self.assertEqual(result_dirs, expected_dirs)
         self.assertEqual(result_files, expected_files)
@@ -586,12 +549,26 @@ def test_depth_walk_with_depth_3(self):
         results = list(depth_walk(test_codebase, 3))
         result_dirs = [i for j in results for i in j[1]].sort()
         result_files = [i for j in results for i in j[2]].sort()
-        expected_files = ['level1_file1', 'level1_file2', 'level2_file2',
-                          'level2_file1', 'level3_file2', 'level3_file1',
-                          'level2_file3', 'level2_file4', 'level2_file5',
-                          'level3_file4', 'level3_file3'].sort()
-        expected_dirs = ['level1_dir1', 'level1_dir2', 'level2_dir1',
-                         'level3_dir1', 'level2_dir3'].sort()
+        expected_files = [
+            'level1_file1',
+            'level1_file2',
+            'level2_file2',
+            'level2_file1',
+            'level3_file2',
+            'level3_file1',
+            'level2_file3',
+            'level2_file4',
+            'level2_file5',
+            'level3_file4',
+            'level3_file3',
+        ].sort()
+        expected_dirs = [
+            'level1_dir1',
+            'level1_dir2',
+            'level2_dir1',
+            'level3_dir1',
+            'level2_dir3',
+        ].sort()
         self.assertEqual(result_dirs, expected_dirs)
         self.assertEqual(result_files, expected_files)
@@ -601,10 +578,10 @@ def test_specify_depth_1(self):
         results = list(codebase.walk())
         expected = [
             ('deeply_nested', False),
-          ('level1_dir1', False),
-          ('level1_dir2', False),
-          ('level1_file1', True),
-          ('level1_file2', True),
+            ('level1_dir1', False),
+            ('level1_dir2', False),
+            ('level1_file1', True),
+            ('level1_file2', True),
         ]
         assert [(r.name, r.is_file) for r in results] == expected
@@ -615,17 +592,17 @@ def test_specify_depth_2(self):
 
         expected = [
             ('deeply_nested', False),
-          ('level1_file1', True),
-          ('level1_file2', True),
-          ('level1_dir1', False),
-          ('level2_dir1', False),
-          ('level2_file1', True),
-          ('level2_file2', True),
-          ('level1_dir2', False),
-          ('level2_dir3', False),
-          ('level2_file3', 
True), - ('level2_file4', True), - ('level2_file5', True), + ('level2_file3', True), + ('level2_file4', True), + ('level2_file5', True), ] assert [(r.name, r.is_file) for r in results] == expected @@ -636,93 +613,142 @@ def test_specify_depth_3(self): expected = [ ('deeply_nested', False), - ('level1_file1', True), - ('level1_file2', True), - ('level1_dir1', False), - ('level2_file1', True), - ('level2_file2', True), - ('level2_dir1', False), - ('level3_dir1', False), - ('level3_file1', True), - ('level3_file2', True), - ('level1_dir2', False), - ('level2_file3', True), - ('level2_file4', True), - ('level2_file5', True), - ('level2_dir3', False), - ('level3_file3', True), - ('level3_file4', True), + ('level1_file1', True), + ('level1_file2', True), + ('level1_dir1', False), + ('level2_file1', True), + ('level2_file2', True), + ('level2_dir1', False), + ('level3_dir1', False), + ('level3_file1', True), + ('level3_file2', True), + ('level1_dir2', False), + ('level2_file3', True), + ('level2_file4', True), + ('level2_file5', True), + ('level2_dir3', False), + ('level3_file3', True), + ('level3_file4', True), ] assert [(r.name, r.is_file) for r in results] == expected +class TestCodebaseWithPath(FileBasedTesting): + test_data_dir = join(dirname(__file__), 'data') + + def test_Codebase_with_paths_works(self): + test_codebase = self.get_test_loc('resource/with_path/codebase') + paths = ['codebase/other dir/file'] + codebase = Codebase(location=test_codebase, paths=paths) + assert not codebase.errors + results = [r.to_dict() for r in codebase.walk()] + expected_file = self.get_test_loc( + 'resource/with_path/codebase-expected.json', + must_exist=False, + ) + check_against_expected_json_file(results, expected_file, regen=False) + + def test_VirtualCodebase_with_paths_works(self): + test_codebase = self.get_test_loc('resource/with_path/virtual-codebase.json') + paths = ['codebase/other dir/file'] + codebase = VirtualCodebase(location=test_codebase, paths=paths) + assert not codebase.errors + results = [r.to_dict() for r in codebase.walk()] + expected_file = self.get_test_loc( + 'resource/with_path/virtual-codebase-expected.json', + must_exist=False, + ) + check_against_expected_json_file(results, expected_file, regen=False) + + class TestCodebaseCache(FileBasedTesting): test_data_dir = join(dirname(__file__), 'data') def test_codebase_cache_default(self): test_codebase = self.get_test_loc('resource/cache2') codebase = Codebase(test_codebase) + assert codebase.temp_dir assert codebase.cache_dir - codebase.cache_dir + root = codebase.root - cp = codebase._get_resource_cache_location(root.rid, create=False) - assert not exists(cp) - cp = codebase._get_resource_cache_location(root.rid, create=True) + cp = codebase._get_resource_cache_location(root.path, create_dirs=True) assert not exists(cp) assert exists(parent_directory(cp)) - child = codebase._create_resource('child', root, is_file=True) + child = codebase._get_or_create_resource(name='child', parent=root, is_file=True) child.size = 12 codebase.save_resource(child) - child_2 = codebase.get_resource(child.rid) + child_2 = codebase.get_resource(path=child.path) assert child_2 == child def test_codebase_cache_all_in_memory(self): test_codebase = self.get_test_loc('resource/cache2') codebase = 
Codebase(test_codebase, max_in_memory=0) - for rid in codebase.resource_ids: - if rid == 0: - assert codebase.get_resource(rid) == codebase.root - assert codebase._exists_in_memory(rid) - assert not codebase._exists_on_disk(rid) + for path, res in codebase.resources_by_path.items(): + if res is Codebase.CACHED_RESOURCE: + res = codebase.get_resource(path) + if res.is_root: + assert codebase.get_resource(path) == codebase.root == res + assert codebase._exists_in_memory(path) + assert not codebase._exists_on_disk(path) else: - assert codebase._exists_in_memory(rid) - assert not codebase._exists_on_disk(rid) + assert codebase._exists_in_memory(path) + assert not codebase._exists_on_disk(path) - assert len(list(codebase.walk())) == len(codebase.resource_ids) + assert ( + len(list(codebase.walk())) + == len(codebase.resources_by_path) + == codebase.resources_count + ) def test_codebase_cache_all_on_disk(self): test_codebase = self.get_test_loc('resource/cache2') codebase = Codebase(test_codebase, max_in_memory=-1) - for rid in codebase.resource_ids: - if rid == 0: - assert codebase.get_resource(rid) == codebase.root - assert codebase._exists_in_memory(rid) - assert not codebase._exists_on_disk(rid) + for path, res in codebase.resources_by_path.items(): + if res is Codebase.CACHED_RESOURCE: + res = codebase.get_resource(path) + if res.is_root: + assert codebase.get_resource(path) == codebase.root == res + assert codebase._exists_in_memory(path) + assert not codebase._exists_on_disk(path) else: - assert not codebase._exists_in_memory(rid) - assert codebase._exists_on_disk(rid) + assert not codebase._exists_in_memory(path) + assert codebase._exists_on_disk(path) - assert len(list(codebase.walk())) == len(codebase.resource_ids) + assert ( + len(list(codebase.walk())) + == len(codebase.resources_by_path) + == codebase.resources_count + ) def test_codebase_cache_mixed_two_in_memory(self): test_codebase = self.get_test_loc('resource/cache2') codebase = Codebase(test_codebase, max_in_memory=2) - for rid in codebase.resource_ids: - if rid == 0: - assert codebase.get_resource(rid) == codebase.root - assert codebase._exists_in_memory(rid) - assert not codebase._exists_on_disk(rid) - elif rid < 2: - assert codebase._exists_in_memory(rid) - assert not codebase._exists_on_disk(rid) + counter = 0 + for path, res in codebase.resources_by_path.items(): + if res is Codebase.CACHED_RESOURCE: + res = codebase.get_resource(path) + + if res.is_root: + assert codebase.get_resource(path) == codebase.root == res + assert codebase._exists_in_memory(path) + assert not codebase._exists_on_disk(path) + counter += 1 + elif counter < 2: + assert codebase._exists_in_memory(path) + assert not codebase._exists_on_disk(path) + counter += 1 else: - assert not codebase._exists_in_memory(rid) - assert codebase._exists_on_disk(rid) + assert not codebase._exists_in_memory(path) + assert codebase._exists_on_disk(path) - assert len(list(codebase.walk())) == len(codebase.resource_ids) + assert ( + len(list(codebase.walk())) + == len(codebase.resources_by_path) + == codebase.resources_count + ) class TestVirtualCodebase(FileBasedTesting): @@ -734,13 +761,13 @@ def test_virtual_codebase_walk_defaults(self): results = list(codebase.walk()) expected = [ ('codebase', False), - ('abc', True), - ('et131x.h', True), - ('dir', False), - ('that', True), - ('this', True), - ('other dir', False), - ('file', True), + ('abc', True), + ('et131x.h', True), + ('dir', False), + ('that', True), + ('this', True), + ('other dir', False), + ('file', True), ] 
assert [(r.name, r.is_file) for r in results] == expected @@ -750,13 +777,13 @@ def test_virtual_codebase_walk_topdown(self): results = list(codebase.walk(topdown=True)) expected = [ ('codebase', False), - ('abc', True), - ('et131x.h', True), - ('dir', False), - ('that', True), - ('this', True), - ('other dir', False), - ('file', True), + ('abc', True), + ('et131x.h', True), + ('dir', False), + ('that', True), + ('this', True), + ('other dir', False), + ('file', True), ] assert [(r.name, r.is_file) for r in results] == expected @@ -765,13 +792,13 @@ def test_virtual_codebase_walk_bottomup(self): codebase = VirtualCodebase(location=test_file) results = list(codebase.walk(topdown=False)) expected = [ - ('abc', True), - ('et131x.h', True), - ('that', True), - ('this', True), - ('dir', False), - ('file', True), - ('other dir', False), + ('abc', True), + ('et131x.h', True), + ('that', True), + ('this', True), + ('dir', False), + ('file', True), + ('other dir', False), ('codebase', False), ] assert [(r.name, r.is_file) for r in results] == expected @@ -784,10 +811,10 @@ def test_virtual_codebase_walk_skip_root_basic(self): ('abc', True), ('et131x.h', True), ('dir', False), - ('that', True), - ('this', True), + ('that', True), + ('this', True), ('other dir', False), - ('file', True), + ('file', True), ] assert [(r.name, r.is_file) for r in results] == expected @@ -798,6 +825,13 @@ def test_virtual_codebase_get_path_with_strip_root_and_walk_with_skip_root(self) expected = ['README', 'screenshot.png'] assert expected == results + def test_virtual_codebase_to_list_with_strip_root_and_walk_with_skip_root(self): + scan_data = self.get_test_loc('resource/virtual_codebase/stripped-and-skipped-root.json') + virtual_codebase = VirtualCodebase(location=scan_data) + results = virtual_codebase.to_list(strip_root=True, skinny=True) + expected = [{'path': 'README', 'type': 'file'}, {'path': 'screenshot.png', 'type': 'file'}] + assert expected == results + def test_virtual_codebase_walk_filtered_with_filtered_root(self): scan_data = self.get_test_loc('resource/virtual_codebase/virtual_codebase.json') virtual_codebase = VirtualCodebase(location=scan_data) @@ -904,11 +938,11 @@ def test_virtual_codebase_walk_filtered_dirs(self): virtual_codebase.save_resource(res) results = list(virtual_codebase.walk_filtered(topdown=True)) expected = [ - ('abc', True), - ('et131x.h', True), - ('that', True), - ('this', True), - ('file', True), + ('abc', True), + ('et131x.h', True), + ('that', True), + ('this', True), + ('file', True), ] assert [(r.name, r.is_file) for r in results] == expected @@ -944,18 +978,14 @@ def test_virtual_codebase_walk_skip_root_single_file(self): scan_data = self.get_test_loc('resource/virtual_codebase/et131x.h.json') virtual_codebase = VirtualCodebase(location=scan_data) results = list(virtual_codebase.walk(skip_root=True)) - expected = [ - ('et131x.h', True) - ] + expected = [('et131x.h', True)] assert [(r.name, r.is_file) for r in results] == expected def test_virtual_codebase_walk_filtered_with_skip_root_and_single_file_not_filtered(self): scan_data = self.get_test_loc('resource/virtual_codebase/et131x.h.json') virtual_codebase = VirtualCodebase(location=scan_data) results = list(virtual_codebase.walk_filtered(skip_root=True)) - expected = [ - ('et131x.h', True) - ] + expected = [('et131x.h', True)] assert [(r.name, r.is_file) for r in results] == expected def test_virtual_codebase_walk_filtered__with_skip_root_and_filtered_single_file(self): @@ -964,79 +994,104 @@ def 
test_virtual_codebase_walk_filtered__with_skip_root_and_filtered_single_file virtual_codebase.root.is_filtered = True virtual_codebase.save_resource(virtual_codebase.root) results = list(virtual_codebase.walk_filtered(skip_root=True)) - expected = [ - ] + expected = [] assert [(r.name, r.is_file) for r in results] == expected def test_virtual_codebase_walk_skip_root_single_file_with_children(self): scan_data = self.get_test_loc('resource/virtual_codebase/et131x.h.json') virtual_codebase = VirtualCodebase(location=scan_data) - c1 = virtual_codebase._create_resource('some child', parent=virtual_codebase.root, is_file=True) - _c2 = virtual_codebase._create_resource('some child2', parent=c1, is_file=False) + + c1 = virtual_codebase._get_or_create_resource( + 'some child', + parent=virtual_codebase.root, + is_file=True, + ) + _c2 = virtual_codebase._get_or_create_resource( + 'some child2', + parent=c1, + is_file=False, + ) results = list(virtual_codebase.walk(skip_root=True)) - expected = [ - (u'some child', True), (u'some child2', False) - ] + expected = [('et131x.h', True), ('some child', True), ('some child2', False)] assert [(r.name, r.is_file) for r in results] == expected def test_virtual_codebase_walk_filtered_with_skip_root_and_single_file_with_children(self): scan_data = self.get_test_loc('resource/virtual_codebase/et131x.h.json') virtual_codebase = VirtualCodebase(location=scan_data) - c1 = virtual_codebase._create_resource('some child', parent=virtual_codebase.root, is_file=True) - c2 = virtual_codebase._create_resource('some child2', parent=c1, is_file=False) + + c1 = virtual_codebase._get_or_create_resource( + 'some child', + parent=virtual_codebase.root, + is_file=True, + ) + + c2 = virtual_codebase._get_or_create_resource( + 'some child2', + parent=c1, + is_file=False, + ) c2.is_filtered = True - virtual_codebase.save_resource(c2) + c2.save(virtual_codebase) results = list(virtual_codebase.walk_filtered(skip_root=True)) - expected = [(u'some child', True)] + expected = [('et131x.h', True), ('some child', True)] assert [(r.name, r.is_file) for r in results] == expected c1.is_filtered = True - virtual_codebase.save_resource(c1) + c1.save(virtual_codebase) + results = list(virtual_codebase.walk_filtered(skip_root=True)) - expected = [] + expected = [('et131x.h', True)] assert [(r.name, r.is_file) for r in results] == expected def test_virtual_codebase__create_resource_can_add_child_to_file(self): scan_data = self.get_test_loc('resource/virtual_codebase/et131x.h.json') virtual_codebase = VirtualCodebase(location=scan_data) - virtual_codebase._create_resource('some child', virtual_codebase.root, is_file=True) + virtual_codebase._get_or_create_resource( + 'some child', + virtual_codebase.root, + is_file=True, + ) results = list(virtual_codebase.walk()) - expected = [('et131x.h', True), (u'some child', True)] + expected = [('et131x.h', True), ('some child', True)] assert [(r.name, r.is_file) for r in results] == expected def test_virtual_codebase__create_resource_can_add_child_to_dir(self): scan_data = self.get_test_loc('resource/virtual_codebase/resource.json') virtual_codebase = VirtualCodebase(location=scan_data) - virtual_codebase._create_resource('some child', virtual_codebase.root, is_file=False) + virtual_codebase._get_or_create_resource( + 'some child', + virtual_codebase.root, + is_file=False, + ) results = list(virtual_codebase.walk()) - expected = [('resource', False), (u'some child', False)] + expected = [('resource', False), ('some child', False)] assert [(r.name, 
r.is_file) for r in results] == expected def test_virtual_codebase_get_resource(self): scan_data = self.get_test_loc('resource/virtual_codebase/resource.json') virtual_codebase = VirtualCodebase(location=scan_data) - assert not (virtual_codebase.root is virtual_codebase.get_resource(0)) - assert virtual_codebase.get_resource(0) == virtual_codebase.root + assert not (virtual_codebase.root is virtual_codebase.get_resource('resource')) + assert virtual_codebase.get_resource('resource') == virtual_codebase.root def test_virtual_codebase_can_process_minimal_resources_without_info(self): scan_data = self.get_test_loc('resource/virtual_codebase/noinfo.json') codebase = VirtualCodebase(location=scan_data) expected = [ - dict([ - (u'path', u'NOTICE'), - (u'type', u'file'), - (u'copyrights', [ - dict([ - (u'statements', [u'Copyright (c) 2017 nexB Inc. and others.']), - (u'holders', [u'nexB Inc. and others.']), - (u'authors', []), - (u'start_line', 4), - (u'end_line', 4) - ]) - ]), - (u'scan_errors', []) - ]) + { + 'path': 'NOTICE', + 'type': 'file', + 'copyrights': [ + { + 'statements': ['Copyright (c) 2017 nexB Inc. and others.'], + 'holders': ['nexB Inc. and others.'], + 'authors': [], + 'start_line': 4, + 'end_line': 4, + } + ], + 'scan_errors': [], + } ] assert [r.to_dict() for r in codebase.walk()] == expected @@ -1044,16 +1099,8 @@ def test_virtual_codebase_can_process_minimal_resources_with_only_path(self): scan_data = self.get_test_loc('resource/virtual_codebase/only-path.json') codebase = VirtualCodebase(location=scan_data) expected = [ - dict([ - (u'path', u'samples'), - (u'type', u'directory'), - (u'scan_errors', []) - ]), - dict([ - (u'path', u'samples/NOTICE'), - (u'type', u'file'), - (u'scan_errors', []) - ]) + {'path': 'samples', 'type': 'directory', 'scan_errors': []}, + {'path': 'samples/NOTICE', 'type': 'file', 'scan_errors': []}, ] assert [r.to_dict() for r in codebase.walk()] == expected @@ -1063,9 +1110,16 @@ def test_VirtualCodebase_account_fingerprint_attribute(self): resources_fingerprint = [resource.fingerprint for resource in codebase.walk()] assert "e30cf09443e7878dfed3288886e97542" in resources_fingerprint assert None in resources_fingerprint - assert codebase.get_resource(0) == codebase.root + assert codebase.get_resource('apache_to_all_notable_lic_new') == codebase.root assert resources_fingerprint.count(None) == 2 + def test_VirtualCodebase_works_with_mapping_backed_codebase(self): + test_file = self.get_test_loc("resource/virtual_codebase/license-scan.json") + codebase = VirtualCodebase(test_file) + resource = codebase.get_resource('scan-ref/license-notice.txt') + assert resource + assert len(resource.license_expressions) == 1 + class TestCodebaseLowestCommonParent(FileBasedTesting): test_data_dir = join(dirname(__file__), 'data') @@ -1081,25 +1135,15 @@ def test_virtual_codebase_has_default_for_plugin_attributes(self): scan_data = self.get_test_loc('resource/virtual_codebase/only-path.json') VirtualCodebase(location=scan_data) - def test_lowest_common_parent_1(self): + def test_lowest_common_parent_strip(self): test_codebase = self.get_test_loc('resource/lcp/test1') codebase = Codebase(test_codebase) + assert len(list(codebase.walk())) == 75 lcp = codebase.lowest_common_parent() assert lcp.path == 'test1' assert lcp.name == 'test1' - - def test_lowest_common_parent_strip(self): - test_codebase = self.get_test_loc('resource/lcp/test1') - codebase = Codebase(test_codebase, strip_root=True) - lcp = codebase.lowest_common_parent() - assert lcp.path == '' - assert 
lcp.name == 'test1' - - def test_lowest_common_parent_full(self): - test_codebase = self.get_test_loc('resource/lcp/test1') - codebase = Codebase(test_codebase, full_root=True) - lcp = codebase.lowest_common_parent() - assert lcp.name == 'test1' + assert lcp.strip_root_path == '' + assert lcp.full_root_path.endswith('resource/lcp/test1') def test_lowest_common_parent_2(self): test_codebase = self.get_test_loc('resource/lcp/test1/zlib') @@ -1107,6 +1151,8 @@ def test_lowest_common_parent_2(self): lcp = codebase.lowest_common_parent() assert lcp.path == 'zlib' assert lcp.name == 'zlib' + assert lcp.strip_root_path == '' + assert lcp.full_root_path.endswith('resource/lcp/test1/zlib') def test_lowest_common_parent_3(self): test_codebase = self.get_test_loc('resource/lcp/test1/simple') @@ -1114,6 +1160,7 @@ def test_lowest_common_parent_3(self): lcp = codebase.lowest_common_parent() assert lcp.path == 'simple' assert lcp.name == 'simple' + assert lcp.strip_root_path == '' def test_lowest_common_parent_deep(self): test_codebase = self.get_test_loc('resource/lcp/test1/simple/org') @@ -1121,6 +1168,10 @@ def test_lowest_common_parent_deep(self): lcp = codebase.lowest_common_parent() assert lcp.path == 'org/jvnet/glassfish/comms/sipagent' assert lcp.name == 'sipagent' + assert lcp.strip_root_path == 'jvnet/glassfish/comms/sipagent' + assert lcp.full_root_path.endswith( + 'resource/lcp/test1/simple/org/jvnet/glassfish/comms/sipagent' + ) def test_lowest_common_parent_solo_file(self): test_codebase = self.get_test_loc('resource/lcp/test1/screenshot.png') @@ -1128,91 +1179,110 @@ def test_lowest_common_parent_solo_file(self): lcp = codebase.lowest_common_parent() assert lcp.path == 'screenshot.png' assert lcp.name == 'screenshot.png' - - def test_lowest_common_parent_solo_file_strip(self): - test_codebase = self.get_test_loc('resource/lcp/test1/screenshot.png') - codebase = Codebase(test_codebase, strip_root=True) - lcp = codebase.lowest_common_parent() - assert lcp.path == 'screenshot.png' - assert lcp.name == 'screenshot.png' - - def test_lowest_common_parent_solo_file_full(self): - test_codebase = self.get_test_loc('resource/lcp/test1/screenshot.png') - codebase = Codebase(test_codebase, full_root=True) - lcp = codebase.lowest_common_parent() - assert lcp.name == 'screenshot.png' + assert lcp.strip_root_path == '' + assert lcp.full_root_path.endswith('resource/lcp/test1/screenshot.png') class TestVirtualCodebaseCache(FileBasedTesting): test_data_dir = join(dirname(__file__), 'data') def test_virtual_codebase_cache_default(self): - scan_data = self.get_test_loc('resource/virtual_codebase/cache2.json') + scan_data = self.get_test_loc('resource/virtual_codebase/codebase-for-cache-tests.json') virtual_codebase = VirtualCodebase(location=scan_data) assert virtual_codebase.temp_dir assert virtual_codebase.cache_dir virtual_codebase.cache_dir root = virtual_codebase.root - cp = virtual_codebase._get_resource_cache_location(root.rid, create=False) + cp = virtual_codebase._get_resource_cache_location(root.path, create_dirs=False) assert not exists(cp) - cp = virtual_codebase._get_resource_cache_location(root.rid, create=True) + + cp = virtual_codebase._get_resource_cache_location(root.path, create_dirs=True) assert not exists(cp) assert exists(parent_directory(cp)) - child = virtual_codebase._create_resource('child', root, is_file=True) + child = virtual_codebase._get_or_create_resource('child', root, is_file=True) child.size = 12 virtual_codebase.save_resource(child) - child_2 = 
virtual_codebase.get_resource(child.rid) + child_2 = virtual_codebase.get_resource(child.path) assert child_2 == child def test_virtual_codebase_cache_all_in_memory(self): - scan_data = self.get_test_loc('resource/virtual_codebase/cache2.json') - virtual_codebase = VirtualCodebase(location=scan_data, - max_in_memory=0) - for rid in virtual_codebase.resource_ids: - if rid == 0: - assert virtual_codebase.get_resource(rid) == virtual_codebase.root - assert virtual_codebase._exists_in_memory(rid) - assert not virtual_codebase._exists_on_disk(rid) + scan_data = self.get_test_loc('resource/virtual_codebase/codebase-for-cache-tests.json') + virtual_codebase = VirtualCodebase(location=scan_data, max_in_memory=0) + for path, res in virtual_codebase.resources_by_path.items(): + assert res != Codebase.CACHED_RESOURCE + if res.is_root: + assert virtual_codebase.get_resource(path).to_dict( + with_info=True + ) == virtual_codebase.root.to_dict(with_info=True) + assert virtual_codebase._exists_in_memory(path) + assert not virtual_codebase._exists_on_disk(path) else: - assert virtual_codebase._exists_in_memory(rid) - assert not virtual_codebase._exists_on_disk(rid) + assert virtual_codebase._exists_in_memory(path) + assert not virtual_codebase._exists_on_disk(path) - assert len(list(virtual_codebase.walk())) == len(virtual_codebase.resource_ids) + assert ( + len(list(virtual_codebase.walk())) + == len(virtual_codebase.resources_by_path) + == virtual_codebase.resources_count + ) def test_virtual_codebase_cache_all_on_disk(self): - scan_data = self.get_test_loc('resource/virtual_codebase/cache2.json') - virtual_codebase = VirtualCodebase(location=scan_data, - max_in_memory=-1) - for rid in virtual_codebase.resource_ids: - if rid == 0: - assert virtual_codebase.get_resource(rid) == virtual_codebase.root - assert virtual_codebase._exists_in_memory(rid) - assert not virtual_codebase._exists_on_disk(rid) + scan_data = self.get_test_loc('resource/virtual_codebase/codebase-for-cache-tests.json') + virtual_codebase = VirtualCodebase(location=scan_data, max_in_memory=-1) + for path, res in virtual_codebase.resources_by_path.items(): + + if res != Codebase.CACHED_RESOURCE: + assert res.is_root + else: + res = virtual_codebase.get_resource(path) + + if res.is_root: + assert virtual_codebase.get_resource(path) == virtual_codebase.root + assert virtual_codebase._exists_in_memory(path) + assert not virtual_codebase._exists_on_disk(path) else: - assert not virtual_codebase._exists_in_memory(rid) - assert virtual_codebase._exists_on_disk(rid) + assert not virtual_codebase._exists_in_memory(path) + assert virtual_codebase._exists_on_disk(path) - assert len(list(virtual_codebase.walk())) == len(virtual_codebase.resource_ids) + assert ( + len(list(virtual_codebase.walk())) + == len(virtual_codebase.resources_by_path) + == virtual_codebase.resources_count + ) def test_virtual_codebase_cache_mixed_two_in_memory(self): - scan_data = self.get_test_loc('resource/virtual_codebase/cache2.json') - virtual_codebase = VirtualCodebase(location=scan_data, - max_in_memory=2) - for rid in virtual_codebase.resource_ids: - if rid == 0: - assert virtual_codebase.get_resource(rid) == virtual_codebase.root - assert virtual_codebase._exists_in_memory(rid) - assert not virtual_codebase._exists_on_disk(rid) - elif rid < 2: - assert virtual_codebase._exists_in_memory(rid) - assert not virtual_codebase._exists_on_disk(rid) + scan_data = self.get_test_loc('resource/virtual_codebase/codebase-for-cache-tests.json') + virtual_codebase = 
VirtualCodebase(location=scan_data, max_in_memory=2) + counter = 0 + + for path, res in virtual_codebase.resources_by_path.items(): + if res is Codebase.CACHED_RESOURCE: + res = virtual_codebase.get_resource(path) + + if res.is_root: + assert ( + virtual_codebase.get_resource(path).to_dict() == virtual_codebase.root.to_dict() + ) + assert virtual_codebase._exists_in_memory(path) + assert not virtual_codebase._exists_on_disk(path) + counter += 1 + + elif counter < 2: + assert virtual_codebase._exists_in_memory(path) + assert not virtual_codebase._exists_on_disk(path) + counter += 1 + else: - assert not virtual_codebase._exists_in_memory(rid) - assert virtual_codebase._exists_on_disk(rid) + assert not virtual_codebase._exists_in_memory(path) + assert virtual_codebase._exists_on_disk(path) - assert len(list(virtual_codebase.walk())) == len(virtual_codebase.resource_ids) + assert ( + len(list(virtual_codebase.walk())) + == len(virtual_codebase.resources_by_path) + == virtual_codebase.resources_count + ) class TestVirtualCodebaseCreation(FileBasedTesting): @@ -1257,27 +1327,16 @@ def test_VirtualCodebase_can_be_created_from_json_string(self): def test_VirtualCodebase_can_be_created_from_dict(self): test_scan = { - "scancode_notice": "Generated with ScanCode and provided on an ....", - "scancode_version": "2.9.7.post137.2e29fe3.dirty.20181120225811", - "scancode_options": { - "input": "han/", - "--json-pp": "-" - }, - "scan_start": "2018-11-23T123252.191917", - "files_count": 1, - "files": [ - { - "path": "han", - "type": "directory", - "scan_errors": [] - }, - { - "path": "han/bar.svg", - "type": "file", - "scan_errors": [] - } - ] - } + "scancode_notice": "Generated with ScanCode and provided on an ....", + "scancode_version": "2.9.7.post137.2e29fe3.dirty.20181120225811", + "scancode_options": {"input": "han/", "--json-pp": "-"}, + "scan_start": "2018-11-23T123252.191917", + "files_count": 1, + "files": [ + {"path": "han", "type": "directory", "scan_errors": []}, + {"path": "han/bar.svg", "type": "file", "scan_errors": []}, + ], + } codebase = VirtualCodebase(test_scan) results = sorted(r.name for r in codebase.walk()) @@ -1286,7 +1345,9 @@ def test_VirtualCodebase_can_be_created_from_dict(self): def test_VirtualCodebase_create_from_scan_with_no_root_and_missing_parents(self): test_file = self.get_test_loc('resource/virtual_codebase/samples-only-findings.json') - result_file = self.get_test_loc('resource/virtual_codebase/samples-only-findings-expected.json') + result_file = self.get_test_loc( + 'resource/virtual_codebase/samples-only-findings-expected.json' + ) codebase = VirtualCodebase(test_file) expected_scan = json.load(open(result_file)) results = sorted(r.path for r in codebase.walk()) @@ -1298,18 +1359,8 @@ def test_VirtualCodebase_check_that_already_existing_parent_is_updated_properly( codebase = VirtualCodebase(test_file) results = sorted((r.to_dict() for r in codebase.walk()), key=lambda x: tuple(x.items())) expected = [ - dict([ - (u'path', u'samples'), - (u'type', u'directory'), - (u'summary', [u'asd']), - (u'scan_errors', []) - ]), - dict([ - (u'path', u'samples/NOTICE'), - (u'type', u'file'), - (u'summary', []), - (u'scan_errors', []) - ]) + {'path': 'samples', 'type': 'directory', 'summary': ['asd'], 'scan_errors': []}, + {'path': 'samples/NOTICE', 'type': 'file', 'summary': [], 'scan_errors': []}, ] assert results == expected @@ -1318,38 +1369,73 @@ def test_VirtualCodebase_create_from_multiple_scans(self): test_file_2 = 
self.get_test_loc('resource/virtual_codebase/combine-2.json') vinput = (test_file_1, test_file_2) codebase = VirtualCodebase(vinput) - results = sorted((r.to_dict() for r in codebase.walk()), key=lambda x: tuple(x.items())) - expected = [ - dict([(u'path', u'virtual_root'), (u'type', u'directory'), (u'summary', []), (u'scan_errors', [])]), - dict([(u'path', u'virtual_root/samples'), (u'type', u'directory'), (u'summary', []), (u'scan_errors', [])]), - dict([(u'path', u'virtual_root/samples/NOTICE'), (u'type', u'file'), (u'summary', []), (u'scan_errors', [])]), - dict([(u'path', u'virtual_root/thirdparty'), (u'type', u'directory'), (u'summary', []), (u'scan_errors', [])]), - dict([(u'path', u'virtual_root/thirdparty/example.zip'), (u'type', u'file'), (u'summary', []), (u'scan_errors', [])]) - ] - assert results == expected + results = [r.to_dict(with_info=False) for r in codebase.walk()] + expected_file = self.get_test_loc( + 'resource/virtual_codebase/combine-expected.json', + must_exist=False, + ) + check_against_expected_json_file(results, expected_file, regen=False) + + def test_VirtualCodebase_create_from_multiple_scans_shared_directory_names(self): + test_file_1 = self.get_test_loc( + 'resource/virtual_codebase/combine-shared-directory-name-1.json' + ) + test_file_2 = self.get_test_loc( + 'resource/virtual_codebase/combine-shared-directory-name-2.json' + ) + vinput = (test_file_1, test_file_2) + codebase = VirtualCodebase(location=vinput) + + results = [r.to_dict(with_info=False) for r in codebase.walk()] + expected_file = self.get_test_loc( + 'resource/virtual_codebase/combine-shared-directory-name-expected.json', + must_exist=False, + ) + check_against_expected_json_file(results, expected_file, regen=False) + + def test_VirtualCodebase_compute_counts_with_full_root_info_one(self): + test_file = self.get_test_loc('resource/virtual_codebase/full-root-info-one.json') + codebase = VirtualCodebase(test_file) + resource = [r for r in codebase.walk() if r.is_file][0] + assert resource.path == 'home/foobar/scancode-toolkit/samples/README' + files_count, dirs_count, size_count = codebase.compute_counts() + assert files_count == 1 + assert dirs_count == 0 + assert size_count == 236 - def test_VirtualCodebase_scanning_full_root(self): - test_file = self.get_test_loc("resource/virtual_codebase/path_full_root.json") + def test_VirtualCodebase_with_full_root_info_one(self): + test_file = self.get_test_loc('resource/virtual_codebase/full-root-info-one.json') + codebase = VirtualCodebase(test_file) + results = [r.to_dict(with_info=True) for r in codebase.walk()] + expected_file = self.get_test_loc( + 'resource/virtual_codebase/full-root-info-one-expected.json', must_exist=False + ) + check_against_expected_json_file(results, expected_file, regen=False) + + def test_VirtualCodebase_with_full_root_info_many(self): + test_file = self.get_test_loc('resource/virtual_codebase/full-root-info-many.json') codebase = VirtualCodebase(test_file) - resource = sorted(r for r in codebase.walk())[0] - assert "/Users/sesser/code/nexb/scancode-toolkit/samples/README" == resource.path - assert 1 == codebase.compute_counts()[0] + results = [r.to_dict(with_info=True) for r in codebase.walk()] + expected_file = self.get_test_loc( + 'resource/virtual_codebase/full-root-info-many-expected.json', must_exist=False + ) + check_against_expected_json_file(results, expected_file, regen=False) def test_VirtualCodebase_can_compute_counts_with_null(self): # was failing with # size_count += child.size # TypeError: unsupported 
operand type(s) for +=: 'int' and 'NoneType' - test_file = self.get_test_loc("resource/virtual_codebase/node-16-slim.json") + test_file = self.get_test_loc('resource/virtual_codebase/node-16-slim.json') codebase = VirtualCodebase(test_file) codebase.compute_counts() def test_VirtualCodebase_can_be_created_with_single_path(self): - test_file = self.get_test_loc("resource/virtual_codebase/docker-hello-world.json") + test_file = self.get_test_loc('resource/virtual_codebase/docker-hello-world.json') VirtualCodebase(test_file) def test_VirtualCodebase_can_be_created_without_RecursionError(self): # was failing with RecursionError: maximum recursion depth exceeded - test_file = self.get_test_loc("resource/virtual_codebase/zephyr-binary.json") + test_file = self.get_test_loc('resource/virtual_codebase/zephyr-binary.json') VirtualCodebase(test_file) @@ -1364,25 +1450,80 @@ def test_Resource_extracted_to_extracted_from(self): extracted_to = r.extracted_to(codebase) extracted_from = r.extracted_from(codebase) - extracted_to_path = '' - if extracted_to: - extracted_to_path = extracted_to.path - - extracted_from_path = '' - if extracted_from: - extracted_from_path = extracted_from.path + extracted_to_path = extracted_to and extracted_to.path + extracted_from_path = extracted_from and extracted_from.path results.append((r.path, extracted_to_path, extracted_from_path)) expected = [ - ('test', '', ''), - ('test/c', '', ''), - ('test/foo.tar.gz', 'test/foo.tar.gz-extract', ''), - ('test/foo.tar.gz-extract', '', 'test/foo.tar.gz'), - ('test/foo.tar.gz-extract/foo', '', 'test/foo.tar.gz'), - ('test/foo.tar.gz-extract/foo/a', '', 'test/foo.tar.gz'), - ('test/foo.tar.gz-extract/foo/bar.tar.gz', 'test/foo.tar.gz-extract/foo/bar.tar.gz-extract', 'test/foo.tar.gz'), - ('test/foo.tar.gz-extract/foo/bar.tar.gz-extract', '', 'test/foo.tar.gz-extract/foo/bar.tar.gz'), - ('test/foo.tar.gz-extract/foo/bar.tar.gz-extract/bar', '', 'test/foo.tar.gz-extract/foo/bar.tar.gz'), - ('test/foo.tar.gz-extract/foo/bar.tar.gz-extract/bar/b', '', 'test/foo.tar.gz-extract/foo/bar.tar.gz') + ( + 'test', + None, + None, + ), + ( + 'test/c', + None, + None, + ), + ( + 'test/foo.tar.gz', + 'test/foo.tar.gz-extract', + None, + ), + ( + 'test/foo.tar.gz-extract', + None, + 'test/foo.tar.gz', + ), + ( + 'test/foo.tar.gz-extract/foo', + None, + 'test/foo.tar.gz', + ), + ( + 'test/foo.tar.gz-extract/foo/a', + None, + 'test/foo.tar.gz', + ), + ( + 'test/foo.tar.gz-extract/foo/bar.tar.gz', + 'test/foo.tar.gz-extract/foo/bar.tar.gz-extract', + 'test/foo.tar.gz', + ), + ( + 'test/foo.tar.gz-extract/foo/bar.tar.gz-extract', + None, + 'test/foo.tar.gz-extract/foo/bar.tar.gz', + ), + ( + 'test/foo.tar.gz-extract/foo/bar.tar.gz-extract/bar', + None, + 'test/foo.tar.gz-extract/foo/bar.tar.gz', + ), + ( + 'test/foo.tar.gz-extract/foo/bar.tar.gz-extract/bar/b', + None, + 'test/foo.tar.gz-extract/foo/bar.tar.gz', + ), ] assert results == expected + + def test_virtualcode_Resource_can_walk(self): + test_file = self.get_test_loc('resource/resource/test-extracted-from-to.json') + codebase = VirtualCodebase(location=test_file) + results = [r.path for r in codebase.walk(topdown=True)] + + expected = [ + 'test', + 'test/c', + 'test/foo.tar.gz', + 'test/foo.tar.gz-extract', + 'test/foo.tar.gz-extract/foo', + 'test/foo.tar.gz-extract/foo/a', + 'test/foo.tar.gz-extract/foo/bar.tar.gz', + 'test/foo.tar.gz-extract/foo/bar.tar.gz-extract', + 'test/foo.tar.gz-extract/foo/bar.tar.gz-extract/bar', + 'test/foo.tar.gz-extract/foo/bar.tar.gz-extract/bar/b', + ] + + 
assert results == expected diff --git a/thirdparty/README.rst b/thirdparty/README.rst deleted file mode 100644 index b31482f..0000000 --- a/thirdparty/README.rst +++ /dev/null @@ -1,2 +0,0 @@ -Put your Python dependency wheels to be vendored in this directory. -
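Note (not part of the patch): the test changes above move from rid-based lookups, get_path() and the full_root/strip_root constructor arguments to a path-keyed Resource API. The sketch below summarizes that usage pattern as the updated tests exercise it. It is an illustration only: the input locations are hypothetical, and anything beyond what the tests themselves call is an assumption.

    from commoncode.resource import Codebase, VirtualCodebase

    # Resources are addressed by POSIX path whose first segment is the root
    # name; leading "/" and redundant ".." segments are normalized, as
    # test_get_resource_for_multiple_resource_codebase checks.
    codebase = Codebase('/tmp/some-input')  # hypothetical input directory
    readme = codebase.get_resource('some-input/README')

    # walk() yields Resources; strip_root_path and full_root_path are
    # properties derived from the root-anchored Resource.path.
    for resource in codebase.walk(topdown=True, skip_root=True):
        print(resource.path, resource.strip_root_path, resource.full_root_path)

    # max_in_memory selects the caching strategy the cache tests cover:
    # 0 keeps all Resources in memory, -1 caches all but the root on disk.
    cached = Codebase('/tmp/some-input', max_in_memory=-1)

    # A VirtualCodebase accepts a scan file location, a JSON string, or a
    # plain mapping, per test_VirtualCodebase_can_be_created_from_dict.
    # Assumption: a minimal mapping with only a "files" list is enough here;
    # the test itself passes the usual scan headers as well.
    scan = {
        'files': [
            {'path': 'han', 'type': 'directory', 'scan_errors': []},
            {'path': 'han/bar.svg', 'type': 'file', 'scan_errors': []},
        ],
    }
    vc = VirtualCodebase(scan)
    assert vc.get_resource('han/bar.svg').is_file

The path-keyed form means callers no longer track numeric resource ids at all; every lookup, cache probe and save in the tests above goes through the same normalized path string.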