From 2f0fbfbed24281e57b55e5d3090c2307fe313a04 Mon Sep 17 00:00:00 2001 From: mart-r Date: Thu, 16 May 2024 15:55:18 +0100 Subject: [PATCH 01/12] CU-8693n892x: Save environment/dependency snapshot upon model pack creation --- medcat/cat.py | 7 +++ medcat/utils/saving/envsnapshot.py | 21 ++++++++ tests/utils/saving/test_envsnapshot.py | 69 ++++++++++++++++++++++++++ 3 files changed, 97 insertions(+) create mode 100644 medcat/utils/saving/envsnapshot.py create mode 100644 tests/utils/saving/test_envsnapshot.py diff --git a/medcat/cat.py b/medcat/cat.py index 8df7526b7..e327cf550 100644 --- a/medcat/cat.py +++ b/medcat/cat.py @@ -40,6 +40,7 @@ from medcat.utils.decorators import deprecated from medcat.ner.transformers_ner import TransformersNER from medcat.utils.saving.serializer import SPECIALITY_NAMES, ONE2MANY +from medcat.utils.saving.envsnapshot import get_environment_info from medcat.stats.stats import get_stats from medcat.utils.filters import set_project_filters @@ -315,6 +316,12 @@ def create_model_pack(self, save_dir_path: str, model_pack_name: str = DEFAULT_M with open(model_card_path, 'w') as f: json.dump(self.get_model_card(as_dict=True), f, indent=2) + # add a dependency snapshot + env_info = get_environment_info() + env_info_path = os.path.join(save_dir_path, "environment_snapshot.json") + with open(env_info_path, 'w') as f: + json.dump(env_info, f) + # Zip everything shutil.make_archive(os.path.join(_save_dir_path, model_pack_name), 'zip', root_dir=save_dir_path) diff --git a/medcat/utils/saving/envsnapshot.py b/medcat/utils/saving/envsnapshot.py new file mode 100644 index 000000000..bb3852422 --- /dev/null +++ b/medcat/utils/saving/envsnapshot.py @@ -0,0 +1,21 @@ +from typing import List, Tuple, Dict, Any + +import pkg_resources +import platform + + + +def get_installed_packages() -> List[Tuple[str, str]]: + installed_packages = [] + for package in pkg_resources.working_set: + installed_packages.append([package.project_name, package.version]) + return 
installed_packages + + +def get_environment_info() -> Dict[str, Any]: + return { + "dependencies": get_installed_packages(), + "os": platform.platform(), + "cpu_architecture": platform.machine(), + "python_version": platform.python_version() + } diff --git a/tests/utils/saving/test_envsnapshot.py b/tests/utils/saving/test_envsnapshot.py new file mode 100644 index 000000000..7d974641c --- /dev/null +++ b/tests/utils/saving/test_envsnapshot.py @@ -0,0 +1,69 @@ +from typing import Any +import platform +import os +import tempfile +import json + +from medcat.cat import CAT +from medcat.utils.saving import envsnapshot + +import unittest + + +class EnvSnapshotAloneTests(unittest.TestCase): + + def setUp(self) -> None: + self.env_info = envsnapshot.get_environment_info() + + def test_info_is_dict(self): + self.assertIsInstance(self.env_info, dict) + + def test_info_is_not_empty(self): + self.assertTrue(self.env_info) + + def assert_has_target(self, target: str, expected: Any): + self.assertIn(target, self.env_info) + py_ver = self.env_info[target] + self.assertEqual(py_ver, expected) + + def test_has_os(self): + self.assert_has_target("os", platform.platform()) + + def test_has_py_ver(self): + self.assert_has_target("python_version", platform.python_version()) + + def test_has_cpu_arch(self): + self.assert_has_target("cpu_architecture", platform.machine()) + + def test_has_dependencies(self, name: str = "dependencies"): + # NOTE: just making sure it's a anon-empty list + self.assertIn(name, self.env_info) + deps = self.env_info[name] + self.assertTrue(deps) + + +CAT_PATH = os.path.join(os.path.dirname(__file__), "..", "..", "..", "examples") + + +class EnvSnapshotInCATTests(unittest.TestCase): + expected_env = envsnapshot.get_environment_info() + + @classmethod + def setUpClass(cls) -> None: + cls.cat = CAT.load_model_pack(CAT_PATH) + cls._temp_dir = tempfile.TemporaryDirectory() + mpn = cls.cat.create_model_pack(cls._temp_dir.name) + cls.cat_folder = 
os.path.join(cls._temp_dir.name, mpn) + cls.envrion_file_path = os.path.join(cls.cat_folder, "environment_snapshot.json") + + def test_has_environment(self): + self.assertTrue(os.path.exists(self.envrion_file_path)) + + def test_eviron_saved(self): + with open(self.envrion_file_path) as f: + saved_info: dict = json.load(f) + self.assertEqual(saved_info.keys(), self.expected_env.keys()) + for k in saved_info: + with self.subTest(k): + v1, v2 = saved_info[k], self.expected_env[k] + self.assertEqual(v1, v2) From d9dac3fe86ee39c03f440db24419913bc1a16d89 Mon Sep 17 00:00:00 2001 From: mart-r Date: Thu, 16 May 2024 15:57:50 +0100 Subject: [PATCH 02/12] CU-8693n892x: Fix typing for env snapshot module --- medcat/utils/saving/envsnapshot.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/medcat/utils/saving/envsnapshot.py b/medcat/utils/saving/envsnapshot.py index bb3852422..ef996ca12 100644 --- a/medcat/utils/saving/envsnapshot.py +++ b/medcat/utils/saving/envsnapshot.py @@ -1,11 +1,10 @@ -from typing import List, Tuple, Dict, Any +from typing import List, Dict, Any import pkg_resources import platform - -def get_installed_packages() -> List[Tuple[str, str]]: +def get_installed_packages() -> List[List[str]]: installed_packages = [] for package in pkg_resources.working_set: installed_packages.append([package.project_name, package.version]) return installed_packages From c3e7a758726a222292e594d2e93150af5a3c9485 Mon Sep 17 00:00:00 2001 From: mart-r Date: Thu, 16 May 2024 15:58:14 +0100 Subject: [PATCH 03/12] CU-8693n892x: Add test for env file existence in .zip --- tests/utils/saving/test_envsnapshot.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/tests/utils/saving/test_envsnapshot.py b/tests/utils/saving/test_envsnapshot.py index 7d974641c..e43b1c64c 100644 --- a/tests/utils/saving/test_envsnapshot.py +++ b/tests/utils/saving/test_envsnapshot.py @@ -3,6 +3,7 @@ import os import tempfile import json +import zipfile from medcat.cat import CAT
from medcat.utils.saving import envsnapshot @@ -10,6 +11,11 @@ import unittest +def list_zip_contents(zip_file_path): + with zipfile.ZipFile(zip_file_path, 'r') as zip_ref: + return zip_ref.namelist() + + class EnvSnapshotAloneTests(unittest.TestCase): def setUp(self) -> None: @@ -43,6 +49,7 @@ def test_has_dependencies(self, name: str = "dependencies"): CAT_PATH = os.path.join(os.path.dirname(__file__), "..", "..", "..", "examples") +ENV_SNAPSHOT_FILE_NAME = "environment_snapshot.json" class EnvSnapshotInCATTests(unittest.TestCase): @@ -54,7 +61,7 @@ def setUpClass(cls) -> None: cls._temp_dir = tempfile.TemporaryDirectory() mpn = cls.cat.create_model_pack(cls._temp_dir.name) cls.cat_folder = os.path.join(cls._temp_dir.name, mpn) - cls.envrion_file_path = os.path.join(cls.cat_folder, "environment_snapshot.json") + cls.envrion_file_path = os.path.join(cls.cat_folder, ENV_SNAPSHOT_FILE_NAME) def test_has_environment(self): self.assertTrue(os.path.exists(self.envrion_file_path)) @@ -67,3 +74,7 @@ def test_eviron_saved(self): with self.subTest(k): v1, v2 = saved_info[k], self.expected_env[k] self.assertEqual(v1, v2) + + def test_zip_has_env_snapshot(self): + filenames = list_zip_contents(self.cat_folder + ".zip") + self.assertIn(ENV_SNAPSHOT_FILE_NAME, filenames) From f48e08402d92801f63d0e00e789aa63eea5b7c58 Mon Sep 17 00:00:00 2001 From: mart-r Date: Thu, 16 May 2024 15:59:47 +0100 Subject: [PATCH 04/12] CU-8693n892x: Add doc strings --- medcat/utils/saving/envsnapshot.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/medcat/utils/saving/envsnapshot.py b/medcat/utils/saving/envsnapshot.py index ef996ca12..4f0b0dd46 100644 --- a/medcat/utils/saving/envsnapshot.py +++ b/medcat/utils/saving/envsnapshot.py @@ -5,6 +5,11 @@ def get_installed_packages() -> List[List[str]]: + """Get the installed packages and their versions. + + Returns: + List[List[str]]: List of lists. Each item contains of a dependency name and version. 
+ """ installed_packages = [] for package in pkg_resources.working_set: installed_packages.append([package.project_name, package.version]) @@ -12,6 +17,13 @@ def get_installed_packages() -> List[List[str]]: def get_environment_info() -> Dict[str, Any]: + """Get the current environment information. + + This includes dependency versions, the OS, the CPU architecture and the python version. + + Returns: + Dict[str, Any]: _description_ + """ return { "dependencies": get_installed_packages(), "os": platform.platform(), From 2ca3be8b154d7ef4fa14833e6abb0d65e15ab8a7 Mon Sep 17 00:00:00 2001 From: mart-r Date: Thu, 16 May 2024 16:42:45 +0100 Subject: [PATCH 05/12] CU-8693n892x: Centralise env snapshot file name --- medcat/cat.py | 4 ++-- medcat/utils/saving/envsnapshot.py | 3 +++ tests/utils/saving/test_envsnapshot.py | 2 +- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/medcat/cat.py b/medcat/cat.py index e327cf550..9fbd38271 100644 --- a/medcat/cat.py +++ b/medcat/cat.py @@ -40,7 +40,7 @@ from medcat.utils.decorators import deprecated from medcat.ner.transformers_ner import TransformersNER from medcat.utils.saving.serializer import SPECIALITY_NAMES, ONE2MANY -from medcat.utils.saving.envsnapshot import get_environment_info +from medcat.utils.saving.envsnapshot import get_environment_info, ENV_SNAPSHOT_FILE_NAME from medcat.stats.stats import get_stats from medcat.utils.filters import set_project_filters @@ -318,7 +318,7 @@ def create_model_pack(self, save_dir_path: str, model_pack_name: str = DEFAULT_M # add a dependency snapshot env_info = get_environment_info() - env_info_path = os.path.join(save_dir_path, "environment_snapshot.json") + env_info_path = os.path.join(save_dir_path, ENV_SNAPSHOT_FILE_NAME) with open(env_info_path, 'w') as f: json.dump(env_info, f) diff --git a/medcat/utils/saving/envsnapshot.py b/medcat/utils/saving/envsnapshot.py index 4f0b0dd46..2fb5ace6d 100644 --- a/medcat/utils/saving/envsnapshot.py +++ 
b/medcat/utils/saving/envsnapshot.py @@ -4,6 +4,9 @@ import platform +ENV_SNAPSHOT_FILE_NAME = "environment_snapshot.json" + + def get_installed_packages() -> List[List[str]]: """Get the installed packages and their versions. diff --git a/tests/utils/saving/test_envsnapshot.py b/tests/utils/saving/test_envsnapshot.py index e43b1c64c..36fb0dc83 100644 --- a/tests/utils/saving/test_envsnapshot.py +++ b/tests/utils/saving/test_envsnapshot.py @@ -49,7 +49,7 @@ def test_has_dependencies(self, name: str = "dependencies"): CAT_PATH = os.path.join(os.path.dirname(__file__), "..", "..", "..", "examples") -ENV_SNAPSHOT_FILE_NAME = "environment_snapshot.json" +ENV_SNAPSHOT_FILE_NAME = envsnapshot.ENV_SNAPSHOT_FILE_NAME class EnvSnapshotInCATTests(unittest.TestCase): From 46a52b69f8109be916e8d83dc33c33b58c905d42 Mon Sep 17 00:00:00 2001 From: mart-r Date: Thu, 16 May 2024 16:44:43 +0100 Subject: [PATCH 06/12] CU-8693n892x: Add env snapshot file to exceptions in serialisation tests --- tests/utils/saving/test_serialization.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/utils/saving/test_serialization.py b/tests/utils/saving/test_serialization.py index c2c44da16..6f636e3f0 100644 --- a/tests/utils/saving/test_serialization.py +++ b/tests/utils/saving/test_serialization.py @@ -10,6 +10,7 @@ from medcat.vocab import Vocab from medcat.utils.saving.serializer import JsonSetSerializer, CDBSerializer, SPECIALITY_NAMES, ONE2MANY +from medcat.utils.saving.envsnapshot import ENV_SNAPSHOT_FILE_NAME import medcat.utils.saving.coding as _ @@ -60,6 +61,7 @@ class ModelCreationTests(unittest.TestCase): json_model_pack = tempfile.TemporaryDirectory() EXAMPLES = os.path.join(os.path.dirname( os.path.realpath(__file__)), "..", "..", "..", "examples") + EXCEPTIONAL_JSONS = ['model_card.json', ENV_SNAPSHOT_FILE_NAME] @classmethod def setUpClass(cls) -> None: @@ -95,7 +97,7 @@ def test_dill_to_json(self): SPECIALITY_NAMES) - len(ONE2MANY)) for json in jsons: with 
self.subTest(f'JSON {json}'): - if json.endswith('model_card.json'): + if any(json.endswith(exception) for exception in self.EXCEPTIONAL_JSONS): continue # ignore model card here if any(name in json for name in ONE2MANY): # ignore cui2many and name2many From 401fe8c64582797ea2052867d913e9570bf53747 Mon Sep 17 00:00:00 2001 From: mart-r Date: Wed, 29 May 2024 16:21:45 +0100 Subject: [PATCH 07/12] CU-8693n892x: Only list direct dependencies --- medcat/utils/saving/envsnapshot.py | 39 +++++++++++++++++++++++++- tests/utils/saving/test_envsnapshot.py | 20 +++++++++++++ 2 files changed, 58 insertions(+), 1 deletion(-) diff --git a/medcat/utils/saving/envsnapshot.py b/medcat/utils/saving/envsnapshot.py index 2fb5ace6d..2a2a76eae 100644 --- a/medcat/utils/saving/envsnapshot.py +++ b/medcat/utils/saving/envsnapshot.py @@ -1,10 +1,44 @@ -from typing import List, Dict, Any +from typing import List, Dict, Any, Set +import os +import re import pkg_resources import platform ENV_SNAPSHOT_FILE_NAME = "environment_snapshot.json" +SETUP_PY_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "..", "setup.py")) +SETUP_PY_REGEX = re.compile("install_requires=\[([\s\S]*?)\]") + + +def get_direct_dependencies() -> Set[str]: + """Get the set of direct dependeny names. + + The current implementation reads setup.py for the install_requires + keyword argument, evaluates the list, removes the versions and returns + the names as a set. + + Raises: + FileNotFoundError: If the setup.py file was not found. + ValueError: If found different sets of instal lrequirements. + + Returns: + Set[str]: The set of direct dependeny names. 
+ """ + if not os.path.exists(SETUP_PY_PATH): + raise FileNotFoundError(f"{SETUP_PY_PATH} does not exist.") + with open(SETUP_PY_PATH) as f: + setup_py_code = f.read() + found = SETUP_PY_REGEX.findall(setup_py_code) + if not found: + raise ValueError("Did not find install requirements in setup.py") + if len(found) > 1: + raise ValueError("Ambiguous install requirements in setup.py") + deps_str = found[0] + # evaluate list of dependencies (including potential version pins) + deps: List[str] = eval("[" + deps_str + "]") + # remove versions where applicable + return set(re.split("[<=>~]", dep)[0] for dep in deps) def get_installed_packages() -> List[List[str]]: @@ -13,8 +47,11 @@ def get_installed_packages() -> List[List[str]]: Returns: List[List[str]]: List of lists. Each item contains of a dependency name and version. """ + direct_deps = get_direct_dependencies() installed_packages = [] for package in pkg_resources.working_set: + if package.project_name not in direct_deps: + continue installed_packages.append([package.project_name, package.version]) return installed_packages diff --git a/tests/utils/saving/test_envsnapshot.py b/tests/utils/saving/test_envsnapshot.py index 36fb0dc83..937a4dfe2 100644 --- a/tests/utils/saving/test_envsnapshot.py +++ b/tests/utils/saving/test_envsnapshot.py @@ -16,6 +16,26 @@ def list_zip_contents(zip_file_path): return zip_ref.namelist() +class DirectDependenciesTests(unittest.TestCase): + + def setUp(self) -> None: + self.direct_deps = envsnapshot.get_direct_dependencies() + + def test_nonempty(self): + self.assertTrue(self.direct_deps) + + def test_does_not_contain_versions(self, version_starters: str = '<=>~'): + for dep in self.direct_deps: + for vs in version_starters: + with self.subTest(f"DEP '{dep}' check for '{vs}'"): + self.assertNotIn(vs, dep) + + def test_deps_are_installed_packages(self): + for dep in self.direct_deps: + with self.subTest(f"Has '{dep}'"): + envsnapshot.pkg_resources.require(dep) + + class 
EnvSnapshotAloneTests(unittest.TestCase): def setUp(self) -> None: From 7f8a1643e1227f3d54fbe3ea1468773661ddfb40 Mon Sep 17 00:00:00 2001 From: mart-r Date: Wed, 29 May 2024 16:30:02 +0100 Subject: [PATCH 08/12] CU-8693n892x: Add test that verifies all direct dependencies are listed in environment --- tests/utils/saving/test_envsnapshot.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/utils/saving/test_envsnapshot.py b/tests/utils/saving/test_envsnapshot.py index 937a4dfe2..16bee1ffb 100644 --- a/tests/utils/saving/test_envsnapshot.py +++ b/tests/utils/saving/test_envsnapshot.py @@ -67,6 +67,11 @@ def test_has_dependencies(self, name: str = "dependencies"): deps = self.env_info[name] self.assertTrue(deps) + def test_all_direct_dependencies_are_installed(self): + deps = self.env_info['dependencies'] + direct_deps = envsnapshot.get_direct_dependencies() + self.assertEqual(len(deps), len(direct_deps)) + CAT_PATH = os.path.join(os.path.dirname(__file__), "..", "..", "..", "examples") ENV_SNAPSHOT_FILE_NAME = envsnapshot.ENV_SNAPSHOT_FILE_NAME From 0f13eeec72652e425531fbf10e98d86b83162007 Mon Sep 17 00:00:00 2001 From: mart-r Date: Mon, 10 Jun 2024 16:13:46 +0100 Subject: [PATCH 09/12] CU-8693n892x: Move requirements to separate file and use that for environment snapshot --- install_requires.txt | 24 +++++++++++++++++++++ medcat/utils/saving/envsnapshot.py | 26 ++++++----------------- setup.py | 34 +++++++----------------------- 3 files changed, 39 insertions(+), 45 deletions(-) create mode 100644 install_requires.txt diff --git a/install_requires.txt b/install_requires.txt new file mode 100644 index 000000000..da26267aa --- /dev/null +++ b/install_requires.txt @@ -0,0 +1,24 @@ +'numpy>=1.22.0,<1.26.0' # 1.22.0 is first to support python 3.11; post 1.26.0 there's issues with scipy +'pandas>=1.4.2' # first to support 3.11 +'gensim>=4.3.0,<5.0.0' # 5.3.0 is first to support 3.11; avoid major version bump +'spacy>=3.6.0,<4.0.0' # Some later model packs (e.g 
HPO) are made with 3.6.0 spacy model; avoid major version bump +'scipy~=1.9.2' # 1.9.2 is first to support 3.11 +'transformers>=4.34.0,<5.0.0' # avoid major version bump +'accelerate>=0.23.0' # required by Trainer class in de-id +'torch>=1.13.0,<3.0.0' # 1.13 is first to support 3.11; 2.1.2 has been compatible, but avoid major 3.0.0 for now +'tqdm>=4.27' +'scikit-learn>=1.1.3,<2.0.0' # 1.1.3 is first to support 3.11; avoid major version bump +'dill>=0.3.6,<1.0.0' # stuff saved in 0.3.6/0.3.7 is not always compatible with 0.3.4/0.3.5; avoid major bump +'datasets>=2.2.2,<3.0.0' # avoid major bump +'jsonpickle>=2.0.0' # allow later versions, tested with 3.0.0 +'psutil>=5.8.0' +# 0.70.12 uses older version of dill (i.e less than 0.3.5) which is required for datasets +'multiprocess~=0.70.12' # 0.70.14 seemed to work just fine +'aiofiles>=0.8.0' # allow later versions, tested with 22.1.0 +'ipywidgets>=7.6.5' # allow later versions, tested with 0.8.0 +'xxhash>=3.0.0' # allow later versions, tested with 3.1.0 +'blis>=0.7.5' # allow later versions, tested with 0.7.9 +'click>=8.0.4' # allow later versions, tested with 8.1.3 +'pydantic>=1.10.0,<2.0' # for spacy compatibility; avoid 2.0 due to breaking changes +"humanfriendly~=10.0" # for human readable file / RAM sizes +"peft>=0.8.2" \ No newline at end of file diff --git a/medcat/utils/saving/envsnapshot.py b/medcat/utils/saving/envsnapshot.py index 2a2a76eae..d5642f987 100644 --- a/medcat/utils/saving/envsnapshot.py +++ b/medcat/utils/saving/envsnapshot.py @@ -14,30 +14,18 @@ def get_direct_dependencies() -> Set[str]: """Get the set of direct dependeny names. - The current implementation reads setup.py for the install_requires - keyword argument, evaluates the list, removes the versions and returns + The current implementation reads install_requires.txt for dependencies, + removes comments, whitespace, quotes; removes the versions and returns the names as a set.
- Raises: - FileNotFoundError: If the setup.py file was not found. - ValueError: If found different sets of instal lrequirements. - Returns: Set[str]: The set of direct dependeny names. """ - if not os.path.exists(SETUP_PY_PATH): - raise FileNotFoundError(f"{SETUP_PY_PATH} does not exist.") - with open(SETUP_PY_PATH) as f: - setup_py_code = f.read() - found = SETUP_PY_REGEX.findall(setup_py_code) - if not found: - raise ValueError("Did not find install requirements in setup.py") - if len(found) > 1: - raise ValueError("Ambiguous install requirements in setup.py") - deps_str = found[0] - # evaluate list of dependencies (including potential version pins) - deps: List[str] = eval("[" + deps_str + "]") - # remove versions where applicable + with open("install_requires.txt") as f: + # read every line, strip quotes and comments + dep_lines = [line.split("#")[0].replace("'", "").replace('"', "").strip() for line in f.readlines()] + # remove comment-only (or empty) lines + deps = [dep for dep in dep_lines if dep] return set(re.split("[<=>~]", dep)[0] for dep in deps) diff --git a/setup.py b/setup.py index 966de3406..2fd5fd773 100644 --- a/setup.py +++ b/setup.py @@ -4,6 +4,13 @@ long_description = fh.read() +with open("install_requires.txt") as f: + # read every line, strip quotes and comments + dep_lines = [l.split("#")[0].replace("'", "").replace('"', "").strip() for l in f.readlines()] + # remove comment-only (or empty) lines + install_requires = [dep for dep in dep_lines if dep] + + setuptools.setup( name="medcat", setup_requires=["setuptools_scm"], @@ -17,32 +24,7 @@ packages=['medcat', 'medcat.utils', 'medcat.preprocessing', 'medcat.ner', 'medcat.linking', 'medcat.datasets', 'medcat.tokenizers', 'medcat.utils.meta_cat', 'medcat.pipeline', 'medcat.utils.ner', 'medcat.utils.relation_extraction', 'medcat.utils.saving', 'medcat.utils.regression', 'medcat.stats'], - install_requires=[ - 'numpy>=1.22.0,<1.26.0', # 1.22.0 is first to support python 3.11; post 1.26.0 there's 
issues with scipy - 'pandas>=1.4.2', # first to support 3.11 - 'gensim>=4.3.0,<5.0.0', # 5.3.0 is first to support 3.11; avoid major version bump - 'spacy>=3.6.0,<4.0.0', # Some later model packs (e.g HPO) are made with 3.6.0 spacy model; avoid major version bump - 'scipy~=1.9.2', # 1.9.2 is first to support 3.11 - 'transformers>=4.34.0,<5.0.0', # avoid major version bump - 'accelerate>=0.23.0', # required by Trainer class in de-id - 'torch>=1.13.0,<3.0.0', # 1.13 is first to support 3.11; 2.1.2 has been compatible, but avoid major 3.0.0 for now - 'tqdm>=4.27', - 'scikit-learn>=1.1.3,<2.0.0', # 1.1.3 is first to supporrt 3.11; avoid major version bump - 'dill>=0.3.6,<1.0.0', # stuff saved in 0.3.6/0.3.7 is not always compatible with 0.3.4/0.3.5; avoid major bump - 'datasets>=2.2.2,<3.0.0', # avoid major bump - 'jsonpickle>=2.0.0', # allow later versions, tested with 3.0.0 - 'psutil>=5.8.0', - # 0.70.12 uses older version of dill (i.e less than 0.3.5) which is required for datasets - 'multiprocess~=0.70.12', # 0.70.14 seemed to work just fine - 'aiofiles>=0.8.0', # allow later versions, tested with 22.1.0 - 'ipywidgets>=7.6.5', # allow later versions, tested with 0.8.0 - 'xxhash>=3.0.0', # allow later versions, tested with 3.1.0 - 'blis>=0.7.5', # allow later versions, tested with 0.7.9 - 'click>=8.0.4', # allow later versions, tested with 8.1.3 - 'pydantic>=1.10.0,<2.0', # for spacy compatibility; avoid 2.0 due to breaking changes - "humanfriendly~=10.0", # for human readable file / RAM sizes - "peft>=0.8.2", # allow later versions, tested with 0.10.0 - ], + install_requires=install_requires, classifiers=[ "Programming Language :: Python :: 3", "Programming Language :: Python :: 3.8", From f769e8e8cc7ccbcba9a6f5907cd1e9e5bbe21977 Mon Sep 17 00:00:00 2001 From: mart-r Date: Mon, 17 Jun 2024 13:03:02 +0100 Subject: [PATCH 10/12] CU-8693n892x: Remove unused constants --- medcat/utils/saving/envsnapshot.py | 3 --- 1 file changed, 3 deletions(-) diff --git 
a/medcat/utils/saving/envsnapshot.py b/medcat/utils/saving/envsnapshot.py index d5642f987..f7f7127ba 100644 --- a/medcat/utils/saving/envsnapshot.py +++ b/medcat/utils/saving/envsnapshot.py @@ -1,14 +1,11 @@ from typing import List, Dict, Any, Set -import os import re import pkg_resources import platform ENV_SNAPSHOT_FILE_NAME = "environment_snapshot.json" -SETUP_PY_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "..", "setup.py")) -SETUP_PY_REGEX = re.compile("install_requires=\[([\s\S]*?)\]") def get_direct_dependencies() -> Set[str]: From 56ad4e94af7d27d9861987e0526a520c35915687 Mon Sep 17 00:00:00 2001 From: mart-r Date: Mon, 17 Jun 2024 13:12:56 +0100 Subject: [PATCH 11/12] CU-8693n892x: Allow URL based dependencies when using direct dependencies --- medcat/utils/saving/envsnapshot.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/medcat/utils/saving/envsnapshot.py b/medcat/utils/saving/envsnapshot.py index f7f7127ba..9c0ccbb96 100644 --- a/medcat/utils/saving/envsnapshot.py +++ b/medcat/utils/saving/envsnapshot.py @@ -23,7 +23,7 @@ def get_direct_dependencies() -> Set[str]: dep_lines = [line.split("#")[0].replace("'", "").replace('"', "").strip() for line in f.readlines()] # remove comment-only (or empty) lines deps = [dep for dep in dep_lines if dep] - return set(re.split("[<=>~]", dep)[0] for dep in deps) + return set(re.split("[@<=>~]", dep)[0].strip() for dep in deps) def get_installed_packages() -> List[List[str]]: From 131538c2b5ae1444fcf77037fc5a550dd2daf0b4 Mon Sep 17 00:00:00 2001 From: mart-r Date: Mon, 17 Jun 2024 16:01:48 +0100 Subject: [PATCH 12/12] CU-8693n892x: Distribute install_requires.txt alongside the package; use correct path in distributed version --- medcat/utils/saving/envsnapshot.py | 18 +++++++++++++++++- setup.py | 5 +++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/medcat/utils/saving/envsnapshot.py b/medcat/utils/saving/envsnapshot.py index 9c0ccbb96..262c48410 100644 
--- a/medcat/utils/saving/envsnapshot.py +++ b/medcat/utils/saving/envsnapshot.py @@ -1,5 +1,6 @@ from typing import List, Dict, Any, Set +import os import re import pkg_resources import platform @@ -7,6 +8,17 @@ ENV_SNAPSHOT_FILE_NAME = "environment_snapshot.json" +INSTALL_REQUIRES_FILE_PATH = os.path.join(os.path.dirname(__file__), + "..", "..", "..", + "install_requires.txt") +# NOTE: The install_requires.txt file is copied into the wheel during build +# so that it can be included in the distributed package. +# However, that means it's 1 folder closer to this file since it'll now +# be in the root of the package rather than the root of the project. +INSTALL_REQUIRES_FILE_PATH_PIP = os.path.join(os.path.dirname(__file__), + "..", "..", + "install_requires.txt") + def get_direct_dependencies() -> Set[str]: """Get the set of direct dependeny names. @@ -18,7 +30,11 @@ def get_direct_dependencies() -> Set[str]: Returns: Set[str]: The set of direct dependeny names. """ - with open("install_requires.txt") as f: + req_file = INSTALL_REQUIRES_FILE_PATH + if not os.path.exists(req_file): + # When pip-installed. 
See note above near constant definition + req_file = INSTALL_REQUIRES_FILE_PATH_PIP + with open(req_file) as f: # read every line, strip quotes and comments dep_lines = [line.split("#")[0].replace("'", "").replace('"', "").strip() for line in f.readlines()] # remove comment-only (or empty) lines diff --git a/setup.py b/setup.py index 2fd5fd773..549e7c091 100644 --- a/setup.py +++ b/setup.py @@ -1,8 +1,11 @@ import setuptools +import shutil with open("./README.md", "r") as fh: long_description = fh.read() +# make a copy of install requirements so that it gets distributed with the wheel +shutil.copy('install_requires.txt', 'medcat/install_requires.txt') with open("install_requires.txt") as f: # read every line, strip quotes and comments @@ -25,6 +28,8 @@ 'medcat.tokenizers', 'medcat.utils.meta_cat', 'medcat.pipeline', 'medcat.utils.ner', 'medcat.utils.relation_extraction', 'medcat.utils.saving', 'medcat.utils.regression', 'medcat.stats'], install_requires=install_requires, + include_package_data=True, + package_data={"medcat": ["install_requires.txt"]}, classifiers=[ "Programming Language :: Python :: 3", "Programming Language :: Python :: 3.8",