diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index d334b1c4..1dc6e823 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -27,12 +27,12 @@ jobs:
         conda info -a
         conda create -q -n test-environment python=${{ matrix.python-version }} numpy scipy numba pandas matplotlib
         source activate test-environment
-    - name: Run setup.py
+    - name: Install Hatch
+      uses: pypa/hatch@install
+    - name: Build and install package
       run: |
-        pip install build
-        python setup.py sdist --formats=zip -k
-        python -m build
-        find ./dist -iname "*.zip" -print0 | xargs -0 pip install
+        hatch build
+        find ./dist -iname "*.tar.gz" -print0 | xargs -0 pip install
         pip install codecov
     - name: Download test files
       run: |
diff --git a/.github/workflows/build_test_and_push.yml b/.github/workflows/build_test_and_push.yml
index fa78180e..df218e74 100644
--- a/.github/workflows/build_test_and_push.yml
+++ b/.github/workflows/build_test_and_push.yml
@@ -28,10 +28,12 @@ jobs:
         conda info -a
         conda create -q -n test-environment python=${{ matrix.python-version }} numpy scipy numba pandas matplotlib
         source activate test-environment
-    - name: Run setup.py
+    - name: Install Hatch
+      uses: pypa/hatch@install
+    - name: Build and install package
       run: |
-        python setup.py sdist --formats=zip -k
-        find ./dist -iname "*.zip" -print0 | xargs -0 pip install
+        hatch build
+        find ./dist -iname "*.tar.gz" -print0 | xargs -0 pip install
         pip install codecov
     - name: Download test files
       run: |
@@ -42,14 +44,8 @@ jobs:
       with:
         run: coverage run -m unittest discover -s test -p "Test*.py"
         working-directory: ./ #optional
-    - name: Publish evcouplings to test PyPI
-      if: startsWith(github.ref, 'refs/tags')
-      uses: pypa/gh-action-pypi-publish@master
-      with:
-        password: ${{ secrets.PYPI_ACCESS_TOKEN_TEST }}
-        repository_url: https://test.pypi.org/legacy/
     - name: Publish evcouplings to PyPI
-      if: startsWith(github.ref, 'refs/tags')
-      uses: pypa/gh-action-pypi-publish@master
+      uses: pypa/gh-action-pypi-publish@v1.9.0
       with:
+        user: __token__
         password: ${{ secrets.PYPI_ACCESS_TOKEN }}
diff --git a/.gitignore b/.gitignore
index 283bd20a..ca839eb1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,3 +5,4 @@ __pycache__
 *.ipynb_checkpoints*
 notebooks_dev/*
 evcouplings.egg-info/*
+/dist/
diff --git a/MANIFEST.in b/MANIFEST.in
deleted file mode 100644
index 42a20880..00000000
--- a/MANIFEST.in
+++ /dev/null
@@ -1,3 +0,0 @@
-include README.md
-include evcouplings/fold/cns_templates/*.*
-include evcouplings/couplings/scoring_models/*.*
diff --git a/evcouplings/compare/pdb.py b/evcouplings/compare/pdb.py
index 850a4be3..c13a74a9 100644
--- a/evcouplings/compare/pdb.py
+++ b/evcouplings/compare/pdb.py
@@ -470,7 +470,9 @@ def __init__(self, filehandle, keep_full_data=False):
             "_atom_site.pdbx_formal_charge": "charge",
         }
 
-        HELIX_TARGET_COLS = {
+        # full list of conf types: https://mmcif.wwpdb.org/dictionaries/mmcif_ma.dic/Items/_struct_conf_type.id.html;
+        # mapping between file types: https://manpages.debian.org/unstable/dssp/mkdssp.1.en.html
+        CONF_TARGET_COLS = {
             "_struct_conf.conf_type_id": "conformation_type",
             "_struct_conf.id": "id",
             # label_asym_id and label_seq_id are sufficient for merging to atom table;
@@ -508,11 +510,15 @@ def __init__(self, filehandle, keep_full_data=False):
         # decode information into dataframe with BioPython helper method; note this section may not be
         # present if no helices exist in the structure
         try:
-            self.helix_table = pd.DataFrame({
-                name: _decode(data[source_column]) for source_column, name in HELIX_TARGET_COLS.items()
-            })
+            self.conf_table = pd.DataFrame({
+                name: _decode(data[source_column]) for source_column, name in CONF_TARGET_COLS.items()
+            }).query(
+                # there are a handful of PDB entries that have (probably wrong) secondary structure assignments
+                # extending over more than one segment (e.g. 2bp7, 2wjv), drop these rather than raising an error
+                "beg_label_asym_id == end_label_asym_id"
+            )
         except KeyError:
-            self.helix_table = None
+            self.conf_table = None
 
         # decode information into dataframe with BioPython helper method; note this section may not be
         # present if no sheets exist in the structure
@@ -526,16 +532,23 @@ def __init__(self, filehandle, keep_full_data=False):
         # create secondary structure table for merging to chain tables
         # (will only contain helix/H and strand/E, coil/C will need to be filled in)
         sse_raw = []
-        for sse_type, sse_table in [
-            ("H", self.helix_table),
-            ("E", self.sheet_table)
+        for sse_type, sse_table, sse_filter in [
+            ("H", self.conf_table, "HELX"),
+            ("E", self.sheet_table, None),
+            # also retrieve beta strands/bridges from conf_table if available
+            ("E", self.conf_table, "STRN"),
         ]:
             # skip if secondary structure element not present in PDB file at all
             if sse_table is None:
                 continue
 
+            # filter table down to relevant entries for current secondary structure type
+            if sse_filter is not None:
+                sse_table = sse_table.query(
+                    f"conformation_type.str.startswith('{sse_filter}')"
+                )
+
             for _, row in sse_table.iterrows():
-                assert row.beg_label_asym_id == row.end_label_asym_id
                 for seq_id in range(row.beg_label_seq_id, row.end_label_seq_id + 1):
                     sse_raw.append({
                         "label_asym_id": row.beg_label_asym_id,
@@ -694,7 +707,7 @@ def get_chain(self, chain, model=0, is_author_id=True):
             # create coordinate ID from author residue ID + insertion code
             # (this should be unique and circumvents issues from 0 seqres values if selecting based on author chain ID)
             coord_id=lambda df: df.auth_seq_id.astype(str) + df.insertion_code,
-            seqres_id=lambda df: df.label_seq_id.astype(str).replace("0", np.nan),
+            seqres_id=lambda df: df.label_seq_id.astype(str).replace("0", pd.NA).replace("", pd.NA),
             one_letter_code=lambda df: df.label_comp_id.map(AA3_to_AA1, na_action="ignore"),
             # note that MSE will now be labeled as HETATM, which was not the case with MMTF
             hetatm=lambda df: df.record_type == "HETATM",
@@ -720,12 +733,13 @@ def get_chain(self, chain, model=0, is_author_id=True):
                 how="left"
             )
         else:
+            # initialize to pd.NA instead of np.nan or warning about assigning str to float64 column appears
             res_sse = res.assign(
-                sec_struct_3state=np.nan
+                sec_struct_3state=pd.NA
             )
 
         res_sse.loc[
-            res_sse.sec_struct_3state.isnull() & (res_sse.label_seq_id > 0),
+            res_sse.sec_struct_3state.isnull() & res_sse.seqres_id.notnull(),
             "sec_struct_3state"
         ] = "C"
 
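Note (illustrative, not part of the patch): the _struct_conf handling above comes down to two filters, first dropping assignments that span more than one segment, then mapping HELX* entries to helix (H) and STRN* entries to strand (E), with everything else later filled in as coil. A minimal pandas sketch of that logic on a toy table; plain boolean masks stand in for the .query(...) calls used in the patch, and the dict keys beyond label_asym_id are assumed from context.

import pandas as pd

# toy stand-in for the decoded _struct_conf records (column names mirror CONF_TARGET_COLS)
conf_table = pd.DataFrame({
    "conformation_type": ["HELX_RH_AL_P", "STRN", "TURN_TY1_P", "HELX_LH_PP_P"],
    "beg_label_asym_id": ["A", "A", "A", "A"],
    "end_label_asym_id": ["A", "A", "A", "B"],   # last entry spans two segments and is dropped
    "beg_label_seq_id": [5, 20, 30, 40],
    "end_label_seq_id": [7, 22, 31, 44],
})

# guard against (probably wrong) assignments extending over more than one segment
conf_table = conf_table[conf_table.beg_label_asym_id == conf_table.end_label_asym_id]

sse_raw = []
for sse_type, prefix in [("H", "HELX"), ("E", "STRN")]:   # TURN_* entries fall through to coil later
    subset = conf_table[conf_table.conformation_type.str.startswith(prefix)]
    for _, row in subset.iterrows():
        for seq_id in range(row.beg_label_seq_id, row.end_label_seq_id + 1):
            sse_raw.append({
                "label_asym_id": row.beg_label_asym_id,
                "label_seq_id": seq_id,
                "sec_struct_3state": sse_type,
            })

print(pd.DataFrame(sse_raw))   # residues 5-7 mapped to H, 20-22 to E; the cross-segment helix is gone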
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 00000000..63bda4e4
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,64 @@
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[project]
+name = "evcouplings"
+version = "0.2.1"
+description = "A Framework for evolutionary couplings analysis"
+readme = "README.md"
+license = "MIT"
+authors = [
+  { name = "Thomas Hopf", email = "thomas.hopf@gmail.com" },
+]
+keywords = [
+  "analysis",
+  "couplings",
+  "evolutionary",
+]
+classifiers = [
+  "Development Status :: 4 - Beta",
+  "Intended Audience :: Developers",
+  "License :: OSI Approved :: MIT License",
+  "Programming Language :: Python :: 3",
+  "Programming Language :: Python :: 3.10",
+  "Programming Language :: Python :: 3.11",
+  "Topic :: Scientific/Engineering :: Bio-Informatics",
+]
+dependencies = [
+  "billiard",
+  "biopython>=1.84",
+  "bokeh",
+  "click",
+  "filelock",
+  "jinja2",
+  "matplotlib",
+  "msgpack",
+  "numba",
+  "numpy",
+  "pandas",
+  "psutil",
+  "requests",
+  "ruamel.yaml<0.18",
+  "scikit-learn",
+  "scipy",
+  "seaborn",
+  "setuptools>=18.2",
+]
+
+[project.scripts]
+evcouplings = "evcouplings.utils.app:app"
+evcouplings_dbupdate = "evcouplings.utils.update_database:app"
+evcouplings_runcfg = "evcouplings.utils.pipeline:app"
+evcouplings_summarize = "evcouplings.utils.summarize:app"
+
+[project.urls]
+Homepage = "https://github.com/debbiemarkslab/EVcouplings"
+
+[tool.hatch.version]
+path = "evcouplings/__init__.py"
+
+[tool.hatch.build.targets.sdist]
+include = [
+  "/evcouplings",
+]
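Note (illustrative, not part of the patch): once the sdist or wheel built by hatch is installed, the metadata and the four console scripts declared in the [project] table can be checked from the standard library alone (Python 3.10+, matching the classifiers above).

from importlib.metadata import version, entry_points

print(version("evcouplings"))   # should report the version declared in pyproject.toml

# the four [project.scripts] entries are exposed as console_scripts entry points
for ep in entry_points(group="console_scripts"):
    if ep.name.startswith("evcouplings"):
        print(ep.name, "->", ep.value)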
diff --git a/requirements.txt b/requirements.txt
deleted file mode 100644
index e7397c17..00000000
--- a/requirements.txt
+++ /dev/null
@@ -1,17 +0,0 @@
-numpy
-scipy
-pandas
-numba
-ruamel.yaml
-matplotlib
-requests
-click
-filelock
-psutil
-bokeh
-jinja2
-biopython
-seaborn
-billiard
-scikit-learn
-msgpack
\ No newline at end of file
diff --git a/setup.py b/setup.py
deleted file mode 100644
index 5a663219..00000000
--- a/setup.py
+++ /dev/null
@@ -1,105 +0,0 @@
-from setuptools import setup, find_packages # Always prefer setuptools over distutils
-from codecs import open # To use a consistent encoding
-from os import path
-
-here = path.abspath(path.dirname(__file__))
-
-# Get the long description from the relevant file
-with open(path.join(here, 'README.md'), encoding='utf-8') as f:
-    readme = f.read()
-
-
-# for packaging files must be in a package (with init) and listed in package_data
-# package-externals can be included with data_files,
-# and there is a bug in pattern matching http://bugs.python.org/issue19286
-# install unclear for data_files
-
-setup(
-    name='evcouplings',
-
-    # Version:
-    version='0.2.0',
-
-    description='A Framework for evolutionary couplings analysis',
-    long_description=readme,
-    long_description_content_type='text/markdown',
-
-    # The project's main homepage.
-    url='https://github.com/debbiemarkslab/EVcouplings',
-
-    # Author details
-    author='Thomas Hopf',
-    author_email='thomas.hopf@gmail.com',
-
-    # Choose your license
-    license='MIT',
-
-    # See https://pypi.python.org/pypi?%3Aaction=list_classifiers
-    classifiers=[
-        # How mature is this project? Common values are
-        #   3 - Alpha
-        #   4 - Beta
-        #   5 - Production/Stable
-        'Development Status :: 4 - Beta',
-
-        # Indicate who your project is intended for
-        'Intended Audience :: Developers',
-        'Topic :: Scientific/Engineering :: Bio-Informatics',
-
-        # The license as you wish (should match "license" above)
-        'License :: OSI Approved :: MIT License',
-        'Programming Language :: Python :: 3',
-        'Programming Language :: Python :: 3.10',
-        'Programming Language :: Python :: 3.11',
-    ],
-
-    # What EVcouplings relates to:
-    keywords='evolutionary couplings analysis',
-
-    # Specify packages via find_packages() and exclude the tests and
-    # documentation:
-    packages=find_packages(),
-
-    # If there are data files included in your packages that need to be
-    # installed, specify them here. If using Python 2.6 or less, then these
-    # have to be included in MANIFEST.in as well.
-    include_package_data=True,
-    package_data={
-        'evcouplings.fold.cns_templates': ['*.*'],
-        'evcouplings.couplings.scoring_models': ['*.*'],
-    },
-
-    #package_data is a lie:
-    # http://stackoverflow.com/questions/7522250/how-to-include-package-data-with-setuptools-distribute
-
-    # 'package_data' is used to also install non package data files
-    # see http://docs.python.org/3.4/distutils/setupscript.html#installing-additional-files
-    # example:
-    # data_files=data_files,
-
-    # Entry points provide cross-platform support and allow
-    # pip to create the appropriate form of executable for the target platform.
-    # IMPORTANT: script names need to be in lower case ! ! ! (otherwise
-    # deinstallation does not work)
-
-    # Note: evcouplings.utils.app depends on the names evcouplings_runcfg
-    # and evcouplings_summarize, so any change here must be applied there too!
-    entry_points={
-        'console_scripts': [
-            'evcouplings=evcouplings.utils.app:app',
-            'evcouplings_runcfg=evcouplings.utils.pipeline:app',
-            'evcouplings_summarize=evcouplings.utils.summarize:app',
-            'evcouplings_dbupdate=evcouplings.utils.update_database:app'
-        ],
-    },
-
-    # Runtime dependencies. (will be installed by pip when EVcouplings is installed)
-    #setup_requires=['setuptools>=18.2', 'numpy'],
-
-    install_requires=['setuptools>=18.2', 'numpy',
-        'pandas', 'scipy', 'numba', 'ruamel.yaml<0.18', 'matplotlib', 'requests',
-        'click', 'filelock', 'psutil', 'bokeh', 'jinja2',
-        'biopython>=1.84', 'seaborn', 'billiard', 'scikit-learn', 'msgpack'
-    ],
-
-)
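Note (illustrative, not part of the patch): with MANIFEST.in, include_package_data and package_data removed, the CNS templates and scoring models are expected to ship simply because they live inside the evcouplings package tree included in the hatch sdist/wheel. Assuming the two subpackages keep their current names, a quick post-install check with the standard library:

from importlib.resources import files

# both directories are regular subpackages, so their bundled data files can be listed as resources
for pkg in ("evcouplings.fold.cns_templates", "evcouplings.couplings.scoring_models"):
    data_files = [entry.name for entry in files(pkg).iterdir() if entry.is_file()]
    print(pkg, "->", len(data_files), "files")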