Skip to content

Commit

Permalink
Allow users to access the distances data as a numpy array
Browse files Browse the repository at this point in the history
- add read-only property `lt_array` to `DataSet` that provides the raw distances data as a 1-d numpy array
- add example of use to readme
- bump deps
- add python 3.13
- bump version
- resolves #134
  • Loading branch information
lkeegan committed Aug 19, 2024
1 parent 860fae9 commit d749829
Show file tree
Hide file tree
Showing 10 changed files with 38 additions and 14 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/wheels.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,13 @@ jobs:

strategy:
matrix:
os: [ubuntu-latest, macos-13, macos-14, windows-latest]
os: [ubuntu-latest, macos-13, macos-latest, windows-latest]

steps:
- uses: actions/checkout@v4
with:
submodules: "recursive"
- uses: pypa/cibuildwheel@v2.19
- uses: pypa/cibuildwheel@v2.20
env:
CIBW_MANYLINUX_X86_64_IMAGE: sameli/manylinux2014_x86_64_cuda_11.8
- uses: actions/upload-artifact@v4
Expand Down
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ repos:
- id: mixed-line-ending

- repo: https://github.com/psf/black
rev: 24.4.2
rev: 24.8.0
hooks:
- id: black

Expand All @@ -29,7 +29,7 @@ repos:
- id: prettier

- repo: https://github.com/python-jsonschema/check-jsonschema
rev: 0.28.6
rev: 0.29.1
hooks:
- id: check-github-workflows
- id: check-readthedocs
Expand Down
4 changes: 2 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
cmake_minimum_required(VERSION 3.23..3.29)
cmake_minimum_required(VERSION 3.23..3.30)

project(
hammingdist
VERSION 1.2.0
VERSION 1.3.0
LANGUAGES CXX)

include(CTest)
Expand Down
7 changes: 7 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,13 @@ data.dump_sparse("sparse.txt", threshold=3)
# If the `remove_duplicates` option was used, the sequence indices can also be written.
# For each input sequence, this prints the corresponding index in the output:
data.dump_sequence_indices("indices.txt")

# The lower-triangular distance elements can also be directly accessed as a 1-d numpy array:
lt_array = data.lt_array
# The elements in this array correspond to the 2-d indices (row=1,col=0), (row=2,col=0), (row=2,col=1), ...
# These indices can be generated using the numpy tril_indices function, e.g. to construct the lower-triangular matrix:
lt_matrix = np.zeros((n_seq, n_seq))
lt_matrix[np.tril_indices(n_seq, -1)] = lt_array
```

## Duplicates
Expand Down
2 changes: 1 addition & 1 deletion ext/Catch2
Submodule Catch2 updated 133 files
2 changes: 1 addition & 1 deletion ext/benchmark
Submodule benchmark updated 108 files
2 changes: 1 addition & 1 deletion ext/pybind11
Submodule pybind11 updated 50 files
+2 −0 .clang-tidy
+8 −0 .github/workflows/ci.yml
+30 −0 .github/workflows/emscripten.yaml
+1 −1 .github/workflows/format.yml
+1 −1 .github/workflows/pip.yml
+6 −6 .pre-commit-config.yaml
+1 −0 CMakeLists.txt
+1 −1 docs/advanced/cast/eigen.rst
+92 −0 docs/changelog.rst
+3 −3 docs/compiling.rst
+0 −4 docs/limitations.rst
+3 −3 docs/requirements.txt
+9 −2 include/pybind11/cast.h
+19 −3 include/pybind11/detail/common.h
+4 −2 include/pybind11/detail/init.h
+22 −7 include/pybind11/detail/internals.h
+2 −62 include/pybind11/detail/type_caster_base.h
+77 −0 include/pybind11/detail/value_and_holder.h
+0 −3 include/pybind11/eigen/tensor.h
+45 −34 include/pybind11/functional.h
+10 −1 include/pybind11/gil_safe_call_once.h
+6 −2 include/pybind11/numpy.h
+11 −2 include/pybind11/stl/filesystem.h
+1 −1 include/pybind11/stl_bind.h
+7 −2 include/pybind11/typing.h
+26 −3 pybind11/__main__.py
+1 −1 pybind11/_version.py
+10 −1 tests/CMakeLists.txt
+1 −1 tests/constructor_stats.h
+1 −0 tests/extra_python_package/test_files.py
+21 −0 tests/pyproject.toml
+5 −0 tests/test_async.py
+2 −0 tests/test_builtin_casters.py
+3 −0 tests/test_callbacks.py
+11 −5 tests/test_eigen_tensor.inl
+1 −1 tests/test_exceptions.py
+9 −4 tests/test_gil_scoped.py
+4 −0 tests/test_iostream.py
+6 −6 tests/test_modules.cpp
+2 −0 tests/test_numpy_dtypes.cpp
+1 −1 tests/test_opaque_types.cpp
+4 −4 tests/test_pytypes.cpp
+2 −2 tests/test_pytypes.py
+1 −0 tests/test_tagbased_polymorphic.cpp
+5 −0 tests/test_thread.py
+46 −0 tests/test_type_caster_std_function_specializations.cpp
+15 −0 tests/test_type_caster_std_function_specializations.py
+3 −0 tests/test_virtual_functions.py
+29 −3 tools/pybind11Common.cmake
+1 −1 tools/pybind11Config.cmake.in
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "scikit_build_core.build"

[project]
name = "hammingdist"
version = "1.2.0"
version = "1.3.0"
description = "A fast tool to calculate Hamming distances"
readme = "README.md"
license = {text = "MIT"}
Expand All @@ -23,6 +23,7 @@ classifiers=[
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Programming Language :: Python :: 3.13",
"Programming Language :: Python :: Implementation :: CPython",
"Programming Language :: Python :: Implementation :: PyPy",
"Operating System :: MacOS :: MacOS X",
Expand Down
10 changes: 8 additions & 2 deletions python/hammingdist.cc
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,10 @@ PYBIND11_MODULE(hammingdist, m) {
&DataSet<DefaultDistIntType>::dump_sequence_indices,
"Dump row index in distances matrix for each input sequence")
.def("__getitem__", &DataSet<DefaultDistIntType>::operator[])
.def_readonly("_distances", &DataSet<DefaultDistIntType>::result);
.def_readonly("_distances", &DataSet<DefaultDistIntType>::result)
.def_property_readonly("lt_array", [](DataSet<DefaultDistIntType> &self) {
return py::array(self.result.size(), self.result.data());
});

py::class_<DataSet<uint16_t>>(m, "DataSetLarge")
.def("dump", &DataSet<uint16_t>::dump,
Expand All @@ -58,7 +61,10 @@ PYBIND11_MODULE(hammingdist, m) {
.def("dump_sequence_indices", &DataSet<uint16_t>::dump_sequence_indices,
"Dump row index in distances matrix for each input sequence")
.def("__getitem__", &DataSet<uint16_t>::operator[])
.def_readonly("_distances", &DataSet<uint16_t>::result);
.def_readonly("_distances", &DataSet<uint16_t>::result)
.def_property_readonly("lt_array", [](DataSet<uint16_t> &self) {
return py::array(self.result.size(), self.result.data());
});

m.def("from_stringlist", &from_stringlist,
"Creates a dataset from a list of strings");
Expand Down
14 changes: 12 additions & 2 deletions python/tests/test_hammingdist.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ def write_fasta_file(filename, sequences):


def check_output_sizes(dat, n_in, n_out, tmp_out_file, fasta_sequence_indices=None):
assert dat.lt_array.shape == (n_out * (n_out - 1) // 2,)

dat.dump(tmp_out_file)
dump = np.loadtxt(tmp_out_file, delimiter=",")
assert len(dump) == n_out
Expand Down Expand Up @@ -97,8 +99,9 @@ def test_from_fasta(from_fasta_func, use_gpu, tmp_path):
)
@pytest.mark.parametrize("max_distance", [0, 1, 2, 3, 89, 497, 9999999])
def test_fasta_reference_distances(chars, include_x, max_distance, tmp_path):
# generate 50 sequences, each with 25 characters
sequences = ["".join(random.choices(chars, k=25)) for i in range(50)]
n_seq = 50
n_chars = 25
sequences = ["".join(random.choices(chars, k=n_chars)) for i in range(n_seq)]
fasta_file = str(tmp_path / "fasta.txt")
write_fasta_file(fasta_file, sequences)
# calculate distances matrix
Expand All @@ -108,6 +111,12 @@ def test_fasta_reference_distances(chars, include_x, max_distance, tmp_path):
include_x=include_x,
max_distance=max_distance,
)
# get lower-triangular data as 1-d array
lt_array = data.lt_array
assert lt_array.shape == (n_seq * (n_seq - 1) // 2,)
# reshape to lower-triangular matrix
lt_matrix = np.zeros((n_seq, n_seq), dtype=np.uint8)
lt_matrix[np.tril_indices(n_seq, -1)] = lt_array
# use each sequence in turn as the reference sequence & calculate reference distances
for i, sequence in enumerate(sequences):
vec = hammingdist.fasta_reference_distances(
Expand All @@ -120,6 +129,7 @@ def test_fasta_reference_distances(chars, include_x, max_distance, tmp_path):
# if x is not included, invalid chars have distance 1 but data[i,i] returns 0 by construction
if include_x or i != j:
assert data[i, j] == min(max_distance, dist)
assert lt_matrix[max(i, j), min(i, j)] == min(max_distance, dist)
# should also agree with output of distance function for these two sequences
assert dist == hammingdist.distance(
sequences[i], sequences[j], include_x=include_x
Expand Down

0 comments on commit d749829

Please sign in to comment.