diff --git a/.github/workflows/check.yml b/.github/workflows/check.yml index 41d8fc3..d11cb61 100644 --- a/.github/workflows/check.yml +++ b/.github/workflows/check.yml @@ -23,9 +23,9 @@ jobs: run: pip install -e ".[dev]" if: steps.python-cache.outputs.cache-hit != 'true' - # check code style - - name: Run black - run: black src --check --diff + # check with ruff + - name: Run ruff + run: ruff check # check docs - name: Check that documentation can be built diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index 401d802..456fe8a 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -11,7 +11,7 @@ on: env: # python version used to calculate and submit code coverage - COV_PYTHON_VERSION: "3.11" + COV_PYTHON_VERSION: "3.12" jobs: python-unit: diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f75929b..dfeb0d9 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,9 +1,11 @@ files: \.py repos: - - repo: https://github.com/psf/black - rev: 22.10.0 + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.3.4 hooks: - - id: black + - id: ruff + args: [ --select, I, --fix, --exit-non-zero-on-fix ] + - id: ruff-format - repo: https://github.com/pre-commit/pre-commit-hooks rev: v4.3.0 hooks: @@ -14,7 +16,7 @@ repos: - id: mixed-line-ending - id: trailing-whitespace - repo: https://github.com/pre-commit/mirrors-mypy - rev: v1.11.2 + rev: v1.13.0 hooks: - id: mypy - additional_dependencies: [types-python-dateutil] + additional_dependencies: [numpy] diff --git a/.pythonversion b/.pythonversion new file mode 100644 index 0000000..4516194 --- /dev/null +++ b/.pythonversion @@ -0,0 +1 @@ +Python 3.12.7 diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 3cfe1f6..415e17b 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -46,5 +46,6 @@ We use [All Contributors](https://allcontributors.org/) because we recognize tha ### Related blog posts -- [by Rebecca Sutton 
Koeser](#blog-rlskoeser) +(blog-rlskoeser)= +#### [by Rebecca Sutton Koeser](#blog-rlskoeser) - [Join me for a DHTech hackathon? It’s an un-date!](https://dh-tech.github.io/blog/2023-02-09-hackathon-summary/) 2023-02-09 on DHTech blog diff --git a/DEVELOPER_NOTES.md b/DEVELOPER_NOTES.md index 7817c88..6d4918c 100644 --- a/DEVELOPER_NOTES.md +++ b/DEVELOPER_NOTES.md @@ -31,10 +31,12 @@ If you cannot or prefer not to install git flow, you can do the equivalent manua ### Create a Python virtual environment -Use a recent version of python 3. We highly recommend using a python virtualenv, e.g. +Use a recent version of python 3 (we recommend 3.12). If you use [pyenv](https://github.com/pyenv/pyenv), run `pyenv install` to get the current recommended python version for development (specified in `.pythonversion`). + +We highly recommend using a python virtualenv to isolate dependencies, e.g. ``` -python3 -m venv undate -source undate/bin/activate +python3 -m venv .venv +source .venv/bin/activate ``` ### Install local version of undate with development python dependencies @@ -47,12 +49,12 @@ pip install -e ".[dev]" ### Install pre-commit hooks +We use [pre-commit](https://pre-commit.com/) for automated checks and consistent formatting. If you're planning to contribute, please install these when you set up your local development. + ```sh pre-commit install ``` -We use [pre-commit](https://pre-commit.com/) for automated checks and consistent formatting. If you're planning to contribute, please install these when you set up your local development. - ## Tests, documentation, and other checks ### Running unit tests @@ -65,7 +67,7 @@ To test cases by method name, use `-k`: `pytest -k test_str` ### Check python types -Python typing is currently enforced on pull requests as part of a GitHub Actions Continuous Integration check using `mypy`. 
+Python typing is currently enforced on pull requests as part of a GitHub Actions Continuous Integration check using `mypy` and via pre-commit hook. To check types locally: 1. Install the necessary typing libraries (first run only): diff --git a/README.md b/README.md index 2008c60..1d31ce7 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ It was initially created as part of a [DH-Tech](https://dh-tech.github.io/) hack [![Documentation Status](https://readthedocs.org/projects/undate-python/badge/?version=latest)](https://undate-python.readthedocs.io/en/latest/?badge=latest) [![unit tests](https://github.com/dh-tech/undate-python/actions/workflows/unit_tests.yml/badge.svg)](https://github.com/dh-tech/undate-python/actions/workflows/unit_tests.yml) [![codecov](https://codecov.io/gh/dh-tech/undate-python/branch/main/graph/badge.svg?token=GE7HZE8C9D)](https://codecov.io/gh/dh-tech/undate-python) -[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) +[![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff) [![All Contributors](https://img.shields.io/badge/all_contributors-5-orange.svg?style=flat-square)](CONTRIBUTORS.md) diff --git a/docs/conf.py b/docs/conf.py index 1294ce3..8961d82 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -86,4 +86,4 @@ } # turn on relative links; make sure both github and sphinx links work -m2r_parse_relative_links = True +myst_enable_extensions = ["linkify"] diff --git a/examples/notebooks/shxco_partial_date_durations.ipynb b/examples/notebooks/shxco_partial_date_durations.ipynb index 8d00a66..b89661f 100644 --- a/examples/notebooks/shxco_partial_date_durations.ipynb +++ b/examples/notebooks/shxco_partial_date_durations.ipynb @@ -21,7 +21,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "metadata": {}, "outputs": [ { @@ -29,8 +29,8 @@ "output_type": 
"stream", "text": [ "\n", - "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.0.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.0\u001b[0m\n", - "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49m/Users/rkoeser/workarea/env/undate/bin/python -m pip install --upgrade pip\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.1.2\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.2\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49m/Users/rkoeser/workarea/env/undate-py3.10/bin/python3.10 -m pip install --upgrade pip\u001b[0m\n", "Note: you may need to restart the kernel to use updated packages.\n" ] } @@ -311,17 +311,18 @@ "\n", "Define a method to initialize an `UndateInterval` from start and end date strings in ISO format as used in S&co datasets\n", "\n", - "**Note:** There's an off-by-one discrepancy between how we currently calculate duration in Undate and in the Shakespeare and Company Project code; becauS&co code counts the first day in the range but not the last (this could also be thought of as counting half of the start and end dates). For simplicity of comparison here, we subtract one day from the result returned by `UndateInterval.duration`." + "**Note:** There's an off-by-one discrepancy between how we currently calculate duration in Undate and in the Shakespeare and Company Project code. This is because S&co code counts the first day in the range but not the last (this could also be thought of as counting half of the start and end dates). For simplicity of comparison here, we subtract one day from the result returned by `UndateInterval.duration`." 
] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "metadata": { "id": "y_MqgrQW64uI" }, "outputs": [], "source": [ + "from undate.date import ONE_DAY\n", "from undate.undate import UndateInterval\n", "from undate.dateformat.iso8601 import ISO8601DateFormat\n", "\n", @@ -333,9 +334,8 @@ " interval = UndateInterval(earliest=unstart, latest=unend)\n", "\n", " # subtract one here for simplicity of comparison,\n", - " # to reconcile difference between how duration logic\n", - "\n", - " return interval.duration().days - 1" + " # to reconcile differences between duration logic\n", + " return interval.duration() - ONE_DAY" ] }, { @@ -353,7 +353,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 4, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -452,7 +452,7 @@ "260 4 months 122.0 " ] }, - "execution_count": 18, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -478,7 +478,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 5, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -497,15 +497,15 @@ "91.0 397\n", "365.0 337\n", " ... 
\n", - "69.0 1\n", - "36.0 1\n", - "73.0 1\n", - "574.0 1\n", - "171.0 1\n", + "200.0 1\n", + "277.0 1\n", + "169.0 1\n", + "45.0 1\n", + "38.0 1\n", "Name: count, Length: 133, dtype: int64" ] }, - "execution_count": 7, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -517,7 +517,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 6, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -540,7 +540,7 @@ "Name: subscription_duration_days, dtype: float64" ] }, - "execution_count": 8, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -558,7 +558,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 7, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -589,44 +589,25 @@ " \n", " \n", " \n", - " event_type\n", + " member_names\n", " start_date\n", " end_date\n", - " member_uris\n", - " member_names\n", - " member_sort_names\n", - " subscription_price_paid\n", - " subscription_deposit\n", " subscription_duration\n", " subscription_duration_days\n", - " ...\n", - " item_uri\n", - " item_title\n", - " item_volume\n", - " item_authors\n", - " item_year\n", - " item_notes\n", - " source_type\n", - " source_citation\n", - " source_manifest\n", - " source_image\n", " \n", " \n", " \n", " \n", "\n", - "

0 rows × 28 columns

\n", "" ], "text/plain": [ "Empty DataFrame\n", - "Columns: [event_type, start_date, end_date, member_uris, member_names, member_sort_names, subscription_price_paid, subscription_deposit, subscription_duration, subscription_duration_days, subscription_volumes, subscription_category, subscription_purchase_date, reimbursement_refund, borrow_status, borrow_duration_days, purchase_price, currency, item_uri, item_title, item_volume, item_authors, item_year, item_notes, source_type, source_citation, source_manifest, source_image]\n", - "Index: []\n", - "\n", - "[0 rows x 28 columns]" + "Columns: [member_names, start_date, end_date, subscription_duration, subscription_duration_days]\n", + "Index: []" ] }, - "execution_count": 12, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -638,7 +619,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 8, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -707,7 +688,7 @@ "13686 NaN 31.0 " ] }, - "execution_count": 19, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -726,7 +707,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 9, "metadata": { "id": "jwvN9-CgLQRx" }, @@ -746,7 +727,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 10, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -793,7 +774,7 @@ " 1928\n", " 1 year\n", " 365.0\n", - " 730\n", + " 730 days\n", " \n", " \n", " 70\n", @@ -802,7 +783,7 @@ " 1932\n", " 1 year\n", " 365.0\n", - " 730\n", + " 730 days\n", " \n", " \n", " 233\n", @@ -811,7 +792,7 @@ " 1921-08\n", " 1 month\n", " 31.0\n", - " 61\n", + " 61 days\n", " \n", " \n", " 234\n", @@ -820,7 +801,7 @@ " 1922-02\n", " 5 months\n", " 153.0\n", - " 180\n", + " 180 days\n", " \n", " \n", " 260\n", @@ -829,7 +810,7 @@ " 1923-10\n", " 4 months\n", " 122.0\n", - " 152\n", + " 152 days\n", " \n", " \n", "\n", @@ -843,15 +824,15 @@ "234 Anne Moderwell;Hiram 
Moderwell / H. K. Moderwell 1921-09 1922-02 \n", "260 Victor Llona 1923-06 1923-10 \n", "\n", - " subscription_duration subscription_duration_days undate_duration \n", - "28 1 year 365.0 730 \n", - "70 1 year 365.0 730 \n", - "233 1 month 31.0 61 \n", - "234 5 months 153.0 180 \n", - "260 4 months 122.0 152 " + " subscription_duration subscription_duration_days undate_duration \n", + "28 1 year 365.0 730 days \n", + "70 1 year 365.0 730 days \n", + "233 1 month 31.0 61 days \n", + "234 5 months 153.0 180 days \n", + "260 4 months 122.0 152 days " ] }, - "execution_count": 21, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -864,7 +845,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 11, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -911,7 +892,7 @@ " 1928\n", " 1 year\n", " 365.0\n", - " 730\n", + " 730 days\n", " \n", " \n", " 70\n", @@ -920,7 +901,7 @@ " 1932\n", " 1 year\n", " 365.0\n", - " 730\n", + " 730 days\n", " \n", " \n", " 233\n", @@ -929,7 +910,7 @@ " 1921-08\n", " 1 month\n", " 31.0\n", - " 61\n", + " 61 days\n", " \n", " \n", " 234\n", @@ -938,7 +919,7 @@ " 1922-02\n", " 5 months\n", " 153.0\n", - " 180\n", + " 180 days\n", " \n", " \n", " 260\n", @@ -947,7 +928,7 @@ " 1923-10\n", " 4 months\n", " 122.0\n", - " 152\n", + " 152 days\n", " \n", " \n", "\n", @@ -961,15 +942,15 @@ "234 Anne Moderwell;Hiram Moderwell / H. K. 
Moderwell 1921-09 1922-02 \n", "260 Victor Llona 1923-06 1923-10 \n", "\n", - " subscription_duration subscription_duration_days undate_duration \n", - "28 1 year 365.0 730 \n", - "70 1 year 365.0 730 \n", - "233 1 month 31.0 61 \n", - "234 5 months 153.0 180 \n", - "260 4 months 122.0 152 " + " subscription_duration subscription_duration_days undate_duration \n", + "28 1 year 365.0 730 days \n", + "70 1 year 365.0 730 days \n", + "233 1 month 31.0 61 days \n", + "234 5 months 153.0 180 days \n", + "260 4 months 122.0 152 days " ] }, - "execution_count": 23, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -981,7 +962,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 12, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -1029,7 +1010,7 @@ " 1928\n", " 1 year\n", " 365.0\n", - " 730\n", + " 730 days\n", " 365.0\n", " \n", " \n", @@ -1039,7 +1020,7 @@ " 1932\n", " 1 year\n", " 365.0\n", - " 730\n", + " 730 days\n", " 365.0\n", " \n", " \n", @@ -1049,7 +1030,7 @@ " 1921-08\n", " 1 month\n", " 31.0\n", - " 61\n", + " 61 days\n", " 30.0\n", " \n", " \n", @@ -1059,7 +1040,7 @@ " 1922-02\n", " 5 months\n", " 153.0\n", - " 180\n", + " 180 days\n", " 27.0\n", " \n", " \n", @@ -1069,7 +1050,7 @@ " 1923-10\n", " 4 months\n", " 122.0\n", - " 152\n", + " 152 days\n", " 30.0\n", " \n", " \n", @@ -1089,7 +1070,7 @@ " 1941-12-24\n", " 1 month\n", " 30.0\n", - " 30\n", + " 30 days\n", " 0.0\n", " \n", " \n", @@ -1099,7 +1080,7 @@ " 1941-12-24\n", " 1 month\n", " 30.0\n", - " 30\n", + " 30 days\n", " 0.0\n", " \n", " \n", @@ -1109,7 +1090,7 @@ " 1942-01-04\n", " 1 month\n", " 31.0\n", - " 31\n", + " 31 days\n", " 0.0\n", " \n", " \n", @@ -1119,7 +1100,7 @@ " 1942-03-08\n", " 3 months\n", " 90.0\n", - " 90\n", + " 90 days\n", " 0.0\n", " \n", " \n", @@ -1129,7 +1110,7 @@ " 1942-01-09\n", " 1 month\n", " 31.0\n", - " 31\n", + " 31 days\n", " 0.0\n", " \n", " \n", @@ -1164,36 +1145,36 @@ "35118 1942-03-08 3 months 
90.0 \n", "35119 1942-01-09 1 month 31.0 \n", "\n", - " undate_duration duration_diff \n", - "28 730 365.0 \n", - "70 730 365.0 \n", - "233 61 30.0 \n", - "234 180 27.0 \n", - "260 152 30.0 \n", - "... ... ... \n", - "35114 30 0.0 \n", - "35115 30 0.0 \n", - "35116 31 0.0 \n", - "35118 90 0.0 \n", - "35119 31 0.0 \n", + " undate_duration duration_diff \n", + "28 730 days 365.0 \n", + "70 730 days 365.0 \n", + "233 61 days 30.0 \n", + "234 180 days 27.0 \n", + "260 152 days 30.0 \n", + "... ... ... \n", + "35114 30 days 0.0 \n", + "35115 30 days 0.0 \n", + "35116 31 days 0.0 \n", + "35118 90 days 0.0 \n", + "35119 31 days 0.0 \n", "\n", "[9144 rows x 7 columns]" ] }, - "execution_count": 24, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# what's the difference between the two?\n", - "subs_duration['duration_diff'] = subs_duration.apply(lambda row: row.undate_duration - row.subscription_duration_days, axis=1)\n", + "subs_duration['duration_diff'] = subs_duration.apply(lambda row: row.undate_duration.astype(\"int\") - row.subscription_duration_days, axis=1)\n", "subs_duration" ] }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 13, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -1206,20 +1187,20 @@ "data": { "text/plain": [ "duration_diff\n", - " 0.0 9065\n", - " 30.0 30\n", - " 29.0 21\n", - " 1.0 10\n", - "-1.0 9\n", - " 28.0 4\n", - " 365.0 2\n", - " 27.0 1\n", - " 2.0 1\n", - "-3.0 1\n", + "0.0 9065\n", + "30.0 30\n", + "29.0 21\n", + "1.0 10\n", + "-1.0 9\n", + "28.0 4\n", + "365.0 2\n", + "27.0 1\n", + "2.0 1\n", + "-3.0 1\n", "Name: count, dtype: int64" ] }, - "execution_count": 25, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -1239,7 +1220,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 14, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -1287,7 +1268,7 @@ " 1928\n", " 1 year\n", " 365.0\n", - " 
730\n", + " 730 days\n", " 365.0\n", " \n", " \n", @@ -1297,7 +1278,7 @@ " 1932\n", " 1 year\n", " 365.0\n", - " 730\n", + " 730 days\n", " 365.0\n", " \n", " \n", @@ -1307,7 +1288,7 @@ " 1921-08\n", " 1 month\n", " 31.0\n", - " 61\n", + " 61 days\n", " 30.0\n", " \n", " \n", @@ -1317,7 +1298,7 @@ " 1922-02\n", " 5 months\n", " 153.0\n", - " 180\n", + " 180 days\n", " 27.0\n", " \n", " \n", @@ -1327,7 +1308,7 @@ " 1923-10\n", " 4 months\n", " 122.0\n", - " 152\n", + " 152 days\n", " 30.0\n", " \n", " \n", @@ -1337,7 +1318,7 @@ " 1923-09\n", " 1 month\n", " 31.0\n", - " 60\n", + " 60 days\n", " 29.0\n", " \n", " \n", @@ -1347,7 +1328,7 @@ " 1924-03\n", " 1 month\n", " 29.0\n", - " 59\n", + " 59 days\n", " 30.0\n", " \n", " \n", @@ -1357,7 +1338,7 @@ " 1924-04\n", " 2 months\n", " 60.0\n", - " 89\n", + " 89 days\n", " 29.0\n", " \n", " \n", @@ -1367,7 +1348,7 @@ " 1926-10\n", " 7 months\n", " 214.0\n", - " 244\n", + " 244 days\n", " 30.0\n", " \n", " \n", @@ -1377,7 +1358,7 @@ " 1926-12\n", " 1 month\n", " 30.0\n", - " 60\n", + " 60 days\n", " 30.0\n", " \n", " \n", @@ -1397,32 +1378,32 @@ "293 Madeleine Lorsignol 1926-03 1926-10 \n", "313 M. 
Mathieu 1926-11 1926-12 \n", "\n", - " subscription_duration subscription_duration_days undate_duration \\\n", - "28 1 year 365.0 730 \n", - "70 1 year 365.0 730 \n", - "233 1 month 31.0 61 \n", - "234 5 months 153.0 180 \n", - "260 4 months 122.0 152 \n", - "261 1 month 31.0 60 \n", - "271 1 month 29.0 59 \n", - "272 2 months 60.0 89 \n", - "293 7 months 214.0 244 \n", - "313 1 month 30.0 60 \n", + " subscription_duration subscription_duration_days undate_duration \\\n", + "28 1 year 365.0 730 days \n", + "70 1 year 365.0 730 days \n", + "233 1 month 31.0 61 days \n", + "234 5 months 153.0 180 days \n", + "260 4 months 122.0 152 days \n", + "261 1 month 31.0 60 days \n", + "271 1 month 29.0 59 days \n", + "272 2 months 60.0 89 days \n", + "293 7 months 214.0 244 days \n", + "313 1 month 30.0 60 days \n", "\n", - " duration_diff \n", - "28 365.0 \n", - "70 365.0 \n", - "233 30.0 \n", - "234 27.0 \n", - "260 30.0 \n", - "261 29.0 \n", - "271 30.0 \n", - "272 29.0 \n", - "293 30.0 \n", - "313 30.0 " + " duration_diff \n", + "28 365.0 \n", + "70 365.0 \n", + "233 30.0 \n", + "234 27.0 \n", + "260 30.0 \n", + "261 29.0 \n", + "271 30.0 \n", + "272 29.0 \n", + "293 30.0 \n", + "313 30.0 " ] }, - "execution_count": 41, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -1435,7 +1416,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 15, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -1455,14 +1436,14 @@ "4 months 5\n", "5 months 3\n", "1 year 2\n", - "7 months 2\n", "8 months 2\n", + "7 months 2\n", "11 months 1\n", "10 months 1\n", "Name: count, dtype: int64" ] }, - "execution_count": 27, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -1474,7 +1455,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 16, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -1522,7 +1503,7 @@ " 1921-08\n", " 1 month\n", " 31.0\n", - " 61\n", + " 61 
days\n", " 30.0\n", " \n", " \n", @@ -1532,7 +1513,7 @@ " 1923-09\n", " 1 month\n", " 31.0\n", - " 60\n", + " 60 days\n", " 29.0\n", " \n", " \n", @@ -1542,7 +1523,7 @@ " 1924-03\n", " 1 month\n", " 29.0\n", - " 59\n", + " 59 days\n", " 30.0\n", " \n", " \n", @@ -1552,7 +1533,7 @@ " 1926-12\n", " 1 month\n", " 30.0\n", - " 60\n", + " 60 days\n", " 30.0\n", " \n", " \n", @@ -1562,7 +1543,7 @@ " 1928-03\n", " 1 month\n", " 29.0\n", - " 59\n", + " 59 days\n", " 30.0\n", " \n", " \n", @@ -1572,7 +1553,7 @@ " 1928-03\n", " 1 month\n", " 29.0\n", - " 59\n", + " 59 days\n", " 30.0\n", " \n", " \n", @@ -1582,7 +1563,7 @@ " 1929-09\n", " 1 month\n", " 31.0\n", - " 60\n", + " 60 days\n", " 29.0\n", " \n", " \n", @@ -1592,7 +1573,7 @@ " 1929-09\n", " 1 month\n", " 31.0\n", - " 60\n", + " 60 days\n", " 29.0\n", " \n", " \n", @@ -1602,7 +1583,7 @@ " 1930-06\n", " 1 month\n", " 31.0\n", - " 60\n", + " 60 days\n", " 29.0\n", " \n", " \n", @@ -1612,7 +1593,7 @@ " 1930-12\n", " 1 month\n", " 30.0\n", - " 60\n", + " 60 days\n", " 30.0\n", " \n", " \n", @@ -1622,7 +1603,7 @@ " 1931-06\n", " 1 month\n", " 31.0\n", - " 60\n", + " 60 days\n", " 29.0\n", " \n", " \n", @@ -1632,7 +1613,7 @@ " 1931-07\n", " 1 month\n", " 30.0\n", - " 60\n", + " 60 days\n", " 30.0\n", " \n", " \n", @@ -1642,7 +1623,7 @@ " 1931-08\n", " 1 month\n", " 31.0\n", - " 61\n", + " 61 days\n", " 30.0\n", " \n", " \n", @@ -1652,7 +1633,7 @@ " 1931-09\n", " 1 month\n", " 31.0\n", - " 60\n", + " 60 days\n", " 29.0\n", " \n", " \n", @@ -1662,7 +1643,7 @@ " 1931-10\n", " 1 month\n", " 30.0\n", - " 60\n", + " 60 days\n", " 30.0\n", " \n", " \n", @@ -1687,25 +1668,25 @@ "468 Elaine Cammett 1931-08 1931-09 1 month \n", "472 Frederick McWilliam 1931-09 1931-10 1 month \n", "\n", - " subscription_duration_days undate_duration duration_diff \n", - "233 31.0 61 30.0 \n", - "261 31.0 60 29.0 \n", - "271 29.0 59 30.0 \n", - "313 30.0 60 30.0 \n", - "354 29.0 59 30.0 \n", - "356 29.0 59 30.0 \n", - "393 31.0 60 29.0 \n", - "394 
31.0 60 29.0 \n", - "430 31.0 60 29.0 \n", - "444 30.0 60 30.0 \n", - "462 31.0 60 29.0 \n", - "464 30.0 60 30.0 \n", - "466 31.0 61 30.0 \n", - "468 31.0 60 29.0 \n", - "472 30.0 60 30.0 " + " subscription_duration_days undate_duration duration_diff \n", + "233 31.0 61 days 30.0 \n", + "261 31.0 60 days 29.0 \n", + "271 29.0 59 days 30.0 \n", + "313 30.0 60 days 30.0 \n", + "354 29.0 59 days 30.0 \n", + "356 29.0 59 days 30.0 \n", + "393 31.0 60 days 29.0 \n", + "394 31.0 60 days 29.0 \n", + "430 31.0 60 days 29.0 \n", + "444 30.0 60 days 30.0 \n", + "462 31.0 60 days 29.0 \n", + "464 30.0 60 days 30.0 \n", + "466 31.0 61 days 30.0 \n", + "468 31.0 60 days 29.0 \n", + "472 30.0 60 days 30.0 " ] }, - "execution_count": 43, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -1728,7 +1709,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 17, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -1776,7 +1757,7 @@ " 1928\n", " 1 year\n", " 365.0\n", - " 730\n", + " 730 days\n", " 365.0\n", " \n", " \n", @@ -1786,7 +1767,7 @@ " 1932\n", " 1 year\n", " 365.0\n", - " 730\n", + " 730 days\n", " 365.0\n", " \n", " \n", @@ -1796,7 +1777,7 @@ " 1922-02\n", " 5 months\n", " 153.0\n", - " 180\n", + " 180 days\n", " 27.0\n", " \n", " \n", @@ -1806,7 +1787,7 @@ " 1923-10\n", " 4 months\n", " 122.0\n", - " 152\n", + " 152 days\n", " 30.0\n", " \n", " \n", @@ -1816,7 +1797,7 @@ " 1924-04\n", " 2 months\n", " 60.0\n", - " 89\n", + " 89 days\n", " 29.0\n", " \n", " \n", @@ -1826,7 +1807,7 @@ " 1926-10\n", " 7 months\n", " 214.0\n", - " 244\n", + " 244 days\n", " 30.0\n", " \n", " \n", @@ -1836,7 +1817,7 @@ " 1928-02\n", " 11 months\n", " 337.0\n", - " 365\n", + " 365 days\n", " 28.0\n", " \n", " \n", @@ -1846,7 +1827,7 @@ " 1927-10\n", " 3 months\n", " 92.0\n", - " 122\n", + " 122 days\n", " 30.0\n", " \n", " \n", @@ -1856,7 +1837,7 @@ " 1928-06\n", " 8 months\n", " 244.0\n", - " 273\n", + " 273 days\n", " 29.0\n", " 
\n", " \n", @@ -1866,7 +1847,7 @@ " 1928-04\n", " 3 months\n", " 91.0\n", - " 120\n", + " 120 days\n", " 29.0\n", " \n", " \n", @@ -1876,7 +1857,7 @@ " 1930-04\n", " 10 months\n", " 304.0\n", - " 333\n", + " 333 days\n", " 29.0\n", " \n", " \n", @@ -1886,7 +1867,7 @@ " 1930-04\n", " 3 months\n", " 90.0\n", - " 119\n", + " 119 days\n", " 29.0\n", " \n", " \n", @@ -1896,7 +1877,7 @@ " 1930-04\n", " 3 months\n", " 90.0\n", - " 119\n", + " 119 days\n", " 29.0\n", " \n", " \n", @@ -1906,7 +1887,7 @@ " 1930-09\n", " 8 months\n", " 243.0\n", - " 272\n", + " 272 days\n", " 29.0\n", " \n", " \n", @@ -1916,7 +1897,7 @@ " 1930-06\n", " 4 months\n", " 120.0\n", - " 149\n", + " 149 days\n", " 29.0\n", " \n", " \n", @@ -1941,42 +1922,42 @@ "412 Jacques Delmond 1930-01 1930-09 \n", "415 Loren Mozley 1930-02 1930-06 \n", "\n", - " subscription_duration subscription_duration_days undate_duration \\\n", - "28 1 year 365.0 730 \n", - "70 1 year 365.0 730 \n", - "234 5 months 153.0 180 \n", - "260 4 months 122.0 152 \n", - "272 2 months 60.0 89 \n", - "293 7 months 214.0 244 \n", - "321 11 months 337.0 365 \n", - "331 3 months 92.0 122 \n", - "337 8 months 244.0 273 \n", - "349 3 months 91.0 120 \n", - "388 10 months 304.0 333 \n", - "408 3 months 90.0 119 \n", - "409 3 months 90.0 119 \n", - "412 8 months 243.0 272 \n", - "415 4 months 120.0 149 \n", + " subscription_duration subscription_duration_days undate_duration \\\n", + "28 1 year 365.0 730 days \n", + "70 1 year 365.0 730 days \n", + "234 5 months 153.0 180 days \n", + "260 4 months 122.0 152 days \n", + "272 2 months 60.0 89 days \n", + "293 7 months 214.0 244 days \n", + "321 11 months 337.0 365 days \n", + "331 3 months 92.0 122 days \n", + "337 8 months 244.0 273 days \n", + "349 3 months 91.0 120 days \n", + "388 10 months 304.0 333 days \n", + "408 3 months 90.0 119 days \n", + "409 3 months 90.0 119 days \n", + "412 8 months 243.0 272 days \n", + "415 4 months 120.0 149 days \n", "\n", - " duration_diff \n", - "28 
365.0 \n", - "70 365.0 \n", - "234 27.0 \n", - "260 30.0 \n", - "272 29.0 \n", - "293 30.0 \n", - "321 28.0 \n", - "331 30.0 \n", - "337 29.0 \n", - "349 29.0 \n", - "388 29.0 \n", - "408 29.0 \n", - "409 29.0 \n", - "412 29.0 \n", - "415 29.0 " + " duration_diff \n", + "28 365.0 \n", + "70 365.0 \n", + "234 27.0 \n", + "260 30.0 \n", + "272 29.0 \n", + "293 30.0 \n", + "321 28.0 \n", + "331 30.0 \n", + "337 29.0 \n", + "349 29.0 \n", + "388 29.0 \n", + "408 29.0 \n", + "409 29.0 \n", + "412 29.0 \n", + "415 29.0 " ] }, - "execution_count": 44, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -2001,7 +1982,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 18, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -2087,7 +2068,7 @@ "606 G. E. Pulsford --01-20 --01-28 8.0" ] }, - "execution_count": 32, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -2101,7 +2082,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 19, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -2187,7 +2168,7 @@ "29908 Ann Samyn 1961-10-04 1962-03-21 168.0" ] }, - "execution_count": 33, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -2198,7 +2179,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 20, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -2243,7 +2224,7 @@ " --01-07\n", " --01-13\n", " 6.0\n", - " 6\n", + " 6 days\n", " \n", " \n", " 603\n", @@ -2251,7 +2232,7 @@ " --01-12\n", " --01-20\n", " 8.0\n", - " 8\n", + " 8 days\n", " \n", " \n", " 604\n", @@ -2259,7 +2240,7 @@ " --01-16\n", " --02-16\n", " 31.0\n", - " 31\n", + " 31 days\n", " \n", " \n", " 605\n", @@ -2267,7 +2248,7 @@ " --01-19\n", " --01-24\n", " 5.0\n", - " 5\n", + " 5 days\n", " \n", " \n", " 606\n", @@ -2275,7 +2256,7 @@ " --01-20\n", " --01-28\n", " 8.0\n", - " 8\n", + " 8 days\n", " \n", " \n", " 
607\n", @@ -2283,7 +2264,7 @@ " --01-24\n", " --03-20\n", " 55.0\n", - " 55\n", + " 55 days\n", " \n", " \n", " 608\n", @@ -2291,7 +2272,7 @@ " --01-24\n", " --03-20\n", " 55.0\n", - " 55\n", + " 55 days\n", " \n", " \n", " 609\n", @@ -2299,7 +2280,7 @@ " --01-24\n", " --03-20\n", " 55.0\n", - " 55\n", + " 55 days\n", " \n", " \n", " 610\n", @@ -2307,7 +2288,7 @@ " --01-24\n", " --05-30\n", " 126.0\n", - " 126\n", + " 126 days\n", " \n", " \n", " 611\n", @@ -2315,27 +2296,27 @@ " --01-24\n", " --05-30\n", " 126.0\n", - " 126\n", + " 126 days\n", " \n", " \n", "\n", "" ], "text/plain": [ - " member_names start_date end_date borrow_duration_days undate_duration\n", - "602 G. E. Pulsford --01-07 --01-13 6.0 6\n", - "603 G. E. Pulsford --01-12 --01-20 8.0 8\n", - "604 Robert D. Sage --01-16 --02-16 31.0 31\n", - "605 Gertrude Stein --01-19 --01-24 5.0 5\n", - "606 G. E. Pulsford --01-20 --01-28 8.0 8\n", - "607 Gertrude Stein --01-24 --03-20 55.0 55\n", - "608 Gertrude Stein --01-24 --03-20 55.0 55\n", - "609 Gertrude Stein --01-24 --03-20 55.0 55\n", - "610 Gertrude Stein --01-24 --05-30 126.0 126\n", - "611 Gertrude Stein --01-24 --05-30 126.0 126" + " member_names start_date end_date borrow_duration_days undate_duration\n", + "602 G. E. Pulsford --01-07 --01-13 6.0 6 days\n", + "603 G. E. Pulsford --01-12 --01-20 8.0 8 days\n", + "604 Robert D. Sage --01-16 --02-16 31.0 31 days\n", + "605 Gertrude Stein --01-19 --01-24 5.0 5 days\n", + "606 G. E. 
Pulsford --01-20 --01-28 8.0 8 days\n", + "607 Gertrude Stein --01-24 --03-20 55.0 55 days\n", + "608 Gertrude Stein --01-24 --03-20 55.0 55 days\n", + "609 Gertrude Stein --01-24 --03-20 55.0 55 days\n", + "610 Gertrude Stein --01-24 --05-30 126.0 126 days\n", + "611 Gertrude Stein --01-24 --05-30 126.0 126 days" ] }, - "execution_count": 34, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -2348,7 +2329,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 21, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -2394,7 +2375,7 @@ " --01-07\n", " --01-13\n", " 6.0\n", - " 6\n", + " 6 days\n", " 0.0\n", " \n", " \n", @@ -2403,7 +2384,7 @@ " --01-12\n", " --01-20\n", " 8.0\n", - " 8\n", + " 8 days\n", " 0.0\n", " \n", " \n", @@ -2412,7 +2393,7 @@ " --01-16\n", " --02-16\n", " 31.0\n", - " 31\n", + " 31 days\n", " 0.0\n", " \n", " \n", @@ -2421,7 +2402,7 @@ " --01-19\n", " --01-24\n", " 5.0\n", - " 5\n", + " 5 days\n", " 0.0\n", " \n", " \n", @@ -2430,7 +2411,7 @@ " --01-20\n", " --01-28\n", " 8.0\n", - " 8\n", + " 8 days\n", " 0.0\n", " \n", " \n", @@ -2439,7 +2420,7 @@ " --01-24\n", " --03-20\n", " 55.0\n", - " 55\n", + " 55 days\n", " 0.0\n", " \n", " \n", @@ -2448,7 +2429,7 @@ " --01-24\n", " --03-20\n", " 55.0\n", - " 55\n", + " 55 days\n", " 0.0\n", " \n", " \n", @@ -2457,7 +2438,7 @@ " --01-24\n", " --03-20\n", " 55.0\n", - " 55\n", + " 55 days\n", " 0.0\n", " \n", " \n", @@ -2466,7 +2447,7 @@ " --01-24\n", " --05-30\n", " 126.0\n", - " 126\n", + " 126 days\n", " 0.0\n", " \n", " \n", @@ -2475,7 +2456,7 @@ " --01-24\n", " --05-30\n", " 126.0\n", - " 126\n", + " 126 days\n", " 0.0\n", " \n", " \n", @@ -2483,45 +2464,45 @@ "" ], "text/plain": [ - " member_names start_date end_date borrow_duration_days \\\n", - "602 G. E. Pulsford --01-07 --01-13 6.0 \n", - "603 G. E. Pulsford --01-12 --01-20 8.0 \n", - "604 Robert D. 
Sage --01-16 --02-16 31.0 \n", - "605 Gertrude Stein --01-19 --01-24 5.0 \n", - "606 G. E. Pulsford --01-20 --01-28 8.0 \n", - "607 Gertrude Stein --01-24 --03-20 55.0 \n", - "608 Gertrude Stein --01-24 --03-20 55.0 \n", - "609 Gertrude Stein --01-24 --03-20 55.0 \n", - "610 Gertrude Stein --01-24 --05-30 126.0 \n", - "611 Gertrude Stein --01-24 --05-30 126.0 \n", + " member_names start_date end_date borrow_duration_days undate_duration \\\n", + "602 G. E. Pulsford --01-07 --01-13 6.0 6 days \n", + "603 G. E. Pulsford --01-12 --01-20 8.0 8 days \n", + "604 Robert D. Sage --01-16 --02-16 31.0 31 days \n", + "605 Gertrude Stein --01-19 --01-24 5.0 5 days \n", + "606 G. E. Pulsford --01-20 --01-28 8.0 8 days \n", + "607 Gertrude Stein --01-24 --03-20 55.0 55 days \n", + "608 Gertrude Stein --01-24 --03-20 55.0 55 days \n", + "609 Gertrude Stein --01-24 --03-20 55.0 55 days \n", + "610 Gertrude Stein --01-24 --05-30 126.0 126 days \n", + "611 Gertrude Stein --01-24 --05-30 126.0 126 days \n", "\n", - " undate_duration duration_diff \n", - "602 6 0.0 \n", - "603 8 0.0 \n", - "604 31 0.0 \n", - "605 5 0.0 \n", - "606 8 0.0 \n", - "607 55 0.0 \n", - "608 55 0.0 \n", - "609 55 0.0 \n", - "610 126 0.0 \n", - "611 126 0.0 " + " duration_diff \n", + "602 0.0 \n", + "603 0.0 \n", + "604 0.0 \n", + "605 0.0 \n", + "606 0.0 \n", + "607 0.0 \n", + "608 0.0 \n", + "609 0.0 \n", + "610 0.0 \n", + "611 0.0 " ] }, - "execution_count": 36, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# what's the difference between the two?\n", - "borrow_duration['duration_diff'] = borrow_duration.apply(lambda row: row.undate_duration - row.borrow_duration_days, axis=1)\n", + "borrow_duration['duration_diff'] = borrow_duration.apply(lambda row: row.undate_duration.astype(\"int\") - row.borrow_duration_days, axis=1)\n", "borrow_duration.head(10)" ] }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 22, "metadata": { "colab": { 
"base_uri": "https://localhost:8080/" @@ -2538,7 +2519,7 @@ "Name: count, dtype: int64" ] }, - "execution_count": 37, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -2556,14 +2537,21 @@ "source": [ "Woohoo, everything matches! πŸŽ‰\n", "\n", + "* * * \n", + "\n", "In a previous run, there were two borrow events where the calculation did not match; this was due to an error in undate duration method when the start and end dates have unknown years and dates wrap to the following year (e.g., december to january), which has now been corrected.\n", "\n", - "**Note:** One of those events has a range (--06-07/--06-06) that looks like a data error in S&co, but the data matches what is [written on the lending card](https://shakespeareandco.princeton.edu/members/davet-yvonne/cards/cf96d38f-e651-491c-a575-131ea32ce425/#)." + "**Note:** One of those events has a range (--06-07/--06-06) that looks like a data error in S&co, but the data matches what is [written on the lending card](https://shakespeareandco.princeton.edu/members/davet-yvonne/cards/cf96d38f-e651-491c-a575-131ea32ce425/#).\n", + "\n", + "* * * \n", + "\n", + "In a preliminary implementation of the numpy datetime64 integration, the new earliest possible year turned out to be a leap year, resulting in the counts for Gertrude Stein's borrows from January to March to be off by one. This was corrected by adjusting the minimum year by one to ensure it is not a leap year.\n", + "\n" ] }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 28, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -2572,54 +2560,10 @@ "id": "-Bq76gtDWljg", "outputId": "f1ee526d-b938-4cbf-e93c-c6c91c077ae7" }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
member_namesstart_dateend_dateborrow_duration_daysundate_durationduration_diff
\n", - "
" - ], - "text/plain": [ - "Empty DataFrame\n", - "Columns: [member_names, start_date, end_date, borrow_duration_days, undate_duration, duration_diff]\n", - "Index: []" - ] - }, - "execution_count": 40, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "borrow_duration[borrow_duration.duration_diff != 0]" + "# Confirm that we have no mismatches\n", + "assert len(borrow_duration[borrow_duration.duration_diff != 0]) == 0" ] } ], @@ -2648,7 +2592,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.13" + "version": "3.10.14" } }, "nbformat": 4, diff --git a/pyproject.toml b/pyproject.toml index 18647e4..7cc27da 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,24 +6,28 @@ build-backend = "hatchling.build" name = "undate" description = "library for working with uncertain, fuzzy, or partially unknown dates and date intervals" readme = "README.md" -license = {text = "Apache-2"} +license = { text = "Apache-2" } requires-python = ">= 3.9" dynamic = ["version"] -dependencies = [ - "python-dateutil", - "lark" -] +dependencies = ["python-dateutil", "lark", "numpy"] authors = [ - {name = "Rebecca Sutton Koeser"}, - {name = "Cole Crawford"}, - {name = "Julia Damerow"}, - {name = "Robert Casties"}, - {name = "Malte Vogl"}, -# {name = "DHTech", email="dhtech.community@gmail.com"} ? 
+ { name = "Rebecca Sutton Koeser" }, + { name = "Cole Crawford" }, + { name = "Julia Damerow" }, + { name = "Robert Casties" }, + { name = "Malte Vogl" }, ] # currently no maintainers separate from authors -keywords = ["dates", "dating", "uncertainty", "uncertain-dates", "unknown", "partially-known", "digital-humanities"] +keywords = [ + "dates", + "dating", + "uncertainty", + "uncertain-dates", + "unknown", + "partially-known", + "digital-humanities", +] classifiers = [ "Development Status :: 2 - Pre-Alpha", "Programming Language :: Python :: 3", @@ -33,36 +37,32 @@ classifiers = [ "Programming Language :: Python :: 3.12", "Programming Language :: Python :: 3.13", "Intended Audience :: Developers", + "Intended Audience :: Science/Research", "License :: OSI Approved :: Apache Software License", "Operating System :: OS Independent", "Topic :: Software Development :: Libraries :: Python Modules", + "Topic :: Scientific/Engineering", "Topic :: Utilities", - "Typing :: Typed" + "Typing :: Typed", + ] [project.optional-dependencies] -docs = [ - "sphinx>=7.0.0", - "alabaster", - "myst-parser" -] +docs = ["sphinx>=7.0.0", "alabaster", "myst-parser", "myst-parser[linkify]"] dev = [ - "black>=22.10.0", - "pre-commit>=2.20.0", - "twine", - "wheel", - "build", - "mypy", - "treon", - "undate", - "undate[docs]", - "undate[test]", -] -test = [ - "pytest>=7.2", - "pytest-ordering", - "pytest-cov", + "ruff", + "pre-commit>=2.20.0", + "twine", + "wheel", + "build", + "mypy", + "treon", + "undate", + "undate[docs]", + "undate[test]", ] +test = ["pytest>=7.2", "pytest-ordering", "pytest-cov"] +all = ["undate[dev]", "undate[test]"] [project.urls] Homepage = "https://github.com/dh-tech/undate-python" @@ -76,15 +76,14 @@ Changelog = "https://github.com/dh-tech/undate/main/master/CHANGELOG.md" path = "src/undate/__init__.py" [tool.hatch.build.targets.sdist] -include = [ - "src/undate/**/*.py", - "src/undate/**/*.lark", - "/tests", -] +include = ["src/undate/**/*.py", 
"src/undate/**/*.lark", "/tests"] [tool.pytest.ini_options] pythonpath = "src/" markers = [ "last : run marked tests after all others", - "first : run marked tests before all others" -] \ No newline at end of file + "first : run marked tests efore all others", +] + +[tool.mypy] +plugins = ["numpy.typing.mypy_plugin"] diff --git a/src/undate/date.py b/src/undate/date.py new file mode 100644 index 0000000..bac47f3 --- /dev/null +++ b/src/undate/date.py @@ -0,0 +1,146 @@ +from enum import IntEnum + +# Pre 3.10 requires Union for multiple types, e.g. Union[int, None] instead of int | None +from typing import Optional, Union + +import numpy as np + + +class Timedelta(np.ndarray): + """Convenience class to make :class:`numpy.timedelta64` act + more like the built-in python :class:`datetime.timedelta`.""" + + def __new__(cls, deltadays: Union[np.timedelta64, int]): + if isinstance(deltadays, int): + deltadays = np.timedelta64(deltadays, "D") + data = np.asarray(deltadays, dtype="timedelta64") + return data.view(cls) + + def Export(self): + return self + + def __array_finalize__(self, obj): + if obj is None: + return + + @property + def days(self) -> int: + """number of days, as an integer""" + return int(self.astype("datetime64[D]").astype("int")) + + +#: timedelta for single day +ONE_DAY = Timedelta(1) # ~ equivalent to datetime.timedelta(days=1) +#: timedelta for a single year (non-leap year) +ONE_YEAR = Timedelta(365) # ~ relativedelta(years=1) +#: timedelta for a month, assuming maximum month length (31 days) +ONE_MONTH_MAX = Timedelta(31) + + +class Date(np.ndarray): + """Convenience class to make :class:`numpy.datetime64` act + more like the built-in python :class:`datetime.date`.""" + + # extend np.datetime64 datatype + # adapted from https://stackoverflow.com/a/27129510/9706217 + + def __new__( + cls, + year: Union[int, np.datetime64], + month: Optional[int] = None, + day: Optional[int] = None, + ): + if isinstance(year, np.datetime64): + _data = year + else: + 
datestr = str(year) + if month is not None: + datestr = f"{year}-{month:02d}" + if day is not None: + datestr = f"{datestr}-{day:02d}" + _data = np.datetime64(datestr) + + data = np.asarray(_data, dtype="datetime64") + + # expected dtype depends on date unit / how much of date is known + expected_unit = "Y" + if day is not None and month is not None: + expected_unit = "D" + elif month: + expected_unit = "M" + expected_dtype = f"datetime64[{expected_unit}]" + + if data.dtype != expected_dtype: + raise ValueError( + f"Unable to parse dates adequately as {expected_dtype}: {data}" + ) + obj = data.view(cls) + return obj + + def Export(self): + return self + + def __array_finalize__(self, obj): + if obj is None: + return + + # custom properties to access year, month, day + + @property + def year(self) -> int: + return int(str(self.astype("datetime64[Y]"))) + + @property + def month(self) -> Optional[int]: + # if date unit is year, don't return a month (only M/D) + if self.dtype != "datetime64[Y]": + return int(str(self.astype("datetime64[M]")).split("-")[-1]) + return None + + @property + def day(self) -> Optional[int]: + # only return a day if date unit is in days + if self.dtype == "datetime64[D]": + return int(str(self.astype("datetime64[D]")).split("-")[-1]) + return None + + def __sub__(self, other): + # modify to conditionally return a timedelta object instead of a + # Date object with dtype timedelta64[D] (default behavior) + + result = super().__sub__(other) + # if the result has a timedelta type (i.e., date minus date = timedelta), + # cast to local Timedelta object; otherwise, leave as is + # (i.e., date minus timedelta = date) + if result.dtype == "timedelta64[D]": + result = Timedelta(result) + return result + + # NOTE: add should not be subclassed because we want to return a Date, not a delta + + +class DatePrecision(IntEnum): + """date precision, to indicate date precision independent from how much + of the date is known.""" + + # NOTE: values MUST be 
ordered based on the relative size or + # precision of the time unit. That is, the smaller the unit, the more precise + # it is: a day is more precise than a month, a month is more precise than a year, + # (DatePrecision.year < DatePrecision.month) + + #: century + CENTURY = 1 + #: decade + DECADE = 2 + #: year + YEAR = 3 + #: month + MONTH = 4 + #: day + DAY = 5 + + def __str__(self): + return f"{self.name}" + + # NOTE: consider harmonizing / using numpy date units: + # years (‘Y’), months (‘M’), weeks (‘W’), and days (‘D’) diff --git a/src/undate/dateformat/__init__.py b/src/undate/dateformat/__init__.py index 7092a80..5dc5c3c 100644 --- a/src/undate/dateformat/__init__.py +++ b/src/undate/dateformat/__init__.py @@ -1,3 +1,3 @@ -from undate.dateformat.base import BaseDateFormat +from undate.dateformat.base import BaseDateFormat as BaseDateFormat # from undate.dateformat.iso8601 import ISO8601DateFormat diff --git a/src/undate/dateformat/base.py b/src/undate/dateformat/base.py index f4435f4..59777b1 100644 --- a/src/undate/dateformat/base.py +++ b/src/undate/dateformat/base.py @@ -15,9 +15,8 @@ import importlib import logging import pkgutil +from functools import cache from typing import Dict, Type -from functools import lru_cache # functools.cache not available until 3.9 - logger = logging.getLogger(__name__) @@ -40,7 +39,7 @@ def to_string(self, undate) -> str: # cache import class method to ensure we only import once @classmethod - @lru_cache + @cache def import_formatters(cls) -> int: """Import all undate.dateformat formatters so that they will be included in available formatters diff --git a/src/undate/dateformat/edtf/parser.py b/src/undate/dateformat/edtf/parser.py index 8826b2d..6ab5139 100644 --- a/src/undate/dateformat/edtf/parser.py +++ b/src/undate/dateformat/edtf/parser.py @@ -2,7 +2,6 @@ from lark import Lark - grammar_path = os.path.join(os.path.dirname(__file__), "edtf.lark") with open(grammar_path) as grammar: diff --git
a/src/undate/dateformat/edtf/transformer.py b/src/undate/dateformat/edtf/transformer.py index cca3609..a5578de 100644 --- a/src/undate/dateformat/edtf/transformer.py +++ b/src/undate/dateformat/edtf/transformer.py @@ -1,4 +1,5 @@ -from lark import Transformer, Tree, Token +from lark import Token, Transformer, Tree + from undate.undate import Undate, UndateInterval @@ -67,4 +68,4 @@ def year_fivedigitsplus(self, token): # strip off the leading Y and convert to integer # TODO: undate is currently limited to 4-digit years # (datetime max year of 9999) - return tok.update(int(token[:1])) + return token.update(int(token[:1])) diff --git a/src/undate/dateformat/iso8601.py b/src/undate/dateformat/iso8601.py index f1c5cca..aa3296c 100644 --- a/src/undate/dateformat/iso8601.py +++ b/src/undate/dateformat/iso8601.py @@ -1,6 +1,7 @@ +from typing import Dict, List, Union + from undate.dateformat.base import BaseDateFormat from undate.undate import Undate, UndateInterval -from typing import Dict, List, Union class ISO8601DateFormat(BaseDateFormat): @@ -61,8 +62,11 @@ def to_string(self, undate: Undate) -> str: # and not others; force year to always be 4 digits if date_portion == "year": date_parts.append("%04d" % undate.earliest.year) - else: - date_parts.append(undate.earliest.strftime(iso_format)) + elif date_portion == "month" and undate.earliest.month: + date_parts.append("%02d" % undate.earliest.month) + elif date_portion == "day" and undate.earliest.day: + date_parts.append("%02d" % undate.earliest.day) # type: ignore + elif date_portion == "year": # if not known but this is year, add '-' for --MM-DD unknown year format date_parts.append("-") diff --git a/src/undate/undate.py b/src/undate/undate.py index 971ded7..3ee5dc4 100644 --- a/src/undate/undate.py +++ b/src/undate/undate.py @@ -1,48 +1,24 @@ import datetime -from calendar import monthrange -from enum import IntEnum import re +from calendar import monthrange # Pre 3.10 requires Union for multiple types, e.g. 
Union[int, None] instead of int | None -from typing import Optional, Dict, Union - -from dateutil.relativedelta import relativedelta +from typing import Dict, Optional, Union +from undate.date import ONE_DAY, ONE_MONTH_MAX, ONE_YEAR, Date, DatePrecision, Timedelta from undate.dateformat.base import BaseDateFormat -#: duration of a single day -ONE_DAY = datetime.timedelta(days=1) - - -class DatePrecision(IntEnum): - """date precision, to indicate date precision independent from how much - of the date is known.""" - - # numbers should be set to allow logical greater than / less than - # comparison, e.g. year precision < month - - #: day - DAY = 1 - #: month - MONTH = 2 - #: year - YEAR = 3 - - def __str__(self): - return f"{self.name}" - - class Undate: - """Simple object for representing uncertain, fuzzy or partially unknown dates""" + """object for representing uncertain, fuzzy or partially unknown dates""" DEFAULT_FORMAT: str = "ISO8601" #: symbol for unknown digits within a date value MISSING_DIGIT: str = "X" - earliest: datetime.date - latest: datetime.date + earliest: Date + latest: Date #: A string to label a specific undate, e.g. "German Unity Date 2022" for Oct. 3, 2022. #: Labels are not taken into account when comparing undate objects. label: Union[str, None] = None @@ -52,6 +28,13 @@ class Undate: #: known non-leap year NON_LEAP_YEAR: int = 2022 + # numpy datetime is stored as 64-bit integer, so min/max + # depends on the time unit; assume days for now + # See https://numpy.org/doc/stable/reference/arrays.datetime.html#datetime-units + # It just so happens that int(2.5e16) is a leap year, which is a weird default, + # so let's increase our lower bound by one year. 
+ MIN_ALLOWABLE_YEAR = int(-2.5e16) + 1 + MAX_ALLOWABLE_YEAR = int(2.5e16) def __init__( self, @@ -88,14 +71,16 @@ def __init__( min_year = int(str(year).replace(self.MISSING_DIGIT, "0")) max_year = int(str(year).replace(self.MISSING_DIGIT, "9")) else: - min_year = datetime.MINYEAR - max_year = datetime.MAXYEAR + # use the configured min/max allowable years if we + # don't have any other bounds + min_year = self.MIN_ALLOWABLE_YEAR + max_year = self.MAX_ALLOWABLE_YEAR # if month is passed in as a string but completely unknown, # treat as none # TODO: we should preserve this information somehow; # difference between just a year and and an unknown month within a year - # maybe in terms of granularity / size ? + # maybe in terms of date precision ? if month == "XX": month = None @@ -141,10 +126,13 @@ def __init__( if day is not None: min_day, max_day = self._missing_digit_minmax(day, min_day, max_day) + # TODO: special case, if we get a Feb 29 date with unknown year, + # must switch the min/max years to known leap years! 
+ # for unknowns, assume smallest possible value for earliest and # largest valid for latest - self.earliest = datetime.date(min_year, min_month, min_day) - self.latest = datetime.date(max_year, max_month, max_day) + self.earliest = Date(min_year, min_month, min_day) + self.latest = Date(max_year, max_month, max_day) if formatter is None: # import all subclass definitions; initialize the default @@ -171,7 +159,7 @@ def __str__(self) -> str: f"{day:02d}" if isinstance(day, int) else day, ] # combine, skipping any values that are None - return "-".join([str(p) for p in parts if p != None]) + return "-".join([str(p) for p in parts if p is not None]) return self.formatter.to_string(self) @@ -261,7 +249,7 @@ def __gt__(self, other: object) -> bool: # strictly greater than must rule out equals return not (self < other or self == other) - def __le__(self, other: Union["Undate", datetime.date]) -> bool: + def __le__(self, other: object) -> bool: return self == other or self < other def __contains__(self, other: object) -> bool: @@ -272,15 +260,20 @@ def __contains__(self, other: object) -> bool: if self == other: return False - return ( - self.earliest <= other.earliest - and self.latest >= other.latest - # is precision sufficient for comparing partially known dates? - and self.precision > other.precision + return all( + [ + self.earliest <= other.earliest, + self.latest >= other.latest, + # is precision sufficient for comparing partially known dates? + # checking based on less precise /less granular time unit, + # e.g. 
a day or month could be contained in a year + # but not the reverse + self.precision < other.precision, + ] ) @staticmethod - def from_datetime_date(dt_date): + def from_datetime_date(dt_date: datetime.date): """Initialize an :class:`Undate` object from a :class:`datetime.date`""" return Undate(dt_date.year, dt_date.month, dt_date.day) @@ -300,7 +293,7 @@ def is_known(self, part: str) -> bool: def is_partially_known(self, part: str) -> bool: return isinstance(self.initial_values[part], str) - def duration(self) -> datetime.timedelta: + def duration(self) -> Timedelta: """What is the duration of this date? Calculate based on earliest and latest date within range, taking into account the precision of the date even if not all @@ -318,16 +311,18 @@ def duration(self) -> datetime.timedelta: if not self.known_year: # if year is unknown, calculate month duration in # a single year - latest = datetime.date( - self.earliest.year, self.latest.month, self.latest.day - ) + latest = Date(self.earliest.year, self.latest.month, self.latest.day) + + # latest = datetime.date( + # self.earliest.year, self.latest.month, self.latest.day + # ) delta = latest - self.earliest + ONE_DAY # month duration can't ever be more than 31 days # (could we ever know if it's smaller?) # if granularity == month but not known month, duration = 31 - if delta.days > 31: - return datetime.timedelta(days=31) + if delta.astype(int) > 31: + return ONE_MONTH_MAX return delta # otherwise, calculate based on earliest/latest range @@ -407,11 +402,11 @@ def __eq__(self, other) -> bool: # consider interval equal if both dates are equal return self.earliest == other.earliest and self.latest == other.latest - def duration(self) -> datetime.timedelta: + def duration(self) -> Timedelta: """Calculate the duration between two undates. :returns: A duration - :rtype: timedelta + :rtype: Timedelta """ # what is the duration of this date range? 
@@ -432,7 +427,7 @@ def duration(self) -> datetime.timedelta: # to the beginning of the next; # recalculate assuming second date is in the subsequent year if duration.days < 0: - end = self.latest.earliest + relativedelta(years=1) + end = self.latest.earliest + ONE_YEAR duration = end - self.earliest.earliest # add the additional day *after* checking for a negative diff --git a/tests/test_date.py b/tests/test_date.py new file mode 100644 index 0000000..5ff017d --- /dev/null +++ b/tests/test_date.py @@ -0,0 +1,79 @@ +import numpy as np +from undate.date import ONE_YEAR, Date, DatePrecision, Timedelta + + +class TestDatePrecision: + def test_str(self): + assert str(DatePrecision.YEAR) == "YEAR" + + def test_precision_comparison(self): + assert DatePrecision.DAY > DatePrecision.MONTH + assert DatePrecision.MONTH > DatePrecision.YEAR + + +class TestDate: + def test_init_year(self): + d = Date(2001) + assert isinstance(d, Date) + assert d.dtype == "datetime64[Y]" + assert str(d) == "2001" + + def test_init_year_np_datetime64(self): + d = Date(np.datetime64("2024")) + assert isinstance(d, Date) + assert d.dtype == "datetime64[Y]" + assert str(d) == "2024" + + def test_init_year_month(self): + d = Date(2010, 5) + assert isinstance(d, Date) + assert d.dtype == "datetime64[M]" + assert str(d) == "2010-05" + + def test_init_year_month_day(self): + d = Date(2021, 6, 15) + assert isinstance(d, Date) + assert d.dtype == "datetime64[D]" + assert str(d) == "2021-06-15" + + def test_properties_year(self): + assert Date(2001).year == 2001 + assert Date(2010, 5).year == 2010 + assert Date(2021, 6, 15).year == 2021 + + def test_properties_month(self): + assert Date(2001).month is None + assert Date(2010, 5).month == 5 + assert Date(2021, 6, 15).month == 6 + + def test_properties_day(self): + assert Date(2001).day is None + assert Date(2010, 5).day is None + assert Date(2021, 6, 15).day == 15 + + def test_substract(self): + # date - date = timedelta + date_difference = Date(2024, 1, 
2) - Date(2024, 1, 1) + assert isinstance(date_difference, Timedelta) + assert date_difference.days == 1 + + # date - timedelta = date + year_prior = Date(2024, 1, 2) - ONE_YEAR + assert isinstance(year_prior, Date) + + +class TestTimeDelta: + def test_init_from_int(self): + td = Timedelta(31) + assert isinstance(td, Timedelta) + assert td.dtype == "timedelta64[D]" + assert td.astype("int") == 31 + + def test_init_from_np_timedelta64(self): + td = Timedelta(np.timedelta64(12, "D")) + assert isinstance(td, Timedelta) + assert td.dtype == "timedelta64[D]" + assert td.astype("int") == 12 + + def test_days(self): + assert Timedelta(10).days == 10 diff --git a/tests/test_dateformat/edtf/test_edtf_parser.py b/tests/test_dateformat/edtf/test_edtf_parser.py index 5a2b8ea..3a2604b 100644 --- a/tests/test_dateformat/edtf/test_edtf_parser.py +++ b/tests/test_dateformat/edtf/test_edtf_parser.py @@ -1,5 +1,4 @@ import pytest - from undate.dateformat.edtf.parser import edtf_parser # for now, just test that valid dates can be parsed diff --git a/tests/test_dateformat/edtf/test_edtf_transformer.py b/tests/test_dateformat/edtf/test_edtf_transformer.py index 3271b8b..821e42e 100644 --- a/tests/test_dateformat/edtf/test_edtf_transformer.py +++ b/tests/test_dateformat/edtf/test_edtf_transformer.py @@ -1,8 +1,7 @@ import pytest - -from undate.undate import Undate, UndateInterval from undate.dateformat.edtf.parser import edtf_parser from undate.dateformat.edtf.transformer import EDTFTransformer +from undate.undate import Undate, UndateInterval # for now, just test that valid dates can be parsed diff --git a/tests/test_dateformat/test_base.py b/tests/test_dateformat/test_base.py index 3687a37..1d184db 100644 --- a/tests/test_dateformat/test_base.py +++ b/tests/test_dateformat/test_base.py @@ -1,14 +1,13 @@ import logging import pytest - from undate.dateformat.base import BaseDateFormat class TestBaseDateFormat: def test_available_formatters(self): available_formatters = 
BaseDateFormat.available_formatters() - assert type(available_formatters) == dict + assert isinstance(available_formatters, dict) # NOTE: import _after_ generating available formatters # so we can confirm it gets loaded diff --git a/tests/test_undate.py b/tests/test_undate.py index cf0d9ce..39c1f86 100644 --- a/tests/test_undate.py +++ b/tests/test_undate.py @@ -1,13 +1,9 @@ -from datetime import timedelta, date +import calendar +from datetime import date import pytest - -from undate.undate import Undate, UndateInterval, DatePrecision - - -class TestDatePrecision: - def test_str(self): - assert str(DatePrecision.YEAR) == "YEAR" +from undate.date import Timedelta +from undate.undate import Undate, UndateInterval class TestUndate: @@ -117,6 +113,10 @@ def test_init_partially_known_day(self): uncertain_day = Undate(2024, 2, "2X") assert uncertain_day.latest.day == 29 + # TODO: handle leap day in an unknown year + # (currently causes an exception because min/max years are not leap years) + # Undate(None, 2, 29) + def test_init_invalid(self): with pytest.raises(ValueError): Undate("19xx") @@ -291,14 +291,14 @@ def test_sorting(self): def test_duration(self): day_duration = Undate(2022, 11, 7).duration() - assert isinstance(day_duration, timedelta) + assert isinstance(day_duration, Timedelta) assert day_duration.days == 1 january_duration = Undate(2022, 1).duration() assert january_duration.days == 31 feb_duration = Undate(2022, 2).duration() assert feb_duration.days == 28 - # next leap year will be 2024 + # 2024 is a known leap year leapyear_feb_duration = Undate(2024, 2).duration() assert leapyear_feb_duration.days == 29 @@ -309,6 +309,7 @@ def test_duration(self): def test_partiallyknown_duration(self): # day in unknown month/year + # assert Undate(day=5).duration().days == 1 assert Undate(day=5).duration().days == 1 assert Undate(year=1900, month=11, day="2X").duration().days == 1 @@ -394,11 +395,14 @@ def test_not_eq(self): ) assert UndateInterval(Undate(2022, 5)) 
!= UndateInterval(Undate(2022, 6)) + def test_min_year_non_leapyear(self): + assert not calendar.isleap(Undate.MIN_ALLOWABLE_YEAR) + def test_duration(self): week_duration = UndateInterval( Undate(2022, 11, 1), Undate(2022, 11, 7) ).duration() - assert isinstance(week_duration, timedelta) + assert isinstance(week_duration, Timedelta) assert week_duration.days == 7 twomonths = UndateInterval(Undate(2022, 11), Undate(2022, 12)).duration() @@ -419,14 +423,20 @@ def test_duration(self): Undate(None, 12, 1), Undate(None, 1, 1) ).duration() assert month_noyear_duration.days == 32 - # this seems wrong, but we currently count both start and dates - # real case from Shakespeare and Company Project data; + # real world test cases from Shakespeare and Company Project data; # second date is a year minus one day in the future month_noyear_duration = UndateInterval( Undate(None, 6, 7), Undate(None, 6, 6) ).duration() assert month_noyear_duration.days == 365 + # durations that span february in unknown years should assume + # non-leap years + jan_march_duration = UndateInterval( + Undate(None, 2, 28), Undate(None, 3, 1) + ).duration() + assert jan_march_duration.days == 2 + # duration is not supported for open-ended intervals assert UndateInterval(Undate(2000), None).duration() == NotImplemented